From 187511909594737eb45eef37ad6010edad010534 Mon Sep 17 00:00:00 2001 From: WoosungMyung Date: Sat, 4 Jan 2025 08:47:46 +0900 Subject: [PATCH 01/21] Explanation of Function change --- src/bootstrap.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index c1d085e4c..e49514b90 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -70,7 +70,7 @@ static int localIdFromRoot(int rank, int root, int nRanks, int nRoots) { int ir = BOOTSTRAP_PID(root, nRoots); return rank - firstRankFromRoot(ir, nRanks, nRoots); } -// return the number of child for a root, root will be periodized +// Check if the given rank is the first rank from the root static int isFirstFromRoot(int rank, int root, int nRanks, int nRoots) { return (rank == firstRankFromRoot(root, nRanks, nRoots)); } From 6aae37927840b2bd7b7d42d2f0050e75f88ee97f Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Wed, 18 Dec 2024 08:26:06 -0800 Subject: [PATCH 02/21] 2.24.3-1 Network user buffer support for collectives * Leverage user buffer registration to achieve zero-copy inter-node communications for Ring, NVLS and Collnet Add RAS subsystem * Create a RAS thread keeping track of all NCCL communicators. * Add a ncclras tool contacting the RAS thread and getting a report. Add fp8 support * Add support for e5m2 and e4m3 8-bit floating point operations. * Use Tree/PAT algorithms when possible for better numerical stability. Add NIC fusion * Add a NET API to ask the network plugin to fuse a set of interfaces together. * Fuse multiple NICs under the same PCI switch as a single, larger NIC. Socket connection failure retry * Retry in case of socket connection failure (unreachable host) * Avoid "Software caused connection abort" errors on retries QP connection failure retry * Retry in case of IB QP connection failure during ibv_modify_qp. NET API improvements * Allow plugins to force a flush in case data and completion ordering is not guaranteed. * Indicate when completion is not needed (e.g. for the LL128 protocol), allowing plugins to skip generating a completion. * Allow for full offload of allgather operations when using one GPU per node. NCCL_ALGO/NCCL_PROTO strict enforcement * Extend NCCL_ALGO/NCCL_PROTO syntax to be able to specify ALGO/PROTO filters for each collective operation. * Strictly enforce the ALGO/PROTO filters, no longer fall back on the ring algorithm when the filtering leaves no option and error out instead. Enable CUMEM host allocations * Use cumem functions for host memory allocation by default. Improved profiler plugin API * Avoid dependencies with NCCL includes. * Add information on whether the buffer is registered or not Adjust PAT tuning * Improve transition between PAT and ring at scale. Fix hangs when running with different CPU architectures * Detect when we use a mix of GPU architectures * Ensure Algo/Proto decisions are made based on that unified state. Fix FD leak in UDS * Fix a leak when mapping buffers intra-node with cumem IPCs. Fix crash when mixing buffer registration and graph buffer registration. * Separate local and graph registration to avoid crashes when we free buffers. Fix user buffer registration with dmabuf * Make ncclSend/ncclRecv communication with buffer registration functional on network plugins relying on dmabuf for buffer registration. Fix crash in IB code caused by uninitialized fields. 
Fix non-blocking ncclSend/ncclRecv * Fix case where ncclSend/ncclRecv would return ncclSuccess in non-blocking mode even though the operation was not enqueued onto the stream. * Issue #1495 Various compiler tweaks and fixes * PR #758 Fix typo in ncclTopoPrintGraph * Issue #1468 --- ext-net/README.md | 61 +- ext-net/example/nccl/net.h | 3 + ext-net/example/nccl/net_device.h | 3 +- ext-net/example/nccl/net_v8.h | 2 - ext-net/example/nccl/net_v9.h | 99 ++ ext-net/example/plugin.c | 102 +- ext-profiler/example/event.h | 21 +- ext-profiler/example/nccl/profiler.h | 1 + ext-profiler/example/nccl/profiler_v1.h | 53 +- ext-profiler/example/nccl/profiler_v2.h | 146 ++ ext-profiler/example/plugin.c | 114 +- ext-profiler/example/print_event.c | 72 +- ext-tuner/example/nccl/tuner.h | 9 +- ext-tuner/example/plugin.c | 9 +- makefiles/common.mk | 5 + makefiles/version.mk | 4 +- src/Makefile | 30 +- src/bootstrap.cc | 103 +- src/collectives.cc | 36 +- src/debug.cc | 16 + src/device/all_gather.h | 145 +- src/device/all_reduce.h | 167 +-- src/device/broadcast.h | 52 +- src/device/common.h | 3 + src/device/common_kernel.h | 19 +- src/device/generate.py | 35 +- src/device/network/unpack/unpack.h | 4 + src/device/onerank.cu | 4 + src/device/primitives.h | 7 +- src/device/prims_ll.h | 6 +- src/device/prims_ll128.h | 6 +- src/device/prims_simple.h | 238 +-- src/device/reduce_kernel.h | 171 ++- src/device/reduce_scatter.h | 55 +- src/device/sendrecv.h | 18 +- src/enqueue.cc | 820 +++++------ src/graph/paths.cc | 71 +- src/graph/search.cc | 2 +- src/graph/topo.cc | 600 +++++++- src/graph/topo.h | 4 + src/graph/tuning.cc | 246 ++-- src/graph/xml.cc | 16 +- src/graph/xml.h | 27 +- src/group.cc | 22 +- src/include/collectives.h | 321 ++++- src/include/comm.h | 22 +- src/include/debug.h | 2 + src/include/device.h | 30 +- src/include/enqueue.h | 11 + src/include/graph.h | 11 +- src/include/ibvwrap.h | 12 + src/include/nccl_common.h | 1 + src/include/nccl_net.h | 168 ++- src/include/nccl_profiler.h | 121 +- src/include/nccl_tuner.h | 53 +- src/include/net_device.h | 3 +- src/include/nvmlwrap.h | 2 +- src/include/profiler.h | 8 +- src/include/proxy.h | 32 +- src/include/ras.h | 24 + src/include/register.h | 21 +- src/include/shmutils.h | 2 +- src/include/socket.h | 26 +- src/include/transport.h | 18 +- src/include/utils.h | 3 +- src/init.cc | 144 +- src/misc/cudawrap.cc | 37 +- src/misc/ibvwrap.cc | 94 +- src/misc/ipcsocket.cc | 18 +- src/misc/nvmlwrap.cc | 8 +- src/misc/profiler.cc | 220 ++- src/misc/shmutils.cc | 6 +- src/misc/socket.cc | 299 ++-- src/misc/tuner.cc | 57 +- src/nccl.h.in | 15 +- src/net.cc | 681 ++++++--- src/proxy.cc | 95 +- src/ras/client.cc | 318 ++++ src/ras/client_support.cc | 1755 +++++++++++++++++++++++ src/ras/collectives.cc | 762 ++++++++++ src/ras/peers.cc | 960 +++++++++++++ src/ras/ras.cc | 668 +++++++++ src/ras/ras_internal.h | 512 +++++++ src/ras/rasnet.cc | 1189 +++++++++++++++ src/register.cc | 204 --- src/register/coll_reg.cc | 446 ++++++ src/register/register.cc | 179 +++ src/register/sendrecv_reg.cc | 35 + src/transport.cc | 28 +- src/transport/coll_net.cc | 585 +++++--- src/transport/generic.cc | 22 +- src/transport/net.cc | 468 ++++-- src/transport/net_ib.cc | 552 ++++--- src/transport/net_socket.cc | 17 +- src/transport/nvls.cc | 287 ++-- src/transport/p2p.cc | 404 +++--- src/transport/shm.cc | 30 +- 97 files changed, 12537 insertions(+), 3076 deletions(-) create mode 100644 ext-net/example/nccl/net_v9.h create mode 100644 ext-profiler/example/nccl/profiler_v2.h create mode 100644 
src/include/ras.h create mode 100644 src/ras/client.cc create mode 100644 src/ras/client_support.cc create mode 100644 src/ras/collectives.cc create mode 100644 src/ras/peers.cc create mode 100644 src/ras/ras.cc create mode 100644 src/ras/ras_internal.h create mode 100644 src/ras/rasnet.cc delete mode 100644 src/register.cc create mode 100644 src/register/coll_reg.cc create mode 100644 src/register/register.cc create mode 100644 src/register/sendrecv_reg.cc diff --git a/ext-net/README.md b/ext-net/README.md index 781fd904a..aa1a3945e 100644 --- a/ext-net/README.md +++ b/ext-net/README.md @@ -60,9 +60,9 @@ of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v6) +# API (v9) -Below is the main `ncclNet_v6` struct. Each function is explained in later sections. +Below is the main `ncclNet_v9` struct. Each function is explained in later sections. ``` typedef struct { @@ -73,7 +73,7 @@ typedef struct { // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -82,24 +82,26 @@ typedef struct { // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); /* DMA-BUF support */ ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); // Asynchronous recv from a peer. 
// May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); @@ -110,7 +112,17 @@ typedef struct { ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); +} ncclNet_t; ``` ## Error codes @@ -136,11 +148,19 @@ not need to rely on CUDA, this should not be common. NCCL will call the `init` function first, then query the number of network devices with the `devices` function, getting each network device properties with `getProperties`. +If NCCL wishes to initialize virtual devices, used in NIC fusion currently, it can call `makeVDevice` +specifying a list of physical devices (the original devices listed from `devices`) it wishes to +merge together. If the plugin does not support NIC fusion, it can set `makeVDevice` to null. + To establish a connection between two network devices, NCCL will first call `listen` on the receiving side, pass the returned handle to the sender side of the connection, and call `connect` with that handle. Finally, `accept` will be called on the receiving side to finalize the connection establishment. +`connect` and `accept` can receive an optional `netDevComm` pointer from the caller, if the caller +wishes to make use of device networking. This parameter may be ignored by the plugin if it does +not support device-side networking. + Once the connection is established, communication will be done using the functions `isend`, `irecv` and `test`. Prior to calling `isend` or `irecv`, NCCL will call the `regMr` function on all buffers to allow RDMA NICs to prepare buffers. `deregMr` will be used to unregister buffers. @@ -219,6 +239,12 @@ different offset within the original buffer, with a smaller size, etc), then der The call to ncclCommDeregister should call the final deregMr() and effectively remove the mapping on the network adapter. +The `forceFlush` field can request the NCCL core to call flush for all transfers. By default, +flushes are only called when the GPU architecture or PCI topology would not guarantee correct +PCI ordering. Plugins can set it to one if the NIC operates in a mode where e.g. the data and the +completion paths use different PCI links and therefore need a call to flush() to guarantee +ordering. + The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is important to ensure proper optimization of flows within the node. @@ -234,6 +260,17 @@ The `maxComms` field indicates the maximum number of connections we can create.
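For illustration only (not part of this patch): a plugin that supports NIC fusion might implement `makeVDevice` along the lines of the sketch below, recording which physical devices were fused and returning the index of the new virtual NIC. The `MAX_VDEVS`, `vDevTable` and `nPhysDevs` bookkeeping, as well as the choice to index virtual NICs after the physical ones, are hypothetical; only the `ncclNetVDeviceProps_t` type, the `makeVDevice` signature and the error codes come from the v9 API shown in this patch.

```
#include "net.h"   // provides ncclResult_t, ncclNetVDeviceProps_t, NCCL_NET_MAX_DEVS_PER_NIC

#define MAX_VDEVS 8                                 // hypothetical plugin-side limit

static ncclNetVDeviceProps_t vDevTable[MAX_VDEVS];  // remembers which devices were fused together
static int nPhysDevs = 2;                           // whatever the plugin reported from devices()
static int nVDevs = 0;                              // virtual NICs created so far

// Record the set of physical devices NCCL asked to fuse and return the index of the
// resulting virtual NIC; getProperties() for that index would then report the merged
// vProps and the aggregated speed.
static ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) {
  if (props == NULL || props->ndevs <= 0 || props->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) return ncclInternalError;
  if (nVDevs >= MAX_VDEVS) return ncclInternalError;
  vDevTable[nVDevs] = *props;
  *d = nPhysDevs + nVDevs;  // in this sketch, virtual NICs are indexed after the physical ones
  nVDevs++;
  return ncclSuccess;
}
```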
The `maxRecvs` field indicates the maximum number for grouped receive operations (see grouped receive). +The `netDeviceType` indicates which type of device networking this plugin supports. The currently supported +options are `NCCL_NET_DEVICE_HOST` and `NCCL_NET_DEVICE_UNPACK`. + +The `netDeviceVersion` indicates the version of device networking this plugin supports. Currently, this must match the associated netDeviceVersion of this netDeviceType compiled into NCCL core. Net device functionality is built as a part of NCCL core's device code. + +The `maxP2pBytes` and `maxCollBytes` fields indicate the maximum size the plugin can handle for +point-to-point and collective calls. This will tell the NCCL core to cut large operations into +multiple smaller chunks if needed. + +`vProps` is the list of devices that have been fused into the current device. Each entry is an index pointing to the child device. + ### Connection establishment Connections are used in a unidirectional manner. There is therefore a sender side and a receiver @@ -332,6 +369,12 @@ handled by a single request handle. The sizes provided to `irecv` can (and will) be larger than the size of the `isend` operation. The contrary (receive size being lower than the send size) is an error, however. +NCCL sets the request pointer in `irecv` to `NCCL_NET_OPTIONAL_RECV_COMPLETION` when it is using +LL or LL128 protocols. In these cases, NCCL polls on a flag embedded in the data to detect completion +of the irecv and is resilient to redundant network writes. This allows the plugin to optimize request +completions on such irecvs (for example, complete the request immediately). The plugin is still +expected to set a valid request pointer on return which NCCL can poll to check for completion. + Note: for a given connection, send/receive operations should always match in the order they were posted.
Tags provided for receive operations are only used to assign a given send operation to one of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 2aea8c439..112967ab8 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -12,6 +12,8 @@ #include "err.h" #define NCCL_NET_HANDLE_MAXSIZE 128 +#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB +#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 @@ -20,6 +22,7 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +#include "net_v9.h" #include "net_v8.h" #include "net_v7.h" #include "net_v6.h" diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index b430d9064..874fb5999 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -25,6 +25,7 @@ typedef struct { } ncclNetDeviceHandle_v7_t; typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; -typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v8.h b/ext-net/example/nccl/net_v8.h index 316155820..54a61f61b 100644 --- a/ext-net/example/nccl/net_v8.h +++ b/ext-net/example/nccl/net_v8.h @@ -23,8 +23,6 @@ typedef struct { int netDeviceVersion; // Version number for network offload } ncclNetProperties_v8_t; -typedef ncclNetProperties_v8_t ncclNetProperties_t; - typedef struct { // Name of the network (mainly for logs) const char* name; diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h new file mode 100644 index 000000000..61035ecc9 --- /dev/null +++ b/ext-net/example/nccl/net_v9.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_NET_V9_H_ +#define NCCL_NET_V9_H_ + +#include "net_device.h" + +#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 +#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; +} ncclNetVDeviceProps_v9_t; +typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v9_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v9_t; + +typedef ncclNetProperties_v9_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. 
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. 
makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); +} ncclNet_v9_t; + +#endif // end include guard diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 128dde9b4..285224261 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,15 +7,15 @@ #include "net.h" #define __hidden __attribute__ ((visibility("hidden"))) +#define NCCL_PLUGIN_MAX_RECVS 1 int max_requests = NCCL_NET_MAX_REQUESTS; __hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } - __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } -__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props) { +__hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { // Below are default values, if unsure don't change. props->name = "Example"; @@ -27,6 +27,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props props->ptrSupport = NCCL_PTR_HOST; // If you regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled. props->regIsGlobal = 0; + // Force flush after receive. Needed if the control path and data path use a different path to the GPU + props->forceFlush = 0; // Speed in *Mbps*. 100000 means 100G props->speed = 100000; // Port number, used in conjunction with guid @@ -36,20 +38,27 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_v8_t* props // Maximum number of comm objects we can create. props->maxComms = 1024*1024; // Maximum number of receive operations taken by irecv(). - props->maxRecvs = 1; + props->maxRecvs = NCCL_PLUGIN_MAX_RECVS; // Coupling with NCCL network device-side code. - props->netDeviceType = 0; + props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - return ncclInternalError; + // Used to tell NCCL core whether this is a virtual device fusing multiple physical devices. 
+ props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + // maximum transfer sizes the plugin can handle + props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES; + return ncclSuccess; } + __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm) { return ncclInternalError; } -__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} -__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } @@ -57,10 +66,11 @@ __hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; } __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; } +__hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; } #define PLUGIN_NAME "Plugin" -const ncclNet_v8_t ncclNetPlugin_v8 = { +ncclNet_v9_t ncclNetPlugin_v9 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -80,8 +90,60 @@ const ncclNet_v8_t ncclNetPlugin_v8 = { .closeListen = pluginCloseListen, .getDeviceMr = pluginGetDeviceMr, .irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice, }; +__hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) { + ncclNetProperties_t props; + ncclResult_t ret = pluginGetProperties(dev, &props); + if (ret != ncclSuccess) return ret; + props_v8->name = props.name; + props_v8->pciPath = props.pciPath; + props_v8->guid = props.guid; + props_v8->ptrSupport = props.ptrSupport; + props_v8->regIsGlobal = props.regIsGlobal; + props_v8->speed = 
props.speed; + props_v8->latency = props.latency; + props_v8->port = props.port; + props_v8->maxComms = props.maxComms; + props_v8->maxRecvs = props.maxRecvs; + props_v8->netDeviceType = props.netDeviceType; + props_v8->netDeviceVersion = props.netDeviceVersion; + return ncclSuccess; +} + +__hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { + return pluginIsend(sendComm, data, (int)size, tag, mhandle, request); +} + +__hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { + size_t sizesOut[NCCL_PLUGIN_MAX_RECVS]; + for (int i=0; iguid = props.guid; props_v7->ptrSupport = props.ptrSupport; props_v7->speed = props.speed; + props_v7->latency = props.latency; props_v7->port = props.port; props_v7->maxComms = props.maxComms; props_v7->maxRecvs = props.maxRecvs; @@ -114,8 +177,8 @@ const ncclNet_v7_t ncclNetPlugin_v7 = { .regMr = pluginRegMr_v7, .regMrDmaBuf = pluginRegMrDmaBuf, .deregMr = pluginDeregMr, - .isend = pluginIsend, - .irecv = pluginIrecv, + .isend = pluginIsend_v8, + .irecv = pluginIrecv_v8, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, @@ -134,6 +197,7 @@ __hidden ncclResult_t pluginGetProperties_v6(int dev, ncclNetProperties_v6_t* pr props_v6->guid = props.guid; props_v6->ptrSupport = props.ptrSupport; props_v6->speed = props.speed; + props_v6->latency = props.latency; props_v6->port = props.port; props_v6->maxComms = props.maxComms; props_v6->maxRecvs = props.maxRecvs; @@ -154,8 +218,8 @@ const ncclNet_v6_t ncclNetPlugin_v6 = { .regMr = pluginRegMr_v7, .regMrDmaBuf = pluginRegMrDmaBuf, .deregMr = pluginDeregMr, - .isend = pluginIsend, - .irecv = pluginIrecv, + .isend = pluginIsend_v8, + .irecv = pluginIrecv_v8, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, @@ -174,8 +238,8 @@ const ncclNet_v5_t ncclNetPlugin_v5 = { .accept = pluginAccept_v6, .regMr = pluginRegMr_v7, .deregMr = pluginDeregMr, - .isend = pluginIsend, - .irecv = pluginIrecv, + .isend = pluginIsend_v8, + .irecv = pluginIrecv_v8, .iflush = pluginIflush, .test = pluginTest, .closeSend = pluginCloseSend, @@ -198,11 +262,11 @@ static ncclResult_t pluginGetProperties_v4(int dev, ncclNetProperties_v4_t* prop return ncclSuccess; } static ncclResult_t pluginIsend_v4(void *sendComm, void* data, int size, void *mhandle, void** request) { - return pluginIsend(sendComm, data, size, 0, mhandle, request); + return pluginIsend_v8(sendComm, data, size, 0, mhandle, request); } static ncclResult_t pluginIrecv_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { int tag = 0; - return pluginIrecv(recvComm, 1, &data, &size, &tag, &mhandle, request); + return pluginIrecv_v8(recvComm, 1, &data, &size, &tag, &mhandle, request); } static ncclResult_t pluginIflush_v4(void* recvComm, void* data, int size, void* mhandle, void** request) { return pluginIflush(recvComm, 1, &data, &size, &mhandle, request); diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h index 743280813..1486a2248 100644 --- a/ext-profiler/example/event.h +++ b/ext-profiler/example/event.h @@ -14,6 +14,7 @@ #define MAX_CHANNELS 32 #define MAX_STEPS 16 +#define MAX_OPS 16 // Up to 64K ranks for PAT #define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted) #define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted) @@ -86,7 +87,7 @@ struct taskEventBase { int rank; // rank of the operation in NCCL communicator const char* name; 
// FIXME: unused uint64_t commHash; // communicator identifier - uint8_t func; // ncclFunc* + const char* func; // ncclFunc* int refCount; // number of references for this operation struct group* parent; // parent event group struct taskEventBase* next; // next top level event in group @@ -102,16 +103,14 @@ struct collective { size_t count; size_t trafficBytes; int root; - uint8_t datatype; + const char* datatype; uint8_t nMaxChannels; - uint8_t algo; - uint8_t proto; - int op; + const char* algo; + const char* proto; int nWarps; - int isCollnet; - int isNvls; - struct proxyOp send[MAX_CHANNELS];// array of send proxy operation events - struct proxyOp recv[MAX_CHANNELS];// array of recv proxy operation events + struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events + struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events + int nProxyOps[MAX_CHANNELS]; }; struct p2p { @@ -119,9 +118,9 @@ struct p2p { uint8_t func; void const* buff; size_t count; - uint8_t datatype; + const char* datatype; int peer; - struct proxyOp op; + struct proxyOp op[MAX_CHANNELS]; }; struct group { diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h index db7bc3fea..6680cfece 100644 --- a/ext-profiler/example/nccl/profiler.h +++ b/ext-profiler/example/nccl/profiler.h @@ -13,6 +13,7 @@ #include "common.h" #include "err.h" +#include "profiler_v2.h" #include "profiler_v1.h" #endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_v1.h b/ext-profiler/example/nccl/profiler_v1.h index 8724a1c66..7d34bed57 100644 --- a/ext-profiler/example/nccl/profiler_v1.h +++ b/ext-profiler/example/nccl/profiler_v1.h @@ -9,16 +9,6 @@ #include -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type - ncclProfileNumEvents = ( 6), -}; - typedef struct { uint8_t type; // event type descriptor: ncclProfileColl, ... 
void* parentObj; // pointer to the profiler parent object (for coll is the group) @@ -69,42 +59,8 @@ typedef struct { }; } ncclProfilerEventDescr_v1_t; -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v1_t; - -typedef union { - struct { - size_t transSize; - int steps; - } proxyOp; - - struct { - int appendedProxyOps; - } proxyCtrl; -} ncclProfilerEventStateArgs_v1_t; +typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; typedef struct { const char* name; @@ -142,9 +98,4 @@ typedef struct { ncclResult_t (*finalize)(void* context); } ncclProfiler_v1_t; -typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v1_t ncclProfiler_t; - #endif diff --git a/ext-profiler/example/nccl/profiler_v2.h b/ext-profiler/example/nccl/profiler_v2.h new file mode 100644 index 000000000..aab4ccf86 --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v2.h @@ -0,0 +1,146 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_V2_H_ +#define NCCL_PROFILER_V2_H_ + +#include + +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type +}; + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v2_t; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, + ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_v2_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v2_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v2_t; + +typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; +typedef 
ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v2_t ncclProfiler_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index f9de60813..64d5d8be1 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -21,11 +21,18 @@ static int initialized; // initialization counter for profiler static double startTime; // profiler start time -static int groupPoolSize = 16; -static int collPoolSize = 16; -static int p2pPoolSize = 1024; -static int proxyCtrlPoolSize = 16; -static int detachPoolSize = 128; +static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p; +static const int defaultGroupPoolSize = 16; +static const int defaultCollPoolSize = 16; +static const int defaultP2pPoolSize = 1024; +static const int defaultProxyCtrlPoolSize = 16; +static const int defaultDetachPoolSize = 128; + +static int groupPoolSize; +static int collPoolSize; +static int p2pPoolSize; +static int proxyCtrlPoolSize; +static int detachPoolSize; static int detachPoolBase; static int detachPoolIndex; static int detachPoolDone; @@ -56,25 +63,25 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) pthread_mutex_lock(&lock); if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) { // first thread initializes event mask, environment and detach pool - __atomic_store_n(eActivationMask, ncclProfileColl | ncclProfileP2p, __ATOMIC_RELAXED); - if (getenv("NCCL_PROFILE_EVENT_MASK")) { - __atomic_store_n(eActivationMask, atoi(getenv("NCCL_PROFILE_EVENT_MASK")), __ATOMIC_RELAXED); - } - if (getenv("NCCL_PROFILE_GROUP_POOL_SIZE")) { - groupPoolSize = atoi(getenv("NCCL_PROFILE_GROUP_POOL_SIZE")); - } - if (getenv("NCCL_PROFILE_COLL_POOL_SIZE")) { - collPoolSize = atoi(getenv("NCCL_PROFILE_COLL_POOL_SIZE")); - } - if (getenv("NCCL_PROFILE_P2P_POOL_SIZE")) { - p2pPoolSize = atoi(getenv("NCCL_PROFILE_P2P_POOL_SIZE")); - } - if (getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")) { - proxyCtrlPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE")); - } - if (getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")) { - detachPoolSize = atoi(getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE")); - } + const char* str; + str = getenv("NCCL_PROFILE_EVENT_MASK"); + __atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED); + + str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE"); + groupPoolSize = str ? atoi(str) : defaultGroupPoolSize; + + str = getenv("NCCL_PROFILE_COLL_POOL_SIZE"); + collPoolSize = str ? atoi(str) : defaultCollPoolSize; + + str = getenv("NCCL_PROFILE_P2P_POOL_SIZE"); + p2pPoolSize = str ? atoi(str) : defaultP2pPoolSize; + + str = getenv("NCCL_PROFILE_PROXY_CTRL_POOL_SIZE"); + proxyCtrlPoolSize = str ? atoi(str) : defaultProxyCtrlPoolSize; + + str = getenv("NCCL_PROFILE_PROXY_DETACH_POOL_SIZE"); + detachPoolSize = str ? 
atoi(str) : defaultDetachPoolSize; + // detach pool is used to store PXN proxyOps and is shared among threads detachPool = (struct proxyOp *)calloc(detachPoolSize, sizeof(*detachPool)); if (detachPool == NULL) { @@ -107,6 +114,13 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) ctx->proxyCtrlPool = (struct proxyCtrl *)calloc(proxyCtrlPoolSize, sizeof(*ctx->proxyCtrlPool)); if (ctx->proxyCtrlPool == NULL) goto fail; + // Print event pool sizes for debugging + //fprintf(stdout, "Profiler: Group pool size (bytes): %lu\n", sizeof(struct group)*groupPoolSize); + //fprintf(stdout, "Profiler: Coll pool size (bytes): %lu\n", sizeof(struct collective)*collPoolSize); + //fprintf(stdout, "Profiler: P2p pool size (bytes): %lu\n", sizeof(struct p2p)*p2pPoolSize); + //fprintf(stdout, "Profiler: Proxy pool size (bytes): %lu\n", sizeof(struct proxyCtrl)*proxyCtrlPoolSize); + //fprintf(stdout, "Profiler: PXN pool size (bytes): %lu\n", sizeof(struct proxyOp)*detachPoolSize); + *context = ctx; return ncclSuccess; @@ -154,7 +168,7 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) { free(ctx); // last thread cleans up shared detach pool - if (__atomic_fetch_sub(&initialized, 1, __ATOMIC_RELAXED) - 1 == 0) { + if (__atomic_sub_fetch(&initialized, 1, __ATOMIC_RELAXED) == 0) { start = (detachPoolIndex - detachPoolSize >= 0) ? detachPoolIndex - detachPoolSize : 0; end = detachPoolIndex; for (int i = start; i < end; i++) { @@ -171,7 +185,7 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) { __hidden void updateEvent(void* handle); -__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr) { +__hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { *eHandle = NULL; struct context* ctx = (struct context *)context; if (eDescr->type == ncclProfileGroup) { @@ -185,14 +199,15 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n if (base->type == ncclProfileColl) { struct collective* c = (struct collective *)base; // reset event proxyOps & proxySteps - memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS); - memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS); + memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); + memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); + memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS); // release collective events in the group and return them to the collective pool __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED); } else if (base->type == ncclProfileP2p) { struct p2p* p = (struct p2p *)base; // reset event proxyOp and proxySteps - memset(&p->op, 0, sizeof(struct proxyOp)); + memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS); // release p2p events in the group and return them to the p2p pool __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED); } @@ -203,7 +218,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n return ncclSuccess; } event->type = ncclProfileGroup; - __atomic_store_n(&event->refCount, 1, __ATOMIC_RELAXED); event->ctx = ctx; event->groupId = groupId; event->startTs = gettime() - startTime; @@ -238,14 +252,11 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->count = eDescr->coll.count; event->root = eDescr->coll.root; event->datatype = eDescr->coll.datatype; - event->op = eDescr->coll.op; event->trafficBytes = eDescr->coll.trafficBytes; 
event->nMaxChannels = eDescr->coll.nMaxChannels; event->nWarps = eDescr->coll.nWarps; event->algo = eDescr->coll.algo; event->proto = eDescr->coll.proto; - event->isCollnet = eDescr->coll.isCollnet; - event->isNvls = eDescr->coll.isNvls; *eHandle = event; taskEventQueueEnqueue(parent, (struct taskEventBase *)event); // increment the group ref counter so the event will staty open @@ -326,9 +337,13 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n if (eventBase->type == ncclProfileColl) { struct collective* parent = (struct collective *)eDescr->parentObj; - struct proxyOp* event = (eDescr->proxyOp.isSend) ? &parent->send[eDescr->proxyOp.channelId] : &parent->recv[eDescr->proxyOp.channelId]; + int channelId = eDescr->proxyOp.channelId; + struct proxyOp* event = (eDescr->proxyOp.isSend) ? + &parent->send[channelId][parent->nProxyOps[channelId]++] : + &parent->recv[channelId][parent->nProxyOps[channelId]++]; + event->type = ncclProfileProxyOp; - event->channelId = eDescr->proxyOp.channelId; + event->channelId = channelId; event->pid = eDescr->proxyOp.pid; event->rank = eDescr->rank; event->peer = eDescr->proxyOp.peer; @@ -338,13 +353,14 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->parent = eventBase; event->startTs = gettime() - startTime; *eHandle = event; - __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); } else { // ncclProfileP2p struct p2p* parent = (struct p2p *)eDescr->parentObj; - struct proxyOp* event = &parent->op; + int channelId = eDescr->proxyOp.channelId; + struct proxyOp* event = &parent->op[channelId]; event->type = ncclProfileProxyOp; - event->channelId = eDescr->proxyOp.channelId; + event->channelId = channelId; event->pid = eDescr->proxyOp.pid; event->rank = eDescr->rank; event->peer = eDescr->proxyOp.peer; @@ -354,7 +370,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->parent = eventBase; event->startTs = gettime() - startTime; *eHandle = event; - __atomic_store_n(&parent->base.refCount, 1, __ATOMIC_RELAXED); + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); } } else if (eDescr->type == ncclProfileProxyStep) { @@ -379,7 +395,7 @@ void updateEvent(void* handle) { uint8_t type = *(uint8_t *)handle; if (type == ncclProfileGroup) { struct group* event = (struct group *)handle; - if (__atomic_fetch_sub(&event->refCount, 1, __ATOMIC_RELAXED) == 1) { + if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { event->stopTs = gettime() - startTime; // return group event to the pool __atomic_fetch_add(&event->ctx->groupPoolBase, 1, __ATOMIC_RELAXED); @@ -387,7 +403,7 @@ void updateEvent(void* handle) { debugEvent(event, "GroupStop"); } else if (type == ncclProfileColl) { struct collective* event = (struct collective *)handle; - if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) { + if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) == 0) { event->base.stopTs = gettime() - startTime; debugEvent(event, "CollStop"); updateEvent(event->base.parent); @@ -396,7 +412,7 @@ void updateEvent(void* handle) { debugEvent(event, "CollStop"); } else if (type == ncclProfileP2p) { struct p2p* event = (struct p2p *)handle; - if (__atomic_fetch_sub(&event->base.refCount, 1, __ATOMIC_RELAXED) == 1) { + if (__atomic_sub_fetch(&event->base.refCount, 1, __ATOMIC_RELAXED) 
== 0) { event->base.stopTs = gettime() - startTime; debugEvent(event, "P2pStop"); updateEvent(event->base.parent); @@ -408,7 +424,7 @@ void updateEvent(void* handle) { event->stopTs = gettime() - startTime; if (event->pid != pid) { // only for proxyOps that don't have a parent collective/p2p (i.e., PXN) - int done = __atomic_fetch_add(&detachPoolDone, 1, __ATOMIC_RELAXED) + 1; + int done = __atomic_add_fetch(&detachPoolDone, 1, __ATOMIC_RELAXED); if (done == detachPoolSize) { // reset the event completed (done) counter __atomic_store_n(&detachPoolDone, 0, __ATOMIC_RELAXED); @@ -451,12 +467,20 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) { struct collective* event = (struct collective *)eHandle; event->base.stopTs = gettime() - startTime; return ncclSuccess; + } else if (type == ncclProfileP2p) { + // stopping the p2p event in NCCL core does not + // mean the p2p has completed. It means the p2p + // was submitted/enqueued so we need to keep the event open + struct p2p* event = (struct p2p *)eHandle; + event->base.stopTs = gettime() - startTime; + return ncclSuccess; } + updateEvent(eHandle); return ncclSuccess; } -__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs) { +__hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { // the event handle might be null if we run out of events if (eHandle == NULL) return ncclSuccess; @@ -482,7 +506,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile return ncclSuccess; } -ncclProfiler_v1_t ncclProfiler_v1 = { +ncclProfiler_t ncclProfiler_v2 = { "Example-profiler", exampleProfilerInit, exampleProfilerStartEvent, diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c index 490ba7ce4..f26a9eeb2 100644 --- a/ext-profiler/example/print_event.c +++ b/ext-profiler/example/print_event.c @@ -11,56 +11,6 @@ #define __hidden __attribute__ ((visibility("hidden"))) -__hidden const char* ncclFuncToString(int func) { - switch(func) { - case 0: - return "ncclBroadcast"; - case 1: - return "ncclReduce"; - case 2: - return "ncclAllGather"; - case 3: - return "ncclReduceScatter"; - case 4: - return "ncclAllReduce"; - case 5: - return "ncclSendRecv"; - case 6: - return "ncclSend"; - case 7: - return "ncclRecv"; - } - return NULL; -} - -__hidden const char* ncclAlgoToString(int algo) { - switch(algo) { - case 0: - return "Tree"; - case 1: - return "Ring"; - case 2: - return "CollnetDirect"; - case 3: - return "CollnetChain"; - case 4: - return "Nvls"; - case 5: - return "NvlsTree"; - } -} - -__hidden const char* ncclProtoToString(int proto) { - switch(proto) { - case 0: - return "LL"; - case 1: - return "LL128"; - case 2: - return "Simple"; - } -} - // FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category // It appears that nesting more than three events causes issues. 
Therefore, every event is given an increasing id and a // category that matches the type of event (GROUP, COLL, P2P, PROXY, NET) @@ -77,24 +27,24 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) { static __thread int collId; __hidden void printCollEventHeader(FILE* fh, struct collective* event) { - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": %d, \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n", - ncclFuncToString(event->base.func), collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, ncclAlgoToString(event->algo), ncclProtoToString(event->proto), event->nMaxChannels); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n", + event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nMaxChannels); } __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - ncclFuncToString(event->base.func), collId++, getpid(), 1, event->base.stopTs); + event->base.func, collId++, getpid(), 1, event->base.stopTs); } static __thread int p2pId; __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) { - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": %d}},\n", - ncclFuncToString(event->base.func), p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\"}},\n", + event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype); } __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - ncclFuncToString(event->base.func), p2pId++, getpid(), 1, event->base.stopTs); + event->base.func, p2pId++, getpid(), 1, event->base.stopTs); } static __thread int proxyOpId; @@ -250,14 +200,18 @@ void printEvent(FILE* fh, void* handle) { struct collective* c = (struct collective *)handle; printCollEventHeader(fh, c); for (int i = 0; i < MAX_CHANNELS; i++) { - printEvent(fh, &c->send[i]); - printEvent(fh, &c->recv[i]); + for (int j = 0; j < c->nProxyOps[i]; j++) { + printEvent(fh, &c->send[i][j]); + printEvent(fh, &c->recv[i][j]); + } } printCollEventTrailer(fh, c); } else if (type == ncclProfileP2p) { struct p2p* p = (struct p2p *)handle; printP2pEventHeader(fh, p); - printEvent(fh, &p->op); + for (int i = 0; i < MAX_CHANNELS; i++) { + printEvent(fh, &p->op[i]); + } printP2pEventTrailer(fh, p); } else if (type == 
ncclProfileProxyOp) { struct proxyOp* p = (struct proxyOp *)handle; diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h index aafabd72d..77b543d12 100644 --- a/ext-tuner/example/nccl/tuner.h +++ b/ext-tuner/example/nccl/tuner.h @@ -67,6 +67,7 @@ typedef struct { // - numPipeOps: number of operations in the group // - numAlgo: number of algorithms in collCostTable // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer // // Outputs: // - nChannels: number of channels (hence SMs) to be used. @@ -82,15 +83,15 @@ typedef struct { // Unset fields will be set automatically by NCCL. ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels); + int regBuff, int* nChannels); // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object ncclResult_t (*destroy)(void* context); -} ncclTuner_v3_t; +} ncclTuner_v4_t; -typedef ncclTuner_v3_t ncclTuner_t; +typedef ncclTuner_v4_t ncclTuner_t; -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" #endif diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index c3cf00dfd..7925dcfa1 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -12,10 +12,11 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels) { + int regBuff, int* nChannels) { // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo - if (collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { - collCostTable[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { + table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; } *nChannels = 1; return ncclSuccess; @@ -25,7 +26,7 @@ __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } #define PLUGIN_NAME "Example" -const ncclTuner_v3_t ncclTunerPlugin_v3 = { +const ncclTuner_v4_t ncclTunerPlugin_v4 = { .name = PLUGIN_NAME, .init = pluginInit, .getCollInfo = pluginGetCollInfo, diff --git a/makefiles/common.mk b/makefiles/common.mk index 59e4151ce..82164ab5c 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -12,6 +12,7 @@ DEBUG ?= 0 ASAN ?= 0 UBSAN ?= 0 TRACE ?= 0 +WERROR ?= 0 PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 @@ -115,6 +116,10 @@ ifeq ($(NVTX), 0) CXXFLAGS += -DNVTX_DISABLE endif +ifneq ($(WERROR), 0) +CXXFLAGS += -Werror +endif + ifneq ($(KEEP), 0) NVCUFLAGS += -keep endif diff --git a/makefiles/version.mk b/makefiles/version.mk index bcc0ff3ce..252300934 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 23 -NCCL_PATCH := 4 +NCCL_MINOR := 24 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index b254eac32..2c5d9e863 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,17 +7,22 @@ include ../makefiles/common.mk include ../makefiles/version.mk ##### src files -INCEXPORTS := nccl.h nccl_net.h +INCEXPORTS := nccl.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc 
group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc register.cc \ + init.cc init_nvtx.cc net.cc proxy.cc transport.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ - $(wildcard transport/*.cc) + $(wildcard transport/*.cc) \ + $(wildcard register/*.cc) \ + $(filter-out ras/client.cc,$(wildcard ras/*.cc)) +BINSRCFILES := ras/client.cc ##### lib files LIBNAME := libnccl.so STATICLIBNAME := libnccl_static.a +##### binaries +BINNAME := ncclras ##### pkgconfig files PKGCONFIGFILE := nccl.pc ##### dirs @@ -26,11 +31,12 @@ INCDIR := $(BUILDDIR)/include LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj PKGDIR := $(BUILDDIR)/lib/pkgconfig +BINDIR := $(BUILDDIR)/bin ##### target files CUDARTLIB ?= cudart_static +# Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658 ifeq ($(CUDARTLIB), cudart_static) - # Use compatibility shim only with static cudart; see https://github.com/NVIDIA/nccl/issues/658 LIBSRCFILES += enhcompat.cc endif @@ -40,18 +46,21 @@ LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) PKGTARGET := $(PKGCONFIGFILE) LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) -DEPFILES := $(LIBOBJ:%.o=%.d) +BINOBJ := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o) +DEPFILES := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl DEVMANIFEST := $(BUILDDIR)/obj/device/manifest ##### rules -build : lib staticlib +build : lib staticlib binary lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) staticlib : $(LIBDIR)/$(STATICLIBTARGET) +binary : $(BINDIR)/$(BINNAME) + $(DEVMANIFEST): ALWAYS_REBUILD $(INCTARGETS) $(MAKE) -C ./device @@ -85,6 +94,11 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVMANIFEST) mkdir -p $(LIBDIR) ar cr $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) +$(BINDIR)/$(BINNAME): $(BINOBJ) + @printf "Linking %-35s > %s\n" $(BINNAME) $@ + mkdir -p $(BINDIR) + $(CXX) $(CXXFLAGS) $^ -o $@ + $(PKGDIR)/nccl.pc : nccl.pc.in mkdir -p $(PKGDIR) @printf "Generating %-35s > %s\n" $< $@ @@ -121,15 +135,17 @@ $(OBJDIR)/%.o : %.cc $(INCTARGETS) clean : $(MAKE) -C device clean - rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} + rm -rf ${BINDIR} ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} install : build mkdir -p $(PREFIX)/lib mkdir -p $(PREFIX)/lib/pkgconfig mkdir -p $(PREFIX)/include + mkdir -p $(PREFIX)/bin cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ + cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/ FILESTOFORMAT := $(shell find . 
-name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h') # Note that formatting.mk defines a new target so in order to not overwrite the default target, diff --git a/src/bootstrap.cc b/src/bootstrap.cc index c1d085e4c..d11e59953 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -13,6 +13,7 @@ #include #include "proxy.h" #include "param.h" +#include "ras.h" #define BOOTSTRAP_N_CHECK_ABORT 10000 #define BOOTSTRAP_TAG_CONNECT (0x1 << 31) @@ -110,13 +111,13 @@ ncclResult_t bootstrapNetInit() { if (nIfs <= 0) { WARN("Bootstrap : no socket interface found"); pthread_mutex_unlock(&bootstrapNetLock); - return ncclInternalError; + return ncclInvalidUsage; } } char line[SOCKET_NAME_MAXLEN+MAX_IF_NAME_SIZE+2]; - sprintf(line, " %s:", bootstrapNetIfName); + snprintf(line, sizeof(line), " %s:", bootstrapNetIfName); ncclSocketToString(&bootstrapNetIfAddr, line+strlen(line)); - INFO(NCCL_BOOTSTRAP, "Bootstrap : Using%s", line); + INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line); bootstrapNetInitDone = 1; } pthread_mutex_unlock(&bootstrapNetLock); @@ -152,7 +153,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*sendReq) { - NCCLCHECK(net->isend(sendComm, data, size, tag, dataHandle, sendReq)); + NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq)); } if (*sendReq) { NCCLCHECK(net->test(*sendReq, done, NULL)); @@ -166,7 +167,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*recvReq) { - NCCLCHECK(net->irecv(recvComm, 1, &data, &size, &tag, &dataHandle, recvReq)); + size_t size64 = size; + NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq)); } if (*recvReq) { NCCLCHECK(net->test(*recvReq, done, NULL)); @@ -302,7 +304,7 @@ static void* bootstrapRoot(void* rargs) { // if the number of root > 1, we will receive one extra info from the first local_id of the next root n2send = nRankFromRoot(iroot, nranks, nroots); nrecv = n2send + ((nroots > 1) ? 1 : 0); - NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv * sizeof(union ringConnectInfo)), res, out); + NCCLCHECKGOTO(ncclCalloc(&rankInfo, nrecv), res, out); NCCLCHECKGOTO(ncclCalloc(&rankAddressesRoot, nrecv), res, out); } @@ -492,29 +494,37 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { struct netIf userIfs[MAX_OOB_DEVS]; int nUserIfs = parseStringList(userIfEnv, userIfs, MAX_OOB_DEVS); // loop over the device and return the first one matching - int devId = 0; int nDev = 0; NCCLCHECK(comm->ncclNet->devices(&nDev)); + int devId = 0; while (devId < nDev) { ncclNetProperties_t props; comm->ncclNet->getProperties(devId, &props); // check against user specified HCAs/ports - bool found = matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot; - if (found) { + if (matchIfList(props.name, props.port, userIfs, nUserIfs, searchExact) ^ searchNot) { + // All plain physical devices have been initialized at this point devOOB = devId; break; } devId++; } if (devOOB == -1) { - WARN("no device found matching NCCL_OOB_NET_IFNAME=%s, ignoring", userIfEnv); - goto noEnv; + if (!searchNot) + WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv); + else + WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? 
"exactly " : "", userIfEnv); + pthread_mutex_unlock(&bootstrapNetLock); + return ncclInvalidArgument; } } else { - noEnv: // default choice is device 0 devOOB = 0; } + // display info on the chosen device + ncclNetProperties_t props; + ncclResult_t res = comm->ncclNet->getProperties(devOOB, &props); + bool hasProp = res == ncclSuccess; + INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1); } pthread_mutex_unlock(&bootstrapNetLock); } @@ -545,7 +555,8 @@ static ncclResult_t socketRingConnect(ncclSocketAddress* addr, struct ncclSocket } static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* state, union ncclSocketAddress* peerAddresss, - union ncclSocketAddress* peerProxy, uint64_t* peerUDS) { + union ncclSocketAddress* peerProxy, uint64_t* peerUDS, + struct rasRankInit* rasRanks) { ncclResult_t res = ncclSuccess; int rank = comm->rank; int nRanks = comm->nRanks; @@ -553,6 +564,7 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st union ncclSocketAddress peerAddress; union ncclSocketAddress peerProxy; uint64_t peerUDS; + struct rasRankInit rasRank; }* ringData = NULL; NCCLCHECK(ncclCalloc(&ringData, nRanks)); @@ -563,6 +575,8 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st memcpy(&(ringData[rank].peerProxy), peerProxy + rank, sizeof(union ncclSocketAddress)); if (peerUDS) memcpy(&(ringData[rank].peerUDS), peerUDS + rank, sizeof(uint64_t)); + if (rasRanks) + memcpy(&(ringData[rank].rasRank), rasRanks + rank, sizeof(*rasRanks)); // allgather NCCLCHECKGOTO(bootstrapAllGather(state, ringData, sizeof(struct bootstrapRingData)), res, exit); @@ -575,6 +589,8 @@ static ncclResult_t ringAllInfo(struct ncclComm* comm, struct bootstrapState* st memcpy(peerProxy + irank, &(ringData[irank].peerProxy), sizeof(union ncclSocketAddress)); if (peerUDS) memcpy(peerUDS + irank, &(ringData[irank].peerUDS), sizeof(uint64_t)); + if (rasRanks) + memcpy(rasRanks + irank, &(ringData[irank].rasRank), sizeof(*rasRanks)); } exit: @@ -598,7 +614,10 @@ static ncclResult_t sendToRoot(struct ncclBootstrapHandle* handle, struct ncclCo NCCL_PARAM(StaggerRate, "UID_STAGGER_RATE", 7000); NCCL_PARAM(StaggerThreshold, "UID_STAGGER_THRESHOLD", 256); +NCCL_PARAM(RasEnable, "RAS_ENABLE", 1); + ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { + ncclResult_t result = ncclSuccess; int rank = comm->rank; int nranks = comm->nRanks; // char nextPeerHandle[NCCL_NET_HANDLE_MAXSIZE]; @@ -607,6 +626,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { struct ncclSocket sock, listenSockRoot; struct extInfo info = {0}; union ringConnectInfo nextPeer; + bool performRasAddRanks = true; + struct rasRankInit* rasRanks = nullptr; uint64_t timers[BOOTSTRAP_INIT_TIME_N] = {0}; @@ -696,23 +717,45 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { // in case of failure, those resources will be free'd when calling bootstrapDestroy, so we can return immediatly NCCLCHECK(ncclCalloc(&state->peerProxyAddresses, nranks)); NCCLCHECK(ncclCalloc(&proxySocket, 1)); - NCCLCHECK(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy)); + NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), result, fail); - NCCLCHECK(ncclCalloc(&state->peerProxyAddressesUDS, nranks)); - 
NCCLCHECK(getUDS(state->peerProxyAddressesUDS + rank)); + NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), result, fail); + NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), result, fail); // create a socket for others to reach out (P2P) union ncclSocketAddress peerSocketAddress; - NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap)); - NCCLCHECK(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress))); + NCCLCHECKGOTO(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap), result, fail); + NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), result, fail); memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress)); + // Initialize RAS + if (ncclParamRasEnable() == 1) { + // The RAS thread will take care of freeing the memory allocated below. + NCCLCHECK(ncclCalloc(&rasRanks, nranks)); + memcpy(&rasRanks[rank].addr, &bootstrapNetIfAddr, sizeof(rasRanks[rank].addr)); + rasRanks[rank].pid = getpid(); + rasRanks[rank].cudaDev = comm->cudaDev; + rasRanks[rank].nvmlDev = comm->nvmlDev; + if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) { + INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); + // We should still participate in the ringAllInfo below as the peers will be waiting for us. + // Just make sure that the address is clearly invalid... + memset(rasRanks+rank, '\0', sizeof(*rasRanks)); + performRasAddRanks = false; + } + } + BOOTSTRAP_PROF_OPEN(timers[BOOTSTRAP_INIT_TIME_RING]); - NCCLCHECK(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS)); + NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, rasRanks), result, fail); BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_RING]); // Create the service proxy and get the UDS - NCCLCHECK(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS)); + NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), result, fail); + + if (ncclParamRasEnable() == 1 && performRasAddRanks) { + if (ncclRasAddRanks(rasRanks, nranks) != ncclSuccess) + INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); + } BOOTSTRAP_PROF_CLOSE(timers[BOOTSTRAP_INIT_TIME_TOTAL]); TRACE(NCCL_BOOTSTRAP, "rank %d nranks %d - DONE", rank, nranks); @@ -722,8 +765,11 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { timers[BOOTSTRAP_INIT_TIME_RECV] / 1e9, timers[BOOTSTRAP_INIT_TIME_RING] / 1e9, timers[BOOTSTRAP_INIT_TIME_DELAY] / 1e9); - - return ncclSuccess; +exit: + return result; +fail: + free(proxySocket); + goto exit; } ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks) { @@ -761,6 +807,11 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo union ncclSocketAddress peerSocketAddress; NCCLCHECK(createListenSocket(comm, comm->magic, &STATE_LISTEN(state, peerSocket), &peerSocketAddress, ncclSocketTypeBootstrap)); + if (ncclParamRasEnable() == 1) { + if (ncclRasCommInit(comm, nullptr) != ncclSuccess) + INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); + } + // Get addr from next rank using the parent's connections 
NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail); @@ -773,14 +824,14 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo NCCLCHECK(socketRingConnect(&nextPeer.addr, &STATE_RING(state, socket.send), &STATE_LISTEN(state, socket), &STATE_RING(state, socket.recv), comm->magic, state->abortFlag)); } - NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks * sizeof(union ncclSocketAddress)), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), ret, fail); memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress)); if (parent->config.splitShare) { /* map local rank to top parent local rank. */ for (int i = 0; i < nranks; ++i) { comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]]; } - NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL), ret, fail); + NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, NULL, NULL, NULL), ret, fail); } else { NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddresses, nranks), ret, fail); NCCLCHECKGOTO(ncclCalloc(&state->peerProxyAddressesUDS, nranks), ret, fail); @@ -788,7 +839,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo NCCLCHECKGOTO(ncclCalloc(&proxySocket, 1), ret, fail); NCCLCHECKGOTO(getUDS(state->peerProxyAddressesUDS + rank), ret, fail); NCCLCHECKGOTO(createListenSocket(comm, comm->magic, proxySocket, state->peerProxyAddresses + rank, ncclSocketTypeProxy), ret, fail); - NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail); + NCCLCHECKGOTO(ringAllInfo(comm, state, state->peerP2pAddresses, state->peerProxyAddresses, state->peerProxyAddressesUDS, NULL), ret, fail); NCCLCHECKGOTO(ncclProxyInit(comm, proxySocket, state->peerProxyAddresses, state->peerProxyAddressesUDS), ret, fail); } @@ -811,7 +862,7 @@ static ncclResult_t socketConnect(void* commState, int peer, int tag, struct ncc struct bootstrapState* state = (struct bootstrapState*)commState; struct socketAckInfo ack = (struct socketAckInfo){.rank = state->rank, .tag = tag}; - NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap), ret, fail); + NCCLCHECKGOTO(ncclSocketInit(sock, state->peerP2pAddresses + peer, state->magic, ncclSocketTypeBootstrap, state->abortFlag), ret, fail); NCCLCHECKGOTO(ncclSocketConnect(sock), ret, fail); NCCLCHECKGOTO(socketSend(sock, &ack, sizeof(struct socketAckInfo)), ret, fail); return ncclSuccess; diff --git a/src/collectives.cc b/src/collectives.cc index be9468d49..479d4c511 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -44,9 +44,9 @@ const char* ncclDatatypeToString(ncclDataType_t type) { case ncclFloat16: return "ncclFloat16"; case ncclFloat32: return "ncclFloat32"; case ncclFloat64: return "ncclFloat64"; -#if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: return "ncclBfloat16"; -#endif + case ncclFloat8e4m3: return "ncclFloat8e4m3"; + case ncclFloat8e5m2: return "ncclFloat8e5m2"; default: return "Unknown"; } } @@ -87,8 +87,7 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun struct ncclInfo info = { ncclFuncAllGather, "AllGather", sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ 
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, @@ -111,8 +110,7 @@ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, @@ -133,16 +131,14 @@ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, n struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } /* Deprecated original "in place" function, similar to MPI */ NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { - NCCLCHECK(ncclBroadcast(buff, buff, count, datatype, root, comm, stream)); - return ncclSuccess; + return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); } NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, @@ -166,8 +162,7 @@ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, struct ncclInfo info = { ncclFuncReduce, "Reduce", sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, @@ -189,8 +184,7 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; - NCCLCHECK(ncclEnqueueCheck(&info)); - return ncclSuccess; + return ncclEnqueueCheck(&info); } struct NvtxParamsSendRecv { @@ -212,12 +206,7 @@ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatyp struct ncclInfo info = { ncclFuncSend, "Send", NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; - ncclResult_t ret; - NCCLCHECK(ncclGroupStart()); - NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit); -exit: - NCCLCHECK(ncclGroupEnd()); - return ret; + return ncclEnqueueCheck(&info); } NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer, @@ -230,10 +219,5 @@ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ 1, 1 }; - ncclResult_t ret; - NCCLCHECK(ncclGroupStart()); - NCCLCHECKGOTO(ncclEnqueueCheck(&info), ret, exit); -exit: - NCCLCHECK(ncclGroupEnd()); - return ret; + return ncclEnqueueCheck(&info); } diff --git a/src/debug.cc b/src/debug.cc index 
d21ea3d12..2ea6eabde 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -8,6 +8,7 @@ #include "nccl_net.h" #include #include +#include #include #include #include @@ -89,6 +90,8 @@ static void ncclDebugInit() { mask = NCCL_REG; } else if (strcasecmp(subsys, "PROFILE") == 0) { mask = NCCL_PROFILE; + } else if (strcasecmp(subsys, "RAS") == 0) { + mask = NCCL_RAS; } else if (strcasecmp(subsys, "ALL") == 0) { mask = NCCL_ALL; } @@ -224,6 +227,19 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file } } +NCCL_API(void, ncclResetDebugInit); +void ncclResetDebugInit() { + // Cleans up from a previous ncclDebugInit() and reruns. + // Use this after changing NCCL_DEBUG and related parameters in the environment. + __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); + if (ncclDebugFile != stdout) { + fclose(ncclDebugFile); + ncclDebugFile = stdout; + } + ncclDebugLevel = -1; + ncclDebugInit(); +} + NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); void ncclSetThreadName(pthread_t thread, const char *fmt, ...) { diff --git a/src/device/all_gather.h b/src/device/all_gather.h index fb56e483b..5d79d7357 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -9,64 +9,88 @@ #include "primitives.h" namespace { - template + template __device__ __forceinline__ void runRing(int tid, int nthreads, struct ncclDevWorkColl* work) { ncclRing *ring = &ncclShmem.channel.ring; const int *ringRanks = ring->userRanks; const int nranks = ncclShmem.comm.nRanks; - size_t count, partOffset, partCount, chunkCount; + ssize_t count, partOffset, partCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &partOffset, &partCount, &chunkCount); - size_t offset; - size_t dataOffset; + ssize_t offset; + ssize_t dataOffset; int nelem; int rankDest; - + int workNthreads; T *inputBuf = (T*)work->sendbuff; T *outputBuf = (T*)work->recvbuff; - // Coverity reports that the callee treats &ring->next as an array. However, due to the use of - // FanSymmetric<1>, only the first element is ever accessed, so it's fine. - // coverity[callee_ptr_arith:FALSE] - Primitives, 1, Proto, 0> prims - (tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); - for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) { - /////////////// begin AllGather steps /////////////// - nelem = min(chunkCount, partCount - elemOffset); - dataOffset = partOffset + elemOffset; + // If isNetOffload == true, we only use 1 warp to drive Ring algo/network communication + // and the rest of warps proceed to copy src data into dst buffer in parallel when AG + // is not in-place. + if (isNetOffload) { + workNthreads = WARP_SIZE; + chunkCount = NCCL_MAX_NET_SIZE; + } else { + workNthreads = nthreads; + } - // step 0: push data to next GPU - rankDest = ringRanks[0]; - offset = dataOffset + rankDest * count; + if (tid < workNthreads) { + // Coverity reports that the callee treats &ring->next as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. + // coverity[callee_ptr_arith:FALSE] + Primitives, 1, Proto, 0, isNetOffload> prims + (tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work, NULL, isNetOffload ? 
NCCL_MAX_NET_SIZE : 0); + for (size_t elemOffset = 0; elemOffset < partCount; elemOffset += chunkCount) { + /////////////// begin AllGather steps /////////////// + nelem = min(chunkCount, partCount - elemOffset); + dataOffset = partOffset + elemOffset; + + // step 0: push data to next GPU + rankDest = ringRanks[0]; + offset = dataOffset + rankDest * count; - if (inputBuf + dataOffset == outputBuf + offset) { // In place - prims.directSend(dataOffset, offset, nelem); - } else { - prims.directCopySend(dataOffset, offset, nelem); - } + if ((inputBuf + dataOffset == outputBuf + offset) || isNetOffload) { // In place or onePPN + prims.directSend(dataOffset, offset, nelem); + } else { + prims.directCopySend(dataOffset, offset, nelem); + } + + // k-2 steps: copy to next GPU + for (int j = 1; j < nranks - 1; ++j) { + rankDest = ringRanks[nranks - j]; + offset = dataOffset + rankDest * count; + prims.directRecvCopyDirectSend(offset, offset, nelem); + } - // k-2 steps: copy to next GPU - for (int j=1; j + (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, partCount); } + // we have to wait for all warps before we can proceed to the next work; + // otherwise, we can have contention if next work will use the outputBuf + // in this work. We use bar 14 to avoid conflicts with prims barrier and + // __syncthread(). + if (isNetOffload) barrier_sync(14, nthreads); } } template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { - using Proto = ProtoSimple; - runRing(tid, nthreads, work); + bool isNetOffload = work->isOneRPN && work->netRegUsed; + if (isNetOffload) + runRing, true>(tid, nthreads, work); + else + runRing, false>(tid, nthreads, work); } }; @@ -96,7 +120,7 @@ struct RunWorkCollsendbuff; T *outputBuf = (T*)work->recvbuff; Primitives, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatAg); + (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg); PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); int last = 0; @@ -137,6 +161,7 @@ struct RunWorkCollnHeads * count, nelem, count, -1, 0); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } else if (tid < tidEndBcast) { // Bcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; @@ -148,6 +173,7 @@ struct RunWorkColl Coverity think prims.index can be greater than 1 } } else { /* direct allgather */ @@ -204,11 +230,11 @@ struct RunWorkCollchannelLo; char* inbuf = (char*)work->sendbuff; char* outbuf = (char*)work->recvbuff; - ssize_t sizePerRank = work->collnet.count*sizeof(T); - bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*sizePerRank); + ssize_t countPerRank = work->collnet.count*sizeof(T); + bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank*countPerRank); - ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); - ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); + ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*countPerRank); + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*countPerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; @@ -221,15 +247,15 @@ struct RunWorkColl (tid, tn, 
0, nullptr, false, /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* { - return work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset; + return work->regUsed && (recvDirectFlag & NCCL_P2P_READ) ? (char*)srcPtrs[src] + userOneBeg : (char*)srcPtrs[src] + railAllOffset; }, /*nDsts=*/outIsDst+nDsts, [=]__device__(int d) -> void* { return d < outIsDst ? outbuf + userOneBeg - : work->regUsed && (sendDirectFlag & NCCL_DIRECT_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg + : work->regUsed && (sendDirectFlag & NCCL_P2P_WRITE) ? (char*)dstPtrs[d-outIsDst] + userOneBeg : (char*)dstPtrs[d-outIsDst] + railAllOffset; }, delta); @@ -262,8 +288,9 @@ struct RunWorkCollchannelHi - work->channelLo + 1; struct ncclDirect* direct = &ncclShmem.channel.collnetDirect; int const &nNodes = ncclShmem.comm.nNodes; - ssize_t sizePerRank = work->collnet.count*sizeof(T); + ssize_t countPerRank = work->collnet.count; size_t chunkSize = work->collnet.chunkCount; + const int hasDn = (direct->down[0] >= 0) ? 1 : 0; bool isMultiRail = (direct->nHeads > 1); int nWarps1 = 1; int nWarps2 = (isMultiRail ? 2 : 1); @@ -277,9 +304,12 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (tid == 0) { - int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); + // If this rank has local peers (i.e, hasDn == true), we cannot offload all data to network. + // In this case, steps should be computed based on chunkSize and so on; otherwise, we just + // bump the step by 1 to kick off collnet progress. + int steps = hasDn ? (int)divUp(nNodes * countPerRank, nChannels * chunkSize) : 1; Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); } __syncwarp(); @@ -288,11 +318,11 @@ struct RunWorkColl, /*Direct=*/0, Proto, 0> prims(tid, tn, nullptr, &direct->out, work->sendbuff, nullptr, /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1); - for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) { ssize_t railAllBeg = railGridOffset + part * chunkSize; - ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); - ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; - ssize_t railOneEnd = railOneBeg + sizePerRank; + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank); + ssize_t railOneBeg = ncclShmem.comm.node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; ssize_t beg = max(railAllBeg, railOneBeg); ssize_t end = min(railAllEnd, railOneEnd); prims.send(beg - railOneBeg, max(ssize_t(0), end - beg)); @@ -304,10 +334,9 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed && !hasDn) { if (tid == 0) { - int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); + Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, 1); } __syncwarp(); } else { @@ -315,7 +344,7 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> prims(tid, tn, &direct->out, direct->heads + 1, nullptr, work->recvbuff, /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 0, 0, work); - for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += 
nChannels * chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; @@ -333,7 +362,7 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> prims(tid, tn, direct->heads+1, nullptr, nullptr, work->recvbuff, /*redOpArg=*/0, 2*Proto::MaxGroupWidth, 0, 0, work); - for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { + for (ssize_t railGridOffset=0; railGridOffset < nNodes*countPerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 36b8d3206..c6c131517 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -69,7 +69,7 @@ namespace { chunkOffset = chunk * chunkCount; offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecvCopyDirectSend(offset, nelem); + prims.directRecvCopyDirectSend(offset, offset, nelem); } // Make final copy from buffer to dest. @@ -139,7 +139,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecvCopyDirectSend(offset, nelem); + prims.directRecvCopyDirectSend(offset, offset, nelem); } } } @@ -222,7 +222,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecvCopyDirectSend(offset, nelem); + prims.directRecvCopyDirectSend(offset, offset, nelem); } } } @@ -268,22 +268,30 @@ struct RunWorkColl; if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) { // Scatter - Primitives, /*Direct=*/0, Proto, 0> + Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, work->sendbuff, work->recvbuff, - work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, work); + ssize_t offsetBase, peerOffset; + ssize_t maxNelems; + if (work->netRegUsed) { + offsetBase = bid * chunkSize; + maxNelems = size; // never be the min + peerOffset = nChannels * chunkSize; + } else { + offsetBase = bid * direct->nHeads * chunkSize; + maxNelems = direct->nHeads * chunkSize; + peerOffset = chunkSize; + } + // For collnet UB case, we need to organize buffers differently for contiguous buffer access + // across channels. This access pattern should be consistent with code in coll_net.cc for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; - int nelem = min(direct->nHeads*chunkSize, size-offset); - if (work->regUsed) { - prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); - } else { - prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); - } + ssize_t offset = gridOffset + offsetBase; + ssize_t nelem = min(maxNelems, size - offset); + prims.scatter(offset, nelem, chunkSize, peerOffset, direct->headRank, direct->shift); } // Coverity complains about a possible overrun inside the destructor of "prims", but that's actually // a false positive. 
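The hunk above switches the CollNetDirect scatter/gather to a per-head-contiguous layout whenever the user buffer is registered with the network (netRegUsed), as the added comment about "contiguous buffer access across channels" states. As an illustration only, the helper below is hypothetical and not part of NCCL, but its arithmetic mirrors the offsetBase/peerOffset values chosen in the diff: it computes where the slice consumed by head h on channel bid lands under each layout.

// Illustrative sketch of the two offset layouts used by the scatter above.
// Names (regOffset/defOffset) are hypothetical; the formulas follow the
// offsetBase/peerOffset assignments in the hunk.
#include <cstddef>

// Registered-buffer layout (work->netRegUsed): each head owns a contiguous
// run of nChannels chunks, so the network sees one contiguous region per head.
static size_t regOffset(size_t gridOffset, int bid, int h,
                        int nChannels, size_t chunkSize) {
  return gridOffset + bid * chunkSize          // offsetBase
                    + h * (nChannels * chunkSize); // peerOffset per head
}

// Default layout (no registration): heads interleave inside each channel's
// nHeads*chunkSize block.
static size_t defOffset(size_t gridOffset, int bid, int h,
                        int nHeads, size_t chunkSize) {
  return gridOffset + bid * (nHeads * chunkSize) // offsetBase
                    + h * chunkSize;             // peerOffset per head
}

With the registered layout the nChannels chunks belonging to one head are adjacent in the user buffer, which is what lets the collnet path hand the plugin a single contiguous range per head instead of strided pieces.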
@@ -291,24 +299,20 @@ struct RunWorkColl= tidStartReduce && direct->out != -1) { if (hasDn) { // Reduce, send to network - Primitives, /*Direct=*/0, Proto, 0> + Primitives, /*Direct=*/1, Proto, 0> prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, work->sendbuff, work->recvbuff, - work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); + work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - if (work->regUsed) { - prims.directRecvReduceSend(offset, nelem); - } else { - prims.recvReduceSend(offset, nelem); - } + ssize_t offset = work->netRegUsed ? gridOffset + (bid + direct->headRank * nChannels) * chunkSize + : gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.recvReduceDirectSend(offset, offset, nelem); } } else { // Directly send to network - if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (tid == tidStartReduce) { - int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); + Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, 1); } __syncwarp(); } else { @@ -316,8 +320,8 @@ struct RunWorkCollout, work->sendbuff, work->recvbuff, work->redOpArg, 3*Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); + ssize_t offset = gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); prims.send(offset, nelem); } } @@ -327,10 +331,21 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> prims(tid, nThreadsGather, direct->up, NULL, work->sendbuff, work->recvbuff, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, work); + ssize_t offsetBase, peerOffset; + ssize_t maxNelems; + if (work->netRegUsed) { + offsetBase = bid * chunkSize; + maxNelems = size; // never be the min + peerOffset = nChannels * chunkSize; + } else { + offsetBase = bid * direct->nHeads * chunkSize; + maxNelems = direct->nHeads * chunkSize; + peerOffset = chunkSize; + } for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize; - int nelem = min(direct->nHeads*chunkSize, size-offset); - prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift); + ssize_t offset = gridOffset + offsetBase; + ssize_t nelem = min(maxNelems, size - offset); + prims.directGather(offset, nelem, chunkSize, peerOffset, direct->headRank, direct->shift); } } else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) { if (hasDn) { @@ -342,15 +357,15 @@ struct RunWorkCollout, direct->down, work->sendbuff, work->recvbuff, work->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize; - int nelem = min(chunkSize, size-offset); - prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true); + ssize_t offset = work->netRegUsed ? 
gridOffset + (bid + direct->headRank * nChannels) * chunkSize + : gridOffset + (bid * direct->nHeads + direct->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvCopyDirectSend(offset, offset, nelem, /*postOp=*/true); } } else { - if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (tid == tidStartBcast) { - int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); + Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, 1); } __syncwarp(); } else { @@ -394,8 +409,6 @@ struct RunWorkCollnHeads * chunkSize; - ssize_t offset; - int nelem; int remCount = channelCount%(nvls->nHeads*chunkSize); int lastChunkSize = alignUp(divUp(remCount, nvls->nHeads), 16384/sizeof(T)); @@ -407,8 +420,8 @@ struct RunWorkCollredOpArg, 0 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; - offset = gridOffset + elemOffset; - nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + ssize_t offset = gridOffset + elemOffset; + int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndGather) { @@ -419,8 +432,8 @@ struct RunWorkCollredOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; - offset = gridOffset + elemOffset; - nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); + ssize_t offset = gridOffset + elemOffset; + int nelem = work->regUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce) { @@ -430,7 +443,8 @@ struct RunWorkColldown, &nvls->down, NULL, NULL, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t elemOffset = 0; elemOffset < channelCount; elemOffset += loopCount) { - ssize_t chunkOffset; + ssize_t chunkOffset, offset; + int nelem; if (channelCount - elemOffset < loopCount) chunkSize = lastChunkSize; chunkOffset = elemOffset + nvls->headRank * chunkSize; offset = gridOffset + chunkOffset; @@ -456,6 +470,7 @@ struct RunWorkCollregUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } else if (tid < tidEndGather) { // Gather using Proto = ProtoSimple<1, 1, COLL_UNROLL>; @@ -464,38 +479,23 @@ struct RunWorkCollredOpArg, 1 * Proto::MaxGroupWidth, 1, 1); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { ssize_t offset = gridOffset + bid * nvls->nHeads * chunkSize; - int nelem = work->regUsed ? 0 :min(nvls->nHeads * chunkSize, size - offset); + int nelem = work->regUsed ? 0 : min(nvls->nHeads * chunkSize, size - offset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } } else if (tid < tidEndReduce && nvls->headRank != -1) { - if (!hasOut) { - // Reduce, broadcast through NVLS - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; - // Coverity complains about a possible overrun inside the class below, but that's actually - // a false positive. 
- // coverity[identity_transfer:FALSE] - Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL, - work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0, work); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; - int nelem = min(chunkSize, size - offset); - prims.directRecvDirectSend(offset, offset, nelem); - } - } else { - // Reduce, send to network - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; - // Coverity complains about a possible overrun inside the class below, but that's actually - // a false positive. - // coverity[identity_transfer:FALSE] - Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL, - work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; - int nelem = min(chunkSize, size - offset); - prims.directRecvDirectSend(offset, offset, nelem); - } + // Reduce, send to network + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + // Coverity complains about a possible overrun inside the class below, but that's actually + // a false positive. + // coverity[identity_transfer:FALSE] + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, work->recvbuff, + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = work->regUsed && work->netRegUsed ? gridOffset + (nvls->headRank * nChannels + bid) * chunkSize + : gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + int nelem = min(chunkSize, size - offset); + prims.directRecvDirectSend(offset, offset, nelem); } } else if (tid < tidEndBcast && nvls->headRank != -1) { // Recv from network, broadcast @@ -504,10 +504,11 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> - prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL, + prims(tid - tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, work->recvbuff, work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - ssize_t offset = gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; + ssize_t offset = work->regUsed && work->netRegUsed ? 
gridOffset + (nvls->headRank * nChannels + bid) * chunkSize + : gridOffset + (bid * nvls->nHeads + nvls->headRank) * chunkSize; int nelem = min(chunkSize, size - offset); prims.directRecvDirectSend(offset, offset, nelem); } @@ -660,10 +661,9 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (groupTid == 0) { - int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, steps); + Primitives, /*Direct=*/1, Proto, 0>::sendPeerNotify(send, connIndex, 1); } __syncwarp(); } else { @@ -673,8 +673,10 @@ struct RunWorkColl Coverity think prims.index can be greater than 1 prims.directSend(offset, offset, nelem); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } } else { Primitives, /*Direct=*/1, Proto, 0> @@ -683,18 +685,19 @@ struct RunWorkColl Coverity think prims.index can be greater than 1 prims.directRecvReduceDirectSend(offset, offset, nelem); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } } else { if (recv == nranks) { // I'm the first in the broadcast chain, I need to perform the division (postOp) if (send == -1) { - if (work->regUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (groupTid == 0) { - int steps = (int)divUp(size * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, steps); + Primitives, /*Direct=*/1, Proto, 0>::recvPeerNotify(recv, connIndex, 1); } __syncwarp(); } else { @@ -720,7 +723,7 @@ struct RunWorkColluserRanks[0]; const int nextRank = ring->userRanks[1]; const int root = work->root; - size_t chunkCount; - size_t channelCount; - size_t gridOffset; - ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (size_t*)nullptr, &gridOffset, &channelCount, &chunkCount); + ssize_t chunkCount; + ssize_t channelCount; + ssize_t gridOffset; + ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), (ssize_t*)nullptr, &gridOffset, &channelCount, &chunkCount); size_t offset; int nelem; + int workNthreads; + bool isNetOffload = work->isOneRPN && work->netRegUsed; T *inputBuf = (T*)work->sendbuff; T *outputBuf = (T*)work->recvbuff; - // Coverity reports that the callee treats &ring->next as an array. However, due to the use of - // FanSymmetric<1>, only the first element is ever accessed, so it's fine. - // coverity[callee_ptr_arith:FALSE] - Primitives, 1, Proto, 0> - prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); + workNthreads = isNetOffload ? WARP_SIZE : nthreads; - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); + if (tid < workNthreads) { + // Coverity reports that the callee treats &ring->next as an array. However, due to the use of + // FanSymmetric<1>, only the first element is ever accessed, so it's fine. 
+ // coverity[callee_ptr_arith:FALSE] + Primitives, 1, Proto, 0> + prims(tid, workNthreads, &ring->prev, &ring->next, inputBuf, outputBuf, work->redOpArg, 0, 0, 0, work); - if (rank == root) { - if (inputBuf == outputBuf) { - prims.directSend(offset, offset, nelem); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + + if (rank == root) { + if (inputBuf == outputBuf || isNetOffload) { + prims.directSend(offset, offset, nelem); + } else { + prims.directCopySend(offset, offset, nelem); + } + } else if (nextRank == root) { + prims.directRecv(offset, offset, nelem); } else { - prims.directCopySend(offset, offset, nelem); + prims.directRecvCopyDirectSend(offset, offset, nelem); } - } else if (nextRank == root) { - prims.directRecv(offset, offset, nelem); - } else { - prims.directRecvCopyDirectSend(offset, nelem); } + } else if (inputBuf != outputBuf && rank == root) { + inputBuf = inputBuf + gridOffset; + outputBuf = outputBuf + gridOffset; + reduceCopy + (tid - workNthreads, nthreads - workNthreads, work->redOpArg, &work->redOpArg, false, 1, (void**)&inputBuf, 1, (void**)&outputBuf, channelCount); } + if (isNetOffload) barrier_sync(14, nthreads); } } diff --git a/src/device/common.h b/src/device/common.h index 967421b7d..05465ff5a 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -396,6 +396,9 @@ __device__ void ncclDevFunc_Nop(); ncclKernelMain, algo, proto>>(&args4K.args); \ } +#define DEFINE_ncclDevKernel_nop(suffix, coll, redop, ty, algo, proto, specializedFnId) \ + __global__ void ncclDevKernel_##suffix(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K) {} + #define DEFINE_ncclDevFunc(suffix, coll, redop, ty, algo, proto) \ __device__ void ncclDevFunc_##suffix() { \ RunWorkBatch, algo, proto>().run(); \ diff --git a/src/device/common_kernel.h b/src/device/common_kernel.h index f932f51f0..00bb1e333 100644 --- a/src/device/common_kernel.h +++ b/src/device/common_kernel.h @@ -65,19 +65,23 @@ __device__ __forceinline__ void reduceCopyPacks( uintptr_t minSrcs[MinSrcs + !MinSrcs]; uintptr_t minDsts[MinDsts + !MinDsts]; #pragma unroll - for (int s=0; s < MinSrcs; s++) + for (int s=0; s < MinSrcs; s++) { minSrcs[s] = cvta_to_global(srcPtrFn(s)) + threadBytesBehind; + } + #pragma unroll - for (int d=0; d < MinDsts; d++) + for (int d=0; d < MinDsts; d++) { // Yes, for some template arguments this code will be unreachable. That's fine. // coverity[dead_error_line] minDsts[d] = cvta_to_global(dstPtrFn(d)) + threadBytesBehind; + } // We dictate loop termination condition according to whether partial hunks // can be handled or not. while (Unroll==1 ? (BytePerPack <= threadBytesAhead) : (0 < nHunksAhead)) { BytePack acc[Unroll]; + // minSrcs[0] cannot be nullptr so we always process it { RedFn preFn(0 < PreOpSrcs ? 
preOpArgs[0] : 0); #pragma unroll Unroll for (int u=0; u < Unroll; u++) { @@ -163,7 +167,8 @@ __device__ __forceinline__ void reduceCopyPacks( } } for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) { - uintptr_t dst = cvta_to_global(dstPtrFn(d)) + threadBytesBehind; + uintptr_t dstPtr = cvta_to_global(dstPtrFn(d)); + uintptr_t dst = dstPtr + threadBytesBehind; #pragma unroll Unroll for (int u=0; u < Unroll; u++) { st_global(dst, acc[u]); @@ -173,11 +178,15 @@ __device__ __forceinline__ void reduceCopyPacks( nWarps = nThreads/WARP_SIZE; #pragma unroll - for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk; + for (int s=0; s < MinSrcs; s++) { + minSrcs[s] += (nWarps-1)*BytePerHunk; + } #pragma unroll // Yes, for some template arguments this code will be unreachable. That's fine. // coverity[dead_error_line] - for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk; + for (int d=0; d < MinDsts; d++) { + minDsts[d] += (nWarps-1)*BytePerHunk; + } threadBytesBehind += nWarps*BytePerHunk; threadBytesAhead -= nWarps*BytePerHunk; nHunksAhead -= nWarps; diff --git a/src/device/generate.py b/src/device/generate.py index a0d225946..b69a2d7cc 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -5,7 +5,7 @@ # Order of redops, tys, protos, algos must match src/include/device.h all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"] all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"] -all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16"] +all_tys = ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"] all_protos = ["LL","LL128","SIMPLE"] all_algos = ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE","PAT"] @@ -107,6 +107,9 @@ def required_cuda(coll, redop, ty, algo, proto): if coll in ("AllReduce","Reduce","ReduceScatter"): if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None if ty=="bf16": cudart = max(cudart, 11000) + if ty.startswith("f8"): + cudart = max(cudart, 11080) + arch = max(arch, 900) if "NVLS" in algo: if coll in ("AllReduce","Reduce","ReduceScatter"): @@ -125,7 +128,7 @@ def required_cuda(coll, redop, ty, algo, proto): def equivalent_primary(coll, redop, ty, algo, proto): if coll in ("AllReduce", "Reduce", "ReduceScatter"): # map signed integer sum/prod to unsigned - if redop in ("Sum","Prod","PreMulSum") and ty[0]=="i": + if redop in ("Sum","Prod","PreMulSum","SumPostDiv") and ty[0]=="i": return (coll, redop, "u"+ty[1:], algo, proto) # map signed integer min/max to unsigned for non-NVLS if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo): @@ -365,7 +368,9 @@ def partition_by_name(fns): "f16": "half", "f32": "float", "f64": "double", - "bf16": "__nv_bfloat16" + "bf16": "__nv_bfloat16", + "f8e4m3": "__nv_fp8_e4m3", + "f8e5m2": "__nv_fp8_e5m2" } # Generate each /.cu: @@ -385,15 +390,23 @@ def partition_by_name(fns): sym = paste("_", coll, redop, ty, algo, proto) fn_id = primary_to_index[kfn] cudart, arch = required_cuda(*kfn) + s = "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" if (cudart, arch) != (0, 0): - out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch)) - out( - "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" - .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty], - algo=(algo or "RING"), proto=(proto or "SIMPLE"), 
fn_id=fn_id) - ) - if (cudart, arch) != (0, 0): - out("#endif\n") + # Add conditional compilation logic around s. If CUDART_VERSION is satisfactory + # we must compile a kernel regardless of __CUDA_ARCH__ since the host code has + # to link against some stub. + s = "#if CUDART_VERSION >= {cudart}\n" \ + " #if __CUDA_ARCH__ < {arch}\n" \ + " DEFINE_ncclDevKernel_nop({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" \ + " #else\n" \ + " " + s + \ + " #endif\n" \ + "#endif\n" + out(s.format( + cudart=cudart, arch=arch, sym=sym, coll=coll, + redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty], + algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id + )) for fn in fns: (coll, redop, ty, algo, proto) = fn diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h index e76099821..941b4328d 100644 --- a/src/device/network/unpack/unpack.h +++ b/src/device/network/unpack/unpack.h @@ -33,17 +33,21 @@ inline __device__ void load64gpu(const uint64_t* ptr, uint64_t &v) { // Map internal association of handle with group and peer index (called once at init time) inline __device__ void ncclNetDeviceUnpackSetup(void* ohandle, const int group, const int index) { struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; + // coverity[index_parm:FALSE] ncclShmem.groups[group].devicePlugin.unpack.g_meta[index] = handle->meta; ncclShmem.devicePlugin.unpack.bounce_buf = handle->bounce_buf; + // coverity[index_parm:FALSE] ncclShmem.groups[group].devicePlugin.unpack.head[index] = handle->head; } inline __device__ void ncclNetDeviceIncrementHead(const int group, const int index) { + // coverity[index_parm:FALSE] ncclShmem.groups[group].devicePlugin.unpack.head[index]++; } inline __device__ void ncclNetDeviceSaveHead(void* ohandle, const int group, const int index) { struct unpackNetDeviceHandle* handle = (struct unpackNetDeviceHandle*) ohandle; + // coverity[index_parm:FALSE] handle->head = ncclShmem.groups[group].devicePlugin.unpack.head[index]; } diff --git a/src/device/onerank.cu b/src/device/onerank.cu index 5ff4a85b1..c187dcc44 100644 --- a/src/device/onerank.cu +++ b/src/device/onerank.cu @@ -62,6 +62,10 @@ ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct case ncclUint32: kernel = (void const*)&oneRankReduce>; break; case ncclInt64: kernel = (void const*)&oneRankReduce>; break; case ncclUint64: kernel = (void const*)&oneRankReduce>; break; + #if defined(__CUDA_FP8_TYPES_EXIST__) && __CUDA_ARCH__ >= 900 + case ncclFloat8e4m3: kernel = (void const*)&oneRankReduce>; break; + case ncclFloat8e5m2: kernel = (void const*)&oneRankReduce>; break; + #endif case ncclFloat16: kernel = (void const*)&oneRankReduce>; break; #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: kernel = (void const*)&oneRankReduce>; break; diff --git a/src/device/primitives.h b/src/device/primitives.h index 1913640e8..73c10c264 100644 --- a/src/device/primitives.h +++ b/src/device/primitives.h @@ -103,7 +103,7 @@ struct FanSymmetric { }; // The primitives class. Specialized per protocol in the other headers. -template +template class Primitives; // Used by LL & LL128 to implement direct members in the naive way. 
@@ -121,9 +121,12 @@ struct PrimitivesWithoutDirect { __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { static_cast(this)->copySend(inpIx, outIx, eltN, postOp); } - __device__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { + __device__ void directRecvCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { static_cast(this)->recvCopySend(outIx, eltN, /*postOp=*/false); } + __device__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + return; + } __device__ void recvReduceCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { // Direct is only for the send part static_cast(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp); diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 1a1307f5c..3e00f3b85 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -4,9 +4,9 @@ * See LICENSE.txt for license information ************************************************************************/ -template -class Primitives: - public PrimitivesWithoutDirect> { +template +class Primitives: + public PrimitivesWithoutDirect> { // In the case of Fan::MaxRecv == 0, we need to force MaxRecv to 1 for this to compile // This is because of a recv buffer which is allocated to MaxRecv length in send-only cases diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 2cb10cc49..617b7acf3 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -8,9 +8,9 @@ #define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1) -template -class Primitives: - public PrimitivesWithoutDirect> { +template +class Primitives: + public PrimitivesWithoutDirect> { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 945878b76..005101940 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -14,9 +14,9 @@ enum primsMode { }; template + int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts, bool isNetOffload> class Primitives< - T, RedOp, Fan, Direct, ProtoSimple, P2p + T, RedOp, Fan, Direct, ProtoSimple, P2p, isNetOffload > { static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend; static constexpr int Input=0, Output=1; @@ -34,11 +34,7 @@ class Primitives< PatMode = 0x800, NvlsMinPolling = 0x1000, NetDeviceUnpack = 0x2000, - AnyNetDeviceUnpack = 0x4000, - NvlsDirectRead = 0x8000, - NvlsDirectWrite = 0x10000, - IpcWrite = 0x20000, - IpcRead = 0x40000; + AnyNetDeviceUnpack = 0x4000; const int tid, tidInBlock; const int nthreads; int nworkers; @@ -119,12 +115,9 @@ class Primitives< template __device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) { const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send; - const bool noRecvWait = DirectRecv && Src && (flags & (DirectRead | IpcRead)); // no wait when directly reading from remote input - const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write // Yes, for some template arguments this code will be unreachable. That's fine. 
// coverity[dead_error_line] - if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) || - ((flags & (Send*RoleWaitSend)) && !noSendWait)) { + if ((flags & (Recv * RoleWaitRecv)) || (flags & (Send * RoleWaitSend))) { int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); @@ -134,27 +127,38 @@ class Primitives< } if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) { - if (flags & ConnFifoEnabled) + if ((flags & ConnFifoEnabled) && (flags & (Send * RoleWaitSend))) connFifo[step%NCCL_STEPS].size = nelts*sizeof(T); void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst) : (ncclShmem.groups[group].srcs + Src); if (flags & NetRegMode) { - // Do nothing + if (P2p) { + ptrs[index] = NULL; + } else { + if (isSendNotRecv) { + if (!Recv) + ptrs[index] = NULL; + else + ptrs[index] = (T*)ncclShmem.groups[group].userOutput + dstIx + offset; + } else { + ptrs[index] = (T*)ncclShmem.groups[group].userOutput + srcIx + offset; + } + } } else if ((flags & ConnFifoEnabled) && connFifo[step%NCCL_STEPS].mode == NCCL_MODE_OFFSET) { ptrs[index] = connEltsFifo + loadInt(&connFifo[step%NCCL_STEPS].offset)/sizeof(T); } else if (isSendNotRecv && DirectSend) { - if (flags & (DirectWrite | NvlsDirectWrite | IpcWrite)) { + if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; - } else if ((flags & DirectRead) || (flags & IpcRead)) { // empty send + } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } else if (!isSendNotRecv && DirectRecv) { - if (flags & (DirectRead | NvlsDirectRead | IpcRead)) { + if (flags & DirectRead) { ptrs[index] = directBuff + srcIx + offset; - } else if ((flags & DirectWrite) || (flags & IpcWrite)) { + } else if (flags & DirectWrite) { ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; @@ -198,7 +202,7 @@ class Primitives< int slice = 0; int offset = 0; - if (tid < nworkers && offset < nelem && ((flags & NetRegMode) == 0)) { + if (tid < nworkers && offset < nelem && !isNetOffload) { // Worker-only loop for non-empty slices. Non-workers and empty slices are // processed in the loop following this if block. The benefit of splitting // the loop like this is we pull two branches out of the critical path. @@ -252,7 +256,7 @@ class Primitives< * so we need to check whether MultimemSrcs and MultimemDsts are 0. */ && MultimemSrcs == 0 && MultimemDsts == 0 && !Src) { // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy - if (Send) { + if (Send && Dst && ncclShmem.groups[group].srcs[0] != ncclShmem.groups[group].dsts[1]) { reduceCopy (tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false, 1, ncclShmem.groups[group].srcs, @@ -269,16 +273,32 @@ class Primitives< } else if (ncclShmem.groups[group].srcs[0] && ncclShmem.groups[group].dsts[0]) { constexpr int PreOpSrcs = SrcBuf != Input ? 0 : DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? 
(1+NCCL_MAX_DIRECT_ARITY) : 1; - reduceCopy - (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, - Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs, - Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts, - workSize); + if (Send && Dst && ncclShmem.groups[group].dsts[1] == nullptr) { + // this case should only be directCopySend() with registered buffers and send to net peer + reduceCopy + (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, + Recv * fan.nrecv() + Src, ncclShmem.groups[group].srcs, + 1, ncclShmem.groups[group].dsts, + workSize); + } else { + reduceCopy + (tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, + Recv * fan.nrecv() + Src, ncclShmem.groups[group].srcs, + Send * fan.nsend() + Dst, ncclShmem.groups[group].dsts, + workSize); + } + } else { + // we will come here when calling prims.directSend with net peer, + // in this case, ncclShmem.groups[group].dsts[0] == NULL, so we + // skip data flush. + workSize = 0; } barrier(); // This barrier has a counterpart in following loop - postPeer(0 < sliceSize); + postPeer(0 < workSize); offset += sliceSize; slice += 1; // Yes, for some template arguments this code will be unreachable. That's fine. @@ -295,10 +315,11 @@ class Primitives< sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset; { // Only workers could have Wait roles so we know the slice must be empty // since we've exited the loop above. - waitPeer(0, 0, 0, 0); + waitPeer(0, 0, 0, sliceSize); } barrier(); // Has couterpart in preceding worker-only loop. - postPeer(0 < sliceSize); + int workSize = ncclShmem.aborted ? 0 : sliceSize; + postPeer(0 < workSize); offset += sliceSize; slice += 1; } @@ -347,17 +368,17 @@ class Primitives< ptrs[index] = connEltsFifo + offset/sizeof(T); } else if (Direct && fn.work->regUsed) { if (isSendNotRecv) { - if (flags & (DirectWrite | IpcWrite)) { + if (flags & DirectWrite) { ptrs[index] = directBuff; - } else if (flags & (DirectRead | IpcRead)) { // empty send + } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; } } else { - if (flags & (DirectRead | IpcRead)) { + if (flags & DirectRead) { ptrs[index] = directBuff; - } else if (flags & (DirectWrite | IpcWrite)) { + } else if (flags & DirectWrite) { if (Send) ptrs[index] = directBuff; // send to next from my output buffer else @@ -440,7 +461,7 @@ class Primitives< int i = (j+shift)%fan.nsend(); ssize_t pOffset = i*peerOffset; // Skip the data I am responsible of reducing myself - if (skip >= 0 && i >= skip) pOffset += peerElem; + if (skip >= 0 && i >= skip) pOffset += peerOffset; void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset; ssize_t realPeerSize = min(realSize, totalElem-pOffset); if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) { @@ -452,7 +473,7 @@ class Primitives< } else if (Recv) { if (tid==0) ncclShmem.groups[group].dsts[0] = (T*)ncclShmem.groups[group].userOutput + outIx + offset; ssize_t pOffset = index*peerOffset; - if (skip >= 0 && index >= skip) pOffset += peerElem; + if (skip >= 0 && index >= skip) pOffset += peerOffset; // Adjust remote index with peer offset in case we are directly pulling from peer's output buffer waitPeer(outIx+pOffset, outIx+pOffset, offset, realSize); subBarrier(); @@ -460,7 +481,7 @@ class Primitives< for (int j=0; j= 0 && i >= skip) pOffset += peerElem; + if (skip >= 0 && i >= skip) pOffset += peerOffset; void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset; ssize_t 
realPeerSize = min(realSize, totalElem-pOffset); if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0; @@ -474,7 +495,7 @@ class Primitives< } } - __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) { + __device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int ipcRegFlag, int netRegFlag) { conn = &peer->recv[connIndex]; if (conn->netDeviceHandle.netDeviceType == NCCL_NET_DEVICE_UNPACK) { // handle must be a device ptr @@ -499,33 +520,34 @@ class Primitives< if (conn->connFifo != nullptr) { flags |= ConnFifoEnabled; connFifo = conn->connFifo; - } else if (Direct && regFlag) { - // User buffers have been registered - if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) { - if (P2p) { - flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead; - } else if (connIndex == 1 && direct) { - flags |= IpcRead; - } else { - flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite; + } + if (Direct) { + if (ipcRegFlag) { + // User buffers have been registered + if (conn->flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) { + if (P2p) { + flags |= conn->flags & NCCL_P2P_WRITE ? DirectWrite : DirectRead; + } else if (connIndex == 1 && direct) { + flags |= DirectRead; + } else { + flags |= direct & NCCL_P2P_READ ? DirectRead : DirectWrite; + } + } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { + /* NVLS direct */ + flags |= DirectRead; } - } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) { - if (P2p) { - flags |= conn->flags & NCCL_DIRECT_WRITE ? DirectWrite : DirectRead; - } else if (connIndex == 1 && direct) { - flags |= DirectRead; // scatter-reduce use direct pull - } else { - flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite; + } + if (netRegFlag) { + if (conn->flags & NCCL_DIRECT_NIC) { + flags |= NetRegMode; + connFifo[step % NCCL_STEPS].size = 0; } - } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { - /* NVLS direct */ - flags |= NvlsDirectRead; } } } } - __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int regFlag) { + __device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, uint32_t direct, int ipcRegFlag, int netRegFlag) { conn = &peer->send[connIndex]; step = conn->step; step = roundUp(step, SlicePerChunk*StepPerSlice); @@ -544,27 +566,26 @@ class Primitives< connStepCache = loadStepValue(connStepPtr); connStepSize = conn->stepSize/sizeof(T); connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; - if (connFifo == nullptr && Direct && regFlag) { - // User buffers have been registered - if (conn->flags & (NCCL_IPC_READ | NCCL_IPC_WRITE)) { - if (P2p) { - flags |= conn->flags & NCCL_IPC_WRITE ? IpcWrite : IpcRead; - } else if (connIndex == 1 && direct) { - flags |= IpcRead; - } else { - flags |= direct & NCCL_DIRECT_READ ? IpcRead : IpcWrite; + if (Direct) { + if (ipcRegFlag) { + // User buffers have been registered + if (conn->flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) { + if (P2p) { + flags |= conn->flags & NCCL_P2P_WRITE ? DirectWrite : DirectRead; + } else if (connIndex == 1 && direct) { + flags |= DirectRead; // scatter-reduce use direct pull + } else { + flags |= direct & NCCL_P2P_READ ? DirectRead : DirectWrite; + } + } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { + /* NVLS direct */ + flags |= DirectWrite; } - } else if (conn->flags & (NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) { - if (P2p) { - flags |= conn->flags & NCCL_DIRECT_WRITE ? 
DirectWrite : DirectRead; - } else if (connIndex == 1 && direct) { - flags |= DirectRead; // scatter-reduce use direct pull - } else { - flags |= direct & NCCL_DIRECT_READ ? DirectRead : DirectWrite; + } + if (netRegFlag) { + if (conn->flags & NCCL_DIRECT_NIC) { + flags |= NetRegMode; } - } else if ((conn->flags & NCCL_NVLS_MIN_POLL)) { - /* NVLS direct */ - flags |= NvlsDirectWrite; } } } @@ -574,8 +595,8 @@ class Primitives< __device__ Primitives( int tid, int nthreads, int const *recvPeers, int const *sendPeers, void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0, - uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* e = nullptr, - bool ipcReg = false, bool netReg = false, int stepSize_ = 0, int mode = primsModeDefault + uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclDevWorkColl* collWork = nullptr, + struct ncclDevWorkP2p* p2pWork = nullptr, int stepSize_ = 0, int mode = primsModeDefault ): tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { @@ -643,11 +664,23 @@ class Primitives< // Coverity thinks that index could be -1 here but that's not actually the case. // coverity[negative_returns:FALSE] - if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e ? e->direct : 0, e ? e->regUsed : ipcReg); - // coverity[negative_returns:FALSE] - if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e ? e->direct : 0, e ? e->regUsed : ipcReg); - - if (netReg) flags |= NetRegMode; + int sendIpcReg; + int recvIpcReg; + int sendNetReg; + int recvNetReg; + if (P2p) { + sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; + recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; + sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; + recvNetReg = p2pWork ? p2pWork->recvNetReg : 0; + } else { + recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; + recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; + } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); if (barrierAny(flags & NetDeviceUnpack)) { flags |= AnyNetDeviceUnpack; @@ -659,8 +692,10 @@ class Primitives< } } - // coverity[negative_returns:FALSE] - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)e, (uint8_t)(e ? 
e->regUsed : ipcReg), peer); + // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case + // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); + // coverity[uninit_member] => coverity thinks fan.n is not initialized } __device__ ~Primitives() { @@ -683,6 +718,16 @@ class Primitives< // Make sure all threads are done writing back conn->step and done using // ncclShmem.groups[group] barrier(); + + if ((flags & DirectRead) && (flags & RoleWaitSend) && P2p) { + // For sendrecv DirectRead, sender needs to wait for receiver reading data from src. + // This has to be done after barrier() since post thread might have contention with + // this check. + int spins = 0; + volatile uint64_t* tail = conn->tail; + volatile uint64_t* head = conn->head; + while (*tail > *head) if (checkAbort(spins)) break; + } } __device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclDevWorkCollReg* work, uint8_t ipcReg, int peer) { @@ -693,10 +738,10 @@ class Primitives< } if (Direct && ipcReg) { - bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite || flags & IpcWrite); - bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite || flags & IpcWrite || flags & NvlsDirectWrite); - bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead || flags & IpcRead); // sender provides direct buffer (to be fetched) - bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead || flags & IpcRead || flags & NvlsDirectRead); // receiver accepts direct buffer + bool recvProvider = (flags & RoleWaitRecv) && (flags & DirectWrite); + bool sendAcceptor = (flags & RoleWaitSend) && (flags & DirectWrite); + bool sendProvider = (flags & RoleWaitSend) && (flags & DirectRead); // sender provides direct buffer (to be fetched) + bool recvAcceptor = (flags & RoleWaitRecv) && (flags & DirectRead); // receiver accepts direct buffer if (recvProvider) { int spins = 0; void* volatile* slot = ncclShmem.groups[group].recvConns[index]->ptrExchange; @@ -709,6 +754,7 @@ class Primitives< exchgPtr = (T*)outputBuf; } else { int localPeer = ncclShmem.comm.rankToLocalRank[peer]; + // coverity[deref_parm:FALSE] => work cannot be NULL if ipcReg != NULL exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]); } *slot = reinterpret_cast(exchgPtr); @@ -727,6 +773,7 @@ class Primitives< directBuff = reinterpret_cast(ptr); *slot = nullptr; } else { + // coverity[var_deref_op] directBuff = (T*)work->dnOutputs[index]; } } @@ -747,8 +794,10 @@ class Primitives< } else { int localPeer = ncclShmem.comm.rankToLocalRank[peer]; if (MaxRecv == 0) + // coverity[var_deref_op] exchgPtr = (T*)(work->coll.sendbuffOffset + work->coll.sendbuffRmtAddrs[localPeer]); else + // coverity[var_deref_op] exchgPtr = (T*)(work->coll.recvbuffOffset + work->coll.recvbuffRmtAddrs[localPeer]); } @@ -837,11 +886,11 @@ class Primitives< __device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { - genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); + __device__ __forceinline__ void directRecvCopyDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + 
genericOp<1, 1, 1, 1, -1, Output>(inpIx, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN) { - genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, false); + __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, postOp); } __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); @@ -860,6 +909,9 @@ class Primitives< __device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) { genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp); } + __device__ __forceinline__ void recvReduceDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { + genericOp<0, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp); + } __device__ __forceinline__ void directRecvReduceDirectSend(intptr_t inpIx, intptr_t outIx, ssize_t eltN, bool postOp=false) { genericOp<1, 1, 1, 1, Input, -1>(inpIx, outIx, eltN, postOp); } diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index b069c07ec..c2378e3df 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -20,6 +20,12 @@ struct IsFloatingPoint: std::true_type {}; template<> struct IsFloatingPoint<__nv_bfloat16>: std::true_type {}; #endif +#if defined(__CUDA_FP8_TYPES_EXIST__) +template<> +struct IsFloatingPoint<__nv_fp8_e4m3>: std::true_type {}; +template<> +struct IsFloatingPoint<__nv_fp8_e5m2>: std::true_type {}; +#endif template<> struct IsFloatingPoint: std::true_type {}; template<> @@ -298,6 +304,24 @@ SPECIALIZE_REDUCE(FuncMinMax, double, 1, double, fn.isMinNotMax ? fmin(x, y) : f #endif #endif +#if defined(__CUDA_FP8_TYPES_EXIST__) +#if __CUDA_ARCH__ >= 900 + SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(__hadd(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(__hadd2(__half2(x),__half2(y)))) + SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(__hmul(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(__hmul2(__half2(x),__half2(y)))) + SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e4m3, 1, __nv_fp8_e4m3, __nv_fp8_e4m3(fn.isMinNotMax ? __hmin(__half(x),__half(y)) : __hmax(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e4m3, 2, __nv_fp8x2_e4m3, __nv_fp8x2_e4m3(fn.isMinNotMax ? __hmin2(__half2(x),__half2(y)) : __hmax2(__half2(x),__half2(y)))) + + SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(__hadd(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncSum, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(__hadd2(__half2(x),__half2(y)))) + SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(__hmul(__half(x),__half(y)))) + SPECIALIZE_REDUCE(FuncProd, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(__hmul2(__half2(x),__half2(y)))) + SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e5m2, 1, __nv_fp8_e5m2, __nv_fp8_e5m2(fn.isMinNotMax ? __hmin(__half(x), __half(y)) : __hmax(__half(x), __half(y)))) + SPECIALIZE_REDUCE(FuncMinMax, __nv_fp8_e5m2, 2, __nv_fp8x2_e5m2, __nv_fp8x2_e5m2(fn.isMinNotMax ? 
__hmin2(__half2(x), __half2(y)) : __hmax2(__half2(x), __half2(y)))) +#endif +#endif + #undef SPECIALIZE_REDUCE //////////////////////////////////////////////////////////////////////////////// @@ -416,9 +440,9 @@ template<> struct FuncPreMulSum { using EltType = half; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 - half2 scalar; + __half2 scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { - union { uint64_t u64; half val; }; + union { uint64_t u64; __half val; }; u64 = opArg; scalar.x = val; scalar.y = val; @@ -426,9 +450,9 @@ struct FuncPreMulSum { #else float scalar; __device__ FuncPreMulSum(uint64_t opArg=0) { - union { uint64_t u64; half val; }; + union { uint64_t u64; __half val; }; u64 = opArg; - scalar = __half2float(val); + scalar = (float)val; } #endif }; @@ -459,11 +483,39 @@ struct FuncPreMulSum { }; #endif -template -struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { +#if defined(__CUDA_FP8_TYPES_EXIST__) +#if __CUDA_ARCH__ >= 900 + template<> + struct FuncPreMulSum<__nv_fp8_e4m3> { + using EltType = __nv_fp8_e4m3; + __half2 scalar2; + __device__ FuncPreMulSum(uint64_t opArg) { + union { uint64_t u64; __nv_fp8_storage_t val; }; + u64 = opArg; + scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E4M3)); + scalar2.y = scalar2.x; + } + }; + + template<> + struct FuncPreMulSum<__nv_fp8_e5m2> { + using EltType = __nv_fp8_e5m2; + __half2 scalar2; + __device__ FuncPreMulSum(uint64_t opArg) { + union { uint64_t u64; __nv_fp8_storage_t val; }; + u64 = opArg; + scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E5M2)); + scalar2.y = scalar2.x; + } + }; +#endif +#endif + +template +struct Apply_Reduce, EltPerPack> { + __device__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { // FuncPreMulSum reduce dispatches to FuncSum. - return Apply_Reduce, 1>::reduce(FuncSum(), a, b); + return Apply_Reduce, EltPerPack>::reduce(FuncSum(), a, b); } }; @@ -530,6 +582,51 @@ struct Apply_PreOp, /*EltPerPack=*/1> { #endif #endif +//////////////////////////////////////////////////////////////////////////////// +// Apply_PreOp of FuncPreMulSum for fp8. 
+ +#if defined(__CUDA_FP8_TYPES_EXIST__) +#if __CUDA_ARCH__ >= 900 + template<> + struct Apply_PreOp, /*EltPerPack=*/1> { + static constexpr bool IsIdentity = false; + __device__ static BytePack preOp( + FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack a + ) { + return toPack<__nv_fp8_e4m3>(__nv_fp8_e4m3(__hmul(__half(fromPack<__nv_fp8_e4m3>(a)), fn.scalar2.x))); + } + }; + template<> + struct Apply_PreOp, /*EltPerPack=*/2> { + static constexpr bool IsIdentity = false; + __device__ static BytePack preOp( + FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack a + ) { + return toPack<__nv_fp8x2_e4m3>(__nv_fp8x2_e4m3(__hmul2(__half2(fromPack<__nv_fp8x2_e4m3>(a)), fn.scalar2))); + } + }; + + template<> + struct Apply_PreOp, /*EltPerPack=*/1> { + static constexpr bool IsIdentity = false; + __device__ static BytePack preOp( + FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack a + ) { + return toPack<__nv_fp8_e5m2>(__nv_fp8_e5m2(__hmul(__half(fromPack<__nv_fp8_e5m2>(a)), fn.scalar2.x))); + } + }; + template<> + struct Apply_PreOp, /*EltPerPack=*/2> { + static constexpr bool IsIdentity = false; + __device__ static BytePack preOp( + FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack a + ) { + return toPack<__nv_fp8x2_e5m2>(__nv_fp8x2_e5m2(__hmul2(__half2(fromPack<__nv_fp8x2_e5m2>(a)), fn.scalar2))); + } + }; +#endif +#endif + //////////////////////////////////////////////////////////////////////////////// // FuncSumPostDiv @@ -541,34 +638,44 @@ struct RedOpArg> { } }; -template::value> -struct FuncSumPostDiv_IntOnly; - -template -struct FuncSumPostDiv: FuncSumPostDiv_IntOnly { - __device__ FuncSumPostDiv(uint64_t opArg=0): - FuncSumPostDiv_IntOnly(opArg) { - } -}; - template -struct FuncSumPostDiv_IntOnly: FuncSum { +struct FuncSumPostDiv { + static_assert(T(0) < T(-1), "FuncSumPostDiv is only for implementing ncclAvg on uint types."); using EltType = T; - int divisor; - __device__ FuncSumPostDiv_IntOnly(uint64_t opArg=0): divisor(opArg) {} -}; - -template -struct FuncSumPostDiv_IntOnly { - static_assert(sizeof(T)!=sizeof(T), "FuncSumPostDiv is only for implementing ncclAvg on integral types."); + using UintType = typename std::conditional::type; + uint32_t divisor:31, isSigned:1; + UintType recip; + + __device__ FuncSumPostDiv(uint64_t opArg=0) { + isSigned = opArg & 1; + divisor = opArg >> 1; + recip = UintType(-1)/divisor; + } + __device__ T divide(T x) { + // x is negative iff we are in signed mode and the top bit is set + bool xneg = isSigned && (x & ~(T(-1)>>1)); + // Compute abs(x): + // T(-x) vs -T(x) is critical. We have to negate then truncate the bits. Consider + // if we are doing signed 8-bit types, thus T=uint8_t. The value -1 is encoded + // as 0xff. -T(0xff) when promoted to 32-bit (which is implicit by compiler) + // gives 0xffffff01, but T(-0xff) is 0x1, and that is the abs value we want. + UintType xabs = xneg ? T(-x) : x; + // Compute quotient by multiplying by reciprical. + UintType q = sizeof(T)==8 ? __umul64hi(xabs, recip) : __umulhi(xabs, recip); + // Quotient may be off by one so do a fixup. + if (xabs - q*divisor >= divisor) q += 1; + // If original x was negative then we have to negate it back since we were + // working with its abs val. + return xneg ? 
-T(q) : T(q); + } }; -template -struct Apply_Reduce, /*EltPerPack=*/1>: - Apply_Reduce, 1> { - __device__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { +template +struct Apply_Reduce, EltPerPack>: + Apply_Reduce, EltPerPack> { + __device__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { // FuncSumPostDiv reduce dispatches to FuncSum. - return Apply_Reduce, 1>::reduce(FuncSum(), a, b); + return Apply_Reduce, EltPerPack>::reduce(FuncSum(), a, b); } }; @@ -576,7 +683,7 @@ template struct Apply_PostOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; __device__ static BytePack postOp(FuncSumPostDiv fn, BytePack a) { - return toPack(fromPack(a) / fn.divisor); + return toPack(fn.divide(fromPack(a))); } }; diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index f7b3c25e5..70538b117 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -89,7 +89,7 @@ struct RunWorkCollsendbuff; T *outputBuf = (T*)work->recvbuff; Primitives, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, false, false, 0, primsModePatRs); + (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs); PatRSAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); int last = 0; @@ -137,6 +137,7 @@ struct RunWorkCollnHeads * count, nelem, count, -1, 0); } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } else if (tid < tidEndReduce) { // Reduce through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; @@ -206,10 +207,10 @@ struct RunWorkCollnHeads; int part = ncclShmem.channelId - work->channelLo; void* inbuf = (void*)work->sendbuff; - ssize_t sizePerRank = work->collnet.count; + ssize_t countPerRank = work->collnet.count; - ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*sizePerRank); - ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*sizePerRank); + ssize_t railAllBeg = min(railGridOffset + part*chunkSize, nNodes*countPerRank); + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes*countPerRank); int railAllSize = railAllEnd - railAllBeg; if (tid < nDsts) dstSizes[tid] = railAllSize; @@ -222,15 +223,15 @@ struct RunWorkCollredOpArg, &work->redOpArg, false, /*nSrcs=*/1+nSrcs, [=]__device__(int s) { return s==0 ? (T*)inbuf + userOneBeg - : work->regUsed && (recvDirectFlag & NCCL_DIRECT_READ) + : work->regUsed && (recvDirectFlag & NCCL_P2P_READ) ? (T*)srcPtrs[s-1] + userOneBeg : (T*)srcPtrs[s-1] + railAllOffset; }, @@ -264,7 +265,8 @@ struct RunWorkCollcollnet.chunkCount); - ssize_t sizePerRank = work->collnet.count; + ssize_t countPerRank = work->collnet.count; + const int hasDn = (direct->down[0] >= 0) ? 
1 : 0; if (direct->out == -1) __trap(); bool isMultiRail = (direct->nHeads > 1); @@ -281,15 +283,15 @@ struct RunWorkColl, /*Direct=*/1, Proto, 0> + Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, nullptr, direct->heads+1, work->sendbuff, nullptr, - work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1, work); - for (ssize_t railGridOffset=0; railGridOffset < nNodes*sizePerRank; railGridOffset += nChannels*chunkSize) { + work->redOpArg, 0*Proto::MaxGroupWidth, 1, 1); + for (ssize_t railGridOffset=0; railGridOffset < nNodes*countPerRank; railGridOffset += nChannels*chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.template process(scat, NCCL_DIRECT_READ, 0); + prims.template process(scat, 0, 0); } return; } @@ -297,23 +299,22 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed && !hasDn) { if (tid == 0) { - int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); - Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, steps); + Primitives, /*Direct=*/0, Proto, 0>::sendPeerNotify(direct->out, 1, 1); } __syncwarp(); } else { // Phase 2: Reduce from peers + local input -> send to network - Primitives, /*Direct=*/1, Proto, 0> + Primitives, /*Direct=*/0, Proto, 0> prims(tid, tn, direct->heads + 1, &direct->out, nullptr, nullptr, - work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work); - for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) { Scatterer scat; scat.work = work; scat.chunkSize = chunkSize; scat.railGridOffset = railGridOffset; - prims.template process(scat, 0, NCCL_DIRECT_READ); + prims.template process(scat, 0, 0); } } return; @@ -322,9 +323,9 @@ struct RunWorkCollregUsed == NCCL_COLLNET_REG_BUFFER) { + if (work->netRegUsed) { if (tid == 0) { - int steps = (int)divUp(nNodes * sizePerRank * sizeof(T), NCCL_MAX_COLLNET_SIZE); + int steps = hasDn ? 
(int)divUp(nNodes * countPerRank, nChannels * chunkSize) : 1; Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(direct->out, 0, steps); } __syncwarp(); @@ -333,11 +334,11 @@ struct RunWorkColl, /*Direct=*/0, Proto, 0> prims(tid, tn, &direct->out, nullptr, nullptr, work->recvbuff, work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 0); - for (ssize_t railGridOffset = 0; railGridOffset < nNodes * sizePerRank; railGridOffset += nChannels * chunkSize) { + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkSize) { ssize_t railAllBeg = railGridOffset + part * chunkSize; - ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * sizePerRank); - ssize_t railOneBeg = ncclShmem.comm.node * sizePerRank; - ssize_t railOneEnd = railOneBeg + sizePerRank; + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank); + ssize_t railOneBeg = ncclShmem.comm.node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; ssize_t beg = max(railAllBeg, railOneBeg); ssize_t end = min(railAllEnd, railOneEnd); prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true); diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h index 9b039a41a..fe3b9ca77 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -15,33 +15,35 @@ struct RunWorkBatch __device__ void runSend(int tid, int tn, int group, struct ncclDevWorkP2p* work) { size_t bytes = work->sendBytes; - int chunkSize = work->sendIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->sendChunkSize_u32fp8); + bool useLargeChunk = (work->sendIpcReg && ncclShmem.comm.isAllNvlink) || work->sendNetReg; + int chunkSize = useLargeChunk ? NCCL_MAX_NET_SIZE : u32fp8Decode(work->sendChunkSize_u32fp8); + int stepSize = useLargeChunk ? NCCL_MAX_NET_SIZE : ncclShmem.comm.p2pChunkSize; Primitives, 1, Proto, 1> prims(tid, tn, nullptr, &work->sendRank, work->sendAddr, nullptr, - /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, - /*ipcReg=*/work->sendIpcReg, /*netReg=*/work->sendRegistered, ncclShmem.comm.p2pChunkSize); + /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, work, stepSize); size_t cursor = 0; do { int n = min(size_t(chunkSize), bytes-cursor); prims.directSend(cursor, cursor, n); cursor += n; - } while (cursor < bytes && work->sendRegistered == 0); + } while (cursor < bytes); } template __device__ void runRecv(int tid, int tn, int group, struct ncclDevWorkP2p* work) { size_t bytes = work->recvBytes; - int chunkSize = work->recvIpcReg && ncclShmem.comm.isNvlink ? (1 << 30) : u32fp8Decode(work->recvChunkSize_u32fp8); + bool useLargeChunk = (work->recvIpcReg && ncclShmem.comm.isAllNvlink) || work->recvNetReg; + int chunkSize = useLargeChunk ? NCCL_MAX_NET_SIZE : u32fp8Decode(work->recvChunkSize_u32fp8); + int stepSize = useLargeChunk ? 
NCCL_MAX_NET_SIZE : ncclShmem.comm.p2pChunkSize; Primitives, 1, Proto, 1> prims(tid, tn, &work->recvRank, nullptr, nullptr, work->recvAddr, - /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, - /*ipcReg=*/work->recvIpcReg, /*netReg=*/work->recvRegistered, ncclShmem.comm.p2pChunkSize); + /*redOpArg(ignored)=*/0, group, 1, 1, nullptr, work, stepSize); size_t cursor = 0; do { int n = min(size_t(chunkSize), bytes-cursor); prims.directRecv(cursor, cursor, n); cursor += n; - } while (cursor < bytes && work->recvRegistered == 0); + } while (cursor < bytes); } __device__ __forceinline__ void run() { diff --git a/src/enqueue.cc b/src/enqueue.cc index 4edb42dec..285e17f69 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -16,6 +16,7 @@ #include // std::memcpy #include // PRIx64 +#include NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); @@ -63,15 +64,6 @@ static inline int ncclFuncTrafficPerByte(ncclFunc_t func, int nRanks) { default: return 1; } } -static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { - return func == ncclFuncReduceScatter ? nRanks*count : count; -} -static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { - return func == ncclFuncAllGather ? nRanks*count : count; -} -static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { - return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count; -} /*****************************************************************************/ /* Launch system : synchronization and CUDA kernel launch */ @@ -230,301 +222,8 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) { } } -int64_t ncclParamLocalRegister(); NCCL_PARAM(GraphRegister, "GRAPH_REGISTER", 1); -struct ncclIpcCleanupCallback { - struct ncclCommCallback base; - void* ptr; -}; -static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) { - struct ncclIpcCleanupCallback* me = (struct ncclIpcCleanupCallback*)cb; - CUDACHECKIGNORE(cudaIpcCloseMemHandle(me->ptr)); - free(me); - return ncclSuccess; -} - -static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) { - if (conn->connected) { - if (conn->conn.flags & (NCCL_IPC_READ | NCCL_IPC_WRITE | NCCL_DIRECT_READ | NCCL_DIRECT_WRITE)) { - *needReg = true; - } else { - // network connection - *needReg = false; - } - } else { - struct ncclPeerInfo* peerInfo = &comm->peerInfo[peer]; - struct ncclPeerInfo* myInfo = &comm->peerInfo[comm->rank]; - int canConnect = 0; - NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, graph, myInfo, peerInfo)); - if (canConnect) { - *needReg = true; - } else { - *needReg = false; - } - } - return ncclSuccess; -} - -static ncclResult_t registerCollBuffers( - struct ncclComm* comm, struct ncclTaskColl* info, - void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], - void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], - struct ncclIntruQueue* cleanupQueue, - bool* regNeedConnect - ) { - ncclResult_t result = ncclSuccess; - - info->regBufType = NCCL_REGULAR_BUFFER; - *regNeedConnect = true; - if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit; -#if CUDART_VERSION >= 11030 - if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { - if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit; - bool regBufUsed = false; - const void *sendbuff = info->sendbuff; - void 
*recvbuff = info->recvbuff; - if (info->func == ncclFuncAllGather) sendbuff = NULL; - if (info->func == ncclFuncReduceScatter) recvbuff = NULL; - size_t elementSize = ncclTypeSize(info->datatype); - size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); - size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); - - /* first try local registration. */ - if (ncclParamLocalRegister()) { - ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv); - } - - if (regBufUsed == false && comm->planner.persistent && ncclParamGraphRegister()) { - ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, ®BufUsed, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts); - } - - if (regBufUsed) { - *regNeedConnect = false; - /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to - * saturate bandwidth. */ - if (comm->nNodes == 1) { - if (info->func == ncclFuncReduceScatter) - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); - else - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); - } else { - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); - } - info->regBufType = NCCL_NVLS_REG_BUFFER; - } - } else if ((info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && comm->collNetRegSupport && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv) { - size_t elementSize = ncclTypeSize(info->datatype); - size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); - size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); - int sendRegBufFlag = 0; - int recvRegBufFlag = 0; - void *sendHandle, *recvHandle; - - if (ncclParamLocalRegister()) { - ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle); - info->sendMhandle = sendHandle; - if (sendRegBufFlag) { - ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle); - info->recvMhandle = recvHandle; - } - } - - if ((sendRegBufFlag == 0 || recvRegBufFlag == 0) && comm->planner.persistent && ncclParamGraphRegister()) { - if (!sendRegBufFlag) { - ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &sendRegBufFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); - info->sendMhandle = sendHandle; - } - if (sendRegBufFlag && !recvRegBufFlag) { - ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &recvRegBufFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); - info->recvMhandle = recvHandle; - } - } - - if (sendRegBufFlag && recvRegBufFlag) { - info->nMaxChannels = 1; - info->regBufType = NCCL_COLLNET_REG_BUFFER; - if (sendRegBufFlag == 1 && recvRegBufFlag == 1) { - INFO(NCCL_REG, "rank %d successfully registered collNet sendbuff %p (handle %p), sendbuff size %ld, recvbuff %p (handle %p), recvbuff size %ld", comm->rank, info->sendbuff, sendHandle, sendbuffSize, info->recvbuff, recvHandle, recvbuffSize); - } - } - } else if (comm->intraNodeP2pSupport && info->protocol == NCCL_PROTO_SIMPLE) { - // IPC buffer registration - if (info->func == ncclFuncReduceScatter) goto exit; - if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && 
info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit; - if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit; - if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit; - - int peerRanks[NCCL_MAX_LOCAL_RANKS]; - int nPeers = 0; - size_t elementSize = ncclTypeSize(info->datatype); - size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); - size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); - int regBufFlag = 0; - memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS); - - if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { - struct ncclChannel* channel = comm->channels; - for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) { - for (int updown = 0; updown < 2; ++updown) { - int peer; - if (updown == 0) - peer = channel->collnetDirect.up[r]; - else - peer = channel->collnetDirect.down[r]; - if (peer != -1) { - struct ncclConnector* peerConn = &channel->peers[peer]->recv[0]; - bool needReg = false; - - NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg)); - if (needReg) { - bool found = false; - for (int p = 0; p < nPeers; ++p) { - if (peerRanks[p] == peer) { - found = true; - break; - } - } - if (!found) peerRanks[nPeers++] = peer; - } - } - } - } - - if (nPeers > 0) { - if (ncclParamLocalRegister()) - ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs); - if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); - } - if (regBufFlag) { - if (ncclParamLocalRegister()) - ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); - if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); - } - } - } - if (regBufFlag) { - info->regBufType = NCCL_IPC_REG_BUFFER; - } - } else if (info->algorithm == NCCL_ALGO_RING) { - struct ncclReg* recvRegRecord; - NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); - if (recvRegRecord == NULL) goto exit; - for (int c = 0; c < comm->nChannels; ++c) { - struct ncclChannel* channel = comm->channels + c; - for (int r = 0; r < 2; ++r) { - bool needReg = false; - int peer; - struct ncclConnector* peerConn; - // P2P transport - if (r == 0) - peer = channel->ring.prev; - else - peer = channel->ring.next; - peerConn = &channel->peers[peer]->recv[0]; - NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_RING], peer, &needReg)); - - if (needReg) { - bool found = false; - for (int p = 0; p < nPeers; ++p) { - if (peerRanks[p] == peer) { - found = true; - break; - } - } - if (!found) peerRanks[nPeers++] = peer; - } - } - } - if (nPeers > 0) { - if (ncclParamLocalRegister()) { - ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, 
&info->recvbuffRmtAddrs); - } - if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); - } - } - if (regBufFlag) { - info->regBufType = NCCL_IPC_REG_BUFFER; - } - } else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { - struct ncclReg* recvRegRecord; - NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); - if (recvRegRecord == NULL) goto exit; - for (int c = 0; c < comm->nChannels; ++c) { - struct ncclChannel* channel = comm->channels + c; - struct ncclTree* tree = NULL; - int peers[NCCL_MAX_TREE_ARITY + 1]; - - if (info->algorithm == NCCL_ALGO_TREE) - tree = &channel->tree; - else - tree = &channel->collnetChain; - for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p]; - peers[NCCL_MAX_TREE_ARITY] = tree->up; - for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) { - int peer = peers[p]; - bool peerNeedReg = false; - struct ncclConnector* recvConn = NULL; - // P2P transport - if (peer == -1 || peer == comm->nRanks) continue; - recvConn = &channel->peers[peer]->recv[0]; - NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg)); - - if (peerNeedReg) { - bool found = false; - for (int pindex = 0; pindex < nPeers; ++pindex) { - if (peerRanks[pindex] == peer) { - found = true; - break; - } - } - if (!found) peerRanks[nPeers++] = peer; - } - } - } - if (nPeers > 0) { - if (ncclParamLocalRegister()) { - ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); - } - if (!regBufFlag && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); - } - } - if (regBufFlag) { - info->regBufType = NCCL_IPC_REG_BUFFER; - } - } - - if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) { - info->nMaxChannels = 16; - } - } -exit: -#endif - return result; -} - -static ncclResult_t registerP2pBuffer(struct ncclComm* comm, void* userbuff, int peerRank, size_t size, int* regFlag, void** regAddr, struct ncclIntruQueue* cleanupQueue) { - ncclResult_t ret = ncclSuccess; - uintptr_t offset = 0; - uintptr_t* peerRmtAddrs = NULL; - - *regFlag = 0; - if (ncclParamLocalRegister()) { - ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs); - } - if (*regFlag == 0 && comm->planner.persistent && ncclParamGraphRegister()) { - ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, reinterpret_cast(cleanupQueue), NULL); - } - - if (*regFlag) - *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset); - return ret; -} - static ncclResult_t getCollNetSupport(struct ncclComm* comm, struct ncclTaskColl* task, int* collNetSupport); static ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* task, @@ -550,10 +249,72 @@ static bool testBudget( return ok; } +ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { + struct ncclKernelPlanner* planner = &comm->planner; + struct ncclTaskColl 
*task; + + task = ncclIntruQueueHead(&planner->collTaskQueue); + while (task != nullptr) { + // Build a ncclDevWorkColl[Reg?] struct for each task. + void* regBufSend[NCCL_MAX_LOCAL_RANKS]; + void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; + bool regNeedConnect = true; + struct ncclWorkList* workNode = NULL; + struct ncclDevWorkColl devWork = {}; + + if (task->algorithm == NCCL_ALGO_NVLS_TREE || task->algorithm == NCCL_ALGO_NVLS) { + workNode = ncclIntruQueueDequeue(&planner->tmpCollWorkQueue); + goto next; + } + ncclRegisterCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); + + devWork.sendbuff = (void*)task->sendbuff; + devWork.recvbuff = (void*)task->recvbuff; + devWork.sendbuffOffset = task->sendbuffOffset; + devWork.recvbuffOffset = task->recvbuffOffset; + devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs; + devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs; + devWork.root = task->root; + devWork.nWarps = task->nWarps; + devWork.redOpArg = task->opDev.scalarArg; + devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; + devWork.oneNode = (comm->nNodes == 1); + devWork.isOneRPN = comm->isOneRPN; + devWork.netRegUsed = devWork.regUsed = 0; + if (task->regBufType & NCCL_NET_REG_BUFFER) + devWork.netRegUsed = 1; + if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER)) + devWork.regUsed = 1; + + if (task->regBufType & NCCL_NVLS_REG_BUFFER) { + struct ncclDevWorkCollReg workReg = {}; + workReg.coll = devWork; // C++ struct assignment + /* NVLS only has one send and recv buffer registered */ + workReg.dnInputs[0] = regBufSend[0]; + workReg.dnOutputs[0] = regBufRecv[0]; + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeCollReg; + workNode->size = sizeof(struct ncclDevWorkCollReg); + memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); + } else { + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeColl; + workNode->size = sizeof(struct ncclDevWorkColl); + memcpy((void*)(workNode+1), (void*)&devWork, workNode->size); + } +next: + ncclIntruQueueEnqueue(&planner->collWorkQueue, workNode); + task = task->next; + } + assert(ncclIntruQueueEmpty(&planner->tmpCollWorkQueue)); + return ncclSuccess; +} + // Called once per ncclGroup to organize the user submitted tasks in // comm->planner so that they can be peeled off into plans. ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) { struct ncclKernelPlanner* planner = &comm->planner; + planner->persistent = ncclCudaGraphValid(planner->capturingGraph); // Tasks from the sorter come out ordered size descending. struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter); // Tasks are assembled by (fn,op,ty) size ascending. 
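// A minimal, self-contained sketch of the regBufType -> device-work flag mapping used by
// ncclTasksRegAndEnqueue above: network registration sets netRegUsed, while IPC/NVLS
// registration sets regUsed. The bit values below are placeholders for illustration only;
// the real NCCL_*_REG_BUFFER constants are defined in NCCL's headers.
#include <cstdint>

enum : uint32_t {            // hypothetical values, for illustration only
  REG_IPC  = 1u << 0,        // stands in for NCCL_IPC_REG_BUFFER
  REG_NVLS = 1u << 1,        // stands in for NCCL_NVLS_REG_BUFFER
  REG_NET  = 1u << 2         // stands in for NCCL_NET_REG_BUFFER
};

struct WorkFlags { uint8_t regUsed = 0, netRegUsed = 0; };

static WorkFlags mapRegBufType(uint32_t regBufType) {
  WorkFlags w;
  if (regBufType & REG_NET)              w.netRegUsed = 1;  // zero-copy path through the network plugin
  if (regBufType & (REG_IPC | REG_NVLS)) w.regUsed    = 1;  // direct peer access via IPC/NVLS mappings
  return w;
}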
@@ -648,7 +409,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool void* regBufSend[NCCL_MAX_LOCAL_RANKS]; void* regBufRecv[NCCL_MAX_LOCAL_RANKS]; bool regNeedConnect = true; - registerCollBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); + ncclRegisterCollNvlsBuffers(comm, task, regBufSend, regBufRecv, &planner->collCleanupQueue, ®NeedConnect); if (comm->runtimeConn && comm->initAlgoChannels[task->algorithm] == false) { if (task->algorithm == NCCL_ALGO_NVLS_TREE && comm->initAlgoChannels[NCCL_ALGO_NVLS] == false && regNeedConnect == true) { @@ -662,32 +423,28 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool } } - struct ncclDevWorkColl devWork = {}; - devWork.sendbuff = (void*)task->sendbuff; - devWork.recvbuff = (void*)task->recvbuff; - devWork.sendbuffOffset = task->sendbuffOffset; - devWork.recvbuffOffset = task->recvbuffOffset; - devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs; - devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs; - devWork.root = task->root; - devWork.nWarps = task->nWarps; - devWork.redOpArg = task->opDev.scalarArg; - devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; - devWork.oneNode = (comm->nNodes == 1); - devWork.regUsed = task->regBufType; - - struct ncclWorkList* workNode; - switch (task->regBufType) { - case NCCL_REGULAR_BUFFER: - case NCCL_IPC_REG_BUFFER: - case NCCL_COLLNET_REG_BUFFER: - { workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); - workNode->workType = ncclDevWorkTypeColl; - workNode->size = sizeof(struct ncclDevWorkColl); - memcpy((void*)(workNode+1), (void*)&devWork, workNode->size); - } break; - case NCCL_NVLS_REG_BUFFER: - { struct ncclDevWorkCollReg workReg = {}; + if (task->algorithm == NCCL_ALGO_NVLS_TREE || task->algorithm == NCCL_ALGO_NVLS) { + struct ncclDevWorkColl devWork = {}; + devWork.sendbuff = (void*)task->sendbuff; + devWork.recvbuff = (void*)task->recvbuff; + devWork.sendbuffOffset = task->sendbuffOffset; + devWork.recvbuffOffset = task->recvbuffOffset; + devWork.sendbuffRmtAddrs = task->sendbuffRmtAddrs; + devWork.recvbuffRmtAddrs = task->recvbuffRmtAddrs; + devWork.root = task->root; + devWork.nWarps = task->nWarps; + devWork.redOpArg = task->opDev.scalarArg; + devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; + devWork.oneNode = (comm->nNodes == 1); + devWork.netRegUsed = devWork.regUsed = 0; + if (task->regBufType & NCCL_NET_REG_BUFFER) + devWork.netRegUsed = 1; + if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER)) + devWork.regUsed = 1; + + struct ncclWorkList* workNode; + if (task->regBufType & NCCL_NVLS_REG_BUFFER) { + struct ncclDevWorkCollReg workReg = {}; workReg.coll = devWork; // C++ struct assignment /* NVLS only has one send and recv buffer registered */ workReg.dnInputs[0] = regBufSend[0]; @@ -695,15 +452,16 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); workNode->workType = ncclDevWorkTypeCollReg; workNode->size = sizeof(struct ncclDevWorkCollReg); - memcpy((void*)(workNode+1), (void*)&workReg, workNode->size); - } break; - default: - /* impossible value */ - WARN("Invalid regBufType %d", task->regBufType); - return ncclInvalidArgument; - } + memcpy((void*)(workNode + 1), (void*)&workReg, workNode->size); + } else { + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + workNode->workType = ncclDevWorkTypeColl; + workNode->size = sizeof(struct 
ncclDevWorkColl); + memcpy((void*)(workNode + 1), (void*)&devWork, workNode->size); + } - ncclIntruQueueEnqueue(&planner->collWorkQueue, workNode); + ncclIntruQueueEnqueue(&planner->tmpCollWorkQueue, workNode); + } task = task->next; } @@ -875,15 +633,32 @@ static ncclResult_t scheduleCollTasksToPlan( struct ncclProxyOp* proxyOp; if (c == (int)devWork->channelLo) { proxyOp = &proxyOpLo; + proxyOp->loopOffset = 0; + proxyOp->channelSize = countLo * elementSize; } else if (c == (int)devWork->channelHi) { proxyOp = &proxyOpHi; + proxyOp->loopOffset = (countLo + nMidChannels * countMid) * elementSize; + proxyOp->channelSize = countHi * elementSize; } else { proxyOp = &proxyOpMid; + proxyOp->loopOffset = (countLo + (c - devWork->channelLo - 1) * countMid) * elementSize; + proxyOp->channelSize = countMid * elementSize; } proxyOp->channelId = c; proxyOp->opCount = proxyOpId; proxyOp->task.coll = task; proxyOp->rank = comm->rank; + proxyOp->ringAlgo = NULL; + if (proxyOp->reg && task->algorithm == NCCL_ALGO_RING && (task->recvNetHandles[c] || task->sendNetHandles[c])) { + if (task->func == ncclFuncAllGather) { + proxyOp->ringAlgo = new RingAGAlgorithm(task->sendbuff, task->recvbuff, comm->nRanks, comm->channels[c].ring.userRanks, proxyOp->chunkSteps, proxyOp->sliceSteps, proxyOp->chunkSize, proxyOp->sliceSize, proxyOp->loopOffset, proxyOp->channelSize, elementSize, task->count * elementSize, task->sendNetHandles[c], task->recvNetHandles[c], task->srecvNetHandles[c]); + } else if (task->func == ncclFuncAllReduce) { + proxyOp->ringAlgo = new RingARAlgorithm(task->sendbuff, task->recvbuff, comm->nRanks, comm->channels[c].ring.index, proxyOp->chunkSteps, proxyOp->sliceSteps, proxyOp->chunkSize, proxyOp->sliceSize, proxyOp->loopOffset, proxyOp->channelSize, elementSize, task->sendNetHandles[c], task->recvNetHandles[c], task->srecvNetHandles[c]); + } else if (task->func == ncclFuncBroadcast) { + proxyOp->ringAlgo = new RingBCAlgorithm(task->sendbuff, task->recvbuff, comm->rank, task->root, comm->nRanks, comm->channels[c].ring.userRanks, proxyOp->chunkSteps, proxyOp->sliceSteps, proxyOp->chunkSize, proxyOp->sliceSize, proxyOp->loopOffset, proxyOp->channelSize, task->sendNetHandles[c], task->recvNetHandles[c], task->srecvNetHandles[c]); + } + proxyOp->ringAlgo->incRefCount(); + } addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. 
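The proxyOp->loopOffset / channelSize assignments above split one collective across channels in a lo/mid/hi pattern: the first channel carries countLo elements, each middle channel countMid, and the last channel countHi, with loopOffset giving each channel's byte offset into the buffer. A standalone sketch of just that arithmetic, with invented counts:

    #include <cstdio>
    #include <cstddef>

    int main() {
      const int channelLo = 2, channelHi = 6;                        // hypothetical channel range
      const size_t countLo = 1000, countMid = 800, countHi = 600;    // elements per channel
      const size_t elementSize = 4;                                  // e.g. fp32
      const int nMidChannels = channelHi - channelLo - 1;
      for (int c = channelLo; c <= channelHi; c++) {
        size_t loopOffset, channelSize;
        if (c == channelLo) {
          loopOffset = 0;
          channelSize = countLo * elementSize;
        } else if (c == channelHi) {
          loopOffset = (countLo + nMidChannels * countMid) * elementSize;
          channelSize = countHi * elementSize;
        } else {
          loopOffset = (countLo + (size_t)(c - channelLo - 1) * countMid) * elementSize;
          channelSize = countMid * elementSize;
        }
        printf("channel %d: offset %zu bytes, size %zu bytes\n", c, loopOffset, channelSize);
      }
      return 0;
    }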
@@ -900,6 +675,10 @@ static ncclResult_t scheduleCollTasksToPlan( } if (comm->rank == 0) { + INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %s proto %s channel{Lo..Hi}={%d..%d}", + ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclAlgoToString(task->algorithm), + ncclProtoToString(task->protocol), devWork->channelLo, devWork->channelHi); + if (task->isCollnet) { TRACE(NCCL_COLL, "Collective %s(%s, %s, %s, %s) count=%ld devFuncId=%d channel{Lo..Hi}={%d..%d} count=%ld chunkCount=%d", ncclFuncToString(task->func), ncclDevRedOpToString(task->opDev.op), @@ -956,6 +735,7 @@ static ncclResult_t addP2pToPlan( bool protoLL[2] = {!selfSend, !selfSend}; bool network[2] = {false, false}; bool proxySameProcess[2] = {true, true}; + void** handles[2] = {NULL, NULL}; uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound); if (!selfSend) { for (int part=0; part < nChannelsMax; part++) { @@ -981,7 +761,7 @@ static ncclResult_t addP2pToPlan( int chunkSize[2]; int chunkDataSize[2]; int chunkDataSize_u32fp8[2]; - bool registered[2] = {false, false}; + bool netRegistered[2] = {false, false}; bool ipcRegistered[2] = {false, false}; for (int dir=0; dir < 2; dir++) { // 0=recv, 1=send @@ -1007,10 +787,20 @@ static ncclResult_t addP2pToPlan( if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; if (network[dir]) { - if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE) { - struct ncclReg* regRecord; - NCCLCHECK(ncclRegFind(comm, addrs[dir], bytes[dir], ®Record)); - registered[dir] = regRecord && regRecord->nDevs; + if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) { + int regFlag = 0; + NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax)); + for (int part = 0; part < nChannelsMax; part++) { + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); + struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers; + int peerRank = dir ? sendRank : recvRank; + struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex] + : &channelPeers[peerRank]->recv[connIndex]; + if (conn->conn.flags & NCCL_DIRECT_NIC) + ncclRegisterP2pNetBuffer(comm, addrs[dir], bytes[dir], conn, ®Flag, &handles[dir][part], &plan->cleanupQueue); + if (!regFlag) break; + } + netRegistered[dir] = regFlag ? true : false; } } else if (bytes[dir] > 0 && addrs[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && !selfSend) { int peerRank = dir ? sendRank : recvRank; @@ -1020,12 +810,12 @@ static ncclResult_t addP2pToPlan( struct ncclConnector* conn = dir ? &channelPeers[peerRank]->send[connIndex] : &channelPeers[peerRank]->recv[connIndex]; void* regAddr = NULL; - if (conn->conn.flags & (NCCL_IPC_WRITE | NCCL_IPC_READ | NCCL_DIRECT_WRITE | NCCL_DIRECT_READ)) { + if (conn->conn.flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) { // We require users registering buffers on both sides - NCCLCHECK(registerP2pBuffer(comm, addrs[dir], peerRank, bytes[dir], ®Flag, ®Addr, &plan->cleanupQueue)); + NCCLCHECK(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, ®Flag, ®Addr, &plan->cleanupQueue)); if (regFlag) { - if (dir == 0 && conn->conn.flags & (NCCL_IPC_WRITE | NCCL_DIRECT_WRITE)) recvAddr = regAddr; - else if (dir == 1 && conn->conn.flags & (NCCL_IPC_READ | NCCL_DIRECT_READ)) sendAddr = regAddr; + if (dir == 0 && (conn->conn.flags & NCCL_P2P_WRITE)) recvAddr = regAddr; + else if (dir == 1 && (conn->conn.flags & NCCL_P2P_READ)) sendAddr = regAddr; } } ipcRegistered[dir] = regFlag ? 
true : false; @@ -1057,7 +847,7 @@ static ncclResult_t addP2pToPlan( work->channelBase = base; work->nSendChannels = nChannels[1]; work->sendProtoLL = protoLL[1]; - work->sendRegistered = registered[1]; + work->sendNetReg = netRegistered[1]; work->sendIpcReg = ipcRegistered[1]; work->sendChunkSize_u32fp8 = chunkDataSize_u32fp8[1]; work->sendRank = sendRank; @@ -1065,7 +855,7 @@ static ncclResult_t addP2pToPlan( work->sendBytes = sendBytes==-1 ? 0 : sendBytes; work->nRecvChannels = nChannels[0]; work->recvProtoLL = protoLL[0]; - work->recvRegistered = registered[0]; + work->recvNetReg = netRegistered[0]; work->recvIpcReg = ipcRegistered[0]; work->recvChunkSize_u32fp8 = chunkDataSize_u32fp8[0]; work->recvRank = recvRank; @@ -1084,7 +874,7 @@ static ncclResult_t addP2pToPlan( op->protocol = protocol[dir]; op->pattern = dir ? ncclPatternSend : ncclPatternRecv; op->chunkSize = chunkSize[dir]; - op->reg = registered[dir]; + op->reg = netRegistered[dir]; op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; op->task.p2p = p2pTasks[dir]; op->rank = comm->rank; @@ -1116,9 +906,10 @@ static ncclResult_t addP2pToPlan( size_t partBeg, partEnd; ncclP2pPartBounds(nParts, part, bytes, &partBeg, &partEnd); if (proxyOps[dir].reg) { - proxyOps[dir].nsteps = 1; - proxyOps[dir].recvbuff = (uint8_t*)addr+partBeg; - proxyOps[dir].nbytes = partEnd-partBeg; + (dir ? proxyOps[dir].sendbuff : proxyOps[dir].recvbuff) = (uint8_t*)addr + partBeg; + (dir ? proxyOps[dir].sendMhandle : proxyOps[dir].recvMhandle) = handles[dir][part]; + proxyOps[dir].nbytes = partEnd - partBeg; + proxyOps[dir].nsteps = DIVUP(proxyOps[dir].nbytes, NCCL_MAX_NET_SIZE); } else { proxyOps[dir].nsteps = divUp(partEnd-partBeg, chunkDataSize); proxyOps[dir].nbytes = std::min(partEnd-partBeg, chunkDataSize); @@ -1198,6 +989,8 @@ static ncclResult_t scheduleP2pTasksToPlan( // Skip send to self in-place (we don't need to support this). ncclIntruQueueDequeue(&peers[sendRank].sendQueue); ncclIntruQueueDequeue(&peers[recvRank].recvQueue); + ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, send); + ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, recv); comm->planner.nTasksP2p -= 2; } else { // Ensure room for worst case of one new batch per channel. @@ -1302,8 +1095,13 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla plan->kernelArgs->workBuf = comm->workFifoBufDev; break; case ncclDevWorkStorageTypePersistent: + // We rely on 16-byte alignment + #if __cplusplus >= 201103L + fifoBufHost = aligned_alloc(16, ROUNDUP(workBytes, 16)); + #else static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment."); fifoBufHost = malloc(workBytes); + #endif fifoCursor = 0; fifoMask = ~0u; break; @@ -1346,37 +1144,41 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla break; case ncclDevWorkStorageTypePersistent: { ncclResult_t result = ncclSuccess; + struct uploadWork_cleanup_t* cleanup = nullptr; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; void* fifoBufDev = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail); // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the // user's graph will be launched later, and it also acquires the deviceStream, // it will observe this upload. 
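A note on the persistent-plan staging buffer above: aligned_alloc requires its size argument to be a multiple of the requested alignment, which is why workBytes is rounded up to 16 before the call (the malloc path instead relies on max_align_t being at least 16-byte aligned). A minimal sketch of the padding, where ROUNDUP is a local stand-in rather than NCCL's macro:

    #include <cstdlib>
    #include <cstdio>

    #define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

    int main() {
      size_t workBytes = 1000;                           // hypothetical payload size
      size_t padded = ROUNDUP(workBytes, (size_t)16);    // 1008, a multiple of 16
      void* buf = aligned_alloc(16, padded);             // C11/C++17 allocation, 16-byte aligned
      printf("requested %zu, allocated %zu, ptr %p\n", workBytes, padded, buf);
      free(buf);
      return 0;
    }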
- NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, finish_scope); + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail); - CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, finish_scope); + CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail); plan->workBufPersistent = fifoBufDev; plan->kernelArgs->workBuf = fifoBufDev; - CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, finish_scope); + // coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL + CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail); cudaEvent_t memcpyDone; - CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, finish_scope); - CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, finish_scope); + CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail); + CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail); - struct uploadWork_cleanup_t* cleanup; - NCCLCHECK(ncclCalloc(&cleanup, 1)); + NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail); cleanup->base.fn = uploadWork_cleanup_fn; cleanup->base.event = memcpyDone; cleanup->hostBuf = fifoBufHost; - ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cleanup->base); + ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, finish_scope); - NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, finish_scope); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail); + NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail); finish_scope: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (result != ncclSuccess) return result; + if (mode != cudaStreamCaptureModeRelaxed) (void)cudaThreadExchangeStreamCaptureMode(&mode); + return result; + fail: + if (!cleanup) free(fifoBufHost); + goto finish_scope; } break; default: break; } @@ -1388,6 +1190,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* uint64_t p2pOpBump[MAXCHANNELS] = {/*0...*/}; // Advance comm's collOpCount by number of colls in this plan. comm->sharedRes->collOpCount += plan->collOpCount; + comm->collOpCount += plan->collOpCount; struct ncclProxyOp* op = ncclIntruQueueHead(&plan->proxyOpQueue); while (op != nullptr) { @@ -1410,18 +1213,9 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* NCCLCHECK(ncclProxySaveOp(comm, op, nullptr)); op->opCount = oldId; // Restore for next uploadProxyOps() - - struct ncclProxyOp* opNext = op->enqNext; - if (!plan->persistent) { - // Non-persistent kernels upload ops only once so can be free'd here. - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, op); - } - op = opNext; + op = op->enqNext; } - // Erase proxyOpQueue since all ops were free'd back to mempool. - if (!plan->persistent) ncclIntruQueueConstruct(&plan->proxyOpQueue); - for (int c=0; c < MAXCHANNELS; c++) { // Advance channel's p2pOpCount by number of p2p's in this plan channel. 
comm->sharedRes->p2pOpCount[c] += p2pOpBump[c]; @@ -1450,6 +1244,8 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) { if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } + if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs); + return; } static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { @@ -1462,32 +1258,41 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* CUDACHECK(cudaFree(plan->workBufPersistent)); CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); } - struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue); - while (q != nullptr) { - struct ncclProxyOp* q1 = q->enqNext; - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); - q = q1; - } - struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); - while (ct != nullptr) { - struct ncclTaskColl* ct1 = ct->next; - ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct); - ct = ct1; - } - struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); - while (pt != nullptr) { - struct ncclTaskP2p* pt1 = pt->next; - ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt); - pt = pt1; - } - ncclResult_t result = ncclSuccess; - while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) { - struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue); - ncclResult_t res1 = cb->fn(comm, cb); // Expect to reclaim memory of cb - if (res1 != ncclSuccess) result = res1; - } - NCCLCHECK(result); } + // Free coll tasks + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct != nullptr) { + struct ncclTaskColl* ct1 = ct->next; + free(ct->sendNetHandles); + free(ct->recvNetHandles); + free(ct->srecvNetHandles); + ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, ct); + ct = ct1; + } + // Free p2p tasks + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + while (pt != nullptr) { + struct ncclTaskP2p* pt1 = pt->next; + ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, pt); + pt = pt1; + } + // Free proxy ops + struct ncclProxyOp* q = ncclIntruQueueHead(&plan->proxyOpQueue); + while (q != nullptr) { + struct ncclProxyOp* q1 = q->enqNext; + if (q->ringAlgo && q->ringAlgo->decRefCount() == 0) delete q->ringAlgo; + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, q); + q = q1; + } + // Run other free callbacks + ncclResult_t result = ncclSuccess; + while (!ncclIntruQueueEmpty(&plan->cleanupQueue)) { + struct ncclCommCallback* cb = ncclIntruQueueDequeue(&plan->cleanupQueue); + ncclResult_t res1 = cb->fn(comm, cb); // Expect to reclaim memory of cb + if (res1 != ncclSuccess) result = res1; + } + NCCLCHECK(result); + // Free plan struct ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); return ncclSuccess; } @@ -1509,10 +1314,6 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { planner->persistent = persistent; int nPlans = 0; - // Poll for callbacks sent to us from other threads. Typically these free - // resources from to our memory pools. 
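Related to the reclaim path above (the proxy-op loop deletes q->ringAlgo only once its count drops to zero): the ring algorithm objects created in scheduleCollTasksToPlan are shared by reference count rather than owned by a single queue. A generic sketch of that incRefCount()/decRefCount() pattern, not NCCL's actual RingAlgorithm class:

    #include <atomic>
    #include <cstdio>

    struct RefCounted {
      std::atomic<int> refs{0};
      void incRefCount() { refs.fetch_add(1, std::memory_order_relaxed); }
      int decRefCount() { return refs.fetch_sub(1, std::memory_order_acq_rel) - 1; }
      virtual ~RefCounted() = default;
    };

    struct ToyRingAlgo : RefCounted {
      ~ToyRingAlgo() override { printf("ring algo freed\n"); }
    };

    int main() {
      ToyRingAlgo* algo = new ToyRingAlgo();
      algo->incRefCount();                         // reference held by the plan's proxy op
      algo->incRefCount();                         // a second holder, e.g. a copied op elsewhere
      if (algo->decRefCount() == 0) delete algo;   // count goes 2 -> 1: not freed yet
      if (algo->decRefCount() == 0) delete algo;   // count goes 1 -> 0: freed here
      return 0;
    }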
- NCCLCHECK(ncclCommPollCallbacks(comm, /*waitSome=*/false)); - if (planner->nTasksColl + planner->nTasksP2p != 0) { do { memset(&planner->wipPlan, 0, sizeof(planner->wipPlan)); @@ -1577,7 +1378,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { } NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); - if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking) { + if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) { // We have to launch host tasks to push proxy args. We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; @@ -1587,6 +1388,8 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { acquired = true; NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); } + if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs); + plan->isHostCbEnq = true; NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); } } @@ -1602,6 +1405,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } + failure: return result; } @@ -1694,7 +1498,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { - if (!(plan->persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking)) { + if (!(plan->persistent || ncclCudaLaunchBlocking || plan->isHostCbEnq)) { // We are not using the host stream for proxy ops and reclaimation submission. NCCLCHECK(hostStreamPlanTask(comm, plan)); } else { @@ -1778,8 +1582,7 @@ static void initCollCostTable(float** collCostTable) { static ncclResult_t updateCollCostTable( struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, int collNetSupport, int nvlsSupport, int numPipeOps, - float** collCostTable, int* backupAlgo, int* backupProto, float* backupTime - ) { + float** collCostTable) { float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; if (comm->nRanks == 1) { @@ -1799,16 +1602,12 @@ static ncclResult_t updateCollCostTable( if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue; for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) { - NCCLCHECK(ncclTopoGetAlgoTime(comm, info->func, a, p, nBytes, numPipeOps, &time, &backup)); - if (!backup) { - table[a][p] = time; - } else { - if (time >= 0.0 && time < *backupTime) { - *backupAlgo = a; - *backupProto = p; - *backupTime = time; + NCCLCHECK(ncclTopoGetAlgoTime(comm, info->func, a, p, nBytes, numPipeOps, &table[a][p])); + // Relegate fp8 reduction trees of sufficient depth that they incur precision loss + // to be least preferred. + if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) { + if (a == NCCL_ALGO_RING && comm->nRanks > 8) { + table[a][p] *= 1024.0; // Any factor large enough to act as a partition between lossy and non-lossy algos. 
} } } @@ -1819,7 +1618,7 @@ static ncclResult_t updateCollCostTable( static ncclResult_t topoGetAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, - float** collCostTable, int backupAlgo, int backupProto, float backupTime, ncclSimInfo_t* simInfo + float** collCostTable, ncclSimInfo_t* simInfo ) { float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; @@ -1844,15 +1643,19 @@ static ncclResult_t topoGetAlgoInfo( // Yes, we are first assigning and then testing if protocol is sane, but that's OK in this case. // coverity[check_after_sink] if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { - if (backupAlgo == NCCL_ALGO_UNDEF || backupProto == NCCL_PROTO_UNDEF) { - WARN("Error : no algorithm/protocol available"); - return ncclInternalError; + char ncclAlgoEnvStr[1024] = ""; + char ncclProtoEnvStr[1024] = ""; + char* algoEnv = getenv("NCCL_ALGO"); + if (algoEnv) { + snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv); } - info->algorithm = backupAlgo; - info->protocol = backupProto; - time = backupTime; + char* protoEnv = getenv("NCCL_PROTO"); + if (protoEnv) { + snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv); + } + WARN("Error : no algorithm/protocol available for function %s with datatype %s.%s%s", ncclFuncToString(info->func), ncclDatatypeToString(info->datatype), ncclAlgoEnvStr, ncclProtoEnvStr); + return (algoEnv || protoEnv) ? ncclInvalidUsage : ncclInternalError; } - if (comm->rank == 0) INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %d proto %d time %f", ncclFuncToString(info->func), nBytes, info->algorithm, info->protocol, time); if (simInfo) simInfo->estimatedTime = time; TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", nBytes, info->algorithm, info->protocol, time); @@ -1913,19 +1716,24 @@ static ncclResult_t getAlgoInfo( info->algorithm = NCCL_ALGO_UNDEF; info->protocol = NCCL_PROTO_UNDEF; int nMaxChannels = 0; - int backupAlgo = NCCL_ALGO_UNDEF; - int backupProto = NCCL_PROTO_UNDEF; - float backupTime = 3600000000.0; float collCostTable[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; initCollCostTable((float **)collCostTable); - NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable, &backupAlgo, &backupProto, &backupTime)); + NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable)); if (comm->tuner != NULL) { + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); + struct ncclReg* regSendBuf; + struct ncclReg* regRecvBuf; + NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, ®SendBuf)); + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, ®RecvBuf)); + int regBuff = ((regSendBuf && regRecvBuf) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister())); NCCLCHECK(comm->tuner->getCollInfo( comm->tunerContext, info->func, nBytes, numPipeOps, (float **)collCostTable, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, - &nMaxChannels)); + regBuff, &nMaxChannels)); } - NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, backupAlgo, backupProto, backupTime, simInfo)); + NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo)); info->nMaxChannels = nMaxChannels == 0 ? 
info->nMaxChannels : nMaxChannels; return ncclSuccess; } @@ -1975,37 +1783,7 @@ static ncclResult_t calcCollChunking( } int nstepsPerLoop, nchunksPerLoop; - switch (pattern) { - case ncclPatternTreeUp: - case ncclPatternTreeDown: - case ncclPatternTreeUpDown: - case ncclPatternPatUp: - case ncclPatternPatDown: - case ncclPatternPipelineFrom: - case ncclPatternPipelineTo: - case ncclPatternCollnetChain: - nstepsPerLoop = nchunksPerLoop = 1; - break; - case ncclPatternNvls: - nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; - break; - case ncclPatternCollnetDirect: - nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].collnetDirect.nHeads; - break; - case ncclPatternRing: - nstepsPerLoop = comm->nRanks-1; nchunksPerLoop = comm->nRanks; - break; - case ncclPatternRingTwice: - nstepsPerLoop = 2*(comm->nRanks-1); nchunksPerLoop = comm->nRanks; - break; - case ncclPatternNvlsTree: - nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; - break; - default: - WARN("Unknown pattern %d", pattern); - return ncclInternalError; - } - + size_t loopOffset = 0; int stepSize = comm->buffSizes[info->protocol]/NCCL_STEPS; int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1; int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1; @@ -2066,22 +1844,60 @@ static ncclResult_t calcCollChunking( // Compute directFlags of work struct. if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { // Set direct direction for broadcast-gather (read or write) - *outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; + *outDirectFlags = (nBytes/nChannels <= 1024 * 4) ? NCCL_P2P_READ : NCCL_P2P_WRITE; } else { *outDirectFlags = 0; } // Compute nSteps for proxies - //if (comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->func, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); chunkSize = chunkSize / grainSize * grainSize; // align chunkSize to multiple grainSize - int nLoops = (int)DIVUP(nBytes, size_t(nChannels)*nchunksPerLoop*chunkSize); + switch (pattern) { + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: + case ncclPatternPatUp: + case ncclPatternPatDown: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + case ncclPatternCollnetChain: + nstepsPerLoop = nchunksPerLoop = 1; + break; + case ncclPatternNvls: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; + loopOffset = nChannels * chunkSize * comm->channels[0].nvls.headRank; + break; + case ncclPatternCollnetDirect: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].collnetDirect.nHeads; + loopOffset = nChannels * chunkSize * comm->channels[0].collnetDirect.headRank; + break; + case ncclPatternRing: + nstepsPerLoop = comm->nRanks-1; nchunksPerLoop = comm->nRanks; + break; + case ncclPatternRingTwice: + nstepsPerLoop = 2*(comm->nRanks-1); nchunksPerLoop = comm->nRanks; + break; + case ncclPatternNvlsTree: + nstepsPerLoop = 1; nchunksPerLoop = comm->channels[0].nvls.nHeads; + break; + default: + WARN("Unknown pattern %d", pattern); + return ncclInternalError; + } + + // Compute nSteps for proxies + size_t loopSize = size_t(nChannels)*nchunksPerLoop*chunkSize; + int nLoops = (int)DIVUP(nBytes, loopSize); memset(proxyOp, 0, sizeof(*proxyOp)); proxyOp->nsteps = nstepsPerLoop * nLoops * chunkSteps; proxyOp->sliceSteps = sliceSteps; 
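To make the proxy step accounting in this hunk concrete: loopSize is the number of bytes all channels move per loop, nLoops rounds the message up to whole loops, and nsteps multiplies in the per-loop step count and chunk granularity (the slice size set just below follows the same chunk arithmetic). A worked sketch for a hypothetical ring allreduce (pattern ncclPatternRingTwice, so nstepsPerLoop = 2*(nRanks-1) and nchunksPerLoop = nRanks); all numbers are invented and DIVUP is a local stand-in:

    #include <cstdio>
    #include <cstddef>

    #define DIVUP(x, y) (((x) + (y) - 1) / (y))

    int main() {
      const int nRanks = 8, nChannels = 4;
      const size_t nBytes = 64 << 20;                // 64 MiB message
      const int chunkSteps = 4, sliceSteps = 2;
      const size_t chunkSize = 1 << 20;              // 1 MiB chunks
      const int nstepsPerLoop = 2 * (nRanks - 1);    // ring, twice around
      const int nchunksPerLoop = nRanks;
      size_t loopSize = (size_t)nChannels * nchunksPerLoop * chunkSize;   // 32 MiB per loop
      int nLoops = (int)DIVUP(nBytes, loopSize);                          // 2
      int nsteps = nstepsPerLoop * nLoops * chunkSteps;                   // 14 * 2 * 4 = 112
      size_t sliceSize = chunkSize / chunkSteps * sliceSteps;             // 512 KiB
      printf("loopSize=%zu nLoops=%d nsteps=%d sliceSize=%zu\n", loopSize, nLoops, nsteps, sliceSize);
      return 0;
    }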
proxyOp->chunkSteps = chunkSteps; proxyOp->chunkSize = chunkSize; + proxyOp->sliceSize = chunkSize / chunkSteps * sliceSteps; + proxyOp->loopSize = loopSize; + proxyOp->loopOffset = loopOffset; proxyOp->protocol = info->protocol; proxyOp->dtype = info->datatype; + proxyOp->algorithm = info->algorithm; if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { proxyOp->redOp = ncclSum; // Network sees avg as sum } else { @@ -2090,17 +1906,50 @@ static ncclResult_t calcCollChunking( proxyOp->pattern = pattern; proxyOp->coll = info->func; proxyOp->root = info->root; + proxyOp->isOneRPN = comm->isOneRPN; // This is used by P2P to reduce the receive buffer size. We don't use it in collectives // because some protocols need to transmit more than the total size, plus they sometimes // round up proxyOp->nbytes = stepSize*sliceSteps; - if (info->regBufType == NCCL_COLLNET_REG_BUFFER) { + if (info->regBufType & NCCL_NET_REG_BUFFER) { proxyOp->reg = 1; - proxyOp->nsteps = DIVUP(nBytes, NCCL_MAX_COLLNET_SIZE); - proxyOp->sendMhandle = info->sendMhandle; + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT || info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) { + if (proxyOp->isOneRPN) { + proxyOp->nsteps = 1; + proxyOp->loopOffset = 0; + proxyOp->sendbuff = (uint8_t*)info->sendbuff; + proxyOp->sendMhandle = info->sendMhandle; + } else { + if (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) { + proxyOp->nbytes = nBytes / nchunksPerLoop; + proxyOp->loopSize = proxyOp->loopSize / nchunksPerLoop; + proxyOp->loopOffset = 0; + if (info->func == ncclFuncAllGather) { + proxyOp->sendbuff = (uint8_t*)info->sendbuff; + proxyOp->sendMhandle = info->sendMhandle; + } + } else { + proxyOp->sendbuff = (uint8_t*)info->recvbuff; + proxyOp->sendMhandle = info->recvMhandle; + } + } + } else if (info->algorithm == NCCL_ALGO_RING) { + if (proxyOp->isOneRPN && info->func == ncclFuncAllGather) { + proxyOp->chunkSize = NCCL_MAX_NET_SIZE; + proxyOp->sliceSize = NCCL_MAX_NET_SIZE; + proxyOp->chunkSteps = 1; + proxyOp->sliceSteps = 1; + proxyOp->loopSize = size_t(nChannels) * nchunksPerLoop * proxyOp->chunkSize; + proxyOp->nsteps = DIVUP(nBytes, proxyOp->loopSize) * nstepsPerLoop; + proxyOp->loopOffset = 0; + } + } else { + WARN("Net registration invalid algorithm %s", ncclAlgoToString(info->algorithm)); + return ncclInternalError; + } + proxyOp->recvMhandle = info->recvMhandle; - proxyOp->sendbuff = (uint8_t*)info->sendbuff; proxyOp->recvbuff = (uint8_t*)info->recvbuff; proxyOp->nbytes = nBytes; } else { @@ -2119,7 +1968,7 @@ static ncclResult_t calcCollChunking( proxyOp->nbytes = DIVUP(nBytes, nChannels); } - *outChunkSize = chunkSize; + *outChunkSize = proxyOp->chunkSize; return ncclSuccess; } @@ -2130,10 +1979,13 @@ static ncclResult_t hostToDevRedOp( int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64; - half f16; float f32; double f64; + __half f16; float f32; double f64; #if defined(__CUDA_BF16_TYPES_EXIST__) __nv_bfloat16 bf16; #endif + #if defined(__CUDA_FP8_TYPES_EXIST__) + __nv_fp8_storage_t f8; + #endif void *ptr; }; u64 = 0; @@ -2144,7 +1996,8 @@ static ncclResult_t hostToDevRedOp( if (nbits <= 0) return ncclInvalidArgument; uint64_t allBits = uint64_t(-1)>>(64-nbits); uint64_t signBit = allBits^(allBits>>1); - + bool datatype_signed = false; + switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; @@ -2162,10 +2015,22 @@ static ncclResult_t hostToDevRedOp( case 
ncclAvg: switch ((int)datatype) { case ncclInt8: case ncclInt32: case ncclInt64: + datatype_signed = true; + // no break, we want to fall through... case ncclUint8: case ncclUint32: case ncclUint64: opFull->op = ncclDevSumPostDiv; - u64 = comm->nRanks; + u64 = comm->nRanks<<1 | datatype_signed; break; + #if defined(__CUDA_FP8_TYPES_EXIST__) + case ncclFloat8e4m3: + opFull->op = ncclDevPreMulSum; + f8 = __nv_cvt_float_to_fp8(float(1.0/comm->nRanks), __NV_SATFINITE, __NV_E4M3); + break; + case ncclFloat8e5m2: + opFull->op = ncclDevPreMulSum; + f8 = __nv_cvt_float_to_fp8(float(1.0/comm->nRanks), __NV_SATFINITE, __NV_E5M2); + break; + #endif case ncclFloat16: opFull->op = ncclDevPreMulSum; f16 = __float2half(float(1.0/comm->nRanks)); // __double2half not supported pre CUDA 11.x @@ -2257,6 +2122,13 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { // Empty collectives can be discarded. if (info->count == 0) return ncclSuccess; + if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) { + if (comm->minCompCap < 90) { + WARN("FP8 reduction support begins with sm90 capable devices."); + return ncclInvalidArgument; + } + } + // Copy reduction op state from op handle into info struct here since the // op handle may be destroyed before ncclGroupEnd(). struct ncclDevRedOpFull opDev; diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 999312a0d..6e9356826 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -248,11 +248,31 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0); int ncclTopoUserP2pLevel = -1; -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) { +ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, + int* p2p, int *read, int* intermediateRank) { + int mnnvl = 0; + struct ncclPeerInfo* info1 = NULL; + struct ncclPeerInfo* info2 = NULL; *p2p = 0; if (read) *read = 0; if (intermediateRank) *intermediateRank = -1; + // Rule out different nodes / isolated containers + if (comm) { + info1 = comm->peerInfo+rank1; + info2 = comm->peerInfo+rank2; + if (info1->hostHash != info2->hostHash) { + if (comm->MNNVL) { + NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, &mnnvl)); + if (!mnnvl) return ncclSuccess; + } else { + return ncclSuccess; + } + } else if (info1->shmDev != info2->shmDev) { + return ncclSuccess; + } + } + // Get GPUs from topology int g1, g2; NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1)); @@ -297,7 +317,8 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank if (*p2p == 1) { // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to // validate against NVML at all since they are pretending to be on other hw. 
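Returning to the ncclAvg handling in hostToDevRedOp earlier in this hunk: for integer types the scalar passed to ncclDevSumPostDiv now packs the rank count together with a signedness bit (comm->nRanks<<1 | datatype_signed) instead of just the rank count. A small sketch of that encoding and of how a consumer can presumably unpack it; the variable names here are invented:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t nRanks = 8;
      const bool datatypeSigned = true;                                // e.g. ncclInt32
      uint64_t scalarArg = (nRanks << 1) | (datatypeSigned ? 1 : 0);   // == 17
      uint64_t divisor = scalarArg >> 1;                               // 8 ranks to divide by
      bool isSigned = (scalarArg & 1) != 0;                            // true
      printf("scalarArg=%llu divisor=%llu signed=%d\n",
             (unsigned long long)scalarArg, (unsigned long long)divisor, (int)isSigned);
      return 0;
    }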
- if (g1 != g2 && ncclParamIgnoreDisabledP2p() != 2) { + if (g1 != g2 && (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash && + info1->hostHash == info2->hostHash)) && ncclParamIgnoreDisabledP2p() != 2) { int indexes[3] = {-1,-1,-1}; int verticeN = 0; NCCLCHECK(ncclNvmlEnsureInitialized()); @@ -356,14 +377,14 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int64_t netId, int read, int* useGdr) { +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) { *useGdr = 0; // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); struct ncclTopoNode* net = system->nodes[NET].nodes+n; - NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Check that both the NIC and GPUs support it @@ -404,12 +425,32 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6 distance = proxyGpu->paths[NET][n].type; } if (distance > netGdrLevel) { - INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %lx (distance %d > %d)", busId, netId, distance, netGdrLevel); + INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); return ncclSuccess; } *useGdr = 1; - INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %lx / HCA %lx (distance %d <= %d), read %d", busId, netId, distance, netGdrLevel, read); + INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read); + return ncclSuccess; +} + +ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) { + int netNum = system->nodes[NET].count; + int useGdr = 0; + *avail = false; + for (int n = 0; n < netNum; n++) { + int64_t netId = system->nodes[NET].nodes[n].id; + NCCLCHECK(ncclTopoCheckGdr(system, rank, netId, 1, &useGdr)); + if (useGdr) { + *avail = true; + break; + } + NCCLCHECK(ncclTopoCheckGdr(system, rank, netId, 0, &useGdr)); + if (useGdr) { + *avail = true; + break; + } + } return ncclSuccess; } @@ -417,12 +458,17 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int6 NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0); // Determine whether we need to flush the GDR recv buffers -ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush) { +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) { + *flush = 1; + ncclNetProperties_t props; + NCCLCHECK(comm->ncclNet->getProperties(netDev, &props)); + if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess; int g; - NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); + struct ncclTopoSystem* system = comm->topo; + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier - *flush = gpu->gpu.cudaCompCap < 90 ? 
1 : ncclParamNetForceFlush(); + if (gpu->gpu.cudaCompCap >= 90) *flush = 0; return ncclSuccess; } @@ -516,7 +562,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank)); if (proxyRank == comm->rank) continue; int useGdr; - NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->busId, netId, 1, &useGdr)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr)); if (useGdr == 0) continue; int found = 0; for (int r=0; rnodes[GPU].count; g++) { for (int p=0; pnodes[GPU].count; p++) { int p2p; - NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].gpu.rank, system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL)); + NCCLCHECK(ncclTopoCheckP2p(comm, system, system->nodes[GPU].nodes[p].gpu.rank, + system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL)); if (p2p == 0) { // Divert all traffic through the CPU int cpu; @@ -618,7 +665,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm if (gpu->paths[NET][n].type < PATH_PHB) { // Update path when we dont want to / can't use GPU Direct RDMA. int gdr; - NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr)); + NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU int localCpu; diff --git a/src/graph/search.cc b/src/graph/search.cc index ad6f58054..9b72ac160 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -1142,7 +1142,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr offset = strlen(line); } if (system->nodes[NET].count > 0) { - sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c])); + sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1])); offset = strlen(line); } INFO(NCCL_GRAPH, "%s", line); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 9771ae05c..d758ac989 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -296,7 +296,7 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN NCCLCHECK(ncclTopoPrintRec(link->remNode, node, line, nextOffset)); } else { if (link->remNode->type == NET) { - sprintf(line+nextOffset, "%s/%lx-%lx (%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); + sprintf(line+nextOffset, "%s/%lx-%lx (%d/%lx/%d/%f)", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id), link->remNode->net.collSupport, link->remNode->net.asic, link->remNode->net.port, link->remNode->net.bw); } else { sprintf(line+nextOffset, "%s/%lx-%lx", topoNodeTypeStr[link->remNode->type], NCCL_TOPO_ID_SYSTEM_ID(link->remNode->id), NCCL_TOPO_ID_LOCAL_ID(link->remNode->id)); } @@ -383,6 +383,7 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s if (strcmp(xmlNet->name, "net") != 0) continue; int index; NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index)); + // This means that the "dev" attribute wasn't set on this net xml node. 
That means it should not be added to the system topology graph if (index == -1) continue; NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic, systemId)); } @@ -403,7 +404,7 @@ struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, { NULL, 60 /* Default fallback */ } }; // x100 Mbps per lane -ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId) { +ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* system, struct ncclTopoNode* parent, int systemId, int numaId) { const char* str; int type; @@ -430,9 +431,9 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s if (xmlNic != NULL) { type = NIC; // Ignore sub device ID and merge multi-port NICs into one PCI device. - busId &= 0xfffffffffffffff0; struct ncclTopoNode* nicNode = NULL; - int64_t id = NCCL_TOPO_ID(systemId, busId); + int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, busId); + int64_t id = NCCL_TOPO_ID(systemId, localNicId); NCCLCHECK(ncclTopoGetNode(system, &nicNode, type, id)); if (nicNode == NULL) { NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, id)); @@ -453,7 +454,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s for (int s=0; s<xmlPci->nSubs; s++) { struct ncclXmlNode* xmlSubPci = xmlPci->subs[s]; if (strcmp(xmlSubPci->name, "pcilink") != 0) { // PCI links will be added later - NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId)); + NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node, systemId, numaId)); } } } @@ -520,12 +521,14 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s } for (int s=0; s<xmlCpu->nSubs; s++) { struct ncclXmlNode* node = xmlCpu->subs[s]; - if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId)); + if (strcmp(node->name, "pci") == 0) NCCLCHECK(ncclTopoAddPci(node, system, cpu, systemId, numaId)); if (strcmp(node->name, "nic") == 0) { struct ncclTopoNode* nic = NULL; - NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, 0)); + int64_t localNicId = NCCL_TOPO_LOCAL_NIC_ID(numaId, 0); + int64_t id = NCCL_TOPO_ID(systemId, localNicId); + NCCLCHECK(ncclTopoGetNode(system, &nic, NIC, id)); if (nic == NULL) { - NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, NCCL_TOPO_ID(systemId, 0))); + NCCLCHECK(ncclTopoCreateNode(system, &nic, NIC, id)); NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_BW)); NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_BW)); } @@ -725,14 +728,528 @@ ncclResult_t ncclTopoRefreshBcmP2pLinks(void) { return ncclSuccess; } -ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system) { +// This is just checking for direct descendence +int ncclTopoCheckPix(ncclXmlNode* common, ncclXmlNode** nodes, int nNodes) { + const char* tempBusId; + // If the common parent isn't a pci switch, then this isn't PIX + NCCLCHECK(xmlGetAttrStr(common, "busid", &tempBusId)); + if (tempBusId == NULL) return 0; + TRACE(NCCL_GRAPH, "Checking pix for busid=%s", tempBusId); + + // All the nodes must have a "nic" which is a parent, and then a pci node (busid) which must be a child of the "common" + for (int i = 0; i < nNodes; i++) { + ncclXmlNode* node = nodes[i]; + if (strcmp(node->name, "net") == 0) { + node = 
node->parent; + if (node == NULL) return 0; + if (strcmp(node->name, "nic") == 0) { + node = node->parent; + if (node == NULL) return 0; + // All nodes must descend from the same first level pci switch + if (strcmp(node->name, "pci") == 0) { + TRACE(NCCL_GRAPH, "Comparing parent of node=%p to common=%p", node->parent, common); + if (node->parent != common) return 0; + } + } + } + } + + return 1; +} + +#define NCCL_TOPO_XML_DEPTH_MAX 256 +typedef struct xmlNodeStack { + ncclXmlNode* elems[NCCL_TOPO_XML_DEPTH_MAX]; + int tail; + + ncclXmlNode* top() { + if (!empty()) { + return elems[tail - 1]; + } else { + return NULL; + } + } + + ncclXmlNode* pop() { + ncclXmlNode* node = top(); + if (node) { + tail--; + } + return node; + } + + void push(ncclXmlNode* node) { + if (tail < NCCL_TOPO_XML_DEPTH_MAX) { + elems[tail++] = node; + } + } + + bool empty() { + return tail == 0; + } + +} xmlNodeStack; + +// 1. Find the common parent xmlNode between the given set of nodes +ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) { + // Track a stack of parents per-net node being merged + xmlNodeStack* parents; + NCCLCHECK(ncclCalloc(&parents, nNodes)); + // Find the common parent + ncclXmlNode* common = NULL; + + if (nNodes == 1) { + common = nodes[0]; + *path = PATH_LOC; + goto out; + } + + for (int i = 0; i < nNodes; i++) { + ncclXmlNode* temp; + temp = nodes[i]; + while (temp) { + parents[i].push(temp); + temp = strcmp(temp->name, "system") == 0 ? NULL : temp->parent; + } + } + + common = NULL; + int c; + c = 1; + while (c && !parents[0].empty()) { + ncclXmlNode* temp = parents[0].top(); + for (int i = 1; i < nNodes; i++) { + if (!parents[i].empty()) { + c &= (temp == parents[i].top()); + } else { + c = 0; + break; + } + } + + if (c) { + common = temp; + if (common == NULL) TRACE(NCCL_GRAPH, "COMMON IS NULL"); + for (int i = 0; i < nNodes; i++) { + parents[i].pop(); + } + // Check multi-port while we still have the mismatched parents + // For multi-port to be true, all parents (peers) must have the busId attribute with all but the last character matching + } else { + int multiPort = 1; + const char* tempBusId; + + NCCLCHECK(xmlGetAttr(temp, "busid", &tempBusId)); + if (tempBusId) { + for (int i = 1; i < nNodes; i++) { + if (!parents[i].empty()) { + const char* busId; + NCCLCHECK(xmlGetAttr(parents[i].top(), "busid", &busId)); + if (busId) { + if (strlen(busId) != strlen(tempBusId)) { + multiPort = 0; + break; + } + if (strncmp(busId, tempBusId, strlen(busId)-1) != 0) { + multiPort = 0; + break; + } + } else { + multiPort = 0; + break; + } + } + } + } else { + multiPort = 0; + } + + if (multiPort) { + *path = PATH_PORT; + goto out; + } + } + } + + if (common == NULL) { + *path = PATH_DIS; + } else if (strcmp(common->name,"system") == 0) { + *path = PATH_SYS; + } else if (strcmp(common->name, "cpu") == 0) { + *path = PATH_PHB; + } else if (strcmp(common->name, "nic") == 0) { + *path = PATH_PORT; + } else if (strcmp(common->name, "net") == 0) { + *path = PATH_PORT; + } else if (ncclTopoCheckPix(common, nodes, nNodes)) { + *path = PATH_PIX; + } else { + *path = PATH_PXB; + } + +out: + *parent = common; + free(parents); + return ncclSuccess; +} + +ncclResult_t ncclTopoMakeUniqueBusId(struct ncclXml* xml, char* busId, struct ncclXmlNode** pciNode, struct ncclXmlNode* parent) { + int i = 0; + int64_t rBusId; + NCCLCHECK(busIdToInt64(busId, &rBusId)); + // Try to find an unused busid - NCCL expects leaf busid to be unique + while (i < 100) { + rBusId++; + 
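The ncclTopoGetPath logic above classifies how a set of net nodes relate by walking their XML ancestor chains: each node pushes its parents up to the system root onto a stack, the stacks are popped in lockstep while all tops still agree, and the last node they agreed on is the common parent, whose type (system, cpu, first-level pci switch, nic, ...) maps to PATH_SYS, PATH_PHB, PATH_PIX, PATH_PORT and so on. A generic common-ancestor sketch of that walk, using invented node types rather than NCCL's xml structures:

    #include <vector>
    #include <string>
    #include <cstdio>

    struct Node { std::string type; Node* parent; };

    static std::vector<Node*> ancestors(Node* n) {
      std::vector<Node*> chain;                // plays the role of the xmlNodeStack above
      for (; n; n = n->parent) chain.push_back(n);
      return chain;                            // leaf first, root ("system") last
    }

    int main() {
      Node system{"system", nullptr}, cpu{"cpu", &system};
      Node pciSwitch{"pci", &cpu}, nic0{"net", &pciSwitch}, nic1{"net", &pciSwitch};
      std::vector<Node*> a = ancestors(&nic0), b = ancestors(&nic1);
      Node* common = nullptr;
      while (!a.empty() && !b.empty() && a.back() == b.back()) {   // pop in lockstep from the root end
        common = a.back(); a.pop_back(); b.pop_back();
      }
      printf("common parent type: %s\n", common ? common->type.c_str() : "none");   // prints "pci"
      return 0;
    }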
TRACE(NCCL_GRAPH, "Trying to make new busId %lx", rBusId); + int64ToBusId(rBusId, busId); + struct ncclXmlNode* temp = NULL; + NCCLCHECK(xmlFindTagKv(xml, "pci", &temp, "busid", busId)); + if (temp == NULL) { + NCCLCHECK(xmlAddNode(xml, parent, "pci", pciNode)); + NCCLCHECK(xmlSetAttr(*pciNode, "busid", busId)); + TRACE(NCCL_GRAPH, "Made new busId %lx", rBusId); + return ncclSuccess; + } + TRACE(NCCL_GRAPH, "Conflicting busId %lx", rBusId); + i++; + } + + WARN("TOPO/NET : Couldn't generate unique busId after %d tries", i); + return ncclInternalError; +} + +ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** parent, struct ncclXmlNode* physNetNode) { + struct ncclXmlNode* newBusId = NULL; + struct ncclXmlNode* pci = physNetNode->parent; + if (pci) { + pci = pci->parent; + if (pci) { + if (strcmp(pci->name, "pci") == 0) { + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + memset(busId, 0, sizeof(busId)); + const char* originalBusId; + // Seed busId with the current NIC 0's busId to make discovering a unique hash quicker + NCCLCHECK(xmlGetAttrStr(pci, "busid", &originalBusId)); + snprintf(busId, sizeof(busId), "%s", originalBusId); + NCCLCHECK(ncclTopoMakeUniqueBusId(xml, busId, &newBusId, *parent)); + for (int i = 0; i < pci->nAttrs; i++) { + NCCLCHECK(xmlSetAttr(newBusId, pci->attrs[i].key, pci->attrs[i].value)); + } + NCCLCHECK(xmlSetAttr(newBusId, "busid", busId)); + *parent = newBusId; + } + } + } + + if (newBusId == NULL) { + const char* name; + NCCLCHECK(xmlGetAttr(physNetNode, "name", &name)); + WARN("TOPO/NET : Can't find busId of child 0 %s", name); + return ncclInternalError; + } + + return ncclSuccess; +} + +ncclResult_t ncclTopoMakeVnic(ncclComm_t comm, struct ncclXml* xml, ncclNetVDeviceProps_t* vProps, +struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { + if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { + WARN("TOPO/NET : Tried to merge too many NICs. %d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC); + return ncclInternalError; + } + + // Trigger the merge, then get the new device's properties + int vDevIndex = 0; + ncclResult_t ret = makeVDevice(&vDevIndex, vProps); + if (ret == ncclInvalidUsage) { + WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC"); + NCCLCHECK(ret); + } + + INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex); + return ncclSuccess; +} + +ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { + INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); + char* semi_token; + char* semi = strtok_r(str, ";", &semi_token); + while (semi) { + TRACE(NCCL_NET, "Fusing %s", semi); + struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC]; + int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC); + if (nUserIfs == 0) { + INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. 
Please provide a semicolon-delimited list of comma-delimited NIC groups.", + str, semi); + continue; + } + + ncclNetVDeviceProps_t vProps = {0}; + for (int d = 0; d < nPhysDevs; d++) { + if (matchIfList(propsList[d].name, propsList[d].port, userIfs, nUserIfs, 1)) { + vProps.devs[vProps.ndevs++] = d; + } + } + + if (vProps.ndevs != nUserIfs) { + WARN("TOPO/NET : Only matched %d devices, %d requested from %s", + vProps.ndevs, nUserIfs, semi); + return ncclInvalidUsage; + } + + if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { + WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC); + return ncclInvalidUsage; + } + + struct ncclXmlNode* netNode; + NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice)); + + // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) + for (int i = 0; i < vProps.ndevs; i++) { + placedDevs[vProps.devs[i]] = 1; + } + + semi = strtok_r(NULL, ";", &semi_token);; + } + + return ncclSuccess; +} + +ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { + // Compute the path type between each device + int* paths = NULL; + ncclResult_t res = ncclSuccess; + ncclCalloc(&paths, nPhysDevs*nPhysDevs); + TRACE(NCCL_GRAPH, "Allocated %d paths", nPhysDevs*nPhysDevs); + for (int i = 0; i < nPhysDevs; i++) { + for (int j = 0; j < nPhysDevs; j++) { + struct ncclXmlNode* nodes[2]; + nodes[0] = physNetNodes[i]; + nodes[1] = physNetNodes[j]; + struct ncclXmlNode* parent; + NCCLCHECKGOTO(ncclTopoGetPath(nodes, 2, &paths[i*nPhysDevs + j], &parent), res, out); + } + } + + // Place all remaining physical devices into a virtual device given the mergeLevel criteria + for (int i = 0; i < nPhysDevs; i++) { + // Select the first unplaced device "i" as the root + if (placedDevs[i] == 0) { + // Init a new vDevice + ncclNetVDeviceProps_t vProps; + vProps = {0}; + vProps.devs[vProps.ndevs++] = i; + placedDevs[i] = 1; + TRACE(NCCL_GRAPH, "Placed dev %d", i); + + // Select each unplaced device "j" which is at most "mergeLevel" distance from "i", but not equal to "i" + // (Don't merge the same device with itself) + for (int j = 0; j < nPhysDevs; j++) { + if (paths[i*nPhysDevs + j] <= mergeLevel && + placedDevs[j] == 0 && j != i) { + vProps.devs[vProps.ndevs++] = j; + placedDevs[j] = 1; + TRACE(NCCL_GRAPH, "Placed dev %d path=%d", j, paths[i*nPhysDevs + j] ); + } + if (vProps.ndevs == NCCL_NET_MAX_DEVS_PER_NIC) break; + } + + if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { + WARN("TOPO/NET : Tried to merge too many NICs. 
%d > %d", vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC); + return ncclInternalError; + } + + struct ncclXmlNode* netNode; + NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out); + } + } + +out: + free(paths); + return res; +} + +struct kvDict nicPathKvList[] = { + { "LOC", PATH_LOC }, + { "PORT", PATH_PORT }, + { "PIX", PATH_PIX }, + { "PXB", PATH_PXB }, + { "PXN", PATH_PXN }, + { "PHB", PATH_PHB }, + { "SYS", PATH_SYS }, + { NULL, 0 } +}; + +ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclNetVDeviceProps_t* vProps, ncclXmlNode** parent) { + ncclNetProperties_t props[NCCL_NET_MAX_DEVS_PER_NIC]; + ncclXmlNode* physNetNodes[NCCL_NET_MAX_DEVS_PER_NIC]; + for (int i = 0; i < vProps->ndevs; i++) { + NCCLCHECK(getProperties(vProps->devs[i], props + i)); + struct ncclXmlNode* physNetNode; + NCCLCHECK(xmlFindTagKv(xml, "net", &physNetNode, "name", props[i].name)); + physNetNodes[i] = physNetNode; + TRACE(NCCL_GRAPH, "Re-found physical ncclNet node %d %s", i, props[i].name); + } + + int path = PATH_LOC; + NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent)); + if (path == PATH_LOC) { + *parent = NULL; + } else if (parent && strcmp((*parent)->name, "pci") == 0) { + // If the common parent is PCI, we must reparent the new NIC under a made up busId + NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); + } + TRACE(NCCL_GRAPH, "Selected parent %s with path %d", (*parent)->name, path); + return ncclSuccess; +} + +ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) { + int* placedDevs = NULL; + struct ncclXmlNode** physNetNodes = NULL; + if (physicalDevs == 0) return ncclSuccess; + + ncclCalloc(&physNetNodes, physicalDevs); + ncclResult_t res = ncclSuccess; + + ncclNetProperties_t* props = NULL; + ncclCalloc(&props, physicalDevs); + for (int i = 0; i < physicalDevs; i++) { + NCCLCHECKGOTO(getProperties(i, props + i), res, out); + struct ncclXmlNode* physNetNode; + NCCLCHECKGOTO(xmlFindTagKv(xml, "net", &physNetNode, "name", props[i].name), res, out); + physNetNodes[i] = physNetNode; + TRACE(NCCL_GRAPH, "Found physical ncclNet node %d %s", i, props[i].name); + } + + // By default, don't merge any devices + int mergeLevel; + mergeLevel = PATH_PORT; + char* mergeLevelEnv; + mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL"); + if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); + char* forceMerge; + forceMerge = getenv("NCCL_NET_FORCE_MERGE"); + NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); + memset(placedDevs, 0, sizeof(int)*physicalDevs); + + if (forceMerge) { + NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + } + NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + +out: + free(physNetNodes); + free(props); + if (placedDevs) free(placedDevs); + return res; +} + +static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int keep, int virtualNics) { + for (int n = startIndex; n < endIndex; n++) { + ncclNetProperties_t props; + NCCLCHECK(getProperties(n, &props)); + struct ncclXmlNode* netNode = NULL; + struct 
ncclXmlNode* parent = NULL; + if (virtualNics) { + struct ncclXmlNode* net = NULL; + NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name)); + // In the event of multithreaded use case, we need to re-discover the shared parent of the given devices for this vNIC + // Only run this if the net doesn't exist locally - this may alter the XML state + if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent)); + } + + NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent)); + + const char* colAttr; + NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); + + // If coll == 0 but the netNode is tagged as coll, don't update the keep value + if (colAttr == NULL || coll != 0 || strcmp(colAttr,"1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep)); + NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); + NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency)); + NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); + NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); + NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); + NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); + bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name); + NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); + // Only set coll if it's not 0 + if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll)); + + const char* keepAttr; + NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); + NCCLCHECK(xmlGetAttr(netNode, "keep", &keepAttr)); + INFO(NCCL_GRAPH, "ncclTopoPopulateNics : Filled %s in topo with pciPath=%s keep=%s coll=%s", + props.name, props.pciPath, keepAttr, colAttr); + } + + return ncclSuccess; +} + +struct ncclTopoNetState { + int nVirtualNics; + int nPhysicalNics; + const char* name; +}; + +// Calls to network plugin APIs should be protected. This function should be called inside a per-process lock. 
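ncclTopoAutoMerge above fuses physical NICs into one virtual NIC greedily: the first unplaced device becomes a group root, and every other unplaced device whose path type to that root is at most the merge level (PATH_PORT unless NCCL_NET_MERGE_LEVEL overrides it, with NCCL_NET_FORCE_MERGE bypassing the distance test entirely) joins the group, up to NCCL_NET_MAX_DEVS_PER_NIC devices. A toy sketch of that grouping policy; the path constants, the matrix and the device count here are invented for illustration:

    #include <cstdio>

    enum { PATH_LOC = 0, PATH_PORT = 1, PATH_PIX = 2, PATH_PXB = 3, PATH_PHB = 4, PATH_SYS = 5 };
    const int MAX_DEVS_PER_VNIC = 4;            // stand-in for NCCL_NET_MAX_DEVS_PER_NIC

    int main() {
      const int nDevs = 4;
      // Hypothetical pairwise paths: devs 0/1 are ports of one NIC, devs 2/3 share a PCI switch.
      int paths[nDevs][nDevs] = {
        {PATH_LOC,  PATH_PORT, PATH_SYS, PATH_SYS},
        {PATH_PORT, PATH_LOC,  PATH_SYS, PATH_SYS},
        {PATH_SYS,  PATH_SYS,  PATH_LOC, PATH_PIX},
        {PATH_SYS,  PATH_SYS,  PATH_PIX, PATH_LOC},
      };
      int mergeLevel = PATH_PORT;               // default: only fuse ports of the same NIC
      bool placed[nDevs] = {};
      for (int i = 0; i < nDevs; i++) {
        if (placed[i]) continue;
        int group[MAX_DEVS_PER_VNIC], n = 0;
        group[n++] = i; placed[i] = true;       // device i is the group root
        for (int j = 0; j < nDevs && n < MAX_DEVS_PER_VNIC; j++) {
          if (!placed[j] && j != i && paths[i][j] <= mergeLevel) { group[n++] = j; placed[j] = true; }
        }
        printf("vNIC:"); for (int k = 0; k < n; k++) printf(" dev%d", group[k]); printf("\n");
      }
      return 0;
    }

With mergeLevel left at PATH_PORT this prints one fused vNIC for devs 0/1 and keeps devs 2/3 separate; raising the level (e.g. NCCL_NET_MERGE_LEVEL=PIX) would fuse devs 2/3 as well.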
+static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName) { + int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL); + if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics)); + // Enumerate physical devices + NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 1, 0)); + if (!usePhysicalDevices) { + if (state->nVirtualNics == -1) { + NCCLCHECK(ncclTopoMakeVNics(comm, xml, makeVDevice, getProperties, state->nPhysicalNics)); + int nDevs; + NCCLCHECK(devices(&nDevs)); + state->nVirtualNics = nDevs - state->nPhysicalNics; + } + // Remove keep=1 for physical collnets + if (state->nVirtualNics > 0) { + NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 0, 0)); + // Populate new devices + NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, 1, 1)); + } + } + + return ncclSuccess; +} + +static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; +ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {}; +ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {}; +ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) { + INFO(NCCL_GRAPH, "Retrieving state for %s", name); + for (int i = 0; i < NCCL_NET_MAX_PLUGINS; i++) { + // Empty slot + if (states[i].name == NULL) { + states[i].nVirtualNics = -1; + states[i].nPhysicalNics = -1; + states[i].name = strdup(name); + *state = states + i; + INFO(NCCL_GRAPH, "Initialized state %d for %s", i, name); + return ncclSuccess; + // Found my slot + } else if (strcmp(states[i].name, name) == 0) { + *state = states + i; + return ncclSuccess; + } + } + WARN("NET/TOPO : Couldn't find net with name %s", name); + return ncclInternalError; +} + +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) { ncclResult_t ret = ncclSuccess; struct ncclXml* xml; char* mem = NULL; int* localRanks = NULL; - int netDevCount = 0; struct ncclXml* rankXml; int localRank = -1, nLocalRanks = 0; + int netLockHeld = 0; NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES)); const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE"); if (xmlTopoFile) { @@ -761,47 +1278,24 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECKGOTO(xmlSetAttrInt(node, "rank", comm->rank), ret, fail); NCCLCHECKGOTO(xmlInitAttrInt(node, "gdr", comm->peerInfo[comm->rank].gdrSupport), ret, fail); } + // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. + pthread_mutex_lock(&netLock); + netLockHeld = 1; + INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology"); + ncclTopoNetState* state; + state = NULL; if (collNetSupport(comm)) { - NCCLCHECKGOTO(collNetDevices(comm, &netDevCount), ret, fail); - for (int n=0; ndmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? 
"Enabled" : "Disabled", n, props.name); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "coll", 1), ret, fail); - } - } - if (netDevCount == 0) { - NCCLCHECKGOTO(comm->ncclNet->devices(&netDevCount), ret, fail); - } - for (int n=0; nncclNet->getProperties(n, &props), ret, fail); - comm->netDeviceType = props.netDeviceType; - struct ncclXmlNode* netNode; - NCCLCHECKGOTO(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode), ret, fail); - NCCLCHECKGOTO(xmlSetAttrInt(netNode, "keep", 1), ret, fail); - NCCLCHECKGOTO(xmlSetAttrInt(netNode, "dev", n), ret, fail); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "speed", props.speed), ret, fail); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "port", props.port), ret, fail); - NCCLCHECKGOTO(xmlInitAttrFloat(netNode, "latency", props.latency), ret, fail); - NCCLCHECKGOTO(xmlInitAttrUint64(netNode, "guid", props.guid), ret, fail); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "maxconn", props.maxComms), ret, fail); - bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", comm->ncclNet->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); - NCCLCHECKGOTO(xmlInitAttrInt(netNode, "gdr", gdrSupport), ret, fail); + NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile, state, + comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name), ret, fail); } + NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile, state, + comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name), ret, fail); + pthread_mutex_unlock(&netLock); + netLockHeld = 0; // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail); @@ -845,19 +1339,21 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy NCCLCHECKGOTO(ncclTopoFuseXml(xml, peerXml), ret, fail); } - xmlTopoFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE"); - if (xmlTopoFile && comm->rank == ncclParamTopoDumpFileRank()) { - INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", xmlTopoFile); - NCCLCHECKGOTO(ncclTopoDumpXmlToFile(xmlTopoFile, xml), ret, fail); + if (dumpXmlFile && comm->rank == ncclParamTopoDumpFileRank()) { + INFO(NCCL_ENV, "NCCL_TOPO_DUMP_FILE set by environment to %s", dumpXmlFile); + NCCLCHECKGOTO(ncclTopoDumpXmlToFile(dumpXmlFile, xml), ret, fail); } - NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + // Only update our topo tracking structure if we aren't dumping (separate steps) + if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + exit: if (!comm->MNNVL && localRanks) free(localRanks); if (mem) free(mem); free(xml); return ret; fail: + if (netLockHeld) pthread_mutex_unlock(&netLock); goto exit; } diff --git a/src/graph/topo.h b/src/graph/topo.h index 0837fb4b3..8e7cda5b4 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -78,6 +78,9 @@ extern const char* topoLinkTypeStr[]; // Connection through the network #define PATH_NET 8 +// New type of path which should precede PATH_PIX 
+#define PATH_PORT PATH_NVL + // Disconnected #define PATH_DIS 9 extern const char* topoPathTypeStr[]; @@ -106,6 +109,7 @@ struct ncclTopoLinkList { #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff #define NCCL_TOPO_ID_SYSTEM_ID(id) (id >> 56) #define NCCL_TOPO_ID_LOCAL_ID(id) (id & NCCL_TOPO_ID_LOCAL_ID_MASK) +#define NCCL_TOPO_LOCAL_NIC_ID(numaid, busid) (((int64_t)numaid << 56) + busid) #define NCCL_TOPO_ID(systemid, localid) (((int64_t)systemid << 56) + (localid & NCCL_TOPO_ID_LOCAL_ID_MASK)) struct ncclTopoNode { diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index f0a622452..f5f2e1185 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -31,23 +31,87 @@ static int getNthreads(const char* name, int env, int min, int max, int def) { return nt; } -ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* list) { - int def, set; - if (str[0] == '^') { - def = 1; set = 0; str++; - } else { - def = 0; set = 1; - } - for (int i=0; i= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX; - int cpuArch, cpuVendor, cpuModel; - NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type - int index1 = nNodes == 1 ? compCapIndex : cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD ? 1 : 0; + int index1 = nNodes == 1 ? compCapIndex : + (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 1 : 0; double llMaxBw = llMaxBws[index1][index2]; double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2]; double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2]; double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2]; // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring - if (cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; @@ -190,7 +253,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw); if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; - if (a == NCCL_ALGO_PAT) busBw *= .85; + if (a == NCCL_ALGO_PAT) busBw *= .75; if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { @@ -226,10 +289,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom busBw *= ratio; } comm->bandwidths[coll][a][p] = busBw; - /* Ring bandwidth backup */ - if (a == NCCL_ALGO_RING) - comm->ringbdw[coll][p] = comm->bandwidths[coll][NCCL_ALGO_RING][p]; - comm->latencies[coll][a][p] = baseLat[a][p]; float intraLat = hwLat[intraHw[a]][a][p]; // With ppn=1 latencies are fully exposed, use the Tree network latency @@ -286,41 +345,78 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Protocols/Algorithms enable/disable, and user overrides. // All are enabled except ll128 which is enabled by default only in certain cases. - int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; - int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1, 1 }; + int protoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_PROTOCOLS]; + int algoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS]; + for (int f=0; fnNodes == 1) algoEnable[NCCL_ALGO_NVLS_TREE] = 0; - - // Disable CollNet if it is not supported - if (comm->collNetSupport == 0) { - algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; - algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0; - if (nNodes > 1) algoEnable[NCCL_ALGO_NVLS] = 0; - // If user has hard set NCCL_ALGO=COLLNET, ignore it - if (algoEnable[NCCL_ALGO_RING] == 0 && algoEnable[NCCL_ALGO_TREE] == 0 && - algoEnable[NCCL_ALGO_NVLS] == 0 && algoEnable[NCCL_ALGO_NVLS_TREE] == 0) { - algoEnable[NCCL_ALGO_RING] = algoEnable[NCCL_ALGO_TREE] = 1; + if (comm->rank == 0 && (algoStr||protoStr)) { + constexpr int strLength = 1024; + char funcAlgoProtoTuningStr[strLength]; + int offset = 0; + offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n Function | "); + for (int p=0; ptopo, &nvsCount)); + + for (int f=0; fnNodes == 1 && a == NCCL_ALGO_NVLS_TREE) disable = 1; + // Disable Collnet+Direct, Collnet+Chain or Collnet+NVLS if collnet is not supported. 
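+ // (NVLS is only disabled here for multi-node communicators; single-node NVLS does not depend on CollNet support.)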
+ if (comm->collNetSupport == 0 && + (a == NCCL_ALGO_COLLNET_DIRECT || + a == NCCL_ALGO_COLLNET_CHAIN || + (a == NCCL_ALGO_NVLS && comm->nNodes > 1))) disable = 1; + // Disable CollNet+Direct if not on an NVSwitch system + if (nvsCount == 0 && a == NCCL_ALGO_COLLNET_DIRECT) disable = 1; + if (disable) algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 0; } - } else { - // Disable CollNet+Direct if not on an NVSwitch system - int nvsCount = 0; - NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount)); - if (nvsCount == 0) algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; } for (int c=0; cbandwidths[c][a][p] = 0; - if (algoEnable[a] == 0) comm->bandwidths[c][a][p] = 0; - if (a == NCCL_ALGO_RING && pEnable == 0) comm->ringbdw[c][p] = 0; - } - - for (int c = 0; c < NCCL_NUM_FUNCTIONS; c++) { - bool available = false; - for (int a = 0; a < NCCL_NUM_ALGORITHMS; a++) - for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) - if (comm->bandwidths[c][a][p] != 0) { - available = true; - goto check_avail; - } - check_avail: - if (available == false) { - /* at least set ring algo available */ - for (int p = 0; p < NCCL_NUM_PROTOCOLS; p++) - comm->bandwidths[c][NCCL_ALGO_RING][p] = comm->ringbdw[c][p]; - } + if (algoEnable[c*NCCL_NUM_ALGORITHMS+a] == 0) comm->bandwidths[c][a][p] = 0; } if (comm->rank == 0) { - char line[1024]; + constexpr int lineLen = 1024; + char line[lineLen]; + int offset = 0; for (int block=0; block= NCCL_NUM_ALGORITHMS) continue; - sprintf(line+strlen(line), " %14s %14s %14s |", "", ncclAlgoStr[a], ""); + offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s %14s %14s |", "", ncclAlgoStr[a], ""); } INFO(NCCL_TUNING, "%s", line); - sprintf(line, " Protocol |"); + offset = snprintf(line, lineLen, " Protocol |"); for (int ba=0; ba<3; ba++) { for (int p=0; p= NCCL_NUM_ALGORITHMS) continue; for (int p=0; pmaxThreads[a][p]); + offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14d |", comm->maxThreads[a][p]); } } INFO(NCCL_TUNING, "%s", line); for (int c=0; c= NCCL_NUM_ALGORITHMS) continue; for (int p=0; platencies[c][a][p], comm->bandwidths[c][a][p]); + offset += snprintf(line+offset, std::max(0, lineLen-offset), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]); } } INFO(NCCL_TUNING, "%s", line); } } } - + // Set per-thread amount of work before we increase nThreads and nChannels for (int a=0; athreadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD; @@ -438,19 +519,10 @@ static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = { { .9, .9, .9, .9, .9, .9, .9, .8, .7, .6, .6, .5, .5, .5, .5, .6, .7, .8, .7, .7, .8, .9, .9 } }; -ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup) { +ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) { float bw = comm->bandwidths[coll][algorithm][protocol]; float lat = comm->latencies[coll][algorithm][protocol]; - if (backup) { - *backup = false; - if (algorithm == NCCL_ALGO_RING && bw == 0.0f) { - /* try back up RING algorithm */ - bw = comm->ringbdw[coll][protocol]; - *backup = true; - } - } - if (bw == 0) { *time = -1.0; return ncclSuccess; } diff --git a/src/graph/xml.cc b/src/graph/xml.cc index bb123b798..a41289389 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -17,6 +17,9 @@ #include #endif +// Arbitrarily large number for constructing virtual topology string +#define NCCL_MAX_XML_DEPTH 1024 + /*******************/ /* XML File 
Parser */ /*******************/ @@ -430,7 +433,7 @@ static ncclResult_t getBcmLinks(const char* busId, int* nlinks, char** peers) { ncclResult_t ncclTopoGetStrFromSys(const char* path, const char* fileName, char* strValue) { char filePath[PATH_MAX]; - sprintf(filePath, "%s/%s", path, fileName); + snprintf(filePath, sizeof(filePath), "%s/%s", path, fileName); int offset = 0; FILE* file; if ((file = fopen(filePath, "r")) != NULL) { @@ -883,7 +886,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl // where sysPath/subsystem points to. ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) { char subSysPath[PATH_MAX]; - sprintf(subSysPath, "%s/subsystem", sysPath); + snprintf(subSysPath, sizeof(subSysPath), "%s/subsystem", sysPath); char* path = realpath(subSysPath, NULL); if (path == NULL) { subSys[0] = '\0'; @@ -896,8 +899,9 @@ ncclResult_t ncclTopoGetSubsystem(const char* sysPath, char* subSys) { return ncclSuccess; } -ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode) { +ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode, struct ncclXmlNode* forceParent) { NCCLCHECK(xmlFindTagKv(xml, "net", netNode, "name", netName)); + if (*netNode != NULL) return ncclSuccess; const char* pciSysPath = pciPath; @@ -906,13 +910,15 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); // This is not a PCI device (virtual, usb, ...). if (strcmp(subSystem, "pci") != 0) { - INFO(NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem); + INFO(NCCL_NET|NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). Attaching to first CPU", pciSysPath, subSystem); pciSysPath = NULL; } } struct ncclXmlNode* parent = NULL; - if (pciSysPath) { + if (forceParent) { + parent = forceParent; + } else if (pciSysPath) { int offset; for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--); char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; diff --git a/src/graph/xml.h b/src/graph/xml.h index 0ee56790b..f06c0e68b 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -50,7 +50,7 @@ ncclResult_t ncclTopoGetXmlGraphFromFile(const char* xmlGraphFile, struct ncclXm /* Auto-detect functions */ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct ncclXmlNode** gpuNode); -ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode); +ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const char* netName, struct ncclXmlNode** netNode, struct ncclXmlNode* forceParent=NULL); /* Remove unneeded parts */ ncclResult_t ncclTopoTrimXml(struct ncclXml* xml); @@ -132,6 +132,13 @@ static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrNa return ncclSuccess; } +static ncclResult_t xmlGetAttrFloatDefault(struct ncclXmlNode* node, const char* attrName, float* value, float defaultValue) { + const char* str; + NCCLCHECK(xmlGetAttr(node, attrName, &str)); + *value = str ? 
strtof(str, NULL) : defaultValue; + return ncclSuccess; +} + static ncclResult_t xmlFindTag(struct ncclXml* xml, const char* tagName, struct ncclXmlNode** node) { *node = NULL; for (int i=0; i<xml->maxIndex; i++) { @@ -208,6 +215,24 @@ static ncclResult_t xmlSetAttr(struct ncclXmlNode* node, const char* attrName, c return ncclSuccess; } +static ncclResult_t xmlPrintNodeRecursive(struct ncclXmlNode* node, const char* name) { + while (node) { + char line[1024*8]; + int cursor = 0; + snprintf(line, sizeof(line), "<%s", node->name); + for (int i = 0; i < node->nAttrs; i++) { + cursor = strlen(line); + snprintf(line + cursor, sizeof(line) - cursor, " %s=%s", node->attrs[i].key, node->attrs[i].value); + } + cursor = strlen(line); + snprintf(line + cursor, sizeof(line) - cursor, ">"); + INFO(NCCL_GRAPH, "%s", line); + node = node->parent; + } + return ncclSuccess; +} + + static ncclResult_t xmlSetAttrIfUnset(struct ncclXmlNode* node, const char* attrName, const char* value) { int index; NCCLCHECK(xmlGetAttrIndex(node, attrName, &index)); diff --git a/src/group.cc b/src/group.cc index 3d3ecb88c..e387db70c 100644 --- a/src/group.cc +++ b/src/group.cc @@ -323,7 +323,7 @@ static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** g /* reset everything */ while (!ncclIntruQueueEmpty(asyncJobsPtr)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsPtr); - if (job->comm && !job->comm->config.blocking) + if (!job->destroyFlag && job->comm && !job->comm->config.blocking) (void) ncclCommSetAsyncError(job->comm, error); if (job->undo) job->undo(job); if (job->destructor) job->destructor((void*)job); @@ -392,7 +392,6 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueuegroupCommHeadPtr; @@ -401,8 +400,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf bool *groupAbortFlag = gjob->abortFlagPtr; - CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); - if (!simInfo && groupCommPreconnectHeadMain != nullptr) { struct ncclComm* comm = groupCommPreconnectHeadMain; do { @@ -454,12 +451,19 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf } comm = comm->groupNext; } while (comm); - NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); while (!ncclIntruQueueEmpty(&asyncCollJobs)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs); if (job->destructor) job->destructor((void*)job); } + + // done with all buffer allocation, start registration and enqueue + comm = groupCommHeadMain; + do { + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + NCCLCHECKGOTO(ncclTasksRegAndEnqueue(comm), ret, fail); + comm = comm->groupNext; + } while (comm); } if ((!simInfo) && (groupCommHeadMain != nullptr)) { @@ -476,6 +480,9 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf while (groupCommHeadMain != nullptr) { struct ncclComm* comm = groupCommHeadMain; struct ncclComm* next = comm->groupNext; + // Poll for callbacks sent to us from other threads. 
Typically these free + // resources from to our memory pools and UB + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail); (void) ncclGroupCommLeave(comm); if (!comm->config.blocking) { (void) ncclCommSetAsyncError(comm, ret); @@ -483,8 +490,6 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf groupCommHeadMain = next; } - CUDACHECK(cudaSetDevice(savedDev)); - exit: return ret; fail: @@ -563,7 +568,10 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { ret = ncclInProgress; } else { /* blocking group */ + int savedDev; + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail); + CUDACHECKGOTO(cudaSetDevice(savedDev), ret, fail); if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize); groupResetJobState(ncclGroupJobMainPtr); } diff --git a/src/include/collectives.h b/src/include/collectives.h index e45d78f26..c82ebce6f 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "nccl_common.h" #include "device.h" +#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. // CHUNKSIZE must be a multiple of SLICESIZE #define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) @@ -23,6 +24,7 @@ #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above +#define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. const char* ncclFuncToString(ncclFunc_t op); const char* ncclDevRedOpToString(ncclDevRedOp_t op); @@ -34,11 +36,11 @@ inline int ncclTypeSize(ncclDataType_t type) { switch (type) { case ncclInt8: case ncclUint8: + case ncclFloat8e4m3: + case ncclFloat8e5m2: return 1; case ncclFloat16: - #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: - #endif return 2; case ncclInt32: case ncclUint32: @@ -67,6 +69,319 @@ struct ncclConnFifo { #include +class RingAlgorithm { +protected: + int refCount; + int nRanks; + int nStepsPerLoop; + int chunkSteps; + int sliceSteps; + ssize_t sliceSize; + ssize_t loopSize; + ssize_t channelSize; + uint8_t *sendbuff; + uint8_t *recvbuff; + void *sendMhandle; + void *recvMhandle; + void *srecvMhandle; +public: + // this ring class is used by proxy thread to retrieve the send and recv buffer, size as well as corresponding + // mem handle based on the current step of the proxy args. The derived ring algo class is AR, AG, and BC which + // would be allocated during enqueue stage and copied to proxy side through shared memory. For each copy, we will + // increase the refCount by incRefCount() since the same ring algo object can be referenced multiple times for send + // and recv progress. After all steps are done, we decrease the refCount and only delete the ring object when + // refCount == 0. 
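+ // Given the current proxy step, getNextSendAddr/getNextRecvAddr return the address to send from or receive
+ // into, the number of bytes to transfer for that step, and the memory handle to hand to the network plugin.
+ // Sizes are clamped to zero past the end of the operation so the proxy can post the trailing partial slices safely.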
+ virtual void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) = 0; + virtual void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) = 0; + int incRefCount() { + return __atomic_add_fetch(&refCount, 1, __ATOMIC_RELAXED); + } + int decRefCount() { + return __atomic_sub_fetch(&refCount, 1, __ATOMIC_RELEASE); + } + RingAlgorithm() { refCount = 0; } + virtual ~RingAlgorithm() {}; +}; + +class RingARAlgorithm : public RingAlgorithm { +private: + int ringIndex; + int elemSize; + ssize_t chunkSize; + int slicePerChunk; +public: + void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int curLoopStage = (curStep % nStepsPerLoop) / chunkSteps; + int chunkStage = curLoopStage % nRanks; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t elemOffset = curLoop * loopSize; + ssize_t remSize = channelSize - elemOffset; + ssize_t chunkOffset; + ssize_t sliceOffset; + ssize_t curSliceSize; + ssize_t curChunkSize; + ssize_t size; + ssize_t nelem; + int chunkId; + + if (remSize < loopSize) { + curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize; + } else { + curChunkSize = chunkSize; + } + chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks; + chunkOffset = chunkId * curChunkSize; + nelem = std::min(remSize - chunkOffset, curChunkSize); + curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; + sliceOffset = sliceStage * curSliceSize; + + if (nelem <= sliceOffset) { + *sendbuffOut = sendbuff; + *mhandleOut = sendMhandle; + } else { + if (curLoopStage == 0) { + *sendbuffOut = sendbuff + elemOffset + chunkOffset + sliceOffset; + *mhandleOut = sendMhandle; + } else { + *sendbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset; + *mhandleOut = srecvMhandle; + } + } + size = std::min(curSliceSize, nelem - sliceOffset); + *sizeOut = size < 0 ? 0 : size; + return; + } + + void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int curLoopStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps; + int chunkStage = curLoopStage % nRanks; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t elemOffset = curLoop * loopSize; + ssize_t remSize = channelSize - elemOffset; + ssize_t chunkOffset; + ssize_t sliceOffset; + ssize_t curSliceSize; + ssize_t curChunkSize; + ssize_t size; + ssize_t nelem; + int chunkId; + + if (remSize < loopSize) { + curChunkSize = alignUp(divUp(remSize / elemSize, nRanks), 16 / elemSize) * elemSize; + } else { + curChunkSize = chunkSize; + } + + if (curLoopStage == 0) { + chunkId = (ringIndex + 1) % nRanks; + } else { + chunkId = (ringIndex + nRanks - 1 - chunkStage) % nRanks; + } + + chunkOffset = chunkId * curChunkSize; + nelem = std::min(remSize - chunkOffset, curChunkSize); + curSliceSize = std::max(divUp(nelem / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; + sliceOffset = sliceStage * curSliceSize; + if (nelem <= sliceOffset) { + *recvbuffOut = recvbuff; + } else { + *recvbuffOut = recvbuff + elemOffset + chunkOffset + sliceOffset; + } + if (sizeOut) { + size = std::min(curSliceSize, nelem - sliceOffset); + *sizeOut = size < 0 ? 
0 : size; + } + *mhandleOut = recvMhandle; + return; + } + + RingARAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int ringIndex, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) { + this->ringIndex = ringIndex; + this->nRanks = nRanks; + this->nStepsPerLoop = 2 * (nRanks - 1) * chunkSteps; + this->chunkSteps = chunkSteps; + this->sliceSteps = sliceSteps; + this->chunkSize = chunkSize; + this->sliceSize = sliceSize; + this->loopSize = nRanks * chunkSize; + this->sendbuff = (uint8_t*)sendbuff + gridOffset; + this->recvbuff = (uint8_t*)recvbuff + gridOffset; + this->channelSize = channelSize; + this->elemSize = elemSize; + this->sendMhandle = sendMhandle; + this->recvMhandle = recvMhandle; + this->srecvMhandle = srecvMhandle; + this->slicePerChunk = chunkSteps / sliceSteps; + } + ~RingARAlgorithm() {} +}; + +class RingAGAlgorithm : public RingAlgorithm { +private: + int *ringRanks; + int elemSize; + ssize_t sendSize; + int slicePerChunk; +public: + void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int chunkStage = (curStep % nStepsPerLoop) / chunkSteps; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t sliceOffset; + ssize_t curSliceSize; + ssize_t offset; + ssize_t elemOffset = curLoop * loopSize; + ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset); + ssize_t size; + int rankDest; + uint8_t *buff; + void *mhandle; + + curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; + sliceOffset = sliceStage * curSliceSize; + if (chunkStage == 0) { + rankDest = ringRanks[0]; + offset = elemOffset + sliceOffset; + buff = sendbuff + offset; + mhandle = sendMhandle; + } else { + rankDest = ringRanks[nRanks - chunkStage]; + offset = elemOffset + rankDest * sendSize + sliceOffset; + buff = recvbuff + offset; + mhandle = srecvMhandle; + } + *sendbuffOut = buff; + size = std::min(curSliceSize, channelSize - elemOffset - sliceOffset); + *sizeOut = size < 0 ? 0 : size; + *mhandleOut = mhandle; + return; + } + + void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int chunkStage = ((curStep + chunkSteps) % nStepsPerLoop) / chunkSteps; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t sliceOffset; + ssize_t curSliceSize; + ssize_t offset; + ssize_t elemOffset = curLoop * loopSize; + ssize_t chunkSize = std::min(loopSize, channelSize - elemOffset); + ssize_t size; + int rankDest; + + curSliceSize = std::max(divUp(chunkSize / elemSize, 16 * slicePerChunk) * 16, sliceSize / elemSize / 32) * elemSize; + sliceOffset = sliceStage * curSliceSize; + if (chunkStage == 0) { + rankDest = ringRanks[1]; + } else { + rankDest = ringRanks[nRanks - chunkStage]; + } + offset = elemOffset + rankDest * sendSize + sliceOffset; + *recvbuffOut = recvbuff + offset; + if (sizeOut) { + size = std::min(sliceSize, channelSize - elemOffset - sliceOffset); + *sizeOut = size < 0 ? 
0 : size; + } + *mhandleOut = recvMhandle; + } + + RingAGAlgorithm(const void *sendbuff, void *recvbuff, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, int elemSize, size_t sendSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) { + this->ringRanks = ringRanks; + this->nRanks = nRanks; + this->nStepsPerLoop = (nRanks - 1) * chunkSteps; + this->chunkSteps = chunkSteps; + this->sliceSteps = sliceSteps; + this->elemSize = elemSize; + this->sliceSize = sliceSize; + this->loopSize = chunkSize; + this->sendSize = sendSize; + this->channelSize = channelSize; + this->sendbuff = (uint8_t*)sendbuff + gridOffset; + this->recvbuff = (uint8_t*)recvbuff + gridOffset; + this->sendMhandle = sendMhandle; + this->recvMhandle = recvMhandle; + this->srecvMhandle = srecvMhandle; + this->slicePerChunk = chunkSteps / sliceSteps; + } + ~RingAGAlgorithm() {} +}; + +class RingBCAlgorithm : public RingAlgorithm { +private: + int root; + int rank; + int nextRank; +public: + void getNextSendAddr(int curStep, uint8_t **sendbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t sliceOffset = sliceStage * sliceSize; + ssize_t offset; + ssize_t elemOffset = curLoop * loopSize; + ssize_t size; + uint8_t *buff; + void *mhandle; + + offset = elemOffset + sliceOffset; + if (offset >= channelSize) { + buff = sendbuff; + mhandle = sendMhandle; + } else if (rank == root) { + buff = sendbuff + offset; + mhandle = sendMhandle; + } else { + buff = recvbuff + offset; + mhandle = srecvMhandle; + } + *sendbuffOut = buff; + size = std::min(sliceSize, channelSize - offset); + *sizeOut = size < 0 ? 0 : size; + *mhandleOut = mhandle; + return; + } + + void getNextRecvAddr(int curStep, uint8_t **recvbuffOut, size_t *sizeOut, void **mhandleOut) { + int curLoop = curStep / nStepsPerLoop; + int sliceStage = (curStep % chunkSteps) / sliceSteps; + ssize_t sliceOffset = sliceStage * sliceSize; + ssize_t offset; + ssize_t elemOffset = curLoop * loopSize; + ssize_t size; + offset = elemOffset + sliceOffset; + if (offset >= channelSize) { + *recvbuffOut = recvbuff; + } else { + *recvbuffOut = recvbuff + offset; + } + if (sizeOut) { + size = std::min(sliceSize, channelSize - offset); + *sizeOut = size < 0 ? 0 : size; + } + *mhandleOut = recvMhandle; + return; + } + + RingBCAlgorithm(const void* sendbuff, void* recvbuff, int rank, int root, int nRanks, int *ringRanks, int chunkSteps, int sliceSteps, size_t chunkSize, size_t sliceSize, size_t gridOffset, size_t channelSize, void *sendMhandle, void *recvMhandle, void *srecvMhandle) { + this->root = root; + this->rank = rank; + this->nextRank = ringRanks[1]; + this->nStepsPerLoop = chunkSteps; + this->chunkSteps = chunkSteps; + this->sliceSteps = sliceSteps; + this->sliceSize = sliceSize; + this->loopSize = chunkSize; + this->channelSize = channelSize; + this->sendbuff = (uint8_t*)sendbuff + gridOffset; + this->recvbuff = (uint8_t*)recvbuff + gridOffset; + this->sendMhandle = sendMhandle; + this->recvMhandle = recvMhandle; + this->srecvMhandle = srecvMhandle; + } + ~RingBCAlgorithm() {} +}; + template class PatRSAlgorithm{ size_t offset; @@ -532,10 +847,10 @@ class PatAGAlgorithm{ int sendDataRank = (rank + nranks + s) % nranks; outIx = sendDataRank * count + offset; recvDim = s ? 
firstBitSet(s, nrPow2) : -1; - s -= (1<> (recvDim+1); recvOffset = (foffset%postFreq)*nelem; recvStepOffset = foffset / postFreq; diff --git a/src/include/comm.h b/src/include/comm.h index 9d102dfed..c3f4eb49f 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -197,12 +197,15 @@ struct ncclTaskColl { int32_t algorithm:8, protocol:8; uint32_t isCollnet:1, isNvls:1; uint32_t devFuncId:30; - enum ncclRegBufferType regBufType; + int regBufType; // number of elements in planner->ipcMemQueue associated with this collective int nCleanupQueueElts; void* sendMhandle; void* recvMhandle; + void** sendNetHandles; + void** recvNetHandles; + void** srecvNetHandles; // index for IPC record lookup uintptr_t sendbuffOffset; uintptr_t recvbuffOffset; @@ -236,6 +239,7 @@ struct ncclKernelPlan { struct ncclKernelPlan* next; bool persistent; // aka captured in a graph + bool isHostCbEnq; enum ncclDevWorkStorageType workStorageType; bool kernelSpecialized; void *kernelFn; @@ -365,6 +369,7 @@ struct ncclKernelPlanner { struct ncclIntruQueue collTaskQueue; struct ncclIntruQueue collWorkQueue; + struct ncclIntruQueue tmpCollWorkQueue; struct ncclIntruQueue collCleanupQueue; ////////////////////////////////////////////////////////////////////////////// @@ -463,6 +468,8 @@ struct ncclComm { // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; + // Collective operation counter + uint64_t collOpCount; // Channels for collectives int nChannels; // connection nChannels @@ -486,7 +493,6 @@ struct ncclComm { ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS]; int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; /* This attribute can indicate the states of communicators and return code of @@ -532,7 +538,7 @@ struct ncclComm { int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ // Whether this communicator uses collNet int collNetSupport; - bool collNetRegSupport; + bool isOneRPN; uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes]; bool intraNodeP2pSupport; int* collNetHeads; @@ -560,6 +566,7 @@ struct ncclComm { // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. 
struct ncclComm* preconnectNext; int persistentRefs; // number of persistent plan-lists capturing this comm + int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; struct ncclKernelPlanner planner; @@ -599,9 +606,16 @@ struct ncclComm { // buffer registration cache struct ncclRegCache regCache; + int isAllNvlink; + bool useNetPXN; + bool useGdr; + int splitCount; uint64_t endMagic; }; +static_assert(offsetof(struct ncclComm, startMagic) == 0, "startMagic must be the first field of ncclComm"); +static_assert(offsetof(struct ncclComm, endMagic) == sizeof(struct ncclComm) - sizeof(uint64_t), "endMagic must be the last field of ncclComm"); + enum ncclLaunchMode { ncclLaunchModeInvalid=0, ncclLaunchModeParallel, @@ -644,7 +658,7 @@ inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) { } } finish: - cudaThreadExchangeStreamCaptureMode(&mode); + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); return ncclSuccess; } diff --git a/src/include/debug.h b/src/include/debug.h index 491ac3e12..4e50cbf5a 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -38,4 +38,6 @@ extern char ncclLastError[]; void ncclSetThreadName(pthread_t thread, const char *fmt, ...); +void ncclResetDebugInit(); + #endif diff --git a/src/include/device.h b/src/include/device.h index 153b5ae36..0c861f595 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -88,24 +88,18 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK #define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) -#define NCCL_DIRECT_WRITE 0x01 -#define NCCL_DIRECT_READ 0x02 +#define NCCL_P2P_WRITE 0x01 +#define NCCL_P2P_READ 0x02 #define NCCL_DIRECT_NIC 0x04 -#define NCCL_IPC_WRITE 0x08 -#define NCCL_IPC_READ 0x10 -#define NCCL_NVLS_MIN_POLL 0x20 +#define NCCL_NVLS_MIN_POLL 0x80 // Number of named barriers supported by CUDA #define NCCL_MAX_GROUPS 16 -#define NCCL_MAX_COLLNET_SIZE (1L << 29) - -enum ncclRegBufferType { - NCCL_REGULAR_BUFFER = 0, - NCCL_IPC_REG_BUFFER = 1, - NCCL_NVLS_REG_BUFFER = 2, - NCCL_COLLNET_REG_BUFFER = 3 -}; +#define NCCL_REGULAR_BUFFER 0x00 +#define NCCL_IPC_REG_BUFFER 0x01 +#define NCCL_NVLS_REG_BUFFER 0x02 +#define NCCL_NET_REG_BUFFER 0x04 struct ncclConnInfo { // Regular comm mechanism @@ -143,8 +137,6 @@ struct ncclConnector { struct ncclTransportComm* transportComm; void* transportResources; struct ncclConnInfo conn; - int sendMemSameProcess; - int recvMemSameProcess; }; struct ncclRing { @@ -228,7 +220,7 @@ struct alignas(16) ncclDevWorkP2p { uint8_t sendChunkSize_u32fp8, recvChunkSize_u32fp8; uint8_t sendProtoLL:1, recvProtoLL:1; - uint8_t sendRegistered:1, recvRegistered:1; + uint8_t sendNetReg:1, recvNetReg:1; uint8_t sendIpcReg:1, recvIpcReg:1; }; @@ -267,7 +259,7 @@ struct alignas(16) ncclDevWorkColl { // nChannels == (channelHi - channelLo) + 1 uint32_t channelLo:8, channelHi:8; uint32_t nWarps:8; - uint32_t redOpArgIsPtr:1, regUsed:2, oneNode:1, direct:4; + uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1; uint32_t root; void* recvbuff; void* sendbuff; @@ -393,7 +385,7 @@ struct ncclDevComm { int nNodes; int buffSizes[NCCL_NUM_PROTOCOLS]; int p2pChunkSize; - int isNvlink; + int isAllNvlink; // Work fifo return credits uint32_t* workConsumed/*[MAXCHANNELS]*/; @@ -525,9 +517,7 @@ inline bool ncclNvlsSupported(int devRedOp, int type) { case 
ncclInt64: case ncclUint64: case ncclFloat16: - #if defined(__CUDA_BF16_TYPES_EXIST__) case ncclBfloat16: - #endif return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax; case ncclFloat: case ncclDouble: diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 1bb5a604f..3eb6c0743 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -25,5 +25,16 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); ncclResult_t ncclLaunchFinish(struct ncclComm* comm); ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo); +ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm); + +static inline size_t ncclFuncSendCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncReduceScatter ? nRanks*count : count; +} +static inline size_t ncclFuncRecvCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncAllGather ? nRanks*count : count; +} +static inline size_t ncclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count) { + return func == ncclFuncAllGather || func == ncclFuncReduceScatter ? nRanks*count : count; +} #endif // End include guard diff --git a/src/include/graph.h b/src/include/graph.h index b6d86b398..602cc8cd9 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -19,7 +19,7 @@ ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); struct ncclTopoSystem; // Build the topology -ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile=NULL); ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); @@ -33,10 +33,11 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); +ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int64_t netId, int read, int* useGdr); -ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush); +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr); +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); +ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); @@ -118,6 +119,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent); ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int 
minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); -ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time, bool* backup=nullptr); +ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time); #endif diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index c3709584c..3a4c42bb2 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -12,6 +12,8 @@ #ifndef NCCL_IBVWRAP_H_ #define NCCL_IBVWRAP_H_ +#include +#include #ifdef NCCL_BUILD_RDMA_CORE #include #else @@ -89,4 +91,14 @@ static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); +// converts a GID into a readable string. On success, returns a non-null pointer to gidStr. +// NULL is returned if there was an error, with errno set to indicate the error. +// errno = ENOSPC if the converted string would exceed strLen. +static inline const char* ibvGetGidStr(union ibv_gid* gid, char* gidStr, size_t strLen) { + // GID is a 16B handle, to convert it to a readable form, we use inet_ntop + // sizeof(ibv_gid) == sizeof(struct in6_addr), so using AF_INET6 + static_assert(sizeof(union ibv_gid) == sizeof(struct in6_addr), "the sizeof struct ibv_gid must be the size of struct in6_addr"); + return inet_ntop(AF_INET6, gid->raw, gidStr, strLen); +} + #endif //End include guard diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 26851b17e..fcf2251fe 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -32,6 +32,7 @@ typedef enum { NCCL_BOOTSTRAP = 0x1000, NCCL_REG = 0x2000, NCCL_PROFILE = 0x4000, + NCCL_RAS = 0x8000, NCCL_ALL = ~0 } ncclDebugLogSubSys; diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h index 467d9fdb8..f165aa1bf 100644 --- a/src/include/nccl_net.h +++ b/src/include/nccl_net.h @@ -13,6 +13,9 @@ #include #define NCCL_NET_HANDLE_MAXSIZE 128 +//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties +#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) +#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 @@ -21,6 +24,18 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +// Max number of ncclNet objects which can live in the same process +#define NCCL_NET_MAX_PLUGINS 3 + +#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 +#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; +} ncclNetVDeviceProps_v9_t; +typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; + typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. @@ -28,6 +43,7 @@ typedef struct { // cards with multiple PCI functions (Physical or virtual). int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives int speed; // Port speed in Mbps. int port; // Port number. float latency; // Network latency @@ -35,9 +51,149 @@ typedef struct { int maxRecvs; // Maximum number of grouped receives. 
ncclNetDeviceType netDeviceType; // Network offload type int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v8_t; + ncclNetVDeviceProps_v9_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v9_t; +typedef ncclNetProperties_v9_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -typedef ncclNetProperties_v8_t ncclNetProperties_t; + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); +} ncclNet_v9_t; + +typedef ncclNet_v9_t ncclNet_t; + +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9 + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v9_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); +} ncclCollNet_v9_t; + +typedef ncclCollNet_v9_t ncclCollNet_t; + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9 + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v8_t; typedef struct { // Name of the network (mainly for logs) @@ -94,10 +250,6 @@ typedef struct { ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); } ncclNet_v8_t; -typedef ncclNet_v8_t ncclNet_t; - -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8 - typedef struct { void* mhandle; void* address; @@ -151,10 +303,6 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v8_t; -typedef ncclCollNet_v8_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8 - typedef struct { char* name; // Used mostly for logging. char* pciPath; // Path to the PCI device in /sys. 
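As a rough, hypothetical illustration of how the new v9 makeVDevice entry point is meant to be driven (the fuseTwoPorts helper, the device indices and the error handling below are illustrative assumptions, not part of this change):

static ncclResult_t fuseTwoPorts(ncclNet_v9_t* net) {
  // Ask the plugin to expose physical devices 0 and 1 as a single virtual device.
  ncclNetVDeviceProps_t vProps;
  vProps.ndevs = 2;
  vProps.devs[0] = 0;
  vProps.devs[1] = 1;
  int vDev = -1;
  NCCLCHECK(net->makeVDevice(&vDev, &vProps));
  // The fused device is then used like any other device index; its properties
  // report the member devices back through props.vProps.
  ncclNetProperties_v9_t props;
  NCCLCHECK(net->getProperties(vDev, &props));
  INFO(NCCL_NET, "Fused %d devices into virtual NIC %d (%s)", props.vProps.ndevs, vDev, props.name);
  return ncclSuccess;
}

In the topology code earlier in this patch, ncclTopoMakeVnic/ncclTopoForceMerge/ncclTopoAutoMerge play this role, with NCCL_NET_MERGE_LEVEL and NCCL_NET_FORCE_MERGE selecting which physical devices get grouped.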
diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h index 556a0f6e4..a8164d075 100644 --- a/src/include/nccl_profiler.h +++ b/src/include/nccl_profiler.h @@ -16,7 +16,6 @@ enum { ncclProfileProxyOp = (1 << 3), // proxy operation event type ncclProfileProxyStep = (1 << 4), // proxy step event type ncclProfileProxyCtrl = (1 << 5), // proxy control event type - ncclProfileNumEvents = ( 6), }; typedef struct { @@ -28,28 +27,25 @@ typedef struct { const char* name; uint64_t commHash; uint64_t seqNumber; - uint8_t func; + const char* func; void const* sendBuff; void* recvBuff; size_t count; int root; - uint8_t datatype; - uint32_t op; + const char* datatype; size_t trafficBytes; uint8_t nMaxChannels; uint8_t nWarps; - uint8_t algo; - uint8_t proto; - int isCollnet; - int isNvls; + const char* algo; + const char* proto; } coll; struct { const char* name; uint64_t commHash; - uint8_t func; + const char* func; void* buff; - uint8_t datatype; + const char* datatype; size_t count; int peer; } p2p; @@ -67,7 +63,7 @@ typedef struct { int step; } proxyStep; }; -} ncclProfilerEventDescr_v1_t; +} ncclProfilerEventDescr_v2_t; typedef enum { ncclProfilerProxyOpSendPosted, @@ -93,7 +89,7 @@ typedef enum { ncclProfilerProxyCtrlWakeup, ncclProfilerProxyCtrlAppend, ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v1_t; +} ncclProfilerEventState_v2_t; typedef union { struct { @@ -104,7 +100,101 @@ typedef union { struct { int appendedProxyOps; } proxyCtrl; -} ncclProfilerEventStateArgs_v1_t; +} ncclProfilerEventStateArgs_v2_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v2_t; + +typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; +typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v2_t ncclProfiler_t; + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + uint8_t func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + uint8_t datatype; + uint32_t op; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + uint8_t algo; + uint8_t proto; + int isCollnet; + int isNvls; + } coll; + + struct { + const char* name; + uint64_t commHash; + uint8_t func; + void* buff; + uint8_t datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v1_t; + +typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; typedef struct { const char* name; @@ -142,9 +232,4 @@ typedef struct { ncclResult_t (*finalize)(void* context); } ncclProfiler_v1_t; -typedef ncclProfilerEventDescr_v1_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v1_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v1_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v1_t ncclProfiler_t; - #endif diff --git a/src/include/nccl_tuner.h b/src/include/nccl_tuner.h index 5cd02149f..6e61118b9 100644 --- a/src/include/nccl_tuner.h +++ b/src/include/nccl_tuner.h @@ -33,6 +33,7 @@ typedef struct { // - numPipeOps: number of operations in the group // - numAlgo: number of algorithms in collCostTable // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer // // Outputs: // - nChannels: number of channels (hence SMs) to be used. @@ -48,16 +49,60 @@ typedef struct { // Unset fields will be set automatically by NCCL. ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels); + int regBuff, int* nChannels); // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object ncclResult_t (*destroy)(void* context); -} ncclTuner_v3_t; +} ncclTuner_v4_t; + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; -typedef ncclTuner_v3_t ncclTuner_t; + // Initializes tuner states. + // Inputs: + // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // Outputs: + // - context: tuner context object + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
+ // Inputs: + // - context: tuner context object + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int* nChannels); -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3" + // Terminates the plugin and cleans up any resources that the plugin allocated. + // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v3_t; // API to be implemented by external tuner typedef struct { diff --git a/src/include/net_device.h b/src/include/net_device.h index 7bb2968c0..5fae9b542 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -25,6 +25,7 @@ typedef struct { } ncclNetDeviceHandle_v7_t; typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; -typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index 7dee7d4ae..72fbf9ce2 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -302,7 +302,7 @@ extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMa struct ncclNvmlCCStatus { bool CCEnabled; - bool multiGpuCCEnabled; + bool multiGpuProtectedPCIE; }; // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. 
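For reference, a minimal sketch, not part of the patch, of a getCollInfo() matching the ncclTuner_v4_t signature introduced a little earlier (note the added regBuff argument). It assumes collCostTable is indexed as collCostTable[algo][proto], consistent with the "algo|proto|time entries" wording, and treats entries equal to -1.0 (NCCL_ALGO_PROTO_IGNORE) as unavailable. It leaves every output unset, which the interface explicitly allows, so NCCL falls back to its own defaults.

static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                       int numPipeOps, float** collCostTable,
                                       int numAlgo, int numProto,
                                       int regBuff, int* nChannels) {
  (void)context; (void)collType; (void)nBytes; (void)numPipeOps; (void)regBuff;
  float best = -1.0f;
  int bestAlgo = -1, bestProto = -1;
  for (int a = 0; a < numAlgo; a++) {
    for (int p = 0; p < numProto; p++) {
      float t = collCostTable[a][p];
      if (t < 0.0f) continue;  // ignored entry (NCCL_ALGO_PROTO_IGNORE)
      if (best < 0.0f || t < best) { best = t; bestAlgo = a; bestProto = p; }
    }
  }
  // A real plugin would typically adjust the InOut cost table (or set
  // *nChannels) to steer NCCL toward its preferred entry; leaving everything
  // untouched keeps NCCL's default tuning.
  (void)bestAlgo; (void)bestProto; (void)nChannels;
  return ncclSuccess;
}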
diff --git a/src/include/profiler.h b/src/include/profiler.h index 36774dc84..2b7efe0f6 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -36,9 +36,9 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* ar ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args); // Proxy Step Start/Stop Event Wrappers -ncclResult_t ncclProfilerStartSendProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); -ncclResult_t ncclProfilerStartRecvProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); -ncclResult_t ncclProfilerStopProxyStepEvents(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi); +ncclResult_t ncclProfilerStartSendProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); +ncclResult_t ncclProfilerStartRecvProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); +ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, int stepId); // Proxy Control Start/Stop Events Wrappers ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); @@ -46,7 +46,7 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); // Record Event Wrappers ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); -ncclResult_t ncclProfilerRecordProxyStepEventStates(int sub, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState); +ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState); // Profiler utility functions diff --git a/src/include/proxy.h b/src/include/proxy.h index a1c44d6b1..b6ef0fa9d 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -15,6 +15,7 @@ #include #include "shmutils.h" #include "p2p.h" +#include "collectives.h" typedef enum : uint8_t { ncclPatternRing, @@ -56,7 +57,11 @@ struct ncclProxyOp { int root; int next; int nsteps; - int chunkSize; + size_t chunkSize; + size_t sliceSize; + size_t loopSize; + size_t loopOffset; + size_t channelSize; uint8_t sliceSteps; uint8_t chunkSteps; uint8_t channelId; @@ -65,13 +70,15 @@ struct ncclProxyOp { uint8_t /*ncclFunc_t*/ coll; uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; + uint8_t algorithm; uint8_t reg; - // collnet buffer reg handles + // collnet/p2p/coll buffer reg handles void* sendMhandle; void* recvMhandle; uint8_t* sendbuff; uint8_t* recvbuff; - + int isOneRPN; + RingAlgorithm *ringAlgo; union ncclProxyOpSpecifics specifics; // Profiler plugin @@ -93,19 +100,21 @@ struct ncclProxyOp { struct ncclProxySubArgs { struct ncclProxyConnection* connection; int reg; - // p2p mhandle - void* mhandle; // collnet handles void* sendMhandle; void* recvMhandle; uint8_t* sendbuff; uint8_t* recvbuff; size_t offset; + ssize_t loopSize; + ssize_t loopOffset; int channelId; int nsteps; ssize_t nbytes; + ssize_t chunkSize; int peer; - + int isOneRPN; + RingAlgorithm *ringAlgo; int groupSize; // Number of consecutive sub operations sharing the same recvComm uint64_t base; uint64_t posted; @@ -114,11 +123,14 @@ struct ncclProxySubArgs { uint64_t transmitted; uint64_t done; uint64_t end; + int regBufferReady; void* requests[NCCL_STEPS]; // Profiler plugin int eActivationMask; int rank; + pid_t pid; + void* profilerContext; 
void* taskEventHandle; void* opEventHandle; void* stepEventHandles[NCCL_STEPS]; @@ -133,10 +145,11 @@ struct ncclProxyArgs { proxyProgressFunc_t progress; int nsubs; int done; + int onePPN; uint64_t opCount; int sliceSteps; int chunkSteps; - int chunkSize; + size_t chunkSize; size_t totalSendSize; size_t totalRecvSize; size_t sendSizePerRound; @@ -146,16 +159,13 @@ struct ncclProxyArgs { uint8_t /*ncclPattern_t*/ pattern; uint8_t /*ncclFunc_t*/ coll; uint8_t protocol; + uint8_t algorithm; int state; char* sharedBuff[NCCL_STEPS]; int sharedSize[NCCL_STEPS]; int idle; - // Profiler plugin - pid_t pid; - void* profilerContext; - // Element linking struct ncclProxyArgs* next; struct ncclProxyArgs* nextPeer; diff --git a/src/include/ras.h b/src/include/ras.h new file mode 100644 index 000000000..7909b3dc8 --- /dev/null +++ b/src/include/ras.h @@ -0,0 +1,24 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_RAS_H_ +#define NCCL_RAS_H_ + +#include "socket.h" + +// Structure used to communicate data about NCCL ranks from NCCL threads to RAS. +struct rasRankInit { + union ncclSocketAddress addr; + pid_t pid; + int cudaDev; + int nvmlDev; +}; + +ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank); +ncclResult_t ncclRasCommFini(const struct ncclComm* comm); +ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks); + +#endif // !NCCL_RAS_H_ diff --git a/src/include/register.h b/src/include/register.h index 7c60535d9..740a645f4 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -6,6 +6,9 @@ #include #include +int64_t ncclParamLocalRegister(); +int64_t ncclParamGraphRegister(); + enum { NET_REG_COMPLETE = 0x01, NVLS_REG_COMPLETE = 0x02, @@ -20,16 +23,21 @@ struct ncclPeerRegIpcAddr { uintptr_t* hostPeerRmtAddrs; }; +struct ncclRegNetHandles { + void* handle; + struct ncclProxyConnector* proxyConn; + struct ncclRegNetHandles* next; +}; + struct ncclReg { // common attributes size_t pages; - int refs; + int localRefs; + int graphRefs; uintptr_t addr; uint32_t state; // net reg - int nDevs; - int devs[MAXCHANNELS]; - void** handles; + struct ncclRegNetHandles* netHandleHead; // nvls reg uintptr_t baseAddr; size_t baseSize; @@ -50,11 +58,12 @@ struct ncclRegCache { struct ncclReg **slots; int capacity, population; uintptr_t pageSize; - void* sComms[MAXCHANNELS]; - void* rComms[MAXCHANNELS]; }; ncclResult_t ncclRegCleanup(struct ncclComm* comm); ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg); +ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); +ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle); +ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid); #endif diff --git a/src/include/shmutils.h b/src/include/shmutils.h index 43e8afb79..097b4c657 100644 --- a/src/include/shmutils.h +++ b/src/include/shmutils.h @@ -10,7 +10,7 @@ #include "nccl.h" typedef void* ncclShmHandle_t; -ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); +ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); ncclResult_t 
ncclShmClose(ncclShmHandle_t handle); ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); diff --git a/src/include/socket.h b/src/include/socket.h index 60a413875..f0a3237ce 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -17,9 +17,6 @@ #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // connection retry sleep interval in usec -#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) -#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) #define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) #define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL @@ -39,9 +36,10 @@ enum ncclSocketState { ncclSocketStateConnectPolling = 5, ncclSocketStateConnected = 6, ncclSocketStateReady = 7, - ncclSocketStateClosed = 8, - ncclSocketStateError = 9, - ncclSocketStateNum = 10 + ncclSocketStateTerminating = 8, + ncclSocketStateClosed = 9, + ncclSocketStateError = 10, + ncclSocketStateNum = 11 }; enum ncclSocketType { @@ -49,14 +47,14 @@ enum ncclSocketType { ncclSocketTypeBootstrap = 1, ncclSocketTypeProxy = 2, ncclSocketTypeNetSocket = 3, - ncclSocketTypeNetIb = 4 + ncclSocketTypeNetIb = 4, + ncclSocketTypeRasNetwork = 5 }; struct ncclSocket { int fd; int acceptFd; - int timedOutRetries; - int refusedRetries; + int errorRetries; union ncclSocketAddress addr; volatile uint32_t* abortFlag; int asyncFlag; @@ -64,15 +62,18 @@ struct ncclSocket { int salen; uint64_t magic; enum ncclSocketType type; + int customRetry; + int finalizeCounter; // Used to keep track of initial handshake for async sockets. + char finalizeBuffer[sizeof(uint64_t)]; // Used to keep track of initial handshake for async sockets. }; -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); +const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); // Initialize a socket -ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); +ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0); // Create a listening socket. sock->addr can be pre-filled with IP & port info. 
sock->fd is set after a successful call ncclResult_t ncclSocketListen(struct ncclSocket* sock); ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); @@ -88,11 +89,12 @@ ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); #define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_RECV 1 -ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed = NULL); ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize); ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); +ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how); ncclResult_t ncclSocketClose(struct ncclSocket* sock); #endif diff --git a/src/include/transport.h b/src/include/transport.h index cbeb613ca..37187f69e 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -28,7 +28,6 @@ extern struct ncclTransport netTransport; extern struct ncclTransport collNetTransport; extern struct ncclTransport* ncclTransports[]; - // Forward declarations struct ncclRing; struct ncclConnector; @@ -115,16 +114,16 @@ struct ncclTransport { }; ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex); ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode); ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm); ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm); -ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); -ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); -ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); +ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); 
ncclResult_t ncclNvlsFree(struct ncclComm* comm); enum { collNetRecv=0, collNetSend=1 }; @@ -143,4 +142,13 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm); ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm); +ncclResult_t ncclNetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* handle); +ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle); +ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); + +ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue* cleanupQueue); +ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue* cleanupQueue); +ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); +ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); + #endif diff --git a/src/include/utils.h b/src/include/utils.h index 5a1b749a7..383f678c8 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -49,8 +49,7 @@ inline uint64_t clockNano() { return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; } -/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else - * return -1 */ +/* get any bytes of random data from /dev/urandom, return ncclSuccess (0) if it succeeds. */ inline ncclResult_t getRandomData(void* buffer, size_t bytes) { ncclResult_t ret = ncclSuccess; if (bytes > 0) { diff --git a/src/init.cc b/src/init.cc index 94c2fb10e..5caaaae09 100644 --- a/src/init.cc +++ b/src/init.cc @@ -17,6 +17,7 @@ #include "graph.h" #include "argcheck.h" #include "tuner.h" +#include "ras.h" #include #include #include @@ -182,6 +183,8 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + NCCLCHECK(ncclRasCommFini(comm)); + /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will * free all intra-process communicators; therefore, we only need to focus on local * resource cleanup in commFree(). 
*/ @@ -193,7 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) { } } - CUDACHECK(cudaMemPoolDestroy(comm->memPool)); + if (comm->memPool) CUDACHECK(cudaMemPoolDestroy(comm->memPool)); delete[] comm->userRedOps; @@ -421,11 +424,6 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in ncclIntruQueueConstruct(&comm->eventCallbackQueue); - // setup intraComm0 and intraRanks 0 to default values to ensure proper cleanup of the communicator - comm->intraComm0 = comm; - comm->intraRank = 0; - comm->intraRanks = 1; - return ncclSuccess; } @@ -435,6 +433,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { struct ncclDevCommAndChannels tmpCommAndChans; struct ncclDevCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; + bool ccEnable; NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); @@ -448,7 +447,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.comm.node = comm->node; tmpCommAndChans.comm.nNodes = comm->nNodes; tmpCommAndChans.comm.abortFlag = comm->abortFlagDev; - tmpCommAndChans.comm.isNvlink = ncclTopoPathAllNVLink(comm->topo); + tmpCommAndChans.comm.isAllNvlink = comm->isAllNvlink; for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { tmpCommAndChans.comm.buffSizes[p] = comm->buffSizes[p]; } @@ -458,11 +457,9 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workArgsBytes = std::min(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch)); memset(&ccStatus, 0, sizeof(ccStatus)); - if (ncclNvmlGetCCStatus(&ccStatus) == ncclSuccess && ccStatus.CCEnabled) { + ccEnable = (ncclSuccess == ncclNvmlGetCCStatus(&ccStatus)) && (ccStatus.CCEnabled || ccStatus.multiGpuProtectedPCIE); + if (ccEnable) { comm->workFifoBytes = 0; - if (ccStatus.multiGpuCCEnabled == false && comm->rank == 0) { - WARN("CC On, Multi-GPU CC Off (No inter-GPU communication protection)"); - } } else { comm->workFifoBytes = ncclParamWorkFifoBytes(); if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { @@ -473,7 +470,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { } if (comm->rank == 0) { - INFO(NCCL_INIT, "CC %s, Multi-GPU CC %s, workFifoBytes %d", ccStatus.CCEnabled ? "On" : "Off", ccStatus.multiGpuCCEnabled ? "On" : "Off", comm->workFifoBytes); + INFO(NCCL_INIT, "CC %s, workFifoBytes %d", ccEnable ? 
"On" : "Off", comm->workFifoBytes); } if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { @@ -608,9 +605,6 @@ NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */ NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { - int cpuArch, cpuVendor, cpuModel; - NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); - int64_t envs[NCCL_NUM_PROTOCOLS] = { ncclParamLlBuffSize(), ncclParamLl128BuffSize(), ncclParamBuffSize() }; int defaults[NCCL_NUM_PROTOCOLS] = { DEFAULT_LL_BUFFSIZE, DEFAULT_LL128_BUFFSIZE, DEFAULT_BUFFSIZE }; @@ -619,7 +613,7 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { } if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize(); - else if (ncclTopoPathAllNVLink(comm->topo)) comm->p2pChunkSize = ncclParamP2pNvlChunkSize(); + else if (comm->isAllNvlink) comm->p2pChunkSize = ncclParamP2pNvlChunkSize(); else comm->p2pChunkSize = ncclParamP2pPciChunkSize(); // Make sure P2P chunksize is not larger than coll chunksize. @@ -850,6 +844,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } while(0); timers[TIMER_INIT_TOPO] = clockNano(); + + // Dump XML if requested by user + const char* dumpXmlFile; + dumpXmlFile = ncclGetEnv("NCCL_TOPO_DUMP_FILE"); + if (dumpXmlFile) { + NCCLCHECKGOTO(ncclTopoGetSystem(comm, NULL, dumpXmlFile), ret, fail); + } + // Topo detection / System graph creation NCCLCHECKGOTO(ncclTopoGetSystem(comm, &comm->topo), ret, fail); // Compute paths between GPUs and NICs @@ -1076,9 +1078,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); comm->collNetSupport = 0; } - // As long as there is more than 1 rank on any node, we need to disable collnet reg - comm->collNetRegSupport = (comm->maxLocalRanks == 1); } + comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo); + comm->isOneRPN = (comm->maxLocalRanks == 1); NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, parent), ret, fail); @@ -1293,7 +1295,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. 
NCCLCHECKGOTO(devCommSetup(comm), ret, fail); timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT]; - /* Local intra-node barrier */ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); @@ -1338,6 +1339,7 @@ struct ncclCommInitRankAsyncJob { // for ncclCommSplit struct ncclComm* parent; int color, key; + int splitCount; // name of the function calling char funcName[NCCL_COMMINIT_FUNCNAME_LEN]; }; @@ -1432,13 +1434,14 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; - // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), add the color + // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), + // add unique split counter and the color ncclUniqueId tmpId; memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits - snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d", job->parent->commHash, job->color); + snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color); comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES); - INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d- Init START", job->funcName, - comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key); + INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName, + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); NCCLCHECKGOTO(bootstrapSplit(comm->commHash, comm, job->parent, job->color, job->key, parentRanks), res, fail); timers[TIMER_INIT_BOOTSTRAP] = clockNano() - timers[TIMER_INIT_BOOTSTRAP]; @@ -1474,8 +1477,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { /* unlink child abort flag. */ __atomic_store_n(&job->parent->childAbortFlag, NULL, __ATOMIC_RELEASE); TRACE_CALL("ncclCommSplit(%p, %d, %d, %p, %d, %d)", job->parent, job->color, job->key, comm, comm->rank, comm->nRanks); - INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p color %d key %d - Init COMPLETE", job->funcName, - comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->color, job->key); + INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d - Init COMPLETE", job->funcName, + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); } else { // the name for the replay tool is ncclCommInitRank for all the variations TRACE_CALL("ncclCommInitRank(%p, %d, 0x%llx, %d, %d)", comm, comm->nRanks, commIdHash, comm->rank, comm->cudaDev); @@ -1716,8 +1719,8 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId comm->startMagic = comm->endMagic = NCCL_MAGIC; // Used to detect comm corruption. 
*comm->abortFlagRefCount = 1; NCCLCHECKGOTO(parseCommConfig(comm, config), res, fail); - /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ - comm->initState = ncclInternalError; + /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */ + comm->initState = ncclInProgress; *newcomm = comm; NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); @@ -1749,6 +1752,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId exit: return ncclGroupErrCheck(res); fail: + if (job) ncclCommInitJobFree(job); if (comm) { free(comm->abortFlag); if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev); @@ -1846,7 +1850,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { NCCLCHECKGOTO(ncclGroupEnd(), ret, fail); exit: - cudaSetDevice(oldDev); + (void)cudaSetDevice(oldDev); free(gpuFlags); return ret; fail: @@ -1926,14 +1930,9 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { struct ncclCommFinalizeAsyncJob* job = (struct ncclCommFinalizeAsyncJob*) job_; ncclComm_t comm = job->comm; - int savedDevice; - int commDevice = comm->cudaDev; ncclResult_t ret = ncclSuccess; - CUDACHECKGOTO(cudaGetDevice(&savedDevice), ret, fail); - if (savedDevice != commDevice) { - CUDACHECKGOTO(cudaSetDevice(commDevice), ret, fail); - } + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d asyncResult %d", comm, comm->rank, *comm->abortFlag, comm->asyncResult); @@ -1963,10 +1962,6 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { WARN("ncclProxyStop: comm %p (rank = %d) destroys proxy resource error %d", comm, comm->rank, ret); } - if (savedDevice != commDevice) { - CUDACHECKGOTO(cudaSetDevice(savedDevice), ret, fail); - } - exit: return ret; fail: @@ -1974,25 +1969,12 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { } static ncclResult_t commCleanup(ncclComm_t comm) { - int savedDevice; - int commDevice = comm->cudaDev; - - CUDACHECK(cudaGetDevice(&savedDevice)); - if (savedDevice != commDevice) { - CUDACHECK(cudaSetDevice(commDevice)); - } - + CUDACHECK(cudaSetDevice(comm->cudaDev)); if (comm->tuner != NULL) { NCCLCHECK(comm->tuner->destroy(comm->tunerContext)); NCCLCHECK(ncclTunerPluginUnload(comm)); } - NCCLCHECK(commFree(comm)); - - if (savedDevice != commDevice) { - CUDACHECK(cudaSetDevice(savedDevice)); - } - return ncclSuccess; } @@ -2099,6 +2081,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); + NCCLCHECK(ncclGroupStartInternal()); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); @@ -2113,6 +2096,8 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: + ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); return res; fail: goto exit; @@ -2124,7 +2109,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } - + NCCLCHECK(ncclGroupStartInternal()); // Ask anything that might still be running on the device to 
quit if (comm->childAbortFlag != nullptr) { __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE); @@ -2152,6 +2137,8 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: + ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); return ncclSuccess; fail: goto exit; @@ -2218,14 +2205,15 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail); } - /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ - childComm->initState = ncclInternalError; + /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. */ + childComm->initState = ncclInProgress; } NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = childComm; job->newcomm = newcomm; job->parent = comm; + job->splitCount = ++comm->splitCount; job->color = color; job->key = key; job->cudaDev = comm->cudaDev; @@ -2233,13 +2221,13 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail); exit: - cudaSetDevice(oldDev); + (void)cudaSetDevice(oldDev); (void)ncclGroupErrCheck(res); NCCLCHECK(ncclGroupEndInternal()); return res; fail: if (childComm) { - if (comm && !comm->config.splitShare) { + if (!comm->config.splitShare) { free(childComm->abortFlag); if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev); free(childComm->abortFlagRefCount); @@ -2347,14 +2335,12 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUDACHECK(cudaGetDevice(&cudaDev)); CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - if (CUPFN(cuMulticastCreate) != NULL) - CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); - if (mcSupport) { + if (ncclCuMemEnable()) { int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // Query device to see if FABRIC handle support is available flag = 0; - (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));; + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC; memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; @@ -2365,18 +2351,24 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev)); if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); - - /* mc property */ CUDACHECK(cudaGetDeviceCount(&dcnt)); - mcprop.size = size; - /* device cnt is a dummy value right now, it might affect mc granularity in the future. */ - mcprop.numDevices = dcnt; - mcprop.handleTypes = requestedHandleTypes; - mcprop.flags = 0; - CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - /* only size needs to be aligned to mcGran */ - ALIGN_SIZE(size, mcGran); + + if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); + if (mcSupport) { + /* mc property */ + mcprop.size = size; + /* device cnt is a dummy value right now, it might affect mc granularity in the future. 
*/ + mcprop.numDevices = dcnt; + mcprop.handleTypes = requestedHandleTypes; + mcprop.flags = 0; + CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + /* only size needs to be aligned to mcGran */ + ALIGN_SIZE(size, mcGran); + } else { + ALIGN_SIZE(size, memGran); + } + if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0)); @@ -2403,6 +2395,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); } + if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); } goto exit; } @@ -2429,18 +2422,13 @@ ncclResult_t ncclMemFree(void *ptr) { CUDACHECK(cudaGetDevice(&saveDevice)); #if CUDART_VERSION >= 12010 CUdevice ptrDev = 0; - int mcSupport = 0; if (ptr == NULL) goto fallback; - if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail); - if (CUPFN(cuMulticastCreate) != NULL) - CUCHECKGOTO(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, ptrDev), ret, fail); - CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail); - if (mcSupport) { + if (ncclCuMemEnable()) { NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail); goto exit; } diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 03e3bde99..e5fec1e46 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -11,7 +11,7 @@ // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2); -NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", 0); +NCCL_PARAM(CuMemHostEnable, "CUMEM_HOST_ENABLE", -1); // Handle type used for cuMemCreate() CUmemAllocationHandleType ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; @@ -35,9 +35,6 @@ int ncclIsCuMemSupported() { // Query device to see if CUMEM VMM support is available CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, currentDev), ret, error); if (!flag) return 0; - // Query device to see if CUMEM RDMA support is available - CUCHECKGOTO(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev), ret, error); - if (!flag) return 0; error: return (ret == ncclSuccess); #endif @@ -49,11 +46,31 @@ int ncclCuMemEnable() { return param >= 0 ? param : (param == -2 && ncclCuMemSupported); } +static int ncclCumemHostEnable = -1; int ncclCuMemHostEnable() { + if (ncclCumemHostEnable != -1) + return ncclCumemHostEnable; #if CUDART_VERSION < 12020 - return 0; + ncclCumemHostEnable = 0; + return ncclCumemHostEnable; #else - return ncclParamCuMemHostEnable(); + ncclResult_t ret = ncclSuccess; + int cudaDriverVersion; + int paramValue = -1; + CUDACHECKGOTO(cudaDriverGetVersion(&cudaDriverVersion), ret, error); + if (cudaDriverVersion < 12020) { + ncclCumemHostEnable = 0; + } + else { + paramValue = ncclParamCuMemHostEnable(); + if (paramValue != -1) + ncclCumemHostEnable = paramValue; + else + ncclCumemHostEnable = (cudaDriverVersion >= 12060) ? 
1 : 0; + } + return ncclCumemHostEnable; +error: + return (ret == ncclSuccess); #endif } @@ -218,10 +235,9 @@ static void initOnceFunc() { // Determine whether we support the cuMem APIs or not ncclCuMemSupported = ncclIsCuMemSupported(); -#if 12020 <= CUDART_VERSION && CUDART_VERSION <= 12030 - /* To use cuMem* for host memory allocation, we need to create context on each - * visible device. This is workaround needed in CUDA 12.3 which is fixed in 12.4. */ - if (ncclCuMemSupported && ncclCuMemHostEnable()) { + /* To use cuMem* for host memory allocation, we need to create context on each visible device. + * This is a workaround needed in CUDA 12.2 and CUDA 12.3 which is fixed in 12.4. */ + if (ncclCuMemSupported && ncclCuMemHostEnable() && 12020 <= driverVersion && driverVersion <= 12030) { int deviceCnt, saveDevice; cudaGetDevice(&saveDevice); cudaGetDeviceCount(&deviceCnt); @@ -231,7 +247,6 @@ static void initOnceFunc() { } cudaSetDevice(saveDevice); } -#endif initResult = ret; return; error: diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index eb4e52b60..698465ca4 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -8,6 +8,7 @@ #include #include +#include "ibvcore.h" #include "ibvsymbols.h" static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; @@ -53,7 +54,7 @@ ncclResult_t wrap_ibv_symbols(void) { } \ int ret = container.call; \ if (ret == ENOTSUP || ret == EOPNOTSUPP) { \ - INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + INFO(NCCL_NET, "Call to " name " not supported"); \ *supported = 0; \ return ncclSuccess; \ } else if (ret != success_retval) { \ @@ -87,6 +88,14 @@ ncclResult_t wrap_ibv_symbols(void) { container.call; \ return ncclSuccess; +NCCL_PARAM(IbMQpRetryAll, "IB_MQP_RETRY_ALL", 0); +NCCL_PARAM(IbMQpRetryCnt, "IB_MQP_RETRY_CNT", 34); +NCCL_PARAM(IbMQpRetryTimeout, "IB_MQP_RETRY_SLEEP_MSEC", 100); // in milliseconds + +#define IBV_ERR_EQ(e, code) (e == code || e == (-code)) +#define IBV_MQP_RETRY_ERRNO(e) (IBV_ERR_EQ(e, ETIMEDOUT)) +#define IBV_MQP_RETRY_ERRNO_ALL(e) (ncclParamIbMQpRetryAll() ? (e != 0) : IBV_MQP_RETRY_ERRNO(e)) + ncclResult_t wrap_ibv_fork_init() { IBV_INT_CHECK(ibvSymbols, ibv_internal_fork_init, ibv_internal_fork_init(), -1, "ibv_fork_init"); } @@ -202,8 +211,87 @@ ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct i IBV_PTR_CHECK_ERRNO(ibvSymbols, ibv_internal_create_qp, ibv_internal_create_qp(pd, qp_init_attr), *ret, NULL, "ibv_create_qp"); } -ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_modify_qp, ibv_internal_modify_qp(qp, attr, attr_mask), 0, "ibv_modify_qp"); +static void ibvQpStateName(enum ibv_qp_state state, char* msg, const size_t len) { + switch (state) { + case (IBV_QPS_RESET): snprintf(msg, len, "RESET"); break; + case (IBV_QPS_INIT): snprintf(msg, len, "INIT"); break; + case (IBV_QPS_RTR): snprintf(msg, len, "RTR"); break; + case (IBV_QPS_RTS): snprintf(msg, len, "RTS"); break; + case (IBV_QPS_SQD): snprintf(msg, len, "SQD"); break; + case (IBV_QPS_SQE): snprintf(msg, len, "SQE"); break; + case (IBV_QPS_ERR): snprintf(msg, len, "ERR"); break; + case (IBV_QPS_UNKNOWN): snprintf(msg, len, "UNKNOWN"); break; + default: snprintf(msg, len, "NOT RECOGNIZED (%d)", state); break; + } +} + +#define QP_ATTR(attr, userAttr, userFlag, mask) ((userFlag & mask) ? 
(userAttr) : (attr)) + +static void ibvModifyQpLog(struct ibv_qp* qp, enum ibv_qp_state qpState, struct ibv_qp_attr* userAttr, int userFlag, char* msg, size_t msgLen) { + ncclResult_t res; + int portNum = -1, gidIndex = -1; + char localGidName[INET6_ADDRSTRLEN], remoteGidName[INET6_ADDRSTRLEN]; + const char *localGidRes = NULL, *remoteGidRes = NULL; + + char nextState[32], currState[32]; + ibvQpStateName(qp->state, currState, sizeof(currState)); + ibvQpStateName(qpState, nextState, sizeof(nextState)); + char devName[IBV_SYSFS_NAME_MAX] = ""; + snprintf(devName, sizeof(devName), "%s", (qp->pd->context) ? wrap_ibv_get_device_name(qp->pd->context->device) : "N/A"); + + struct ibv_qp_attr attr; + struct ibv_qp_init_attr init_attr; + int attr_mask = IBV_QP_PORT | IBV_QP_AV; + res = wrap_ibv_query_qp(qp, &attr, attr_mask, &init_attr); + struct ibv_qp_attr *qpAttr = (res == ncclSuccess) ? &attr : NULL; + + // port info, portAttr can be NULL if not given by the user and query_qp failed + struct ibv_qp_attr *portAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_PORT); + portNum = portAttr ? portAttr->port_num : -1; + + // address info, avAttr can be NULL if not given by the user and query_qp failed + struct ibv_qp_attr *avAttr = QP_ATTR(qpAttr, userAttr, userFlag, IBV_QP_AV); + if (avAttr && avAttr->ah_attr.is_global) { + union ibv_gid *remoteGid = &avAttr->ah_attr.grh.dgid; + remoteGidRes = ibvGetGidStr(remoteGid, remoteGidName, sizeof(remoteGidName)); + // we need pd->context to retrieve local GID, skip if not there + if (!qp->pd->context) goto print; + gidIndex = avAttr->ah_attr.grh.sgid_index; + union ibv_gid localGid; + NCCLCHECKGOTO(wrap_ibv_query_gid(qp->pd->context, portNum, gidIndex, &localGid), res, print); + localGidRes = ibvGetGidStr(&localGid, localGidName, sizeof(localGidName)); + } + +print: + snprintf(msg, msgLen, "on dev %s:%d, curr state %s, next state %s, local GID index %d, local GID %s, remote GID %s", + devName, portNum, currState, nextState, gidIndex, localGidRes ? localGidName : "N/A", remoteGidRes ? 
remoteGidName : "N/A"); + return; +} + +ncclResult_t wrap_ibv_modify_qp(struct ibv_qp* qp, struct ibv_qp_attr* attr, int attr_mask) { + char qpMsg[1024]; + int ret = 0, attempts = 0; + int maxCnt = (int)ncclParamIbMQpRetryCnt() + 1; // number of attempts = number of retry + 1 + int timeOut = (int)ncclParamIbMQpRetryTimeout(); + CHECK_NOT_NULL(ibvSymbols, ibv_internal_modify_qp); + do { + if (attempts > 0) { + unsigned int sleepTime = timeOut * attempts; + ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg)); + INFO(NCCL_NET, "Call to ibv_modify_qp failed with %d %s, %s, retrying %d/%d after %u msec of sleep", ret, strerror(ret), qpMsg, attempts, maxCnt, sleepTime); + // sleep before retrying + struct timespec tv = {.tv_sec = sleepTime / 1000, .tv_nsec = (sleepTime % 1000) * ((long)1e6)}; + nanosleep(&tv, NULL); + } + ret = ibvSymbols.ibv_internal_modify_qp(qp, attr, attr_mask); + attempts++; + } while (IBV_MQP_RETRY_ERRNO_ALL(ret) && attempts < maxCnt); + if (ret != 0) { + ibvModifyQpLog(qp, attr->qp_state, attr, attr_mask, qpMsg, sizeof(qpMsg)); + WARN("Call to ibv_modify_qp failed with %d %s, %s", ret, strerror(ret), qpMsg); + return ncclSystemError; + } + return ncclSuccess; } ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ diff --git a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index 2d17f47e6..23746b3c5 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -189,14 +189,16 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp); - msg.msg_control = control_un.control; - msg.msg_controllen = sizeof(control_un.control); - - cmptr = CMSG_FIRSTHDR(&msg); - cmptr->cmsg_len = CMSG_LEN(sizeof(int)); - cmptr->cmsg_level = SOL_SOCKET; - cmptr->cmsg_type = SCM_RIGHTS; - memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); + if (sendFd != -1) { + msg.msg_control = control_un.control; + msg.msg_controllen = sizeof(control_un.control); + + cmptr = CMSG_FIRSTHDR(&msg); + cmptr->cmsg_len = CMSG_LEN(sizeof(int)); + cmptr->cmsg_level = SOL_SOCKET; + cmptr->cmsg_type = SCM_RIGHTS; + memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); + } msg.msg_name = (void *)&cliaddr; msg.msg_namelen = sizeof(struct sockaddr_un); diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index f441af80b..66ba2d4c8 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -311,19 +311,19 @@ ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { status->CCEnabled = false; if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE) - status->multiGpuCCEnabled = true; + status->multiGpuProtectedPCIE = true; else - status->multiGpuCCEnabled = false; + status->multiGpuProtectedPCIE = false; } else if (pfn_nvmlSystemGetConfComputeState != NULL) { NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020); if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) status->CCEnabled = true; else status->CCEnabled = false; - status->multiGpuCCEnabled = false; + status->multiGpuProtectedPCIE = false; } else { status->CCEnabled = false; - status->multiGpuCCEnabled = false; + status->multiGpuProtectedPCIE = false; } return ncclSuccess; } diff --git a/src/misc/profiler.cc b/src/misc/profiler.cc index 9a4adf579..c9fb2a869 100644 --- a/src/misc/profiler.cc +++ b/src/misc/profiler.cc @@ -16,9 
+16,110 @@ static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER; static int profilerPluginRefCount; static void* profilerPluginLib; static ncclProfiler_t* ncclProfiler; +static ncclProfiler_v2_t ncclProfiler_v1_as_v2; +static ncclProfiler_v1_t* ncclProfiler_v1; + +static uint8_t ncclStringToFunc(const char* func) { + if (0 == strcmp(func, "AllGather")) return ncclFuncAllGather; + if (0 == strcmp(func, "AllReduce")) return ncclFuncAllReduce; + if (0 == strcmp(func, "Broadcast")) return ncclFuncBroadcast; + if (0 == strcmp(func, "Recv")) return ncclFuncRecv; + if (0 == strcmp(func, "Reduce")) return ncclFuncReduce; + if (0 == strcmp(func, "ReduceScatter")) return ncclFuncReduceScatter; + if (0 == strcmp(func, "SendRecv")) return ncclFuncSendRecv; + return ncclFuncSend; +} + +static uint8_t ncclStringToAlgo(const char* algo) { + if (0 == strcmp(algo, "TREE")) return NCCL_ALGO_TREE; + if (0 == strcmp(algo, "RING")) return NCCL_ALGO_RING; + if (0 == strcmp(algo, "COLLNET_DIRECT")) return NCCL_ALGO_COLLNET_DIRECT; + if (0 == strcmp(algo, "COLLNET_CHAIN")) return NCCL_ALGO_COLLNET_CHAIN; + if (0 == strcmp(algo, "NVLS")) return NCCL_ALGO_NVLS; + if (0 == strcmp(algo, "NVLS_TREE")) return NCCL_ALGO_NVLS_TREE; + return NCCL_ALGO_PAT; +} + +static uint8_t ncclStringToProto(const char* proto) { + if (0 == strcmp(proto, "LL")) return NCCL_PROTO_LL; + if (0 == strcmp(proto, "LL128")) return NCCL_PROTO_LL128; + return NCCL_PROTO_SIMPLE; +} + +static uint8_t ncclStringToDatatype(const char* dt) { + if (0 == strcmp(dt, "ncclInt8")) return ncclInt8; + if (0 == strcmp(dt, "ncclInt32")) return ncclInt32; + if (0 == strcmp(dt, "ncclUint32")) return ncclUint32; + if (0 == strcmp(dt, "ncclInt64")) return ncclInt64; + if (0 == strcmp(dt, "ncclUint64")) return ncclUint64; + if (0 == strcmp(dt, "ncclFloat16")) return ncclFloat16; + if (0 == strcmp(dt, "ncclFloat32")) return ncclFloat32; +#if defined(__CUDA_BF16_TYPES_EXIST__) + if (0 == strcmp(dt, "ncclBfloat16")) return ncclBfloat16; +#endif + return ncclFloat64; +} + +static ncclResult_t ncclProfiler_v1_as_v2_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) { + ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 }; + eDescr_v1.type = eDescr->type; + eDescr_v1.parentObj = eDescr->parentObj; + eDescr_v1.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v1.coll.name = eDescr->coll.name; + eDescr_v1.coll.commHash = eDescr->coll.commHash; + eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); + eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v1.coll.count = eDescr->coll.count; + eDescr_v1.coll.root = eDescr->coll.root; + eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); + eDescr_v1.coll.op = 0; // removed in v2 + eDescr_v1.coll.trafficBytes = eDescr->coll.trafficBytes; + eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v1.coll.nWarps = eDescr->coll.nWarps; + eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); + eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); + } break; + case ncclProfileP2p: { + eDescr_v1.p2p.name = eDescr->p2p.name; + eDescr_v1.p2p.commHash = eDescr->p2p.commHash; + eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); + eDescr_v1.p2p.buff = eDescr->p2p.buff; + eDescr_v1.p2p.count = eDescr->p2p.count; + eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype); + 
eDescr_v1.p2p.peer = eDescr->p2p.peer; + } break; + case ncclProfileProxyOp: { + eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v1.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + default:; + } + return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); +} + +static ncclResult_t ncclProfiler_v1_as_v2_init(void** context, int* eActivationMask) { + ncclProfiler_v1->init(context, eActivationMask); + ncclProfiler_v1_as_v2.startEvent = ncclProfiler_v1_as_v2_startEvent; + ncclProfiler_v1_as_v2.stopEvent = ncclProfiler_v1->stopEvent; + ncclProfiler_v1_as_v2.recordEventState = ncclProfiler_v1->recordEventState; + ncclProfiler_v1_as_v2.finalize = ncclProfiler_v1->finalize; + return ncclSuccess; +} #define MAX_STR_LEN 256 -#define NCCL_PROFILER_PLUGIN_SYMBOL "ncclProfiler_v1" static void* tryOpenLib(char* name, int *err, char* errStr) { if (nullptr == name || strlen(name) == 0) { @@ -33,7 +134,7 @@ static void* tryOpenLib(char* name, int *err, char* errStr) { if (nullptr == handle) { strncpy(errStr, dlerror(), MAX_STR_LEN); errStr[MAX_STR_LEN] = 0; - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { + if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) { *err = ENOENT; } } @@ -116,10 +217,21 @@ static ncclResult_t ncclProfilerPluginLoad(void) { goto fail; } - ncclProfiler = (ncclProfiler_t*)dlsym(profilerPluginLib, NCCL_PROFILER_PLUGIN_SYMBOL); + ncclProfiler = (ncclProfiler_v2_t*)dlsym(profilerPluginLib, "ncclProfiler_v2"); if (ncclProfiler == nullptr) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find " NCCL_PROFILER_PLUGIN_SYMBOL "."); - goto fail; + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2."); + ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(profilerPluginLib, "ncclProfiler_v1"); + if (ncclProfiler_v1 == nullptr) { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); + goto fail; + } else { + ncclProfiler = &ncclProfiler_v1_as_v2; + ncclProfiler_v1_as_v2.name = ncclProfiler_v1->name; + ncclProfiler_v1_as_v2.init = ncclProfiler_v1_as_v2_init; + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v1."); + } + } else { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v2."); } ++profilerPluginRefCount; @@ -247,7 +359,7 @@ ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) { eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); if (__builtin_expect(ncclProfiler != NULL, 0)) { if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) { - ncclProfilerEventDescr_v1_t eDescr = { 0 }; + ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileGroup; ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr); } @@ -279,20 +391,17 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.coll.name = plan->comm->commName; eDescr.coll.commHash = plan->comm->commHash; eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++; - eDescr.coll.func = ct->func; + eDescr.coll.func = ncclFuncToString(ct->func); eDescr.coll.sendBuff = ct->sendbuff; 
eDescr.coll.recvBuff = ct->recvbuff; eDescr.coll.count = ct->count; eDescr.coll.root = ct->root; - eDescr.coll.datatype = ct->datatype; - eDescr.coll.op = ct->opHost; + eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); eDescr.coll.trafficBytes = ct->trafficBytes; eDescr.coll.nMaxChannels = ct->nMaxChannels; eDescr.coll.nWarps = ct->nWarps; - eDescr.coll.algo = ct->algorithm; - eDescr.coll.proto = ct->protocol; - eDescr.coll.isCollnet = ct->isCollnet; - eDescr.coll.isNvls = ct->isNvls; + eDescr.coll.algo = ncclAlgoToString(ct->algorithm); + eDescr.coll.proto = ncclProtoToString(ct->protocol); ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); // update collective task with group event activation mask @@ -307,10 +416,10 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.rank = plan->comm->rank; eDescr.p2p.name = plan->comm->commName; eDescr.p2p.commHash = plan->comm->commHash; - eDescr.p2p.func = pt->func; + eDescr.p2p.func = ncclFuncToString(pt->func); eDescr.p2p.buff = pt->buff; eDescr.p2p.count = pt->count; - eDescr.p2p.datatype = pt->datatype; + eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); eDescr.p2p.peer = pt->root; ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); @@ -345,6 +454,11 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { return ncclSuccess; } +// Below we set the proxy descriptor step number to DIVUP(step, args->sliceSteps). +// The reason is that for some ncclOp (e.g. AllReduce) one network transfer is +// made of sliceSteps steps rather than one step. In the profiler we are still +// interested in whole network transfers though, so we account for this when +// computing the actual network step number.
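As a rough illustration of the step accounting described in the comment above (this sketch is not part of the patch; DIVUP is redefined locally so it compiles standalone, and sliceSteps = 4 is just an assumed value): with sliceSteps = 4, proxy steps 1-4 all belong to the first network transfer and steps 5-8 to the second, which is exactly what DIVUP(step, sliceSteps) computes.

#include <stdio.h>
// Same rounding-up integer division used by NCCL's DIVUP macro.
#define DIVUP(x, y) (((x) + (y) - 1) / (y))

int main(void) {
  int sliceSteps = 4;  // assumed value for the example; in NCCL it comes from ncclProxyArgs
  for (int step = 1; step <= 8; step++) {
    // proxy steps 1..4 -> network step 1, proxy steps 5..8 -> network step 2
    printf("proxy step %d -> network step %d\n", step, DIVUP(step, sliceSteps));
  }
  return 0;
}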
ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) { TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; @@ -354,13 +468,13 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; eDescr.rank = sub->rank; - eDescr.proxyOp.pid = args->pid; + eDescr.proxyOp.pid = sub->pid; eDescr.proxyOp.channelId = sub->channelId; eDescr.proxyOp.peer = sub->peer; - eDescr.proxyOp.nSteps = sub->nsteps; - eDescr.proxyOp.chunkSize = args->chunkSize; + eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps); + eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps; eDescr.proxyOp.isSend = 1; - ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr); + ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr); } } TIME_STOP_EVENT(proxyOpStart); @@ -376,13 +490,13 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; eDescr.rank = sub->rank; - eDescr.proxyOp.pid = args->pid; + eDescr.proxyOp.pid = sub->pid; eDescr.proxyOp.channelId = sub->channelId; eDescr.proxyOp.peer = sub->peer; - eDescr.proxyOp.nSteps = sub->nsteps; - eDescr.proxyOp.chunkSize = args->chunkSize; + eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps); + eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps; eDescr.proxyOp.isSend = 0; - ncclProfiler->startEvent(args->profilerContext, &sub->opEventHandle, &eDescr); + ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr); } } TIME_STOP_EVENT(proxyOpStart); @@ -400,53 +514,50 @@ ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) { return ncclSuccess; } -ncclResult_t ncclProfilerStartSendProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) { +ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { - for (uint64_t step = stepLo; step < stepHi; step++) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileProxyStep; - eDescr.parentObj = sub->opEventHandle; - eDescr.rank = sub->rank; - eDescr.proxyStep.step = step; - ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr); - } + int step_ = DIVUP(stepId, args->sliceSteps); + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileProxyStep; + eDescr.parentObj = sub->opEventHandle; + eDescr.rank = sub->rank; + eDescr.proxyStep.step = step_; + ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); } } TIME_STOP_EVENT(proxyStepStart); return ncclSuccess; } -ncclResult_t ncclProfilerStartRecvProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) { +ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { - for (uint64_t step = stepLo; step < stepHi; step++) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = 
ncclProfileProxyStep; - eDescr.parentObj = sub->opEventHandle; - eDescr.rank = sub->rank; - eDescr.proxyStep.step = step; - ncclProfiler->startEvent(args->profilerContext, &sub->stepEventHandles[step%NCCL_STEPS], &eDescr); - } + int step_ = DIVUP(stepId, args->sliceSteps); + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileProxyStep; + eDescr.parentObj = sub->opEventHandle; + eDescr.rank = sub->rank; + eDescr.proxyStep.step = step_; + ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); } } TIME_STOP_EVENT(proxyStepStart); return ncclSuccess; } -ncclResult_t ncclProfilerStopProxyStepEvents(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi) { +ncclResult_t ncclProfilerStopProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStop); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - for (uint64_t step = stepLo; step < stepHi; step++) { - if (sub->stepEventHandles[step%NCCL_STEPS]) { - ncclProfiler->stopEvent(sub->stepEventHandles[step%NCCL_STEPS]); - sub->stepEventHandles[step%NCCL_STEPS] = NULL; - } + int step_ = DIVUP(stepId, args->sliceSteps); + if (sub->stepEventHandles[step_%NCCL_STEPS]) { + ncclProfiler->stopEvent(sub->stepEventHandles[step_%NCCL_STEPS]); + sub->stepEventHandles[step_%NCCL_STEPS] = NULL; } } TIME_STOP_EVENT(proxyStepStop); @@ -484,8 +595,8 @@ ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* ar TIME_START_EVENT(proxyOpRecord); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { - ncclProfilerEventStateArgs_t a = { 0 }; - a.proxyOp.steps = steps; + ncclProfilerEventStateArgs_t a = { }; + a.proxyOp.steps = DIVUP(steps, args->sliceSteps); a.proxyOp.transSize = transSize; ncclProfiler->recordEventState(sub->opEventHandle, eState, &a); } @@ -493,14 +604,13 @@ ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* ar return ncclSuccess; } -ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* args, uint64_t stepLo, uint64_t stepHi, ncclProfilerEventState_t eState) { +ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyStepRecord); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { - for (uint64_t step = stepLo; step < stepHi; step++) { - if (sub->stepEventHandles[step%NCCL_STEPS]) { - ncclProfiler->recordEventState(sub->stepEventHandles[step%NCCL_STEPS], eState, 0); - } + int step_ = DIVUP(stepId, args->sliceSteps); + if (sub->stepEventHandles[step_%NCCL_STEPS]) { + ncclProfiler->recordEventState(sub->stepEventHandles[step_%NCCL_STEPS], eState, 0); } } TIME_STOP_EVENT(proxyStepRecord); @@ -510,7 +620,7 @@ ncclResult_t ncclProfilerRecordProxyStepEventStates(int s, struct ncclProxyArgs* ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyCtrlRecord); if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) { - ncclProfilerEventStateArgs_t args = { 0 }; + ncclProfilerEventStateArgs_t args = { }; args.proxyCtrl.appendedProxyOps = appended; ncclProfiler->recordEventState(eHandle, eState, &args); } diff --git a/src/misc/shmutils.cc 
b/src/misc/shmutils.cc index daf3b338d..eb9cd1015 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -45,7 +45,7 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS return; } -ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) { +ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle) { int fd = -1; char* hptr = NULL; void* dptr = NULL; @@ -62,7 +62,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de * refcount references; when the peer attaches, it should pass -1 to reduce one reference count. When it * goes down to 0, unlink should be called in order to delete shared memory file. */ if (shmPath[0] == '\0') { - sprintf(shmPath, "/dev/shm/nccl-XXXXXX"); + snprintf(shmPath, shmPathSize, "/dev/shm/nccl-XXXXXX"); retry_mkstemp: fd = mkstemp(shmPath); if (fd < 0) { @@ -70,7 +70,7 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** de INFO(NCCL_ALL, "mkstemp: Failed to create %s, error: %s (%d) - retrying", shmPath, strerror(errno), errno); goto retry_mkstemp; } - WARN("Error: failed to create shared memory file %p, error %s (%d)", shmPath, strerror(errno), errno); + WARN("Error: failed to create shared memory file %s, error %s (%d)", shmPath, strerror(errno), errno); ret = ncclSystemError; goto fail; } diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 93e577e05..dfb4e6888 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -12,6 +12,18 @@ #include #include #include "param.h" +#include + +NCCL_PARAM(RetryCnt, "SOCKET_RETRY_CNT", 34); +NCCL_PARAM(RetryTimeOut, "SOCKET_RETRY_SLEEP_MSEC", 100); +static void msleep(unsigned int time_msec) { + const long c_1e6 = 1e6; + struct timespec tv = (struct timespec){ + .tv_sec = time_msec / 1000, + .tv_nsec = (time_msec % 1000) * c_1e6, + }; + nanosleep(&tv, NULL); +} static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int block, int* closed) { int bytes = 0; @@ -26,8 +38,13 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr return ncclSuccess; } if (bytes == -1) { + if ((op == NCCL_SOCKET_SEND && errno == EPIPE) || (op == NCCL_SOCKET_RECV && errno == ECONNRESET)) { + *closed = 1; + return ncclSuccess; + } if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { - WARN("socketProgressOpt: Call to recv from %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); + WARN("socketProgressOpt: Call to %s %s failed : %s", (op == NCCL_SOCKET_RECV ? 
"recv from" : "send to"), + ncclSocketToString(&sock->addr, line), strerror(errno)); return ncclRemoteError; } else { bytes = 0; @@ -38,17 +55,22 @@ static ncclResult_t socketProgressOpt(int op, struct ncclSocket* sock, void* ptr INFO(NCCL_NET, "socketProgressOpt: abort called"); return ncclInternalError; } - } while (bytes > 0 && (*offset) < size); + } while (sock->asyncFlag == 0 && bytes > 0 && (*offset) < size); return ncclSuccess; } -static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { +static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* pclosed = NULL) { int closed; NCCLCHECK(socketProgressOpt(op, sock, ptr, size, offset, 0 /*block*/, &closed)); if (closed) { - char line[SOCKET_NAME_MAXLEN+1]; - WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); - return ncclRemoteError; + if (pclosed) { + *pclosed = closed; + return ncclSuccess; + } else { + char line[SOCKET_NAME_MAXLEN+1]; + WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); + return ncclRemoteError; + } } return ncclSuccess; } @@ -63,9 +85,9 @@ static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int s * * Output: "IPv4/IPv6 address" */ -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { +const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { if (buf == NULL || addr == NULL) return NULL; - struct sockaddr *saddr = &addr->sa; + const struct sockaddr *saddr = &addr->sa; if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } char host[NI_MAXHOST], service[NI_MAXSERV]; /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned. @@ -370,10 +392,9 @@ ncclResult_t ncclSocketListen(struct ncclSocket* sock) { if (socketToPort(&sock->addr)) { // Port is forced by env. Make sure we get the port. int opt = 1; -#if defined(SO_REUSEPORT) - SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); -#else SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); +#if defined(SO_REUSEPORT) + SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); #endif } @@ -412,6 +433,15 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { sock->fd = accept(sock->acceptFd, (struct sockaddr*)&sock->addr, &socklen); if (sock->fd != -1) { sock->state = ncclSocketStateAccepted; + } else if (errno == ENETDOWN || errno == EPROTO || errno == ENOPROTOOPT || errno == EHOSTDOWN || + errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH) { + /* per accept's man page, for linux sockets, the following errors might be already pending errors + * and should be considered as EAGAIN. 
To avoid infinite loop in case of errors, we use the retry count*/ + if (++sock->errorRetries == ncclParamRetryCnt()) { + WARN("socketTryAccept: exceeded error retry count (%d), %s", sock->errorRetries, strerror(errno)); + return ncclSystemError; + } + INFO(NCCL_ALL, "Call to accept returned %s, retrying", strerror(errno)); } else if (errno != EAGAIN && errno != EWOULDBLOCK) { WARN("socketTryAccept: Accept failed: %s", strerror(errno)); return ncclSystemError; @@ -419,72 +449,118 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { return ncclSuccess; } -static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { - uint64_t magic; - enum ncclSocketType type; - int received = 0; +static ncclResult_t socketSetFlags(struct ncclSocket* sock) { const int one = 1; + /* Set socket as non-blocking if async or if we need to be able to abort */ + if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) { + int flags; + SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl"); + SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + } SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); + return ncclSuccess; +} - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); - if (received == 0) return ncclSuccess; - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); - if (magic != sock->magic) { - WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic); - close(sock->fd); - sock->fd = -1; - // Ignore spurious connection and accept again - sock->state = ncclSocketStateAccepting; - return ncclSuccess; - } else { - received = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received)); - if (type != sock->type) { - WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type); - sock->state = ncclSocketStateError; +static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { + uint64_t magic; + enum ncclSocketType type; + int received; + // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do) + NCCLCHECK(socketSetFlags(sock)); + + if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) { + if (sock->asyncFlag == 0) { + received = 0; + NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); + } else { + received = sock->finalizeCounter; + NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received)); + sock->finalizeCounter = received; + if (received < sizeof(magic)) return ncclSuccess; + memcpy(&magic, sock->finalizeBuffer, sizeof(magic)); + } + if (magic != sock->magic) { + WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, sock->magic); close(sock->fd); sock->fd = -1; - return ncclInternalError; - } else { - sock->state = ncclSocketStateReady; + // Ignore spurious connection and accept again + sock->state = ncclSocketStateAccepting; + return ncclSuccess; } } + if (sock->asyncFlag == 0) { + received = 0; + NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &type, sizeof(type), &received)); + } else { + received = sock->finalizeCounter - sizeof(magic); + NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(type), &received)); + sock->finalizeCounter = received + sizeof(magic); + if (received < sizeof(type)) return ncclSuccess; + memcpy(&type, sock->finalizeBuffer, sizeof(type)); + } + if (type != sock->type) { + WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type); + 
sock->state = ncclSocketStateError; + close(sock->fd); + sock->fd = -1; + return ncclInternalError; + } else { + sock->state = ncclSocketStateReady; + } return ncclSuccess; } -static ncclResult_t socketStartConnect(struct ncclSocket* sock) { - /* blocking/non-blocking connect() is determined by asyncFlag. */ - int ret = connect(sock->fd, &sock->addr.sa, sock->salen); - - if (ret == 0) { +static ncclResult_t socketResetFd(struct ncclSocket* sock) { + ncclResult_t ret = ncclSuccess; + int fd = -1; + SYSCHECKGOTO(fd = socket(sock->addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, cleanup); + // if sock->fd is valid, close it and reuse its number + if (sock->fd != -1) { + SYSCHECKGOTO(dup2(fd, sock->fd), "dup2", ret, cleanup); + SYSCHECKGOTO(close(fd), "close", ret, cleanup); + } else { + sock->fd = fd; + } + NCCLCHECKGOTO(socketSetFlags(sock), ret, exit); +exit: + return ret; +cleanup: + // cleanup fd, leave sock->fd untouched + if (fd != -1) { + (void)close(fd); + } + goto exit; +} +static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) { + if (errCode == 0) { sock->state = ncclSocketStateConnected; - return ncclSuccess; - } else if (errno == EINPROGRESS) { + } else if (errCode == EINPROGRESS) { sock->state = ncclSocketStateConnectPolling; - return ncclSuccess; - } else if (errno == ECONNREFUSED) { - if (++sock->refusedRetries == RETRY_REFUSED_TIMES) { - sock->state = ncclSocketStateError; - WARN("socketStartConnect: exceeded retries (%d)", sock->refusedRetries); - return ncclRemoteError; - } - usleep(SLEEP_INT); - if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); - return ncclSuccess; - } else if (errno == ETIMEDOUT) { - if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) { - sock->state = ncclSocketStateError; - WARN("socketStartConnect: exceeded timeouts (%d)", sock->timedOutRetries); - return ncclRemoteError; + } else if (errCode == ETIMEDOUT || errCode == EHOSTUNREACH || errCode == ECONNREFUSED) { + if (sock->customRetry == 0) { + if (sock->errorRetries++ == ncclParamRetryCnt()) { + sock->state = ncclSocketStateError; + WARN("%s: connect returned %s, exceeded error retry count (%d)", funcName, strerror(errCode), sock->errorRetries); + return ncclRemoteError; + } + unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut(); + INFO(NCCL_ALL, "%s: connect returned %s, retrying (%d/%ld) after sleep for %u msec", funcName, strerror(errCode), sock->errorRetries, ncclParamRetryCnt(), sleepTime); + msleep(sleepTime); } - usleep(SLEEP_INT); - return ncclSuccess; + NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */ + sock->state = ncclSocketStateConnecting; } else { char line[SOCKET_NAME_MAXLEN+1]; sock->state = ncclSocketStateError; - WARN("socketStartConnect: Connect to %s failed : %s", ncclSocketToString(&sock->addr, line), strerror(errno)); + WARN("%s: Connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode)); return ncclSystemError; } + return ncclSuccess; +} +static ncclResult_t socketStartConnect(struct ncclSocket* sock) { + /* blocking/non-blocking connect() is determined by asyncFlag. */ + int ret = connect(sock->fd, &sock->addr.sa, sock->salen); + return socketConnectCheck(sock, (ret == -1) ? 
errno : 0, __func__); } static ncclResult_t socketPollConnect(struct ncclSocket* sock) { @@ -509,33 +585,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) { /* check socket status */ SYSCHECK(getsockopt(sock->fd, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen), "getsockopt"); - - if (ret == 0) { - sock->state = ncclSocketStateConnected; - } else if (ret == ECONNREFUSED) { - if (++sock->refusedRetries == RETRY_REFUSED_TIMES) { - sock->state = ncclSocketStateError; - WARN("socketPollConnect: exceeded retries (%d)", sock->refusedRetries); - return ncclRemoteError; - } - if (sock->refusedRetries % 1000 == 0) INFO(NCCL_ALL, "Call to connect returned %s, retrying", strerror(errno)); - usleep(SLEEP_INT); - sock->state = ncclSocketStateConnecting; - } else if (ret == ETIMEDOUT) { - if (++sock->timedOutRetries == RETRY_TIMEDOUT_TIMES) { - sock->state = ncclSocketStateError; - WARN("socketPollConnect: exceeded timeouts (%d)", sock->timedOutRetries); - return ncclRemoteError; - } - usleep(SLEEP_INT); - sock->state = ncclSocketStateConnecting; - } else if (ret != EINPROGRESS) { - sock->state = ncclSocketStateError; - char line[SOCKET_NAME_MAXLEN+1]; - WARN("socketPollConnect: Connect to %s returned %d(%s) errno %d(%s)", ncclSocketToString(&sock->addr, line), ret, strerror(ret), errno, strerror(errno)); - return ncclSystemError; - } - return ncclSuccess; + return socketConnectCheck(sock, ret, __func__); } ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) { @@ -548,12 +598,24 @@ ncclResult_t ncclSocketPollConnect(struct ncclSocket* sock) { } static ncclResult_t socketFinalizeConnect(struct ncclSocket* sock) { - int sent = 0; - NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); - if (sent == 0) return ncclSuccess; - NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); - sent = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent)); + int sent; + if (sock->asyncFlag == 0) { + sent = 0; + NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); + sent = 0; + NCCLCHECK(socketWait(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent)); + } else { + if (sock->finalizeCounter < sizeof(sock->magic)) { + sent = sock->finalizeCounter; + NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->magic, sizeof(sock->magic), &sent)); + sock->finalizeCounter = sent; + if (sent < sizeof(sock->magic)) return ncclSuccess; + } + sent = sock->finalizeCounter - sizeof(sock->magic); + NCCLCHECK(socketProgress(NCCL_SOCKET_SEND, sock, &sock->type, sizeof(sock->type), &sent)); + sock->finalizeCounter = sent + sizeof(sock->magic); + if (sent < sizeof(sock->type)) return ncclSuccess; + } sock->state = ncclSocketStateReady; return ncclSuccess; } @@ -598,7 +660,6 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif - const int one = 1; if (sock == NULL) { WARN("ncclSocketConnect: pass NULL socket"); @@ -616,9 +677,8 @@ ncclResult_t ncclSocketConnect(struct ncclSocket* sock) { } TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", ncclSocketToString(&sock->addr, line)); - SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); - sock->state = ncclSocketStateConnecting; + sock->finalizeCounter = 0; do { NCCLCHECK(socketProgressState(sock)); } while (sock->asyncFlag == 0 && @@ -664,6 +724,7 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct 
ncclSocket* listen memcpy(sock, listenSock, sizeof(struct ncclSocket)); sock->acceptFd = listenSock->fd; sock->state = ncclSocketStateAccepting; + sock->finalizeCounter = 0; } do { @@ -694,12 +755,11 @@ ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* listen return ret; } -ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag) { +ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr, uint64_t magic, enum ncclSocketType type, volatile uint32_t* abortFlag, int asyncFlag, int customRetry) { ncclResult_t ret = ncclSuccess; if (sock == NULL) goto exit; - sock->timedOutRetries = 0; - sock->refusedRetries = 0; + sock->errorRetries = 0; sock->abortFlag = abortFlag; sock->asyncFlag = asyncFlag; sock->state = ncclSocketStateInitialized; @@ -707,6 +767,7 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad sock->type = type; sock->fd = -1; sock->acceptFd = -1; + sock->customRetry = customRetry; if (addr) { /* IPv4/IPv6 support */ @@ -718,28 +779,14 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad WARN("ncclSocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)", ncclSocketToString(&sock->addr, line), family, AF_INET, AF_INET6); ret = ncclInternalError; - goto fail; + goto exit; } sock->salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); - - /* Connect to a hostname / port */ - sock->fd = socket(family, SOCK_STREAM, 0); - if (sock->fd == -1) { - WARN("ncclSocketInit: Socket creation failed : %s", strerror(errno)); - ret = ncclSystemError; - goto fail; - } + // in case of error, we close the fd before returning as it's unclear if the caller has to use ncclSocketClose for cleanup + NCCLCHECKGOTO(socketResetFd(sock), ret, fail); } else { memset(&sock->addr, 0, sizeof(union ncclSocketAddress)); } - - /* Set socket as non-blocking if async or if we need to be able to abort */ - if ((sock->asyncFlag || sock->abortFlag) && sock->fd >= 0) { - int flags; - SYSCHECKGOTO(flags = fcntl(sock->fd, F_GETFL), "fcntl", ret, fail); - SYSCHECKGOTO(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail); - } - exit: return ret; fail: @@ -750,12 +797,12 @@ ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* ad goto exit; } -ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset) { +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset, int* closed) { if (sock == NULL) { WARN("ncclSocketProgress: pass NULL socket"); return ncclInvalidArgument; } - NCCLCHECK(socketProgress(op, sock, ptr, size, offset)); + NCCLCHECK(socketProgress(op, sock, ptr, size, offset, closed)); return ncclSuccess; } @@ -788,7 +835,7 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size) { WARN("ncclSocketRecv: pass NULL socket"); return ncclInvalidArgument; } - if (sock->state != ncclSocketStateReady) { + if (sock->state != ncclSocketStateReady && sock->state != ncclSocketStateTerminating) { WARN("ncclSocketRecv: socket state (%d) is not ready", sock->state); return ncclInternalError; } @@ -802,7 +849,8 @@ ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int WARN("ncclSocketSendRecv: invalid socket %p/%p", sendSock, recvSock); return ncclInternalError; } - if 
(sendSock->state != ncclSocketStateReady || recvSock->state != ncclSocketStateReady) { + if (sendSock->state != ncclSocketStateReady || + (recvSock->state != ncclSocketStateReady && recvSock->state != ncclSocketStateTerminating)) { WARN("ncclSocketSendRecv: socket state (%d/%d) is not ready", sendSock->state, recvSock->state); return ncclInternalError; } @@ -846,9 +894,20 @@ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int return ncclSuccess; } -ncclResult_t ncclSocketClose(struct ncclSocket* sock) { +// Make it possible to close just one part of a socket. +ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) { if (sock != NULL) { if (sock->fd >= 0) { + shutdown(sock->fd, how); + } + sock->state = ncclSocketStateTerminating; + } + return ncclSuccess; +} + +ncclResult_t ncclSocketClose(struct ncclSocket* sock) { + if (sock != NULL) { + if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) { /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc index f1a9756f1..267e12a03 100644 --- a/src/misc/tuner.cc +++ b/src/misc/tuner.cc @@ -16,9 +16,11 @@ pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; static int tunerPluginRefCount; static void* tunerPluginLib = nullptr; -static ncclTuner_v3_t* tunerSymbol = nullptr; +static ncclTuner_v4_t* tunerSymbol = nullptr; +static ncclTuner_v3_t* ncclTuner_v3 = nullptr; static ncclTuner_v2_t* ncclTuner_v2 = nullptr; -static ncclTuner_v3_t ncclTuner_v2_as_v3; +static ncclTuner_v4_t ncclTuner_v2_as_v4; +static ncclTuner_v4_t ncclTuner_v3_as_v4; static int hasNvlsSupport(float** collCostTable) { // Requirements for support of different algorithms: @@ -39,7 +41,20 @@ static int hasCollNetSupport(float** collCostTable) { return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; } -static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int* nChannels) { +static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { + NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); + return ncclSuccess; +} + +static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { + NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context)); + ncclTuner_v3_as_v4.name = ncclTuner_v3->name; + ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo; + ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy; + return ncclSuccess; +} + +static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { int algorithm = NCCL_ALGO_UNDEF; int protocol = NCCL_PROTO_UNDEF; int nvlsSupport = hasNvlsSupport(collCostTable); @@ -53,11 +68,11 @@ static ncclResult_t ncclTuner_v2_as_v3_getCollInfo(void* context, ncclFunc_t col return ncclSuccess; } -static ncclResult_t ncclTuner_v2_as_v3_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { +static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v2_as_v3.name = ncclTuner_v2->name; - ncclTuner_v2_as_v3.getCollInfo = ncclTuner_v2_as_v3_getCollInfo; - ncclTuner_v2_as_v3.destroy = ncclTuner_v2->destroy; + ncclTuner_v2_as_v4.name = ncclTuner_v2->name; + ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo; + ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy; return ncclSuccess; } @@ -198,18 +213,26 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { goto fail; } - tunerSymbol = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); + tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4"); if (tunerSymbol == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); - ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); - if (ncclTuner_v2 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); - dlclose(tunerPluginLib); - goto fail; + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); + ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); + if (ncclTuner_v3 == nullptr) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); + ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); + if (ncclTuner_v2 == nullptr) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); + dlclose(tunerPluginLib); + goto fail; + } else { + ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init; + ncclTuner_v2_as_v4.name = ncclTuner_v2->name; + tunerSymbol = &ncclTuner_v2_as_v4; + } } 
else { - ncclTuner_v2_as_v3.init = ncclTuner_v2_as_v3_init; - ncclTuner_v2_as_v3.name = ncclTuner_v2->name; - tunerSymbol = &ncclTuner_v2_as_v3; + ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init; + ncclTuner_v3_as_v4.name = ncclTuner_v3->name; + tunerSymbol = &ncclTuner_v3_as_v4; } } diff --git a/src/nccl.h.in b/src/nccl.h.in index 431ecb554..8a6f94e24 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -12,6 +12,9 @@ #if CUDART_VERSION >= 11000 #include #endif +#if CUDART_VERSION >= 11080 +#include +#endif #define NCCL_MAJOR ${nccl:Major} #define NCCL_MINOR ${nccl:Minor} @@ -183,6 +186,10 @@ const char* pncclGetErrorString(ncclResult_t result); const char* ncclGetLastError(ncclComm_t comm); const char* pncclGetLastError(ncclComm_t comm); +/* Reload environment variables that determine logging. */ +void ncclResetDebugInit(); +void pncclResetDebugInit(); + /* Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); @@ -236,12 +243,10 @@ typedef enum { ncclInt8 = 0, ncclChar = 0, ncclFloat16 = 6, ncclHalf = 6, ncclFloat32 = 7, ncclFloat = 7, ncclFloat64 = 8, ncclDouble = 8, -#if defined(__CUDA_BF16_TYPES_EXIST__) ncclBfloat16 = 9, - ncclNumTypes = 10 -#else - ncclNumTypes = 9 -#endif + ncclFloat8e4m3 = 10, + ncclFloat8e5m2 = 11, + ncclNumTypes = 12 } ncclDataType_t; /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ diff --git a/src/net.cc b/src/net.cc index 97a8c7381..13e8c2b51 100644 --- a/src/net.cc +++ b/src/net.cc @@ -15,20 +15,95 @@ //#include //#include -static ncclNet_v8_t ncclNet_v5_as_v8; -static ncclNet_v8_t ncclNet_v6_as_v8; -static ncclNet_v8_t ncclNet_v7_as_v8; +static ncclNet_v9_t ncclNet_v5_as_v9; +static ncclNet_v9_t ncclNet_v6_as_v9; +static ncclNet_v9_t ncclNet_v7_as_v9; +static ncclNet_v9_t ncclNet_v8_as_v9; static ncclNet_v5_t *ncclNet_v5; static ncclNet_v6_t *ncclNet_v6; static ncclNet_v7_t *ncclNet_v7; -static ncclCollNet_v8_t ncclCollNet_v5_as_v8; -static ncclCollNet_v8_t ncclCollNet_v6_as_v8; -static ncclCollNet_v8_t ncclCollNet_v7_as_v8; +static ncclNet_v8_t *ncclNet_v8; +static ncclCollNet_v9_t ncclCollNet_v5_as_v9; +static ncclCollNet_v9_t ncclCollNet_v6_as_v9; +static ncclCollNet_v9_t ncclCollNet_v7_as_v9; +static ncclCollNet_v9_t ncclCollNet_v8_as_v9; static ncclCollNet_v5_t *ncclCollNet_v5; static ncclCollNet_v6_t *ncclCollNet_v6; static ncclCollNet_v7_t *ncclCollNet_v7; +static ncclCollNet_v8_t *ncclCollNet_v8; -static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 
+#define MAX_COLLNET_SIZE (512*1024*1024L) // Set for initial collnet plugins when size was not dynamically queried + +static ncclResult_t ncclNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { + ncclNetProperties_v8_t p8; + ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8); + if (ans != ncclSuccess) return ans; + props->name = p8.name; + props->pciPath = p8.pciPath; + props->guid = p8.guid; + props->ptrSupport = p8.ptrSupport; + props->regIsGlobal = p8.regIsGlobal; + props->forceFlush = 0; + props->speed = p8.speed; + props->port = p8.port; + props->maxComms = p8.maxComms; + props->maxRecvs = p8.maxRecvs; + props->latency = p8.latency; + props->netDeviceType = p8.netDeviceType; + props->netDeviceVersion = p8.netDeviceVersion; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclNet_v8_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_v8_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to NULL if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclNet_v8_as_v9_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclNet_v8->init(logfn)); + ncclNet_v8_as_v9.name = ncclNet_v8->name; + ncclNet_v8_as_v9.devices = ncclNet_v8->devices; + ncclNet_v8_as_v9.getProperties = ncclNet_v8_as_v9_getProperties; + ncclNet_v8_as_v9.listen = ncclNet_v8->listen; + ncclNet_v8_as_v9.connect = ncclNet_v8->connect; + ncclNet_v8_as_v9.accept = ncclNet_v8->accept; + ncclNet_v8_as_v9.regMr = ncclNet_v8->regMr; + ncclNet_v8_as_v9.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; + ncclNet_v8_as_v9.deregMr = ncclNet_v8->deregMr; + ncclNet_v8_as_v9.isend = ncclNet_v8_as_v9_isend; + ncclNet_v8_as_v9.irecv = ncclNet_v8_as_v9_irecv; + ncclNet_v8_as_v9.iflush = ncclNet_v8->iflush; + ncclNet_v8_as_v9.test = ncclNet_v8->test; + ncclNet_v8_as_v9.closeSend = ncclNet_v8->closeSend; + ncclNet_v8_as_v9.closeRecv = ncclNet_v8->closeRecv; + ncclNet_v8_as_v9.closeListen = ncclNet_v8->closeListen; + ncclNet_v8_as_v9.getDeviceMr = ncclNet_v8->getDeviceMr; + ncclNet_v8_as_v9.irecvConsumed = ncclNet_v8->irecvConsumed; + ncclNet_v8_as_v9.makeVDevice = NULL; + return ncclSuccess; +} + +static ncclResult_t ncclNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); if (ans != ncclSuccess) return ans; @@ -37,6 +112,7 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8 props->guid = p7.guid; props->ptrSupport = p7.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p7.speed; props->port = p7.port; props->maxComms = p7.maxComms; @@ -44,38 +120,63 @@ static ncclResult_t ncclNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8 props->latency = p7.latency; props->netDeviceType = p7.netDeviceType;
props->netDeviceVersion = p7.netDeviceVersion; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_v7_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclNet_v7_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_v7_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to NULL if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclNet_v7_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v7->init(logfn)); - ncclNet_v7_as_v8.name = ncclNet_v7->name; - ncclNet_v7_as_v8.devices = ncclNet_v7->devices; - ncclNet_v7_as_v8.getProperties = ncclNet_v7_as_v8_getProperties; // ncclNet_v5->getProperties; - ncclNet_v7_as_v8.listen = ncclNet_v7->listen; - ncclNet_v7_as_v8.connect = ncclNet_v7->connect; - ncclNet_v7_as_v8.accept = ncclNet_v7->accept; - ncclNet_v7_as_v8.regMr = ncclNet_v7_as_v8_regMr; - ncclNet_v7_as_v8.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; - ncclNet_v7_as_v8.deregMr = ncclNet_v7->deregMr; - ncclNet_v7_as_v8.isend = ncclNet_v7->isend; - ncclNet_v7_as_v8.irecv = ncclNet_v7->irecv; - ncclNet_v7_as_v8.iflush = ncclNet_v7->iflush; - ncclNet_v7_as_v8.test = ncclNet_v7->test; - ncclNet_v7_as_v8.closeSend = ncclNet_v7->closeSend; - ncclNet_v7_as_v8.closeRecv = ncclNet_v7->closeRecv; - ncclNet_v7_as_v8.closeListen = ncclNet_v7->closeListen; - ncclNet_v7_as_v8.getDeviceMr = ncclNet_v7->getDeviceMr; - ncclNet_v7_as_v8.irecvConsumed = ncclNet_v7->irecvConsumed; + ncclNet_v7_as_v9.name = ncclNet_v7->name; + ncclNet_v7_as_v9.devices = ncclNet_v7->devices; + ncclNet_v7_as_v9.getProperties = ncclNet_v7_as_v9_getProperties; // ncclNet_v5->getProperties; + ncclNet_v7_as_v9.listen = ncclNet_v7->listen; + ncclNet_v7_as_v9.connect = ncclNet_v7->connect; + ncclNet_v7_as_v9.accept = ncclNet_v7->accept; + ncclNet_v7_as_v9.regMr = ncclNet_v7_as_v9_regMr; + ncclNet_v7_as_v9.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; + ncclNet_v7_as_v9.deregMr = ncclNet_v7->deregMr; + ncclNet_v7_as_v9.isend = ncclNet_v7_as_v9_isend; + ncclNet_v7_as_v9.irecv = ncclNet_v7_as_v9_irecv; + ncclNet_v7_as_v9.iflush = ncclNet_v7->iflush; + ncclNet_v7_as_v9.test = ncclNet_v7->test; + ncclNet_v7_as_v9.closeSend = ncclNet_v7->closeSend; + ncclNet_v7_as_v9.closeRecv = ncclNet_v7->closeRecv; + ncclNet_v7_as_v9.closeListen = ncclNet_v7->closeListen; + ncclNet_v7_as_v9.getDeviceMr = ncclNet_v7->getDeviceMr; + ncclNet_v7_as_v9.irecvConsumed = ncclNet_v7->irecvConsumed; + ncclNet_v7_as_v9.makeVDevice = NULL; return
ncclSuccess; } -static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; @@ -84,6 +185,7 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8 props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; @@ -91,46 +193,71 @@ static ncclResult_t ncclNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8 props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_v6_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +static ncclResult_t ncclNet_v6_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { return ncclNet_v6->connect(dev, handle, sendComm); } -static ncclResult_t ncclNet_v6_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { +static ncclResult_t ncclNet_v6_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { return ncclNet_v6->accept(listenComm, recvComm); } -static ncclResult_t ncclNet_v6_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclNet_v6_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_v6_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to NULL if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclNet_v6_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v6->init(logfn)); - ncclNet_v6_as_v8.name = ncclNet_v6->name; - ncclNet_v6_as_v8.devices = ncclNet_v6->devices; - ncclNet_v6_as_v8.getProperties = ncclNet_v6_as_v8_getProperties; // ncclNet_v5->getProperties; - ncclNet_v6_as_v8.listen = ncclNet_v6->listen; - ncclNet_v6_as_v8.connect = ncclNet_v6_as_v8_connect; - ncclNet_v6_as_v8.accept = ncclNet_v6_as_v8_accept; - ncclNet_v6_as_v8.regMr = ncclNet_v6_as_v8_regMr; - ncclNet_v6_as_v8.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; - ncclNet_v6_as_v8.deregMr = ncclNet_v6->deregMr; - ncclNet_v6_as_v8.isend = ncclNet_v6->isend; - ncclNet_v6_as_v8.irecv =
ncclNet_v6->irecv; - ncclNet_v6_as_v8.iflush = ncclNet_v6->iflush; - ncclNet_v6_as_v8.test = ncclNet_v6->test; - ncclNet_v6_as_v8.closeSend = ncclNet_v6->closeSend; - ncclNet_v6_as_v8.closeRecv = ncclNet_v6->closeRecv; - ncclNet_v6_as_v8.closeListen = ncclNet_v6->closeListen; - ncclNet_v6_as_v8.getDeviceMr = NULL; - ncclNet_v6_as_v8.irecvConsumed = NULL; + ncclNet_v6_as_v9.name = ncclNet_v6->name; + ncclNet_v6_as_v9.devices = ncclNet_v6->devices; + ncclNet_v6_as_v9.getProperties = ncclNet_v6_as_v9_getProperties; + ncclNet_v6_as_v9.listen = ncclNet_v6->listen; + ncclNet_v6_as_v9.connect = ncclNet_v6_as_v9_connect; + ncclNet_v6_as_v9.accept = ncclNet_v6_as_v9_accept; + ncclNet_v6_as_v9.regMr = ncclNet_v6_as_v9_regMr; + ncclNet_v6_as_v9.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; + ncclNet_v6_as_v9.deregMr = ncclNet_v6->deregMr; + ncclNet_v6_as_v9.isend = ncclNet_v6_as_v9_isend; + ncclNet_v6_as_v9.irecv = ncclNet_v6_as_v9_irecv; + ncclNet_v6_as_v9.iflush = ncclNet_v6->iflush; + ncclNet_v6_as_v9.test = ncclNet_v6->test; + ncclNet_v6_as_v9.closeSend = ncclNet_v6->closeSend; + ncclNet_v6_as_v9.closeRecv = ncclNet_v6->closeRecv; + ncclNet_v6_as_v9.closeListen = ncclNet_v6->closeListen; + ncclNet_v6_as_v9.getDeviceMr = NULL; + ncclNet_v6_as_v9.irecvConsumed = NULL; + ncclNet_v6_as_v9.makeVDevice = NULL; return ncclSuccess; } -static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; @@ -139,6 +266,7 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8 props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; @@ -146,48 +274,73 @@ static ncclResult_t ncclNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8 props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_v5_as_v8_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +static ncclResult_t ncclNet_v5_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { return ncclNet_v5->connect(dev, handle, sendComm); } -static ncclResult_t ncclNet_v5_as_v8_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { +static ncclResult_t ncclNet_v5_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { return ncclNet_v5->accept(listenComm, recvComm); } +static ncclResult_t ncclNet_v5_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v5->isend(sendComm, data, sizeInt, tag, 
mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_v5_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + int sizesInt[NCCL_PROXY_MAX_SUBS]; + //reset to NULL if optional receive completion is set + if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; + for (int i=0; i<n; i++) { + if (sizes[i] > MAX_NET_SIZE) return ncclInternalError; + sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v5->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclNet_v5_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclNet_v5_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclNet_v5->init(logfn)); - ncclNet_v5_as_v8.name = ncclNet_v5->name; - ncclNet_v5_as_v8.devices = ncclNet_v5->devices; - ncclNet_v5_as_v8.getProperties = ncclNet_v5_as_v8_getProperties; - ncclNet_v5_as_v8.listen = ncclNet_v5->listen; - ncclNet_v5_as_v8.connect = ncclNet_v5_as_v8_connect; - ncclNet_v5_as_v8.accept = ncclNet_v5_as_v8_accept; - ncclNet_v5_as_v8.regMr = ncclNet_v5_as_v8_regMr; - ncclNet_v5_as_v8.regMrDmaBuf = NULL; - ncclNet_v5_as_v8.deregMr = ncclNet_v5->deregMr; - ncclNet_v5_as_v8.isend = ncclNet_v5->isend; - ncclNet_v5_as_v8.irecv = ncclNet_v5->irecv; - ncclNet_v5_as_v8.iflush = ncclNet_v5->iflush; - ncclNet_v5_as_v8.test = ncclNet_v5->test; - ncclNet_v5_as_v8.closeSend = ncclNet_v5->closeSend; - ncclNet_v5_as_v8.closeRecv = ncclNet_v5->closeRecv; - ncclNet_v5_as_v8.closeListen = ncclNet_v5->closeListen; - ncclNet_v5_as_v8.getDeviceMr = NULL; - ncclNet_v5_as_v8.irecvConsumed = NULL; + ncclNet_v5_as_v9.name = ncclNet_v5->name; + ncclNet_v5_as_v9.devices = ncclNet_v5->devices; + ncclNet_v5_as_v9.getProperties = ncclNet_v5_as_v9_getProperties; + ncclNet_v5_as_v9.listen = ncclNet_v5->listen; + ncclNet_v5_as_v9.connect = ncclNet_v5_as_v9_connect; + ncclNet_v5_as_v9.accept = ncclNet_v5_as_v9_accept; + ncclNet_v5_as_v9.regMr = ncclNet_v5_as_v9_regMr; + ncclNet_v5_as_v9.regMrDmaBuf = NULL; + ncclNet_v5_as_v9.deregMr = ncclNet_v5->deregMr; + ncclNet_v5_as_v9.isend = ncclNet_v5_as_v9_isend; + ncclNet_v5_as_v9.irecv = ncclNet_v5_as_v9_irecv; + ncclNet_v5_as_v9.iflush = ncclNet_v5->iflush; + ncclNet_v5_as_v9.test = ncclNet_v5->test; + ncclNet_v5_as_v9.closeSend = ncclNet_v5->closeSend; + ncclNet_v5_as_v9.closeRecv = ncclNet_v5->closeRecv; + ncclNet_v5_as_v9.closeListen = ncclNet_v5->closeListen; + ncclNet_v5_as_v9.getDeviceMr = NULL; + ncclNet_v5_as_v9.irecvConsumed = NULL; + ncclNet_v5_as_v9.makeVDevice = NULL; return ncclSuccess; } -static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclCollNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; @@ -196,6 +349,7 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; @@ -203,38 +357,52 @@ static ncclResult_t ncclCollNet_v5_as_v8_getProperties(int dev, ncclNetPropertie props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; +
props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclCollNet_v5_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclCollNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); } +static ncclResult_t ncclCollNet_v5_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v5->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + // We use a wrapper around the v5 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v5_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_v5_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v5->init(logfn)); - ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name; - ncclCollNet_v5_as_v8.devices = ncclCollNet_v5->devices; - ncclCollNet_v5_as_v8.getProperties = ncclCollNet_v5_as_v8_getProperties; - ncclCollNet_v5_as_v8.listen = ncclCollNet_v5->listen; - ncclCollNet_v5_as_v8.connect = ncclCollNet_v5->connect; - ncclCollNet_v5_as_v8.reduceSupport = ncclCollNet_v5->reduceSupport; - ncclCollNet_v5_as_v8.regMr = ncclCollNet_v5_as_v8_regMr; - ncclCollNet_v5_as_v8.regMrDmaBuf = NULL; - ncclCollNet_v5_as_v8.deregMr = ncclCollNet_v5->deregMr; - ncclCollNet_v5_as_v8.iallreduce = ncclCollNet_v5->iallreduce; - ncclCollNet_v5_as_v8.iallgather = nullptr; - ncclCollNet_v5_as_v8.ireducescatter = nullptr; - ncclCollNet_v5_as_v8.iflush = ncclCollNet_v5->iflush; - ncclCollNet_v5_as_v8.test = ncclCollNet_v5->test; - ncclCollNet_v5_as_v8.closeColl = ncclCollNet_v5->closeColl; - ncclCollNet_v5_as_v8.closeListen = ncclCollNet_v5->closeListen; + ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; + ncclCollNet_v5_as_v9.devices = ncclCollNet_v5->devices; + ncclCollNet_v5_as_v9.getProperties = ncclCollNet_v5_as_v9_getProperties; + ncclCollNet_v5_as_v9.listen = ncclCollNet_v5->listen; + ncclCollNet_v5_as_v9.connect = ncclCollNet_v5->connect; + ncclCollNet_v5_as_v9.reduceSupport = ncclCollNet_v5->reduceSupport; + ncclCollNet_v5_as_v9.regMr = ncclCollNet_v5_as_v9_regMr; + ncclCollNet_v5_as_v9.regMrDmaBuf = NULL; + ncclCollNet_v5_as_v9.deregMr = ncclCollNet_v5->deregMr; + ncclCollNet_v5_as_v9.iallreduce = ncclCollNet_v5_as_v9_iallreduce; + ncclCollNet_v5_as_v9.iallgather = nullptr; + ncclCollNet_v5_as_v9.ireducescatter = nullptr; + ncclCollNet_v5_as_v9.iflush = ncclCollNet_v5->iflush; + ncclCollNet_v5_as_v9.test = ncclCollNet_v5->test; + ncclCollNet_v5_as_v9.closeColl = ncclCollNet_v5->closeColl; + ncclCollNet_v5_as_v9.closeListen = ncclCollNet_v5->closeListen; return ncclSuccess; } -static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclCollNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); if (ans != ncclSuccess) return ans; @@ -243,6 +411,7 @@ static ncclResult_t 
ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie props->guid = p6.guid; props->ptrSupport = p6.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p6.speed; props->port = p6.port; props->maxComms = p6.maxComms; @@ -250,38 +419,52 @@ static ncclResult_t ncclCollNet_v6_as_v8_getProperties(int dev, ncclNetPropertie props->latency = p6.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclCollNet_v6_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclCollNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); } +static ncclResult_t ncclCollNet_v6_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + // We use a wrapper around the v6 init to copy over the struct contents // post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v6_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_v6_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v6->init(logfn)); - ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name; - ncclCollNet_v6_as_v8.devices = ncclCollNet_v6->devices; - ncclCollNet_v6_as_v8.getProperties = ncclCollNet_v6_as_v8_getProperties; - ncclCollNet_v6_as_v8.listen = ncclCollNet_v6->listen; - ncclCollNet_v6_as_v8.connect = ncclCollNet_v6->connect; - ncclCollNet_v6_as_v8.reduceSupport = ncclCollNet_v6->reduceSupport; - ncclCollNet_v6_as_v8.regMr = ncclCollNet_v6_as_v8_regMr; - ncclCollNet_v6_as_v8.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; - ncclCollNet_v6_as_v8.deregMr = ncclCollNet_v6->deregMr; - ncclCollNet_v6_as_v8.iallreduce = ncclCollNet_v6->iallreduce; - ncclCollNet_v6_as_v8.iallgather = nullptr; - ncclCollNet_v6_as_v8.ireducescatter = nullptr; - ncclCollNet_v6_as_v8.iflush = ncclCollNet_v6->iflush; - ncclCollNet_v6_as_v8.test = ncclCollNet_v6->test; - ncclCollNet_v6_as_v8.closeColl = ncclCollNet_v6->closeColl; - ncclCollNet_v6_as_v8.closeListen = ncclCollNet_v6->closeListen; + ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; + ncclCollNet_v6_as_v9.devices = ncclCollNet_v6->devices; + ncclCollNet_v6_as_v9.getProperties = ncclCollNet_v6_as_v9_getProperties; + ncclCollNet_v6_as_v9.listen = ncclCollNet_v6->listen; + ncclCollNet_v6_as_v9.connect = ncclCollNet_v6->connect; + ncclCollNet_v6_as_v9.reduceSupport = ncclCollNet_v6->reduceSupport; + ncclCollNet_v6_as_v9.regMr = ncclCollNet_v6_as_v9_regMr; + ncclCollNet_v6_as_v9.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; + ncclCollNet_v6_as_v9.deregMr = ncclCollNet_v6->deregMr; + ncclCollNet_v6_as_v9.iallreduce = ncclCollNet_v6_as_v9_iallreduce; + ncclCollNet_v6_as_v9.iallgather = nullptr; + ncclCollNet_v6_as_v9.ireducescatter = nullptr; + ncclCollNet_v6_as_v9.iflush = ncclCollNet_v6->iflush; + ncclCollNet_v6_as_v9.test = ncclCollNet_v6->test; + 
ncclCollNet_v6_as_v9.closeColl = ncclCollNet_v6->closeColl; + ncclCollNet_v6_as_v9.closeListen = ncclCollNet_v6->closeListen; return ncclSuccess; } -static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetProperties_v8_t* props) { +static ncclResult_t ncclCollNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); if (ans != ncclSuccess) return ans; @@ -290,6 +473,7 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie props->guid = p7.guid; props->ptrSupport = p7.ptrSupport; props->regIsGlobal = 0; + props->forceFlush = 0; props->speed = p7.speed; props->port = p7.port; props->maxComms = p7.maxComms; @@ -297,47 +481,150 @@ static ncclResult_t ncclCollNet_v7_as_v8_getProperties(int dev, ncclNetPropertie props->latency = p7.latency; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; return ncclSuccess; } -static ncclResult_t ncclCollNet_v7_as_v8_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { +static ncclResult_t ncclCollNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); } +static ncclResult_t ncclCollNet_v7_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + // We use a wrapper around the v7 init to copy over the struct contents // post-init since they may not be initialized before hand. 
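Each of the vN_as_v9 shims in this patch applies the same size-narrowing rule: the v9 entry points take size_t counts while the v5-v8 callbacks still take int, so the wrapper rejects anything the legacy parameter cannot represent and only then casts. A minimal sketch of that guard, assuming the NCCL result codes and MAX_NET_SIZE from the surrounding headers and a hypothetical legacy callback oldIallreduce:

    // Sketch only: refuse the call rather than silently truncate the size_t count.
    static ncclResult_t iallreduceShim(void* collComm, void* sendData, void* recvData, size_t count,
                                       ncclDataType_t dataType, ncclRedOp_t redOp,
                                       void* sendMhandle, void* recvMhandle, void** request) {
      if (count > MAX_NET_SIZE) return ncclInternalError;   // would overflow the legacy int parameter
      return oldIallreduce(collComm, sendData, recvData, (int)count, dataType, redOp,
                           sendMhandle, recvMhandle, request);  // oldIallreduce is hypothetical
    }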
-static ncclResult_t ncclCollNet_v7_as_v8_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_v7_as_v9_init(ncclDebugLogger_t logfn) { NCCLCHECK(ncclCollNet_v7->init(logfn)); - ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name; - ncclCollNet_v7_as_v8.devices = ncclCollNet_v7->devices; - ncclCollNet_v7_as_v8.getProperties = ncclCollNet_v7_as_v8_getProperties; - ncclCollNet_v7_as_v8.listen = ncclCollNet_v7->listen; - ncclCollNet_v7_as_v8.connect = ncclCollNet_v7->connect; - ncclCollNet_v7_as_v8.reduceSupport = ncclCollNet_v7->reduceSupport; - ncclCollNet_v7_as_v8.regMr = ncclCollNet_v7_as_v8_regMr; - ncclCollNet_v7_as_v8.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; - ncclCollNet_v7_as_v8.deregMr = ncclCollNet_v7->deregMr; - ncclCollNet_v7_as_v8.iallreduce = ncclCollNet_v7->iallreduce; - ncclCollNet_v7_as_v8.iallgather = nullptr; - ncclCollNet_v7_as_v8.ireducescatter = nullptr; - ncclCollNet_v7_as_v8.iflush = ncclCollNet_v7->iflush; - ncclCollNet_v7_as_v8.test = ncclCollNet_v7->test; - ncclCollNet_v7_as_v8.closeColl = ncclCollNet_v7->closeColl; - ncclCollNet_v7_as_v8.closeListen = ncclCollNet_v7->closeListen; + ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; + ncclCollNet_v7_as_v9.devices = ncclCollNet_v7->devices; + ncclCollNet_v7_as_v9.getProperties = ncclCollNet_v7_as_v9_getProperties; + ncclCollNet_v7_as_v9.listen = ncclCollNet_v7->listen; + ncclCollNet_v7_as_v9.connect = ncclCollNet_v7->connect; + ncclCollNet_v7_as_v9.reduceSupport = ncclCollNet_v7->reduceSupport; + ncclCollNet_v7_as_v9.regMr = ncclCollNet_v7_as_v9_regMr; + ncclCollNet_v7_as_v9.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; + ncclCollNet_v7_as_v9.deregMr = ncclCollNet_v7->deregMr; + ncclCollNet_v7_as_v9.iallreduce = ncclCollNet_v7_as_v9_iallreduce; + ncclCollNet_v7_as_v9.iallgather = nullptr; + ncclCollNet_v7_as_v9.ireducescatter = nullptr; + ncclCollNet_v7_as_v9.iflush = ncclCollNet_v7->iflush; + ncclCollNet_v7_as_v9.test = ncclCollNet_v7->test; + ncclCollNet_v7_as_v9.closeColl = ncclCollNet_v7->closeColl; + ncclCollNet_v7_as_v9.closeListen = ncclCollNet_v7->closeListen; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { + ncclNetProperties_v8_t p8; + ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8); + if (ans != ncclSuccess) return ans; + props->name = p8.name; + props->pciPath = p8.pciPath; + props->guid = p8.guid; + props->ptrSupport = p8.ptrSupport; + props->regIsGlobal = p8.regIsGlobal; + props->forceFlush = 0; + props->speed = p8.speed; + props->port = p8.port; + props->maxComms = p8.maxComms; + props->maxRecvs = p8.maxRecvs; + props->latency = p8.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_v8_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_v8_as_v9_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, + size_t 
bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + ncclNetSGE_v8_t recvPartsInt; + if (nRecvParts > 1) return ncclInternalError; + if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + recvPartsInt.mhandle = recvParts->mhandle; + recvPartsInt.address = recvParts->address; + recvPartsInt.size = (int)recvParts->size; + ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, + bytesPerRank, windowOffset, windowBytes, + sendMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_v8_as_v9_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + ncclNetSGE_v8_t sendPartsInt; + if (nSendParts > 1) return ncclInternalError; + if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + sendPartsInt.mhandle = sendParts->mhandle; + sendPartsInt.address = sendParts->address; + sendPartsInt.size = (int)sendParts->size; + ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, + recvData, bytesPerRank, windowOffset, windowBytes, + dataType, redOp, + recvMhandle, request); + return ans; +} + +// We use a wrapper around the v8 init to copy over the struct contents +// post-init since they may not be initialized before hand. +static ncclResult_t ncclCollNet_v8_as_v9_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v8->init(logfn)); + ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; + ncclCollNet_v8_as_v9.devices = ncclCollNet_v8->devices; + ncclCollNet_v8_as_v9.getProperties = ncclCollNet_v8_as_v9_getProperties; + ncclCollNet_v8_as_v9.listen = ncclCollNet_v8->listen; + ncclCollNet_v8_as_v9.connect = ncclCollNet_v8->connect; + ncclCollNet_v8_as_v9.reduceSupport = ncclCollNet_v8->reduceSupport; + ncclCollNet_v8_as_v9.regMr = ncclCollNet_v8->regMr; + ncclCollNet_v8_as_v9.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; + ncclCollNet_v8_as_v9.deregMr = ncclCollNet_v8->deregMr; + ncclCollNet_v8_as_v9.iallreduce = ncclCollNet_v8_as_v9_iallreduce; + ncclCollNet_v8_as_v9.iallgather = ncclCollNet_v8_as_v9_iallgather; + ncclCollNet_v8_as_v9.ireducescatter = ncclCollNet_v8_as_v9_ireducescatter; + ncclCollNet_v8_as_v9.iflush = ncclCollNet_v8->iflush; + ncclCollNet_v8_as_v9.test = ncclCollNet_v8->test; + ncclCollNet_v8_as_v9.closeColl = ncclCollNet_v8->closeColl; + ncclCollNet_v8_as_v9.closeListen = ncclCollNet_v8->closeListen; return ncclSuccess; } static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclNet_t* ncclNets[3] = { nullptr, &ncclNetIb, &ncclNetSocket }; -ncclCollNet_t* ncclCollNets[3] = { nullptr, nullptr, nullptr }; +ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; +ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; enum ncclNetState { ncclNetStateInit = 0, ncclNetStateEnabled = 1, ncclNetStateDisabled = 2 }; -enum ncclNetState ncclNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -enum ncclNetState ncclCollNetStates[3] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; #define MAX_STR_LEN 255 @@ -443,72 +730,93 @@ ncclResult_t 
ncclNetPluginLoad(struct ncclComm* comm) { goto fail; } - ncclNets[0] = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); + ncclNets[0] = (ncclNet_v9_t*)dlsym(netPluginLib, "ncclNetPlugin_v9"); if (ncclNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); - // Try v7 plugin - ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); - if (ncclNet_v7 == nullptr) { - // Try v6 plugin - ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); - if (ncclNet_v6 == nullptr) { - // Try v5 plugin - ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (ncclNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); - goto fail; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); + ncclNet_v8 = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); + if (ncclNet_v8 == nullptr) { + // Try v7 plugin + ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); + if (ncclNet_v7 == nullptr) { + // Try v6 plugin + ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); + if (ncclNet_v6 == nullptr) { + // Try v5 plugin + ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); + if (ncclNet_v5 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); + goto fail; + } else { + ncclNets[0] = &ncclNet_v5_as_v9; + ncclNet_v5_as_v9.init = ncclNet_v5_as_v9_init; + // Set the name right away to allow for NCCL_NET=... to work + ncclNet_v5_as_v9.name = ncclNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); + } } else { - ncclNets[0] = &ncclNet_v5_as_v8; - ncclNet_v5_as_v8.init = ncclNet_v5_as_v8_init; + ncclNets[0] = &ncclNet_v6_as_v9; + ncclNet_v6_as_v9.init = ncclNet_v6_as_v9_init; // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v5_as_v8.name = ncclNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); + ncclNet_v6_as_v9.name = ncclNet_v6->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); } } else { - ncclNets[0] = &ncclNet_v6_as_v8; - ncclNet_v6_as_v8.init = ncclNet_v6_as_v8_init; + ncclNets[0] = &ncclNet_v7_as_v9; + ncclNet_v7_as_v9.init = ncclNet_v7_as_v9_init; // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v6_as_v8.name = ncclNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); + ncclNet_v7_as_v9.name = ncclNet_v7->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); } } else { - ncclNets[0] = &ncclNet_v7_as_v8; - ncclNet_v7_as_v8.init = ncclNet_v7_as_v8_init; + ncclNets[0] = &ncclNet_v8_as_v9; + ncclNet_v8_as_v9.init = ncclNet_v8_as_v9_init; // Set the name right away to allow for NCCL_NET=... 
to work - ncclNet_v7_as_v8.name = ncclNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); + ncclNet_v8_as_v9.name = ncclNet_v8->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNets[0]->name); } + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNets[0]->name); } // Check for CollNet - ncclCollNets[0] = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); + ncclCollNets[0] = (ncclCollNet_v9_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v9"); if (ncclCollNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol."); - ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); - if (ncclCollNet_v7 == nullptr) { - ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); - if (ncclCollNet_v6 == nullptr) { - ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (ncclCollNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). ncclCollNetPlugin symbols v4 and lower are not supported."); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); + ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); + if (ncclCollNet_v8 == nullptr) { + ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); + if (ncclCollNet_v7 == nullptr) { + ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); + if (ncclCollNet_v6 == nullptr) { + ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); + if (ncclCollNet_v5 == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). 
ncclCollNetPlugin symbols v4 and lower are not supported."); + } else { + ncclCollNets[0] = &ncclCollNet_v5_as_v9; + ncclCollNet_v5_as_v9.init = ncclCollNet_v5_as_v9_init; + ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); + } } else { - ncclCollNets[0] = &ncclCollNet_v5_as_v8; - ncclCollNet_v5_as_v8.init = ncclCollNet_v5_as_v8_init; - ncclCollNet_v5_as_v8.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); + ncclCollNets[0] = &ncclCollNet_v6_as_v9; + ncclCollNet_v6_as_v9.init = ncclCollNet_v6_as_v9_init; + ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); } } else { - ncclCollNets[0] = &ncclCollNet_v6_as_v8; - ncclCollNet_v6_as_v8.init = ncclCollNet_v6_as_v8_init; - ncclCollNet_v6_as_v8.name = ncclCollNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); + ncclCollNets[0] = &ncclCollNet_v7_as_v9; + ncclCollNet_v7_as_v9.init = ncclCollNet_v7_as_v9_init; + ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); } } else { - ncclCollNets[0] = &ncclCollNet_v7_as_v8; - ncclCollNet_v7_as_v8.init = ncclCollNet_v7_as_v8_init; - ncclCollNet_v7_as_v8.name = ncclCollNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); + ncclCollNets[0] = &ncclCollNet_v8_as_v9; + ncclCollNet_v8_as_v9.init = ncclCollNet_v8_as_v9_init; + ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNets[0]->name); } + } else { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNets[0]->name); } ++netPluginRefCount; @@ -539,6 +847,8 @@ ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { ncclCollNets[0] = nullptr; netPluginStatus = netPluginLoadReady; comm->netPluginLoaded = 0; + for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) + ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; } pthread_mutex_unlock(&netPluginLock); return ncclSuccess; @@ -561,7 +871,7 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in return ncclInternalError; } default: - WARN("Unknown device code index"); + WARN("Unknown device code index %d \n", type); return ncclInternalError; } @@ -715,8 +1025,9 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { int ncclNetVersion(struct ncclComm* comm) { return - (comm->ncclNet == &ncclNet_v5_as_v8) ? 5 : - (comm->ncclNet == &ncclNet_v6_as_v8) ? 6 : - (comm->ncclNet == &ncclNet_v7_as_v8) ? 7 : - 8; + (comm->ncclNet == &ncclNet_v5_as_v9) ? 5 : + (comm->ncclNet == &ncclNet_v6_as_v9) ? 6 : + (comm->ncclNet == &ncclNet_v7_as_v9) ? 7 : + (comm->ncclNet == &ncclNet_v8_as_v9) ? 
8 : + 9; } diff --git a/src/proxy.cc b/src/proxy.cc index 5e657c0a4..bd8188a37 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -364,7 +364,11 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->channelId = op->channelId; sub->nsteps = op->nsteps; sub->nbytes = op->nbytes; + sub->chunkSize = op->chunkSize; sub->offset = 0; + sub->loopSize = op->loopSize; + sub->loopOffset = op->loopOffset; + sub->isOneRPN = op->isOneRPN; sub->peer = op->peer; sub->reg = op->reg; sub->sendMhandle = op->sendMhandle; @@ -374,8 +378,9 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->eActivationMask = op->eActivationMask; sub->taskEventHandle = op->taskEventHandle; sub->rank = op->rank; - args->pid = op->pid; - args->profilerContext = op->profilerContext; + sub->pid = op->pid; + sub->profilerContext = op->profilerContext; + sub->ringAlgo = op->ringAlgo; args->nsubs = subIndex+1; if (subIndex) { if ((args->sliceSteps != op->sliceSteps) || @@ -404,6 +409,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr args->pattern = op->pattern; args->protocol = op->protocol; args->coll = op->coll; + args->algorithm = op->algorithm; args->specifics = op->specifics; args->state = ncclProxyOpReady; args->progress = op->connection->tcomm->proxyProgress; @@ -485,6 +491,7 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon } if (op->next != -1) __builtin_prefetch(pool->ops+op->next); // Prefetch next free op memcpy(op, proxyOp, sizeof(struct ncclProxyOp)); + if (proxyOp->ringAlgo) proxyOp->ringAlgo->incRefCount(); op->next = -1; op->connection = proxyConn->connection; if (proxyOps->nextOps == -1) { @@ -601,13 +608,15 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool } break; case ncclPatternPatUp: { // Run full algorithm to count the number of steps for each peer. 
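The two PAT cases rewritten below swap plain NCCLCHECK calls for NCCLCHECKGOTO plus a single cleanup label, so the nstepsSend/nstepsRecv scratch arrays are freed even when a SaveProxy call fails partway through. The shape of that error handling, reduced to a sketch (countAndSave stands in for the per-peer loop and is hypothetical; the NCCL macros and helpers are assumed from the surrounding headers):

    static ncclResult_t patCountSteps(struct ncclComm* comm, int nranks) {
      ncclResult_t result = ncclSuccess;
      int* nstepsSend = NULL;
      int* nstepsRecv = NULL;
      NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit);
      NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit);
      NCCLCHECKGOTO(countAndSave(comm, nstepsSend, nstepsRecv), result, exit);  // hypothetical
    exit:
      free(nstepsSend);   // free(NULL) is a no-op, so cleanup is safe on every path
      free(nstepsRecv);
      NCCLCHECK(result);  // surface the first recorded failure, if any
      return ncclSuccess;
    }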
- int *nstepsSend, *nstepsRecv; - const int rank = comm->rank, nranks = comm->nRanks; - NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks))); - NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks))); + ncclResult_t result = ncclSuccess; const ssize_t size = op->nbytes/comm->nRanks; - PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); int last = 0; + int *nstepsSend = NULL, *nstepsRecv = NULL; + const int rank = comm->rank, nranks = comm->nRanks; + PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up); + NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up); + while (last == 0) { int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; size_t inpIx, outIx; @@ -619,24 +628,30 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool if (nstepsSend[i]) { int sendPeer = (rank + (1<nsteps = nstepsSend[i]; - NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire)); + NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), result, exit_pat_up); } if (nstepsRecv[i]) { int recvPeer = (rank - (1<nsteps = nstepsRecv[i]; - NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire)); + NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), result, exit_pat_up); } } + exit_pat_up: + free(nstepsSend); + free(nstepsRecv); + NCCLCHECK(result); } break; case ncclPatternPatDown: { // Run full algorithm to count the number of steps for each peer. - int *nstepsSend, *nstepsRecv; - const int rank = comm->rank, nranks = comm->nRanks; - NCCLCHECK(ncclCalloc(&nstepsSend, log2Up(nranks))); - NCCLCHECK(ncclCalloc(&nstepsRecv, log2Up(nranks))); + ncclResult_t result = ncclSuccess; const ssize_t size = op->nbytes/comm->nRanks; - PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); int last = 0; + int *nstepsSend = NULL, *nstepsRecv = NULL; + const int rank = comm->rank, nranks = comm->nRanks; + PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down); + NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down); + while (last == 0) { int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; size_t inpIx, outIx; @@ -648,14 +663,18 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool if (nstepsSend[i]) { int sendPeer = (rank - (1<nsteps = nstepsSend[i]; - NCCLCHECK(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire)); + NCCLCHECKGOTO(SaveProxy(comm, channel, proxySend, sendPeer, op, 0, justInquire), result, exit_pat_down); } if (nstepsRecv[i]) { int recvPeer = (rank + (1<nsteps = nstepsRecv[i]; - NCCLCHECK(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire)); + NCCLCHECKGOTO(SaveProxy(comm, channel, proxyRecv, recvPeer, op, 0, justInquire), result, exit_pat_down); } } + exit_pat_down: + free(nstepsSend); + free(nstepsRecv); + NCCLCHECK(result); } break; case ncclPatternSend: case ncclPatternRecv: { @@ -735,23 +754,17 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int if (state->active == NULL) { pthread_mutex_lock(&pool->mutex); - while (pool->nextOps == -1 && !state->stop) { + if (pool->nextOps == -1 && 
!state->stop) { ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlSleep); pthread_cond_wait(&pool->cond, &pool->mutex); ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlWakeup); ncclProfilerStopProxyCtrlEvent(eHandle); } - if (state->stop) { // We might have been woken up to stop. - pthread_mutex_unlock(&pool->mutex); - return ncclSuccess; - } } - state->nextOps = pool->nextOps; pool->nextOps = pool->nextOpsEnd = -1; pthread_mutex_unlock(&pool->mutex); - if (state->nextOps == -1) return ncclInternalError; process_nextops: ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); @@ -889,7 +902,7 @@ void* ncclProxyProgress(void *proxyState_) { * ncclParamProgressAppendOpFreq(). If they are equal, we will append proxy ops. This will decrease the * frequency of calling ncclProxyGetPostedOps() and reduce the perf impact. */ int proxyOpAppendCounter = 0; - while (state->stop == 0 || (state->stop == 1 && state->active)) { + do { int idle = 1; ncclResult_t ret = progressOps(proxyState, state, state->active, &idle); if (ret != ncclSuccess) { @@ -902,12 +915,11 @@ void* ncclProxyProgress(void *proxyState_) { if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle); if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive); ncclProfilerStopProxyCtrlEvent(eHandle); - if (idle || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) { + if (idle || !state->active || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) { int added = 0; proxyOpAppendCounter = 0; TIME_START(3); - if (state->stop == 0) - ret = ncclProxyGetPostedOps(proxyState, &added); + ret = ncclProxyGetPostedOps(proxyState, &added); if (added) { TIME_STOP(3); } else { TIME_CANCEL(3); } if (ret != ncclSuccess) { __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE); @@ -918,7 +930,7 @@ void* ncclProxyProgress(void *proxyState_) { } } lastIdle = idle; - } + } while (state->stop == 0 || (state->stop == 1 && state->active)); return NULL; } @@ -1090,7 +1102,7 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in strncpy(poolPath+sizeof("/dev/shm/nccl-")-1, resp.devShmPath, sizeof("XXXXXX")-1); struct ncclProxyOps* proxyOps = sharedProxyState->proxyOps + proxyConn->tpLocalRank; if (proxyOps->pool == NULL) { - NCCLCHECK(ncclShmOpen(poolPath, sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle)); + NCCLCHECK(ncclShmOpen(poolPath, sizeof(poolPath), sizeof(struct ncclProxyOpsPool), (void**)(&proxyOps->pool), NULL, -1, &proxyOps->handle)); proxyOps->nextOps = proxyOps->nextOpsEnd = proxyOps->freeOp = -1; } } @@ -1293,7 +1305,7 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; shmPath[0] = '\0'; - NCCLCHECK(ncclShmOpen(shmPath, size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle)); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, (void**)&pool, NULL, proxyState->tpLocalnRanks, &state->handle)); // Init pool pool->nextOps = -1; @@ -1372,7 +1384,7 @@ static ncclResult_t proxyQueryFd(struct ncclProxyState* proxyState, int rank, vo ncclResult_t ret = ncclSuccess; NCCLCHECKGOTO(ncclIpcSocketInit(&ipcSock, proxyState->tpRank, hash^1, proxyState->abortFlag), ret, exit); - NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), 
rmtFd, rank, hash), ret, exit); + NCCLCHECKGOTO(ncclIpcSocketSendMsg(&ipcSock, &rmtFd, sizeof(int), -1, rank, hash), ret, exit); exit: NCCLCHECK(ncclIpcSocketClose(&ipcSock)); return ncclSuccess; @@ -1603,7 +1615,7 @@ void* ncclProxyService(void* _args) { if (pollfds[s].fd == -1) continue; // Progress all ops for this ncclProxyLocalPeer - if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode) closeConn = 1; + if (stop == PROXY_ABORT && ncclCuMemEnable() && ncclCuMemHostEnable() && !proxyState->directMode && __atomic_load_n(&proxyState->stop, __ATOMIC_ACQUIRE)) closeConn = 1; ncclProxyAsyncOp* op = peer->asyncOps; while (op != nullptr) { ncclProxyAsyncOp* opnext = op->next; /* in case op is freed in proxyProgressAsync */ @@ -1692,11 +1704,17 @@ static ncclResult_t proxyUDSRecvReq(struct ncclProxyState* proxyState, int reqFd NCCLCHECK(ncclIpcSocketRecvMsg(&proxyState->ipcSock, &hdr, sizeof(hdr), &rmtFd)); if (hdr.type == ncclProxyMsgGetFd) { - // cuMem API support + // cuMem API support for non-UB case, and rmtFd is not used since UDS proxy thread need to export + // fd from handle and send it back to the main thread to import the buffer. We just need to close + // this dummy rmtFd. uint64_t handle = *(uint64_t*)hdr.data; INFO(NCCL_PROXY, "proxyUDSRecvReq::ncclProxyMsgGetFd rank %d opId %p handle=0x%lx", hdr.rank, hdr.opId, handle); + close(rmtFd); return proxyGetFd(proxyState, hdr.rank, hdr.opId, handle); } else if (hdr.type == ncclProxyMsgQueryFd) { + // remote main thread registers buffer into this rank, it querys rmtFd of this rank through UDS + // and the rmtFd is returned unchanged back to remote main thread which will use rmtFd to call into + // proxy service thread for buffer registration. INFO(NCCL_PROXY, "proxyUDSRecvReq::proxyQueryFd rank %d opId %p rmtFd %d", hdr.rank, hdr.opId, rmtFd); return proxyQueryFd(proxyState, hdr.rank, hdr.opId, rmtFd); } @@ -1743,7 +1761,7 @@ void* ncclProxyServiceUDS(void* _args) { } } - ncclIpcSocketClose(&proxyState->ipcSock); + (void)ncclIpcSocketClose(&proxyState->ipcSock); INFO(NCCL_PROXY, "[Proxy Service UDS] exit: stop %d abortFlag %d", proxyState->stop, *proxyState->abortFlag); return NULL; } @@ -1800,15 +1818,10 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { struct ncclProxyState* sharedProxyState = comm->proxyState; if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { - if (comm->proxyState->threadUDS) { - // UDS support - __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE); - } - if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) { struct ncclSocket sock; int type = ncclProxyMsgStop; - ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag); + NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); if (ncclSocketConnect(&sock) == ncclSuccess) { (void)ncclSocketSend(&sock, &type, sizeof(int)); } @@ -1835,6 +1848,8 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { } } } + // Now we notify proxy service and UDS thread to exit. 
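The release store on the next line pairs with the acquire load that ncclProxyService now performs before closing connections on abort (the __atomic_load_n check added earlier in this file). A reduced sketch of that pairing, using the same GCC builtins; the struct and field names here are illustrative only:

    struct svcState { int stop; /* plus whatever the service thread must observe */ };

    // Shutdown side: finish all bookkeeping first, then publish the flag with RELEASE
    // so those earlier writes become visible to any thread that sees stop == 1.
    static void requestStop(struct svcState* s) {
      __atomic_store_n(&s->stop, 1, __ATOMIC_RELEASE);
    }

    // Service side: an ACQUIRE load; once it returns nonzero, everything written
    // before the matching store is guaranteed to be visible here.
    static int shouldStop(struct svcState* s) {
      return __atomic_load_n(&s->stop, __ATOMIC_ACQUIRE);
    }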
+ __atomic_store_n(&comm->proxyState->stop, 1, __ATOMIC_RELEASE); } } diff --git a/src/ras/client.cc b/src/ras/client.cc new file mode 100644 index 000000000..8061cef4e --- /dev/null +++ b/src/ras/client.cc @@ -0,0 +1,318 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nccl.h" +#define NCCL_RAS_CLIENT // Only pull client-specific definitions from the header file below. +#include "ras_internal.h" + +#define STR2(v) #v +#define STR(v) STR2(v) + +// Local timeout increment compared to the '-t' argument, in seconds. +#define TIMEOUT_INCREMENT 1 + +static const char* hostName = "localhost"; +static const char* port = STR(NCCL_RAS_CLIENT_PORT); +static int timeout = -1; +static bool verbose = false; +static int sock = -1; + +static void printUsage(const char* argv0) { + fprintf(stderr, + "Usage: %s [OPTION]...\n" + "Query the state of a running NCCL job.\n" + "\nOptions:\n" + " -h, --host=HOST Host name or IP address of the RAS client socket of the\n" + " NCCL job to connect to (localhost by default)\n" + " -p, --port=PORT TCP port of the RAS client socket of the NCCL job\n" + " (" STR(NCCL_RAS_CLIENT_PORT) " by default)\n" + " -t, --timeout=SECS Maximum time for the local NCCL process to wait for\n" + " responses from other NCCL processes\n" + " (" STR(RAS_COLLECTIVE_LEG_TIMEOUT_SEC) " secs by default; 0 disables the timeout)\n" + " -v, --verbose Increase the verbosity level of the RAS output\n" + " --help Print this help and exit\n" + " --version Print the version number and exit\n", argv0); +} + +static void parseArgs(int argc, char** argv) { + int c; + int optIdx = 0; + struct option longOpts[] = { + {"host", required_argument, NULL, 'h'}, + {"port", required_argument, NULL, 'p'}, + {"timeout", required_argument, NULL, 't'}, + {"verbose", no_argument, NULL, 'v'}, + {"help", no_argument, NULL, 'e'}, + {"version", no_argument, NULL, 'r'}, + {0} + }; + + while ((c = getopt_long(argc, argv, "h:p:t:v", longOpts, &optIdx)) != -1) { + switch (c) { + case 'h': + hostName = optarg; + break; + case 'p': + port = optarg; + break; + case 't': { + char* endPtr = nullptr; + timeout = strtol(optarg, &endPtr, 10); + if (timeout < 0 || !endPtr || *endPtr != '\0') { + fprintf(stderr, "Invalid timeout: %s\n", optarg); + exit(1); + } + break; + } + case 'v': + verbose = true; + break; + case 'e': + printUsage(argv[0]); + exit(0); + case 'r': + fprintf(stderr, "NCCL RAS client version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." + STR(NCCL_PATCH) NCCL_SUFFIX "\n"); + exit(0); + default: + printUsage(argv[0]); + exit(1); + } + } +} + +static ssize_t socketWrite(int fd, const void* buf, size_t count) { + size_t done = 0; + do { + ssize_t ret; + ret = write(fd, ((const char*)buf)+done, count-done); + if (ret == -1) { + if (errno != EINTR) + return -1; + continue; + } + done += ret; + } while (done < count); + + return done; +} + +// Reads a message from RAS. Assumes that the message ends with '\n' (will continue reading until the terminating +// newline, unless false is passed as untilNewLine). +// Terminates the buffer with '\0'. Returns the number of bytes read (excluding the added terminating '\0'). 
+static ssize_t rasRead(int fd, void* buf, size_t count, bool untilNewline = true) { + char* bufChar = (char*)buf; + size_t done = 0; + do { + ssize_t ret; + ret = read(fd, bufChar+done, count-1-done); + if (ret == -1) { + if (errno != EINTR) + return -1; + continue; + } + if (ret == 0) + break; // EOF + done += ret; + } while (untilNewline && (done == 0 || bufChar[done-1] != '\n')); + bufChar[done] = '\0'; + + return done; +} + +static int connectToNCCL() { + struct addrinfo hints = {0}; + struct addrinfo* addrInfo = nullptr; + int ret; + char msgBuf[1024]; + int bytes; + struct timeval tv = {TIMEOUT_INCREMENT, 0}; + +retry: + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + if ((ret = getaddrinfo(hostName, port, &hints, &addrInfo)) != 0) { + fprintf(stderr, "Resolving %s:%s: %s\n", hostName, port, gai_strerror(ret)); + goto fail; + } + for (struct addrinfo* ai = addrInfo; ai; ai = ai->ai_next) { + char hostBuf[NI_MAXHOST], portBuf[NI_MAXSERV]; + int err; + sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); + if (sock == -1) { + perror("socket"); + continue; + } + // Initially start with a small, 1-sec timeout to quickly eliminate non-responsive processes... + if (timeout && (setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof tv) != 0 || + setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv) != 0)) { + perror("setsockopt"); + // Non-fatal; fall through. + } + if (connect(sock, ai->ai_addr, ai->ai_addrlen) == 0) + break; + err = errno; + if (getnameinfo(ai->ai_addr, ai->ai_addrlen, hostBuf, sizeof(hostBuf), portBuf, sizeof(portBuf), + NI_NUMERICHOST | NI_NUMERICSERV) != 0) { + strcpy(hostBuf, hostName); + strcpy(portBuf, port); + } + fprintf(stderr, "Connecting to %s:%s: %s\n", hostBuf, portBuf, strerror(err)); + close(sock); + sock = -1; + } + freeaddrinfo(addrInfo); + addrInfo = nullptr; + + if (sock == -1) { + fprintf(stderr, "Failed to connect to the NCCL RAS service!\n" + "Please make sure that the NCCL job has the RAS service enabled and that\n" + "%s.\n", + (strcmp(hostName, "localhost") || strcmp(port, STR(NCCL_RAS_CLIENT_PORT)) ? + "the host/port arguments are correct and match NCCL_RAS_ADDR" : + "the RAS client was started on a node where the NCCL job is running")); + goto fail; + } + + // Exchange the RAS client handshake. 
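The handshake exchanged below, together with rasClientEventLoop on the NCCL side later in this patch, is a plain newline-terminated text protocol. A typical session looks roughly like the following ("1" is only illustrative; the real number is whatever NCCL_RAS_CLIENT_PROTOCOL expands to):

    client:  CLIENT PROTOCOL 1
    server:  SERVER PROTOCOL 1
    client:  TIMEOUT 30              (optional; sent only when -t/--timeout was given)
    server:  OK
    client:  STATUS                  (or VERBOSE STATUS)
    server:  <the status report>     (streamed until the server closes the connection; EOF ends the report)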
+ strcpy(msgBuf, "CLIENT PROTOCOL " STR(NCCL_RAS_CLIENT_PROTOCOL) "\n"); + if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + goto timeout; + } + perror("write to socket"); + goto fail; + } + bytes = rasRead(sock, msgBuf, sizeof(msgBuf)); + if (bytes < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + goto timeout; + } + perror("read socket"); + goto fail; + } + if (bytes == 0) { + fprintf(stderr, "NCCL unexpectedly closed the connection\n"); + goto fail; + } + if (strncasecmp(msgBuf, "SERVER PROTOCOL ", strlen("SERVER PROTOCOL "))) { + fprintf(stderr, "Unexpected response from NCCL: %s\n", msgBuf); + goto fail; + } + if (strtol(msgBuf+strlen("SERVER PROTOCOL "), nullptr, 10) != NCCL_RAS_CLIENT_PROTOCOL) { + fprintf(stderr, "NCCL RAS protocol version mismatch (NCCL: %s; RAS client: %d)!\n" + "Will try to continue in spite of that...\n", msgBuf+strlen("SERVER PROTOCOL "), NCCL_RAS_CLIENT_PROTOCOL); + } + + if (timeout >= 0) { + snprintf(msgBuf, sizeof(msgBuf), "TIMEOUT %d\n", timeout); + if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + goto timeout; + } + perror("write to socket"); + goto fail; + } + bytes = rasRead(sock, msgBuf, sizeof(msgBuf)); + if (bytes < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + goto timeout; + } + perror("read socket"); + goto fail; + } + if (bytes == 0) { + fprintf(stderr, "NCCL unexpectedly closed the connection\n"); + goto fail; + } + if (strcasecmp(msgBuf, "OK\n")) { + fprintf(stderr, "Unexpected response from NCCL: %s\n", msgBuf); + goto fail; + } + } + if (timeout) { + // Increase the socket timeout to accommodate NCCL timeout. + tv.tv_sec += (timeout > 0 ? timeout : RAS_COLLECTIVE_LEG_TIMEOUT_SEC) + RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC; + if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv) != 0) { + perror("setsockopt"); + // Non-fatal; fall through. + } + } + + return 0; +fail: + if (addrInfo) + freeaddrinfo(addrInfo); + if (sock != -1) + (void)close(sock); + return 1; +timeout: + fprintf(stderr, "Connection timed out; retrying...\n"); + (void)close(sock); + goto retry; +} + +int getNCCLStatus() { + char msgBuf[4096]; + int bytes; + snprintf(msgBuf, sizeof(msgBuf), "%sSTATUS\n", (verbose ? 
"VERBOSE " : "")); + if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + fprintf(stderr, "Connection timed out\n"); + else + perror("write to socket"); + return 1; + } + for (;;) { + bytes = rasRead(sock, msgBuf, sizeof(msgBuf), /*untileNewLine*/false); + if (bytes < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) + fprintf(stderr, "Connection timed out\n"); + else + perror("read socket"); + return 1; + } + if (bytes == 0) // EOF + break; + if (fwrite(msgBuf, 1, bytes, stdout) != bytes) { + fprintf(stderr, "fwrite to stdout failed!\n"); + return 1; + } + if (fflush(stdout) != 0) { + perror("fflush stdout"); + return 1; + } + } + return 0; +} + +int main(int argc, char** argv) { + parseArgs(argc, argv); + + if (connectToNCCL()) + return 1; + + if (getNCCLStatus()) { + (void)close(sock); + return 1; + } + + if (close(sock) == -1) { + perror("close socket"); + return 1; + } + return 0; +} diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc new file mode 100644 index 000000000..414a1ed94 --- /dev/null +++ b/src/ras/client_support.cc @@ -0,0 +1,1755 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#define NDEBUG // Comment out duriyng development only! +#include +#include +#include + +#include "alloc.h" +#include "checks.h" +#include "comm.h" +#include "nccl.h" +#include "utils.h" +#include "ras_internal.h" + +// Outlier count above which we don't print individual details about each of them. +#define RAS_CLIENT_DETAIL_THRESHOLD 10 +// Fraction of the count of the total above which we don't consider another set to be an outlier. +#define RAS_CLIENT_OUTLIER_FRACTION 0.25 +// Fraction of the count of the total below which a set is considered to be an outlier. +#define RAS_CLIENT_VERBOSE_OUTLIER_FRACTION 0.5 + +#define STR2(v) #v +#define STR(v) STR2(v) + +// The RAS client listening socket of this RAS thread (normally port 28028). +int rasClientListeningSocket = -1; + +// Auxiliary structure used when processing the results. Helps with statistics gathering and sorting. +struct rasValCount { + uint64_t value; // The observed value. + int count; // The number of occurences of this value in the results. + int firstIdx; // The index of the first occurence of this value in the results. +}; + +// Used in rasAuxComm below. The values are bitmasks so that they can be combined. +typedef enum { + RAS_ACS_UNKNOWN = 1, // Set if a peer did not provide info about a given communicator. + RAS_ACS_INIT = 2, + RAS_ACS_RUNNING = 4, + RAS_ACS_FINALIZE = 8, + RAS_ACS_ABORT = 16 +} rasACStatus; + +// Used in rasAuxComm below. The values are bitmasks so that they can be combined (with the exception of RAS_ACE_OK). +typedef enum { + RAS_ACE_OK = 0, + RAS_ACE_MISMATCH = 1, + RAS_ACE_ERROR = 2, + RAS_ACE_INCOMPLETE = 4 +} rasACError; + +// Auxiliary structure used when processing the results. Helps with sorting and includes additional statistics +// on the number of peers and nodes for a communicator. +struct rasAuxComm { + struct rasCollComms::comm* comm; + int nPeers; + int nNodes; + int ranksPerNodeMin; + int ranksPerNodeMax; + unsigned int status; // Bitmask of rasACStatus values. + unsigned int errors; // Bitmask of rasACError values. 
+ uint64_t firstCollOpCount; // collOpCount of the first rank, to compare against. +}; + +// Connected RAS clients. +struct rasClient* rasClients; +int nRasClients; + +// Minimum byte count to increment the output buffer size by if it's too small. +#define RAS_OUT_INCREMENT 4096 + +// Internal buffer for storing the formatted results. +static char* rasOutBuffer = nullptr; +static int nRasOutBuffer = 0; // Does _not_ include the terminating '\0' (which _is_ present in the buffer). +static int rasOutBufferSize = 0; + +// We use them all over the place; no point in wasting the stack... +static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS_CLIENT_DETAIL_THRESHOLD) rank numbers + // or for printing the local GPU devices, which can't be more than 64 (NCCL_MAX_LOCAL_RANKS) + // small numbers (times two if the NVML mask is different than the CUDA mask). + // Still, 1024 should normally be plenty (verbose output may make things more difficult, + // but we do check for overflows, so it will just be trimmed). + +static ncclResult_t getNewClientEntry(struct rasClient** pClient); +static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen); +static void rasClientTerminate(struct rasClient* client); + +static ncclResult_t rasClientRun(struct rasClient* client); +static ncclResult_t rasClientRunInit(struct rasClient* client); +static ncclResult_t rasClientRunConns(struct rasClient* client); +static ncclResult_t rasClientRunComms(struct rasClient* client); +static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm, + const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync = false); + +static void rasOutAppend(const char* format, ...) __attribute__ ((format(printf, 1, 2))); +static void rasOutExtract(char* buffer); +static int rasOutLength(); +static void rasOutReset(); + +static int rasPeersNGpuCompare(const void* e1, const void* e2); +static int rasPeersNProcsCompare(const void* e1, const void* e2); +static int rasPeersHostPidCompare(const void* e1, const void* e2); +static int ncclSocketsHostCompare(const void* p1, const void* p2); +static int rasValCountsCompareRev(const void* p1, const void* p2); +static int rasAuxCommsCompareRev(const void* p1, const void* p2); +static int rasCommRanksPeerCompare(const void* p1, const void* p2); +static int rasCommRanksCollOpCompare(const void* p1, const void* p2); + +static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size); +static const char* ncclErrorToString(ncclResult_t err); +static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size); +static bool rasCountIsOutlier(int count, bool verbose, int totalCount = -1); + + +/////////////////////////////////// +// General rasClients functions. // +/////////////////////////////////// + +// Creates a listening socket for clients to connect to. 
+ncclResult_t rasClientInitSocket() { + ncclResult_t ret = ncclSuccess; + const char* clientAddr = "localhost:" STR(NCCL_RAS_CLIENT_PORT); + union ncclSocketAddress addr; + const int opt = 1; + if (const char* env = ncclGetEnv("NCCL_RAS_ADDR")) + clientAddr = env; + NCCLCHECKGOTO(ncclSocketGetAddrFromString(&addr, clientAddr), ret, fail); + SYSCHECKGOTO(rasClientListeningSocket = socket(addr.sa.sa_family, SOCK_STREAM, 0), "socket", ret, fail); + SYSCHECKGOTO(setsockopt(rasClientListeningSocket, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt", ret, fail); +#if defined(SO_REUSEPORT) + SYSCHECKGOTO(setsockopt(rasClientListeningSocket, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)), + "setsockopt", ret, fail); +#endif + SYSCHECKGOTO(bind(rasClientListeningSocket, &addr.sa, (addr.sa.sa_family == AF_INET ? sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6))), "bind", ret, fail); + SYSCHECKGOTO(listen(rasClientListeningSocket, 16384), "listen", ret, fail); + INFO(NCCL_INIT|NCCL_RAS, "RAS client listening socket at %s", ncclSocketToString(&addr, rasLine)); +exit: + return ret; +fail: + INFO(NCCL_INIT|NCCL_RAS, "RAS failed to establish a client listening socket at %s", clientAddr); + if (rasClientListeningSocket != -1) { + (void)close(rasClientListeningSocket); + rasClientListeningSocket = -1; + } + goto exit; +} + +// Accepts a new RAS client connection. The acceptance process may need to continue in the main event loop. +ncclResult_t rasClientAcceptNewSocket() { + ncclResult_t ret = ncclSuccess; + struct rasClient* client = nullptr; + union ncclSocketAddress addr; + socklen_t addrlen = sizeof(addr); + int flags; + + NCCLCHECKGOTO(getNewClientEntry(&client), ret, fail); + + SYSCHECKGOTO(client->sock = accept(rasClientListeningSocket, (struct sockaddr*)&addr, &addrlen), "accept", ret, fail); + + SYSCHECKGOTO(flags = fcntl(client->sock, F_GETFL), "fcntl", ret, fail); + SYSCHECKGOTO(fcntl(client->sock, F_SETFL, flags | O_NONBLOCK), "fcntl", ret, fail); + + NCCLCHECKGOTO(rasGetNewPollEntry(&client->pfd), ret, fail); + rasPfds[client->pfd].fd = client->sock; + rasPfds[client->pfd].events = POLLIN; + client->status = RAS_CLIENT_CONNECTED; +exit: + return ret; +fail: + if (client && client->sock != -1) + (void)close(client->sock); + goto exit; +} + +// Returns the index of the first available entry in the rasClients array, enlarging the array if necessary. +static ncclResult_t getNewClientEntry(struct rasClient** pClient) { + struct rasClient* client; + int i; + for (i = 0; i < nRasClients; i++) + if (rasClients[i].status == RAS_CLIENT_CLOSED) + break; + if (i == nRasClients) { + NCCLCHECK(ncclRealloc(&rasClients, nRasClients, nRasClients+RAS_INCREMENT)); + nRasClients += RAS_INCREMENT; + } + + client = rasClients+i; + memset(client, '\0', sizeof(*client)); + client->sock = client->pfd = -1; + ncclIntruQueueConstruct(&client->sendQ); + client->timeout = RAS_COLLECTIVE_LEG_TIMEOUT; + client->collIdx = -1; + + *pClient = client; + return ncclSuccess; +} + +// Allocates a message of the desired length for sending. +// Behind the scenes uses rasMsgAlloc. +// Must use rasClientFreeMsg to free. +static ncclResult_t rasClientAllocMsg(char** msg, size_t msgLen) { + return rasMsgAlloc((struct rasMsg**)msg, msgLen); +} + +// To be used only with messages allocated with rasClientAllocMsg, i.e., for messages meant for sending. +static void rasClientFreeMsg(char* msg) { + rasMsgFree((struct rasMsg*)msg); +} + +// Enqueues a message for sending to a RAS client. 
The message *must* have been allocated using rasClientAllocMsg. +static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen) { + // Get to the metadata of this message. + struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg)); + meta->offset = 0; + meta->length = msgLen; + ncclIntruQueueEnqueue(&client->sendQ, meta); + assert(client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED); + rasPfds[client->pfd].events |= POLLOUT; +} + +// Terminates a connection with a RAS client. +static void rasClientTerminate(struct rasClient* client) { + (void)close(client->sock); + client->sock = -1; + client->status = RAS_CLIENT_CLOSED; + rasPfds[client->pfd].fd = -1; + rasPfds[client->pfd].events = rasPfds[client->pfd].revents = 0; + client->pfd = -1; + while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&client->sendQ)) { + free(meta); + } +} + + +////////////////////////////////////////////////////////////////////// +// Functions related to the asynchronous operations of RAS clients. // +////////////////////////////////////////////////////////////////////// + +// Invoked when an asynchronous operation that a client was waiting on completes. Finds the right client and +// reinvokes rasClientRun. +ncclResult_t rasClientResume(struct rasCollective* coll) { + int collIdx = coll-rasCollectives; + int i; + struct rasClient* client = nullptr; + for (i = 0; i < nRasClients; i++) { + client = rasClients+i; + if (client->status != RAS_CLIENT_CLOSED && client->collIdx == collIdx) { + break; + } + } + if (i == nRasClients) { + INFO(NCCL_RAS, "RAS failed to find a matching client!"); + rasCollFree(coll); + goto exit; + } + + NCCLCHECK(rasClientRun(client)); +exit: + return ncclSuccess; +} + +// Handles a ready client FD from the main event loop. +void rasClientEventLoop(int clientIdx, int pollIdx) { + struct rasClient* client = rasClients+clientIdx; + bool closed = false; + + if (client->status == RAS_CLIENT_CONNECTED) { + char* cmd; + char* cmdEnd; + if (rasPfds[pollIdx].revents & POLLIN) { + if (client->recvOffset < sizeof(client->recvBuffer)) { + ssize_t nRecv; + nRecv = recv(client->sock, client->recvBuffer+client->recvOffset, + sizeof(client->recvBuffer) - client->recvOffset, MSG_DONTWAIT); + if (nRecv == 0) { + closed = true; + } else if (nRecv == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + if (errno == ECONNRESET) + INFO(NCCL_RAS, "RAS socket closed by the client on receive; terminating it"); + else + INFO(NCCL_RAS, "RAS unexpected error from recv; terminating the client socket"); + closed = true; + } + } else { // nRecv > 0 + client->recvOffset += nRecv; + } + } else { // client->recvOffset == sizeof(client->recvBuffer) + rasPfds[client->pfd].events &= ~POLLIN; // No room to receive for now. + } + } // if (rasPfds[pollIdx].revents & POLLIN) + if (closed) { + rasClientTerminate(client); + return; + } + cmd = client->recvBuffer; + while ((cmdEnd = (char*)memchr(cmd, '\n', client->recvOffset - (cmd-client->recvBuffer))) != nullptr) { + char* msg; + int msgLen; + *cmdEnd = '\0'; // Replaces '\n'. + if (cmdEnd > cmd && cmdEnd[-1] == '\r') + cmdEnd[-1] = '\0'; // Replaces '\r' (e.g., in case of a telnet connection). + + if (strncasecmp(cmd, "client protocol ", strlen("client protocol ")) == 0) { + // We ignore the protocol version for now; we just send our version back. 
+ snprintf(rasLine, sizeof(rasLine), "SERVER PROTOCOL " STR(NCCL_RAS_CLIENT_PROTOCOL) "\n"); + msgLen = strlen(rasLine); + if (rasClientAllocMsg(&msg, msgLen) != ncclSuccess) { + rasClientTerminate(client); + return; + } + // We don't copy the terminating '\0', hence memcpy rather than strcpy. + memcpy(msg, rasLine, msgLen); + rasClientEnqueueMsg(client, msg, msgLen); + } else if (strncasecmp(cmd, "timeout ", strlen("timeout ")) == 0) { + char* endPtr = nullptr; + int timeout = strtol(cmd+strlen("timeout "), &endPtr, 10); + if (timeout < 0 || !endPtr || *endPtr != '\0') { + snprintf(rasLine, sizeof(rasLine), "ERROR: Invalid timeout value %s\n", cmd+strlen("timeout ")); + } else { + client->timeout = timeout * CLOCK_UNITS_PER_SEC; + strcpy(rasLine, "OK\n"); + } + msgLen = strlen(rasLine); + if (rasClientAllocMsg(&msg, msgLen) != ncclSuccess) { + rasClientTerminate(client); + return; + } + // We don't copy the terminating '\0', hence memcpy rather than strcpy. + memcpy(msg, rasLine, msgLen); + rasClientEnqueueMsg(client, msg, msgLen); + } else if (strcasecmp(cmd, "status") == 0) { + client->status = RAS_CLIENT_INIT; + (void)rasClientRun(client); + } else if (strcasecmp(cmd, "verbose status") == 0) { + client->status = RAS_CLIENT_INIT; + client->verbose = 1; + (void)rasClientRun(client); + } else { + snprintf(rasLine, sizeof(rasLine), "ERROR: Unknown command %s\n", cmd); + msgLen = strlen(rasLine); + if (rasClientAllocMsg(&msg, msgLen) != ncclSuccess) + return; // It should be non-fatal if we don't return a response... + // We don't copy the terminating '\0', hence memcpy rather than strcpy. + memcpy(msg, rasLine, msgLen); + rasClientEnqueueMsg(client, msg, msgLen); + } + + cmd = cmdEnd+1; + } // while newline found + + if (cmd == client->recvBuffer) { + if (client->recvOffset == sizeof(client->recvBuffer)) { + // We didn't find any newlines and the buffer is full. + INFO(NCCL_RAS, "RAS excessively long input line; terminating the client socket"); + rasClientTerminate(client); + return; + } + // Otherwise it's an incomplete command; we need to wait for the rest of it. + } else { // cmd > client->recvBuffer + // Shift whatever remains (if anything) to the beginning of the buffer. + memmove(client->recvBuffer, cmd, client->recvOffset - (cmd-client->recvBuffer)); + client->recvOffset -= cmd-client->recvBuffer; + } + } // if (client->status == RAS_CLIENT_CONNECTED) + + if (rasPfds[pollIdx].revents & POLLOUT) { + struct rasMsgMeta* meta; + while ((meta = ncclIntruQueueHead(&client->sendQ)) != nullptr) { + ssize_t nSend; + nSend = send(client->sock, ((char*)&meta->msg)+meta->offset, meta->length-meta->offset, + MSG_DONTWAIT | MSG_NOSIGNAL); + if (nSend < 1) { + if (nSend == -1 && errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + if (errno == EPIPE) + INFO(NCCL_RAS, "RAS socket closed by the client on send; terminating it"); + else + INFO(NCCL_RAS, "RAS unexpected error from send; terminating the client socket"); + closed = true; + } + break; + } + + meta->offset += nSend; + if (meta->offset < meta->length) + break; + + ncclIntruQueueDequeue(&client->sendQ); + free(meta); + } // while (meta) + + if (closed) { + rasClientTerminate(client); + return; + } + + if (!meta) { + rasPfds[client->pfd].events &= ~POLLOUT; // Nothing more to send for now. 
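The send path just above relies on the per-message offset kept in rasMsgMeta: a short send() simply resumes where it left off on the next POLLOUT, and POLLOUT interest is dropped once the queue drains. Reduced to a sketch (the queue helper is hypothetical; msg carries data, length and offset fields):

    // One drain pass over a send queue on a nonblocking socket.
    while (msg != NULL) {
      ssize_t n = send(fd, msg->data + msg->offset, msg->length - msg->offset,
                       MSG_DONTWAIT | MSG_NOSIGNAL);
      if (n <= 0) {
        // EINTR/EAGAIN/EWOULDBLOCK: retry on the next POLLOUT; anything else: drop the client.
        break;
      }
      msg->offset += n;
      if (msg->offset < msg->length) break;      // partial send; resume later
      msg = dequeueFreeAndPeekNext();            // hypothetical queue helper
    }
    if (msg == NULL) pollEvents &= ~POLLOUT;     // nothing left to send for now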
+ if (client->status == RAS_CLIENT_FINISHED) + rasClientTerminate(client); + } + } // if (rasPfds[pollIdx].revents & POLLOUT) +} + + +////////////////////////////////////////////////////////// +// Functions driving data gathering for the RAS client. // +////////////////////////////////////////////////////////// + +// Main function that drives the whole data gathering process and sends it back to the client. +// There are multiple asynchronous aspects of it (getting the data on connections and on communicators), so the +// function may exit early and needs to be reinvoked when the asynchronous responses arrive or the timeout expires. +// The state tracking the progress of such operations is kept in the rasClient. +static ncclResult_t rasClientRun(struct rasClient* client) { + ncclResult_t ret = ncclSuccess; + + switch (client->status) { + case RAS_CLIENT_INIT: + NCCLCHECKGOTO(rasClientRunInit(client), ret, exit); +#if 0 // Commented out for now to focus the summary status report on the information most relevant to the users. + // To be revisited with future extensions to RAS. + client->status = RAS_CLIENT_CONNS; + if (ret == ncclInProgress) { + ret = ncclSuccess; + break; + } + case RAS_CLIENT_CONNS: + assert(client->collIdx != -1); + NCCLCHECKGOTO(rasClientRunConns(client), ret, exit); +#endif + client->status = RAS_CLIENT_COMMS; + if (ret == ncclInProgress) { + ret = ncclSuccess; + break; + } + case RAS_CLIENT_COMMS: + assert(client->collIdx != -1); + NCCLCHECKGOTO(rasClientRunComms(client), ret, exit); + client->status = RAS_CLIENT_FINISHED; + break; + default: + WARN("Invalid client status %d", client->status); + ret = ncclInternalError; + goto exit; + } +exit: + return ret; +} + +// Sends to the client the initial data that can be obtained locally -- version info, stats on rasPeers, +// dump of rasDeadPeers. Initiates the RAS_COLL_CONNS collective operation. +static ncclResult_t rasClientRunInit(struct rasClient* client) { + ncclResult_t ret = ncclSuccess; + char* msg = nullptr; + int msgLen; + struct rasPeerInfo* peersReSorted = nullptr; + int totalGpus, totalNodes, firstNGpusNode, firstNGpusGlobal, firstNPeersGlobal; + bool consistentNGpusNode, consistentNGpusGlobal, consistentNPeersGlobal; + int firstIdx, nPeers; + struct rasValCount valCounts[NCCL_MAX_LOCAL_RANKS]; + int nValCounts; + static int cudaDriver = -1, cudaRuntime = -1; + + rasOutReset(); + rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX + " compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n"); + if (cudaRuntime == -1) + cudaRuntimeGetVersion(&cudaRuntime); + if (cudaDriver == -1) + cudaDriverGetVersion(&cudaDriver); + rasOutAppend("CUDA runtime version %d, driver version %d\n\n", cudaRuntime, cudaDriver); + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; + + rasOutReset(); + totalGpus = totalNodes = 0; + firstNGpusNode = 0; // #GPUs on the first peer of a node. + firstNGpusGlobal = 0; // #GPUs on peerIdx 0. + consistentNGpusNode = true; // Whether #GPUs/peer is consistent between the peers *on any one node*. + consistentNGpusGlobal = true; // Whether #GPUs/peer is consistent between the peers *on all nodes*. + consistentNPeersGlobal = true; // Whether #peers/node is consistent between all nodes. + nPeers = 0; // #peers on a node. 
+ firstNPeersGlobal = 0; + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + int nGpus = __builtin_popcountll(rasPeers[peerIdx].cudaDevs); + totalGpus += nGpus; + if (peerIdx == 0) { + totalNodes = 1; + nPeers = 1; + firstNGpusGlobal = firstNGpusNode = nGpus; + } else { // peerIdx > 0 + if (nGpus != firstNGpusGlobal) + consistentNGpusGlobal = false; + if (!ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasPeers[peerIdx-1].addr)) { + totalNodes++; + if (firstNPeersGlobal == 0) + firstNPeersGlobal = nPeers; + else if (nPeers != firstNPeersGlobal) + consistentNPeersGlobal = false; + nPeers = 1; + firstNGpusNode = nGpus; + } else { // Same node. + if (nGpus != firstNGpusNode) + consistentNGpusNode = false; + nPeers++; + } // Same node + } // peerIdx > 0 + if (peerIdx == nRasPeers-1) { + if (firstNPeersGlobal == 0) + firstNPeersGlobal = nPeers; + else if (nPeers != firstNPeersGlobal) + consistentNPeersGlobal = false; + } + } // for (peerIdx) + + rasOutAppend("Job summary\n" + "===========\n\n"); + + if (consistentNGpusNode && consistentNGpusGlobal && consistentNPeersGlobal) { + rasOutAppend(" Nodes Processes GPUs Processes GPUs\n" + "(total) per node per process (total) (total)\n" + "%7d" " %9d" " %11d" " %9d" " %7d\n", + totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus); + } else { + // Gather the stats on the number of processes per node. However, that number is not a property of a peer, + // but of a group of peers, so calculating it is more involved. We make a copy of rasPeers and creatively + // misuse it: cudaDevs of each element will be repurposed to store the number of processes on the node. + NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); + memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + + firstIdx = 0; + nPeers = 0; + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + if (peerIdx == 0) { + nPeers = 1; + firstIdx = 0; + } else { // peerIdx > 0 + if (!ncclSocketsSameNode(&peersReSorted[peerIdx].addr, &peersReSorted[peerIdx-1].addr)) { + for (int i = firstIdx; i < peerIdx; i++) { + // Go back and update the number of processes of all the elements of that node. + peersReSorted[i].cudaDevs = nPeers; + } + nPeers = 1; + firstIdx = peerIdx; + } else { + nPeers++; + } + } // peerIdx > 0 + if (peerIdx == nRasPeers-1) { + // Last iteration of the loop. + for (int i = firstIdx; i < nRasPeers; i++) { + peersReSorted[i].cudaDevs = nPeers; + } + } + } // for (peerIdx) + + // Re-sort it now using the number of processes on the node (cudaDevs) as the primary key, host IP as the + // secondary, and process id as the tertiary. + qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNProcsCompare); + + // Calculate the distribution of different numbers of peers per node. + nValCounts = 0; + for (int peerIdx = 0; peerIdx < nRasPeers;) { + if (peerIdx == 0 || peersReSorted[peerIdx].cudaDevs != peersReSorted[peerIdx-1].cudaDevs) { + valCounts[nValCounts].value = peersReSorted[peerIdx].cudaDevs; + valCounts[nValCounts].count = 1; + valCounts[nValCounts].firstIdx = peerIdx; + nValCounts++; + } else { + valCounts[nValCounts-1].count++; + } + // Advance peerIdx to the next node. + peerIdx += peersReSorted[peerIdx].cudaDevs; + } + // valCounts is currently sorted by value (the number of peers per node). Sort it by the count (most frequent + // number of peers first). + qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); + + // Print it out, the most frequent peer counts first. 
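(The (value, count, firstIdx) bucketing used for valCounts above reappears several times in this file: processes per node, GPUs per process, communicator groups, and collective-op counts. A generic sketch of the pattern over an already-sorted array is shown below; the real rasValCount definition is not part of this excerpt, so only the three fields referenced here are assumed.)

    // Run-length pass over a sorted array, as done for valCounts above; the struct is a
    // stand-in with just the fields this file uses.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct ValCount { int64_t value; int count; int firstIdx; };

    static std::vector<ValCount> countRuns(const std::vector<int64_t>& sorted) {
      std::vector<ValCount> out;
      for (int i = 0; i < (int)sorted.size(); i++) {
        if (i == 0 || sorted[i] != sorted[i-1])
          out.push_back({sorted[i], 1, i});   // a new value opens a new bucket
        else
          out.back().count++;                 // the same value extends the current bucket
      }
      return out;
    }

    int main() {
      // E.g., processes-per-node values taken from a peer list sorted by that key.
      std::vector<int64_t> perNode = {7, 8, 8, 8};
      for (const ValCount& vc : countRuns(perNode))
        printf("%d node(s) with %lld process(es), first at index %d\n",
               vc.count, (long long)vc.value, vc.firstIdx);
      return 0;
    }

The real code then re-sorts the resulting buckets with rasValCountsCompareRev so that the most frequent value is reported first and the rarer ones can be flagged as outliers.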
+ if (consistentNGpusNode && consistentNGpusGlobal) { + rasOutAppend(" Nodes Processes GPUs\n" + " per node per process\n"); + for (int i = 0; i < nValCounts; i++) { + struct rasValCount* vc = valCounts+i; + rasOutAppend("%7d %9ld %11d\n", + vc->count, vc->value, firstNGpusGlobal); + } + } else { + rasOutAppend(" Nodes Processes\n" + " per node\n"); + for (int i = 0; i < nValCounts; i++) { + struct rasValCount* vc = valCounts+i; + rasOutAppend("%7d %9ld\n", + vc->count, vc->value); + } + + // We calculate and print the GPUs/process separately. This is required for !consistentNGpusNode and + // it also makes our life easier above for !consistentNGpusGlobal (which could require a larger valCounts). + + // Sort peers by the GPU count, to simplify data extraction. + memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // GPU count is the primary key, host IP is the secondary, and process id is the tertiary. + qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNGpuCompare); + + // Calculate the distribution of different numbers of GPUs per peer. + nValCounts = 0; + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + if (peerIdx == 0 || __builtin_popcountll(peersReSorted[peerIdx].cudaDevs) != + __builtin_popcountll(peersReSorted[peerIdx-1].cudaDevs)) { + valCounts[nValCounts].value = __builtin_popcountll(peersReSorted[peerIdx].cudaDevs); + valCounts[nValCounts].count = 1; + valCounts[nValCounts].firstIdx = peerIdx; + nValCounts++; + } else { + valCounts[nValCounts-1].count++; + } + } + // valCounts is currently sorted by value (number of GPUs per peer). Sort it by the count (most frequent + // GPU counts first). + qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); + + // Print it out, the most frequent GPU counts first. + rasOutAppend("\n" + " Processes GPUs\n" + " per process\n"); + for (int i = 0; i < nValCounts; i++) { + struct rasValCount* vc = valCounts+i; + rasOutAppend(" %9d %11ld\n", + vc->count, vc->value); + } + } + rasOutAppend("\n" + " Nodes Processes GPUs\n" + "(total) (total) (total)\n" + "%7d" " %9d" " %11d\n", + totalNodes, nRasPeers, totalGpus); + + if (consistentNGpusNode && consistentNGpusGlobal) { + // In this simpler case, also print the node outliers. + for (int i = 1; i < nValCounts; i++) { + struct rasValCount* vc = valCounts+i; + // We assume that the most frequent group is correct; for the remaining ones, we try to provide more info, + // provided that they meet our definition of an outlier. + if (rasCountIsOutlier(vc->count, client->verbose, totalNodes)) { + rasOutAppend("\nThe outlier node%s:\n", (vc->count > 1 ? "s" : "")); + // peersReSorted is sorted by the node IP address (not port!) as the secondary key and the pid as + // the tertiary, which comes in handy when printing... + for (int peerIdx = vc->firstIdx; peerIdx < vc->count*vc->value + vc->firstIdx; peerIdx += vc->value) { + lineBuf[0] = '\0'; + for (int j = 0; j < vc->value; j++) { + snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", + (j > 0 ? "," : ""), peersReSorted[j].pid); + } + rasOutAppend(" Node %s running process%s %s\n", + ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), + (vc->value > 1 ? 
"es" : ""), lineBuf); + } // for (peerIdx) + } // if (rasCountIsOutlier(vc->count)) + } // for (i) + } // !consistentNPeersGlobal + } // !consistentNGpusNode || !consistentNGpusGlobal || !consistentNPeersGlobal + +#if 0 // Commented out for now to focus the summary status report on the information most relevant to the users. + // To be revisited with future extensions to RAS. + rasOutAppend("\nGathering data about the RAS network (timeout %lds)...", client->timeout / CLOCK_UNITS_PER_SEC); + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; + { + struct rasCollRequest collReq; + bool allDone = false; + rasCollReqInit(&collReq); + collReq.timeout = client->timeout; + collReq.type = RAS_COLL_CONNS; + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_CONNS), &allDone, &client->collIdx), + ret, fail); + if (!allDone) + ret = ncclInProgress; // We need to wait for async. responses. + } +#endif + rasOutAppend("\nCommunicators..."); + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; + { + struct rasCollRequest collReq; + bool allDone = false; + rasCollReqInit(&collReq); + collReq.timeout = client->timeout; + collReq.type = RAS_COLL_COMMS; + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), + ret, fail); + if (!allDone) + ret = ncclInProgress; + } +exit: + free(peersReSorted); + return ret; +fail: + goto exit; +} + +#if 0 // Commented out for now to focus the summary status report on the information most relevant to the users. + // To be revisited with future extensions to RAS. +// Processes the response from the RAS_COLL_CONNS collective operation and sends the data to the client (for now +// primarily the list of missing processes). Initiates the RAS_COLL_COMMS collective operation. +static ncclResult_t rasClientRunConns(struct rasClient* client) { + ncclResult_t ret = ncclSuccess; + char* msg = nullptr; + int msgLen; + struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollConns* connsData = (struct rasCollConns*)coll->data; + int expected; + struct rasPeerInfo* peersBuf = nullptr; + + assert(coll->nFwdSent == coll->nFwdRecv); + client->collIdx = -1; + + rasOutReset(); + rasOutAppend(" obtained a result in %.2fs\n", (clockNano()-coll->startTime)/1e9); + if (coll->nLegTimeouts > 0) { + rasOutAppend(" Warning: encountered %d communication timeout%s while gathering data\n", coll->nLegTimeouts, + (coll->nLegTimeouts > 1 ? "s" : "")); + } + + expected = nRasPeers - nRasDeadPeers; + if (coll->nPeers != expected) { + int missing = expected - coll->nPeers; + rasOutAppend(" Warning: missing data from %d process%s (received from %d, expected %d)\n", + missing, (missing > 1 ? "es" : ""), coll->nPeers, expected); + if (missing <= RAS_CLIENT_DETAIL_THRESHOLD) { + // Extract a list of missing peers. We don't want to print it right away because it would be sorted + // by address (including port, which isn't meaningful to end users). + int nPeersBuf = 0; + NCCLCHECKGOTO(ncclCalloc(&peersBuf, missing), ret, fail); + // Ensure both arrays are sorted (rasPeers already is, by addr); makes finding missing records a breeze. 
+ qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare); + for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) { + int cmp; + if (rasPeerIdx < nRasPeers && collPeerIdx < coll->nPeers) + cmp = ncclSocketsCompare(&rasPeers[rasPeerIdx].addr, coll->peers+collPeerIdx); + else + cmp = (rasPeerIdx < nRasPeers ? -1 : 1); + + if (cmp == 0) { + rasPeerIdx++; + collPeerIdx++; + } else if (cmp < 0) { + memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf)); + rasPeerIdx++; + } else { // cmp > 0 + // Process not found in rasPeers -- shouldn't happen. + collPeerIdx++; + } // cmp > 0 + } // for (rasPeerIdx, collPeerIdx) + + // Sort the output by host and pid, not host and port. + qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare); + rasOutAppend(" The missing process%s:\n", (missing > 1 ? "es" : "")); + for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid, + ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf, + sizeof(lineBuf))); + } + if (nPeersBuf != missing) + rasOutAppend(" [could not find information on %d process%s]\n", + missing-nPeersBuf, (missing-nPeersBuf > 1 ? "es" : "")); + } // if (expected - coll->nPeers <= RAS_CLIENT_DETAIL_THRESHOLD) + } // if (coll->nPeers != expected) + + if (connsData->nConns > 0) { + rasOutAppend(" Collected data about %d unidirectional connection%s\n", + connsData->nConns, (connsData->nConns > 1 ? "s" : "")); + rasOutAppend(" Travel times (valid only if system clocks are synchronized between nodes):\n" + " Minimum %fs, maximum %fs, average %fs\n", + connsData->travelTimeMin/1e9, connsData->travelTimeMax/1e9, + connsData->travelTimeSum/(1e9*connsData->travelTimeCount)); + } else { + rasOutAppend(" No connection data collected!\n"); + } + if (connsData->nNegativeMins > 0) { + rasOutAppend(" Warning: negative travel times were observed across %d connection%s,\n" + " indicating that the system clocks are *not* synchronized.\n" + " Ordering of events based on local timestamps should be considered unreliable\n", + connsData->nNegativeMins, (connsData->nNegativeMins > 1 ? "s" : "")); + if (connsData->nNegativeMins <= RAS_CLIENT_DETAIL_THRESHOLD) { + rasOutAppend(" The affected connection%s:\n", (connsData->nNegativeMins > 1 ? 
"s" : "")); + for (int i = 0; i < connsData->nNegativeMins; i++) { + struct rasCollConns::negativeMin* negativeMin = connsData->negativeMins+i; + int sourcePeerIdx = rasPeerFind(&negativeMin->source); + int destPeerIdx = rasPeerFind(&negativeMin->dest); + if (sourcePeerIdx != -1 && destPeerIdx != -1) + rasOutAppend(" From node %s process %d to node %s process %d: observed travel time of %fs\n", + ncclSocketToHost(&negativeMin->source, rasLine, sizeof(rasLine)), rasPeers[sourcePeerIdx].pid, + ncclSocketToHost(&negativeMin->dest, lineBuf, sizeof(lineBuf)), rasPeers[destPeerIdx].pid, + negativeMin->travelTimeMin/1e9); + } + } + } + rasCollFree(coll); + + rasOutAppend("\nGathering data about the NCCL communicators (timeout %lds)...", + client->timeout / CLOCK_UNITS_PER_SEC); + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; + { + struct rasCollRequest collReq; + bool allDone = false; + rasCollReqInit(&collReq); + collReq.timeout = client->timeout; + collReq.type = RAS_COLL_COMMS; + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), + ret, fail); + if (!allDone) + ret = ncclInProgress; + } +exit: + free(peersBuf); + return ret; +fail: + goto exit; +} +#endif + +// Processes the response from the RAS_COLL_COMMS collective operation and sends the data to the client: +// statistics on the communicators, missing data from ranks, inconsistent collective operation counts, +// initialization and asynchronous errors, and inconsistent initialization/termination status. +static ncclResult_t rasClientRunComms(struct rasClient* client) { + ncclResult_t ret = ncclSuccess; + char* msg = nullptr; + int msgLen; + struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollComms* commsData = (struct rasCollComms*)coll->data; + struct rasCollComms::comm* comm; + struct rasCollComms::comm::rank* ranksReSorted = nullptr; + struct rasValCount* valCounts = nullptr; + int nValCounts; + struct rasValCount* collOpCounts = nullptr; + struct rasAuxComm* auxComms = nullptr; + int maxCommSize; + int* peerIdxConv = nullptr; + int vcIdx; + int nPeersMissing; + uint64_t* peerNvmlDevs = nullptr; + const char*const statusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" }; + const char*const errorStr[] = { + // Listing them all like this, while a bit of a hassle, is less effort than formatting in a temporary buffer. + "OK", + "MISMATCH", + "ERROR", + "ERROR,MISMATCH", + "INCOMPLETE", + "INCOMPLETE,MISMATCH", + "INCOMPLETE,ERROR", + "INCOMPLETE,ERROR,MISMATCH" + }; + + assert(coll->nFwdSent == coll->nFwdRecv); + client->collIdx = -1; + + rasOutReset(); + rasOutAppend(" (%.2fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9); + + // Calculate the number of missing peers early as we rely on it for other things. + nPeersMissing = nRasPeers - nRasDeadPeers - coll->nPeers; + + // Sort the communicators by size. As the structure is inconvenient to move around due to the elements being + // of variable length, we create an auxiliary array that includes pointers to individual elements and simply sort + // that array while keeping the data intact. + NCCLCHECKGOTO(ncclCalloc(&auxComms, commsData->nComms), ret, fail); + // While initializing the just allocated array, also find out the size of the largest communicator so that we know + // how much memory to allocate for another temporary array. 
+ maxCommSize = 0; + comm = commsData->comms; + for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) { + if (maxCommSize < comm->commNRanks) + maxCommSize = comm->commNRanks; + auxComms[commIdx].comm = comm; + comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); + } + NCCLCHECKGOTO(ncclCalloc(&ranksReSorted, maxCommSize), ret, fail); + + // For convenience, create a translation table from rasCollective's peerIdx to rasPeers peerIdx. + NCCLCHECKGOTO(ncclCalloc(&peerIdxConv, coll->nPeers), ret, fail); + for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) + peerIdxConv[peerIdx] = rasPeerFind(coll->peers+peerIdx); + // Sort coll->peers to match the ordering of rasPeers -- we may need it later... + qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare); + + // Fill in the remaining fields of auxComm's. + for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) { + struct rasAuxComm* auxComm = auxComms+commIdx; + int nRanks = 0; + comm = auxComm->comm; + + if (comm->commNRanks > comm->nRanks) { + // There are two possibilities here. Either we are missing the data on some ranks because the processes are + // unreachable, or the processes _are_ reachable but didn't report to be part of this communicator (which + // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). Because we + // currently don't collect data about missing ranks, we can't reliably distinguish these two cases. + // For now we rely on an approximation: if we _know_ that some peers failed to respond, we mark this + // as an INCOMPLETE error; otherwise as a MISMATCH warning. + if (nPeersMissing > 0 || nRasDeadPeers > 0) + auxComm->errors |= RAS_ACE_INCOMPLETE; + else { + auxComm->errors |= RAS_ACE_MISMATCH; + auxComm->status |= RAS_ACS_UNKNOWN; + } + } + + memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + // Convert ranksReSorted' peerIdx to rasPeers and sort by it -- that way we will have the ranks sorted + // by process _and_ node, which makes counting easy. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) + ranksReSorted[rankIdx].peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksPeerCompare); + + // Count the peers and nodes, get the status/error indicators. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + if (rankIdx == 0) { + auxComm->nPeers = auxComm->nNodes = 1; + auxComm->ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS; + auxComm->ranksPerNodeMax = 0; + auxComm->firstCollOpCount = rank->collOpCount; + nRanks = 1; + } else { // rankIdx > 0 + if (rank->peerIdx != rank[-1].peerIdx) { + auxComm->nPeers++; + if (!ncclSocketsSameNode(&rasPeers[rank->peerIdx].addr, &rasPeers[rank[-1].peerIdx].addr)) { + auxComm->nNodes++; + if (auxComm->ranksPerNodeMin > nRanks) + auxComm->ranksPerNodeMin = nRanks; + if (auxComm->ranksPerNodeMax < nRanks) + auxComm->ranksPerNodeMax = nRanks; + nRanks = 0; + } + } // if (rank->peerIdx != rank[-1].peerIdx) + nRanks++; + } // rankIdx > 0 + if (rankIdx == comm->nRanks-1) { + // Last iteration of the loop. 
+ if (auxComm->ranksPerNodeMin > nRanks) + auxComm->ranksPerNodeMin = nRanks; + if (auxComm->ranksPerNodeMax < nRanks) + auxComm->ranksPerNodeMax = nRanks; + } + + if (rank->status.abortFlag) + auxComm->status |= RAS_ACS_ABORT; + else if (rank->status.finalizeCalled || rank->status.destroyFlag) { + // destroyFlag is set by ncclCommDestroy and ncclCommAbort. finalizeCalled appears to be set by + // ncclCommFinalize only. According to the docs, ncclCommDestroy *can* be called without calling + // ncclCommFinalize first. The code structure here ensures that we attribute destroyFlag properly + // as a finalize state indicator (and ignore it in case of ncclCommAbort). + auxComm->status |= RAS_ACS_FINALIZE; + } + else if (rank->status.initState == ncclSuccess) + auxComm->status |= RAS_ACS_RUNNING; + else // rank->initState != ncclSuccess + auxComm->status |= RAS_ACS_INIT; + + if (rank->collOpCount != auxComm->firstCollOpCount) + auxComm->errors |= RAS_ACE_MISMATCH; + if (rank->status.initState != ncclSuccess && rank->status.initState != ncclInProgress) + auxComm->errors |= RAS_ACE_ERROR; + if (rank->status.asyncError != ncclSuccess && rank->status.asyncError != ncclInProgress) + auxComm->errors |= RAS_ACE_ERROR; + } // for (rankIdx) + + if (__builtin_popcount(auxComm->status) > 1) { + // We've got a status mismatch between ranks. + auxComm->errors |= RAS_ACE_MISMATCH; + } + } // for (commIdx) + // Sort it by size/nNodes/status/errors/missing ranks. + qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); + + // Calculate the distribution of different communicator sizes. + NCCLCHECKGOTO(ncclCalloc(&valCounts, commsData->nComms), ret, fail); + nValCounts = 0; + for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) { + if (commIdx == 0 || + auxComms[commIdx].comm->commNRanks != auxComms[commIdx-1].comm->commNRanks || + auxComms[commIdx].nNodes != auxComms[commIdx-1].nNodes || + // __builtin_clz returns the number of leading 0-bits, which is a proxy for the index of the highest 1-bit. + __builtin_clz(auxComms[commIdx].status) != __builtin_clz(auxComms[commIdx-1].status) || + auxComms[commIdx].errors != auxComms[commIdx-1].errors) { + valCounts[nValCounts].value = 0; // We have many distinguishing values but only one field to store them. + // It doesn't really matter, given that we can extract them via firstIdx. + valCounts[nValCounts].count = 1; + valCounts[nValCounts].firstIdx = commIdx; + nValCounts++; + } else { + valCounts[nValCounts-1].count++; + } + } + + rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" + " # in group per comm per node per comm in group\n"); + if (commsData->nComms == 0) + rasOutAppend("No communicator data collected!\n"); + + // Allocate an auxiliary structure used for counting the number of ranks (unique GPUs) in a group. + NCCLCHECKGOTO(ncclCalloc(&peerNvmlDevs, coll->nPeers), ret, fail); + + // Print it out, the largest communicators first. + for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) { + struct rasValCount* vc = valCounts+vcIdx; + struct rasAuxComm* auxComm = auxComms+vc->firstIdx; + int ranksPerNodeMin, ranksPerNodeMax; + int ranksTotal; + + ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS; + ranksPerNodeMax = 0; + memset(peerNvmlDevs, '\0', coll->nPeers * sizeof(*peerNvmlDevs)); + // We don't group comms by ranksPerNodeMin/Max, so the values may differ between comms in one group. + // Calculate the group's min/max. + // Also calculate the number of unique ranks in the group. 
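(Each rank contributes one RAS_ACS_* bit to auxComm->status above; more than one bit set (popcount > 1) means the ranks disagree, and the highest bit set, recovered via __builtin_clz, is what later indexes statusStr. A standalone illustration follows; the numeric RAS_ACS_* values are assumptions chosen to be consistent with the statusStr indexing, not values taken from the patch.)

    // Folding per-rank status bits and mapping the result back to a label, mirroring the
    // auxComm->status handling; bit values are assumed (UNKNOWN must land at index 0).
    #include <cstdio>

    enum : unsigned {
      ACS_UNKNOWN  = 1u << 0,
      ACS_INIT     = 1u << 1,
      ACS_RUNNING  = 1u << 2,
      ACS_FINALIZE = 1u << 3,
      ACS_ABORT    = 1u << 4,
    };
    static const char* const kStatusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" };

    int main() {
      unsigned status = 0;
      status |= ACS_RUNNING;                                   // most ranks are running...
      status |= ACS_FINALIZE;                                  // ...but one already finalized
      bool mismatch = __builtin_popcount(status) > 1;          // what triggers RAS_ACE_MISMATCH
      int idx = (int)(sizeof(unsigned)*8 - 1) - __builtin_clz(status);  // index of highest set bit
      printf("status: %s%s\n", kStatusStr[idx], mismatch ? " (ranks disagree)" : "");
      return 0;
    }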
+ for (int commIdx = 0; commIdx < vc->count; commIdx++) { + if (ranksPerNodeMin > auxComm[commIdx].ranksPerNodeMin) + ranksPerNodeMin = auxComm[commIdx].ranksPerNodeMin; + if (ranksPerNodeMax < auxComm[commIdx].ranksPerNodeMax) + ranksPerNodeMax = auxComm[commIdx].ranksPerNodeMax; + for (int rankIdx = 0; rankIdx < auxComm[commIdx].comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = auxComm[commIdx].comm->ranks+rankIdx; + peerNvmlDevs[rank->peerIdx] |= (1UL << rank->nvmlDev); + } + } + ranksTotal = 0; + for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) + ranksTotal += __builtin_popcountll(peerNvmlDevs[peerIdx]); + if (ranksPerNodeMin == ranksPerNodeMax) + snprintf(rasLine, sizeof(rasLine), "%d", ranksPerNodeMin); + else + snprintf(rasLine, sizeof(rasLine), "%d-%d", ranksPerNodeMin, ranksPerNodeMax); + rasOutAppend("%5d %8d %8d %8s %8d %8d %8s %6s\n", + vcIdx, vc->count, auxComm->nNodes, rasLine, auxComm->comm->commNRanks, ranksTotal, + // __builtin_clz returns the number of leading 0-bits. This makes it possible to translate the + // status (which is a bitmask) into an array index. + statusStr[(sizeof(unsigned int)*8-1)-__builtin_clz(auxComm->status)], errorStr[auxComm->errors]); + } + + rasOutAppend("\nErrors\n" + "======\n\n"); + + if (nPeersMissing > 0) { + rasOutAppend("INCOMPLETE\n" + " Missing communicator data from %d job process%s\n", nPeersMissing, (nPeersMissing > 1 ? "es" : "")); + if (rasCountIsOutlier(nPeersMissing, client->verbose)) { + // Extract a list of missing peers. We don't want to print it right away because it would be sorted + // by address (including port, which isn't meaningful to end users). + struct rasPeerInfo* peersBuf = nullptr; + int nPeersBuf; + + // Both rasPeers and coll->peers are sorted by address (the latter we sorted above) which makes comparing + // them much easier. + NCCLCHECKGOTO(ncclCalloc(&peersBuf, nPeersMissing), ret, fail); + nPeersBuf = 0; + for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) { + int cmp; + if (rasPeerIdx < nRasPeers && collPeerIdx < coll->nPeers) + cmp = ncclSocketsCompare(&rasPeers[rasPeerIdx].addr, coll->peers+collPeerIdx); + else + cmp = (rasPeerIdx < nRasPeers ? -1 : 1); + + if (cmp == 0) { + rasPeerIdx++; + collPeerIdx++; + } else if (cmp < 0) { + // Process missing from coll->peers. Don't report dead ones though, as they are not included + // in nPeersMissing and are reported separately below. + if (!rasPeerIsDead(&rasPeers[rasPeerIdx].addr)) { + assert(nPeersBuf < nPeersMissing); + memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf)); + } + rasPeerIdx++; + } else { // cmp > 0 + // Process not found in rasPeers -- shouldn't happen, unless during a race? + collPeerIdx++; + } // cmp > 0 + } // for (rasPeerIdx, collPeerIdx) + + // Sort the output by host and pid. + qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare); + for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid, + ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf, + sizeof(lineBuf))); + } + if (nPeersBuf != nPeersMissing) + rasOutAppend(" [could not find information on %d process%s]\n", + nPeersMissing-nPeersBuf, (nPeersMissing-nPeersBuf > 1 ? 
"es" : "")); + free(peersBuf); + } // if (rasCountIsOutlier(nPeersMissing)) + rasOutAppend("\n"); + } + + if (nRasDeadPeers > 0) { + rasOutAppend("DEAD\n" + " %d job process%s considered dead (unreachable via the RAS network)\n", nRasDeadPeers, + (nRasDeadPeers > 1 ? "es are" : " is")); + if (rasCountIsOutlier(nRasDeadPeers, client->verbose)) { + struct rasPeerInfo* peersReSorted = nullptr; + int nPeersReSorted = 0; + NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasDeadPeers), ret, fail); + for (int i = 0; i < nRasDeadPeers; i++) { + int peerIdx = rasPeerFind(rasDeadPeers+i); + if (peerIdx != -1) + memcpy(peersReSorted+(nPeersReSorted++), rasPeers+peerIdx, sizeof(*peersReSorted)); + } + // Sort the output by host and pid, not host and port. + qsort(peersReSorted, nPeersReSorted, sizeof(*peersReSorted), rasPeersHostPidCompare); + for (int peerIdx = 0; peerIdx < nPeersReSorted; peerIdx++) { + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersReSorted[peerIdx].pid, + ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(peersReSorted[peerIdx].cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(peersReSorted[peerIdx].cudaDevs, peersReSorted[peerIdx].nvmlDevs, lineBuf, + sizeof(lineBuf))); + } + if (nPeersReSorted != nRasDeadPeers) + rasOutAppend(" [could not find information on %d process%s]\n", + nRasDeadPeers-nPeersReSorted, (nRasDeadPeers-nPeersReSorted > 1 ? "es" : "")); + free(peersReSorted); + } // if (rasCountIsOutlier(nRasDeadPeers) + rasOutAppend("\n"); + } + + for (vcIdx = 0; vcIdx < nValCounts; vcIdx++) { + struct rasValCount* vc; + vc = valCounts+vcIdx; + for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) { + struct rasAuxComm* auxComm = auxComms+commIdx; + comm = auxComm->comm; + + if (auxComm->errors & RAS_ACE_INCOMPLETE) { + int nRanksMissing = comm->commNRanks - comm->nRanks; + rasOutAppend("#%d-%d (%016lx) INCOMPLETE\n" + " Missing communicator data from %d rank%s\n", vcIdx, commIdx - vc->firstIdx, + comm->commHash, nRanksMissing, (nRanksMissing > 1 ? "s" : "")); + if (rasCountIsOutlier(nRanksMissing, client->verbose)) { + lineBuf[0] = '\0'; + // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the + // exception of the missing ranks... + for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { + if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { + rankIdx++; + } else { + snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", + (rankIdx == commRank ? "" : ","), commRank); + } + } // for (commRank) + rasOutAppend(" The missing rank%s: %s\n", (nRanksMissing > 1 ? "s" : ""), lineBuf); + } // if (rasCountIsOutlier(nRanksMissing)) + rasOutAppend("\n"); + } // if (auxComm->errors & RAS_ACE_INCOMPLETE) + + if (auxComm->errors & RAS_ACE_ERROR) { + int ncclErrors[ncclNumResults]; + int nErrors; + rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + + memset(ncclErrors, '\0', sizeof(ncclErrors)); + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) + ncclErrors[comm->ranks[rankIdx].status.initState]++; + nErrors = comm->nRanks - (ncclErrors[ncclSuccess] + ncclErrors[ncclInProgress]); + if (nErrors > 0) { + rasOutAppend(" Initialization error%s on %d rank%s\n", + (nErrors > 1 ? "s" : ""), nErrors, (nErrors > 1 ? 
"s" : "")); + rasClientBreakDownErrors(client, comm, peerIdxConv, ncclErrors); + } + + memset(ncclErrors, '\0', sizeof(ncclErrors)); + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) + ncclErrors[comm->ranks[rankIdx].status.asyncError]++; + nErrors = comm->nRanks - (ncclErrors[ncclSuccess] + ncclErrors[ncclInProgress]); + if (nErrors > 0) { + rasOutAppend(" Asynchronous error%s on %d rank%s\n", + (nErrors > 1 ? "s" : ""), nErrors, (nErrors > 1 ? "s" : "")); + rasClientBreakDownErrors(client, comm, peerIdxConv, ncclErrors, /*isAsync*/true); + } + rasOutAppend("\n"); + } // if (auxComm->errors & RAS_ACE_ERROR) + } // for (commIdx) + } // for (vcIdx) + + rasOutAppend("Warnings\n" + "========\n\n"); + + if (coll->nLegTimeouts > 0) { + rasOutAppend("TIMEOUT\n" + " Encountered %d communication timeout%s while gathering communicator data\n\n", + coll->nLegTimeouts, (coll->nLegTimeouts > 1 ? "s" : "")); + } + + for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) { + struct rasValCount* vc = valCounts+vcIdx; + for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) { + bool inconsistent; + struct rasAuxComm* auxComm = auxComms+commIdx; + comm = auxComm->comm; + + if (auxComm->errors & RAS_ACE_MISMATCH) { + rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + + if (collOpCounts == nullptr) { + // Allocating comm->commNRanks elements ensures that we won't need to reallocate, because the valCounts + // array is reverse-sorted by commNRanks. On the other hand, for this purpose allocating commNRanks + // elements may be massively overpessimistic... + NCCLCHECKGOTO(ncclCalloc(&collOpCounts, comm->commNRanks), ret, fail); + } + + if (__builtin_popcount(auxComm->status) > 1) { + rasOutAppend(" Communicator ranks have different status\n"); + + // We need to sort the ranks by status. However, status is normally calculated from other fields. + // We will copy the ranks and reuse collOpCount to store it. + memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + + if (rank->status.abortFlag) + rank->collOpCount = RAS_ACS_ABORT; + else if (rank->status.finalizeCalled || rank->status.destroyFlag) + rank->collOpCount = RAS_ACS_FINALIZE; + else if (rank->status.initState == ncclSuccess) + rank->collOpCount = RAS_ACS_RUNNING; + else + rank->collOpCount = RAS_ACS_INIT; + } + qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); + // Calculate the frequency of different status values. + int nCollOpCounts = 0; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { + // __builtin_clz returns the number of leading 0-bits. This makes it possible to translate the + // status (which is a bitmask) into an array index. + collOpCounts[nCollOpCounts].value = (sizeof(unsigned int)*8-1) - __builtin_clz(ranksReSorted[rankIdx].collOpCount); + collOpCounts[nCollOpCounts].count = 1; + collOpCounts[nCollOpCounts].firstIdx = rankIdx; + nCollOpCounts++; + } else { + collOpCounts[nCollOpCounts-1].count++; + } + } + if (comm->nRanks < comm->commNRanks) { + // Add a "fake" element corresponding to the missing entries. The statusStr array contains the "UNKNOWN" + // string at index 0. 
+ collOpCounts[nCollOpCounts].value = 0; + collOpCounts[nCollOpCounts].count = comm->commNRanks - comm->nRanks; + collOpCounts[nCollOpCounts].firstIdx = -1; // "Fake" entry identifier. + nCollOpCounts++; + } + // Sort by that frequency (most frequent first). + qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); + + for (int coc = 0; coc < nCollOpCounts; coc++) { + struct rasValCount* vcc = collOpCounts+coc; + if (vcc->count > 1) + rasOutAppend(" %d ranks have status %s\n", vcc->count, statusStr[vcc->value]); + if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { + if (vcc->firstIdx != -1) { + // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... + for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { + int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + if (peerIdx != -1) { + if (vcc->count > 1) + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + ranksReSorted[rankIdx].commRank, + rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + else + rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", + ranksReSorted[rankIdx].commRank, statusStr[vcc->value], + rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { // peerIdx == -1 + if (vcc->count > 1) + rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); + else + rasOutAppend(" Rank %d has status %s -- [process information not found]\n", + ranksReSorted[rankIdx].commRank, statusStr[vcc->value]); + } // peerIdx == -1 + } // for (rankIdx) + } else { + // UNKNOWN ranks. Format a string with their rank numbers (we don't know anything more). + lineBuf[0] = '\0'; + // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the + // exception of the missing ranks... + for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { + if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { + rankIdx++; + } else { + snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", + (rankIdx == commRank ? "" : ","), commRank); + } + } // for (commRank) + if (vcc->count > 1) { + rasOutAppend(" The unknown ranks: %s\n", lineBuf); + } else { + rasOutAppend(" Rank %s has status %s\n", lineBuf, statusStr[vcc->value]); + } + } + } // if (rasCountIsOutlier(vcc->count)) + } // for (coc) + } // if (__builtin_popcount(auxComm->status) > 1) + + inconsistent = false; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (comm->ranks[rankIdx].collOpCount != auxComm->firstCollOpCount) { + inconsistent = true; + break; + } + } + if (inconsistent) { + rasOutAppend(" Communicator ranks have different collective operation counts\n"); + + // Sort the ranks by collOpCount and rank for easy counting. + memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); + // Calculate the frequency of different collOpCount values. 
+ int nCollOpCounts = 0; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { + collOpCounts[nCollOpCounts].value = ranksReSorted[rankIdx].collOpCount; + collOpCounts[nCollOpCounts].count = 1; + collOpCounts[nCollOpCounts].firstIdx = rankIdx; + nCollOpCounts++; + } else { + collOpCounts[nCollOpCounts-1].count++; + } + } + // Sort by that frequency (most frequent first). + qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); + + for (int coc = 0; coc < nCollOpCounts; coc++) { + struct rasValCount* vcc = collOpCounts+coc; + if (vcc->count > 1) + rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); + if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { + // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... + for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { + int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + if (peerIdx != -1) { + if (vcc->count > 1) + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + ranksReSorted[rankIdx].commRank, + rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + else + rasOutAppend(" Rank %d has launched up to operation %ld -- GPU %s managed by process %d on node %s\n", + ranksReSorted[rankIdx].commRank, vcc->value, + rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { // peerIdx == -1 + if (vcc->count > 1) + rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); + else + rasOutAppend(" Rank %d has launched up to operation %ld -- [process information not found]\n", + ranksReSorted[rankIdx].commRank, vcc->value); + } // peerIdx == -1 + } // for (rankIdx) + } // if (rasCountIsOutlier(vcc->count)) + } // for (coc) + } // if (inconsistent) + rasOutAppend("\n"); + } // if (auxComm->errors & RAS_ACE_MISMATCH) + } // for (commIdx) + } // for (vcIdx) + rasCollFree(coll); + + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; +exit: + free(peerNvmlDevs); + free(collOpCounts); + free(valCounts); + free(peerIdxConv); + free(ranksReSorted); + free(auxComms); + return ret; +fail: + goto exit; +} + +static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm, + const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync) { + for (;;) { + int maxCount = 0; + ncclResult_t maxCountIdx = ncclSuccess; + for (int i = ncclUnhandledCudaError; i < ncclInProgress; i++) { + if (maxCount < ncclErrors[i]) { + maxCount = ncclErrors[i]; + maxCountIdx = (ncclResult_t)i; + } + } // for (i) + if (maxCountIdx == ncclSuccess) + break; + if (maxCount > 1) + rasOutAppend(" %d ranks reported %s\n", maxCount, ncclErrorToString(maxCountIdx)); + if (rasCountIsOutlier(maxCount, client->verbose)) { + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if ((isAsync ? 
comm->ranks[rankIdx].status.asyncError : comm->ranks[rankIdx].status.initState) == maxCountIdx) { + int peerIdx = peerIdxConv[comm->ranks[rankIdx].peerIdx]; + if (peerIdx != -1) { + if (maxCount > 1) + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + comm->ranks[rankIdx].commRank, + rasCommRankGpuToString(comm->ranks+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + else + rasOutAppend(" Rank %d reported %s -- GPU %s managed by process %d on node %s\n", + comm->ranks[rankIdx].commRank, ncclErrorToString(maxCountIdx), + rasCommRankGpuToString(comm->ranks+rankIdx, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { // peerIdx == -1 + if (maxCount > 1) + rasOutAppend(" Rank %d -- [process information not found]\n", comm->ranks[rankIdx].commRank); + else + rasOutAppend(" Rank %d reported %s -- [process information not found]\n", + comm->ranks[rankIdx].commRank, ncclErrorToString(maxCountIdx)); + } // peerIdx == -1 + } // if rank's error matches + } // for (rankIdx) + } // if (rasCountIsOutlier(maxCount)) + ncclErrors[maxCountIdx] = 0; + } // for (;;) +} + + +////////////////////////////////////////////////////////////////////// +// Functions related to the handling of the internal output buffer. // +////////////////////////////////////////////////////////////////////// + +// Appends a printf-formatted string to the output buffer. +// Unlike with INFO or WARN messages, the caller should terminate lines with '\n' as appropriate. +static void rasOutAppend(const char* format, ...) { + ncclResult_t ret; // Ignored. + va_list vargs; + int needed; + va_start(vargs, format); + needed = vsnprintf(rasOutBuffer+nRasOutBuffer, rasOutBufferSize-nRasOutBuffer, format, vargs); + va_end(vargs); + + if (needed < 0) // Output error (whatever that might be...) + return; + + // The +1 below accounts for the terminating '\0'. + if (needed + 1 > rasOutBufferSize-nRasOutBuffer) { + int newBufferSize = ROUNDUP(nRasOutBuffer+needed+1, RAS_OUT_INCREMENT); + NCCLCHECKGOTO(ncclRealloc(&rasOutBuffer, rasOutBufferSize, newBufferSize), ret, exit); + rasOutBufferSize = newBufferSize; + + va_start(vargs, format); + needed = vsnprintf(rasOutBuffer+nRasOutBuffer, rasOutBufferSize-nRasOutBuffer, format, vargs); + va_end(vargs); + + if (needed < 0) // Output error (whatever that might be...) + return; + } + + nRasOutBuffer += needed; + assert(nRasOutBuffer <= rasOutBufferSize); +exit: + ; +} + +// Copies the output data from an internal buffer to a user-supplied one, including the terminating '\0'. +// The user buffer must already be allocated and be at least rasOutLength() bytes long (which includes +// the terminating '\0'). +static void rasOutExtract(char* buffer) { + if (rasOutBuffer) + memcpy(buffer, rasOutBuffer, rasOutLength()); +} + +// Returns the current length of the used portion of the output buffer, *not* including the terminating '\0'. +static int rasOutLength() { + return nRasOutBuffer; +} + +// Resets the output buffer position to the beginning (effectively clearing the buffer). +static void rasOutReset() { + ncclResult_t ret; // Ignored. 
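(rasOutAppend above uses the classic two-pass vsnprintf idiom: format into the space that is left, and if the return value says the output did not fit, grow the buffer and format again. A stripped-down version of the same idiom is sketched below, with the NCCL allocator and the RAS_OUT_INCREMENT rounding replaced by plain realloc for brevity; it is an illustration, not the patch's implementation.)

    // Grow-and-retry vsnprintf into a heap buffer, the idiom used by rasOutAppend above.
    #include <cstdarg>
    #include <cstdio>
    #include <cstdlib>

    static char* outBuf;
    static int outUsed, outCap;

    static void outAppend(const char* format, ...) {
      if (outBuf == nullptr) {
        outCap = 64;
        outBuf = (char*)calloc(outCap, 1);
        if (outBuf == nullptr) return;
      }
      va_list vargs;
      va_start(vargs, format);
      int needed = vsnprintf(outBuf+outUsed, outCap-outUsed, format, vargs);  // first attempt
      va_end(vargs);
      if (needed < 0) return;
      if (needed + 1 > outCap - outUsed) {                // +1 for the terminating '\0'
        int newCap = outUsed + needed + 1;
        char* newBuf = (char*)realloc(outBuf, newCap);
        if (newBuf == nullptr) return;
        outBuf = newBuf; outCap = newCap;
        va_start(vargs, format);
        needed = vsnprintf(outBuf+outUsed, outCap-outUsed, format, vargs);    // retry with room
        va_end(vargs);
        if (needed < 0) return;
      }
      outUsed += needed;
    }

    int main() {
      for (int i = 0; i < 100; i++) outAppend("line %d\n", i);  // forces several regrowths
      if (outBuf) fwrite(outBuf, 1, outUsed, stdout);
      free(outBuf);
      return 0;
    }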
+ nRasOutBuffer = 0; + if (rasOutBuffer == nullptr) { + NCCLCHECKGOTO(ncclCalloc(&rasOutBuffer, RAS_OUT_INCREMENT), ret, exit); + rasOutBufferSize = RAS_OUT_INCREMENT; + } +exit: + ; +} + + +/////////////////////////////////////////////////////////////////// +// Various sorting callbacks used when grouping/formatting data. // +/////////////////////////////////////////////////////////////////// + +// Sorting callback for rasPeerInfo elements. Sorts by the number of bits set in cudaDevs. Uses the host IP as the +// secondary key and the process id as the tertiary key. +static int rasPeersNGpuCompare(const void* e1, const void* e2) { + const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; + const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; + int c1 = __builtin_popcountll(p1->cudaDevs); + int c2 = __builtin_popcountll(p2->cudaDevs); + + if (c1 == c2) { + // Host IP address is the secondary key. + int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + if (cmp == 0) { + // Process ID is the tertiary key. + cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + } + return cmp; + } else { + return (c1 < c2 ? -1 : 1); + } +} + +// Sorting callback for rasPeerInfo elements. Sorts by the number of peers per node, which we store in cudaDevs. +// Uses the host IP as the secondary key and the process id as the tertiary key. +static int rasPeersNProcsCompare(const void* e1, const void* e2) { + const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; + const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; + + if (p1->cudaDevs == p2->cudaDevs) { + // Host IP address is the secondary key. + int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + if (cmp == 0) { + // Process ID is the tertiary key. + cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + } + return cmp; + } else { + return (p1->cudaDevs < p2->cudaDevs ? -1 : 1); + } +} + +// Sorting callback for rasPeerInfo elements. Sorts by the host IP and the process id as the secondary key (rather +// than the port). +static int rasPeersHostPidCompare(const void* e1, const void* e2) { + const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; + const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; + + int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + if (cmp == 0) { + // Process ID is the secondary key. + cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + } + return cmp; +} + +// Sorting callback for ncclSocketAddress. Unlike the ncclSocketsCompare, it ignores the port. +static int ncclSocketsHostCompare(const void* p1, const void* p2) { + const union ncclSocketAddress* a1 = (const union ncclSocketAddress*)p1; + const union ncclSocketAddress* a2 = (const union ncclSocketAddress*)p2; + // AF_INET (2) is less than AF_INET6 (10). + int family = a1->sa.sa_family; + if (family != a2->sa.sa_family) { + if (family > 0 && a2->sa.sa_family > 0) + return (family < a2->sa.sa_family ? -1 : 1); + else // Put empty addresses at the end (not that it matters...). + return (family > 0 ? -1 : 1); + } + + int cmp; + if (family == AF_INET) { + cmp = memcmp(&a1->sin.sin_addr, &a2->sin.sin_addr, sizeof(a1->sin.sin_addr)); + } + else if (family == AF_INET6) { + cmp = memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)); + } else { + // The only remaining valid case are empty addresses. + assert(family == 0); + cmp = 0; // Two empty addresses are equal... + } + + return cmp; +} + +// Sorting callback for rasValCount elements. Sorts by the count, largest first. 
Value is the secondary key.
+static int rasValCountsCompareRev(const void* p1, const void* p2) {
+  const struct rasValCount* r1 = (const struct rasValCount*)p1;
+  const struct rasValCount* r2 = (const struct rasValCount*)p2;
+
+  if (r1->count == r2->count) {
+    return (r1->value > r2->value ? -1 : (r1->value < r2->value ? 1: 0));
+  } else {
+    return (r1->count > r2->count ? -1 : 1);
+  }
+}
+
+// Sorting callback for rasAuxComm elements.
+// Sorts the comms by the rank count (commNRanks), nNodes as secondary key, status as the tertiary, and errors as
+// the quaternary. Sorts in reverse (largest first).
+// The final key is the comm's nRanks, sorted in reverse to the other keys, so comms with the largest number
+// of ranks *missing* will be first.
+static int rasAuxCommsCompareRev(const void* p1, const void* p2) {
+  const struct rasAuxComm* c1 = (const struct rasAuxComm*)p1;
+  const struct rasAuxComm* c2 = (const struct rasAuxComm*)p2;
+
+  if (c1->comm->commNRanks == c2->comm->commNRanks) {
+    if (c1->nNodes == c2->nNodes) {
+      // We don't want to compare the status values directly because they could be bitmasks and we are only
+      // interested in the highest bit set.
+      // __builtin_clz returns the number of leading 0-bits, so in our case the value will be the *smallest*
+      // if RAS_ACS_ABORT (8) is set and the *largest* if only RAS_ACS_INIT (1) is set, so we reverse the
+      // comparison to get the desired sorting order.
+      int s1 = __builtin_clz(c1->status);
+      int s2 = __builtin_clz(c2->status);
+      if (s1 == s2) {
+        if (c1->errors == c2->errors) {
+          if (c1->comm->nRanks == c2->comm->nRanks) {
+            return 0;
+          } else {
+            return (c1->comm->nRanks < c2->comm->nRanks ? -1 : 1);
+          }
+        } else {
+          return (c1->errors > c2->errors ? -1 : 1);
+        }
+      } else {
+        return (s1 < s2 ? -1 : 1);
+      }
+    } else {
+      return (c1->nNodes > c2->nNodes ? -1 : 1);
+    }
+  } else {
+    return (c1->comm->commNRanks > c2->comm->commNRanks ? -1 : 1);
+  }
+}
+
+// Sorting callback for rasCollComms::comm::rank elements. Sorts by the peerIdx.
+static int rasCommRanksPeerCompare(const void* p1, const void* p2) {
+  const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1;
+  const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2;
+
+  return (r1->peerIdx < r2->peerIdx ? -1 : (r1->peerIdx > r2->peerIdx ? 1 : 0));
+}
+
+// Sorting callback for rasCollComms::comm::rank elements. Sorts by the collOpCount, with rank as the secondary key.
+static int rasCommRanksCollOpCompare(const void* p1, const void* p2) {
+  const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1;
+  const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2;
+
+  if (r1->collOpCount == r2->collOpCount) {
+    // Use the rank as the secondary key.
+    return (r1->commRank < r2->commRank ? -1 : (r1->commRank > r2->commRank ? 1 : 0));
+  } else {
+    return (r1->collOpCount < r2->collOpCount ? -1 : 1);
+  }
+}
+
+
+////////////////////////////////////////////////////////////
+// String formatting functions for various types of data. //
+////////////////////////////////////////////////////////////
+
+// Converts GPU mask(s) to a string. If the CUDA mask is different from the NVML mask, both are printed.
+const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size) {
+  bool first = true;
+  buf[0] = '\0';
+  for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
+    if (cudaDevs & (1UL << i)) {
+      snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
+      first = false;
+    }
+  if (cudaDevs != nvmlDevs) {
+    snprintf(buf+strlen(buf), size-strlen(buf), " (NVML ");
+    first = true;
+    for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++)
+      if (nvmlDevs & (1UL << i)) {
+        snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i);
+        first = false;
+      }
+    snprintf(buf+strlen(buf), size-strlen(buf), ")");
+  }
+  return buf;
+}
+
+// Formats a GPU string based on the rasCollComms's rank. If the CUDA id is different from the NVML id, both are
+// printed.
+static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) {
+  snprintf(buf, size, "%d", rank->cudaDev);
+  if (rank->cudaDev != rank->nvmlDev) {
+    snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", rank->nvmlDev);
+  }
+  return buf;
+}
+
+// Converts a NCCL error result to a string.
+static const char* ncclErrorToString(ncclResult_t err) {
+  switch (err) {
+    case ncclUnhandledCudaError : return "Unhandled CUDA error";
+    case ncclSystemError : return "System error";
+    case ncclInternalError : return "Internal error";
+    case ncclInvalidArgument : return "Invalid argument";
+    case ncclInvalidUsage : return "Invalid usage";
+    case ncclRemoteError : return "Remote process error";
+    case ncclInProgress : return "NCCL operation in progress";
+    default : return "Unexpected error";
+  }
+}
+
+// Converts the IP number of a NCCL address to a string (the port part is ignored and no DNS resolution is attempted).
+static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size) {
+  if (addr->sa.sa_family > 0)
+    return inet_ntop(addr->sa.sa_family,
+                     (addr->sa.sa_family == AF_INET ? (void*)&addr->sin.sin_addr : (void*)&addr->sin6.sin6_addr),
+                     buf, size);
+  else {
+    if (size > 0)
+      buf[0] = '\0';
+    return buf;
+  }
+}
+
+// Determines if the given count constitutes an outlier.
+static bool rasCountIsOutlier(int count, bool verbose, int totalCount) {
+  if (count == 1)
+    return true; // A single rank is always considered an outlier...
+  if (verbose) {
+    return (totalCount != -1 ? count < totalCount * RAS_CLIENT_VERBOSE_OUTLIER_FRACTION : true);
+  } else {
+    return count <= RAS_CLIENT_DETAIL_THRESHOLD &&
+           (totalCount == -1 || count <= totalCount * RAS_CLIENT_OUTLIER_FRACTION);
+  }
+}
diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc
new file mode 100644
index 000000000..201144f1a
--- /dev/null
+++ b/src/ras/collectives.cc
@@ -0,0 +1,762 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include
+#include
+
+#include "alloc.h"
+#include "checks.h"
+#include "comm.h"
+#include "nccl.h"
+#include "utils.h"
+#include "ras_internal.h"
+
+// The number of recent collectives to keep track of. Completely arbitrary.
+#define COLL_HISTORY_SIZE 64
+
+// An entry in the rasCollHistory array keeping track of recently completed collectives (to make it possible to
+// identify and drop duplicates arriving over different links).
+struct rasCollHistoryEntry { + union ncclSocketAddress rootAddr; + uint64_t rootId; +}; + +// Array keeping track of recently completed collectives (to avoid infinite loops). LRU-based replacement. +static struct rasCollHistoryEntry rasCollHistory[COLL_HISTORY_SIZE]; +static int nRasCollHistory, rasCollHistNextIdx; + +// Monotonically increased to ensure that each collective originating locally has a unique Id. +static uint64_t rasCollLastId; + +// Array keeping track of ongoing collective operations (apart from broadcasts, which have no response so require +// no such tracking). +struct rasCollective* rasCollectives; +static int nRasCollectives; + +static ncclResult_t getNewCollEntry(struct rasCollective** pColl); +static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, + const struct rasCollRequest* req, size_t reqLen, int fromConnIdx); +static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen); +static ncclResult_t rasCollReadyResp(struct rasCollective* coll); +static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, + const union ncclSocketAddress* rootAddr, uint64_t rootId, + const union ncclSocketAddress* peers, int nPeers, + const char* data, int nData, int nLegTimeouts); + +static ncclResult_t rasCollConnsInit(char** pData, int* pNData); +static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg); + +static ncclResult_t rasCollCommsInit(char** pData, int* pNData); +static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg); +static int ncclCommsCompare(const void* p1, const void* p2); + + +/////////////////////////////////////////////////////////////////////////////////////// +// Functions related to the initialization of collectives and the message exchanges. // +/////////////////////////////////////////////////////////////////////////////////////// + +// Returns the index of the first available entry in the rasCollectives array, enlarging the array if necessary. +static ncclResult_t getNewCollEntry(struct rasCollective** pColl) { + struct rasCollective* coll; + int i; + for (i = 0; i < nRasCollectives; i++) + if (rasCollectives[i].type == RAS_MSG_NONE) + break; + if (i == nRasCollectives) { + NCCLCHECK(ncclRealloc(&rasCollectives, nRasCollectives, nRasCollectives+RAS_INCREMENT)); + nRasCollectives += RAS_INCREMENT; + } + + coll = rasCollectives+i; + memset(coll, '\0', sizeof(*coll)); + coll->startTime = clockNano(); + coll->fromConnIdx = -1; + // We are unlikely to use the whole array, but at least we won't need to realloc. + NCCLCHECK(ncclCalloc(&coll->fwdConns, nRasConns)); + + *pColl = coll; + return ncclSuccess; +} + +// Initializes a collective request by giving it a unique ID. +void rasCollReqInit(struct rasCollRequest* req) { + memcpy(&req->rootAddr, &rasNetListeningSocket.addr, sizeof(req->rootAddr)); + req->rootId = ++rasCollLastId; +} + +// Sends a collective request message through all regular RAS network connections (effectively, broadcasts it). +// Also used for re-broadcasts (on peers receiving the request over the network). +// Checking for duplicates is the responsibility of the caller. +// For collectives other than broadcasts, initializes a rasCollective structure and fills it with local data, +// in preparation for collective response messages. +// pAllDone indicates on return if the collective operation is already finished, which is unusual, but possible +// in scenarios such as a total of two peers. 
+// pCollIdx provides on return an index of the allocated rasCollective structure to track this collective (unless +// it's a broadcast, which requires no such tracking). +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone, int* pCollIdx, + int fromConnIdx) { + struct rasCollective* coll = nullptr; + if (req->type >= RAS_COLL_CONNS) { + // Keep track of this collective operation so that we can handle the responses appropriately. + NCCLCHECK(getNewCollEntry(&coll)); + if (pCollIdx) + *pCollIdx = coll-rasCollectives; + memcpy(&coll->rootAddr, &req->rootAddr, sizeof(coll->rootAddr)); + coll->rootId = req->rootId; + coll->type = req->type; + coll->timeout = req->timeout; + coll->fromConnIdx = fromConnIdx; + if (ncclCalloc(&coll->peers, 1) == ncclSuccess) { + memcpy(coll->peers, &rasNetListeningSocket.addr, sizeof(*coll->peers)); + coll->nPeers = 1; + } + + // Collective-specific initialization of accumulated data (using local data for now). + if (req->type == RAS_COLL_CONNS) + (void)rasCollConnsInit(&coll->data, &coll->nData); + else if (req->type == RAS_COLL_COMMS) + (void)rasCollCommsInit(&coll->data, &coll->nData); + } else { // req->type < RAS_COLL_CONNS + // Add the info to the collective message history. + nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE); + memcpy(&rasCollHistory[rasCollHistNextIdx].rootAddr, &req->rootAddr, + sizeof(rasCollHistory[rasCollHistNextIdx].rootAddr)); + rasCollHistory[rasCollHistNextIdx].rootId = req->rootId; + rasCollHistNextIdx = (rasCollHistNextIdx + 1) % COLL_HISTORY_SIZE; + + // Collective-specific message handling. + if (req->type == RAS_BC_DEADPEER) { + bool done = false; + rasMsgHandleBCDeadPeer(req, &done); + if (done) + goto exit; + } + } // req->type < RAS_COLL_CONNS + + for (int connIdx = 0; connIdx < nRasConns; connIdx++) + rasConns[connIdx].linkFlag = false; + + (void)rasLinkSendCollReq(&rasNextLink, coll, req, reqLen, fromConnIdx); + (void)rasLinkSendCollReq(&rasPrevLink, coll, req, reqLen, fromConnIdx); + + if (coll && pAllDone) + *pAllDone = (coll->nFwdSent == coll->nFwdRecv); +exit: + return ncclSuccess; +} + +// Sends the collective message through all connections associated with this link (with the exception of the one +// the message came from, if any). +static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, + const struct rasCollRequest* req, size_t reqLen, int fromConnIdx) { + for (int i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { + struct rasConnection* conn = rasConns+linkConn->connIdx; + if (!conn->linkFlag) { + // We send collective messages through fully established and operational connections only. + if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) { + if (rasConnSendCollReq(conn, req, reqLen) == ncclSuccess && coll != nullptr) + coll->fwdConns[coll->nFwdSent++] = linkConn->connIdx; + } // if (conn->sockIdx != -1 && RAS_SOCK_READY) + conn->linkFlag = true; + } // if (!conn->linkFlag) + } // if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) + } // for (i) + + return ncclSuccess; +} + +// Sends a collective message down a particular connection.
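+// The resulting message is simply the RAS_MSG_COLLREQ header followed by a verbatim copy of the request, i.e., +// msgLen = rasMsgLength(RAS_MSG_COLLREQ) + reqLen (see below).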
+static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen) { + struct rasMsg* msg = nullptr; + int msgLen = rasMsgLength(RAS_MSG_COLLREQ) + reqLen; + + NCCLCHECK(rasMsgAlloc(&msg, msgLen)); + msg->type = RAS_MSG_COLLREQ; + memcpy(&msg->collReq, req, reqLen); + + rasConnEnqueueMsg(conn, msg, msgLen); + + return ncclSuccess; +} + +// Handles the RAS_MSG_COLLREQ collective message request on the receiver side. Primarily deals with duplicates and +// re-broadcasts the message to local peers, though in case of a very limited RAS network it might be done right away, +// in which case it can immediately send the response. +ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { + bool allDone = false; + int collIdx = -1; + assert(sock->connIdx != -1); + + // First check if we've already handled this request (through another connection). + for (int i = 0; i < nRasCollHistory; i++) { + // In principle we can use i to index the array but we convert it so that we check the most recent entries first. + int collHistIdx = (rasCollHistNextIdx + COLL_HISTORY_SIZE - 1 - i) % COLL_HISTORY_SIZE; + if (memcmp(&msg->collReq.rootAddr, &rasCollHistory[collHistIdx].rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && + msg->collReq.rootId == rasCollHistory[collHistIdx].rootId) { + if (msg->collReq.type >= RAS_COLL_CONNS) { + // Send an empty response so that the sender can account for it. The non-empty response has already been + // sent through the connection that we received the request through first. + NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); + } + goto exit; + } + } // for (i) + + if (msg->collReq.type >= RAS_COLL_CONNS) { + // Check if we're currently handling this collective request. + for (int i = 0; i < nRasCollectives; i++) { + struct rasCollective* coll = rasCollectives+i; + if (coll->type != RAS_MSG_NONE && + memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && + msg->collReq.rootId == coll->rootId) { + assert(msg->collReq.type == coll->type); + + // Send an empty response so that the sender can account for it. The non-empty response will be + // sent through the connection that we received the request through first. + NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); + goto exit; + } // if match + } // for (i) + } // if (msg->collReq.type >= RAS_COLL_CONNS) + + // Re-broadcast the message to my peers (minus the one it came from) and handle it locally. + NCCLCHECK(rasNetSendCollReq(&msg->collReq, rasCollDataLength(msg->collReq.type), &allDone, &collIdx, sock->connIdx)); + + if (msg->collReq.type >= RAS_COLL_CONNS && allDone) { + assert(collIdx != -1); + // We are a leaf process -- send the response right away. This can probably trigger only for the case of a total + // of two peers, and hence just one RAS connection, or during communication issues, because normally every peer + // has more than one connection so there should always be _some_ other peer to forward the request to. + NCCLCHECK(rasCollReadyResp(rasCollectives+collIdx)); + } +exit: + return ncclSuccess; +} + +// Sends a collective response back to the process we received the collective request from. 
+// Invoked when we are finished waiting for the collective responses from other peers (i.e., either there weren't +// any peers (unlikely), the peers sent their responses (likely), or we timed out). +static ncclResult_t rasCollReadyResp(struct rasCollective* coll) { + if (coll->fromConnIdx != -1) { + // For remotely-initiated collectives, send the response back. + NCCLCHECK(rasConnSendCollResp(rasConns+coll->fromConnIdx, &coll->rootAddr, coll->rootId, + coll->peers, coll->nPeers, coll->data, coll->nData, coll->nLegTimeouts)); + + // Add the identifying info to the collective message history. + nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE); + memcpy(&rasCollHistory[rasCollHistNextIdx].rootAddr, &coll->rootAddr, + sizeof(rasCollHistory[rasCollHistNextIdx].rootAddr)); + rasCollHistory[rasCollHistNextIdx].rootId = coll->rootId; + rasCollHistNextIdx = (rasCollHistNextIdx + 1) % COLL_HISTORY_SIZE; + + rasCollFree(coll); + } else { + // For locally-initiated collectives, invoke the client code again (which will release it, once finished). + NCCLCHECK(rasClientResume(coll)); + } + return ncclSuccess; +} + +// Sends a collective response via the connection we originally received the request from. The message should be +// a cumulative response from this process and all the processes that we forwarded the request to. +static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, + const union ncclSocketAddress* rootAddr, uint64_t rootId, + const union ncclSocketAddress* peers, int nPeers, + const char* data, int nData, int nLegTimeouts) { + struct rasMsg* msg = nullptr; + int msgLen = rasMsgLength(RAS_MSG_COLLRESP) + nPeers*sizeof(*peers); + int dataOffset = 0; + + if (nData > 0) { + ALIGN_SIZE(msgLen, alignof(int64_t)); + dataOffset = msgLen; + msgLen += nData; + } + + NCCLCHECK(rasMsgAlloc(&msg, msgLen)); + msg->type = RAS_MSG_COLLRESP; + memcpy(&msg->collResp.rootAddr, rootAddr, sizeof(msg->collResp.rootAddr)); + msg->collResp.rootId = rootId; + msg->collResp.nLegTimeouts = nLegTimeouts; + msg->collResp.nPeers = nPeers; + msg->collResp.nData = nData; + if (nPeers) + memcpy(msg->collResp.peers, peers, nPeers*sizeof(*msg->collResp.peers)); + if (nData) + memcpy(((char*)msg)+dataOffset, data, nData); + + rasConnEnqueueMsg(conn, msg, msgLen); + + return ncclSuccess; +} + +// Handles the collective response on the receiver side. Finds the corresponding rasCollective structure, merges +// the data from the response into the accumulated data. If all the responses have been accounted for, sends the +// accumulated response back. +ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { + int collIdx; + struct rasCollective* coll = nullptr; + char line[SOCKET_NAME_MAXLEN+1]; + + for (collIdx = 0; collIdx < nRasCollectives; collIdx++) { + coll = rasCollectives+collIdx; + if (coll->type != RAS_MSG_NONE && + memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && + msg->collResp.rootId == coll->rootId) + break; + } + if (collIdx == nRasCollectives) { + INFO(NCCL_RAS, "RAS failed to find a matching ongoing collective for response %s:%ld from %s!", + ncclSocketToString(&msg->collResp.rootAddr, line), msg->collResp.rootId, + ncclSocketToString(&sock->sock.addr, rasLine)); + goto exit; + } + + coll->nLegTimeouts += msg->collResp.nLegTimeouts; + assert(sock->connIdx != -1); + // Account for the received response in our collective operation tracking.
+ for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i] == sock->connIdx) { + coll->fwdConns[i] = -1; + break; + } + } + coll->nFwdRecv++; + if (msg->collResp.nData > 0) { + // Collective-specific merging of the response into locally accumulated data. + if (coll->type == RAS_COLL_CONNS) + NCCLCHECK(rasCollConnsMerge(coll, msg)); + else if (coll->type == RAS_COLL_COMMS) + NCCLCHECK(rasCollCommsMerge(coll, msg)); + } + // We merge the peers after merging the data, so that the data merge function can rely on peers being unchanged. + if (msg->collResp.nPeers > 0) { + NCCLCHECK(ncclRealloc(&coll->peers, coll->nPeers, coll->nPeers + msg->collResp.nPeers)); + memcpy(coll->peers+coll->nPeers, msg->collResp.peers, msg->collResp.nPeers * sizeof(*coll->peers)); + coll->nPeers += msg->collResp.nPeers; + } + + // If we received all the data we were waiting for, send our response back. + if (coll->nFwdSent == coll->nFwdRecv) + NCCLCHECK(rasCollReadyResp(coll)); +exit: + return ncclSuccess; +} + +// Removes a connection from all ongoing collectives. Called when a connection is experiencing a delay or is being +// terminated. +void rasCollsPurgeConn(int connIdx) { + for (int i = 0; i < nRasCollectives; i++) { + struct rasCollective* coll = rasCollectives+i; + if (coll->type != RAS_MSG_NONE) { + char line[SOCKET_NAME_MAXLEN+1]; + if (coll->fromConnIdx == connIdx) { + INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", + ncclSocketToString(&coll->rootAddr, line), coll->rootId, + ncclSocketToString(&rasConns[connIdx].addr, rasLine)); + rasCollFree(coll); + } else { + for (int j = 0; j < coll->nFwdSent; j++) { + if (coll->fwdConns[j] == connIdx) { + coll->fwdConns[j] = -1; + coll->nFwdRecv++; + coll->nLegTimeouts++; + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&rasConns[connIdx].addr, rasLine), ncclSocketToString(&coll->rootAddr, line), + coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + if (coll->nFwdSent == coll->nFwdRecv) + (void)rasCollReadyResp(coll); + break; + } + } // for (j) + } // coll->fromConnIdx != connIdx + } // !RAS_MSG_NONE + } // for (i) +} + +// Frees a rasCollective entry and any memory associated with it. +void rasCollFree(struct rasCollective* coll) { + free(coll->fwdConns); + coll->fwdConns = nullptr; + free(coll->peers); + coll->peers = nullptr; + free(coll->data); + coll->data = nullptr; + coll->fromConnIdx = -1; + coll->type = RAS_MSG_NONE; +} + +// Invoked from the main RAS thread loop to handle timeouts of the collectives. +// We obviously want to have a reasonable *total* timeout that the RAS client can rely on, but we don't have strict +// global coordination. So we have, in effect, two timeouts: soft (5s) and hard (10s). Soft equals the keep-alive +// timeout. +// When sending collective requests, we skip any connections that are experiencing delays. After the 5s timeout, we +// check again the status of all outstanding connections and if any is now delayed, we give up on it. +// That works fine for directly observable delays, but if the problematic connection is further away from us, all +// we can do is trust that the other peers will "do the right thing soon". However, if there is a cascade of +// problematic connections, they could still exceed the 5s total. So after 10s we give up waiting no matter what +// and send back whatever we have. 
Unfortunately, the peer that the RAS client is connected to will in all likelihood +// time out first, so at that point any delayed responses that eventually arrive are likely to be too late... +void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { + for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) { + struct rasCollective* coll = rasCollectives+collIdx; + if (coll->type == RAS_MSG_NONE || coll->timeout == 0) + continue; + + if (now - coll->startTime > coll->timeout) { + // We've exceeded the leg timeout. For all outstanding responses, check their connections. + if (!coll->timeoutWarned) { + INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", + ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, + (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); + coll->timeoutWarned = true; + } + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i] != -1) { + struct rasConnection* conn = rasConns+coll->fwdConns[i]; + char line[SOCKET_NAME_MAXLEN+1]; + if (!conn->experiencingDelays && conn->sockIdx != -1) { + struct rasSocket* sock = rasSockets+conn->sockIdx; + // Ensure that the connection is fully established and operational, and that the socket hasn't been + // re-created during the handling of the collective (which would suggest that the request may have been + // lost). + if (sock->status == RAS_SOCK_READY && sock->createTime < coll->startTime) + continue; + } + // In all other cases we declare a timeout so that we can (hopefully) recover. + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), + coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + coll->fwdConns[i] = -1; + coll->nFwdRecv++; + coll->nLegTimeouts++; + } // if (coll->fwdConns[i] != -1) + } // for (i) + if (coll->nFwdSent == coll->nFwdRecv) { + (void)rasCollReadyResp(coll); + } else { + // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they + // must be due to delays at other processes. Presumably those processes will give up waiting soon and the + // (incomplete) responses will arrive shortly, so we should wait a little longer. + if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { + // We've exceeded even the longer timeout, which is unexpected. Try to return whatever we have (though + // the originator of the collective, if it's not us, may have timed out already anyway). + INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, + (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); + coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; + coll->nFwdRecv = coll->nFwdSent; + (void)rasCollReadyResp(coll); + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); + } + } // conn->nFwdRecv < conn->nFwdSent + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); + } + } // for (collIdx) +} + + +///////////////////////////////////////////////////////////////////////// +// Functions related to the handling of the RAS_COLL_CONNS collective. 
// +///////////////////////////////////////////////////////////////////////// + +// Initializes the accumulated data with just the local data for now. +// For this particular collective, we keep some reduced statistical data (min/max/avg travel time) as well +// as connection-specific info in case we observed a negative min travel time (which, ideally, shouldn't happen, +// but the system clocks may not be perfectly in sync). +static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { + struct rasCollConns connsData = {.travelTimeMin = INT64_MAX, .travelTimeMax = INT64_MIN}; + struct rasCollConns* pConnsData; + + // Update the statistical data first and in the process also calculate how much connection-specific space we + // will need. + for (int i = 0; i < nRasConns; i++) { + struct rasConnection* conn = rasConns+i; + if (conn->inUse && conn->travelTimeCount > 0) { + if (connsData.travelTimeMin > conn->travelTimeMin) + connsData.travelTimeMin = conn->travelTimeMin; + if (connsData.travelTimeMax < conn->travelTimeMax) + connsData.travelTimeMax = conn->travelTimeMax; + connsData.travelTimeSum += conn->travelTimeSum; + connsData.travelTimeCount += conn->travelTimeCount; + connsData.nConns++; + if (conn->travelTimeMin < 0) + connsData.nNegativeMins++; + } + } + + *pNData = sizeof(connsData) + connsData.nNegativeMins*sizeof(*connsData.negativeMins); + NCCLCHECK(ncclCalloc(pData, *pNData)); + pConnsData = (struct rasCollConns*)*pData; + memcpy(pConnsData, &connsData, sizeof(*pConnsData)); + if (connsData.nNegativeMins > 0) { + for (int i = 0, negMinsIdx = 0; i < nRasConns; i++) { + struct rasConnection* conn = rasConns+i; + if (conn->inUse && conn->travelTimeMin < 0) { + struct rasCollConns::negativeMin* negativeMin = pConnsData->negativeMins+negMinsIdx; + memcpy(&negativeMin->source, &rasNetListeningSocket.addr, sizeof(negativeMin->source)); + memcpy(&negativeMin->dest, &conn->addr, sizeof(negativeMin->dest)); + negativeMin->travelTimeMin = conn->travelTimeMin; + negMinsIdx++; + } + assert(negMinsIdx <= connsData.nNegativeMins); + } + } + + return ncclSuccess; +} + +// Merges incoming collective RAS_COLL_CONNS response message into the local accumulated data. +static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg) { + struct rasCollConns* collData; + struct rasCollConns* msgData; + int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers); + ALIGN_SIZE(dataOffset, alignof(int64_t)); + + msgData = (struct rasCollConns*)(((char*)msg) + dataOffset); + collData = (struct rasCollConns*)coll->data; + + // Merge the stats. + if (collData->travelTimeMin > msgData->travelTimeMin) + collData->travelTimeMin = msgData->travelTimeMin; + if (collData->travelTimeMax < msgData->travelTimeMax) + collData->travelTimeMax = msgData->travelTimeMax; + collData->travelTimeSum += msgData->travelTimeSum; + collData->travelTimeCount += msgData->travelTimeCount; + collData->nConns += msgData->nConns; + + // Append the info about negative minimums. 
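+ // E.g., if 2 negativeMins entries have been accumulated so far and the response carries 3 more, the buffer is + // reallocated to sizeof(*collData) + 5*sizeof(*collData->negativeMins) bytes and the 3 incoming entries are + // copied right behind the existing ones (coll->data+coll->nData marks the end of the old buffer).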
+ if (msgData->nNegativeMins > 0) { + int nData = sizeof(*collData) + + (collData->nNegativeMins+msgData->nNegativeMins) * sizeof(*collData->negativeMins); + NCCLCHECK(ncclRealloc(&coll->data, coll->nData, nData)); + collData = (struct rasCollConns*)coll->data; + memcpy(coll->data+coll->nData, msgData->negativeMins, + msgData->nNegativeMins * sizeof(*collData->negativeMins)); + coll->nData = nData; + collData->nNegativeMins += msgData->nNegativeMins; + } + + return ncclSuccess; +} + + +///////////////////////////////////////////////////////////////////////// +// Functions related to the handling of the RAS_COLL_COMMS collective. // +///////////////////////////////////////////////////////////////////////// + +// Initializes the accumulated data with just the local data for now. +// For this particular collective, we keep for every communicator information about every rank, to help identify +// the missing ones and the discrepancies between the ones that did respond. +static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { + struct rasCollComms* commsData; + int nComms = 0, nRanks = 0; + std::lock_guard lock(ncclCommsMutex); + + // Start by counting the communicators so that we know how much space to allocate. + // We also need to sort the comms array, to make the subsequent merging easier, both between the ranks (in case + // of multiple GPUs per process) and between the peers. + if (!ncclCommsSorted) { + qsort(ncclComms, nNcclComms, sizeof(*ncclComms), &ncclCommsCompare); + ncclCommsSorted = true; + } + for (int i = 0; i < nNcclComms; i++) { + if (ncclComms[i] == nullptr) // nullptr's are always at the end after sorting. + break; + if (i == 0) { + nComms = 1; + } else if (ncclComms[i]->commHash != ncclComms[i-1]->commHash) { + nComms++; + } + nRanks++; + } + + // rasNetCollCommsData has nested variable-length arrays, which makes the size calculation and subsequent + // pointer manipulations somewhat unwieldy... + *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks); + NCCLCHECK(ncclCalloc(pData, *pNData)); + commsData = (struct rasCollComms*)*pData; + commsData->nComms = nComms; + + // comm points at the space in the accumulated data where the info about the current communicator is to be stored. + struct rasCollComms::comm* comm = commsData->comms; + for (int i = 0; i < nNcclComms; i++) { + struct rasCollComms::comm::rank* rank; + ncclResult_t asyncError; + if (ncclComms[i] == nullptr) + break; + if (i == 0 || ncclComms[i]->commHash != ncclComms[i-1]->commHash) { + if (i > 0) + comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); + comm->commHash = ncclComms[i]->commHash; + comm->commNRanks = ncclComms[i]->nRanks; + comm->nRanks = 0; + } else if (ncclComms[i]->nRanks != ncclComms[i-1]->nRanks) { + INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " + "possible commHash collision (0x%lx)", ncclComms[i-1]->nRanks, ncclComms[i]->nRanks, comm->commHash); + continue; // Short of failing, the best we can do is skip... + } else if (ncclComms[i]->rank == ncclComms[i-1]->rank) { + INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", + ncclComms[i]->rank, comm->commHash); + continue; // Short of failing, the best we can do is skip... 
+ } + if (comm->nRanks == comm->commNRanks) { + INFO(NCCL_RAS, + "RAS encountered more ranks than the communicator size (%d) -- possible commHash collision (0x%lx)", + comm->commNRanks, comm->commHash); + continue; // Short of failing, the best we can do is skip... + } + rank = comm->ranks+comm->nRanks; + rank->commRank = ncclComms[i]->rank; + // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially + // always 0. It will increase after we send this response back to the peer we got the request from. + rank->peerIdx = 0; + rank->collOpCount = ncclComms[i]->collOpCount; + rank->status.initState = ncclComms[i]->initState; + if (ncclCommGetAsyncError(ncclComms[i], &asyncError) == ncclSuccess) + rank->status.asyncError = asyncError; + rank->status.finalizeCalled = (ncclComms[i]->finalizeCalled != 0); + rank->status.destroyFlag = (ncclComms[i]->destroyFlag != 0); + rank->status.abortFlag = (__atomic_load_n(ncclComms[i]->abortFlag, __ATOMIC_ACQUIRE) != 0); + rank->cudaDev = ncclComms[i]->cudaDev; + rank->nvmlDev = ncclComms[i]->nvmlDev; + comm->nRanks++; + } + assert(nComms == 0 || ((char*)(comm->ranks+comm->nRanks)) - (char*)commsData <= *pNData); + + return ncclSuccess; +} + +// Merges incoming collective RAS_COLL_COMMS response message into the local accumulated data. +static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg) { + struct rasCollComms* collData; + struct rasCollComms* msgData; + int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers); + ALIGN_SIZE(dataOffset, alignof(int64_t)); + + msgData = (struct rasCollComms*)(((char*)msg) + dataOffset); + collData = (struct rasCollComms*)coll->data; + + if (msgData->nComms > 0) { + struct rasCollComms* newData = nullptr; + + // Allocate the new buffer pessimistically (sized as the sum of the two old ones). + NCCLCHECK(ncclCalloc((char**)&newData, coll->nData + msg->collResp.nData)); + struct rasCollComms::comm* collComm = collData->comms; + struct rasCollComms::comm* msgComm = msgData->comms; + struct rasCollComms::comm* newComm = newData->comms; + + for (int collIdx = 0, msgIdx = 0; collIdx < collData->nComms || msgIdx < msgData->nComms; newData->nComms++) { + int cmp; + if (collIdx < collData->nComms && msgIdx < msgData->nComms) + cmp = (collComm->commHash < msgComm->commHash ? -1 : (collComm->commHash > msgComm->commHash ? 1 : 0)); + else + cmp = (collIdx < collData->nComms ? -1 : 1); + + if (cmp == 0 && collComm->commNRanks != msgComm->commNRanks) { + INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " + "possible commHash collision (0x%lx)", collComm->commNRanks, msgComm->commNRanks, collComm->commHash); + cmp = (collComm->commNRanks < msgComm->commNRanks ? -1 : 1); + // We try to preserve both separately, although the input data might already be messed up anyway... + } + + if (cmp == 0) { + // Merge the comms. + newComm->commHash = collComm->commHash; + newComm->commNRanks = collComm->commNRanks; + if (collComm->nRanks + msgComm->nRanks > collComm->commNRanks) { + INFO(NCCL_RAS, + "RAS encountered more ranks (%d) than the communicator size (%d) -- possible commHash collision (0x%lx)", + collComm->nRanks + msgComm->nRanks, newComm->commNRanks, newComm->commHash); + // We'll skip the extras in the loop below. + } else { + newComm->nRanks = collComm->nRanks + msgComm->nRanks; + } + // Merge the ranks. 
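+ // The rank arrays are kept sorted by commRank (see rasCollCommsInit), so this is a plain two-way merge; + // e.g., local ranks {0,2} and incoming ranks {1,3} interleave into {0,1,2,3}. Entries taken from the message + // also get their peerIdx shifted by coll->nPeers, since the message's peers array is appended after ours + // (in rasMsgHandleCollResp).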
+ for (int newRankIdx = 0, collRankIdx = 0, msgRankIdx = 0; + collRankIdx < collComm->nRanks || msgRankIdx < msgComm->nRanks; + newRankIdx++) { + int cmpRank; + if (newRankIdx == newComm->commNRanks) + break; // Short of failing, the best we can do is skip... + if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) + cmpRank = (collComm->ranks[collRankIdx].commRank < msgComm->ranks[msgRankIdx].commRank ? -1 : + (collComm->ranks[collRankIdx].commRank > msgComm->ranks[msgRankIdx].commRank ? 1 : 0)); + else + cmpRank = (collRankIdx < collComm->nRanks ? -1 : 1); + + // There shouldn't be any overlaps in ranks between different sources. + if (cmpRank == 0) { + INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", + collComm->ranks[collRankIdx].commRank, newComm->commHash); + msgRankIdx++; // Short of failing, the best we can do is skip... + } + memcpy(newComm->ranks+newRankIdx, (cmpRank <= 0 ? collComm->ranks+collRankIdx++ : + msgComm->ranks+msgRankIdx++), sizeof(*newComm->ranks)); + if (cmpRank > 0) { + // peerIdx values from msgComm need to shift after merge. + newComm->ranks[newRankIdx].peerIdx += coll->nPeers; + } + } // for (newRankIdx) + newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks)); + collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks)); + collIdx++; + msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks)); + msgIdx++; + } else if (cmp < 0) { + // Copy from collComm. + int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks); + memcpy(newComm, collComm, commSize); + newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize); + collComm = (struct rasCollComms::comm*)(((char*)(collComm)) + commSize); + collIdx++; + } else { // cmp > 0 + // Copy from msgComm. + int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks); + memcpy(newComm, msgComm, commSize); + for (int i = 0; i < newComm->nRanks; i++) { + // peerIdx values from msgComm need to shift after merge. + newComm->ranks[i].peerIdx += coll->nPeers; + } + newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize); + msgComm = (struct rasCollComms::comm*)(((char*)(msgComm)) + commSize); + msgIdx++; + } // cmp > 0 + } // for (collIdx and msgIdx) + + free(coll->data); + coll->data = (char*)newData; + // newComm points at the next element beyond the last one -- exactly what we need. + coll->nData = ((char*)newComm) - (char*)newData; + } // if (msgData->nComms > 0) + + return ncclSuccess; +} + +// Sorting callback for the ncclComms array. +static int ncclCommsCompare(const void* p1, const void* p2) { + const ncclComm** pc1 = (const ncclComm**)p1; + const ncclComm** pc2 = (const ncclComm**)p2; + + // Put nullptr's at the end. + if (*pc1 == nullptr || *pc2 == nullptr) + return (*pc1 != nullptr ? -1 : (*pc2 != nullptr ? 1 : 0)); + + if ((*pc1)->commHash == (*pc2)->commHash) { + return ((*pc1)->rank < (*pc2)->rank ? -1 : ((*pc1)->rank > (*pc2)->rank ? 1 : 0)); + } else { + return ((*pc1)->commHash < (*pc2)->commHash ? -1 : 1); + } +} diff --git a/src/ras/peers.cc b/src/ras/peers.cc new file mode 100644 index 000000000..f2692d3e1 --- /dev/null +++ b/src/ras/peers.cc @@ -0,0 +1,960 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#define NDEBUG // Comment out during development only! +#include + +#include "alloc.h" +#include "checks.h" +#include "comm.h" +#include "nccl.h" +#include "ras_internal.h" + + +// All the known peer NCCL processes. The array is sorted by addr to ensure locality (within a node and hopefully +// also within a DC). The array may grow over time and it *includes* dead peers. +struct rasPeerInfo* rasPeers; +int nRasPeers; +// Hash of the rasPeers array, for figuring out when to sync with a remote peer. +uint64_t rasPeersHash; +// Index of this process within the rasPeers array (may change over time as the array grows). +static int myPeerIdx = -1; + +// Addresses of all the dead peers, sorted. In principle we could instead have a flag in rasPeerInfo for this, +// but we expect rasPeers to be largely static (and large at scale!) and rasDeadPeers to be fairly dynamic and +// much smaller, so we prefer to keep the dead info separately so that we don't end up sending the possibly large +// rasPeerInfo array around all the time. +union ncclSocketAddress* rasDeadPeers; +// The number of dead peers. +int nRasDeadPeers; +// The array size (may be larger than nRasDeadPeers). +static int rasDeadPeersSize; +// Hash of the rasDeadPeers array, for figuring out when to sync with a remote peer. +uint64_t rasDeadPeersHash; + +static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks, + struct rasPeerInfo** rankPeers, int *nRankPeers, int* newNRasPeers); +static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers = -1); + +static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, + struct rasRankInit* ranks = nullptr, int nranks = 0, int fromConnIdx = -1); +static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, + bool updateDeadPeers, struct rasRankInit* ranks, int nranks, + int fromConnIdx); +static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, + int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks); +ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock); + +static ncclResult_t rasLinkReinitConns(struct rasLink* link); + +static ncclResult_t rasDeadPeersUpdate(union ncclSocketAddress* updatePeers, int* nUpdatePeers); +static ncclResult_t getNewDeadEntry(union ncclSocketAddress** pAddr); + +static int rasAddrRankInitCompare(const void* k, const void* e); +static int rasAddrPeerInfoCompare(const void* k, const void* e); +static int rasRanksCompare(const void* e1, const void* e2); + +static void rasPeersDump(); +static void rasDeadPeersDump(); +static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nres); + + +///////////////////////////////////////////////////////////////////////////// +// Functions related to the handling of local RAS_ADD_RANKS notifications. // +///////////////////////////////////////////////////////////////////////////// + +// Handles RAS_ADD_RANKS notification -- adds new ranks to the internal list of all RAS peers, reconfigures RAS +// network connections, and notifies the peers. 
+ncclResult_t rasLocalHandleAddRanks(struct rasRankInit* ranks, int nranks) { + ncclResult_t ret = ncclSuccess; + + INFO(NCCL_RAS, "RAS handling local addRanks request (old nRasPeers %d)", nRasPeers); + + // Convert the input rasRankInit structures into our internal rasPeerInfo. + struct rasPeerInfo* rankPeers = nullptr; + int nRankPeers; + int newNRasPeers; + NCCLCHECKGOTO(rasRanksConvertToPeers(ranks, nranks, &rankPeers, &nRankPeers, &newNRasPeers), ret, fail); + + // Update local rasPeers. + NCCLCHECKGOTO(rasPeersUpdate(rankPeers, &nRankPeers, newNRasPeers), ret, fail); + + INFO(NCCL_RAS, "RAS finished local processing of addRanks request (new nRasPeers %d, nRankPeers %d)", + nRasPeers, nRankPeers); + // Print peers only if something changed and we're the "root". + if (nRankPeers > 0 && memcmp(&ranks[0].addr, &rasNetListeningSocket.addr, sizeof(ranks[0].addr)) == 0) + rasPeersDump(); + + // Propagate the changes through our RAS network links. + NCCLCHECKGOTO(rasNetUpdatePeers(rankPeers, nRankPeers, /*updateDeadPeers*/false, ranks, nranks), ret, fail); + +exit: + if (rankPeers) + free(rankPeers); + free(ranks); + return ret; +fail: + goto exit; +} + +// Converts the rasRankInit structure into rasPeerInfo. This skips empty elements (in case of errors), orders +// elements by the address/cudaDev, and merges elements with duplicate addresses (in case of multiple CUDA devices per +// process). In the process we also calculate how large the merged rasPeers array will need to be. +static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks, + struct rasPeerInfo** rankPeers, int *nRankPeers, int* newNRasPeers) { + ncclResult_t ret = ncclSuccess; + int peerIdx, rankPeerIdx; + + // Handy when checking for empty (in case of errors) addresses. + union ncclSocketAddress emptyAddr; + memset(&emptyAddr, '\0', sizeof(emptyAddr)); + + // Begin by sorting the array by address and cudaDev (to match the rasPeers order). + qsort(ranks, nranks, sizeof(*ranks), &rasRanksCompare); + + // We over-allocate peers here because to get an accurate count we would need to loop over the ranks first... + // nRankPeers will hold the actual count of used elements. + *rankPeers = nullptr; + NCCLCHECKGOTO(ncclCalloc(rankPeers, nranks), ret, fail); + + peerIdx = rankPeerIdx = 0; + *newNRasPeers = nRasPeers; + for (int rankIdx = 0; rankIdx < nranks; rankIdx++) { + const struct rasRankInit* rank = ranks+rankIdx; + struct rasPeerInfo* rankPeer = *rankPeers+rankPeerIdx; + + if (memcmp(&emptyAddr, &rank->addr, sizeof(emptyAddr)) == 0) { + // Skip empty rank entries. + continue; + } + + // First check if the rank doesn't need to be merged into the previous entry in rankPeers + // (possible if there are multiple ranks with the same address). + if (rankPeerIdx > 0 && memcmp(&rank->addr, &rankPeer[-1].addr, sizeof(rank->addr)) == 0) { + // Merge into the previous entry in peers. + rankPeer[-1].cudaDevs |= (1UL << rank->cudaDev); + rankPeer[-1].nvmlDevs |= (1UL << rank->nvmlDev); + continue; + } + + // Add a new entry to rankPeers. + assert(rankPeerIdx < nranks); + memcpy(&rankPeer->addr, &rank->addr, sizeof(rankPeer->addr)); + rankPeer->pid = rank->pid; + rankPeer->cudaDevs = (1UL << rank->cudaDev); + rankPeer->nvmlDevs = (1UL << rank->nvmlDev); + rankPeerIdx++; + + // Also check if there is already an entry with that address in the global rasPeers so that the caller can know how + // many more entries will be needed. 
+ const struct rasPeerInfo* rasPeer = rasPeers+peerIdx; + int cmp = 0; + while (peerIdx < nRasPeers) { + cmp = ncclSocketsCompare(&rank->addr, &rasPeer->addr); + if (cmp <= 0) + break; + peerIdx++; + rasPeer++; + } + if (peerIdx == nRasPeers) { + // The current rank is "greater than" all existing peers, so it will need a new entry. We stay in the loop so + // that we don't need to handle the remaining ranks separately. + (*newNRasPeers)++; + continue; + } + if (cmp < 0) { + (*newNRasPeers)++; + } else { + // Duplicates (cmp == 0) between the rank array and the peers array will be merged. + assert(rank->pid == rasPeer->pid); + } + } + assert(peerIdx <= nRasPeers); + *nRankPeers = rankPeerIdx; + +exit: + return ret; +fail: + if (*rankPeers) { + free(*rankPeers); + *rankPeers = nullptr; + } + goto exit; +} + +// Updates the rasPeers array with the new data. The new data gets updated in the process as well: any data that +// wasn't actually new is purged, so as to minimize the amount of data we forward to our peers. +// On a successful return, nRankPeers contains the number of entries that were updated. +static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers) { + ncclResult_t ret = ncclSuccess; + int rankPeerIdxDst; + int rankPeerIdx, peerIdx; + + if (newNRasPeers == -1) { + // First calculate the new size of rasPeers. + newNRasPeers = nRasPeers; + for (rankPeerIdx = peerIdx = 0; rankPeerIdx < *nRankPeers; rankPeerIdx++) { + struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx; + struct rasPeerInfo* rasPeer = rasPeers+peerIdx; + int cmp = 1; + + while (peerIdx < nRasPeers) { + cmp = ncclSocketsCompare(&rankPeer->addr, &rasPeer->addr); + + if (cmp < 0) { + // rankPeer will go in front of rasPeer. + newNRasPeers++; + break; + } + + peerIdx++; + rasPeer++; + + if (cmp == 0) + break; + } + if (cmp > 0) // No more rasPeer entries -- rankPeer will go at the end. + newNRasPeers++; + } + } + + // If needed, allocate a new, larger rasPeers array. + struct rasPeerInfo* newRasPeers; + int myNewPeerIdx; + if (newNRasPeers > nRasPeers) { + NCCLCHECKGOTO(ncclCalloc(&newRasPeers, newNRasPeers), ret, fail); + } else { + newRasPeers = rasPeers; + } + + // Now merge the rankPeers into newRasPeers. In the process, modify rankPeers to become a "diff" between + // the old rasPeers and newRasPeers -- this will be the data structure to broadcast on the RAS network. + myNewPeerIdx = -1; + int newPeerIdx; + for (newPeerIdx = rankPeerIdx = peerIdx = 0; rankPeerIdx < *nRankPeers || peerIdx < nRasPeers;) { + struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx; + struct rasPeerInfo* rasPeer = rasPeers+peerIdx; + struct rasPeerInfo* newRasPeer = newRasPeers+newPeerIdx; + + if (rankPeerIdx < *nRankPeers) { + if (peerIdx < nRasPeers) { + int cmp = ncclSocketsCompare(&rankPeer->addr, &rasPeer->addr); + + if (cmp < 0) { + // rankPeer needs to occur before rasPeer -- that's possible only if we are adding new entries. + assert(newRasPeers != rasPeers); + // Add new entry to newRasPeers. + assert(newPeerIdx < newNRasPeers); + memcpy(newRasPeer, rankPeer, sizeof(*newRasPeer)); + newPeerIdx++; + rankPeerIdx++; + } + else { + // cmp >= 0 -- Start by copying peer to newRasPeer, if needed. + if (newRasPeers != rasPeers) { + assert(newPeerIdx < newNRasPeers); + memcpy(newRasPeer, rasPeer, sizeof(*newRasPeer)); + } + else { // in-place + assert(newRasPeer == rasPeer); + } + + if (cmp == 0) { + // The address of rankPeer is the same as that of newRasPeer -- merge into it. 
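+ // (For illustration: newRasPeer->cudaDevs of 0b0011 merged with rankPeer->cudaDevs of 0b0110 yields 0b0111, + // and only bit 2 (0b0100) counts as newly contributed.)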
+ // First though calculate what GPUs from rankPeer are actually new (if any). + uint64_t newDevs = rankPeer->cudaDevs & ~newRasPeer->cudaDevs; + newRasPeer->cudaDevs |= rankPeer->cudaDevs; + // Update rankPeer->cudaDevs with the newly added devs only -- we'll clean it up at the end. + rankPeer->cudaDevs = newDevs; + // Repeat for nvmlDevs... + newDevs = rankPeer->nvmlDevs & ~newRasPeer->nvmlDevs; + newRasPeer->nvmlDevs |= rankPeer->nvmlDevs; + rankPeer->nvmlDevs = newDevs; + rankPeerIdx++; + } + // Given that we might've added new entries, we need to update myPeerIdx as well. + if (myPeerIdx == peerIdx) + myNewPeerIdx = newPeerIdx; + peerIdx++; + newPeerIdx++; + } + } else { // peerIdx == nRasPeers + // No more rasPeers -- add a new entry based on rank. + assert(newPeerIdx < newNRasPeers); + memcpy(newRasPeer, rankPeer, sizeof(*newRasPeer)); + // If this is the first time this function is run, myPeerIdx will need to be set. It's more work in that + // case as we need to compare the addresses of each peer until we find one. + if (myPeerIdx == -1 && memcmp(&newRasPeer->addr, &rasNetListeningSocket.addr, sizeof(newRasPeer->addr)) == 0) + myNewPeerIdx = newPeerIdx; + newPeerIdx++; + rankPeerIdx++; + } + } else { // rankPeerIdx == *nRankPeers + // No more rankPeers -- copy the rasPeer over if needed. + if (newRasPeers != rasPeers) { + assert(newPeerIdx < newNRasPeers); + memcpy(newRasPeer, rasPeer, sizeof(*newRasPeer)); + } + else { // in-place at the end. + assert(newRasPeer == rasPeer); + } + if (myPeerIdx == peerIdx) + myNewPeerIdx = newPeerIdx; + peerIdx++; + newPeerIdx++; + } + } + assert(newPeerIdx == newNRasPeers); + + if (newRasPeers != rasPeers) { + if (rasPeers) + free(rasPeers); + rasPeers = newRasPeers; + nRasPeers = newNRasPeers; + assert(myNewPeerIdx != -1); + myPeerIdx = myNewPeerIdx; + } else { + assert(myNewPeerIdx == myPeerIdx); + } + rasPeersHash = getHash((const char*)rasPeers, nRasPeers*sizeof(*rasPeers)); + + // Purge from rankPeers all entries that didn't actually contribute any new GPUs. + for (rankPeerIdx = rankPeerIdxDst = 0; rankPeerIdx < *nRankPeers; rankPeerIdx++) { + struct rasPeerInfo* rankPeer = rankPeers+rankPeerIdx; + if (rankPeer->cudaDevs != 0) { + if (rankPeerIdxDst != rankPeerIdx) { + memcpy(rankPeers+rankPeerIdxDst, rankPeer, sizeof(*rankPeers)); + } + rankPeerIdxDst++; + } + } + assert(rankPeerIdxDst <= *nRankPeers); + *nRankPeers = rankPeerIdxDst; + +exit: + return ret; +fail: + goto exit; +} + +// Searches through rasPeers given the peer address. Returns the index of the found entry in the rasPeers +// array or -1 if not found. +int rasPeerFind(const union ncclSocketAddress* addr) { + struct rasPeerInfo* peer = (struct rasPeerInfo*)bsearch(addr, rasPeers, nRasPeers, sizeof(*rasPeers), + rasAddrPeerInfoCompare); + return (peer ? peer-rasPeers : -1); +} + + +///////////////////////////////////////////////////////////////////////////////// +// Functions related to the propagation of peers updates over the RAS network. // +///////////////////////////////////////////////////////////////////////////////// + +// Propagates information about new peers through the RAS network links. +// ranks -- if provided -- lists all the peers who are already aware of this update (because they are the members +// of the new communicator being established), and who thus don't need to be notified. updateDeadPeers can +// be used, however, to request at least the propagation of rasDeadPeers to such peers.
+// fromConnIdx -- if provided -- identifies the connection used to receive this update; there's no need to +// propagate the update back through it. +// Reconfigures the RAS network to accommodate the newly added peers, by modifying the links and establishing new +// connections as needed. +static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, + struct rasRankInit* ranks, int nranks, int fromConnIdx) { + ncclResult_t ret = ncclSuccess; + + // Do we actually have anything to do? + if (nNewPeers == 0 && !updateDeadPeers) + goto exit; + + // Start by propagating the update through the RAS network links. We consider any errors during this process + // to be non-fatal (we can re-sync later around a keep-alive exchange). + (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); + (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); + + // Calculate new link peers and open new connections if needed. + NCCLCHECKGOTO(rasLinkReinitConns(&rasNextLink), ret, fail); + NCCLCHECKGOTO(rasLinkReinitConns(&rasPrevLink), ret, fail); + +exit: + return ret; +fail: + goto exit; +} + +// Sends a peers update through all the connections associated with a particular link. See rasNetUpdatePeers +// for the explanation of the function arguments. +static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, + bool updateDeadPeers, struct rasRankInit* ranks, int nranks, + int fromConnIdx) { + for (int i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + // Note that we don't send the update via the connection that we received this notification from in the first + // place (while it wouldn't loop indefinitely, it would add a needless extra exchange). + if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { + struct rasConnection* conn = rasConns+linkConn->connIdx; + // Failed propagations are not considered fatal (we will retry after a keep-alive). + (void)rasConnPropagateUpdate(conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); + } + } + + return ncclSuccess; +} + +// Sends a peers update down a particular connection. See rasNetUpdatePeers for the explanation of the function +// arguments. +static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, + int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks) { + if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + // If we have the rank info, check if the peer on the other side of this connection has participated in the new + // communicator. + int connRank = -1; + if (ranks && !updateDeadPeers) { + struct rasRankInit* rank = (struct rasRankInit*)bsearch(&conn->addr, ranks, nranks, sizeof(*ranks), + rasAddrRankInitCompare); + if (rank) + connRank = rank-ranks; + } + if (connRank < 0) { + // It did not participate or we don't know -- we should send an update to that peer then. + NCCLCHECK(rasConnSendPeersUpdate(conn, newPeers, nNewPeers)); + } + } + + return ncclSuccess; +} + +// Sends a RAS_MSG_PEERSUPDATE message, which can include both the rasPeers (preferably only the newly added peers +// rather than the complete rasPeers array, to save on the network bandwidth) and rasDeadPeers (sent in its entirety +// if at all, as it's assumed to be a lot smaller than rasPeers).
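+// The resulting message layout (when both parts are included) is: the RAS_MSG_PEERSUPDATE header, then nPeers +// rasPeerInfo entries, then padding up to alignof(union ncclSocketAddress), then nDeadPeers socket addresses.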
+ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct rasPeerInfo* peers, int nPeers) { + struct rasMsg* msg = nullptr; + int msgLen; + int deadPeersOffset = 0; + int nDeadPeers; + + if (conn->lastSentPeersHash == rasPeersHash || conn->lastRecvPeersHash == rasPeersHash) { + nPeers = 0; + } + if (conn->lastSentDeadPeersHash == rasDeadPeersHash || conn->lastRecvDeadPeersHash == rasDeadPeersHash) { + nDeadPeers = 0; + } else { + // We expect the rasDeadPeers array to be much smaller than rasPeers so if we send it, we send it in full. + nDeadPeers = nRasDeadPeers; + } + + if (nPeers == 0 && nDeadPeers == 0) + goto exit; + + msgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + nPeers*sizeof(*peers); + if (nDeadPeers > 0) { + ALIGN_SIZE(msgLen, alignof(union ncclSocketAddress)); + deadPeersOffset = msgLen; + msgLen += nDeadPeers*sizeof(*rasDeadPeers); + } + + NCCLCHECK(rasMsgAlloc(&msg, msgLen)); + msg->type = RAS_MSG_PEERSUPDATE; + msg->peersUpdate.peersHash = rasPeersHash; + msg->peersUpdate.nPeers = nPeers; + msg->peersUpdate.deadPeersHash = rasDeadPeersHash; + msg->peersUpdate.nDeadPeers = nDeadPeers; + memcpy(msg->peersUpdate.peers, peers, nPeers * sizeof(msg->peersUpdate.peers[0])); + memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); + + if (nPeers > 0) + conn->lastSentPeersHash = rasPeersHash; + if (nDeadPeers > 0) + conn->lastSentDeadPeersHash = rasDeadPeersHash; + + INFO(NCCL_RAS, "RAS sending a peersUpdate to %s (nPeers %d, nDeadPeers %d)", + ncclSocketToString(&conn->addr, rasLine), nPeers, nDeadPeers); + + rasConnEnqueueMsg(conn, msg, msgLen); +exit: + return ncclSuccess; +} + +// Handles the RAS_MSG_PEERSUPDATE message on the receiver side. The received data is merged into the local +// rasPeers and rasDeadPeers arrays. If the checksums of the resulting arrays don't match those from the message, +// sends its own RAS_MSG_PEERSUPDATE back to the source, to ensure a sync. +// Subsequently propagates the update to its own peers. +ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) { + ncclResult_t ret = ncclSuccess; + struct rasMsg* newMsg = nullptr; + int newMsgLen = 0; + assert(sock->connIdx != -1); + struct rasConnection* conn = rasConns+sock->connIdx; + int nPeers, nDeadPeers; + int deadPeersOffset = 0; + bool updatePeers, updateDeadPeers; + + INFO(NCCL_RAS, "RAS handling peersUpdate from %s (peersHash 0x%lx, deadPeersHash 0x%lx, nPeers %d, nDeadPeers %d)", + ncclSocketToString(&sock->sock.addr, rasLine), msg->peersUpdate.peersHash, msg->peersUpdate.deadPeersHash, + msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers); + INFO(NCCL_RAS, "RAS my old rasPeersHash 0x%lx, rasDeadPeersHash 0x%lx, nRasPeers %d, nRasDeadPeers %d", + rasPeersHash, rasDeadPeersHash, nRasPeers, nRasDeadPeers); + conn->lastRecvPeersHash = msg->peersUpdate.peersHash; + conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; + + // Prepare ours to send back. We don't enqueue it right away because we want to make sure first that we need + // to send it. We'll find out by comparing the hash values after the merge. + // We want to prepare the message pre-merge though because post-merge it will include the just received new peers, + // and it's pointless to send those back to where they just came from. + // nPeers and nDeadPeers are used primarily for message length calculations, so they have to assume the worst-case + // scenario (e.g., no overlap in case of nDeadPeers). + nPeers = (msg->peersUpdate.peersHash != rasPeersHash ? 
nRasPeers : 0); + nDeadPeers = (msg->peersUpdate.deadPeersHash != rasDeadPeersHash ? nRasDeadPeers+msg->peersUpdate.nDeadPeers : 0); + if (nPeers > 0 || nDeadPeers > 0) { + newMsgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + nPeers*sizeof(*rasPeers); + if (nDeadPeers > 0) { + ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress)); + newMsgLen += nDeadPeers*sizeof(*rasDeadPeers); + } + NCCLCHECKGOTO(rasMsgAlloc(&newMsg, newMsgLen), ret, fail); + newMsg->type = RAS_MSG_PEERSUPDATE; + // Note that after rasPeersUpdate below we may still decide not to send the peers. + memcpy(newMsg->peersUpdate.peers, rasPeers, nPeers * sizeof(newMsg->peersUpdate.peers[0])); + newMsg->peersUpdate.nPeers = nPeers; + + if (nDeadPeers > 0) { + // Calculate the offset where dead peers are stored in the received message. We do it before the peers + // update because it could modify msg->peersUpdate.nPeers... + deadPeersOffset = rasMsgLength(RAS_MSG_PEERSUPDATE) + msg->peersUpdate.nPeers * sizeof(msg->peersUpdate.peers[0]); + ALIGN_SIZE(deadPeersOffset, alignof(union ncclSocketAddress)); + } + + if (nPeers > 0) + NCCLCHECKGOTO(rasPeersUpdate(msg->peersUpdate.peers, &msg->peersUpdate.nPeers), ret, fail); + else + msg->peersUpdate.nPeers = 0; + if (nDeadPeers > 0) + NCCLCHECKGOTO(rasDeadPeersUpdate((union ncclSocketAddress*)(((char*)msg)+deadPeersOffset), + &msg->peersUpdate.nDeadPeers), ret, fail); + else + msg->peersUpdate.nDeadPeers = 0; + + INFO(NCCL_RAS, "RAS finished local processing of peersUpdate " + "(new nRasPeers %d, nRasDeadPeers %d, nPeers %d, nDeadPeers %d)", + nRasPeers, nRasDeadPeers, msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers); + if (msg->peersUpdate.nPeers > 0) + rasPeersDump(); + if (msg->peersUpdate.nDeadPeers > 0) + rasDeadPeersDump(); + + // If post-merge the hashes are still different, send our (dead) peers back. + updatePeers = (conn->lastSentPeersHash != rasPeersHash && conn->lastRecvPeersHash != rasPeersHash); + updateDeadPeers = (conn->lastSentDeadPeersHash != rasDeadPeersHash && + conn->lastRecvDeadPeersHash != rasDeadPeersHash); + if (updatePeers || updateDeadPeers) { + newMsg->peersUpdate.peersHash = rasPeersHash; + newMsg->peersUpdate.deadPeersHash = rasDeadPeersHash; + if (updatePeers) { + assert(nPeers > 0); + conn->lastSentPeersHash = rasPeersHash; + } else { + // If hashes match, make sure that we don't send the rasPeers back. + newMsg->peersUpdate.nPeers = 0; + } + + // We need to recalculate the message size from scratch now that both rasPeers and rasDeadPeers may have changed. + newMsgLen = rasMsgLength(RAS_MSG_PEERSUPDATE) + newMsg->peersUpdate.nPeers * sizeof(*rasPeers); + + if (updateDeadPeers) { + assert(nRasDeadPeers > 0); + conn->lastSentDeadPeersHash = rasDeadPeersHash; + + ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress)); + deadPeersOffset = newMsgLen; + newMsgLen += nRasDeadPeers*sizeof(*rasDeadPeers); + + memcpy(((char*)newMsg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); + conn->lastSentDeadPeersHash = rasDeadPeersHash; + newMsg->peersUpdate.nDeadPeers = nRasDeadPeers; + } else { + newMsg->peersUpdate.nDeadPeers = 0; + } + + INFO(NCCL_RAS, "RAS sending back a peersUpdate (nPeers %d, nDeadPeers %d)", + newMsg->peersUpdate.nPeers, newMsg->peersUpdate.nDeadPeers); + + rasConnEnqueueMsg(conn, newMsg, newMsgLen); + newMsg = nullptr; + } // if (updatePeers || updateDeadPeers) + + // Propagate the changes through our RAS network links. 
+ NCCLCHECKGOTO(rasNetUpdatePeers(msg->peersUpdate.peers, msg->peersUpdate.nPeers, updateDeadPeers, nullptr, 0, + sock->connIdx), ret, fail); + } + +exit: + rasMsgFree(newMsg); + return ret; +fail: + goto exit; +} + + +////////////////////////////////////////////////////////////////////////////////////////// +// Functions related to the (re-)configuration of RAS connections after a peers update. // +////////////////////////////////////////////////////////////////////////////////////////// + +// Reinitializes the connection(s) of a particular link, following a peers update. +// Adding new peers can affect the calculation of the link's primary connection and also the fallbacks. +// The newly added peers could also shift all the existing peerIdx values, invalidating the values in RasLinkConn +// structures, so it's better to drop it all and recalculate from scratch. +// We recalculate the primary peer; if an active connection to it already exists, then we're done. If there +// is no connection, we create one. If a connection exists but is experiencing delays then we add a fallback and +// the process repeats. +// External conns are dropped from the links as well (they will be re-created via keepAlive messages as needed). +static ncclResult_t rasLinkReinitConns(struct rasLink* link) { + struct rasLinkConn* linkConn; + struct rasConnection* conn = nullptr; + int newPeerIdx = myPeerIdx; + + if (link->connsSize == 0) { + link->connsSize = RAS_INCREMENT; + NCCLCHECK(ncclCalloc(&link->conns, link->connsSize)); + } + link->nConns = 0; + + // Establish a connection for this link. We iterate as long as the connections we find are experiencing delays. + while (newPeerIdx != -1) { + if (link->nConns == link->connsSize) { + NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); + link->connsSize += RAS_INCREMENT; + } + + newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/link->nConns > 1); + if (newPeerIdx == -1) { + INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); + if (link->nConns > 0) + break; + } + linkConn = link->conns+link->nConns; + linkConn->peerIdx = newPeerIdx; + linkConn->connIdx = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : -1); + linkConn->external = false; + + // If the calculated connection does not exist, then we are at the end of the chain and this is the last iteration. + // Depending on the circumstances, we may first need to create that connection. + if (linkConn->connIdx == - 1) { + if (link->nConns == 0) { + if (linkConn->peerIdx != -1) { + INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", + link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"), + ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) + // to avoid races and the creation of duplicate connections. + if (myPeerIdx < linkConn->peerIdx) { + NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx)); + } + else { // If we didn't initiate the connection, start the timeout. 
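+            // lastUpdatePeersTime arms a deadline for the lower-address peer to connect to us; if it never
+            // does, the periodic timeout handling (presumably rasNetHandleTimeouts()) can open the connection
+            // from this side after all.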
+ link->lastUpdatePeersTime = clockNano(); + } + } // if (linkConn->peerIdx != -1) + } else { // link->nConns > 0 + INFO(NCCL_RAS, "RAS link %d: opening new fallback connection %d with %s", + link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &linkConn->connIdx)); + } // link->nConns > 0 + } else { // linkConn->connIdx != -1 + if (link->nConns == 0) { + INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", + link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + } else { + INFO(NCCL_RAS, "RAS link %d: calculated existing fallback connection %d with %s", + link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + } + } + link->nConns++; + if (linkConn->connIdx == -1) + break; + conn = rasConns+linkConn->connIdx; + + // We check if the connection already went through the fallback calculation; if so, we'll need to create a new + // fallback in the next iteration, to ensure that RAS will keep retrying. + if (!conn->experiencingDelays) + break; + + INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", + conn->experiencingDelays, (clockNano()-conn->startRetryTime)/1e9, + (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + } + + return ncclSuccess; +} + +// Calculates the index of the peer on the RAS network. Can also be used to calculate the index of the next fallback +// peer. +// In the simplest case we want to try the "next closest" fallback, although we still need to check for and skip +// any dead peers. +// For fallbacks to fallbacks, we also apply a more pessimistic policy. We skip all the remaining RAS threads that +// are on the same node as the previous fallback (unless it's the same node that we're running on or we have strong +// indications that the node is up). We do that to avoid having to excessively wait iterating through, say, 8 +// processes when a whole node might be down. +int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallback) { + int newPeerIdx = (peerIdx + link->direction + nRasPeers) % nRasPeers; + do { + if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) { + // peerIdx is a fallback and it is not running on the same node as us. + int tryPeerIdx = newPeerIdx; + int tryConnIdx = -1; + + // Try to skip the remaining peers on the same node as peerIdx. We may end up skipping over some peers that + // are alive, which is fine -- they will still have connectivity with the rest of the RAS network, just a + // little suboptimal one. + while (ncclSocketsSameNode(&rasPeers[tryPeerIdx].addr, &rasPeers[peerIdx].addr)) { + if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) { + tryConnIdx = rasConnFind(&rasPeers[tryPeerIdx].addr); + if (tryConnIdx != -1) { + struct rasConnection* tryConn = rasConns+tryConnIdx; + // Check if the connection is fully established and operational, i.e., if the underlying socket + // is ready and there's been recent communication on it. + if (tryConn->sockIdx != -1 && rasSockets[tryConn->sockIdx].status == RAS_SOCK_READY && + !tryConn->experiencingDelays) { + // We convinced ourselves that the node is not down. We don't adjust newPeerIdx in + // this case. This is the only case when tryConnIdx != -1 after this loop. 
+ break; + } + } // if (tryConnIdx != -1) + } // if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) + + tryConnIdx = -1; + tryPeerIdx = (tryPeerIdx + nRasPeers + link->direction) % nRasPeers; + if (tryPeerIdx == myPeerIdx) + break; + } + + if (tryConnIdx == -1) + newPeerIdx = tryPeerIdx; + if (tryPeerIdx == myPeerIdx) + break; + } // if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) + + if (rasPeerIsDead(&rasPeers[newPeerIdx].addr)) { + newPeerIdx = (newPeerIdx + nRasPeers + link->direction) % nRasPeers; + } + else + break; + } while (newPeerIdx != myPeerIdx); + + return (newPeerIdx != myPeerIdx ? newPeerIdx : -1); +} + + +////////////////////////////////////////////////////// +// Functions related to the handling of dead peers. // +////////////////////////////////////////////////////// + +// Marks a peer as dead in the local rasDeadPeers array. Any propagation, reconfiguration, etc., needs to be +// handled outside of this function. +ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr) { + union ncclSocketAddress* deadAddr; + + if (!rasPeerIsDead(addr)) { + NCCLCHECK(getNewDeadEntry(&deadAddr)); + memcpy(deadAddr, addr, sizeof(*deadAddr)); + qsort(rasDeadPeers, nRasDeadPeers, sizeof(*rasDeadPeers), &ncclSocketsCompare); + + rasDeadPeersHash = getHash((const char*)rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers)); + + INFO(NCCL_RAS, "RAS declaring peer %s as DEAD; rasDeadPeersHash 0x%lx", + ncclSocketToString(addr, rasLine), rasDeadPeersHash); + } + return ncclSuccess; +} + +// Invoked when an incoming RAS_MSG_PEERSUPDATE includes info on dead peers. Updates the rasDeadPeers array. +// Any propagation needs to be handled outside of this function, though it *does* disconnect any connections +// with the newly dead peers. +// On return, nUpdatePeers contains the number of newly added dead entries. +static ncclResult_t rasDeadPeersUpdate(union ncclSocketAddress* updatePeers, int* nUpdatePeers) { + static union ncclSocketAddress* newPeers = nullptr; + static union ncclSocketAddress* oldPeers; + + if (*nUpdatePeers == 0) + return ncclSuccess; + + // Pessimistically estimate the new size of rasDeadPeers. + int nNewPeers = nRasDeadPeers + *nUpdatePeers; + if (nNewPeers > rasDeadPeersSize) { + nNewPeers = ROUNDUP(nNewPeers, RAS_INCREMENT); + + NCCLCHECK(ncclCalloc(&newPeers, nNewPeers)); + oldPeers = rasDeadPeers; + } else { + // We don't need to allocate a new array in this case. We just shift the existing content to the end of the + // array to make room in the front for merging. + oldPeers = rasDeadPeers+(rasDeadPeersSize-nRasDeadPeers); + memmove(oldPeers, rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers)); + newPeers = rasDeadPeers; + } + + // Merge updatePeers with oldPeers into newPeers. + int oldPeersIdx, updatePeersIdx, newPeersIdx; + for (oldPeersIdx = updatePeersIdx = newPeersIdx = 0; oldPeersIdx < nRasDeadPeers || updatePeersIdx < *nUpdatePeers;) { + int cmp; + if (oldPeersIdx < nRasDeadPeers && updatePeersIdx < *nUpdatePeers) { + cmp = ncclSocketsCompare(oldPeers+oldPeersIdx, updatePeers+updatePeersIdx); + } else { + cmp = (oldPeersIdx < nRasDeadPeers ? -1 : 1); + } + + memmove(newPeers+newPeersIdx++, (cmp <= 0 ? 
oldPeers+oldPeersIdx : updatePeers+updatePeersIdx), sizeof(*newPeers)); + if (cmp <= 0) + oldPeersIdx++; + if (cmp > 0) { + rasConnDisconnect(updatePeers+updatePeersIdx); + } + if (cmp >= 0) + updatePeersIdx++; + } + *nUpdatePeers = newPeersIdx - nRasDeadPeers; + nRasDeadPeers = newPeersIdx; + + if (newPeers != rasDeadPeers) { + free(rasDeadPeers); + rasDeadPeers = newPeers; + rasDeadPeersSize = nNewPeers; + } + + rasDeadPeersHash = getHash((const char*)rasDeadPeers, nRasDeadPeers*sizeof(*rasDeadPeers)); + + return ncclSuccess; +} + +// Returns the index of the first available entry in the rasDeadPeers array, enlarging the array if necessary. +static ncclResult_t getNewDeadEntry(union ncclSocketAddress** pAddr) { + if (nRasDeadPeers == rasDeadPeersSize) { + NCCLCHECK(ncclRealloc(&rasDeadPeers, rasDeadPeersSize, rasDeadPeersSize+RAS_INCREMENT)); + rasDeadPeersSize += RAS_INCREMENT; + } + + *pAddr = rasDeadPeers+(nRasDeadPeers++); + return ncclSuccess; +} + +// Checks whether a peer is dead by looking it up in the rasDeadPeers array. +bool rasPeerIsDead(const union ncclSocketAddress* addr) { + return (rasDeadPeers != nullptr && + bsearch(addr, rasDeadPeers, nRasDeadPeers, sizeof(*rasDeadPeers), ncclSocketsCompare) != nullptr); +} + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Auxiliary functions -- primarily sorting/searching callbacks, plus some debug output support. // +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Searching callback for struct rasRankInit. Compares the ncclSocketAddress key against a rasRankInit element. +static int rasAddrRankInitCompare(const void* k, const void* e) { + const union ncclSocketAddress* key = (const union ncclSocketAddress*)k; + const struct rasRankInit* elem = (const struct rasRankInit*)e; + + return ncclSocketsCompare(key, &elem->addr); +} + +// Searching callback for struct rasPeerInfo. Compares the ncclSocketAddress key against a rasPeerInfo element. +static int rasAddrPeerInfoCompare(const void* k, const void* e) { + const union ncclSocketAddress* key = (const union ncclSocketAddress*)k; + const struct rasPeerInfo* elem = (const struct rasPeerInfo*)e; + + return ncclSocketsCompare(key, &elem->addr); +} + +// Sorting callback for struct rasRankInit. addr is the primary key; cudaDev is secondary. +static int rasRanksCompare(const void* e1, const void* e2) { + const struct rasRankInit* r1 = (const struct rasRankInit*)e1; + const struct rasRankInit* r2 = (const struct rasRankInit*)e2; + int cmp = ncclSocketsCompare(&r1->addr, &r2->addr); + if (cmp == 0) { + if (r1->addr.sa.sa_family == 0) // Bail out in case of empty addresses... + return 0; + assert(r1->pid == r2->pid); + cmp = (r1->cudaDev < r2->cudaDev ? -1 : (r1->cudaDev > r2->cudaDev ? 1 : 0)); + assert(cmp != 0); // There should be no complete duplicates within the rank array. + } + return cmp; +} + +// Sorting callback for ncclSocketAddress. We want to sort by the address family (IPv4 first), then the address, +// then port. Unfortunately, that's not the order of how they are laid out in memory, so one big memcmp won't do. +// memcmp is still useful though for individual elements in the network byte order. +int ncclSocketsCompare(const void* p1, const void* p2) { + const union ncclSocketAddress* a1 = (const union ncclSocketAddress*)p1; + const union ncclSocketAddress* a2 = (const union ncclSocketAddress*)p2; + // AF_INET (2) is less than AF_INET6 (10). 
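+  // The effective sort key is thus (address family, address bytes, port), with empty (family 0) addresses
+  // sorting last. Since addresses and ports are compared as network-byte-order bytes, memcmp gives the
+  // natural numeric ordering; e.g. (purely illustrative):
+  //   10.0.0.1:1000 < 10.0.0.1:2000 < 10.0.0.2:80 < [::1]:1000 < (empty address)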
+ int family = a1->sa.sa_family; + if (family != a2->sa.sa_family) { + if (family > 0 && a2->sa.sa_family > 0) + return (family < a2->sa.sa_family ? -1 : 1); + else // Put empty addresses at the end (not that it matters...). + return (family > 0 ? -1 : 1); + } + + int cmp; + if (family == AF_INET) { + if ((cmp = memcmp(&a1->sin.sin_addr, &a2->sin.sin_addr, sizeof(a1->sin.sin_addr))) == 0) { + cmp = memcmp(&a1->sin.sin_port, &a2->sin.sin_port, sizeof(a1->sin.sin_port)); + } + } + else if (family == AF_INET6) { + if ((cmp = memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr))) == 0) { + cmp = memcmp(&a1->sin6.sin6_port, &a2->sin6.sin6_port, sizeof(a1->sin6.sin6_port)); + } + } else { + // The only remaining valid case are empty addresses. + assert(family == 0); + cmp = 0; // Two empty addresses are equal... + } + + return cmp; +} + +// Returns true if two socket addresses are from the same node (actually, the same network interface on one node). +bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2) { + // AF_INET (2) is less than AF_INET6 (10). + int family = a1->sa.sa_family; + if (family != a2->sa.sa_family) + return false; + + if (family == AF_INET) + return (memcmp(&a1->sin.sin_addr, &a2->sin.sin_addr, sizeof(a1->sin.sin_addr)) == 0); + else if (family == AF_INET6) + return (memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)) == 0); + else + return true; // Two empty addresses are equal... +} + +// Debug output routine: dumps the rasPeers array. +static void rasPeersDump() { + for (int p = 0; p < nRasPeers; p++) { + const struct rasPeerInfo* peer = rasPeers+p; + INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), (p == myPeerIdx ? " [this process]" : "")); + } + if (nRasPeers > 0) + INFO(NCCL_RAS, "RAS peersHash 0x%lx", rasPeersHash); +} + +// Debug output routine: dumps the rasDeadPeers array. +static void rasDeadPeersDump() { + for (int p = 0; p < nRasDeadPeers; p++) { + int deadPeerIdx = rasPeerFind(rasDeadPeers+p); + INFO(NCCL_RAS, "RAS dead peer %d: %s", p, + (deadPeerIdx >= 0 ? rasPeerDump(rasPeers+deadPeerIdx, rasLine, sizeof(rasLine)) : + ncclSocketToString(rasDeadPeers+p, rasLine))); + } + if (nRasDeadPeers > 0) + INFO(NCCL_RAS, "RAS deadPeersHash 0x%lx", rasDeadPeersHash); +} + +// Debug output routine: dumps part of an individual element from the rasPeers array. +static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nres) { + char line[SOCKET_NAME_MAXLEN+1], line2[1024]; + snprintf(result, nres, "socket %s, pid %d, GPU%s %s", ncclSocketToString(&peer->addr, line), peer->pid, + (__builtin_popcountll(peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(peer->cudaDevs, peer->nvmlDevs, line2, sizeof(line2))); + return result; +} diff --git a/src/ras/ras.cc b/src/ras/ras.cc new file mode 100644 index 000000000..4905d7a69 --- /dev/null +++ b/src/ras/ras.cc @@ -0,0 +1,668 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#define NDEBUG // Comment out during development only! +#include +#include +#include +#include +#include + +#include "alloc.h" +#include "checks.h" +#include "comm.h" +#include "nccl.h" +#include "utils.h" +#include "ras_internal.h" + +// Type of a notification from a local NCCL thread. 
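+// A notification travels over rasNotificationPipe as one fixed-size struct per write (see rasLocalNotify() and
+// rasLocalHandle()); the static_assert below against PIPE_BUF keeps each write small enough to be atomic, so
+// writes from different NCCL threads cannot interleave.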
+typedef enum { + RAS_ADD_RANKS = 0, + RAS_TERMINATE = 1 +} rasNotificationType; + +// Used for communication from local NCCL threads to the RAS thread. +struct rasNotification { + rasNotificationType type; + union { + struct { + struct rasRankInit* ranks; + int nranks; + } addRanks; + }; +}; +static_assert(sizeof(struct rasNotification) <= PIPE_BUF, "The rasNotification structure is too large"); + +// These ensure that we get only one RAS port/thread per process. +static std::mutex rasInitMutex; +static bool rasInitialized = false; +static int rasInitRefCount = 0; + +// The RAS network listening socket of this RAS thread (random port). +struct ncclSocket rasNetListeningSocket; + +static pthread_t rasThread; + +// Used for communication from regular NCCL threads to the RAS thread. +static std::mutex rasNotificationMutex; +static int rasNotificationPipe[2] = {-1, -1}; + +// Data for the main poll() in the RAS thread. +struct pollfd* rasPfds; +static int nRasPfds; + +// We use it all over the place; no point in wasting the stack... +char rasLine[SOCKET_NAME_MAXLEN+1]; + +// An array holding the addresses of all NCCL communicators. Modified by the NCCL threads (hence the mutex), read by +// the RAS thread. +std::mutex ncclCommsMutex; +struct ncclComm** ncclComms = nullptr; +int nNcclComms = 0; +bool ncclCommsSorted = false; // Whether the array is currently sorted. We sort by the comms' commHash and rank. + +static ncclResult_t rasLocalNotify(const struct rasNotification* msg); +static ncclResult_t rasLocalHandle(); +static void rasLocalHandleTerminate(); + +static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock); +static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock); +static ncclResult_t rasNetSendNack(struct rasSocket* sock); + +static void* rasThreadMain(void*); + +NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); + +////////////////////////////////////////////////// +// Functions invoked from regular NCCL threads. // +////////////////////////////////////////////////// + +// Invoked by regular NCCL threads on every comm initialization. This is the first function to call. +// The myRank structure should be passed with the addr element initialized to the IP address of the bootstrap +// network interface to use. On a successful return, the address will be updated with the port number of the +// RAS network listening socket. +ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) { + ncclResult_t ret = ncclSuccess; + if (!rasInitialized) { + std::lock_guard lock(rasInitMutex); + if (!rasInitialized) { + union ncclSocketAddress addr; + + memcpy(&addr, &myRank->addr, sizeof(addr)); + (addr.sa.sa_family == AF_INET ? 
addr.sin.sin_port : addr.sin6.sin6_port) = htons(0); + NCCLCHECKGOTO(ncclSocketInit(&rasNetListeningSocket, &addr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork, + /*abortFlag*/nullptr, /*asyncFlag*/1), ret, fail); + NCCLCHECKGOTO(ncclSocketListen(&rasNetListeningSocket), ret, fail); + INFO(NCCL_RAS, "RAS network listening socket at %s", + ncclSocketToString(&rasNetListeningSocket.addr, rasLine)); + + (void)rasClientInitSocket(); + + SYSCHECKGOTO(pipe(rasNotificationPipe), "pipe", ret, fail); + + PTHREADCHECKGOTO(pthread_create(&rasThread, nullptr, &rasThreadMain, nullptr), "pthread_create", ret, fail); + ncclSetThreadName(rasThread, "NCCL RAS"); + (void)pthread_detach(rasThread); + + rasInitialized = true; + } + } + ncclAtomicRefCountIncrement(&rasInitRefCount); + + { + std::lock_guard lock(ncclCommsMutex); + + int i; + for (i = 0; i < nNcclComms; i++) { + if (ncclComms[i] == nullptr) + break; + } + if (i == nNcclComms) { + NCCLCHECK(ncclRealloc(&ncclComms, nNcclComms, nNcclComms+RAS_INCREMENT*8)); + nNcclComms += RAS_INCREMENT*8; + } + ncclComms[i] = comm; + ncclCommsSorted = false; + } + + if (myRank != nullptr) + memcpy(&myRank->addr, &rasNetListeningSocket.addr, sizeof(myRank->addr)); + +exit: + return ret; +fail: + if (rasNotificationPipe[1] != 0) + (void)close(rasNotificationPipe[1]); + if (rasNotificationPipe[0] != 0) + (void)close(rasNotificationPipe[0]); + (void)close(rasClientListeningSocket); + (void)ncclSocketClose(&rasNetListeningSocket); + goto exit; +} + +// Invoked by regular NCCL threads on every comm termination. +ncclResult_t ncclRasCommFini(const struct ncclComm* comm) { + if (!rasInitialized) + return ncclSuccess; + { + std::lock_guard lock(ncclCommsMutex); + for (int i = 0; i < nNcclComms; i++) { + if (ncclComms[i] == comm) { + ncclComms[i] = nullptr; + ncclCommsSorted = false; + break; + } + } + } + if (ncclAtomicRefCountDecrement(&rasInitRefCount) == 0) { + struct rasNotification msg; + msg.type = RAS_TERMINATE; + NCCLCHECK(rasLocalNotify(&msg)); + } + return ncclSuccess; +} + +// Invoked by regular NCCL threads on every (non-split) comm initialization. Provides info on all the ranks within +// the communicator. +ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks) { + struct rasNotification msg; + msg.type = RAS_ADD_RANKS; + msg.addRanks.ranks = ranks; + msg.addRanks.nranks = nranks; + NCCLCHECK(rasLocalNotify(&msg)); + return ncclSuccess; +} + +// Internal function running on regular NCCL threads -- asynchronously notifies the RAS thread. +static ncclResult_t rasLocalNotify(const struct rasNotification* msg) { + if (!rasInitialized) + return ncclSuccess; + + // Take an exclusive lock here to avoid multiplexing between multiple user threads (not sure if it's + // strictly required, but it won't hurt)... + std::lock_guard lock(rasNotificationMutex); + size_t done = 0; + while (done < sizeof(*msg)) { + ssize_t written; + SYSCHECK(written = write(rasNotificationPipe[1], (char*)msg + done, sizeof(*msg) - done), "write"); + done += written; + } + return ncclSuccess; +} + + +///////////////////////////////////////////////////////////////////////////////// +// Functions related to the handling of local notifications from NCCL threads. // +///////////////////////////////////////////////////////////////////////////////// + +// Handles asynchronous local notifications arriving from regular NCCL threads. 
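+// Invoked from the RAS thread's poll loop (rasThreadMain()) when rasNotificationPipe[0] becomes readable.
+// The read loop below mirrors the write loop in rasLocalNotify(): it keeps reading until a complete
+// rasNotification structure has been assembled, and treats EOF as an error.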
+static ncclResult_t rasLocalHandle() { + struct rasNotification msg; + + size_t done = 0; + while (done < sizeof(msg)) { + ssize_t nread; + SYSCHECK(nread = read(rasNotificationPipe[0], (char*)&msg + done, sizeof(msg) - done), "read"); + if (nread == 0) // EOF + return ncclSystemError; + done += nread; + } + + if (msg.type == RAS_ADD_RANKS) { + NCCLCHECK(rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks)); + } else if (msg.type == RAS_TERMINATE) { + rasLocalHandleTerminate(); + } else { + WARN("RAS received unknown notification type %d", msg.type); + return ncclInternalError; + } + + return ncclSuccess; +} + +// Handles local RAS_TERMINATE notification. +static void rasLocalHandleTerminate() { + INFO(NCCL_RAS, "RAS handling local termination request"); + // For now we don't do anything. +} + + +//////////////////////////////////////////////// +// Generic functions related to RAS messages. // +//////////////////////////////////////////////// + +// Allocates a RAS message of the desired length for sending. +// Behind the scenes allocates encapsulating rasMsgMeta structure, which includes local metadata stored in front +// of the message. +// Must use rasMsgFree to free. +ncclResult_t rasMsgAlloc(struct rasMsg** msg, size_t msgLen) { + struct rasMsgMeta* meta = nullptr; + NCCLCHECK(ncclCalloc((char**)&meta, offsetof(struct rasMsgMeta, msg) + msgLen)); + *msg = &meta->msg; + // coverity[leaked_storage:FALSE] => rasMsgFree is used to free it + return ncclSuccess; +} + +// To be used only with messages allocated with rasMsgAlloc. I.e., it should be used for sent messages, not +// for received ones. +void rasMsgFree(struct rasMsg* msg) { + if (msg) { + struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg)); + free(meta); + } +} + +// Enqueues a message for sending down a RAS connection. +void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t msgLen, bool front) { + // Get to the metadata of this message. + struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg)); + bool ready = false; + + meta->enqueueTime = clockNano(); + meta->offset = 0; + meta->length = msgLen; + + if (front) + ncclIntruQueueEnqueueFront(&conn->sendQ, meta); + else + ncclIntruQueueEnqueue(&conn->sendQ, meta); + + if (conn->sockIdx != -1) { + struct rasSocket* sock = rasSockets+conn->sockIdx; + if (sock->status == RAS_SOCK_READY || (sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { + rasPfds[sock->pfd].events |= POLLOUT; + ready = true; + } + } + if (!ready) { + // It's not a bug, unless it's for things like keep-alive messages... + INFO(NCCL_RAS, "RAS enqueued message type %d on a non-ready connection with %s " + "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", + msg->type, ncclSocketToString(&conn->addr, rasLine), + conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), + (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + } +} + +// Attempts to send the queued RAS messages to another RAS thread. +ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent) { + struct ncclSocket* sock = &rasSockets[conn->sockIdx].sock; + struct rasMsgMeta* meta; + *closed = 0; + while ((meta = ncclIntruQueueHead(&conn->sendQ)) != nullptr) { + if (rasSockets[conn->sockIdx].status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { + // We don't send anything beyond the handshake at this point. 
+ meta = nullptr; + break; + } + if (meta->offset < sizeof(meta->length)) { + // Send the length of the message. + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &meta->length, sizeof(meta->length), &meta->offset, closed)); + if (*closed) + return ncclSuccess; + if (meta->offset < sizeof(meta->length)) + break; + } + // Send the body of the message. + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, ((char*)&meta->msg)-sizeof(meta->length), + meta->length+sizeof(meta->length), &meta->offset, closed)); + if (*closed) + return ncclSuccess; + if (meta->offset < meta->length+sizeof(meta->length)) + break; + ncclIntruQueueDequeue(&conn->sendQ); + free(meta); + } + + *allSent = !meta; + + return ncclSuccess; +} + +// Attempts to receive a message through a RAS socket. +ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed) { + *closed = 0; + if (sock->recvOffset < sizeof(sock->recvLength)) { + // Receive the length of the message. + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &sock->sock, &sock->recvLength, sizeof(sock->recvLength), + &sock->recvOffset, closed)); + if (*closed || sock->recvOffset < sizeof(sock->recvLength)) + return ncclSuccess; + NCCLCHECK(ncclCalloc((char**)&sock->recvMsg, sock->recvLength)); + } + // Receive the body of the message. + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &sock->sock, ((char*)sock->recvMsg)-sizeof(sock->recvLength), + sock->recvLength+sizeof(sock->recvLength), &sock->recvOffset, closed)); + if (*closed || sock->recvOffset < sock->recvLength+sizeof(sock->recvLength)) + return ncclSuccess; + + *msg = sock->recvMsg; + sock->recvMsg = nullptr; + sock->recvOffset = sock->recvLength = 0; + + return ncclSuccess; +} + + +////////////////////////////////////////////////////////////////// +// Functions related to the handling of specific message types. // +////////////////////////////////////////////////////////////////// + +// Invoked from the main RAS thread to dispatch incoming messages to the appropriate handler. +ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock) { + if (msg->type == RAS_MSG_CONNINIT) { + NCCLCHECK(rasMsgHandleConnInit(msg, sock)); + } else if (msg->type == RAS_MSG_CONNINITACK) { + NCCLCHECK(rasMsgHandleConnInitAck(msg, sock)); + } else if (msg->type == RAS_MSG_KEEPALIVE) { + NCCLCHECK(rasMsgHandleKeepAlive(msg, sock)); + } else if (msg->type == RAS_MSG_PEERSUPDATE) { + NCCLCHECK(rasMsgHandlePeersUpdate(msg, sock)); + } else if (msg->type == RAS_MSG_COLLREQ) { + NCCLCHECK(rasMsgHandleCollReq(msg, sock)); + } else if (msg->type == RAS_MSG_COLLRESP) { + NCCLCHECK(rasMsgHandleCollResp(msg, sock)); + } else { + WARN("RAS received unknown message type (%d) from %s", msg->type, ncclSocketToString(&sock->sock.addr, rasLine)); + return ncclInternalError; + } + + return ncclSuccess; +} + +// Handles the first message sent over a RAS socket as part of the handshake. 
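+// The handshake, as implemented by this handler and rasMsgHandleConnInitAck() below, is a simple two-message
+// exchange (a sketch of the sequence only, not literal code):
+//
+//   connecting side                          accepting side
+//   RAS_MSG_CONNINIT { ncclVersion,    -->   version/dead-peer checks; on failure reply with nack=1
+//     listeningAddr, peersHash,              and terminate the socket
+//     deadPeersHash }
+//                                      <--   RAS_MSG_CONNINITACK { nack=0 }
+//                                      <--   optional RAS_MSG_PEERSUPDATE if the hashes differ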
+static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock) { + ncclResult_t ret = ncclSuccess; + struct rasConnection* conn = nullptr; + int connIdx, peerIdx; + struct rasMsg* newMsg = nullptr; + int newMsgLen; + char line[SOCKET_NAME_MAXLEN+1]; + + INFO(NCCL_RAS, "RAS handling connInit from %s (version %d, listeningAddr %s, peersHash 0x%lx, deadPeersHash 0x%lx)", + ncclSocketToString(&sock->sock.addr, rasLine), msg->connInit.ncclVersion, + ncclSocketToString(&msg->connInit.listeningAddr, line), msg->connInit.peersHash, msg->connInit.deadPeersHash); + + if (msg->connInit.ncclVersion != NCCL_VERSION_CODE) { + // Close any such sockets immediately! This is basically unrecoverable... + WARN("NCCL version mismatch with remote peer %s (local: %d, remote %d)", + ncclSocketToString(&sock->sock.addr, rasLine), NCCL_VERSION_CODE, msg->connInit.ncclVersion); + rasNetSendNack(sock); + rasSocketTerminate(sock, /*finalize*/true); + ret = ncclInvalidUsage; + goto exit; + } + + if (rasPeerIsDead(&msg->connInit.listeningAddr)) { + // A peer long declared dead is suddenly alive again?! + INFO(NCCL_RAS, "RAS connection from peer %s that is considered dead!", + ncclSocketToString(&msg->connInit.listeningAddr, rasLine)); + rasNetSendNack(sock); + rasSocketTerminate(sock, /*finalize*/true); + goto exit; + } + + // Check for any existing connection with that RAS thread (could happen due to a network issue, or possibly a race). + connIdx = rasConnFind(&msg->connInit.listeningAddr); + if (connIdx != -1) { + conn = rasConns+connIdx; + + INFO(NCCL_RAS, + "RAS found a matching existing connection (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", + (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), + conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0)); + + if (conn->sockIdx != -1) { + struct rasSocket* connSock = rasSockets+conn->sockIdx; + INFO(NCCL_RAS, "RAS found an alternative existing socket (status %d, createTime %.2fs)", + connSock->status, (clockNano()-connSock->createTime)/1e9); + // In general we prefer to keep the newer connection, but "newer" can be a relative term: we may have + // a race where both sides attempt to establish a connection at roughly the same time, so the other side's + // incoming connection ends up looking newer than the locally-initiated one -- for *both* of them. + // If each side closed the "old" one, both would end up being closed. + // As we normally try to initiate connections from the side with a lower address (precisely to avoid such + // situations), we'll follow the same logic here: the "lower" side will reject the new connection (as it + // came from the "wrong" side), whereas the "higher" side will keep the new one (as it came from the correct + // side) and terminate the old one (that it presumably just opened). + if (ncclSocketsCompare(&rasNetListeningSocket.addr, &conn->addr) < 0) { + INFO(NCCL_RAS, "RAS terminating the new socket"); + rasSocketTerminate(sock, /*finalize*/true); + goto exit; + } else { + INFO(NCCL_RAS, "RAS keeping the new socket and terminating the existing one"); + rasSocketTerminate(connSock); + } + } + } + if (!conn) { + NCCLCHECK(getNewConnEntry(&conn)); + memcpy(&conn->addr, &msg->connInit.listeningAddr, sizeof(conn->addr)); + connIdx = conn - rasConns; + } + + sock->status = RAS_SOCK_READY; + // rasConnResume will reset any experiencingDelays, startRetryTime, etc. 
+ + conn->sockIdx = sock-rasSockets; + sock->connIdx = connIdx; + memcpy(&sock->sock.addr, &msg->connInit.listeningAddr, sizeof(sock->sock.addr)); + + // Make sure that the connection is part of the right links forming the RAS network. At this point we only + // update the expected (non-external) connections; external ones will be added during keep-alive handling. + peerIdx = rasPeerFind(&conn->addr); + // Note: it's possible for peerIdx to be -1 at this point if, due to races, the connInit arrives before + // the peers update. + if (peerIdx != -1) { + (void)rasLinkUpdateConn(&rasNextLink, connIdx, peerIdx); + (void)rasLinkUpdateConn(&rasPrevLink, connIdx, peerIdx); + } + + // Send a confirmation to the server that requested the connection (so that the resilience code can mark + // the connection as live). + newMsgLen = rasMsgLength(RAS_MSG_CONNINITACK); + NCCLCHECK(rasMsgAlloc(&newMsg, newMsgLen)); + newMsg->type = RAS_MSG_CONNINITACK; + newMsg->connInitAck.nack = 0; + rasConnEnqueueMsg(conn, newMsg, newMsgLen, /*front*/true); + + conn->lastRecvPeersHash = msg->connInit.peersHash; + conn->lastRecvDeadPeersHash = msg->connInit.deadPeersHash; + + if (msg->connInit.peersHash != rasPeersHash || msg->connInit.deadPeersHash != rasDeadPeersHash) { + // Send my rasPeers and request the same in return. + INFO(NCCL_RAS, "RAS connInit hash mismatch (my peersHash 0x%lx, deadPeersHash 0x%lx); sending my (dead) peers", + rasPeersHash, rasDeadPeersHash); + NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers)); + } +exit: + return ret; +} + +// Handles the second message sent over a RAS socket as part of the handshake. +static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock) { + INFO(NCCL_RAS, "RAS handling connInitAck from %s (nack %d)", + ncclSocketToString(&sock->sock.addr, rasLine), msg->connInitAck.nack); + + if (msg->connInitAck.nack) { + // The remote peer doesn't want to talk to us. The easiest way to prevent it is by declaring it dead. + // We make a copy of the address because rasConnDisconnect will terminate the rasSocket. + union ncclSocketAddress addr; + memcpy(&addr, &sock->sock.addr, sizeof(addr)); + rasConnDisconnect(&addr); + (void)rasPeerDeclareDead(&addr); + + return ncclSuccess; + } + + sock->status = RAS_SOCK_READY; + // rasConnResume will reset any experiencingDelays, startRetryTime, etc. + + return ncclSuccess; +} + +// Handles the deadPeer broadcast. +void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone) { + INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&req->deadPeer.addr, rasLine)); + + if (!rasPeerIsDead(&req->deadPeer.addr)) { + rasConnDisconnect(&req->deadPeer.addr); + (void)rasPeerDeclareDead(&req->deadPeer.addr); + *pDone = false; + } else { + INFO(NCCL_RAS, "RAS already knew it was dead"); + // No point in re-broadcasting what's already known. + *pDone = true; + } +} + +// Attempts to immediately send a fatal NACK connInitAck response to a socket. A bit of a hack (as it doesn't +// follow our usual message queuing and polling convention) but, since this can be invoked only for newly opened +// connections, and the message is tiny, it should be OK. We can't use the regular path because the socket is +// about to be terminated. 
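+// Wire format reminder: every RAS message travels as a 32-bit length (an int) followed by the rasMsg payload
+// of that many bytes (see rasConnSendMsg() and rasMsgRecv() above), which is why this function issues two
+// separate ncclSocketProgress() calls: one for the length, one for the body.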
+static ncclResult_t rasNetSendNack(struct rasSocket* sock) { + struct rasMsg msg; + int length = rasMsgLength(RAS_MSG_CONNINITACK); + int closed = 0; + int offset; + + INFO(NCCL_RAS, "RAS sending NACK to %s", ncclSocketToString(&sock->sock.addr, rasLine)); + + msg.type = RAS_MSG_CONNINITACK; + msg.connInitAck.nack = 1; + offset = 0; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &sock->sock, &length, sizeof(length), &offset, &closed)); + if (closed || offset < sizeof(length)) + return ncclSuccess; + offset = 0; + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &sock->sock, &msg, length, &offset, &closed)); + // We are closing this socket anyway -- it doesn't matter to us if we succeeded or not. + + return ncclSuccess; +} + + +///////////////////////////////////////////////////////////////// +// Functions related to the main event loop of the RAS thread. // +///////////////////////////////////////////////////////////////// + +// Main function of the RAS thread. +static void* rasThreadMain(void*) { + ncclResult_t ret = ncclSuccess; // Unused. + int pfd; + int rasNetListeningSocketFd; + + INFO(NCCL_RAS, "RAS thread started"); + + // Initialize the global pollfd with the file descriptors we already have (the pipe and the listening socket). + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + rasPfds[pfd].fd = rasNotificationPipe[0]; + rasPfds[pfd].events = POLLIN; + + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, fail); + rasPfds[pfd].fd = rasNetListeningSocketFd; + rasPfds[pfd].events = POLLIN; + + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + rasPfds[pfd].fd = rasClientListeningSocket; + rasPfds[pfd].events = POLLIN; + + // Main event loop of the RAS thread. + for (int64_t nextWakeup=0;;) { + int timeout, nEvents; + int64_t now = clockNano(); + if (nextWakeup > 0) { + // The "1" below helps avoid round-downs and especially zeroes. + if (nextWakeup > now) + timeout = (nextWakeup - now) / (CLOCK_UNITS_PER_SEC / 1000) + 1; + else + timeout = 1; + } else { + timeout = 1000; // 1 second. + } + + nEvents = poll(rasPfds, nRasPfds, timeout); + + nextWakeup = clockNano()+CLOCK_UNITS_PER_SEC; + if (nEvents == -1 && errno != EINTR) + INFO(NCCL_RAS, "RAS continuing in spite of an unexpected error from poll: %s", strerror(errno)); + + // Handle any poll-related events. + for (int pollIdx = 0; pollIdx < nRasPfds && nEvents > 0; pollIdx++) { + if (rasPfds[pollIdx].revents) { + nEvents--; + if (rasPfds[pollIdx].fd == rasNotificationPipe[0]) { + (void)rasLocalHandle(); + } else if (rasPfds[pollIdx].fd == rasNetListeningSocketFd) { + (void)rasNetAcceptNewSocket(); + } else if (rasPfds[pollIdx].fd == rasClientListeningSocket) { + (void)rasClientAcceptNewSocket(); + } else { + // Check if it's one of the RAS sockets. + int sockIdx; + for (sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { + struct rasSocket* sock = rasSockets+sockIdx; + if (sock->status != RAS_SOCK_CLOSED && rasPfds[pollIdx].fd == sock->sock.fd) { + rasSockEventLoop(sockIdx, pollIdx); + break; + } + } // for (sockIdx) + + if (sockIdx == nRasSockets) { + // Try a client socket instead. 
+ for (int clientIdx = 0; clientIdx < nRasClients; clientIdx++) { + struct rasClient* client = rasClients+clientIdx; + if (client->status != RAS_CLIENT_CLOSED && rasPfds[pollIdx].fd == client->sock) { + rasClientEventLoop(clientIdx, pollIdx); + break; + } + } // for (clientIdx) + } // if (sockIdx == nRasSockets) + } // dynamic fds + } // if (revents) + } // for (pollIdx) + + now = clockNano(); + + rasSocksHandleTimeouts(now, &nextWakeup); + + rasConnsHandleTimeouts(now, &nextWakeup); + + rasNetHandleTimeouts(now, &nextWakeup); + + rasCollsHandleTimeouts(now, &nextWakeup); + } // for (;;) + +fail: + WARN("fatal error - RAS thread terminating"); + std::lock_guard lock(rasInitMutex); + (void)close(rasNotificationPipe[1]); + (void)close(rasNotificationPipe[0]); + (void)close(rasClientListeningSocket); + (void)ncclSocketClose(&rasNetListeningSocket); + rasInitialized = false; + return nullptr; +} + +// Returns the index of the first available entry in the rasPfds array, enlarging the array if necessary. +ncclResult_t rasGetNewPollEntry(int* index) { + int i; + for (i = 0; i < nRasPfds; i++) + if (rasPfds[i].fd == -1) + break; + if (i == nRasPfds) { + NCCLCHECK(ncclRealloc(&rasPfds, nRasPfds, nRasPfds+RAS_INCREMENT)); + nRasPfds += RAS_INCREMENT; + for (int j = i; j < nRasPfds; j++) + rasPfds[j].fd = -1; + } + + memset(rasPfds+i, '\0', sizeof(*rasPfds)); + rasPfds[i].fd = -1; + + *index = i; + return ncclSuccess; +} diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h new file mode 100644 index 000000000..68cac0b44 --- /dev/null +++ b/src/ras/ras_internal.h @@ -0,0 +1,512 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_RAS_INTERNAL_H_ +#define NCCL_RAS_INTERNAL_H_ + +#define NCCL_RAS_CLIENT_PORT 28028 +#define NCCL_RAS_CLIENT_PROTOCOL 2 + +#define RAS_COLLECTIVE_LEG_TIMEOUT_SEC 5 +#define RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC RAS_COLLECTIVE_LEG_TIMEOUT_SEC + +// End of the client section; everything below is meant for the NCCL threads only. +#ifndef NCCL_RAS_CLIENT + +#include + +#include "nccl.h" +#include "ras.h" +#include "socket.h" +#include "utils.h" + +// Type of a RAS network or client message. +typedef enum { + RAS_MSG_CONNINIT = 1, + RAS_MSG_CONNINITACK = 2, + RAS_MSG_KEEPALIVE = 3, + RAS_MSG_PEERSUPDATE = 4, + RAS_MSG_COLLREQ = 5, + RAS_MSG_COLLRESP = 6, +} rasMsgType; + +// Type of a RAS network collective message. +typedef enum { + RAS_MSG_NONE = 0, + RAS_BC_DEADPEER = 1, + // Broadcast operations above this line; collective operations below (1000 is the demarcation line). + RAS_COLL_CONNS = 1001, // Collect data about all RAS connections. + RAS_COLL_COMMS = 1002, // Collect data about all communicators. +} rasCollectiveType; + +// Payload of a collective request message (RAS_MSG_COLLREQ). +struct rasCollRequest { + union ncclSocketAddress rootAddr; + uint64_t rootId; + + int64_t timeout; + rasCollectiveType type; + union { + struct { + union ncclSocketAddress addr; + } deadPeer; + struct { + } conns; + struct { + } comms; + }; +}; + +// Payload of a collective response message (RAS_MSG_COLLRESP). +struct rasCollResponse { + union ncclSocketAddress rootAddr; + uint64_t rootId; + + int nLegTimeouts; // If >0, indicates incomplete data. + int nPeers; + int nData; // Size of data in bytes. + union ncclSocketAddress peers[0]; // Variable length. 
+ // The peersAddrs array is followed by: + //alignas(int64_t) char data[0]; // Variable length, collective-dependent. +}; + +// Describes a peer NCCL process. Every RAS thread keeps an (identical) array of them, one entry for each +// NCCL process. +struct rasPeerInfo { + union ncclSocketAddress addr; + pid_t pid; + uint64_t cudaDevs; // Bitmask. Conveniently, NCCL_MAX_LOCAL_RANKS == 64. + uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. +}; + +// Describes a RAS message. Every message is preceded by a (32-bit) message length. All data in the host +// byte order. Depending on the message type, the length of the message will vary. +struct rasMsg { + rasMsgType type; + union { + struct { + int ncclVersion; + union ncclSocketAddress listeningAddr; + uint64_t peersHash; + uint64_t deadPeersHash; + } connInit; // Sent by the connecting side as the first message. + struct { + int nack; // If non-0, we should stop trying to reconnect. + } connInitAck; // Response from the accepting side to the above. + struct { + uint64_t peersHash; + uint64_t deadPeersHash; + int linkMask; // What links at the destination peer should the connection be part of + // (bit 0: nextLink; bit 1: prevLink). + struct timespec realTime; // Wallclock time at the source, for statistical purposes (in principle there's + // no guarantee that the nodes have synchronized clocks so we can't really rely + // on it for anything important).. + int nack; // If non-0, it means that this message is a response to an unexpected keepAlive message. + } keepAlive; + struct { + uint64_t peersHash; + uint64_t deadPeersHash; + int nPeers; + int nDeadPeers; + struct rasPeerInfo peers[0]; // Variable length. + // The peers array is followed by the following: + //union ncclSocketAddress deadPeers[0]; // Variable length. + } peersUpdate; + struct { + int protocol; // Protocol version, sent to the client. + } clientInit; + struct { + int nData; + char data[0]; // Variable length. + } clientDump; + struct rasCollRequest collReq; // Variable length. + struct rasCollResponse collResp; // Variable length. + }; +}; + +// Returns the size of the collective portion of a collective request message. +static inline size_t rasCollDataLength(rasCollectiveType type) { + struct rasCollRequest* data; + switch (type) { + case RAS_BC_DEADPEER: + return offsetof(struct rasCollRequest, deadPeer) + sizeof(data->deadPeer); + case RAS_COLL_CONNS: + return offsetof(struct rasCollRequest, conns) + sizeof(data->conns); + case RAS_COLL_COMMS: + return offsetof(struct rasCollRequest, comms) + sizeof(data->comms); + case RAS_MSG_NONE: + return 0; + }; + return 0; +} + +// Returns the size for a message of a particular type. +static inline size_t rasMsgLength(rasMsgType type, rasCollectiveType collType = RAS_MSG_NONE) { + struct rasMsg* msg; + switch (type) { + case RAS_MSG_CONNINIT: + return offsetof(struct rasMsg, connInit) + sizeof(msg->connInit); + case RAS_MSG_CONNINITACK: + return offsetof(struct rasMsg, connInitAck) + sizeof(msg->connInitAck); + case RAS_MSG_KEEPALIVE: + return offsetof(struct rasMsg, keepAlive) + sizeof(msg->keepAlive); + case RAS_MSG_PEERSUPDATE: + return offsetof(struct rasMsg, peersUpdate) + sizeof(msg->peersUpdate); + case RAS_MSG_COLLREQ: + return offsetof(struct rasMsg, collReq) + rasCollDataLength(collType); + case RAS_MSG_COLLRESP: + return offsetof(struct rasMsg, collResp) + sizeof(msg->collResp); + }; + return 0; +} + +// How much to enlarge any RAS array by if we run out of space. 
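+// (All growable RAS arrays -- e.g. rasConns, rasPfds, rasDeadPeers and the per-link conns arrays -- are
+// extended by this amount when helpers such as getNewConnEntry() or rasGetNewPollEntry() run out of free
+// entries.)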
+#define RAS_INCREMENT 4
+
+// Our clock has nanosecond resolution.
+#define CLOCK_UNITS_PER_SEC 1000000000L
+
+// Keep-alive messages are sent no sooner than a second after the last message was sent down a particular connection.
+#define RAS_KEEPALIVE_INTERVAL (1*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If no message arrives in 5 seconds via a particular connection that uses keep-alive messages, generate a warning
+// and try alternative connections.
+#define RAS_KEEPALIVE_TIMEOUT_WARN (5*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a socket that uses keep-alive messages if no message arrives in 20 seconds.
+// We will try to re-establish communication via that connection (until RAS_PEER_DEAD_TIMEOUT).
+#define RAS_KEEPALIVE_TIMEOUT_ERROR RAS_STUCK_TIMEOUT
+
+// Retry connecting on failing sockets (ECONNREFUSED, etc.) once a second.
+#define RAS_CONNECT_RETRY (1*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If we can't connect in 5 seconds, we generate a warning and try alternative connections.
+#define RAS_CONNECT_WARN RAS_KEEPALIVE_TIMEOUT_WARN
+
+// Abort a busy socket (one we are trying to send on, or one that was being established) if there's been
+// no sign of progress in 20 seconds. We will try to re-establish communication (up to RAS_PEER_DEAD_TIMEOUT).
+#define RAS_STUCK_TIMEOUT (20*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Terminate ad-hoc connections that have not been used in 60 seconds.
+#define RAS_IDLE_TIMEOUT (60*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// If the socket is closed by the peer within 5 seconds from the idle timeout, do not attempt to re-establish.
+#define RAS_IDLE_GRACE_PERIOD (5*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Declare a peer as dead and don't retry communicating with it if we couldn't reach it for 60 seconds.
+#define RAS_PEER_DEAD_TIMEOUT (60*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a leg of a collective operation if the response takes more than 5 seconds to arrive *and* one of the
+// connections experiences delays.
+#define RAS_COLLECTIVE_LEG_TIMEOUT (RAS_COLLECTIVE_LEG_TIMEOUT_SEC*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Abort a whole collective operation after at most RAS_COLLECTIVE_LEG_TIMEOUT+RAS_COLLECTIVE_EXTRA_TIMEOUT (10s).
+#define RAS_COLLECTIVE_EXTRA_TIMEOUT (RAS_COLLECTIVE_EXTRA_TIMEOUT_SEC*CLOCK_UNITS_PER_SEC*ncclParamRasTimeoutFactor())
+
+// Structure used for tracking the progress of sending a RAS message.
+struct rasMsgMeta {
+  struct rasMsgMeta* next;
+  int64_t enqueueTime;
+  int offset; // Progress sending the message (including the message size itself, an int, which is sent first).
+  int length; // Length of the message (*excluding* the message size).
+  struct rasMsg msg; // Variable length.
+};
+
+// Describes an ongoing collective RAS operation (apart from broadcasts, which don't need a response).
+// For every collective operation, each participating RAS thread will create its own.
+struct rasCollective {
+  union ncclSocketAddress rootAddr;
+  uint64_t rootId;
+
+  rasCollectiveType type;
+
+  int64_t timeout;
+  bool timeoutWarned;
+
+  int64_t startTime; // For timeout calculations.
+  int fromConnIdx; // The connection we received the request from.
+
+  int* fwdConns; // Indices of the connections we forwarded the request to; replaced by -1 as the responses arrive.
+  int nFwdSent; // Count of the above (local process only).
+  int nFwdRecv; // Count of the responses received or timeouts (local process only).
+
+  int nLegTimeouts; // Collective (from this process and the responses we received).
+
+  union ncclSocketAddress* peers; // Collective (from this process and the responses we received).
+  int nPeers;
+
+  char* data; // Collective (from this process and the responses we received).
+  int nData;
+};
+
+// Collective data in RAS_COLL_CONNS responses.
+struct rasCollConns {
+  int64_t travelTimeMin;
+  int64_t travelTimeMax;
+  int64_t travelTimeSum;
+  int64_t travelTimeCount;
+  int nConns;
+  int nNegativeMins;
+  struct negativeMin {
+    union ncclSocketAddress source;
+    union ncclSocketAddress dest;
+    int64_t travelTimeMin;
+  } negativeMins[0]; // Variable length.
+};
+
+// Collective data in RAS_COLL_COMMS responses.
+struct rasCollComms {
+  int nComms;
+  struct comm {
+    uint64_t commHash;
+    int commNRanks;
+    int nRanks; // Number of elements in the array below, *not* in the communicator.
+    struct rank {
+      int commRank;
+      int peerIdx; // Index within rasCollective->peers, *not* rasPeers.
+      uint64_t collOpCount;
+      struct {
+        ncclResult_t initState:4;
+        ncclResult_t asyncError:4;
+        bool finalizeCalled:1;
+        bool destroyFlag:1;
+        bool abortFlag:1;
+      } status;
+      char cudaDev;
+      char nvmlDev;
+    } ranks[0]; // Variable length. Sorted by commRank. Optimized for 1 GPU/process.
+  } comms[0]; // Variable length. Sorted by commHash.
+};
+
+// Holds data needed to keep track of a connection belonging to a RAS network link (either the primary one
+// or one of the fallbacks).
+struct rasLinkConn {
+  int peerIdx; // Index in the rasPeers array of the peer this entry describes. Could be -1 (an entry initiated
+               // by an as of yet unknown peer -- should be a temporary situation that resolves via peer updates).
+  int connIdx; // Index in the rasConns array of the connection to the above peer. Could be -1 (a placeholder
+               // for a connection to be started by the remote peer).
+  bool external; // true if the entry exists only due to an external request (requested by a remote peer, most
+                 // likely as part of fault recovery). Such connections are kept as fallbacks even if there's a
+                 // valid primary connection, in order to ensure that keep-alive messages are sent.
+};
+
+// Describes a link that forms the backbone of the RAS network. Links focus on direction (previous/next in
+// case of 1-D topology) rather than a particular destination. They are implemented using rasConnections, but
+// they are persistent through the life of the RAS threads, whereas rasConnections can be terminated if the RAS
+// network is reconfigured or a peer dies.
+struct rasLink {
+  int direction; // 1 for nextLink, -1 for prevLink.
+
+  // Index 0 is the primary connection; any additional ones are fallbacks (that get created if we are having
+  // problems with the primary connection). The elements are de-facto ordered (highest-preference ones have
+  // the lowest indices).
+  struct rasLinkConn* conns;
+  int nConns;
+  int connsSize; // Array size; could be larger than nConns.
+
+  // Keep track of a timeout in case we did not create a connection during the last peers update (because we expect
+  // the peer on the other side to do so) but that peer failed to initiate.
+  int64_t lastUpdatePeersTime;
+};
+
+// Describes a connection to another peer on the RAS network. It is meant to be more persistent than a volatile
+// socket (described by the rasSocket structure), which can be affected by transient network issues.
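+// Taken together, the three structures form a hierarchy: a rasLink describes a persistent direction on the
+// RAS ring, a rasConnection describes the relationship with one specific peer, and a rasSocket (below) is the
+// current, volatile file descriptor carrying that connection. A link may reference several connections
+// (a primary plus fallbacks), and a connection may outlive many sockets as they get torn down and re-created.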
+struct rasConnection { + bool inUse; + + union ncclSocketAddress addr; + + // Index of the current rasSocket in the rasSockets array. Note that multiple rasSocket entries may point back + // to a single entry here, for sockets that are in the process of being terminated and re-established. + // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. + // -1 if there is no such socket. + int sockIdx; + + // We keep the rasPeersHash of remote connections to minimize the number of needless exchanges. + // There is a subtle difference in the meaning of lastSentPeersHash and lastRecvPeersHash. + // lastSentPeersHash stores *our* rasPeersHash from the time we last sent a peers *update* through this connection + // (which is different than sending just the hash, like we do in KEEPALIVE, etc.). + // lastRecvPeersHash stores the latest known rasPeersHash of the peer (received via KEEPALIVE, etc.). + uint64_t lastSentPeersHash; + uint64_t lastRecvPeersHash; + + // Same but for rasDeadPeersHash. + uint64_t lastSentDeadPeersHash; + uint64_t lastRecvDeadPeersHash; + + // Queue of messages to send. + struct ncclIntruQueue sendQ; + + // Used for keeping track of timeouts that may extend beyond the lifetime of a socket. + // The timeout starts when the connection is being created (and is turned off when the initialization is completed + // successfully) or when we detect a problem, such as a socket timeout (in the latter case, we may need to + // retroactively calculate the start time). + // A value of 0 indicates that they are not currently in use. + int64_t startRetryTime; + int64_t lastRetryTime; + + bool experiencingDelays; // A flag indicating that the connection is currently subject to RAS_KEEPALIVE_TIMEOUT_WARN + // or RAS_CONNECT_WARN timeout. If set, the warnings have been issued and the fallbacks + // have been initiated if needed. + bool linkFlag; // Used within rasNet* calls to mark whether this connection was already handled when iterating over + // multiple links (since a connection can belong to more than one link). + // The below four fields are for statistical purposes only. + int64_t travelTimeMin; + int64_t travelTimeMax; + int64_t travelTimeSum; + int64_t travelTimeCount; +}; + +// Status of a RAS socket. +typedef enum { + RAS_SOCK_CLOSED = 0, + RAS_SOCK_CONNECTING = 1, + RAS_SOCK_HANDSHAKE = 2, + RAS_SOCK_READY = 3, + RAS_SOCK_TERMINATING = 4 +} rasSocketStatus; + +// Describes a socket implementing communication between two peers. +struct rasSocket { + struct ncclSocket sock; + + rasSocketStatus status; + + int pfd; // Index in the rasPfds array. + + // Index of the corresponding entry in the rasConns array. + // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. + // -1 if there is no connection (normal condition on the accept side before the connInit message). + int connIdx; + + int64_t createTime; + int64_t lastSendTime; + int64_t lastRecvTime; + + // Data on the message currently being received. + int recvOffset; + int recvLength; + struct rasMsg* recvMsg; +}; + +// Status of a RAS client. +typedef enum { + RAS_CLIENT_CLOSED = 0, + RAS_CLIENT_CONNECTED = 1, + RAS_CLIENT_INIT = 2, + RAS_CLIENT_CONNS = 3, + RAS_CLIENT_COMMS = 4, + RAS_CLIENT_FINISHED = 99 +} rasClientStatus; + +// Describes a RAS client. +struct rasClient { + int sock; + + rasClientStatus status; + + int pfd; // Index in the rasPfds array. 
+
+  char recvBuffer[1024];
+  int recvOffset;
+
+  // Queue of messages to send.
+  struct ncclIntruQueue sendQ;
+
+  int verbose;
+  int64_t timeout;
+
+  // State stored during asynchronous operations such as collectives.
+  int collIdx; // Index to the ongoing rasCollective.
+};
+
+
+// ras.cc
+extern struct pollfd* rasPfds;
+extern struct ncclSocket rasNetListeningSocket;
+extern std::mutex ncclCommsMutex;
+extern struct ncclComm** ncclComms;
+extern int nNcclComms;
+extern bool ncclCommsSorted;
+extern char rasLine[SOCKET_NAME_MAXLEN+1];
+
+int64_t ncclParamRasTimeoutFactor();
+ncclResult_t rasMsgAlloc(struct rasMsg** msg, size_t msgLen);
+void rasMsgFree(struct rasMsg* msg);
+void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t msgLen, bool front = false);
+ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent);
+ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed);
+ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock);
+void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone);
+ncclResult_t rasGetNewPollEntry(int* index);
+
+
+// rasnet.cc
+extern struct rasLink rasNextLink, rasPrevLink;
+extern struct rasConnection* rasConns;
+extern int nRasConns;
+extern struct rasSocket *rasSockets;
+extern int nRasSockets;
+
+ncclResult_t getNewConnEntry(struct rasConnection** pConn);
+ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx);
+int rasConnFind(const union ncclSocketAddress* addr);
+void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup);
+void rasConnDisconnect(const union ncclSocketAddress* addr);
+ncclResult_t rasNetAcceptNewSocket();
+void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup);
+void rasSocketTerminate(struct rasSocket* sock, bool finalize = false, uint64_t startRetryOffset = 0,
+                        bool retry = true);
+void rasSockEventLoop(int sockIdx, int pollIdx);
+void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup);
+ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock);
+ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external = false,
+                               bool insert = false, bool pretend = false, int* pLinkIdx = nullptr);
+
+// peers.cc
+extern struct rasPeerInfo* rasPeers;
+extern int nRasPeers;
+extern uint64_t rasPeersHash;
+extern union ncclSocketAddress* rasDeadPeers;
+extern int nRasDeadPeers;
+extern uint64_t rasDeadPeersHash;
+
+ncclResult_t rasLocalHandleAddRanks(struct rasRankInit* ranks, int nranks);
+int rasPeerFind(const union ncclSocketAddress* addr);
+ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct rasPeerInfo* peers, int nPeers);
+ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock);
+int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallback = false);
+ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr);
+bool rasPeerIsDead(const union ncclSocketAddress* addr);
+int ncclSocketsCompare(const void* p1, const void* p2);
+bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2);
+
+
+// collectives.cc
+extern struct rasCollective* rasCollectives;
+
+void rasCollReqInit(struct rasCollRequest* req);
+ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone = nullptr,
+                               int* pCollIdx = nullptr, int fromConnIdx = -1);
+ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock);
+ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock);
+void rasCollsPurgeConn(int connIdx);
+void rasCollFree(struct rasCollective* coll);
+void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup);
+
+// client_support.cc
+extern int rasClientListeningSocket;
+extern struct rasClient* rasClients;
+extern int nRasClients;
+ncclResult_t rasClientInitSocket();
+ncclResult_t rasClientAcceptNewSocket();
+ncclResult_t rasClientResume(struct rasCollective* coll);
+void rasClientEventLoop(int clientIdx, int pollIdx);
+const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size);
+
+#endif // !NCCL_RAS_CLIENT
+
+#endif // !NCCL_RAS_INTERNAL_H_
diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc
new file mode 100644
index 000000000..441ad192c
--- /dev/null
+++ b/src/ras/rasnet.cc
@@ -0,0 +1,1189 @@
+/*************************************************************************
+ * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#define NDEBUG // Comment out during development only!
+#include <cassert>
+
+#include "ras_internal.h"
+
+// Links forming the backbone of the RAS network (currently a ring).
+struct rasLink rasNextLink = {1}, rasPrevLink = {-1};
+
+// Connections on the RAS network.
+struct rasConnection* rasConns;
+int nRasConns;
+
+// Sockets implementing the RAS network.
+struct rasSocket *rasSockets;
+int nRasSockets;
+
+// Magic file descriptor number when we want poll() to ignore an entry. Anything negative would do, but
+// I didn't want to use -1 because it has a special meaning for us.
+#define POLL_FD_IGNORE -2
+
+static void rasConnOpen(struct rasConnection* conn);
+static ncclResult_t rasConnPrepare(struct rasConnection* conn);
+static void rasConnTerminate(struct rasConnection* conn);
+
+static ncclResult_t getNewSockEntry(struct rasSocket** pSock);
+
+static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup);
+static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup);
+static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack = false);
+
+static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx);
+static void rasConnResume(struct rasConnection* conn);
+static void rasLinkSanitizeFallbacks(struct rasLink* link);
+static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx = -1);
+static int rasLinkFindConn(const struct rasLink* link, int connIdx);
+
+
+///////////////////////////////////////////////
+// Functions related to the RAS connections. //
+///////////////////////////////////////////////
+
+// Allocates an entry in the rasConns array, enlarging the array if necessary.
+ncclResult_t getNewConnEntry(struct rasConnection** pConn) {
+  struct rasConnection* conn;
+  int i;
+  for (i = 0; i < nRasConns; i++)
+    if (!rasConns[i].inUse)
+      break;
+  if (i == nRasConns) {
+    NCCLCHECK(ncclRealloc(&rasConns, nRasConns, nRasConns+RAS_INCREMENT));
+    nRasConns += RAS_INCREMENT;
+  }
+
+  conn = rasConns+i;
+  memset(conn, '\0', sizeof(*conn));
+  conn->inUse = true;
+  conn->sockIdx = -1;
+  ncclIntruQueueConstruct(&conn->sendQ);
+  conn->travelTimeMin = INT64_MAX;
+  conn->travelTimeMax = INT64_MIN;
+
+  *pConn = conn;
+  return ncclSuccess;
+}
+
+// Creates a new RAS network connection to a remote peer address.
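+// If a rasConns entry for the address already exists, it is reused; the connection-level retry timer is started
+// only when a brand-new entry has to be allocated.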
+ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) { + ncclResult_t ret = ncclSuccess; + struct rasConnection* conn = nullptr; + + // First check if a connection entry for this peer already exists. + int connIdx = rasConnFind(addr); + if (connIdx != -1) { + conn = rasConns+connIdx; + } + + if (conn && conn->sockIdx != -1) { + // An entry exists and has a socket associated with it -- nothing left for us to do. + if (pConnIdx) + *pConnIdx = connIdx; + goto exit; + } + + if (!conn) { + NCCLCHECKGOTO(getNewConnEntry(&conn), ret, exit); + memcpy(&conn->addr, addr, sizeof(conn->addr)); + // We are establishing a new connection -- start the timeout. + conn->startRetryTime = clockNano(); + connIdx = conn - rasConns; + } + + if (pConnIdx) + *pConnIdx = connIdx; + + rasConnOpen(conn); + +exit: + return ret; +} + +// Opens a connection to a remote peer. +static void rasConnOpen(struct rasConnection* conn) { + ncclResult_t ret; // Not used. + struct rasSocket* sock; + bool closeSocketOnFail = false; + int ready; + + NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail); + NCCLCHECKGOTO(ncclSocketInit(&sock->sock, &conn->addr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork, nullptr, + /*asyncFlag*/1, /*customRetry*/1), ret, fail); + closeSocketOnFail = true; + NCCLCHECKGOTO(ncclSocketConnect(&sock->sock), ret, fail); + NCCLCHECKGOTO(ncclSocketReady(&sock->sock, &ready), ret, fail); + + NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); + + // We delay the initialization of sockIdx, connIdx and status until this point so that in case of failures + // we don't need to clean them up. + conn->sockIdx = sock-rasSockets; + sock->connIdx = conn-rasConns; + rasPfds[sock->pfd].fd = sock->sock.fd; + + // We ignore the possibly ready status of the socket at this point and consider it CONNECTING because + // there are other things we want to do before sending the CONNINIT, such as adding the connection to + // the network links, etc. + sock->status = RAS_SOCK_CONNECTING; + rasPfds[sock->pfd].events = (POLLIN | POLLOUT); + if (sock->sock.state == ncclSocketStateConnecting) + rasPfds[sock->pfd].fd = POLL_FD_IGNORE; // Don't poll on this socket before connect(). + +exit: + conn->lastRetryTime = clockNano(); + // We deliberately ignore ret as this function will be retried later if needed. + return; +fail: + if (closeSocketOnFail) + (void)ncclSocketClose(&sock->sock); + goto exit; +} + +// Sends an initial RAS message to the peer after connecting to it. +static ncclResult_t rasConnPrepare(struct rasConnection* conn) { + struct rasMsg* msg = nullptr; + int msgLen = rasMsgLength(RAS_MSG_CONNINIT); + + // The first message the RAS threads exchange provides the listening address of the connecting thread + // and the NCCL version to ensure that users aren't mixing things up. + NCCLCHECK(rasMsgAlloc(&msg, msgLen)); + msg->type = RAS_MSG_CONNINIT; + msg->connInit.ncclVersion = NCCL_VERSION_CODE; + memcpy(&msg->connInit.listeningAddr, &rasNetListeningSocket.addr, sizeof(msg->connInit.listeningAddr)); + msg->connInit.peersHash = rasPeersHash; + msg->connInit.deadPeersHash = rasDeadPeersHash; + // We don't update lastSent[Dead]PeersHash because we aren't actually sending the peers themselves here. + + rasConnEnqueueMsg(conn, msg, msgLen, /*front*/true); + + // We'll finish the initialization in rasMsgHandleConnInitAck, after the other side responds. + return ncclSuccess; +} + +// Searches through rasConns for a connection with a provided address. 
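+// Returns the index into the rasConns array, or -1 if no matching connection exists.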
+int rasConnFind(const union ncclSocketAddress* addr) {
+  // rasConns is not sorted (given the number of indices, it would be a massive hassle to keep it that way)
+  // so binary search won't do...
+  for (int i = 0; i < nRasConns; i++) {
+    struct rasConnection* conn = rasConns+i;
+    if (conn->inUse && memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0)
+      return i;
+  }
+
+  return -1;
+}
+
+// Handles any connection-related timeouts. Many timeouts affect the underlying sockets and thus have been handled
+// in the socket timeout handler earlier by terminating the problematic sockets. If a socket connection doesn't
+// exist or needs to be re-established (due to having just been terminated), we handle that here.
+// This is also where we declare peers as dead, etc.
+// Invoked from the main RAS event loop.
+void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) {
+  for (int connIdx = 0; connIdx < nRasConns; connIdx++) {
+    struct rasConnection* conn = rasConns+connIdx;
+
+    if (!conn->inUse)
+      continue;
+
+    if (conn->sockIdx != -1) {
+      struct rasSocket* sock = rasSockets+conn->sockIdx;
+      bool sockTerminated = false;
+
+      // Retry the socket connections that have been refused.
+      if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) {
+        if (now - sock->lastSendTime > RAS_CONNECT_RETRY) {
+          int ready;
+          if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) {
+            INFO(NCCL_RAS, "Unexpected error from ncclSocketReady; terminating the socket connection with %s",
+                 ncclSocketToString(&sock->sock.addr, rasLine));
+            rasSocketTerminate(sock, /*finalize*/true);
+            // We will retry below in the same loop.
+            sockTerminated = true;
+          } else {
+            // We update lastSendTime even if !ready because we need it up-to-date for timeout calculations.
+            sock->lastSendTime = clockNano();
+            if (!ready && sock->sock.state == ncclSocketStateConnecting)
+              *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY);
+            else
+              rasPfds[sock->pfd].fd = sock->sock.fd; // Enable the handling via the main loop.
+          } // if (ncclSocketReady)
+        } else {
+          *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY);
+        }
+      } // if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting)
+
+      // For connections that have data to send but that we've been unable to send a message on for a while,
+      // consider their sockets lost and terminate them.
+      if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) {
+        if (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) {
+          INFO(NCCL_RAS, "RAS send stuck timeout error (%lds) on socket connection with %s",
+               (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) /
+               CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine));
+          rasSocketTerminate(sock, /*finalize*/false, RAS_STUCK_TIMEOUT);
+          // We will retry below in the same loop.
+        } else {
+          *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime,
+                                                       ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+RAS_STUCK_TIMEOUT);
+        }
+      } // if (!ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY)
+    } // if (conn->sockIdx != -1)
+
+    // For connections that are being (re-)established, irrespective of whether there's a valid socket associated
+    // with them (conn->sockIdx != -1), we need to check if any connection-level timeout has expired.
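+    // Three connection-level timeouts are checked below: RAS_PEER_DEAD_TIMEOUT (give up and broadcast the peer
+    // as dead), RAS_CONNECT_WARN (print a warning and start fallback connections), and RAS_CONNECT_RETRY
+    // (periodically try to reopen the socket).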
+ if (conn->startRetryTime) { + // If we've been trying to open a connection for too long (60s), give up and mark the peer as dead + // so that we don't try again. + if (now - conn->startRetryTime > RAS_PEER_DEAD_TIMEOUT) { + struct rasCollRequest bCast; + INFO(NCCL_RAS, "RAS connect retry timeout (%lds) on socket connection with %s", + (now-conn->startRetryTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + + // Broadcast the info about a dead peer to everybody. This will handle it locally as well, including + // declaring the peer dead and terminating the connection. + rasCollReqInit(&bCast); + bCast.type = RAS_BC_DEADPEER; + memcpy(&bCast.deadPeer.addr, &conn->addr, sizeof(bCast.deadPeer.addr)); + (void)rasNetSendCollReq(&bCast, rasCollDataLength(RAS_BC_DEADPEER)); + + continue; + } else { + *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_PEER_DEAD_TIMEOUT); + } + + // RAS_STUCK_TIMEOUT has already been handled in the socket function (we'll pick it up later via + // the conn->sockIdx == -1 test). + + // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try + // to establish fallback connections. + if (now - conn->startRetryTime > RAS_CONNECT_WARN) { + if (!conn->experiencingDelays) { + INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", + (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + + // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback + // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish + // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. + conn->experiencingDelays = true; + (void)rasLinkAddFallback(&rasNextLink, connIdx); + (void)rasLinkAddFallback(&rasPrevLink, connIdx); + // rasConns may have been reallocated by the above calls. + conn = rasConns+connIdx; + + // Stop collectives from waiting for a response over it. + rasCollsPurgeConn(connIdx); + } // if (!conn->experiencingDelays) + } else { + *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN); + } + + // If a socket was terminated (or never opened, due to some error), try to open it now. + // We retry once a second. + if (conn->sockIdx == -1) { + if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) { + INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)", + ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, + (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0)); + rasConnOpen(conn); + } + if (conn->sockIdx == -1) + *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY); + } + } // if (conn->startRetryTime) + } // for (connIdx) +} + +// Checks if we have a connection to a given peer and if so, terminates it. The connection is removed from the +// RAS links, though fallbacks are initiated if necessary. Typically called just before declaring a peer dead. +void rasConnDisconnect(const union ncclSocketAddress* addr) { + int connIdx = rasConnFind(addr); + if (connIdx != -1) { + (void)rasLinkAddFallback(&rasNextLink, connIdx); + (void)rasLinkAddFallback(&rasPrevLink, connIdx); + rasLinkDropConn(&rasNextLink, connIdx); + rasLinkDropConn(&rasPrevLink, connIdx); + + rasConnTerminate(rasConns+connIdx); + } +} + +// Terminates a connection and frees the rasConns entry. 
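+// Any rasSockets still pointing at this connection are finalized first and all of its queued outgoing messages
+// are freed.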
+static void rasConnTerminate(struct rasConnection* conn) {
+  int connIdx = conn - rasConns;
+
+  // Make sure there are no lingering rasSockets pointing to it.
+  for (int i = 0; i < nRasSockets; i++) {
+    struct rasSocket* sock = rasSockets+i;
+    if (sock->status != RAS_SOCK_CLOSED && sock->connIdx == connIdx)
+      rasSocketTerminate(sock, /*finalize*/true);
+  }
+
+  // Also check any ongoing collectives.
+  rasCollsPurgeConn(connIdx);
+
+  while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&conn->sendQ)) {
+    free(meta);
+  }
+
+  INFO(NCCL_RAS, "RAS terminating a connection with %s", ncclSocketToString(&conn->addr, rasLine));
+
+  conn->inUse = false;
+  conn->sockIdx = -1; // Should be that way already, but just to be extra sure...
+}
+
+
+///////////////////////////////////////////
+// Functions related to the RAS sockets. //
+///////////////////////////////////////////
+
+// Accepts a new RAS network socket connection. The socket is not usable until after the handshake, as a
+// corresponding rasConnection can't be established without knowing the peer's address.
+ncclResult_t rasNetAcceptNewSocket() {
+  ncclResult_t ret = ncclSuccess;
+  struct rasSocket* sock;
+  int ready;
+  bool socketInitialized = false;
+  NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail);
+
+  NCCLCHECKGOTO(ncclSocketInit(&sock->sock, nullptr, NCCL_SOCKET_MAGIC, ncclSocketTypeRasNetwork, nullptr,
+                               /*asyncFlag*/1), ret, fail);
+  socketInitialized = true;
+  NCCLCHECKGOTO(ncclSocketAccept(&sock->sock, &rasNetListeningSocket), ret, fail);
+  NCCLCHECKGOTO(ncclSocketReady(&sock->sock, &ready), ret, fail);
+
+  if (sock->sock.fd != -1) {
+    NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail);
+    rasPfds[sock->pfd].fd = sock->sock.fd;
+    rasPfds[sock->pfd].events = POLLIN; // Initially we'll just wait for a handshake from the other side. This also
+                                        // helps the code tell the sides apart.
+    sock->status = RAS_SOCK_CONNECTING;
+
+    INFO(NCCL_RAS, "RAS new incoming socket connection from %s", ncclSocketToString(&sock->sock.addr, rasLine));
+  }
+
+exit:
+  return ret;
+fail:
+  if (socketInitialized)
+    NCCLCHECK(ncclSocketClose(&sock->sock));
+  goto exit;
+}
+
+// Allocates an entry in the rasSockets array, enlarging the array if necessary.
+static ncclResult_t getNewSockEntry(struct rasSocket** pSock) {
+  struct rasSocket* sock;
+  int i;
+  for (i = 0; i < nRasSockets; i++)
+    if (rasSockets[i].status == RAS_SOCK_CLOSED)
+      break;
+  if (i == nRasSockets) {
+    NCCLCHECK(ncclRealloc(&rasSockets, nRasSockets, nRasSockets+RAS_INCREMENT));
+    nRasSockets += RAS_INCREMENT;
+  }
+
+  sock = rasSockets+i;
+  memset(sock, '\0', sizeof(*sock));
+  sock->pfd = -1;
+  sock->connIdx = -1;
+  sock->createTime = sock->lastSendTime = sock->lastRecvTime = clockNano();
+
+  *pSock = sock;
+  return ncclSuccess;
+}
+
+// Invoked from the main RAS event loop to handle RAS socket timeouts.
+void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) {
+  for (int sockIdx = 0; sockIdx < nRasSockets; sockIdx++) {
+    struct rasSocket* sock = rasSockets+sockIdx;
+
+    if (sock->status == RAS_SOCK_CLOSED)
+      continue;
+
+    // For socket connections that are still being established, give up on the ones that take too long to initialize.
+ if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) { + if (now - sock->createTime > RAS_STUCK_TIMEOUT) { + if (sock->connIdx == -1) { + INFO(NCCL_RAS, "RAS init timeout error (%lds) on incoming socket connection from %s", + (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); + } else { + struct rasConnection* conn = rasConns+sock->connIdx; + INFO(NCCL_RAS, "RAS init timeout error (%lds) on socket connection with %s " + "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", + (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine), + conn->experiencingDelays, (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0), + sock->status); + } + rasSocketTerminate(sock, /*finalize*/true); + // We may retry later. + continue; + } else { + *nextWakeup = std::min(*nextWakeup, sock->createTime+RAS_STUCK_TIMEOUT); + } + } // if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) + + // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long. + if (sock->status == RAS_SOCK_TERMINATING) { + if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_STUCK_TIMEOUT) { + INFO(NCCL_RAS, "RAS termination stuck timeout error (%lds) on socket connection with %s", + (now-std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/true); + // This socket is presumably already being re-established, if needed. + continue; + } else { + *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_STUCK_TIMEOUT); + } + } // if (sock->status == RAS_SOCK_TERMINATING) + + // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything + // important due to shorter timeouts on RAS network connections, but in case of weird situations like process + // suspend, rasSocketTerminate will do additional checking. + if (sock->status == RAS_SOCK_READY) { + if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_IDLE_TIMEOUT) { + INFO(NCCL_RAS, "RAS idle timeout (%lds) on socket connection with %s", + (now - std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/false, /*startRetryOffset*/0, /*retry*/false); + continue; + // The RAS network timeout handler will terminate the conn it was associated with, if any. + } else { + *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_IDLE_TIMEOUT); + } + } // if (sock->status == RAS_SOCK_READY) + } // for (sockIdx) +} + +// Handles the termination of a RAS socket. +// We try to do it in stages for established sockets (in READY state). We shut down just the sending side +// for them and change their state to TERMINATING, so that we can still receive data that may be in the buffers. +// Once we get an EOF when receiving data, we finalize the termination. +// For not fully established sockets, we can terminate immediately as there's no useful data to extract. 
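+// startRetryOffset lets the caller backdate the connection-level retry timer by the timeout that already expired
+// before the termination; retry == false skips re-establishing the connection unless it is part of a RAS network link.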
+void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) {
+  assert(sock->status != RAS_SOCK_CLOSED);
+  if (sock->connIdx != -1) {
+    struct rasConnection* conn = rasConns+sock->connIdx;
+    // If the sockIdx of the connection points back to us, it means that we are the current socket of this
+    // connection, so we have additional work to do before we can terminate it.
+    if (conn->sockIdx == sock-rasSockets) {
+      // Reset it to indicate there's no valid socket associated with that connection anymore.
+      conn->sockIdx = -1;
+
+      // Don't attempt to retry on sockets that have been unused for so long that the remote peer probably
+      // deliberately closed them. Make an exception for sockets that are part of the RAS network links.
+      if ((retry &&
+           clockNano() - std::max(sock->lastSendTime, sock->lastRecvTime) < RAS_IDLE_TIMEOUT - RAS_IDLE_GRACE_PERIOD) ||
+          rasLinkFindConn(&rasNextLink, sock->connIdx) != -1 || rasLinkFindConn(&rasPrevLink, sock->connIdx) != -1) {
+        // For connections that were fine until now, the connection-level timeout starts at termination, and possibly
+        // even earlier, depending on what event triggered the termination -- if it was another timeout expiring, then
+        // we need to include that timeout as well.
+        if (conn->startRetryTime == 0) {
+          conn->startRetryTime = conn->lastRetryTime = clockNano() - startRetryOffset;
+        }
+
+        // We also filter through the sendQ, eliminating any messages that won't need to be sent when the socket
+        // connection is re-established (that's essentially the server init and keep-alives).
+        // As ncclIntruQueue can't be iterated, we transfer the content in bulk to a temporary and then filter the
+        // messages as we move them back one-by-one.
+        struct ncclIntruQueue sendQTmp;
+        ncclIntruQueueConstruct(&sendQTmp);
+        ncclIntruQueueTransfer(&sendQTmp, &conn->sendQ);
+        while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&sendQTmp)) {
+          if (meta->msg.type != RAS_MSG_CONNINIT && meta->msg.type != RAS_MSG_CONNINITACK &&
+              meta->msg.type != RAS_MSG_KEEPALIVE) {
+            if (meta->offset != 0) {
+              // Reset the progress of any partially-sent messages (they will need to be resent from the beginning;
+              // in principle that could apply to the first message only).
+              meta->offset = 0;
+            }
+            ncclIntruQueueEnqueue(&conn->sendQ, meta);
+          } else { // RAS_MSG_CONNINIT || RAS_MSG_CONNINITACK || RAS_MSG_KEEPALIVE
+            free(meta);
+          }
+        } // while (meta)
+      } // if (retry)
+
+      // Stop collectives from waiting for a response over this connection.
+      rasCollsPurgeConn(sock->connIdx);
+    } // if (conn->sockIdx == sock-rasSockets)
+  } // if (sock->connIdx != -1)
+
+  if (sock->status != RAS_SOCK_CONNECTING && sock->connIdx != -1 && !finalize && (rasPfds[sock->pfd].events & POLLIN)) {
+    if (sock->status != RAS_SOCK_TERMINATING) {
+      // The receiving side is still open -- close just the sending side.
+      (void)ncclSocketShutdown(&sock->sock, SHUT_WR);
+      rasPfds[sock->pfd].events &= ~POLLOUT; // Nothing more to send.
+      // The timeout for this socket starts ticking now...
+      sock->lastSendTime = clockNano();
+      sock->status = RAS_SOCK_TERMINATING;
+    }
+    // Else it must be in RAS_SOCK_TERMINATING state already -- in that case we do nothing here and instead
+    // we wait for an EOF on the receiving side or for a timeout.
+  } else {
+    // Either the caller requested finalization or we cannot receive on it.
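+    // Close the socket immediately, release its poll array entry, and drop any partially received message.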
+ (void)ncclSocketClose(&sock->sock); + sock->status = RAS_SOCK_CLOSED; + rasPfds[sock->pfd].fd = -1; + rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0; + sock->pfd = sock->connIdx = -1; + sock->recvOffset = sock->recvLength = 0; + free(sock->recvMsg); + sock->recvMsg = nullptr; + } +} + +// Handles a ready socket FD from the main event loop. +void rasSockEventLoop(int sockIdx, int pollIdx) { + struct rasSocket* sock = rasSockets+sockIdx; + + if (sock->status == RAS_SOCK_CONNECTING) { + int ready; + // Socket is not yet fully established. Continue the OS or NCCL-level handshake. + if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) { + INFO(NCCL_RAS, "RAS unexpected error from ncclSocketReady; terminating the socket connection with %s", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + // We may retry further down. + } else { + if (ready) { + // We can tell the connect-side based on what events is set to. + bool connectSide = (rasPfds[pollIdx].events & POLLOUT); + (connectSide ? sock->lastSendTime : sock->lastRecvTime) = clockNano(); + sock->status = RAS_SOCK_HANDSHAKE; + if (connectSide) { + assert(sock->connIdx != -1); + if (rasConns[sock->connIdx].sockIdx == sockIdx) { + if (rasConnPrepare(rasConns+sock->connIdx) != ncclSuccess) { + INFO(NCCL_RAS, "RAS unexpected error from rasConnPrepare; terminating the socket connection with %s", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + // We may retry further down. + } + } else { + // The connection this socket is associated with no longer considers it to be the current one. + // This could possibly happen due to a race condition. Simply terminate it. + INFO(NCCL_RAS, "RAS connected with %s via a socket that's no longer current!", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + } + } // if (connectSide) + } else { // !ready + if (sock->sock.state == ncclSocketStateConnecting) + rasPfds[sock->pfd].fd = POLL_FD_IGNORE; // Don't poll on this socket before connect(). + } + } // if (ncclSocketReady) + } else { // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING. + // The extra test for TERMINATING is there to take care of a race when the handling of one socket + // results in another socket being terminated, but one that already has revents waiting from poll. + if (sock->status != RAS_SOCK_TERMINATING && (rasPfds[pollIdx].revents & POLLOUT)) { + int closed = 0; + bool allSent = false; + assert(sock->connIdx != -1); + struct rasConnection* conn = rasConns+sock->connIdx; + assert(conn->sockIdx == sockIdx); + if (rasConnSendMsg(conn, &closed, &allSent) != ncclSuccess) { + INFO(NCCL_RAS, "RAS unexpected error from rasConnSendMsg; terminating the socket connection with %s", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + // We may retry further down. + } else if (closed) { + INFO(NCCL_RAS, "RAS socket connection with %s closed by peer on send; terminating it", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock); + // We may retry further down. + } else { + sock->lastSendTime = clockNano(); + if (allSent) + rasPfds[sock->pfd].events &= ~POLLOUT; // Nothing more to send for now. 
+ } + } + if (rasPfds[pollIdx].revents & POLLIN) { + struct rasMsg* msg; + do { + int closed = 0; + msg = nullptr; + if (rasMsgRecv(sock, &msg, &closed) != ncclSuccess) { + INFO(NCCL_RAS, "RAS unexpected error from rasMsgRecv; terminating the socket connection with %s", + ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/true); + // We may retry further down. + } else if (closed) { + const char* socketType; + if (sock->connIdx == -1) + socketType = "incoming"; + else if (rasConns[sock->connIdx].sockIdx != sockIdx) + socketType = "old"; + else if (sock->status == RAS_SOCK_HANDSHAKE) + socketType = "new"; + else + socketType = "current"; + INFO(NCCL_RAS, "RAS %s socket connection with %s closed by peer on receive; terminating it", + socketType, ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/true); + // We may retry further down. + } else { + sock->lastRecvTime = clockNano(); + if (msg) { + (void)rasMsgHandle(msg, sock); + free(msg); + // Message handlers can terminate a socket in certain cases; we need to check for + // that here so that we don't try to receive from a closed socket. + // No handlers are currently believed to create new sockets but better to be safe than sorry + // and re-init the sock variable. + sock = rasSockets+sockIdx; + if (sock->status == RAS_SOCK_CLOSED) + break; + } + if (sock->connIdx != -1) { + struct rasConnection* conn = rasConns+sock->connIdx; + if (conn->sockIdx == sockIdx && (conn->startRetryTime || conn->experiencingDelays)) + rasConnResume(conn); + } + } + } while (msg); + } // if (POLLIN) + } // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING +} + + +//////////////////////////////////////////////////////////////// +// Functions related to the handling of RAS network timeouts. // +//////////////////////////////////////////////////////////////// + +// Invoked from the main RAS event loop to handle RAS network timeouts. +void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup) { + // A connection can belong to multiple links but, when it comes to various timeouts, we want to handle each + // connection just once. We solve that with a simple flag within a connection. This also allows us to distinguish + // connections that are part of a link from those that are not. + for (int connIdx = 0; connIdx < nRasConns; connIdx++) + rasConns[connIdx].linkFlag = false; + + (void)rasLinkHandleNetTimeouts(&rasNextLink, now, nextWakeup); + (void)rasLinkHandleNetTimeouts(&rasPrevLink, now, nextWakeup); + + for (int connIdx = 0; connIdx < nRasConns; connIdx++) { + struct rasConnection* conn = rasConns+connIdx; + if (conn->inUse && !conn->linkFlag) { + // The connection is not part of any link. Check if it should be terminated. + if (conn->sockIdx == -1 && ncclIntruQueueEmpty(&conn->sendQ)) { + rasConnTerminate(conn); + continue; + } + } + } +} + +// Checks for and handles timeouts at the link level; primarily the keep-alives for link connections. +static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup) { + for (int i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + if (linkConn->connIdx != -1) { + if (!rasConns[linkConn->connIdx].linkFlag) { + rasConnHandleNetTimeouts(linkConn->connIdx, now, nextWakeup); + // rasConns may have been reallocated by the above call, which is why we don't have a conn variable here. + // For the same reason we re-init linkConn. 
+ linkConn = link->conns+i; + rasConns[linkConn->connIdx].linkFlag = true; + } + } else if (i == 0 && link->lastUpdatePeersTime != 0) { + // This triggers when rasLinkReinitConns didn't create the primary connection because we have a higher address + // than the peer. If that peer fails to initiate within RAS_CONNECT_WARN, we need to take action. + if (now - link->lastUpdatePeersTime > RAS_CONNECT_WARN) { + INFO(NCCL_RAS, "RAS peer connect timeout warning (%lds) on socket connection from %s", + (now-link->lastUpdatePeersTime) / CLOCK_UNITS_PER_SEC, + ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx)); + if (linkConn->connIdx != -1) { + rasConns[linkConn->connIdx].linkFlag = true; + } + // We used to connect to the first fallback but I think trying to connect to the calculated primary first + // in this case is more intuitive. + //(void)rasLinkTryFallback(link, -1); + link->lastUpdatePeersTime = 0; + } else { + *nextWakeup = std::min(*nextWakeup, link->lastUpdatePeersTime+RAS_CONNECT_WARN); + } + } // if (i == 0 && link->lastUpdatePeerTime != 0) + } // for (i) + + return ncclSuccess; +} + +// Handles the sending of keep-alive messages and related timeouts for connections that are part of the RAS links. +static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup) { + struct rasConnection* conn = rasConns+connIdx; + if (conn->sockIdx != -1) { + struct rasSocket* sock = rasSockets+conn->sockIdx; + + if (sock->status == RAS_SOCK_READY) { + // Send a regular keep-alive message if we haven't sent anything in a while and we don't have anything queued. + if (ncclIntruQueueEmpty(&conn->sendQ)) { + if (now - sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) { + rasConnSendKeepAlive(conn); + } else { + *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_KEEPALIVE_INTERVAL); + } + } + + // For short timeouts print a warning but also pessimistically immediately try to establish fallback connections. + if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) { + if (!conn->experiencingDelays) { + INFO(NCCL_RAS, "RAS keep-alive timeout warning (%lds) on socket connection with %s", + (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); + + // At this point, it's mostly just a precaution; we will continue with the primary connection until + // RAS_PEER_DEAD_TIMEOUT expires. + conn->experiencingDelays = true; + (void)rasLinkAddFallback(&rasNextLink, connIdx); + (void)rasLinkAddFallback(&rasPrevLink, connIdx); + // rasConns and rasSockets may have been reallocated by the above calls. + conn = rasConns+connIdx; + sock = rasSockets+conn->sockIdx; + + // Stop collectives from waiting for a response over it. + rasCollsPurgeConn(connIdx); + } + } else { + *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN); + } + + // For long timeouts we need to act. + if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) { + INFO(NCCL_RAS, "RAS keep-alive timeout error (%lds) on socket connection with %s", + (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); + rasSocketTerminate(sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR); + *nextWakeup = now; // Retry will be in the next iteration of the main loop so ensure we don't wait. 
+      } else {
+        *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR);
+      }
+    } // if (sock->status == RAS_SOCK_READY)
+  } // if (conn->sockIdx != -1)
+}
+
+// Sends a keep-alive message to a peer on the RAS network.
+static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack) {
+  struct rasMsg* msg = nullptr;
+  int msgLen = rasMsgLength(RAS_MSG_KEEPALIVE);
+  if (rasMsgAlloc(&msg, msgLen) == ncclSuccess) {
+    int linkIdx;
+    msg->type = RAS_MSG_KEEPALIVE;
+    msg->keepAlive.peersHash = rasPeersHash;
+    msg->keepAlive.deadPeersHash = rasDeadPeersHash;
+    msg->keepAlive.nack = (nack ? 1 : 0);
+
+    linkIdx = rasLinkFindConn(&rasNextLink, conn-rasConns);
+    if (linkIdx != -1 && !rasNextLink.conns[linkIdx].external)
+      msg->keepAlive.linkMask |= 2; // Our rasNextLink should be the peer's rasPrevLink.
+    linkIdx = rasLinkFindConn(&rasPrevLink, conn-rasConns);
+    if (linkIdx != -1 && !rasPrevLink.conns[linkIdx].external)
+      msg->keepAlive.linkMask |= 1; // Our rasPrevLink should be the peer's rasNextLink.
+
+    (void)clock_gettime(CLOCK_REALTIME, &msg->keepAlive.realTime);
+
+    rasConnEnqueueMsg(conn, msg, msgLen);
+  }
+}
+
+// Handles incoming keep-alive messages.
+ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock) {
+  struct timespec currentTime;
+  int64_t travelTime;
+  int peerIdx;
+
+  assert(sock->connIdx != -1);
+  struct rasConnection* conn = rasConns+sock->connIdx;
+  SYSCHECK(clock_gettime(CLOCK_REALTIME, &currentTime), "clock_gettime");
+  travelTime = (currentTime.tv_sec-msg->keepAlive.realTime.tv_sec)*1000*1000*1000 +
+               (currentTime.tv_nsec-msg->keepAlive.realTime.tv_nsec);
+
+  if (msg->keepAlive.peersHash != conn->lastRecvPeersHash) {
+    conn->lastRecvPeersHash = msg->keepAlive.peersHash;
+  }
+  if (msg->keepAlive.deadPeersHash != conn->lastRecvDeadPeersHash) {
+    conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
+  }
+
+  // Make sure that the connection is part of the appropriate links forming the RAS network. In particular, this
+  // will add any externally-requested connections to the appropriate links (or remove existing ones, if no longer
+  // needed).
+  peerIdx = rasPeerFind(&conn->addr);
+  // Note: it's possible for peerIdx to be -1 at this point if, due to races, the keepAlive arrives before
+  // the peers update.
+  (void)rasLinkUpdateConn(&rasNextLink, (msg->keepAlive.linkMask & 1) ? sock->connIdx : -1, peerIdx, /*external*/true);
+  (void)rasLinkUpdateConn(&rasPrevLink, (msg->keepAlive.linkMask & 2) ? sock->connIdx : -1, peerIdx, /*external*/true);
+
+  // If the keep-alive message is from a peer that doesn't actually need this connection (i.e., for that peer the
+  // connection is just an external fallback), we should check if *we* still need it. It might be that we don't,
+  // and because we stopped sending the keep-alives, our peer doesn't know about it. rasLinkUpdateConn calls above
+  // will have wiped any external fallbacks, so anything that remains must be needed.
+  if (!msg->keepAlive.nack && msg->keepAlive.linkMask == 0) {
+    if (rasLinkFindConn(&rasNextLink, sock->connIdx) == -1 && rasLinkFindConn(&rasPrevLink, sock->connIdx) == -1) {
+      // We don't need this connection either. Notify the peer about it. To avoid an infinite loop, we set the
+      // special nack flag in the message to distinguish it from regular keep-alives.
+ rasConnSendKeepAlive(conn, /*nack*/true); + } + } + + if (conn->travelTimeMin > travelTime) + conn->travelTimeMin = travelTime; + if (conn->travelTimeMax < travelTime) + conn->travelTimeMax = travelTime; + conn->travelTimeSum += travelTime; + conn->travelTimeCount++; + + if (msg->keepAlive.peersHash != rasPeersHash || msg->keepAlive.deadPeersHash != rasDeadPeersHash) { + // This could happen due to a short-lived race condition between the peers propagation + // process and the periodic keep-alive messages (perhaps we'll see it regularly at scale?). + // Just in case there's some unforeseen problem with the peers propagation though, exchange with the + // remote to get everybody in sync. + INFO(NCCL_RAS, "RAS keepAlive hash mismatch from %s (peersHash 0x%lx, deadPeersHash 0x%lx)", + ncclSocketToString(&sock->sock.addr, rasLine), msg->keepAlive.peersHash, msg->keepAlive.deadPeersHash); + INFO(NCCL_RAS, "RAS my peersHash 0x%lx, deadPeersHash 0x%lx", rasPeersHash, rasDeadPeersHash); + NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers)); + } + return ncclSuccess; +} + + +/////////////////////////////////////////////////////////////////////////////// +// Functions related to the RAS links and recovery from connection failures. // +/////////////////////////////////////////////////////////////////////////////// + +// Checks if the connection (that we just detected some problem with) is part of the RAS link and if so, +// tries to initiate a(nother) fallback connection if needed. +// External connections are generally ignored by this whole process: in particular, we don't add fallbacks for +// timing out external connections. However, we will use an active external connection if it would be a better +// option than whatever we can come up with. +static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) { + int peerIdx = -1; + int linkIdx = -1; + int firstExtLinkIdx = -1; + int newPeerIdx; + + // First check if the connection is part of this link. In the process also check if any of the link's connections + // might be active -- if so, there's no need to initiate any more fallbacks and we can bail out. + for (int i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + + if (linkConn->peerIdx == -1) { + // Such elements are always at the very end of the array and we can't use them so we can just as well break. + break; + } + + // Check for any other connection that might be a viable fallback (basically, anything that is not experiencing + // delays). + if (linkConn->connIdx != -1 && linkConn->connIdx != connIdx) { + struct rasConnection* conn = rasConns+linkConn->connIdx; + if (!conn->experiencingDelays) { + if (!linkConn->external) + goto exit; // We don't need to do anything if there's a non-external connection. + else if (linkConn->peerIdx != -1) { + // Record the location of the first potentially viable external connection in the chain; we may prefer it + // over anything we can come up with. + if (firstExtLinkIdx == -1) + firstExtLinkIdx = i; + if (linkIdx != -1) + break; // Break out of the loop if we already have all the data we might need. + } // linkConn->external && linkConn->peerIdx != -1 + } // if (!conn->experiencingDelays) + } // if (linkConn->connIdx != -1) + + if (linkConn->connIdx == connIdx) { + if (linkConn->external) + goto exit; // We don't add fallbacks for external connections... 
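+      // Remember where the failing connection sits within the link; the fallback peer is calculated relative
+      // to that position.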
+ peerIdx = linkConn->peerIdx; + linkIdx = i; + // We are not breaking out of the loop here because we want to check for active connections on *all* potentially + // viable elements (in particular, there could be some external ones beyond this one). + } + } + + if (linkIdx == -1) + goto exit; + + // We found an existing element so the connection is part of the link. No existing non-external connections of this + // link are active, so a fallback is needed. + assert(peerIdx != -1); + newPeerIdx = rasLinkCalculatePeer(link, peerIdx, /*isFallback*/linkIdx > 0); + // In principle we want to add (at most) one fallback. However, if the found fallback connection already exists + // and is also experiencing delays, we need to keep iterating. + while (newPeerIdx != -1) { + int newConnIdx = rasConnFind(&rasPeers[newPeerIdx].addr); + // If we previously found a potential external fallback connection, check if it's better than what we just found. + if (firstExtLinkIdx != -1) { + linkIdx = -1; + // Calculate the index that the newly found fallback would have (pretend mode). + NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/true, + &linkIdx)); + assert(linkIdx != -1); + if (firstExtLinkIdx < linkIdx) { + // The external connection *is* better -- use it as a fallback instead and be done. + link->conns[firstExtLinkIdx].external = false; + goto exit; + } + } + NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/false, + &linkIdx)); + if (firstExtLinkIdx != -1 && linkIdx <= firstExtLinkIdx) + firstExtLinkIdx++; // Adjust if we inserted a new conn at a lower index. + + INFO(NCCL_RAS, "RAS link %d: %s fallback connection %d with %s", + link->direction, (newConnIdx == -1 ? "opening new" : "calculated existing"), + linkIdx, ncclSocketToString(&rasPeers[newPeerIdx].addr, rasLine)); + // Note that we don't follow here our convention of "lower address is the one establishing connections" -- + // that convention is for optimizing regular operations, but we don't want to take chances during fault + // recovery. It may temporarily result in duplicate connections, but we have a mechanism to deal with those. + if (newConnIdx == -1) + NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &link->conns[linkIdx].connIdx)); + + struct rasConnection* conn = rasConns+link->conns[linkIdx].connIdx; + // If the fallback connection is also experiencing delays, we need to keep trying. + if (!conn->experiencingDelays) + break; + INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", + conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), + (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + + newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/true); + } + if (newPeerIdx == -1) + INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); +exit: + return ncclSuccess; +} + +// Invoked when we receive a message over a connection that was just activated or was experiencing delays. +// Cleans up the fallbacks, timers, etc, as appropriate. +static void rasConnResume(struct rasConnection* conn) { + if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + INFO(NCCL_RAS, "RAS %s connection with %s (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", + (conn->experiencingDelays && conn->startRetryTime == 0 ? 
"recovered" : "established"), + ncclSocketToString(&conn->addr, rasLine), (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), + conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0)); + + conn->experiencingDelays = false; + + conn->startRetryTime = conn->lastRetryTime = 0; + + rasLinkSanitizeFallbacks(&rasNextLink); + rasLinkSanitizeFallbacks(&rasPrevLink); + + if (!ncclIntruQueueEmpty(&conn->sendQ)) + rasPfds[rasSockets[conn->sockIdx].pfd].events |= POLLOUT; + } +} + +// Checks if the primary connection is fully established and if so, purges the fallbacks (as they are no longer needed). +static void rasLinkSanitizeFallbacks(struct rasLink* link) { + if (link->nConns > 0 && link->conns[0].connIdx != -1) { + struct rasConnection* conn = rasConns+link->conns[0].connIdx; + if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) { + // We have a good primary. Simply drop all the fallbacks (the external ones will get recreated via the + // keepAlive messages). + for (int i = 1; i < link->nConns; i++) { + INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", + link->direction, (link->conns[i].external ? "external " : ""), i, + ncclSocketToString(&rasConns[link->conns[i].connIdx].addr, rasLine)); + } + link->nConns = 1; + link->lastUpdatePeersTime = 0; + } + } +} + +// Attempt to drop a connection from a link. +static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx) { + if (linkIdx == -1) + linkIdx = rasLinkFindConn(link, connIdx); + if (linkIdx != -1) { + if (linkIdx == 0) { + INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s", + link->direction, ncclSocketToString(&rasConns[connIdx].addr, rasLine)); + } else { + INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", + link->direction, (link->conns[linkIdx].external ? "external " : ""), linkIdx, + ncclSocketToString(&rasConns[connIdx].addr, rasLine)); + } + memmove(link->conns+linkIdx, link->conns+linkIdx+1, (link->nConns-(linkIdx+1))*sizeof(*link->conns)); + if (link->nConns > 1) + link->nConns--; + else { + link->conns[0].peerIdx = link->conns[0].connIdx = -1; + } + + if (linkIdx == 0) { + // First ensure that the conn becoming the primary is not marked as external (we don't want to lose it if + // the remote peer loses interest in it). + link->conns[0].external = false; + if (link->conns[0].connIdx != -1) { + INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary", + link->direction, ncclSocketToString(&rasConns[link->conns[0].connIdx].addr, rasLine)); + } + rasLinkSanitizeFallbacks(link); + } + } +} + +// Checks if a given connection is a member of this link and if so, returns its entry index. +// Returns -1 if connection not found. +static int rasLinkFindConn(const struct rasLink* link, int connIdx) { + for (int i = 0; i < link->nConns; i++) { + if (link->conns[i].connIdx == connIdx) + return i; + } + return -1; +} + +// Note: the behavior of this function has become super-complex and so it should be considered for refactoring. +// Searches for and updates an entry in a RAS network link. The conns array is de-facto sorted by peerIdx: it is +// ordered by preference, though peerIdx values can wrap around (given the ring/torus topology) and they can also +// be -1 (the latter are stored at the end). +// external provides an updated value for the entry's external field. 
A false value, if requested, is always set; +// a true value, however, is only set if a new entry is added (external == true implies insert), i.e., if an entry +// already exists and the function is invoked with external == true, the new value will be ignored. +// If insert is set, it will, if necessary, insert a new entry if one is not already there. +// If pretend is set, it will not modify the array and will just set *pLinkIdx as appropriate. +// pLinkIdx is a pointer to an (optional) result where the index of the added/updated entry is stored. +// -1 can be passed as peerIdx if unknown (possible in case of race conditions, and only if external). +// -1 can be passed as connIdx if unknown or, if insert is *not* set, to indicate that the entry is to be removed +// (the entry's external must match the argument external for it to be removed). +ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external, bool insert, + bool pretend, int* pLinkIdx) { + int i, oldLinkIdx = -1; + + if (external && connIdx != -1) + insert = true; + + if (connIdx != -1) { + // Start by checking if we already have an element with this connIdx. + oldLinkIdx = rasLinkFindConn(link, connIdx); + if (oldLinkIdx != -1) { + struct rasLinkConn* linkConn = link->conns+oldLinkIdx; + if (linkConn->peerIdx != -1) + assert(linkConn->peerIdx == peerIdx); + + if (linkConn->peerIdx == peerIdx) { + if (!external && !pretend) + linkConn->external = false; // Ensure that external is cleared if so requested. + if (pLinkIdx) + *pLinkIdx = oldLinkIdx; + goto exit; // Nothing more to do if both connIdx and peerIdx are up to date. + } + + // Otherwise (linkConn->peerIdx == -1 && peerIdx != -1) we have a conn that, due to -1 peerIdx, is in a wrong + // place in the array -- we need to find the right spot. linkConn->peerIdx == -1 can only happen for external + // connections. + assert(external); + } + } + + if (peerIdx != -1) { + // Search for the right spot in the conns array. + for (i = 0; i < link->nConns; i++) { + struct rasLinkConn* linkConn = link->conns+i; + if (peerIdx != -1 && linkConn->peerIdx == peerIdx) { + // The exact conn element already exists. + if (connIdx == -1 && !insert) { + // Drop the connection from the link. + if (linkConn->external == external) { + if (!pretend) + rasLinkDropConn(link, linkConn->connIdx, i); + else if (pLinkIdx) + *pLinkIdx = i; + } + } else { // connIdx != -1 || insert + if (!pretend) { + if (linkConn->connIdx != -1) + assert(linkConn->connIdx == connIdx); + else + linkConn->connIdx = connIdx; + if (!external) + linkConn->external = false; // Ensure that external is cleared if so requested. + if (i == 0) { + // We received a connection from the remote peer that matches the primary connection we've been + // waiting for. + rasLinkSanitizeFallbacks(link); + } + } // if (!pretend) + if (pLinkIdx) + *pLinkIdx = i; + } // connIdx != -1 || insert + + goto exit; + } // if (peerIdx != -1 && linkConn->peerIdx == peerIdx) + if (!insert) + continue; + // Ensure that the i-1 index is also valid. + if (i == 0) + continue; + // Conns with peerIdx == -1 are stored at the end, so anything else needs to go before them. + if (peerIdx != -1 && linkConn->peerIdx == -1) + break; + // Detect a roll-over and handle it specially. 
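+      // (the peerIdx ordering wraps around between conns[i-1] and conns[i]; the new peerIdx belongs here if it
+      // falls after conns[i-1] or before conns[i] in the link's direction).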
+ if (link->direction * (link->conns[i-1].peerIdx - linkConn->peerIdx) > 0) { + if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 || + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; + } else { // Regular, monotonic case with the peerIdx value between two existing elements. + if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 && + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; + } + } // for (i) + } else { + // If peerIdx == -1, insert the new element at the very end. This can only happen for external connections. + assert(external && oldLinkIdx == -1); + i = link->nConns; + } + if (!insert) + goto exit; + + // i holds the index at which to insert a new element. + if (pretend) { + if (pLinkIdx) + *pLinkIdx = i; + goto exit; + } + + if (oldLinkIdx == -1) { + struct rasLinkConn* linkConn; + if (link->nConns == link->connsSize) { + NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); + link->connsSize += RAS_INCREMENT; + } + linkConn = link->conns+i; + // Shift existing conns with indices >= i to make room for the new one. + memmove(linkConn+1, linkConn, (link->nConns-i)*sizeof(*link->conns)); + linkConn->peerIdx = peerIdx; + linkConn->connIdx = connIdx; + linkConn->external = external; + if (external) { + INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i, + ncclSocketToString((connIdx != -1 ? &rasConns[connIdx].addr : &rasPeers[peerIdx].addr), rasLine)); + } + link->nConns++; + } + else { // oldLinkIdx > -1 + // We already have the conn, we just need to move it to a new spot. + struct rasLinkConn* linkConn = link->conns+i; + assert(i <= oldLinkIdx); // We can only get here if linkConn->peerIdx == -1 && peerIdx != -1. + if (i != oldLinkIdx) { + struct rasLinkConn tmp; + struct rasLinkConn* linkConnNext = link->conns+i+1; // Just to silence the compiler. + // Move the existing conn from index oldLinkIdx to a (lower) index i, shifting the existing conns + // with indices in the range [i, oldLinkIdx). + memcpy(&tmp, link->conns+oldLinkIdx, sizeof(tmp)); + memmove(linkConnNext, linkConn, (oldLinkIdx-i)*sizeof(*linkConn)); + memcpy(linkConn, &tmp, sizeof(*linkConn)); + } + if (!external) + linkConn->external = false; // Ensure that external is cleared if so requested. + } // oldLinkIdx > -1 + if (pLinkIdx) + *pLinkIdx = i; +exit: + return ncclSuccess; +} diff --git a/src/register.cc b/src/register.cc deleted file mode 100644 index c4ca4b4a0..000000000 --- a/src/register.cc +++ /dev/null @@ -1,204 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "argcheck.h" // Need some checks here since we access comm -#include "nccl.h" -#include "comm.h" -#include "net.h" -#include "register.h" -#include "transport.h" - -ncclResult_t ncclNetDeregister(struct ncclComm* comm, struct ncclReg* reg) { - struct ncclRegCache* cache = &comm->regCache; - ncclDebugNoWarn = NCCL_NET; - for (int d=0; dnDevs; d++) { - if (reg->handles[d] != NULL) NCCLCHECK(comm->ncclNet->deregMr(cache->sComms[reg->devs[d]], reg->handles[d])); - } - reg->nDevs = 0; - free(reg->handles); - reg->handles = NULL; - ncclDebugNoWarn = 0; - return ncclSuccess; -} - -ncclResult_t ncclNetRegister(struct ncclComm* comm, void* addr, size_t size, struct ncclReg* reg) { - struct ncclRegCache* cache = &comm->regCache; - int netCount = 0; - if (comm->topo != NULL) NCCLCHECK(ncclTopoGetNetCount(comm->topo, &netCount)); - if (netCount == 0) return ncclSuccess; - - ncclResult_t ret = ncclSuccess; - - // Find local devices for p2p operations - for (int c=0; cp2pnChannels; c++) { - int dev; - if (ncclTopoGetLocalNet(comm->topo, comm->rank, c, NULL, &dev) != ncclSuccess) goto end; // No local net - ncclNetProperties_t props; - NCCLCHECKGOTO(comm->ncclNet->getProperties(dev, &props), ret, end); - if (props.regIsGlobal == 0) { // We need to be sure all NICs support global registration. - reg->nDevs = 0; - break; - } - int found = 0; - for (int d=0; dnDevs; d++) if (reg->devs[d] == dev) found = 1; - if (!found) reg->devs[reg->nDevs++] = dev; - } - - NCCLCHECKGOTO(ncclCalloc(®->handles, reg->nDevs), ret, end); - - ncclDebugNoWarn = NCCL_NET; - for (int d=0; dnDevs; d++) { - int dev = reg->devs[d]; - reg->handles[d] = NULL; - - if (cache->sComms[dev] == NULL) { - // Create a loopback network comm object for that device to register the buffers. 
- void *lComm = NULL; - ncclNetHandle_t netHandle; - bool connected = false; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &netHandle, &lComm), ret, end); - while (!connected) { - if (*comm->abortFlag) { - goto end; - } - if (cache->sComms[dev] == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, &netHandle, cache->sComms+dev, NULL), ret, end); - if (cache->rComms[dev] == NULL) - NCCLCHECKGOTO(comm->ncclNet->accept(lComm, cache->rComms+dev, NULL), ret, end); - connected = (cache->rComms[dev] != NULL) && (cache->sComms[dev] != NULL); - } - NCCLCHECK(comm->ncclNet->closeListen(lComm)); - } - if (comm->ncclNet->regMr(cache->sComms[dev], addr, size, NCCL_PTR_CUDA, reg->handles+d) != ncclSuccess) { - reg->handles[d] = NULL; - NCCLCHECK(ncclNetDeregister(comm, reg)); - reg->nDevs = 0; - goto end; - } - } -end: - INFO(NCCL_INIT, "Register ptr %p size %ld on %d net devices", addr, size, reg->nDevs); - ncclDebugNoWarn = 0; - if (ret != ncclSuccess) NCCLCHECK(ncclNetDeregister(comm, reg)); - return ret; -} - -ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) { - struct ncclRegCache* cache = &comm->regCache; - uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)data & -pageSize; - size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; - - *reg = NULL; - for (int slot=0; /*true*/; slot++) { - if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess; - if ((addr >= cache->slots[slot]->addr) && - ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { - *reg = cache->slots[slot]; - return ncclSuccess; - } - } -} -NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); - -ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, void** handle) { - if (!ncclParamLocalRegister()) { - *handle = NULL; - return ncclSuccess; - } - INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size); - struct ncclRegCache* cache = &comm->regCache; - uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)data & -pageSize; - size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; - for (int slot=0; /*true*/; slot++) { - if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) { - if (cache->population == cache->capacity) { // must grow cache - cache->capacity = cache->capacity < 32 ? 
32 : 2*cache->capacity; - NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity)); - } - memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*)); - NCCLCHECK(ncclCalloc(cache->slots+slot, 1)); - struct ncclReg* regSlot = cache->slots[slot]; - regSlot->addr = addr; - regSlot->pages = pages; - regSlot->refs = 1; - NCCLCHECK(ncclNetRegister(comm, (void*)addr, pages*pageSize, regSlot)); - regSlot->state |= NET_REG_COMPLETE; - cache->population += 1; - *handle = regSlot; - return ncclSuccess; - } else if ((addr >= cache->slots[slot]->addr) && - ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { - cache->slots[slot]->refs++; - *handle = cache->slots[slot]; - return ncclSuccess; - } - } -} - -ncclResult_t ncclRegCleanup(struct ncclComm* comm) { - struct ncclRegCache* cache = &comm->regCache; - for (int i=0; ipopulation; i++) { - INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)cache->slots[i]->addr, cache->slots[i]->pages); - NCCLCHECK(ncclNetDeregister(comm, cache->slots[i])); - if (cache->slots[i]->state & NVLS_REG_COMPLETE) NCCLCHECK(ncclNvlsDeregBuffer(&cache->slots[i]->mcHandle, cache->slots[i]->regAddr, cache->slots[i]->dev, cache->slots[i]->regSize)); - free(cache->slots[i]); - } - free(cache->slots); - for (int d=0; dsComms[d]) NCCLCHECK(comm->ncclNet->closeSend(cache->sComms[d])); - if (cache->rComms[d]) NCCLCHECK(comm->ncclNet->closeRecv(cache->rComms[d])); - } - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle); -ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { - NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); - if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(buff, comm, "buff", "ncclCommRegister")); - NCCLCHECK(ncclRegister(comm, buff, size, handle)); - return ncclSuccess; -} - -NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle); -ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) { - NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); - struct ncclReg* reg = (struct ncclReg*)handle; - struct ncclRegCache* cache = &comm->regCache; - int slot; - int saveDev; - if (handle == NULL) goto exit; - CUDACHECK(cudaGetDevice(&saveDev)); - CUDACHECK(cudaSetDevice(comm->cudaDev)); - for (slot=0; slotpopulation && cache->slots[slot] != reg; slot++); - if (slot == cache->population) { - WARN("Deregister: Could not find handle"); - return ncclInvalidUsage; - } - if (--reg->refs) return ncclSuccess; - NCCLCHECK(ncclNetDeregister(comm, reg)); - if (reg->state & NVLS_REG_COMPLETE) { - NCCLCHECK(ncclNvlsDeregBuffer(®->mcHandle, reg->regAddr, reg->dev, reg->regSize)); - reg->regAddr = (CUdeviceptr)NULL; - } - if (reg->state & COLLNET_REG_COMPLETE) { - NCCLCHECK(ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle)); - } - if (reg->state & IPC_REG_COMPLETE) { - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i) - if (reg->ipcInfos[i]) - NCCLCHECK(ncclIpcDeregBuffer(comm, reg->ipcInfos[i])); - if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs); - if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs)); - } - free(reg); - memmove(cache->slots+slot, cache->slots+slot+1, (cache->population-slot-1)*sizeof(struct ncclReg*)); - cache->population -= 1; - CUDACHECK(cudaSetDevice(saveDev)); -exit: - return ncclSuccess; -} diff --git a/src/register/coll_reg.cc 
b/src/register/coll_reg.cc new file mode 100644 index 000000000..4282dc9c8 --- /dev/null +++ b/src/register/coll_reg.cc @@ -0,0 +1,446 @@ +#include "register.h" +#include "transport.h" +#include "enqueue.h" + +static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) { + if (conn->connected) { + if (conn->conn.flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) { + *needReg = true; + } else { + // network connection + *needReg = false; + } + } else { + struct ncclPeerInfo* peerInfo = &comm->peerInfo[peer]; + struct ncclPeerInfo* myInfo = &comm->peerInfo[comm->rank]; + int canConnect = 0; + NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, graph, myInfo, peerInfo)); + if (canConnect) { + *needReg = true; + } else { + *needReg = false; + } + } + return ncclSuccess; +} + +ncclResult_t ncclRegisterCollNvlsBuffers( + struct ncclComm* comm, struct ncclTaskColl* info, + void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], + void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], + struct ncclIntruQueue* cleanupQueue, + bool* regNeedConnect + ) { + ncclResult_t result = ncclSuccess; + + info->regBufType = NCCL_REGULAR_BUFFER; + *regNeedConnect = true; + if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit; +#if CUDART_VERSION >= 11030 + if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { + if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit; + int nvlsReged = 0; + int collnetReged = 0; + const void *sendbuff = info->sendbuff; + void *recvbuff = info->recvbuff; + void *recvHandle = NULL, *sendHandle = NULL; + if (info->func == ncclFuncAllGather) sendbuff = NULL; + if (info->func == ncclFuncReduceScatter) recvbuff = NULL; + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); + + /* first try graph registration. */ + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts); + } + + if (nvlsReged == 0 && ncclParamLocalRegister()) { + ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv); + } + + if (nvlsReged && comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + } + + if (collnetReged == 0 && ncclParamLocalRegister()) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle); + if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + } + } + + if (nvlsReged) { + *regNeedConnect = 0; + /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to + * saturate bandwidth. 
*/ + if (comm->nNodes == 1) { + if (info->func == ncclFuncReduceScatter) + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); + else + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + } else { + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + } + info->regBufType |= NCCL_NVLS_REG_BUFFER; + } + + if (collnetReged) { + info->regBufType |= NCCL_NET_REG_BUFFER; + info->sendMhandle = sendHandle; + info->recvMhandle = recvHandle; + } + } +exit: +#endif + return result; +} + +ncclResult_t ncclRegisterCollBuffers( + struct ncclComm* comm, struct ncclTaskColl* info, + void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], + void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], + struct ncclIntruQueue* cleanupQueue, + bool* regNeedConnect + ) { + ncclResult_t result = ncclSuccess; + + info->regBufType = NCCL_REGULAR_BUFFER; + *regNeedConnect = true; + if (!(ncclParamLocalRegister() || (comm->planner.persistent && ncclParamGraphRegister()))) goto exit; +#if CUDART_VERSION >= 11030 + if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { + /* this part of nvls reg code is temporarily not used and obsolete. */ + if (!comm->nvlsRegSupport || info->opDev.op == ncclDevPreMulSum) goto exit; + int nvlsReged = 0; + int collnetReged = 0; + const void *sendbuff = info->sendbuff; + void *recvbuff = info->recvbuff; + void *recvHandle = NULL, *sendHandle = NULL; + if (info->func == ncclFuncAllGather) sendbuff = NULL; + if (info->func == ncclFuncReduceScatter) recvbuff = NULL; + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); + + /* first try local registration. */ + if (ncclParamLocalRegister()) { + ncclNvlsLocalRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv); + } + + if (nvlsReged == 0 && comm->planner.persistent && ncclParamGraphRegister()) { + ncclNvlsGraphRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, &nvlsReged, outRegBufSend, outRegBufRecv, cleanupQueue, &info->nCleanupQueueElts); + } + + if (comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) { + if (ncclParamLocalRegister()) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle); + if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + } + + if (collnetReged == 0 && comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + } + } + + if (nvlsReged) { + *regNeedConnect = 0; + /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to + * saturate bandwidth. 
*/ + if (comm->nNodes == 1) { + if (info->func == ncclFuncReduceScatter) + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); + else + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + } else { + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + } + info->regBufType |= NCCL_NVLS_REG_BUFFER; + } + + if (collnetReged) { + info->regBufType |= NCCL_NET_REG_BUFFER; + info->sendMhandle = sendHandle; + info->recvMhandle = recvHandle; + } + } else if (info->protocol == NCCL_PROTO_SIMPLE) { + // IPC buffer registration + if (info->func == ncclFuncReduceScatter && info->algorithm != NCCL_ALGO_COLLNET_DIRECT) goto exit; + if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit; + if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit; + if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit; + + int peerRanks[NCCL_MAX_LOCAL_RANKS]; + int nPeers = 0; + size_t elementSize = ncclTypeSize(info->datatype); + size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); + int regBufFlag = 0; + memset(peerRanks, 0xff, sizeof(int) * NCCL_MAX_LOCAL_RANKS); + + if (info->algorithm == NCCL_ALGO_COLLNET_DIRECT) { + struct ncclChannel* channel = comm->channels; + int ipcRegFlag = 0, netSendRegFlag = 0, netRecvRegFlag = 0; + void *sendHandle, *recvHandle; + if (info->func != ncclFuncReduceScatter && comm->intraNodeP2pSupport) { + for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) { + for (int down = 0; down < 2; ++down) { + int peer = down ? 
channel->collnetDirect.down[r] : channel->collnetDirect.up[r]; + if (peer != -1) { + struct ncclConnector* peerConn = &channel->peers[peer]->recv[0]; + bool needReg = false; + + NCCLCHECK(registerCheckP2PConnection(comm, peerConn, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], peer, &needReg)); + if (needReg) { + bool found = false; + for (int p = 0; p < nPeers; ++p) { + if (peerRanks[p] == peer) { + found = true; + break; + } + } + if (!found) peerRanks[nPeers++] = peer; + } + } + } + } + + if (nPeers > 0) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + if (ipcRegFlag) ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + if (!ipcRegFlag && ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->sendbuffOffset, &info->sendbuffRmtAddrs); + if (ipcRegFlag) ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, &ipcRegFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); + } + } + if (ipcRegFlag) { + info->regBufType |= NCCL_IPC_REG_BUFFER; + } + } + + // register collnet buffer + if (info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv && !(info->func == ncclFuncAllReduce && !comm->isOneRPN)) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + info->sendMhandle = sendHandle; + if (netSendRegFlag) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + info->recvMhandle = recvHandle; + } + } + + if ((netSendRegFlag == 0 || netRecvRegFlag == 0) && ncclParamLocalRegister()) { + if (!netSendRegFlag) { + ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle); + info->sendMhandle = sendHandle; + } + if (netSendRegFlag && !netRecvRegFlag) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle); + info->recvMhandle = recvHandle; + } + } + } + + if (netSendRegFlag && netRecvRegFlag) { + if (comm->isOneRPN) info->nMaxChannels = 1; + info->regBufType |= NCCL_NET_REG_BUFFER; + } + } else if (info->algorithm == NCCL_ALGO_RING) { + struct ncclReg* recvRegRecord = NULL; + struct ncclReg* sendRegRecord = NULL; + int sendNetPeers = comm->nChannels; + int recvNetPeers = comm->nChannels; + struct ncclConnector** sendNetConns = NULL; + struct ncclConnector** recvNetConns = NULL; + void** sendNetHandles = NULL; + void** recvNetHandles = NULL; + void** srecvNetHandles = NULL; + bool hasRecvNetPeer = false; + bool hasSendNetPeer = false; + + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); + if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit; + NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, &sendRegRecord)); + if (sendRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit; + 
NCCLCHECK(ncclCalloc(&sendNetConns, comm->nChannels)); + NCCLCHECK(ncclCalloc(&sendNetHandles, comm->nChannels)); + NCCLCHECK(ncclCalloc(&recvNetConns, comm->nChannels)); + NCCLCHECK(ncclCalloc(&recvNetHandles, comm->nChannels)); + NCCLCHECK(ncclCalloc(&srecvNetHandles, comm->nChannels)); + + for (int c = 0; c < comm->nChannels; ++c) { + struct ncclChannel* channel = comm->channels + c; + for (int r = 0; r < 2; ++r) { + int peer; + struct ncclConnector* peerConn; + if (r == 0) { + peer = channel->ring.prev; + peerConn = &channel->peers[peer]->recv[0]; + if (peerConn->conn.flags & NCCL_DIRECT_NIC) { + recvNetConns[c] = peerConn; + hasRecvNetPeer = true; + } + } else { + peer = channel->ring.next; + peerConn = &channel->peers[peer]->send[0]; + if (peerConn->conn.flags & NCCL_DIRECT_NIC) { + sendNetConns[c] = peerConn; + hasSendNetPeer = true; + } + } + if (peerConn->conn.flags & (NCCL_P2P_READ | NCCL_P2P_WRITE)) { + bool found = false; + for (int p = 0; p < nPeers; ++p) { + if (peerRanks[p] == peer) { + found = true; + break; + } + } + if (!found) peerRanks[nPeers++] = peer; + } + } + } + if (nPeers > 0 && comm->intraNodeP2pSupport) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + if (!regBufFlag && ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); + } + } + if (regBufFlag) { + info->regBufType = NCCL_IPC_REG_BUFFER; + } + + // start net registration + regBufFlag = 0; + if (!comm->useNetPXN && comm->useGdr && comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + if (hasSendNetPeer) { + ncclNetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, sendNetConns, sendNetPeers, ®BufFlag, sendNetHandles, cleanupQueue, &info->nCleanupQueueElts); + if (regBufFlag) + ncclNetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, sendNetConns, sendNetPeers, ®BufFlag, srecvNetHandles, cleanupQueue, &info->nCleanupQueueElts); + } + if ((regBufFlag || !hasSendNetPeer) && hasRecvNetPeer) + ncclNetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, recvNetConns, recvNetPeers, ®BufFlag, recvNetHandles, cleanupQueue, &info->nCleanupQueueElts); + } + if (!regBufFlag && ncclParamLocalRegister()) { + if (hasSendNetPeer) { + ncclNetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, sendNetConns, sendNetPeers, ®BufFlag, sendNetHandles); + if (regBufFlag) + ncclNetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, sendNetConns, sendNetPeers, ®BufFlag, srecvNetHandles); + } + if ((regBufFlag || !hasSendNetPeer) && hasRecvNetPeer) + ncclNetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, recvNetConns, recvNetPeers, ®BufFlag, recvNetHandles); + } + } + + if (regBufFlag) { + info->regBufType |= NCCL_NET_REG_BUFFER; + info->sendNetHandles = sendNetHandles; + info->recvNetHandles = recvNetHandles; + info->srecvNetHandles = srecvNetHandles; + if (comm->isOneRPN && (info->func == ncclFuncAllGather || info->func == ncclFuncBroadcast)) { + info->nMaxChannels = 1; + } + } else { + free(sendNetHandles); + free(recvNetHandles); + free(srecvNetHandles); + } + + free(sendNetConns); + free(recvNetConns); + } else if (info->algorithm == NCCL_ALGO_TREE || info->algorithm == 
NCCL_ALGO_COLLNET_CHAIN) { + struct ncclReg* recvRegRecord; + int netSendRegFlag = 0, netRecvRegFlag = 0; + void *sendHandle, *recvHandle; + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); + if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit; + if (comm->intraNodeP2pSupport) { + for (int c = 0; c < comm->nChannels; ++c) { + struct ncclChannel* channel = comm->channels + c; + struct ncclTree* tree = NULL; + int peers[NCCL_MAX_TREE_ARITY + 1]; + + if (info->algorithm == NCCL_ALGO_TREE) + tree = &channel->tree; + else + tree = &channel->collnetChain; + for (int p = 0; p < NCCL_MAX_TREE_ARITY; ++p) peers[p] = tree->down[p]; + peers[NCCL_MAX_TREE_ARITY] = tree->up; + for (int p = 0; p < NCCL_MAX_TREE_ARITY + 1; ++p) { + int peer = peers[p]; + bool peerNeedReg = false; + struct ncclConnector* recvConn = NULL; + // P2P transport + if (peer == -1 || peer == comm->nRanks) continue; + recvConn = &channel->peers[peer]->recv[0]; + NCCLCHECK(registerCheckP2PConnection(comm, recvConn, &comm->graphs[info->algorithm], peer, &peerNeedReg)); + + if (peerNeedReg) { + bool found = false; + for (int pindex = 0; pindex < nPeers; ++pindex) { + if (peerRanks[pindex] == peer) { + found = true; + break; + } + } + if (!found) peerRanks[nPeers++] = peer; + } + } + } + if (nPeers > 0) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); + } + if (!regBufFlag && ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs); + } + } + if (regBufFlag) { + info->regBufType = NCCL_IPC_REG_BUFFER; + } + } + + // register collnet chain 1RPN buffer + if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN && info->opDev.op != ncclDevPreMulSum && info->opDev.op != ncclDevSumPostDiv && comm->isOneRPN) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + info->sendMhandle = sendHandle; + if (netSendRegFlag) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + info->recvMhandle = recvHandle; + } + } + + if ((netSendRegFlag == 0 || netRecvRegFlag == 0) && ncclParamLocalRegister()) { + if (!netSendRegFlag) { + ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &netSendRegFlag, &sendHandle); + info->sendMhandle = sendHandle; + } + if (netSendRegFlag && !netRecvRegFlag) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &netRecvRegFlag, &recvHandle); + info->recvMhandle = recvHandle; + } + } + } + + if (netSendRegFlag && netRecvRegFlag) { + if (comm->isOneRPN) info->nMaxChannels = 1; + info->regBufType |= NCCL_NET_REG_BUFFER; + } + } + + if (info->regBufType == NCCL_IPC_REG_BUFFER && comm->nNodes == 1 && 16 < info->nMaxChannels && info->nMaxChannels <= 24) { + info->nMaxChannels = 16; + } + } +exit: +#endif + return result; +} diff --git a/src/register/register.cc b/src/register/register.cc new file mode 100644 index 000000000..9e8f6eaaf --- /dev/null +++ b/src/register/register.cc @@ -0,0 +1,179 @@ 
+/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "argcheck.h" // Need some checks here since we access comm +#include "nccl.h" +#include "comm.h" +#include "net.h" +#include "register.h" +#include "transport.h" + +ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) { + struct ncclRegCache* cache = &comm->regCache; + uintptr_t pageSize = cache->pageSize; + uintptr_t addr = (uintptr_t)data & -pageSize; + size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; + + *reg = NULL; + for (int slot=0; /*true*/; slot++) { + if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess; + if ((addr >= cache->slots[slot]->addr) && + ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { + *reg = cache->slots[slot]; + return ncclSuccess; + } + } +} +NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); + +ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) { + if (reg && isValid) { + if (reg->localRefs) + *isValid = true; + else + *isValid = false; + } + return ncclSuccess; +} + +ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool isGraph, void** handle) { + NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); + struct ncclRegCache* cache = &comm->regCache; + uintptr_t pageSize = cache->pageSize; + uintptr_t addr = (uintptr_t)data & -pageSize; + size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; + + if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(data, comm, "buff", "ncclCommRegister")); + INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size); + + for (int slot=0; /*true*/; slot++) { + if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) { + if (cache->population == cache->capacity) { // must grow cache + cache->capacity = cache->capacity < 32 ? 
32 : 2*cache->capacity; + NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity)); + } + memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*)); + NCCLCHECK(ncclCalloc(cache->slots+slot, 1)); + struct ncclReg* regSlot = cache->slots[slot]; + regSlot->addr = addr; + regSlot->pages = pages; + if (isGraph) regSlot->graphRefs = 1; + else regSlot->localRefs = 1; + cache->population += 1; + *handle = regSlot; + goto exit; + } else if ((addr >= cache->slots[slot]->addr) && + ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { + if (isGraph) cache->slots[slot]->graphRefs++; + else cache->slots[slot]->localRefs++; + *handle = cache->slots[slot]; + goto exit; + } + } + +exit: + return ncclSuccess; +} + +static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) { + if (reg->state & NET_REG_COMPLETE) { + struct ncclRegNetHandles* netHandle = reg->netHandleHead; + struct ncclRegNetHandles* netHandlePrev; + while(netHandle) { + if (ncclNetDeregBuffer(comm, netHandle->proxyConn, netHandle->handle) != ncclSuccess) { + WARN("rank %d deregister NET buffer handle %p proxy rank %d failed\n", comm->rank, netHandle->handle, netHandle->proxyConn->rank); + } + netHandlePrev = netHandle; + netHandle = netHandle->next; + free(netHandlePrev); + } + } + if (reg->state & NVLS_REG_COMPLETE) { + if (ncclNvlsDeregBuffer(comm, ®->mcHandle, reg->regAddr, reg->dev, reg->regSize) != ncclSuccess) { + WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize); + } + reg->regAddr = (CUdeviceptr)NULL; + } + if (reg->state & COLLNET_REG_COMPLETE) { + if (ncclCollnetDeregBuffer(comm, reg->collnetProxyconn, reg->collnetHandle) != ncclSuccess) { + WARN("rank %d deregister COLLNET buffer handle %p proxy rank %d failed", comm->rank, reg->collnetHandle, reg->collnetProxyconn->rank); + } + } + if (reg->state & IPC_REG_COMPLETE) { + for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; ++i) + if (reg->ipcInfos[i]) { + if (ncclIpcDeregBuffer(comm, reg->ipcInfos[i]) != ncclSuccess) { + WARN("rank %d deregister IPC buffer %p peerRank %d failed", comm->rank, reg->ipcInfos[i]->baseAddr, reg->ipcInfos[i]->peerRank); + } + free(reg->ipcInfos[i]); + } + if (reg->regIpcAddrs.hostPeerRmtAddrs) free(reg->regIpcAddrs.hostPeerRmtAddrs); + if (reg->regIpcAddrs.devPeerRmtAddrs) NCCLCHECK(ncclCudaFree(reg->regIpcAddrs.devPeerRmtAddrs)); + } + return ncclSuccess; +} + +ncclResult_t ncclRegCleanup(struct ncclComm* comm) { + struct ncclRegCache* cache = &comm->regCache; + for (int i = 0; i < cache->population; i++) { + struct ncclReg* reg = cache->slots[i]; + INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->addr, reg->pages); + NCCLCHECK(regCleanup(comm, reg)); + free(reg); + } + free(cache->slots); + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle); +ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { + if (!ncclParamLocalRegister()) + *handle = NULL; + else + NCCLCHECK(ncclRegister(comm, buff, size, false, handle)); + return ncclSuccess; +} + +ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { + NCCLCHECK(ncclRegister(comm, buff, size, true, handle)); + return ncclSuccess; +} + +static ncclResult_t commDeregister(struct ncclComm *comm, bool isGraph, struct ncclReg* reg) { + NCCLCHECK(CommCheck(comm, "ncclCommRegister", 
"comm")); + struct ncclRegCache* cache = &comm->regCache; + int slot; + int saveDev; + if (reg == NULL) goto exit; + CUDACHECK(cudaGetDevice(&saveDev)); + CUDACHECK(cudaSetDevice(comm->cudaDev)); + for (slot = 0; slot < cache->population && cache->slots[slot] != reg; slot++); + if (slot == cache->population) { + WARN("Deregister: Could not find handle"); + return ncclInvalidUsage; + } + if (isGraph) --reg->graphRefs; + else --reg->localRefs; + if (reg->localRefs || reg->graphRefs) return ncclSuccess; + NCCLCHECK(regCleanup(comm, reg)); + free(reg); + memmove(cache->slots + slot, cache->slots + slot + 1, (cache->population - slot - 1) * sizeof(struct ncclReg*)); + cache->population -= 1; + CUDACHECK(cudaSetDevice(saveDev)); +exit: + return ncclSuccess; +} + +NCCL_API(ncclResult_t, ncclCommDeregister, const ncclComm_t comm, void* handle); +ncclResult_t ncclCommDeregister(const ncclComm_t comm, void *handle) { + NCCLCHECK(commDeregister(comm, false, (struct ncclReg*)handle)); + return ncclSuccess; +} + +ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle) { + NCCLCHECK(commDeregister(comm, true, handle)); + return ncclSuccess; +} diff --git a/src/register/sendrecv_reg.cc b/src/register/sendrecv_reg.cc new file mode 100644 index 000000000..f82fbd714 --- /dev/null +++ b/src/register/sendrecv_reg.cc @@ -0,0 +1,35 @@ +#include "register.h" +#include "transport.h" + +ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue* cleanupQueue) { + ncclResult_t ret = ncclSuccess; + + *regFlag = 0; + if (comm->netDeviceType != NCCL_NET_DEVICE_UNPACK) { + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclNetGraphRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle, cleanupQueue, NULL); + } + if (*regFlag == 0 && ncclParamLocalRegister()) { + ncclNetLocalRegisterBuffer(comm, userbuff, size, &conn, 1, regFlag, handle); + } + } + return ret; +} + +ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, size_t size, int peerRank, int* regFlag, void** regAddr, struct ncclIntruQueue* cleanupQueue) { + ncclResult_t ret = ncclSuccess; + uintptr_t offset = 0; + uintptr_t* peerRmtAddrs = NULL; + + *regFlag = 0; + if (comm->planner.persistent && ncclParamGraphRegister()) { + ncclIpcGraphRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs, reinterpret_cast(cleanupQueue), NULL); + } + if (*regFlag == 0 && ncclParamLocalRegister()) { + ncclIpcLocalRegisterBuffer(comm, userbuff, size, &peerRank, 1, NCCL_IPC_SENDRECV, regFlag, &offset, &peerRmtAddrs); + } + + if (*regFlag) + *regAddr = (void*)((uintptr_t)peerRmtAddrs + offset); + return ret; +} diff --git a/src/transport.cc b/src/transport.cc index eeee7a24b..5629ce7a2 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -94,13 +94,13 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p } *intraNodeP2pSupport = supportFlag; *directMode = directFlag; + if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type intraNodeP2pSupport %d directMode %d", supportFlag, directFlag); return ncclSuccess; } -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType/*=NULL*/) { +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) { // Stream used during transport setup; need for P2P pre-connect + CUDA 
Graph ncclResult_t ret = ncclSuccess; - int highestType = TRANSPORT_UNDEFINED; // track highest transport type struct ncclConnect** data; // Store intermediate send/recvData structs for connect struct ncclConnect** recvData = NULL; // Points to entries inside data for given recv connection within a channel struct ncclConnect** sendData = NULL; // Points to entries inside data for given send connection within a channel @@ -131,7 +131,10 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* // The next M entries contain sendData, connection information for send connections // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections int p = i-(done+1); - if (recvMask || sendMask) NCCLCHECKGOTO(ncclCalloc(data+p, 2*MAXCHANNELS), ret, fail); + if (recvMask || sendMask) { + if (data[p] == NULL) NCCLCHECKGOTO(ncclCalloc(data + p, 2 * MAXCHANNELS), ret, fail); + else memset(data[p], 0, 2 * MAXCHANNELS * sizeof(struct ncclConnect)); + } recvData[p] = data[p]; int sendChannels = 0, recvChannels = 0; int type; @@ -139,7 +142,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* for (int c=0; c(comm, graph, recvData[p]+recvChannels++, c, recvPeer, connIndex, &type), ret, fail); - if (type > highestType) highestType = type; } } TIME_STOP(0); @@ -148,7 +150,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* for (int c=0; c(comm, graph, sendData[p]+sendChannels++, c, sendPeer, connIndex, &type), ret, fail); - if (type > highestType) highestType = type; } } TIME_STOP(1); @@ -222,22 +223,18 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* } TIME_STOP(4); } - if (sendMask || recvMask) { - free(data[p]); - data[p] = NULL; - } } - if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) { + if (ncclParamReportConnectProgress() && comm->rank == 0 && done > 0) { struct timeval now; gettimeofday(&now, NULL); - if (((now.tv_sec - timeLast.tv_sec)*1.0 + (now.tv_usec-timeLast.tv_usec)*1e-6) > 1) { - float elapsed = (now.tv_sec - timeStart.tv_sec)*1.0 + (now.tv_usec-timeStart.tv_usec)*1e-6; - float remaining = elapsed*(comm->nRanks-done)/done; + if (((now.tv_sec - timeLast.tv_sec) * 1.0 + (now.tv_usec - timeLast.tv_usec) * 1e-6) > 1) { + float elapsed = (now.tv_sec - timeStart.tv_sec) * 1.0 + (now.tv_usec - timeStart.tv_usec) * 1e-6; + float remaining = elapsed * (comm->nRanks - done) / done; printf("%sP2p connect: %g%% Elapsed %d:%02d Remaining %d:%02d ", - timeReported ? "\r" : "", done*100.0/comm->nRanks, ((int)elapsed)/60, ((int)elapsed)%60, ((int)remaining)/60, ((int)remaining)%60); + timeReported ? "\r" : "", done * 100.0 / comm->nRanks, ((int)elapsed) / 60, ((int)elapsed) % 60, ((int)remaining) / 60, ((int)remaining) % 60); fflush(stdout); timeReported = true; - timeLast = now; // struct copy; + timeLast = now; // struct copy; } } } @@ -280,7 +277,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0UL; } - if (highestTransportType != NULL) *highestTransportType = highestType; TIME_PRINT("P2P Setup/Connect"); exit: for(int i=0; irank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? 
NCCL_DIRECT_NIC : 0; send->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; @@ -177,10 +179,10 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, -1, &netId, &req.netDev, &proxyRank)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr)); recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); @@ -319,6 +321,13 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc connection->collNet = req->collNet; /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + /* collective size limits*/ + resources->maxCollBytes = props.maxCollBytes; + if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) { + WARN("sendProxySetup: collnet plugin returned invalid value for maxCollBytes %ld \ + [allowed range: %ld - %ld] \n", resources->maxCollBytes, 0L, NCCL_MAX_NET_SIZE_BYTES); + return ncclInternalError; + } return ncclSuccess; } @@ -430,6 +439,12 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc connection->collNet = req->collNet; /* DMA-BUF support */ resources->useDmaBuf = resources->useGdr && proxyState->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF); + resources->maxCollBytes = props.maxCollBytes; + if((resources->maxCollBytes <= 0) || (resources->maxCollBytes > NCCL_MAX_NET_SIZE_BYTES)) { + WARN("recvProxySetup: collnet plugin returned invalid value for maxCollBytes %ld \ + [allowed range: %ld - %ld] \n", resources->maxCollBytes, 0L, NCCL_MAX_NET_SIZE_BYTES); + return ncclInternalError; + } collNetHandle_t* netHandle = (collNetHandle_t*) respBuff; if (respSize != sizeof(collNetHandle_t)) return ncclInternalError; @@ -645,14 +660,14 @@ static size_t calcAlgoOffset(struct ncclProxyArgs* args, int isAllNotOne, int su return offset; } -static int calcRegionOffset( +static ssize_t calcRegionOffset( struct ncclProxyArgs* args, int isRecvNotSend, int sub, uint64_t step, int side // 0=begin, 1=end ) { struct ncclCollNetSharedRes* collNet = args->subs[0].connection->collNet; - int slotSize = collNet->buffSize/NCCL_STEPS; - int chunkSize = args->chunkSize; - int base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS); + ssize_t slotSize = collNet->buffSize/NCCL_STEPS; + ssize_t chunkSize = args->chunkSize; + ssize_t base = isRecvNotSend*NCCL_STEPS + (step%NCCL_STEPS); base *= collNet->nChannels*slotSize; if (args->coll == ncclFuncAllReduce) { return base + (sub+side)*chunkSize; @@ -674,6 +689,165 @@ static constexpr int calcStepsPerGroup(int nGroups) { return NCCL_STEPS; } +static ncclResult_t collNetRegIallreduce(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, int groupStart, ssize_t *nBytesInOut, void **request) { + ssize_t loopSize, winOffset, nBytes; + ssize_t eltSize =
ncclTypeSize((ncclDataType_t)args->dtype); + // for UB iallreduce 1RPN case, user's send and recv buffers are both directly accessed by collnet network. + // we can just issue maximal collnet bytes by resources->maxCollBytes for each iallreduce. + // for multi-RPN case, we have to consider pipeline, so each time we only send groupSize * chunkSize (i.e., nBytesInOut) + // sub->loopOffset is data offset to the buffer for this head rank in each loop + // winOffset is used to find actual offset from send and recv buffer for this iallreduce + // loopSize is all bytes sent by all channels and head ranks in each loop. + // send and recv mem handle are retrieved from sub in which user buffer mem handles are stored. + if (sub->isOneRPN) { + winOffset = 0; + nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); + loopSize = nBytes; + } else { + winOffset = sub->loopOffset + groupStart * args->chunkSize; + nBytes = std::min(sub->nbytes - winOffset, *nBytesInOut); + loopSize = sub->loopSize; + } + + if (nBytes > 0) { + NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff + winOffset, sub->recvbuff + winOffset, nBytes / eltSize, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, request)); + if (*request) { + // if issued successfully, we need to move the pointer forward and reduce the existing nbytes. + sub->nbytes -= loopSize; + sub->sendbuff += loopSize; + sub->recvbuff += loopSize; + TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] registered Iallreduce posted sendbuff %p recvbuff %p size %ld loopSize %ld winOffset %ld isOneRPN %d req %p", (long)sub->transmitted, sub->nsteps, groupStart, sub->sendbuff, sub->recvbuff, nBytes, loopSize, winOffset, sub->isOneRPN, *request); + } + } + *nBytesInOut = nBytes; + return ncclSuccess; +} + +static ncclResult_t collNetIallreduce(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t sendBeg, ssize_t recvBeg, void **request) { + void *sendMhandle = resources->sendMhandles[NCCL_PROTO_SIMPLE]; + void *recvMhandle = resources->recvMhandles[NCCL_PROTO_SIMPLE]; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + ssize_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype); + // non-UB iallreduce, region is intermediate buffer and sendBeg/recvBeg is the corresponding offset + // for send and recv data. The send and recv mem handle are retrieved from resources. + NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, nBytes / eltSize, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, request)); + if (*request) + TRACE(NCCL_NET, "sendProxy [%ld/%d] Iallreduce posted size %ld sendBeg %ld recvBeg %ld req %p", (long)sub->transmitted, sub->nsteps, nBytes, sendBeg, recvBeg, *request); + return ncclSuccess; +} + +static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t recvBeg, void *recvMhandle, void **request) { + ncclNetSGE_v9_t recvParts; + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + ssize_t nBytes; + ssize_t winOffset; + void *sendbuff; + // UB iallgather 1RPN logic is the same as iallreduce. 
+ // If iallgather is not 1RPN, we can let the collnet network directly access sendbuff but not recvbuff; + // the main reason is that the non-1RPN case will cause non-contiguous recv data from the network, so + // we have to use intermediate buffer "region" to recv data and copy into the recvbuff. + // so allBeg and recvMhandle, which are the global window offset of the recv buffer and the mem handle for region, + // are only used in the multi-RPN case. + if (sub->isOneRPN) { + nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); + winOffset = sub->offset; + recvParts.mhandle = sub->recvMhandle; + recvParts.address = sub->recvbuff; + } else { + nBytes = nBytesIn; + winOffset = allBeg; + recvParts.mhandle = recvMhandle; + recvParts.address = region + recvBeg; + } + recvParts.size = nBytes; + if (winOffset / sizePerRank == args->specifics.collnetDirect.node) { + sendbuff = sub->sendbuff + winOffset % sizePerRank; + } else { + sendbuff = sub->sendbuff; + } + NCCLCHECK(proxyState->ncclCollNet->iallgather(resources->collNetComm, sendbuff, 1, &recvParts, sizePerRank, winOffset, nBytes, sub->sendMhandle, request)); + if (*request) { + if (sub->isOneRPN) { + sub->recvbuff += nBytes; + sub->nbytes -= nBytes; + sub->offset += nBytes; + } + TRACE(NCCL_NET, "sendProxy [%ld/%d] registered Iallgather posted sizePerRank %ld winOffset %ld recvSize %ld isOneRPN %d request %p", sub->transmitted, sub->nsteps, sizePerRank, winOffset, nBytes, sub->isOneRPN, *request); + } + return ncclSuccess; +} + +static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { + ncclNetSGE_v9_t recvParts; + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + recvParts.mhandle = recvMhandle; + recvParts.address = region + recvBeg; + recvParts.size = nBytes; + // non-UB iallgather, we use intermediate region buffers for both send and recv data. + // sendMhandle and recvMhandle are send and recv mem handles for region, and allBeg is + // the global window offset of the recv buffer. sendBeg and recvBeg are offsets into the region + // for intermediate data. + NCCLCHECK(proxyState->ncclCollNet->iallgather(resources->collNetComm, region + sendBeg, 1, &recvParts, sizePerRank, allBeg, nBytes, sendMhandle, request)); + if (*request) + TRACE(NCCL_NET, "sendProxy [%ld/%d] Iallgather posted sizePerRank %ld winOffset %ld recvSize %ld request %p", sub->transmitted, sub->nsteps, sizePerRank, allBeg, nBytes, *request); + return ncclSuccess; +} + +static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t sendBeg, void *sendMhandle, void **request) { + ncclNetSGE_v9_t sendParts; + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + ssize_t nBytes; + size_t winOffset; + void *recvbuff; + // Similar to iallgather, if ireducescatter is not 1RPN, we can let the collnet network + // directly access recvbuff but not sendbuff. We use intermediate buffer "region" to + // send data and directly recv into the recvbuff.
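+ // (For example, in the 1RPN case the user sendbuff is posted directly in chunks of up to + // resources->maxCollBytes and sub->sendbuff/sub->nbytes/sub->offset advance after each successful post; + // in the multi-RPN case only the nBytesIn bytes staged in region at sendBeg are posted per call.)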
+ if (sub->isOneRPN) { + nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); + winOffset = sub->offset; + sendParts.mhandle = sub->sendMhandle; + sendParts.address = sub->sendbuff; + } else { + nBytes = nBytesIn; + winOffset = allBeg; + sendParts.mhandle = sendMhandle; + sendParts.address = region + sendBeg; + } + sendParts.size = nBytes; + if (winOffset / sizePerRank == args->specifics.collnetDirect.node) { + recvbuff = sub->recvbuff + winOffset % sizePerRank; + } else { + recvbuff = sub->recvbuff; + } + NCCLCHECK(proxyState->ncclCollNet->ireducescatter(resources->collNetComm, 1, &sendParts, recvbuff, sizePerRank, winOffset, nBytes, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->recvMhandle, request)); + if (*request) { + if (sub->isOneRPN) { + sub->sendbuff += nBytes; + sub->nbytes -= nBytes; + sub->offset += nBytes; + } + TRACE(NCCL_NET, "sendProxy [%ld/%d] registered Ireducescatter posted sizePerRank %ld winOffset %ld sendSize %ld isOneRPN %d request %p", sub->transmitted, sub->nsteps, sizePerRank, winOffset, nBytes, sub->isOneRPN, *request); + } + return ncclSuccess; +} + +static ncclResult_t collNetIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { + ncclNetSGE_v9_t sendParts; + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); + sendParts.mhandle = sendMhandle; + sendParts.address = region + sendBeg; + sendParts.size = nBytes; + // non-UB ireducescatter is the same as non-UB iallgather but in the reverse direction. 
+ NCCLCHECK(proxyState->ncclCollNet->ireducescatter(resources->collNetComm, 1, &sendParts, region + recvBeg, sizePerRank, allBeg, nBytes, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, recvMhandle, request)); + if (*request) + TRACE(NCCL_NET, "sendProxy [%ld/%d] Ireducescatter posted sizePerRank %ld winOffset %ld sendSize %ld request %p", sub->transmitted, sub->nsteps, sizePerRank, allBeg, nBytes, *request); + return ncclSuccess; +} + static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { @@ -683,6 +857,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->transmitted = sub->done = 0; resources->step = sub->base + sub->nsteps; + //adjust nsteps for registerd buffers as device signals a single step + if (sub->reg && sub->isOneRPN) sub->nsteps = DIVUP((size_t)sub->nbytes, resources->maxCollBytes); } args->state = ncclProxyOpProgress; } @@ -695,28 +871,30 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct struct sendResources* resources = (struct sendResources*) (sub->connection->transportResources); void* sendMhandle = resources->sendMhandles[p]; void* recvMhandle = resources->recvMhandles[p]; - char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[p]); auto reqFifo = resources->reqFifo; int group = s/COLLNET_GROUP_NSUBS; int groupStart = s - (s%COLLNET_GROUP_NSUBS); if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; - if (sub->reg == 0) { + if (sub->reg == 0 || (!sub->isOneRPN && args->coll == ncclFuncReduceScatter)) { resources->recvMem->connFifo[buffSlot].offset = calcRegionOffset(args, 0, s, sub->posted, 0); __sync_synchronize(); } volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; - TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS)); + TRACE(NCCL_NET, "sendProxy [%ld/%d/%d/%d] posted offset %d @ %p signal %ld->%ld", long(sub->posted), group, buffSlot, sub->nsteps, resources->recvMem->connFifo[buffSlot].offset, &resources->recvMem->connFifo[buffSlot].offset, long(*sendHead), long(sub->base + sub->posted + args->sliceSteps - NCCL_STEPS)); sub->posted += args->sliceSteps; - *sendHead = sub->base + sub->posted - NCCL_STEPS; + // Only post one credit for registered buffer + if (sub->reg == 0 || !sub->isOneRPN || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } if (sub->received < sub->posted && sub->received < sub->done + calcStepsPerGroup(nGroups)) { int buffSlot = (sub->base+sub->received)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; volatile uint64_t* recvTail = &resources->recvMem->tail; - if ((connFifo[buffSlot].size != -1 || sub->reg) && ((*recvTail > (sub->base+sub->received)))) { + //device progresses tail by only 1 for registered buffers + uint64_t tail = sub->base + (sub->reg && sub->isOneRPN ? 
0 : sub->received); + if ((connFifo[buffSlot].size != -1 || sub->reg) && (*recvTail > tail)) { if (args->coll != ncclFuncAllReduce && sub->reg == 0) { int sendBeg = calcRegionOffset(args, 0, s, sub->received, 0); int sendEnd = calcRegionOffset(args, 0, s, sub->received, 1); @@ -738,110 +916,42 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; if (!reqFifo[group][buffSlot].turnIsSendNotRecv) continue; - ssize_t sizePerRank = 0; - size_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted); - size_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted); - int sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0); - int sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1); - int recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0); - int recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1); + ssize_t allBeg = calcAlgoOffset(args, 1, groupStart, sub->transmitted); + ssize_t allEnd = calcAlgoOffset(args, 1, s+1, sub->transmitted); + ssize_t sendBeg = calcRegionOffset(args, 0, groupStart, sub->transmitted, 0); + ssize_t sendEnd = calcRegionOffset(args, 0, s, sub->transmitted, 1); + ssize_t recvBeg = calcRegionOffset(args, 1, groupStart, sub->transmitted, 0); + ssize_t recvEnd = calcRegionOffset(args, 1, s, sub->transmitted, 1); reqFifo[group][buffSlot].size = recvEnd - recvBeg; - size_t eltSize = ncclTypeSize((ncclDataType_t)args->dtype); - if (sendBeg==sendEnd && recvBeg==recvEnd && sub->reg == 0) { + if (sendBeg==sendEnd && recvBeg==recvEnd) { sub->requests[buffSlot] = nullptr; // trivally finished request } else { + ssize_t nBytes = 0; if (args->coll == ncclFuncAllReduce) { + nBytes = sendEnd - sendBeg; if (sub->reg) { - size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); - int count = (int)(nBytes / eltSize); - NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, sub->sendbuff, sub->recvbuff, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sub->sendMhandle, sub->recvMhandle, sub->requests + buffSlot)); - if (sub->requests[buffSlot]) { - sub->nbytes -= nBytes; - sub->sendbuff += nBytes; - sub->recvbuff += nBytes; - } + NCCLCHECK(collNetRegIallreduce(proxyState, resources, args, sub, groupStart, &nBytes, &sub->requests[buffSlot])); } else { - int count = (sendEnd - sendBeg) / eltSize; - NCCLCHECK(proxyState->ncclCollNet->iallreduce(resources->collNetComm, region + sendBeg, region + recvBeg, count, (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, sendMhandle, recvMhandle, sub->requests + buffSlot)); + NCCLCHECK(collNetIallreduce(proxyState, resources, args, sub, nBytes, sendBeg, recvBeg, &sub->requests[buffSlot])); } - } else { - sizePerRank = args->specifics.collnetDirect.sizePerRank; - if (args->coll == ncclFuncAllGather) { - ncclNetSGE_v8_t recvParts; - if (sub->reg) { - size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); - void *sendbuff; - recvParts.mhandle = sub->recvMhandle; - recvParts.address = sub->recvbuff; - recvParts.size = nBytes; - if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) { - sendbuff = sub->sendbuff + sub->offset % sizePerRank; - } else { - sendbuff = sub->sendbuff; - } - NCCLCHECK(proxyState->ncclCollNet->iallgather( - resources->collNetComm, sendbuff, 1, &recvParts, - sizePerRank, sub->offset, nBytes, - sub->sendMhandle, sub->requests + buffSlot)); - if (sub->requests[buffSlot]) { - sub->recvbuff += nBytes; - sub->nbytes -= nBytes; - sub->offset += 
nBytes; - } - } else { - recvParts.mhandle = recvMhandle; - recvParts.address = region + recvBeg; - recvParts.size = allEnd - allBeg; - NCCLCHECK(proxyState->ncclCollNet->iallgather( - resources->collNetComm, region + sendBeg, 1, &recvParts, - sizePerRank, allBeg, allEnd - allBeg, - sendMhandle, sub->requests + buffSlot)); - } + } else if (args->coll == ncclFuncAllGather) { + nBytes = allEnd - allBeg; + if (sub->reg) { + NCCLCHECK(collNetRegIallgather(proxyState, resources, args, sub, nBytes, allBeg, recvBeg, recvMhandle, &sub->requests[buffSlot])); } else { - ncclNetSGE_v8_t sendParts; - if (sub->reg) { - size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); - void *recvbuff; - sendParts.mhandle = sub->sendMhandle; - sendParts.address = sub->sendbuff; - sendParts.size = nBytes; - if (sub->offset / sizePerRank == args->specifics.collnetDirect.node) { - recvbuff = sub->recvbuff + sub->offset % sizePerRank; - } else { - recvbuff = sub->recvbuff; - } - NCCLCHECK(proxyState->ncclCollNet->ireducescatter( - resources->collNetComm, 1, &sendParts, recvbuff, - sizePerRank, sub->offset, nBytes, - (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, - sub->recvMhandle, sub->requests + buffSlot)); - if (sub->requests[buffSlot]) { - sub->sendbuff += nBytes; - sub->nbytes -= nBytes; - sub->offset += nBytes; - } - } else { - sendParts.mhandle = sendMhandle; - sendParts.address = region + sendBeg; - sendParts.size = allEnd - allBeg; - NCCLCHECK(proxyState->ncclCollNet->ireducescatter( - resources->collNetComm, 1, &sendParts, region + recvBeg, - sizePerRank, allBeg, allEnd - allBeg, - (ncclDataType_t)args->dtype, (ncclRedOp_t)args->redOp, - recvMhandle, sub->requests + buffSlot)); - } + NCCLCHECK(collNetIallgather(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, recvBeg, sendMhandle, recvMhandle, &sub->requests[buffSlot])); } - } - if (sub->requests[buffSlot] == nullptr) continue; - - if (args->coll == ncclFuncAllReduce) { - TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallreduce posted, size %d req %p", (long)sub->transmitted, group, buffSlot, int(sendEnd-sendBeg), sub->requests[buffSlot]); - } else if (args->coll == ncclFuncAllGather) { - TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Iallgather posted sendSize=%ld recvOffset=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(sizePerRank), long(allBeg), long(allEnd-allBeg), sub->requests[buffSlot]); } else { - TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Ireducescatter posted sendOffset=%ld sendSize=%ld recvSize=%ld request=%p", (long)sub->transmitted, group, buffSlot, long(allBeg), long(allEnd-allBeg), long(sizePerRank), sub->requests[buffSlot]); + // reducescatter + nBytes = allEnd - allBeg; + if (sub->reg) { + NCCLCHECK(collNetRegIreducescatter(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, sendMhandle, &sub->requests[buffSlot])); + } else { + NCCLCHECK(collNetIreducescatter(proxyState, resources, args, sub, nBytes, allBeg, sendBeg, recvBeg, sendMhandle, recvMhandle, &sub->requests[buffSlot])); + } } + if (nBytes > 0 && sub->requests[buffSlot] == nullptr) continue; } } sub->transmitted += args->sliceSteps; @@ -875,6 +985,52 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct return ncclSuccess; } +static ncclResult_t collNetRecvFlush(struct ncclProxyState* proxyState, struct recvResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, int groupStart, ssize_t nBytesIn, ssize_t recvBeg, void **request) { + char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, 
gpu, buffs[NCCL_PROTO_SIMPLE]); + if (sub->reg && (sub->isOneRPN || args->coll != ncclFuncAllGather)) { + ssize_t nBytes, loopSize; + ssize_t offset = sub->offset + groupStart * args->chunkSize; + if (sub->isOneRPN) { + nBytes = std::min((size_t)sub->nbytes, resources->maxCollBytes); + loopSize = nBytes; + } else { + nBytes = std::min(sub->nbytes - sub->loopOffset, nBytesIn); + loopSize = sub->loopSize; + } + if (nBytes > 0) { + if (args->coll == ncclFuncReduceScatter) { + ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; + ssize_t groupStartOffset = sub->offset + groupStart * args->chunkSize; + ssize_t groupEndOffset = groupStartOffset + nBytes; + int node = args->specifics.collnetDirect.node; + int startNode = groupStartOffset / sizePerRank; + int lastNode = groupEndOffset / sizePerRank; + if (startNode == node) { + offset = groupStartOffset % sizePerRank; + nBytes = std::min(sizePerRank - offset, nBytes); + } else if (startNode < node && node < lastNode) { + offset = 0; + nBytes = sizePerRank; + } else if (node == lastNode) { + offset = 0; + nBytes = groupEndOffset % sizePerRank; + } else { + // dummy flush + offset = 0; + } + } + NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset + sub->loopOffset, nBytes, sub->recvMhandle, request)); + if (*request) { + sub->nbytes -= loopSize; + sub->offset += loopSize; + } + } + } else { + NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region + recvBeg, nBytesIn, resources->mhandles[NCCL_PROTO_SIMPLE], request)); + } + return ncclSuccess; +} + static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { @@ -884,22 +1040,21 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct sub->base = ROUNDUP(resources->step, args->chunkSteps); sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0; resources->step = sub->base + sub->nsteps; + //adjust nsteps for registerd buffers as device signals a single step + if (sub->reg && sub->isOneRPN) sub->nsteps = DIVUP((size_t)sub->nbytes, resources->maxCollBytes); memset(sub->requests, 0, sizeof(sub->requests)); } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { - int p = NCCL_PROTO_SIMPLE; int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); for (int s=0; snsubs; s++) { int group = s/COLLNET_GROUP_NSUBS; int groupStart = s - (s%COLLNET_GROUP_NSUBS); struct ncclProxySubArgs* sub = args->subs+s; struct recvResources* resources = (struct recvResources*) (sub->connection->transportResources); - void* mhandle = resources->mhandles[p]; auto reqFifo = resources->reqFifo; - char* region = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); // Enforce sync between operations of the same group. 
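// --- Illustrative sketch (editor addition, not part of the patch) ---
// Standalone version of the per-node window arithmetic that collNetRecvFlush
// above applies when flushing a registered ReduceScatter buffer: a global byte
// window [winBeg, winBeg+nBytes) is intersected with this node's slice of
// sizePerRank bytes. Struct and function names here are hypothetical.
#include <algorithm>
#include <cstdio>

struct FlushWindow { long offset; long nBytes; };   // offset into this node's slice

static FlushWindow flushWindowForNode(long winBeg, long nBytes, long sizePerRank, int node) {
  long winEnd = winBeg + nBytes;
  int startNode = (int)(winBeg / sizePerRank);
  int lastNode  = (int)(winEnd / sizePerRank);
  if (startNode == node) {                           // window starts inside our slice
    long off = winBeg % sizePerRank;
    return { off, std::min(sizePerRank - off, nBytes) };
  } else if (startNode < node && node < lastNode) {  // our slice is fully covered
    return { 0, sizePerRank };
  } else if (node == lastNode) {                     // window ends inside our slice
    return { 0, winEnd % sizePerRank };
  }
  return { 0, 0 };                                   // nothing of this window lands here
}

int main() {
  // e.g. 1 MiB per rank, flushing the second MiB of the collective on node 1
  FlushWindow w = flushWindowForNode(1L << 20, 1L << 20, 1L << 20, 1);
  printf("offset %ld nBytes %ld\n", w.offset, w.nBytes);
  return 0;
}
// --- end of sketch ---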
if (LAST_OF_GROUP(args, s) && (sub->posted < sub->done + calcStepsPerGroup(nGroups)) && (sub->posted < sub->nsteps)) { @@ -913,10 +1068,10 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (LAST_OF_GROUP(args, s) && (sub->received < sub->posted)) { int buffSlot = (sub->base+sub->received)%NCCL_STEPS; if (!reqFifo[group][buffSlot].turnIsSendNotRecv) { // Buffer is cleared : coll is complete - int recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0); - int recvEnd = calcRegionOffset(args, 1, s, sub->received, 1); - int totalSize = recvEnd - recvBeg; - TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %d chunkSize=%d", (long)sub->received, group, buffSlot, totalSize, args->chunkSize); + ssize_t recvBeg = calcRegionOffset(args, 1, groupStart, sub->received, 0); + ssize_t recvEnd = calcRegionOffset(args, 1, s, sub->received, 1); + ssize_t totalSize = recvEnd - recvBeg; + TRACE(NCCL_NET, "recvProxy [%ld/%d/%d] received, size %ld chunkSize=%ld", (long)sub->received, group, buffSlot, totalSize, args->chunkSize); sub->received += args->sliceSteps; if ((reqFifo[group][buffSlot].size > 0 || sub->reg) && resources->useGdr && resources->needFlush) { // GDRCOPY support @@ -929,37 +1084,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct return ncclInternalError; #endif } else { - if (sub->reg) { - size_t nBytes = std::min(sub->nbytes, NCCL_MAX_COLLNET_SIZE); - size_t offset = 0; - if (args->coll == ncclFuncReduceScatter) { - size_t sizePerRank = args->specifics.collnetDirect.sizePerRank; - int node = args->specifics.collnetDirect.node; - int startNode = sub->offset / sizePerRank; - int lastNode = (sub->offset + nBytes) / sizePerRank; - if (startNode == node) { - offset = sub->offset % sizePerRank; - nBytes = std::min(sizePerRank - offset, nBytes); - } else if (startNode < node && node < lastNode) { - nBytes = sizePerRank; - } else if (node == lastNode) { - nBytes = (sub->offset + nBytes) % sizePerRank; - } else { - // no need to flush - nBytes = 0; - } - } - NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, sub->recvbuff + offset, nBytes, sub->recvMhandle, sub->requests+buffSlot)); - if (sub->requests[buffSlot]) { - sub->nbytes -= nBytes; - sub->offset += nBytes; - if (args->coll == ncclFuncAllGather || args->coll == ncclFuncAllReduce) { - sub->recvbuff += nBytes; - } - } - } else { - NCCLCHECK(proxyState->ncclCollNet->iflush(resources->collNetComm, region+recvBeg, totalSize, mhandle, sub->requests+buffSlot)); - } + NCCLCHECK(collNetRecvFlush(proxyState, resources, args, sub, groupStart, totalSize, recvBeg, &sub->requests[buffSlot])); } } args->idle = 0; @@ -980,14 +1105,19 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } } if (sub->transmitted < sub->flushed) { - if (sub->reg == 0) { + if (sub->reg == 0 || (!sub->isOneRPN && args->coll == ncclFuncAllGather)) { int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; connFifo[buffSlot].offset = calcRegionOffset(args, 1, s, sub->transmitted, 0); __sync_synchronize(); } volatile uint64_t* recvTail = resources->gdcSync ? resources->gdcSync : &resources->recvMem->tail; - *recvTail = sub->base + sub->flushed; + if (sub->reg && sub->isOneRPN) { + // We may have bumped net steps, but reg operations only have a single step w.r.t. the GPU. 
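// --- Illustrative sketch (editor addition, not part of the patch) ---
// Step accounting for registered, one-rank-per-node buffers as adjusted above:
// the proxy may need several network transfers of at most maxCollBytes each,
// but the device posted a single step, so the GPU-visible tail is advanced only
// once, after the last network step has flushed. Helper names are hypothetical.
#include <cstddef>

static inline size_t divup(size_t a, size_t b) { return (a + b - 1) / b; }

// Number of network steps a registered buffer expands into.
static size_t netStepsForRegBuffer(size_t nbytes, size_t maxCollBytes) {
  return divup(nbytes, maxCollBytes);
}

// Whether the GPU-visible tail may be advanced after 'flushed' network steps.
static bool mayBumpGpuTail(bool reg, bool oneRankPerNode, size_t flushed, size_t netSteps) {
  if (!(reg && oneRankPerNode)) return true;  // non-registered path signals every step
  return flushed == netSteps;                 // registered path signals once, at the end
}
// --- end of sketch ---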
+ if (sub->flushed == sub->nsteps) *recvTail = sub->base + args->sliceSteps; + } else { + *recvTail = sub->base + sub->flushed; + } if (resources->gdcSync) wc_store_fence(); // Flush out WC write sub->transmitted += args->sliceSteps; args->idle = 0; @@ -999,7 +1129,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct bool groupSync = s==0 ? args->subs[args->nsubs-1].done == sub->done : (sub-1)->done > sub->done; volatile uint64_t* sendHead = &resources->sendMem->head; - if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) { + int done = sub->reg && sub->isOneRPN ? 0 : sub->done; + if (groupSync && sub->done < sub->transmitted && sub->base + done < *sendHead) { sub->done += args->sliceSteps; args->idle = 0; if (sub->done == sub->nsteps && s == args->nsubs-1) { @@ -1017,24 +1148,22 @@ struct collnetRegInfo { size_t size; }; -ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) { +static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { ncclResult_t ret = ncclSuccess; - struct ncclReg *regRecord = NULL; + if (regRecord) { + if (regRecord->state & COLLNET_REG_COMPLETE) { + // reuse previous registration + *outRegBufFlag = 2; + *outHandle = regRecord->collnetHandle; + INFO(NCCL_REG, "rank %d - COLLNET reuse register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, regRecord->collnetHandle, buffSize, type == collNetRecv ? "Recv" : "Send"); + goto exit; + } else { + /* start register collnet buffer */ + struct collnetRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize }; + void* handle = NULL; + struct ncclConnInfo* conn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].conn : &comm->channels[0].peers[comm->nRanks]->send[type].conn; - *outRegBufFlag = 0; - *outHandle = NULL; - if (comm && userbuff && buffSize > 0) { - NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); - if (regRecord) { - if (regRecord->state & COLLNET_REG_COMPLETE) { - // reuse previous registration - *outRegBufFlag = 2; - *outHandle = regRecord->collnetHandle; - goto exit; - } else { - /* start register collnet buffer */ - struct collnetRegInfo info = {regRecord->addr, regRecord->pages * comm->regCache.pageSize}; - void* handle = NULL; + if (conn->flags & NCCL_DIRECT_NIC) { struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); if (handle) { @@ -1042,11 +1171,13 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u regRecord->collnetProxyconn = proxyconn; *outHandle = regRecord->collnetHandle = handle; *outRegBufFlag = 1; + INFO(NCCL_REG, "rank %d - COLLNET register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); } + } else { + WARN("rank %d - COLLNET failed to register userbuff %p (handle %p), buffSize %ld, type %s, GDR is not enabled", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? 
"Recv" : "Send"); } } } - exit: return ret; fail: @@ -1055,44 +1186,63 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u goto exit; } +ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle) { + ncclResult_t ret = ncclSuccess; + struct ncclReg *regRecord = NULL; + bool isValid = false; + + *outRegBufFlag = 0; + *outHandle = NULL; + if (comm && userbuff && buffSize > 0) { + NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); + NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); + if (isValid) + NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); + } +exit: + return ret; +fail: + *outRegBufFlag = 0; + goto exit; +} + struct ncclCollnetCleanupCallback { struct ncclCommCallback base; - struct ncclProxyConnector* proxyConn; - void* buffer; - size_t size; - void* mhandle; + struct ncclComm *comm; + struct ncclReg *reg; }; static ncclResult_t cleanupCollnet(struct ncclComm* comm, struct ncclCommCallback* cb) { struct ncclCollnetCleanupCallback* obj = (struct ncclCollnetCleanupCallback*)cb; - NCCLCHECK(ncclCollnetDeregBuffer(comm, obj->proxyConn, obj->mhandle)); - INFO(NCCL_REG, "rank %d - deregistered collnet buffer handle %p, size %ld, buff %p", comm->rank, obj->mhandle, obj->size, obj->buffer); + NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); free(obj); return ncclSuccess; } ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts) { ncclResult_t ret = ncclSuccess; - void* handle = NULL; - struct ncclRegCache* cache = &comm->regCache; - uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)userbuff & -pageSize; - size_t size = DIVUP((uintptr_t)userbuff - addr + buffSize, pageSize) * pageSize; - collnetRegInfo info = {addr, size}; struct ncclCollnetCleanupCallback* record = NULL; - struct ncclProxyConnector* proxyConn = (type == collNetRecv) ? 
&comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; + struct ncclReg *regRecord = NULL; + void *baseSend = NULL; + size_t baseSendSize = 0; *outRegBufFlag = 0; - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); - record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback)); - record->base.fn = cleanupCollnet; - record->proxyConn = proxyConn; - record->buffer = (void*)userbuff; - record->size = buffSize; - *outHandle = record->mhandle = handle; - *outRegBufFlag = 1; - ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record); - *nCleanupQueueElts += 1; + if (comm && userbuff && buffSize > 0) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); + NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)®Record), ret, fail); + NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); + + if (*outRegBufFlag) { + record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback)); + record->base.fn = cleanupCollnet; + record->comm = comm; + record->reg = regRecord; + ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record); + *nCleanupQueueElts += 1; + } else { + NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail); + } + } exit: return ret; @@ -1104,6 +1254,7 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u ncclResult_t ncclCollnetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyconn, void* handle) { NCCLCHECK(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0)); + INFO(NCCL_REG, "rank %d - COLLNET deregistered buffer handle %p", comm->rank, handle); return ncclSuccess; } @@ -1111,26 +1262,67 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s void* handle; struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff; struct sendResources* resources = (struct sendResources*)(connection->transportResources); + ncclResult_t ret = ncclSuccess; + bool needReg = true; assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); - if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL; + +#if CUDART_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useGdr && resources->useDmaBuf) { + int dmabuf_fd; + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + (void)close(dmabuf_fd); + needReg = false; + } +#endif +peermem: + if (needReg) { + NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); + } + +exit: memcpy(respBuff, (void*)&handle, sizeof(void*)); *done = 1; return ncclSuccess; +fail: + handle = NULL; + goto exit; } static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) 
{ void* handle; struct collnetRegInfo* info = (struct collnetRegInfo*)reqBuff; struct recvResources* resources = (struct recvResources*)(connection->transportResources); + ncclResult_t ret = ncclSuccess; + bool needReg = true; assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); - if (proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle) != ncclSuccess) handle = NULL; + #if CUDART_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useGdr && resources->useDmaBuf) { + int dmabuf_fd; + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + (void)close(dmabuf_fd); + needReg = false; + } +#endif +peermem: + if (needReg) { + NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); + } + +exit: memcpy(respBuff, (void*)&handle, sizeof(void*)); *done = 1; return ncclSuccess; +fail: + handle = NULL; + goto exit; } static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { @@ -1155,13 +1347,6 @@ static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, return ncclSuccess; } -struct ncclTransport collNetTransport = { - "COL", - canConnect, - { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, - { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } -}; - ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; char line[1024]; @@ -1197,7 +1382,6 @@ ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; - int highestTransportType0 = TRANSPORT_UNDEFINED, highestTransportType1 = TRANSPORT_UNDEFINED; if (comm->collNetSupport == 0) goto exit; @@ -1206,13 +1390,13 @@ ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { struct ncclChannel* channelRecv = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collnetDirect.down, 0), ret, fail); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0, &highestTransportType0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 0), ret, fail); for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channelSend = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.down, NCCL_MAX_DIRECT_ARITY, channelSend->collnetDirect.up, 1), ret, fail); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1, &highestTransportType1), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_COLLNET_DIRECT], 1), ret, fail); INFO(NCCL_INIT, "rank %d Connected CollNet", comm->rank); @@ -1410,3 +1594,10 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop 
comm->collNetSupport = 0; goto exit; } + +struct ncclTransport collNetTransport = { + "COL", + canConnect, + { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, + { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } +}; \ No newline at end of file diff --git a/src/transport/generic.cc b/src/transport/generic.cc index 7fd7e59fb..47b023667 100644 --- a/src/transport/generic.cc +++ b/src/transport/generic.cc @@ -1,17 +1,37 @@ #include "comm.h" #include "transport.h" +#include "bootstrap.h" ncclResult_t ncclTransportRingConnect(struct ncclComm* comm) { + struct ringConnInfo { + bool useNetPXN; + bool useGdr; + }; + struct ringConnInfo* ringInfo = NULL; ncclResult_t ret = ncclSuccess; if (comm && comm->nRanks > 1) { + comm->useGdr = true; + comm->useNetPXN = false; for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); } NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &comm->graphs[NCCL_ALGO_RING], 0), ret, fail); - INFO(NCCL_INIT, "Connected all rings"); + if (ncclParamLocalRegister() || ncclParamGraphRegister()) { + NCCLCHECK(ncclCalloc(&ringInfo, comm->nRanks)); + ringInfo[comm->rank].useGdr = comm->useGdr; + ringInfo[comm->rank].useNetPXN = comm->useNetPXN; + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, ringInfo, sizeof(struct ringConnInfo)), ret, fail); + for (int i = 0; i < comm->nRanks; ++i) { + if (!ringInfo[i].useGdr) comm->useGdr = false; + if (ringInfo[i].useNetPXN) comm->useNetPXN = true; + if (comm->useGdr == false && comm->useNetPXN == true) break; + } + } + INFO(NCCL_INIT, "Connected all rings, use ring PXN %d GDR %d", comm->useNetPXN, comm->useGdr); } exit: + free(ringInfo); return ret; fail: goto exit; diff --git a/src/transport/net.cc b/src/transport/net.cc index 00eca607d..8760b4258 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -15,6 +15,7 @@ #include "profiler.h" #include "transport.h" #include "shm.h" +#include static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); @@ -107,6 +108,7 @@ struct sendNetResources { int netDeviceVersion; ncclNetDeviceType netDeviceType; ncclNetDeviceHandle_t* netDeviceHandle; + size_t maxP2pBytes; }; struct recvNetResources { @@ -139,6 +141,12 @@ struct recvNetResources { int netDeviceVersion; ncclNetDeviceType netDeviceType; ncclNetDeviceHandle_t* netDeviceHandle; + size_t maxP2pBytes; +}; + +struct netRegInfo { + uintptr_t buffer; + size_t size; }; /* Determine if two peers can communicate with NET */ @@ -166,6 +174,9 @@ struct setupReq { int connIndex; }; +NCCL_PARAM(NetOptionalRecvCompletion, "NET_OPTIONAL_RECV_COMPLETION", 1); + +static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large enough ncclConnect to hold ncclNetHandle_t and useGdr flag"); // Forward declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); @@ -181,8 +192,10 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, peerInfo->rank, &netId, &req.netDev, &proxyRank)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 1, &req.useGdr)); + 
NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 1, &req.useGdr)); send->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; + if (!req.useGdr && connIndex == 0) comm->useGdr = 0; + if (proxyRank != myInfo->rank && connIndex == 0) comm->useNetPXN = true; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 1, proxyRank, &send->proxyConn)); req.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; @@ -198,6 +211,7 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); } *((int*)connectInfo) = comm->topParentRanks[proxyRank]; + memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); return ncclSuccess; } @@ -218,10 +232,12 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph int proxyRank; int64_t netId; NCCLCHECK(ncclTopoGetNetDev(comm, myInfo->rank, graph, channelId, myInfo->rank, &netId, &req.netDev, &proxyRank)); - NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netId, 0, &req.useGdr)); + NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr)); + recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; + if (!req.useGdr && connIndex == 0) comm->useGdr = 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm->topo, myInfo->busId, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); // We don't support PXN on receive yet NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn)); @@ -230,6 +246,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.tpRank = comm->topParentRanks[myInfo->rank]; req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); + memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, req.useGdr ? "/GDRDMA" : "", req.shared ? 
"/Shared" : ""); return ncclSuccess; @@ -283,8 +300,11 @@ struct netRecvConnectArgs { static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { struct connectMap* map = (connectMap*) send->transportResources; - void* opId; + int recvUseGdr; + + memcpy(&recvUseGdr, (uint8_t*)connectInfo + sizeof(ncclNetHandle_t), sizeof(int)); + if (!recvUseGdr) send->conn.flags &= ~NCCL_DIRECT_NIC; // map isn't allocated thus this op hasn't been submitted yet if (!map) { @@ -391,6 +411,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* recv) { struct connectMap* map = (connectMap*) recv->transportResources; void* opId; + int sendUseGdr; + + memcpy(&sendUseGdr, (uint8_t*)connectInfo + sizeof(ncclNetHandle_t), sizeof(int)); + if (!sendUseGdr) recv->conn.flags &= ~NCCL_DIRECT_NIC; + if (!map) { NCCLCHECK(ncclCalloc(&map, 1)); recv->transportResources = map; @@ -522,7 +547,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int return ncclSuccess; } -static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, int* size) { +static ncclResult_t sharedBuffersGet(struct ncclProxyState* proxyState, int channel, int slot, int* offset, size_t* size) { // Use different pools for different channels and also separate send/recv. int globalSlot = (channel*NCCL_SHARED_STEPS)+slot; *offset = proxyState->p2pChunkSize * globalSlot; @@ -590,6 +615,13 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; + /* point-to-point size limits*/ + resources->maxP2pBytes = props.maxP2pBytes; + if((resources->maxP2pBytes <= 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) { + WARN("sendProxySetup: net plugin returned invalid value for maxP2pBytes %ld \ + [allowed range: %ld - %ld] \n", resources->maxP2pBytes, 0L, NCCL_MAX_NET_SIZE_BYTES); + return ncclInternalError; + } // We don't return any data if (respSize != 0) return ncclInternalError; @@ -621,6 +653,13 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc resources->maxRecvs = props.maxRecvs; resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; + /* point-to-point size limits*/ + resources->maxP2pBytes = props.maxP2pBytes; + if((resources->maxP2pBytes <= 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) { + WARN("recvProxySetup: net plugin returned invalid value for maxP2pBytes %ld \ + [allowed range: %ld - %ld] \n", resources->maxP2pBytes, 0L, NCCL_MAX_NET_SIZE_BYTES); + return ncclInternalError; + } if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); @@ -916,6 +955,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str resources->sendMem = (struct ncclSendMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, sendMem); resources->recvMem = (struct ncclRecvMem*) NCCL_NET_MAP_GET_POINTER(map, cpu, recvMem); + for (int i = 0; i < NCCL_STEPS; i++) resources->recvMem->connFifo[i].size = -1; for (int p=0; pbuffers[p] = NCCL_NET_MAP_GET_POINTER(map, cpu, buffs[p]); if (resources->buffers[p]) { @@ -1032,7 +1072,6 @@ static 
ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct } static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); -#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { if (args->state == ncclProxyOpReady) { @@ -1045,11 +1084,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct resources->step = sub->base + sub->nsteps; sub->posted = sub->transmitted = sub->done = 0; ncclProfilerStartSendProxyOpEvent(s, args); - if (sub->reg && sub->nbytes > 0) { - NCCLCHECK(proxyState->ncclNet->regMr(resources->netSendComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle)); - } else { - sub->mhandle = resources->mhandles[args->protocol]; - } + if (!sub->reg) + sub->sendMhandle = resources->mhandles[args->protocol]; } args->state = ncclProxyOpProgress; } @@ -1059,6 +1095,9 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct int maxDepth = std::min(NCCL_STEPS, NCCL_SHARED_STEPS/args->nsubs); for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; + int postedStepId = sub->posted; + int transmittedStepId = sub->transmitted; + int doneStepId = sub->done; if (sub->done == sub->nsteps) continue; struct sendNetResources* resources = (struct sendNetResources*) (sub->connection->transportResources); volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; @@ -1066,7 +1105,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); // Post buffers to the GPU if (sub->posted < sub->nsteps && sub->posted < sub->done + maxDepth) { - ncclProfilerStartSendProxyStepEvents(s, args, sub->posted, sub->posted+args->sliceSteps); + ncclProfilerStartSendProxyStepEvent(s, args, postedStepId); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; if (resources->shared) { if (!sub->reg) { @@ -1078,12 +1117,13 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct } volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; sub->posted += args->sliceSteps; - // Only post one credit for registered buffer - if (sub->reg == 0 || sub->posted == args->sliceSteps) *sendHead = sub->base + sub->posted - NCCL_STEPS; + *sendHead = sub->base + sub->posted - NCCL_STEPS; if (resources->gdcSync) wc_store_fence(); // Flush out WC write - } else sub->posted += args->sliceSteps; + } else { + sub->posted += args->sliceSteps; + } ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted); - ncclProfilerRecordProxyStepEventStates(s, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepSendGPUWait); + ncclProfilerRecordProxyStepEventState(s, args, postedStepId, ncclProfilerProxyStepSendGPUWait); args->idle = 0; continue; } @@ -1091,10 +1131,10 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct if (sub->transmitted < sub->posted && sub->transmitted < sub->done + NCCL_STEPS) { int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; volatile uint64_t* recvTail = &resources->recvMem->tail; - uint64_t tail = sub->base + (sub->reg ? 
0 : sub->transmitted); - if ((sub->reg || connFifo[buffSlot].size != -1) && ((*recvTail > tail) || p == NCCL_PROTO_LL)) { + uint64_t tail = sub->base + sub->transmitted; + if (connFifo[buffSlot].size != -1 && (*recvTail > tail || p == NCCL_PROTO_LL)) { // We have something to receive, let's check if it's completely ready. - int size = sub->reg ? std::min(MAX_NET_SIZE, sub->nbytes) : connFifo[buffSlot].size; + int size = connFifo[buffSlot].size; bool shared = (p == NCCL_PROTO_SIMPLE) && resources->shared; char* buff = shared ? localBuff+connFifo[buffSlot].offset : localBuff+buffSlot*stepSize; int ready = 1; @@ -1120,22 +1160,28 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct volatile uint32_t *f2 = &lines[i].flag2; if (f1[0] != flag || f2[0] != flag) { ready = 0; break; } } - } else if (p == NCCL_PROTO_SIMPLE && resources->shared) { - buff = sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset; + } else if (p == NCCL_PROTO_SIMPLE) { + if (resources->shared) { + buff = sub->reg ? (char*)sub->sendbuff + sub->transmitted * NCCL_MAX_NET_SIZE : localBuff + resources->recvMem->connFifo[buffSlot].offset; + } else if (sub->reg) { + size_t sendSize; + sub->ringAlgo->getNextSendAddr(sub->transmitted, (uint8_t**)&buff, &sendSize, &sub->sendMhandle); + assert(sendSize == size); + } } if (ready) { - ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted + args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait); + ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted+args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait); // Data is ready, try to send. // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense, // since size is a plain integer. // coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->mhandle, sub->requests+buffSlot)); + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "sendProxy [%ld/%d] Isend posted, req %p, size %d, proto %d, myRank %d, channelId %d", sub->transmitted, buffSlot, sub->requests[buffSlot], size, p, proxyState->tpRank, sub->channelId); + TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); + sub->transSize += size; sub->transmitted += args->sliceSteps; ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); - ncclProfilerRecordProxyStepEventStates(s, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepSendWait); - sub->transSize += size; + ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait); args->idle = 0; continue; } @@ -1149,41 +1195,24 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct int buffSlot = (sub->base+sub->done)%NCCL_STEPS; NCCLCHECK(proxyState->ncclNet->test(sub->requests[buffSlot], &done, &size)); if (done) { - if (sub->reg) { - if (size < sub->nbytes) { - sub->recvbuff += size; - sub->nbytes -= size; - // Do one more step (at least) - sub->nsteps++; - } else { - // Signal the GPU the send is complete and it can return. 
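// --- Illustrative sketch (editor addition, not part of the patch) ---
// How the proxies step through a registered user buffer in fixed-size chunks:
// the send path above and the receive path below place step i at
// i * NCCL_MAX_NET_SIZE into the buffer, clamping the final chunk to what is
// left. stepChunk and maxNetSize are hypothetical stand-ins.
#include <algorithm>
#include <cstddef>
#include <cstdint>

static void stepChunk(uint8_t* userBuff, size_t nbytes, size_t maxNetSize,
                      uint64_t step, uint8_t** ptr, size_t* size) {
  size_t off = (size_t)step * maxNetSize;      // caller only posts steps with off < nbytes
  *ptr  = userBuff + off;                      // advance into the user buffer
  *size = std::min(maxNetSize, nbytes - off);  // last chunk may be short
}
// --- end of sketch ---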
- connFifo[sub->base%NCCL_STEPS].size = -1; - } - } // Make sure size is reset to -1 before we update the head. - if (sub->reg == 0) connFifo[buffSlot].size = -1; + connFifo[buffSlot].size = -1; __sync_synchronize(); - TRACE(NCCL_NET, "sendProxy [%ld/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); + TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done", sub->done, buffSlot, sub->nsteps, sub->requests[buffSlot]); sub->done += args->sliceSteps; - ncclProfilerStopProxyStepEvents(s, args, sub->done-args->sliceSteps, sub->done); + ncclProfilerStopProxyStepEvent(s, args, doneStepId); ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone); if (resources->shared == 0) { volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; - if (sub->reg) { - // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU. - if (sub->done == sub->nsteps) *sendHead = sub->base + args->sliceSteps; - } else { - *sendHead = sub->base + sub->done; - } + *sendHead = sub->base + sub->done; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } args->idle = 0; if (sub->done == sub->nsteps) { - if (sub->reg && sub->nbytes > 0) { - NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, sub->mhandle)); - } args->done++; + if (sub->ringAlgo && sub->ringAlgo->decRefCount() == 0) delete sub->ringAlgo; + sub->ringAlgo = NULL; } } } @@ -1232,14 +1261,11 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct // Set step base for next op resources->step = sub->base + sub->nsteps; sub->posted = sub->received = sub->transmitted = sub->done = 0; + sub->regBufferReady = 0; for (int i=0; ireg && sub->nbytes > 0) { - // Register buffer - NCCLCHECK(proxyState->ncclNet->regMr(resources->netRecvComm, sub->recvbuff, sub->nbytes, NCCL_PTR_CUDA, &sub->mhandle)); - } else { - sub->mhandle = resources->mhandles[args->protocol]; - } + if (!sub->reg) + sub->recvMhandle = resources->mhandles[args->protocol]; } args->state = ncclProxyOpProgress; } @@ -1251,32 +1277,44 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* subGroup = args->subs+s; int subCount = 0; void* ptrs[NCCL_PROXY_MAX_SUBS]; - int sizes[NCCL_PROXY_MAX_SUBS]; + size_t sizes[NCCL_PROXY_MAX_SUBS]; int tags[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; + int postedStepId = sub->posted; if (sub->posted < sub->nsteps) { if (sub->posted >= sub->done + maxDepth) { subCount = 0; break; } - ncclProfilerStartRecvProxyStepEvents(s+i, args, sub->posted, sub->posted+args->sliceSteps); + ncclProfilerStartRecvProxyStepEvent(s+i, args, postedStepId); struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); - if (sub->reg) maxDepth = 1; int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; - if (p == NCCL_PROTO_SIMPLE && resources->shared) { - if (sub->reg) { - // Wait until CUDA kernel has started before we access the user buffer directly. 
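// --- Illustrative sketch (editor addition, not part of the patch) ---
// Ordering used above when a send completes: the connFifo slot is marked free
// (size = -1) and a full barrier is issued before the head counter polled by
// the GPU is advanced, so the consumer never sees the new head with stale slot
// state. C++ atomics stand in for the volatile accesses and __sync_synchronize()
// of the real code; all names here are hypothetical.
#include <atomic>
#include <cstdint>

struct SlotFifo {
  std::atomic<int> size[8];      // -1 means "slot free"
  std::atomic<uint64_t> head;    // consumer polls this
};

static void completeStep(SlotFifo& fifo, int slot, uint64_t base, uint64_t done) {
  fifo.size[slot].store(-1, std::memory_order_relaxed);  // release the slot
  std::atomic_thread_fence(std::memory_order_release);   // slot release visible before head
  fifo.head.store(base + done, std::memory_order_relaxed);
}
// --- end of sketch ---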
- if (connFifo[sub->base%NCCL_STEPS].size == -1) continue; - ptrs[subCount] = sub->recvbuff; - sizes[subCount] = std::min(MAX_NET_SIZE, sub->nbytes); + if (p == NCCL_PROTO_SIMPLE) { + if (resources->shared) { + if (sub->reg) { + // Wait until CUDA kernel has started before we access the user buffer directly. + if (!sub->regBufferReady && connFifo[sub->base % NCCL_STEPS].size == -1) continue; + sub->regBufferReady = 1; + ptrs[subCount] = sub->recvbuff + sub->posted * NCCL_MAX_NET_SIZE; + sizes[subCount] = std::min(NCCL_MAX_NET_SIZE, (ssize_t)(sub->nbytes - sub->posted * NCCL_MAX_NET_SIZE)); + } else { + int sharedBuffSlot = sub->posted % maxDepth; + int offset; + NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot * args->nsubs + s + i, &offset, sizes + subCount)); + connFifo[buffSlot].offset = offset; + ptrs[subCount] = localBuff + offset; + } } else { - int sharedBuffSlot = sub->posted%maxDepth; - int offset; - NCCLCHECK(sharedBuffersGet(proxyState, sub->channelId, sharedBuffSlot*args->nsubs+s+i, &offset, sizes+subCount)); - connFifo[buffSlot].offset = offset; - ptrs[subCount] = localBuff+offset; + if (sub->reg) { + if (!sub->regBufferReady && connFifo[sub->base % NCCL_STEPS].size == -1) continue; + sub->regBufferReady = 1; + sub->ringAlgo->getNextRecvAddr(sub->posted, (uint8_t**)&ptrs[subCount], &sizes[subCount], &sub->recvMhandle); + } else { + ptrs[subCount] = localBuff + buffSlot * stepSize; + sizes[subCount] = stepSize * args->sliceSteps; + } } } else { ptrs[subCount] = localBuff+buffSlot*stepSize; @@ -1284,7 +1322,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct } if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; - mhandles[subCount] = sub->mhandle; + mhandles[subCount] = sub->recvMhandle; subCount++; } } @@ -1292,15 +1330,19 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct uint64_t step = subGroup->posted; struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); + bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); + if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); if (*requestPtr) { subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; subGroup->recvRequestsSubCount = subCount; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup+i; + int postedStepId = sub->posted; + TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]); sub->posted += args->sliceSteps; ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); - ncclProfilerRecordProxyStepEventStates(s+i, args, sub->posted-args->sliceSteps, sub->posted, ncclProfilerProxyStepRecvWait); + ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait); } args->idle = 0; } @@ -1321,31 +1363,18 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (done) { int needFlush = 0; int totalSize = 0; - int subIndex = 0; 
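// --- Illustrative sketch (editor addition, not part of the patch) ---
// The "optional receive completion" gate used above: LL and LL128 embed validity
// flags in the payload, so the receiver can detect arrival without a
// network-level completion, and the proxy may tell the plugin to skip generating
// one. The enum and helper below are illustration-only names.
enum Proto { PROTO_LL, PROTO_LL128, PROTO_SIMPLE };

static bool maySkipRecvCompletion(bool optionalCompletionEnabled, Proto proto, int recvCount) {
  // Safe only when the payload carries its own flags and the request covers a
  // single receive.
  return optionalCompletionEnabled && (proto == PROTO_LL || proto == PROTO_LL128) && recvCount == 1;
}
// --- end of sketch ---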
for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; - if (sub->received < sub->nsteps) { - int size = sizes[subIndex++]; - if (sub->reg) { - if (size < sub->nbytes) { - sub->recvbuff += size; - sub->nbytes -= size; - // Do one more step (at least) - sub->nsteps++; - } else { - // Reset connFifo size indicating the GPU was ready to receive. - // There is a __sync_synchronize() later to ensure it is reset before it is set again by the GPU. - struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); - volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; - connFifo[sub->base%NCCL_STEPS].size = -1; - } - } - } - sub->received += args->sliceSteps; + int receivedStepId = sub->received; + int buffSlot = (sub->base + sub->received) % NCCL_STEPS; + struct recvNetResources* resources = (struct recvNetResources*)(sub->connection->transportResources); + volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; + connFifo[buffSlot].size = -1; sub->transSize += sizes[i]; + sub->received += args->sliceSteps; ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived); - ncclProfilerRecordProxyStepEventStates(s+i, args, sub->received-args->sliceSteps, sub->received, ncclProfilerProxyStepRecvFlushWait); + ncclProfilerRecordProxyStepEventState(s+i, args, receivedStepId, ncclProfilerProxyStepRecvFlushWait); if (step < sub->nsteps) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); if (resources->useGdr) needFlush |= resources->needFlush; @@ -1372,10 +1401,16 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int stepSize = resources->buffSizes[p] / NCCL_STEPS; char* localBuff = NCCL_NET_MAP_GET_POINTER(&resources->map, cpu, buffs[p]); int buffSlot = (sub->base+sub->received-args->sliceSteps)%NCCL_STEPS; - ptrs[subCount] = resources->shared ? - (sub->reg ? (char*)sub->recvbuff : localBuff+resources->recvMem->connFifo[buffSlot].offset) : - localBuff+buffSlot*stepSize; - mhandles[subCount] = sub->mhandle; + if (resources->shared) { + ptrs[subCount] = sub->reg ? (char*)sub->recvbuff + step * NCCL_MAX_NET_SIZE : localBuff + resources->recvMem->connFifo[buffSlot].offset; + } else { + if (sub->reg) { + sub->ringAlgo->getNextRecvAddr(step, (uint8_t**)&ptrs[subCount], NULL, &sub->recvMhandle); + } else { + ptrs[subCount] = localBuff + buffSlot * stepSize; + } + } + mhandles[subCount] = sub->recvMhandle; subCount++; } } @@ -1399,19 +1434,16 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (done) { for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; + int transmittedStepId = sub->transmitted; sub->transmitted += args->sliceSteps; ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted); - ncclProfilerRecordProxyStepEventStates(s+i, args, sub->transmitted-args->sliceSteps, sub->transmitted, ncclProfilerProxyStepRecvGPUWait); + ncclProfilerRecordProxyStepEventState(s+i, args, transmittedStepId, ncclProfilerProxyStepRecvGPUWait); if (step < sub->nsteps) { __sync_synchronize(); struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); volatile uint64_t* recvTail = resources->gdcSync ? 
resources->gdcSync : &resources->recvMem->tail; - if (sub->reg) { - // We may have added more net steps, but reg operations only have a single step w.r.t. the GPU. - if (sub->transmitted == sub->nsteps) *recvTail = sub->base + args->sliceSteps; - } else - *recvTail = sub->base + sub->transmitted; + *recvTail = sub->base + sub->transmitted; if (resources->gdcSync) wc_store_fence(); // Flush out WC write } } @@ -1425,11 +1457,12 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* subGroup = args->subs+s; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; + int doneStepId = sub->done; if (sub->done == sub->nsteps) continue; if (sub->transmitted > sub->done) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); volatile uint64_t* sendHead = &resources->sendMem->head; - uint64_t done = sub->reg ? sub->base + sub->nsteps : *sendHead; + uint64_t done = *sendHead; while (done > sub->base + sub->done && // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. sub->transmitted > sub->done) { @@ -1440,15 +1473,13 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL; } sub->done += args->sliceSteps; - ncclProfilerStopProxyStepEvents(s+i, args, sub->done-args->sliceSteps, sub->done); + ncclProfilerStopProxyStepEvent(s+i, args, doneStepId); ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone); args->idle = 0; if (sub->done == sub->nsteps) { - struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); - if (sub->reg && sub->nbytes > 0) { - NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, sub->mhandle)); - } args->done++; + if (sub->ringAlgo && sub->ringAlgo->decRefCount() == 0) delete sub->ringAlgo; + sub->ringAlgo = NULL; break; } } @@ -1465,9 +1496,228 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct return ncclSuccess; } +ncclResult_t ncclNetDeregBuffer(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* handle) { + NCCLCHECK(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgDeregister, &handle, sizeof(void*), NULL, 0)); + INFO(NCCL_REG, "rank %d - deregistered net buffer handle %p", comm->rank, handle); + return ncclSuccess; +} + +static ncclResult_t netRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { + ncclResult_t ret = ncclSuccess; + int gdrFlag = 1; + + if (regRecord) { + for (int p = 0; p < nPeers; ++p) { + struct ncclConnector* peerConn = peerConns[p]; + struct ncclProxyConnector* peerProxyConn = NULL; + struct ncclRegNetHandles* netHandle = NULL; + bool found = false; + if (peerConn == NULL) continue; + peerProxyConn = &peerConn->proxyConn; + netHandle = regRecord->netHandleHead; + while (netHandle) { + if (netHandle->proxyConn == peerProxyConn) { + found = true; + break; + } + netHandle = netHandle->next; + } + if (found) { + *outRegBufFlag = 1; + outHandle[p] = netHandle->handle; + INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, netHandle->handle); + } else { + struct netRegInfo info = { 
regRecord->addr, regRecord->pages * comm->regCache.pageSize }; + void* handle = NULL; + + if (peerConn->conn.flags & NCCL_DIRECT_NIC) { + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, peerProxyConn, ncclProxyMsgRegister, &info, sizeof(struct netRegInfo), &handle, sizeof(void*)), ret, fail); + if (handle) { + struct ncclRegNetHandles* netHandle; + regRecord->state |= NET_REG_COMPLETE; + NCCLCHECK(ncclCalloc(&netHandle, 1)); + netHandle->handle = handle; + netHandle->proxyConn = peerProxyConn; + netHandle->next = regRecord->netHandleHead; + regRecord->netHandleHead = netHandle; + outHandle[p] = handle; + *outRegBufFlag = 1; + INFO(NCCL_REG, "rank %d - NET register userbuff %p (handle %p), buffSize %ld", comm->rank, userbuff, handle, buffSize); + } else { + goto fail; + } + } else { + gdrFlag = 0; + goto fail; + } + } + } + } + +exit: + return ret; +fail: + *outRegBufFlag = 0; + WARN("rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); + goto exit; +} + +ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle) { + ncclResult_t ret = ncclSuccess; + struct ncclReg *regRecord = NULL; + bool isValid = false; + + *outRegBufFlag = 0; + if (comm && userbuff && buffSize > 0 && nPeers > 0) { + NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); + NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); + if (isValid) + NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail); + } + +exit: + return ret; +fail: + *outRegBufFlag = 0; + goto exit; +} + +struct ncclNetCleanupCallback { + struct ncclCommCallback base; + struct ncclComm *comm; + struct ncclReg *reg; +}; + +static ncclResult_t cleanupNet(struct ncclComm* comm, struct ncclCommCallback* cb) { + struct ncclNetCleanupCallback* obj = (struct ncclNetCleanupCallback*)cb; + NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); + free(obj); + return ncclSuccess; +} + +ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, struct ncclConnector** peerConns, int nPeers, int* outRegBufFlag, void** outHandle, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts) { + ncclResult_t ret = ncclSuccess; + struct ncclNetCleanupCallback *record = NULL; + struct ncclReg *regRecord = NULL; + void *baseSend; + size_t baseSendSize; + + *outRegBufFlag = 0; + if (comm && userbuff && buffSize > 0 && nPeers > 0) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); + NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)®Record), ret, fail); + NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail); + if (*outRegBufFlag) { + NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail); + record->base.fn = cleanupNet; + record->comm = comm; + record->reg = regRecord; + ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record); + if (nCleanupQueueElts) *nCleanupQueueElts += 1; + } else { + NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail); + } + } +exit: + return ret; +fail: + *outRegBufFlag = 0; + goto exit; +} + +static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, 
int* done) { + void* handle; + struct netRegInfo* info = (struct netRegInfo*)reqBuff; + struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + ncclResult_t ret = ncclSuccess; + bool needReg = true; + + assert(reqSize == sizeof(struct netRegInfo)); + assert(respSize == sizeof(void*)); + +#if CUDART_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useDmaBuf) { + int dmabuf_fd; + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + (void)close(dmabuf_fd); + needReg = false; + } +peermem: +#endif + if (needReg) { + NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); + } + +exit: + memcpy(respBuff, (void*)&handle, sizeof(void*)); + *done = 1; + return ncclSuccess; +fail: + handle = NULL; + goto exit; +} + +static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + void* handle; + struct netRegInfo* info = (struct netRegInfo*)reqBuff; + struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources); + ncclResult_t ret = ncclSuccess; + bool needReg = true; + + assert(reqSize == sizeof(struct netRegInfo)); + assert(respSize == sizeof(void*)); + +#if CUDART_VERSION >= 11070 + /* DMA-BUF support */ + if (resources->useDmaBuf) { + int dmabuf_fd; + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); + (void)close(dmabuf_fd); + needReg = false; + } +peermem: +#endif + if (needReg) { + NCCLCHECKGOTO(proxyState->ncclNet->regMr(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); + } + +exit: + memcpy(respBuff, (void*)&handle, sizeof(void*)); + *done = 1; + return ncclSuccess; +fail: + handle = NULL; + goto exit; +} + +static ncclResult_t sendProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { + void* handle; + struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + + assert(reqSize == sizeof(void*)); + memcpy(&handle, reqBuff, sizeof(void*)); + NCCLCHECK(proxyState->ncclNet->deregMr(resources->netSendComm, handle)); + *done = 1; + return ncclSuccess; +} + +static ncclResult_t recvProxyDeregBuffer(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, int* done) { + void* handle; + struct recvNetResources* resources = (struct recvNetResources*)(connection->transportResources); + + assert(reqSize == sizeof(void*)); + memcpy(&handle, reqBuff, sizeof(void*)); + NCCLCHECK(proxyState->ncclNet->deregMr(resources->netRecvComm, handle)); + *done = 1; + return ncclSuccess; +} + struct ncclTransport netTransport = { "NET", canConnect, - { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, NULL }, - { recvSetup, recvConnect, 
recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, NULL } + { sendSetup, sendConnect, sendFree, proxySharedInit, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, + { recvSetup, recvConnect, recvFree, proxySharedInit, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } }; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index d828c9801..bc54133d3 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -42,14 +42,12 @@ struct ncclIbMrCache { }; static int ncclNMergedIbDevs = -1; -#define NCCL_IB_MAX_DEVS_PER_NIC 2 +#define NCCL_IB_MAX_DEVS_PER_NIC 4 #define MAX_MERGED_DEV_NAME (MAXNAMESIZE*NCCL_IB_MAX_DEVS_PER_NIC)+NCCL_IB_MAX_DEVS_PER_NIC struct alignas(64) ncclIbMergedDev { - int ndevs; - int devs[NCCL_IB_MAX_DEVS_PER_NIC]; // Points to an index in ncclIbDevs + ncclNetVDeviceProps_t vProps; int speed; char devName[MAX_MERGED_DEV_NAME]; // Up to NCCL_IB_MAX_DEVS_PER_NIC * name size, and a character for each '+' - int dmaBufSupported; // 0 = uninit, 1 = yes, -1 = no }; struct ncclIbStats { @@ -69,16 +67,20 @@ struct alignas(64) ncclIbDev { ibv_pd* pd; char devName[MAXNAMESIZE]; char* pciPath; + char* virtualPciPath; int realPort; int maxQp; + float latency; struct ncclIbMrCache mrCache; int ar; // ADAPTIVE_ROUTING struct ibv_port_attr portAttr; struct ncclIbStats stats; + int dmaBufSupported; }; -#define MAX_IB_DEVS 32 -struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_DEVS]; +#define MAX_IB_DEVS 32 +#define MAX_IB_VDEVS MAX_IB_DEVS*8 +struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS]; struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; @@ -95,7 +97,7 @@ NCCL_PARAM(IbTc, "IB_TC", 0); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); -NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", 0); +NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", -1); NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1); NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1); @@ -223,17 +225,17 @@ static void* envIbAddrRange(sa_family_t af, int* mask) { *(maskStrPtr++) = '\0'; if (inet_pton(af, addrStrPtr, ret) == 0) { - WARN("NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6"); + INFO(NCCL_INIT|NCCL_NET, "NET/IB: Ip address '%s' is invalid for family %s, ignoring address", addrStrPtr, (af == AF_INET) ? "AF_INET" : "AF_INET6"); return NULL; } *mask = (int)strtol(maskStrPtr, NULL, 10); if (af == AF_INET && *mask > 32) { - WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6"); + INFO(NCCL_INIT|NCCL_NET, "NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6"); *mask = 0; ret = NULL; } else if (af == AF_INET6 && *mask > 128) { - WARN("NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? "AF_INET" : "AF_INET6"); + INFO(NCCL_INIT|NCCL_NET, "NET/IB: Ip address mask '%d' is invalid for family %s, ignoring mask", *mask, (af == AF_INET) ? 
"AF_INET" : "AF_INET6"); *mask = 0; ret = NULL; } @@ -314,7 +316,7 @@ static bool validGid(union ibv_gid* gid) { static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, int gidIndex, int* version) { char gidRoceVerStr[16] = { 0 }; char roceTypePath[PATH_MAX] = { 0 }; - sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", deviceName, portNum, gidIndex); + snprintf(roceTypePath, sizeof(roceTypePath), "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d", deviceName, portNum, gidIndex); int fd = open(roceTypePath, O_RDONLY); if (fd == -1) { @@ -423,6 +425,16 @@ NCCL_PARAM(IbDisable, "IB_DISABLE", 0); NCCL_PARAM(IbMergeVfs, "IB_MERGE_VFS", 1); NCCL_PARAM(IbMergeNics, "IB_MERGE_NICS", 1); +// Returns 0 if this is the path of two VFs of the same physical device +static int ncclIbMatchVfPath(char* path1, char* path2) { + // Merge multi-port NICs into the same PCI device + if (ncclParamIbMergeVfs()) { + return strncmp(path1, path2, strlen(path1)-4) == 0; + } else { + return strncmp(path1, path2, strlen(path1)-1) == 0; + } +} + static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) { char devicePath[PATH_MAX]; snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName); @@ -430,14 +442,10 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) if (p == NULL) { WARN("Could not find real path of %s (%s)", devName, devicePath); } else { - // Merge multi-port NICs into the same PCI device - p[strlen(p)-1] = '0'; - // Also merge virtual functions (VF) into the same device - if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0'; - // And keep the real port aside (the ibv port is always 1 on recent cards) + // Keep the real port aside (the ibv port is always 1 on recent cards) *realPort = 0; for (int d=0; dndevs > 1) { + WARN("NET/IB : Trying to merge multiple devices together when NCCL_IB_MERGE_NICS=0. Please enable it or disable device merging in NCCL."); + return ncclInvalidUsage; + } + + if (props->ndevs == 0) { + WARN("NET/IB : Can't make virtual NIC with 0 devices"); + return ncclInvalidUsage; + } + + if (ncclNMergedIbDevs == MAX_IB_VDEVS) { + WARN("NET/IB : Cannot allocate any more virtual devices (%d)", MAX_IB_VDEVS); + return ncclInvalidUsage; + } + + // Always count up number of merged devices + ncclIbMergedDev* mDev = ncclIbMergedDevs + ncclNMergedIbDevs; + mDev->vProps.ndevs = 0; + mDev->speed = 0; + + for (int i = 0; i < props->ndevs; i++) { + ncclIbDev* dev = ncclIbDevs + props->devs[i]; + if (mDev->vProps.ndevs == NCCL_IB_MAX_DEVS_PER_NIC) return ncclInvalidUsage; + mDev->vProps.devs[mDev->vProps.ndevs++] = props->devs[i]; + mDev->speed += dev->speed; + // Each successive time, copy the name '+' new name + if (mDev->vProps.ndevs > 1) { + snprintf(mDev->devName + strlen(mDev->devName), sizeof(mDev->devName) - strlen(mDev->devName), "+%s", dev->devName); + // First time, copy the plain name + } else { + strncpy(mDev->devName, dev->devName, MAXNAMESIZE); + } + } + + // Check link layers + ncclIbDev* dev0 = ncclIbDevs + props->devs[0]; + for (int i = 1; i < props->ndevs; i++) { + if (props->devs[i] >= ncclNIbDevs) { + WARN("NET/IB : Cannot use physical device %d, max %d", props->devs[i], ncclNIbDevs); + return ncclInvalidUsage; + } + ncclIbDev* dev = ncclIbDevs + props->devs[i]; + if (dev->link != dev0->link) { + WARN("NET/IB : Trying to merge multiple devices together with different link_layer properties %s -> %d, %s -> %d. 
Try only selecting NICs with one type of link using NCCL_IB_HCA", + dev0->devName, dev0->link, dev->devName, dev->link); + return ncclInvalidUsage; } } - return ncclNMergedIbDevs; + *d = ncclNMergedIbDevs++; + INFO(NCCL_NET, "NET/IB : Made virtual device [%d] name=%s speed=%d ndevs=%d", *d, mDev->devName, mDev->speed, mDev->vProps.ndevs); + return ncclSuccess; +} + +ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { + pthread_mutex_lock(&ncclIbLock); + ncclResult_t res = ncclIbMakeVDeviceInternal(d, props); + pthread_mutex_unlock(&ncclIbLock); + return res; } ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { @@ -531,10 +582,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; } - // Should NCCL merge multi-port devices into one? - int mergeNics; - mergeNics = ncclParamIbMergeNics(); -build_ib_list: for (int d=0; dndevs > 1) { - // Print out merged dev info - snprintf(line+strlen(line), 2047-strlen(line), " [%d]={", d); - for (int i = 0; i < mergedDev->ndevs; i++) { - int ibDev = mergedDev->devs[i]; - snprintf(line+strlen(line), 2047-strlen(line), "[%d] %s:%d/%s%s", ibDev, ncclIbDevs[ibDev].devName, - ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE", - // Insert comma to delineate - i == (mergedDev->ndevs - 1) ? "" : ", "); - } - snprintf(line+strlen(line), 2047-strlen(line), "}"); - } else { - int ibDev = mergedDev->devs[0]; - snprintf(line+strlen(line), 2047-strlen(line), " [%d]%s:%d/%s", ibDev, ncclIbDevs[ibDev].devName, - ncclIbDevs[ibDev].portNum, ncclIbDevs[ibDev].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); - } - } - line[2047] = '\0'; - char addrline[SOCKET_NAME_MAXLEN+1]; - INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "", - ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline)); } + + // Print out all net devices to the user (in the same format as before) + char line[2048]; + line[0] = '\0'; + // Determine whether RELAXED_ORDERING is enabled and possible + ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); + for (int d = 0; d < ncclNIbDevs; d++) { + snprintf(line+strlen(line), sizeof(line)-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName, + ncclIbDevs[d].portNum, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + } + char addrline[SOCKET_NAME_MAXLEN+1]; + INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? 
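
// --- Illustrative sketch (not part of this patch) ---
// How a caller could fuse two physical IB devices into one virtual NIC via
// the makeVDevice entry point introduced above. Field names (ndevs, devs[])
// follow the vProps usage in this patch; the helper name fuseTwoIbDevs() is
// an assumption, and ncclNetVDeviceProps_t/memset are assumed in scope.
static ncclResult_t fuseTwoIbDevs(int physDev0, int physDev1, int* vDev) {
  ncclNetVDeviceProps_t vProps;
  memset(&vProps, 0, sizeof(vProps));
  vProps.ndevs = 2;            // both devices must share the same link_layer
  vProps.devs[0] = physDev0;   // indices into the physical ncclIbDevs[] list
  vProps.devs[1] = physDev1;
  // On success, *vDev is the index of the new merged (virtual) device.
  return ncclIbMakeVDevice(vDev, &vProps);
}
// -----------------------------------------------------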
"[RO]" : "", + ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline)); + pthread_mutex_unlock(&ncclIbLock); } exit: @@ -706,27 +709,25 @@ ncclResult_t ncclIbGdrSupport() { static __thread int ibDmaSupportInitDev; // which device to init, must be thread local static void ibDmaBufSupportInitOnce(){ ncclResult_t res; - // select the appropriate - struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev; - // Test each real devices int dev_fail = 0; - for (int i = 0; i < mergedDev->ndevs; i++) { - int ibDev = mergedDev->devs[i]; - struct ibv_pd* pd; - struct ibv_context* ctx = ncclIbDevs[ibDev].context; - NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); - // Test kernel DMA-BUF support with a dummy call (fd=-1) - (void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/); - // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise) - dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT); - NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); - // stop the search and goto failure - if (dev_fail) goto failure; - } - mergedDev->dmaBufSupported = 1; + + // This is a physical device, not a virtual one, so select from ibDevs + ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev; + ncclIbDev* ibDev = ncclIbDevs + mergedDev->vProps.devs[0]; + struct ibv_pd* pd; + struct ibv_context* ctx = ibDev->context; + NCCLCHECKGOTO(wrap_ibv_alloc_pd(&pd, ctx), res, failure); + // Test kernel DMA-BUF support with a dummy call (fd=-1) + (void)wrap_direct_ibv_reg_dmabuf_mr(pd, 0ULL /*offset*/, 0ULL /*len*/, 0ULL /*iova*/, -1 /*fd*/, 0 /*flags*/); + // ibv_reg_dmabuf_mr() will fail with EOPNOTSUPP/EPROTONOSUPPORT if not supported (EBADF otherwise) + dev_fail |= (errno == EOPNOTSUPP) || (errno == EPROTONOSUPPORT); + NCCLCHECKGOTO(wrap_ibv_dealloc_pd(pd), res, failure); + // stop the search and goto failure + if (dev_fail) goto failure; + ibDev->dmaBufSupported = 1; return; failure: - mergedDev->dmaBufSupported = -1; + ibDev->dmaBufSupported = -1; return; } // Detect whether DMA-BUF support is present in the kernel @@ -741,21 +742,20 @@ ncclResult_t ncclIbDmaBufSupport(int dev) { // init the device only once ibDmaSupportInitDev = dev; pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce); - - int dmaBufSupported = ncclIbMergedDevs[dev].dmaBufSupported; + ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev; + ncclIbDev* ibDev = ncclIbDevs + mergedDev->vProps.devs[0]; + int dmaBufSupported = ibDev->dmaBufSupported; if (dmaBufSupported == 1) return ncclSuccess; return ncclSystemError; } #define NCCL_NET_IB_MAX_RECVS 8 -ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { - struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs+dev; - props->name = mergedDev->devName; - props->speed = mergedDev->speed; - - // Take the rest of the properties from an arbitrary sub-device (should be the same) - struct ncclIbDev* ibDev = ncclIbDevs + mergedDev->devs[0]; +ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) { + struct ncclIbDev* ibDev = ncclIbDevs + dev; + pthread_mutex_lock(&ibDev->lock); + props->name = ibDev->devName; + props->speed = ibDev->speed; props->pciPath = ibDev->pciPath; props->guid = ibDev->guid; props->ptrSupport = NCCL_PTR_HOST; @@ -766,12 +766,29 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { if (ncclIbDmaBufSupport(dev) == ncclSuccess) { props->ptrSupport |= NCCL_PTR_DMABUF; // 
GDR support via DMA-BUF } + props->forceFlush = 0; props->latency = 0; // Not set props->port = ibDev->portNum + ibDev->realPort; props->maxComms = ibDev->maxQp; props->maxRecvs = NCCL_NET_IB_MAX_RECVS; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + pthread_mutex_unlock(&ibDev->lock); + return ncclSuccess; +} + +ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) { + if (dev >= ncclNMergedIbDevs) { + WARN("NET/IB : Requested properties for vNic %d, only %d vNics have been created", dev, ncclNMergedIbDevs); + return ncclInvalidUsage; + } + struct ncclIbMergedDev* mergedDev = ncclIbMergedDevs + dev; + // Take the rest of the properties from an arbitrary sub-device (should be the same) + NCCLCHECK(ncclIbGetPhysProperties(mergedDev->vProps.devs[0], props)); + props->name = mergedDev->devName; + props->speed = mergedDev->speed; + memcpy(&props->vProps, &mergedDev->vProps, sizeof(ncclNetVDeviceProps_t)); return ncclSuccess; } @@ -826,6 +843,8 @@ enum ncclIbCommState { ncclIbCommStateConnecting = 6, ncclIbCommStateConnected = 7, ncclIbCommStatePendingReady = 8, + ncclIbCommStateSendDevList = 9, + ncclIbCommStateRecvDevList = 10, }; struct ncclIbCommStage { @@ -890,12 +909,12 @@ struct ncclIbListenComm { struct ncclIbSendFifo { uint64_t addr; - int size; + uint64_t size; uint32_t rkeys[NCCL_IB_MAX_DEVS_PER_NIC]; uint32_t nreqs; uint32_t tag; uint64_t idx; - char padding[24]; + char padding[16]; }; struct ncclIbQp { @@ -927,7 +946,7 @@ struct ncclIbMrHandle { }; struct alignas(32) ncclIbNetCommBase { - int ndevs; + ncclNetVDeviceProps_t vProps; bool isSend; struct ncclIbRequest reqs[MAX_REQUESTS]; struct ncclIbQp qps[NCCL_IB_MAX_QPS]; @@ -938,6 +957,7 @@ struct alignas(32) ncclIbNetCommBase { int ready; // Track necessary remDevInfo here int nRemDevs; + int nDataQps; struct ncclIbDevInfo remDevs[NCCL_IB_MAX_DEVS_PER_NIC]; // statistics about the comm struct ncclIbStats stats; @@ -981,7 +1001,6 @@ struct ncclIbRemFifo { struct alignas(16) ncclIbRecvCommDev { struct ncclIbNetCommDevBase base; struct ncclIbGpuFlush gpuFlush; - uint32_t fifoRkey; struct ibv_mr* fifoMr; struct ibv_sge fifoSge; struct ibv_mr* sizesFifoMr; @@ -989,7 +1008,7 @@ struct alignas(16) ncclIbRecvCommDev { struct ncclIbRecvComm { struct ncclIbNetCommBase base; - struct ncclIbRecvCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC]; + struct ncclIbRecvCommDev devs[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbRemFifo remFifo; int sizesFifo[MAX_REQUESTS][NCCL_NET_IB_MAX_RECVS]; int gpuFlushHostMem; @@ -1060,10 +1079,12 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, qpAttr.port_num = ib_port; qpAttr.qp_access_flags = access_flags; NCCLCHECK(wrap_ibv_modify_qp(qp->qp, &qpAttr, IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)); + TRACE(NCCL_NET, "NET/IB : ncclIbCreateQp port=%d dev=%d devName=%s ndevs=%d nmdevs=%d qpn=%u pkey=%u pd=%p", + ib_port, base->ibDevN, ncclIbDevs[base->ibDevN].devName, ncclNIbDevs, ncclNMergedIbDevs, qp->qp->qp_num, qpAttr.pkey_index, base->pd); return ncclSuccess; } -ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool override_tc) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = 
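
// --- Illustrative sketch (not part of this patch) ---
// What a caller of ncclIbGetProperties() sees for a fused NIC, assuming two
// 100000 Mb/s physical ports were merged into virtual device vDev (the
// variable vDev, the device names and the concrete numbers are assumptions):
ncclNetProperties_t props;
if (ncclIbGetProperties(vDev, &props) == ncclSuccess) {
  // props.name  -> "mlx5_0+mlx5_1"   (names joined with '+' at merge time)
  // props.speed -> 200000            (sum of the physical port speeds)
  // props.vProps.ndevs == 2; props.vProps.devs[] lists the physical indices
  // everything else (pciPath, guid, ptrSupport, ...) comes from the first
  // physical sub-device via ncclIbGetPhysProperties() above.
}
// -----------------------------------------------------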
IBV_QPS_RTR; @@ -1079,11 +1100,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.flow_label = 0; qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; - if(ncclParamIbFifoTc() && override_tc) { - qpAttr.ah_attr.grh.traffic_class = ncclParamIbFifoTc(); - } else { - qpAttr.ah_attr.grh.traffic_class = ncclParamIbTc(); - } + qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? ncclParamIbFifoTc() : ncclParamIbTc(); } else { //pick lid if subnet prefixs are same, FLID if they are not if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == @@ -1108,6 +1125,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.sl = ncclParamIbSl(); qpAttr.ah_attr.src_path_bits = 0; qpAttr.ah_attr.port_num = info->ib_port; + TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port); NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } @@ -1154,10 +1172,12 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet int ready; *sendComm = NULL; - if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; - if (stage->state == ncclIbCommStateSend) goto ib_send; - if (stage->state == ncclIbCommStateConnecting) goto ib_connect; - if (stage->state == ncclIbCommStateConnected) goto ib_send_ready; + if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; + if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list; + if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list; + if (stage->state == ncclIbCommStateSend) goto ib_send; + if (stage->state == ncclIbCommStateConnecting) goto ib_connect; + if (stage->state == ncclIbCommStateConnected) goto ib_send_ready; if (stage->state != ncclIbCommStateStart) { WARN("Error: trying to connect already connected sendComm"); return ncclInternalError; @@ -1178,21 +1198,51 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // IB Setup struct ncclIbMergedDev* mergedDev; + if (dev >= ncclNMergedIbDevs) { + WARN("NET/IB : Trying to use non-existant virtual device %d", dev); + return ncclInternalError; + } + mergedDev = ncclIbMergedDevs + dev; - comm->base.ndevs = mergedDev->ndevs; - comm->base.nqps = ncclParamIbQpsPerConn() * comm->base.ndevs; // We must have at least 1 qp per-device + comm->base.vProps = mergedDev->vProps; comm->base.isSend = true; + stage->state = ncclIbCommStateSendDevList; + stage->offset = 0; + struct ncclIbConnectionMetadata meta; + NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail); + memcpy(stage->buffer, &mergedDev->vProps, sizeof(ncclNetVDeviceProps_t)); + +// In the case of mismatched nDevs, we will make sure that both sides of a logical connection have the same number of RC qps +ib_send_dev_list: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &comm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset)); + if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; + + stage->state = ncclIbCommStateRecvDevList; + stage->offset = 0; + +ib_recv_dev_list: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &comm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset)); + if 
(stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; + stage->offset = 0; + ncclNetVDeviceProps_t remoteVProps; + memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t)); + mergedDev = ncclIbMergedDevs + dev; + comm->base.vProps = mergedDev->vProps; + int localNqps, remoteNqps; + localNqps = ncclParamIbQpsPerConn() * comm->base.vProps.ndevs; // We must have at least 1 qp per-device + remoteNqps = ncclParamIbQpsPerConn() * remoteVProps.ndevs; + comm->base.nqps = remoteNqps > localNqps ? remoteNqps : localNqps; // Select max nqps (local or remote) // Init PD, Ctx for each IB device comm->ar = 1; // Set to 1 for logic - for (int i = 0; i < mergedDev->ndevs; i++) { - int ibDevN = mergedDev->devs[i]; + for (int i = 0; i < comm->base.vProps.ndevs; i++) { + int ibDevN = comm->base.vProps.devs[i]; NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &comm->devs[i].base, &comm->base.stats), ret, fail); - comm->ar = comm->ar && ncclIbDevs[dev].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled + comm->ar = comm->ar && ncclIbDevs[ibDevN].ar; // ADAPTIVE_ROUTING - if all merged devs have it enabled } - struct ncclIbConnectionMetadata meta; - meta.ndevs = comm->base.ndevs; + memset(&meta, 0, sizeof(meta)); + meta.ndevs = comm->base.vProps.ndevs; // Alternate QPs between devices int devIndex; @@ -1211,10 +1261,10 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet } else { meta.qpInfo[q].ece_supported = 0; } - devIndex = (devIndex + 1) % comm->base.ndevs; + devIndex = (devIndex + 1) % comm->base.vProps.ndevs; } - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { ncclIbSendCommDev* commDev = comm->devs + i; ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; @@ -1241,7 +1291,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // Print just the QPs for this dev if (comm->base.qps[q].devIndex == i) INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu FLID %d fifoRkey=0x%x fifoLkey=0x%x", - comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", + comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey); } @@ -1250,7 +1300,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // Print just the QPs for this dev if (comm->base.qps[q].devIndex == i) INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", - comm->base.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, + comm->base.vProps.ndevs > 2 ? 
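
// --- Worked example (not part of this patch) ---
// The device-list exchange above lets both endpoints agree on the QP count
// even when one side is fused and the other is not. Assuming
// ncclParamIbQpsPerConn() == 1, a 2-NIC sender and a 1-NIC receiver compute:
//   int localNqps  = 1 /*qpsPerConn*/ * 2 /*local ndevs*/;          // = 2
//   int remoteNqps = 1 /*qpsPerConn*/ * 1 /*remote ndevs*/;         // = 1
//   int nqps = remoteNqps > localNqps ? remoteNqps : localNqps;     // = 2
// Both sides run the same max(), so they create matching sets of RC QPs.
// ------------------------------------------------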
"NCCL MergedDev" : "NCCL Dev", dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex, devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } @@ -1261,7 +1311,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet stage->state = ncclIbCommStateSend; stage->offset = 0; - NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(meta)), ret, fail); memcpy(stage->buffer, &meta, sizeof(meta)); @@ -1282,17 +1331,12 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet memcpy(&remMeta, stage->buffer, sizeof(ncclIbConnectionMetadata)); comm->base.nRemDevs = remMeta.ndevs; - if (comm->base.nRemDevs != comm->base.ndevs) { - mergedDev = ncclIbMergedDevs + dev; - WARN("NET/IB : Local mergedDev=%s has a different number of devices=%d as remoteDev=%s nRemDevs=%d", - mergedDev->devName, comm->base.ndevs, remMeta.devName, comm->base.nRemDevs); - } int link_layer; link_layer = remMeta.devs[0].link_layer; for (int i = 1; i < remMeta.ndevs; i++) { if (remMeta.devs[i].link_layer != link_layer) { - WARN("NET/IB : Can't merge net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d", + WARN("NET/IB : Can't connect net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d", i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer); return ncclInternalError; } @@ -1309,7 +1353,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet comm->remSizesFifo.addr = remMeta.fifoAddr; } - for (int i=0; i < comm->base.ndevs; i++) { + for (int i=0; i < comm->base.vProps.ndevs; i++) { NCCLCHECKGOTO(wrap_ibv_reg_mr(comm->remSizesFifo.mrs+i, comm->devs[i].base.pd, &comm->remSizesFifo.elems, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); } comm->base.nRemDevs = remMeta.ndevs; @@ -1327,6 +1371,8 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet if (remQpInfo->ece_supported) NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail); + ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; + remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu); NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail); } @@ -1341,6 +1387,8 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet } } + comm->base.nDataQps = std::max(comm->base.vProps.ndevs, comm->base.nRemDevs); + comm->base.ready = 1; stage->state = ncclIbCommStateConnected; stage->offset = 0; @@ -1359,6 +1407,50 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet goto exit; } +NCCL_PARAM(IbWarnRailLocal, "IB_WARN_RAIL_LOCAL", 0); + +ncclResult_t ncclIbCheckVProps(ncclNetVDeviceProps_t* vProps1, ncclNetVDeviceProps_t* vProps2) { + ncclNetVDeviceProps_t outVProps = {0}; + ncclNetVDeviceProps_t* minVProps = vProps2; + ncclNetVDeviceProps_t* maxVProps = vProps1; + if (vProps2->ndevs > vProps1->ndevs) { + minVProps = vProps1; + maxVProps = vProps2; + } + + // Find the intersection of devices + for (int i = 0; i < minVProps->ndevs; i++) { + int dev = minVProps->devs[i]; + for 
(int j = 0; j < maxVProps->ndevs; j++) { + // Found + if (maxVProps->devs[j] == dev) { + outVProps.devs[outVProps.ndevs++] = dev; + } + } + } + + // In the case that at least one side has a fused NIC but there are no matching physical NICs, we should check if the user wants this + if (ncclParamIbWarnRailLocal() && outVProps.ndevs < maxVProps->ndevs) { + char local[128]; + int cursor = 1; + snprintf(local, sizeof(local), "%d", vProps1->devs[0]); + for (int i = 1; i < vProps1->ndevs; i++) { + snprintf(local+cursor, sizeof(local)-cursor, ",%d", vProps1->devs[i]); + cursor += 2; + } + char remote[128]; + snprintf(remote, sizeof(remote), "%d", vProps2->devs[0]); + cursor = 1; + for (int i = 1; i < vProps2->ndevs; i++) { + snprintf(remote+cursor, sizeof(remote)-cursor, ",%d", vProps2->devs[i]); + cursor += 2; + } + INFO(NCCL_NET, "NET/IB : There are mismatched physical devices between local (%s) and remote (%s). To disable this warning, set NCCL_IB_WARN_RAIL_LOCAL=0", local, remote); + } + + return ncclSuccess; +} + NCCL_PARAM(IbGdrFlushDisable, "GDR_FLUSH_DISABLE", 0); ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { @@ -1369,7 +1461,9 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle int ready; *recvComm = NULL; - if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; + if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; + if (stage->state == ncclIbCommStateRecvDevList) goto ib_recv_dev_list; + if (stage->state == ncclIbCommStateSendDevList) goto ib_send_dev_list; if (stage->state == ncclIbCommStateRecv) goto ib_recv; if (stage->state == ncclIbCommStateSend) goto ib_send; if (stage->state == ncclIbCommStatePendingReady) goto ib_recv_ready; @@ -1385,14 +1479,49 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle NCCLCHECKGOTO(ncclSocketInit(&rComm->base.sock), ret, fail); NCCLCHECKGOTO(ncclSocketAccept(&rComm->base.sock, &lComm->sock), ret, fail); + // Alloc stage->buffer here to be used for all following steps + struct ncclIbConnectionMetadata remMeta; + stage->offset = 0; + NCCLCHECK(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta))); + ib_accept_check: NCCLCHECKGOTO(ncclSocketReady(&rComm->base.sock, &ready), ret, fail); if (!ready) return ncclSuccess; + stage->state = ncclIbCommStateRecvDevList; + stage->offset = 0; + +// In the case of mismatched nDevs, we will make sure that both sides of a logical connection have the same number of RC qps +ib_recv_dev_list: + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset)); + if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; + ncclNetVDeviceProps_t remoteVProps; + memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t)); + if (lComm->dev >= ncclNMergedIbDevs) { + WARN("NET/IB : Trying to use non-existant virtual device %d", lComm->dev); + return ncclInternalError; + } + + // Reduce the physical device list and store in the connection base + struct ncclIbMergedDev* mergedDev; + mergedDev = ncclIbMergedDevs + lComm->dev; + NCCLCHECK(ncclIbCheckVProps(&mergedDev->vProps, &remoteVProps)); + rComm->base.vProps = mergedDev->vProps; + memcpy(stage->buffer, &rComm->base.vProps, sizeof(ncclNetVDeviceProps_t)); + rComm->base.isSend = false; + int localNqps, remoteNqps; + localNqps = ncclParamIbQpsPerConn() * rComm->base.vProps.ndevs; // We must have at least 1 qp per-device + remoteNqps = 
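
// --- Worked example (not part of this patch) ---
// ncclIbCheckVProps() above computes the intersection of the two physical
// device lists only to decide whether to report a rail-locality mismatch;
// in this version the connection then keeps using the local vProps
// unchanged. For example:
//   local  vProps.devs = {0, 1}   (ndevs = 2)
//   remote vProps.devs = {1, 2}   (ndevs = 2)
//   intersection        = {1}     (ndevs = 1)
// Since 1 < 2, setting NCCL_IB_WARN_RAIL_LOCAL=1 makes the mismatch visible
// through the INFO message above.
// ------------------------------------------------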
ncclParamIbQpsPerConn() * remoteVProps.ndevs; + rComm->base.nqps = remoteNqps > localNqps ? remoteNqps : localNqps; // Select max nqps (local or remote) - struct ncclIbConnectionMetadata remMeta; - stage->state = ncclIbCommStateRecv; stage->offset = 0; - NCCLCHECKGOTO(ncclIbMalloc((void**)&stage->buffer, sizeof(remMeta)), ret, fail); + stage->state = ncclIbCommStateSendDevList; + +ib_send_dev_list: + NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_SEND, &rComm->base.sock, stage->buffer, sizeof(ncclNetVDeviceProps_t), &stage->offset), ret, fail); + if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; + + stage->offset = 0; + stage->state = ncclIbCommStateRecv; ib_recv: NCCLCHECKGOTO(ncclSocketProgress(NCCL_SOCKET_RECV, &rComm->base.sock, stage->buffer, sizeof(remMeta), &stage->offset), ret, fail); @@ -1403,7 +1532,6 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle // IB setup // Pre-declare variables because of goto - struct ncclIbMergedDev* mergedDev; struct ncclIbDev* ibDev; int ibDevN; struct ncclIbRecvCommDev* rCommDev; @@ -1411,21 +1539,18 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle struct ncclIbQp* qp; mergedDev = ncclIbMergedDevs + lComm->dev; - rComm->base.ndevs = mergedDev->ndevs; - rComm->base.nqps = ncclParamIbQpsPerConn() * rComm->base.ndevs; // We must have at least 1 qp per-device - rComm->base.isSend = false; - rComm->base.nRemDevs = remMeta.ndevs; - if (rComm->base.nRemDevs != rComm->base.ndevs) { - WARN("NET/IB : Local mergedDev %s has a different number of devices=%d as remote %s %d", - mergedDev->devName, rComm->base.ndevs, remMeta.devName, rComm->base.nRemDevs); + if (rComm->base.nRemDevs != rComm->base.vProps.ndevs) { + INFO(NCCL_NET, "NET/IB : Local mergedDev %s has a different number of devices=%d as remote %s %d", + mergedDev->devName, rComm->base.vProps.ndevs, remMeta.devName, rComm->base.nRemDevs); } // Metadata to send back to requestor (sender) struct ncclIbConnectionMetadata meta; - for (int i = 0; i < rComm->base.ndevs; i++) { + memset(&meta, 0, sizeof(meta)); + for (int i = 0; i < rComm->base.vProps.ndevs; i++) { rCommDev = rComm->devs + i; - ibDevN = mergedDev->devs[i]; + ibDevN = rComm->base.vProps.devs[i]; NCCLCHECKGOTO(ncclIbInitCommDevBase(ibDevN, &rCommDev->base, &rComm->base.stats), ret, fail); ibDev = ncclIbDevs + ibDevN; NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail); @@ -1456,7 +1581,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle ibDev = ncclIbDevs + ibDevN; NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail); qp->devIndex = devIndex; - devIndex = (devIndex + 1) % rComm->base.ndevs; + devIndex = (devIndex + 1) % rComm->base.vProps.ndevs; // Set the ece (enhanced connection establishment) on this QP before RTR if (remMeta.qpInfo[q].ece_supported) { @@ -1469,23 +1594,22 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle // Store this in our own qpInfo for returning to the requestor if (meta.qpInfo[q].ece_supported) NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); + } else { + meta.qpInfo[q].ece_supported = 0; } - bool override_tc = (q == 0) ? 
true : false; - NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, override_tc), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); } rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess) && (ncclParamIbGdrFlushDisable() == 0)) ? 1 : 0; - for (int i = 0; i < mergedDev->ndevs; i++) { + for (int i = 0; i < rComm->base.vProps.ndevs; i++) { rCommDev = rComm->devs + i; - ibDevN = rCommDev->base.ibDevN; - ibDev = ncclIbDevs + ibDevN; + ibDev = ncclIbDevs + rCommDev->base.ibDevN; // Retain remote fifo info and prepare my RDMA ops - rCommDev->fifoRkey = remMeta.devs[i].fifoRkey; rComm->remFifo.addr = remMeta.fifoAddr; NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->fifoMr, rCommDev->base.pd, &rComm->remFifo.elems, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); rCommDev->fifoSge.lkey = rCommDev->fifoMr->lkey; @@ -1510,15 +1634,12 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle } // Fill Handle - meta.devs[i].lid = ibDev->portAttr.lid; - meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; - meta.devs[i].ib_port = ibDev->portNum; + meta.devs[i].lid = ibDev->portAttr.lid; + meta.devs[i].link_layer = rCommDev->base.gidInfo.link_layer = ibDev->portAttr.link_layer; + meta.devs[i].ib_port = ibDev->portNum; meta.devs[i].gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; meta.devs[i].gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; - - // Adjust the MTU - remMeta.devs[i].mtu = (enum ibv_mtu) std::min(remMeta.devs[i].mtu, ibDev->portAttr.active_mtu); - meta.devs[i].mtu = remMeta.devs[i].mtu; + meta.devs[i].mtu = ibDev->portAttr.active_mtu; // Prepare sizes fifo NCCLCHECKGOTO(wrap_ibv_reg_mr(&rComm->devs[i].sizesFifoMr, rComm->devs[i].base.pd, rComm->sizesFifo, sizeof(int)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail); @@ -1530,9 +1651,9 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle meta.qpInfo[q].qpn = rComm->base.qps[q].qp->qp_num; meta.qpInfo[q].devIndex = rComm->base.qps[q].devIndex; } - - meta.ndevs = rComm->base.ndevs; + meta.ndevs = rComm->base.vProps.ndevs; strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); + rComm->base.nDataQps = std::max(rComm->base.vProps.ndevs, rComm->base.nRemDevs); stage->state = ncclIbCommStateSend; stage->offset = 0; @@ -1662,7 +1783,7 @@ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui assert(size > 0); struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle)); - for (int i = 0; i < base->ndevs; i++) { + for (int i = 0; i < base->vProps.ndevs; i++) { // Each ncclIbNetCommDevBase is at different offset in send and recv netComms struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i); NCCLCHECKGOTO(ncclIbRegMrDmaBufInternal(devComm, data, size, type, offset, fd, mhandleWrapper->mrs + i), ret, fail); @@ -1706,9 +1827,11 @@ ncclResult_t ncclIbDeregMrInternal(ncclIbNetCommDevBase* base, ibv_mr* mhandle) } ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) 
{ + if (mhandle == NULL) return ncclSuccess; + struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; - for (int i = 0; i < base->ndevs; i++) { + for (int i = 0; i < base->vProps.ndevs; i++) { // Each ncclIbNetCommDevBase is at different offset in send and recv netComms struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i); NCCLCHECK(ncclIbDeregMrInternal(devComm, mhandleWrapper->mrs[i])); @@ -1773,7 +1896,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { // Multi-QP: make sure IB writes are multiples of 128B so that LL and LL128 protocols still work const int align = 128; - int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs; + int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.nDataQps; for (int i = 0; i < nqps; i++) { int qpIndex = comm->base.qpIndex; ncclIbQp* qp = comm->base.qps + qpIndex; @@ -1822,7 +1945,7 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { return ncclSuccess; } -ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { +ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -1852,7 +1975,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh char line[SOCKET_NAME_MAXLEN + 1]; union ncclSocketAddress addr; ncclSocketGetAddr(&comm->base.sock, &addr); - WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %d addr %lx rkeys[0]=%x", + WARN("NET/IB : req %d/%d tag %x peer %s posted incorrect receive info: size %ld addr %lx rkeys[0]=%x", r, nreqs, tag, ncclSocketToString(&addr, line), slots[r].size, slots[r].addr, slots[r].rkeys[0]); return ncclInternalError; } @@ -1868,7 +1991,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh req->send.offset = 0; // Populate events - int nEvents = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs; + int nEvents = ncclParamIbSplitDataOnQps() ? 
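
// --- Illustrative note (not part of this patch) ---
// nDataQps = max(local ndevs, remote ndevs) is what ncclIbMultiSend() and
// ncclIbIsend()/ncclIbIrecv() fall back to when ncclParamIbSplitDataOnQps()
// is 0, so every rail is exercised even for asymmetric (fused vs. plain)
// endpoints. With the same 2-NIC/1-NIC example as above:
//   int nDataQps = std::max(2, 1);                                   // = 2
//   int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps
//                                          : comm->base.nDataQps;    // = 2
// --------------------------------------------------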
comm->base.nqps : comm->base.nDataQps; int qpIndex = comm->base.qpIndex; // Count down while (nEvents > 0) { @@ -1883,7 +2006,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh } // Store all lkeys - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { req->send.lkeys[i] = mhandleWrapper->mrs[i]->lkey; } @@ -1909,7 +2032,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int tag, void* mh return ncclSuccess; } -ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) { +ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, size_t* sizes, int* tags, void** mhandles, struct ncclIbRequest* req) { struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); @@ -1921,14 +2044,14 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int // Select the next devIndex (local) and QP to use for posting this CTS message // Since QPs are initialized by striping across devIndex, we can simply assign this to the same value ncclIbQp* ctsQp = comm->base.qps + comm->base.devIndex; - comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.ndevs; + comm->base.devIndex = (comm->base.devIndex + 1) % comm->base.vProps.ndevs; for (int i=0; ibase.ndevs; j++) + for (int j = 0; j < comm->base.vProps.ndevs; j++) localElem[i].rkeys[j] = mhandleWrapper->mrs[j]->rkey; localElem[i].nreqs = n; @@ -1986,7 +2109,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, int return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -1999,7 +2122,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta req->sock = &comm->base.sock; req->nreqs = n; - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { req->devBases[i] = &comm->devs[i].base; } @@ -2011,7 +2134,7 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, int* sizes, int* ta TIME_START(1); // Select either all QPs, or one qp per-device - const int nqps = ncclParamIbSplitDataOnQps() ? comm->base.nqps : comm->base.ndevs; + const int nqps = ncclParamIbSplitDataOnQps() ? 
comm->base.nqps : comm->base.nDataQps; // Post recvs struct ibv_recv_wr* bad_wr; @@ -2047,7 +2170,7 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** struct ncclIbMrHandle* mhandle = (struct ncclIbMrHandle*) mhandles[last]; // We don't know which devIndex the recv was on, so we flush on all devices - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); wr.wr_id = req - comm->base.reqs; @@ -2078,7 +2201,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { *done = 0; while (1) { NCCLCHECK(ncclIbStatsCheckFatalCount(&r->base->stats,__func__)); - if (r->events[0] == 0 && r->events[1] == 0) { + if (r->events[0] == 0 && r->events[1] == 0 && r->events[2] == 0 && r->events[3] == 0) { TRACE(NCCL_NET, "r=%p done", r); *done = 1; if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { @@ -2112,13 +2235,13 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { char remoteGidString[INET6_ADDRSTRLEN] = ""; const char* localGidStr = NULL, *remoteGidStr = NULL; if (r->devBases[i]->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) { - localGidStr = inet_ntop(AF_INET6, &r->devBases[i]->gidInfo.localGid, localGidString, sizeof(localGidString)); - remoteGidStr = inet_ntop(AF_INET6, &r->base->remDevs[i].remoteGid, remoteGidString, sizeof(remoteGidString)); + localGidStr = ibvGetGidStr(&r->devBases[i]->gidInfo.localGid, localGidString, sizeof(localGidString)); + remoteGidStr = ibvGetGidStr(&r->base->remDevs[i].remoteGid, remoteGidString, sizeof(remoteGidString)); } char line[SOCKET_NAME_MAXLEN+1]; char *hcaName = r->devBases[i]->pd->context->device->name; - WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%d vendor err %d (%s)%s%s%s%s hca %s", + WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%u vendor err %u (%s)%s%s%s%s hca %s", ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type], localGidStr ? " localGid ":"", localGidString, remoteGidStr ? 
" remoteGids":"", remoteGidString, hcaName); return ncclRemoteError; @@ -2130,7 +2253,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; - TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%d wr_id=%ld r=%p type=%d events={%d,%d}, i=%d", + TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%u wr_id=%lu r=%p type=%d events={%d,%d}, i=%d", ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i); #endif if (req && req->type == NCCL_NET_IB_REQ_SEND) { @@ -2174,7 +2297,7 @@ ncclResult_t ncclIbCloseSend(void* sendComm) { for (int q = 0; q < comm->base.nqps; q++) if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp)); - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { struct ncclIbSendCommDev* commDev = comm->devs + i; if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr)); if (comm->remSizesFifo.mrs[i] != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remSizesFifo.mrs[i])); @@ -2194,7 +2317,7 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) { for (int q = 0; q < comm->base.nqps; q++) if (comm->base.qps[q].qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->base.qps[q].qp)); - for (int i = 0; i < comm->base.ndevs; i++) { + for (int i = 0; i < comm->base.vProps.ndevs; i++) { struct ncclIbRecvCommDev* commDev = comm->devs + i; if (comm->flushEnabled) { if (commDev->gpuFlush.qp.qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(commDev->gpuFlush.qp.qp)); @@ -2237,6 +2360,11 @@ ncclNet_t ncclNetIb = { ncclIbCloseRecv, ncclIbCloseListen, NULL /* getDeviceMr */, - NULL /* irecvConsumed */ + NULL /* irecvConsumed */, + ncclIbMakeVDevice }; +/* + ncclIbSetProperties, + ncclIbRefreshDevices +*/ diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 73a5d55b0..235dee865 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -44,6 +44,7 @@ ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); + pthread_mutex_unlock(&ncclNetSocketLock); return ncclInternalError; } else { #define MAX_LINE_LEN (2047) @@ -76,7 +77,7 @@ static ncclResult_t ncclNetSocketGetSpeed(char* devName, int* speed) { ncclResult_t ret = ncclSuccess; *speed = 0; char speedPath[PATH_MAX]; - sprintf(speedPath, "/sys/class/net/%s/speed", devName); + snprintf(speedPath, sizeof(speedPath), "/sys/class/net/%s/speed", devName); int fd = -1; SYSCHECKSYNC(open(speedPath, O_RDONLY), "open", fd); if (fd != -1) { @@ -102,6 +103,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->guid = dev; props->ptrSupport = NCCL_PTR_HOST; props->regIsGlobal = 0; + props->forceFlush = 0; NCCLCHECK(ncclNetSocketGetSpeed(props->name, &props->speed)); props->latency = 0; // Not set props->port = 0; @@ -109,6 +111,7 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->maxRecvs = 1; props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; return ncclSuccess; } @@ -297,6 +300,7 @@ ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) { ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) { if (dev < 0 || 
dev >= ncclNetIfs) { // data transfer socket is based on specified dev + WARN("NET/Socket : ncclNetSocketListen dev=%d ncclNetIfs=%d", dev, ncclNetIfs); return ncclInternalError; } ncclResult_t ret = ncclSuccess; @@ -558,16 +562,16 @@ ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, v } ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } -ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { +ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm; - NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct ncclNetSocketRequest**)request)); + NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, (int) size, (struct ncclNetSocketRequest**)request)); return ncclSuccess; } -ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm; if (n != 1) return ncclInternalError; - NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], sizes[0], (struct ncclNetSocketRequest**)request)); + NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], (int)sizes[0], (struct ncclNetSocketRequest**)request)); return ncclSuccess; } @@ -632,5 +636,6 @@ ncclNet_t ncclNetSocket = { ncclNetSocketClose, ncclNetSocketCloseListen, NULL /* getDeviceMr */, - NULL /* irecvConsumed */ + NULL /* irecvConsumed */, + NULL /* mergeDevices */ }; diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index aa9c486b1..582c30a35 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -108,11 +108,12 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size)); CUCHECK(cuMemUnmap(ptr, size)); CUCHECK(cuMemAddressFree(ptr, size)); CUCHECK(cuMemRelease(*mcHandler)); + INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d, size %ld", comm->rank, (void*)ptr, dev, size); return ncclSuccess; } @@ -450,11 +451,11 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { if (comm->localRank == 0) { shmPath[0] = '\0'; - NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); - NCCLCHECKGOTO(ncclShmOpen(shmPath, (sizeof(size_t) 
+ typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); } /* need 2 pools and a shared counter for shmem-based collectives */ comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; @@ -495,7 +496,7 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { return ncclSuccess; } -ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, bool *regUsed) { +ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t buffSize, CUdeviceptr *regAddr, int *regUsed) { ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; CUdeviceptr regPtr = 0; @@ -601,43 +602,33 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t } *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset; - *regUsed = true; + *regUsed = 1; exit: free(regData); return ret; fail: - *regUsed = false; + *regUsed = 0; goto exit; } -ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { +static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, struct ncclReg *sendRegRecord, struct ncclReg *recvRegRecord, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { ncclResult_t ret = ncclSuccess; - bool localRegBufUsed = false; + int regBufUsed = 0; struct localRegData *regData = NULL; bool sendNeedReg = false, recvNeedReg = false; CUdeviceptr regSendPtr = 0; CUdeviceptr regRecvPtr = 0; - struct ncclReg *sendRegRecord = NULL; - struct ncclReg *recvRegRecord = NULL; - - *outRegBufUsed = false; NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks * 2), ret, fail); - if (sendbuff) { - NCCLCHECKGOTO(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord), ret, fail); - if (sendRegRecord) { - memcpy(®Data[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg)); - regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr; - } + if (sendRegRecord) { + memcpy(®Data[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg)); + regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr; } - if (recvbuff) { - NCCLCHECKGOTO(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord), ret, fail); - if (recvRegRecord) { - memcpy(®Data[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg)); - regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr; - } + if (recvRegRecord) { + memcpy(®Data[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg)); + regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr; } NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail); @@ -682,229 +673,127 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send } if ((!sendNeedReg || sendbuff == NULL) && (!recvNeedReg || recvbuff == NULL)) { - localRegBufUsed = true; - INFO(NCCL_NVLS, "rank %d reuse local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg 
recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); + regBufUsed = 1; + INFO(NCCL_REG, "rank %d reuse registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); goto exit; } /* Start Registration. Not found registered buffers, then check whether both send and recv buffer locate * in register request cache. */ - if (sendNeedReg && sendbuff) { - tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, ®SendPtr, &localRegBufUsed); - if (localRegBufUsed == false) goto fail; + if (sendNeedReg && sendbuff && sendbuffSize > 0) { + tryRegisterBuffer(comm, (uintptr_t)sendbuff, sendbuffSize, ®SendPtr, ®BufUsed); + if (regBufUsed == 0) goto fail; } - if (recvNeedReg && recvbuff) { - tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, ®RecvPtr, &localRegBufUsed); - if (localRegBufUsed == false) goto fail; + if (recvNeedReg && recvbuff && recvbuffSize > 0) { + tryRegisterBuffer(comm, (uintptr_t)recvbuff, recvbuffSize, ®RecvPtr, ®BufUsed); + if (regBufUsed == 0) goto fail; } - INFO(NCCL_NVLS, "rank %d successfully local-registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); + INFO(NCCL_REG, "rank %d successfully registered NVLS sendbuff %p, recvbuff %p, sendbuff size %ld, recvbuff size %ld, reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, recvbuffSize, (void*)regSendPtr, (void*)regRecvPtr); exit: *outRegBufSend = (void*)regSendPtr; *outRegBufRecv = (void*)regRecvPtr; - *outRegBufUsed = localRegBufUsed; + *outRegBufUsed = regBufUsed; free(regData); return ncclSuccess; fail: - localRegBufUsed = false; + regBufUsed = 0; + WARN("rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize); goto exit; } +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { + struct ncclReg *sendRegRecord = NULL; + struct ncclReg *recvRegRecord = NULL; + bool sendIsValid = false; + bool recvIsValid = false; + + *outRegBufUsed = 0; + if (sendbuff) { + NCCLCHECK(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord)); + NCCLCHECK(ncclRegLocalIsValid(sendRegRecord, &sendIsValid)); + } else { + sendIsValid = true; + } + if (recvbuff) { + NCCLCHECK(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord)); + NCCLCHECK(ncclRegLocalIsValid(recvRegRecord, &recvIsValid)); + } else { + recvIsValid = true; + } + + if (sendIsValid && recvIsValid) + NCCLCHECK(nvlsRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); + + return ncclSuccess; +} + struct ncclNvlsCleanupCallback { struct ncclCommCallback base; - CUmemGenericAllocationHandle mcHandle; - CUdeviceptr ptr; - int dev; - size_t size; + struct ncclReg *reg; + struct ncclComm *comm; }; static ncclResult_t cleanupNvls(struct ncclComm* comm, struct ncclCommCallback* cb) { struct ncclNvlsCleanupCallback* obj = (struct ncclNvlsCleanupCallback*)cb; - NCCLCHECK(ncclNvlsDeregBuffer(&obj->mcHandle, obj->ptr, obj->dev, obj->size)); - 
INFO(NCCL_NVLS, "rank %d - deregistered buffer %p on device %d, size %ld", comm->rank, (void*)obj->ptr, obj->dev, obj->size); + NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); free(obj); return ncclSuccess; } ncclResult_t ncclNvlsGraphRegisterBuffer( struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, - bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, + int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueEltsAdded ) { - ncclResult_t ret = ncclSuccess; - bool localRegBufUsed = false; struct ncclNvlsCleanupCallback* sendRecord = NULL; struct ncclNvlsCleanupCallback* recvRecord = NULL; - CUdeviceptr regSendPtr = 0; - CUdeviceptr regRecvPtr = 0; - CUmulticastObjectProp mcprop; - CUmemAllocationProp ucprop; - char shareableHandle[NVLS_HANDLE_SIZE]; - CUmemGenericAllocationHandle sendMcHandle, recvMcHandle; - size_t sendGran = 0, recvGran = 0; - bool *regBufFlags = NULL; - struct graphRegData *rdata = NULL; - const void *baseSend = NULL; - const void *baseRecv = NULL; - size_t baseSendSize = 1; - size_t baseRecvSize = 1; - size_t ucgran; - - *outRegBufUsed = false; - NCCLCHECKGOTO(ncclCalloc(®BufFlags, comm->localRanks), ret, fail); - NCCLCHECKGOTO(ncclCalloc(&rdata, comm->localRanks), ret, fail); - - if (sendbuffSize > 0 || recvbuffSize > 0) { - /* retrieve base pointer and size */ - if (CUPFN(cuMemGetAddressRange) == nullptr) goto fail; - if (sendbuff != NULL) - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff), ret, fail); - if (recvbuff != NULL) - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff), ret, fail); - - memset(&ucprop, 0, sizeof(CUmemAllocationProp)); - ucprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - ucprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - ucprop.location.id = comm->cudaDev; - ucprop.requestedHandleTypes = ncclCuMemHandleType; - CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); - - localRegBufUsed = ((uint64_t)baseSend % ucgran != 0 || (uint64_t)baseRecv % ucgran != 0) ? 
false : true; - regBufFlags[comm->localRank] = localRegBufUsed; - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, regBufFlags, sizeof(bool)), ret, fail); - for (int i = 0; i < comm->localRanks; ++i) - if (regBufFlags[i] == false) goto fail; - - memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); - mcprop.numDevices = comm->localRanks; - mcprop.handleTypes = ncclCuMemHandleType; - mcprop.flags = 0; - - if (sendbuff != NULL) { - mcprop.size = baseSendSize; - CUCHECKGOTO(cuMulticastGetGranularity(&sendGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); - - /* check send buffer offset and size */ - rdata[comm->localRank].offset = (uintptr_t)sendbuff - (uintptr_t)baseSend; - rdata[comm->localRank].size = baseSendSize; - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail); - baseSendSize = rdata[0].size; - for (int i = 1; i < comm->localRanks; ++i) { - if (rdata[0].offset != rdata[i].offset) goto fail; - if (baseSendSize > rdata[i].size) baseSendSize = rdata[i].size; - } - if (baseSendSize % sendGran != 0) goto fail; - - mcprop.size = baseSendSize; + void *baseSend = NULL; + void *baseRecv = NULL; + size_t baseSendSize = 0; + size_t baseRecvSize = 0; + struct ncclReg *sendRegRecord = NULL; + struct ncclReg *recvRegRecord = NULL; - /* register sendbuff */ - if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &sendMcHandle, shareableHandle), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &sendMcHandle), ret, fail); - } + *outRegBufUsed = 0; + if (sendbuff) { + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff)); + NCCLCHECK(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)&sendRegRecord)); + } - CUCHECKGOTO(cuMulticastAddDevice(sendMcHandle, comm->nvlsResources->dev), ret, fail); - CUCHECKGOTO(cuMulticastBindAddr(sendMcHandle, 0, (CUdeviceptr)baseSend, baseSendSize, 0), ret, fail); + if (recvbuff) { + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff)); + NCCLCHECK(ncclCommGraphRegister(comm, baseRecv, baseRecvSize, (void**)&recvRegRecord)); + } - // Create a VA for the NVLS - CUCHECKGOTO(cuMemAddressReserve(®SendPtr, baseSendSize, sendGran, 0U, 0), ret, fail); - // Map the VA locally - CUCHECKGOTO(cuMemMap(regSendPtr, baseSendSize, 0, sendMcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess(regSendPtr, baseSendSize, &comm->nvlsResources->accessDesc, 1), ret, fail); + NCCLCHECK(nvlsRegisterBuffer(comm, baseSend, baseRecv, baseSendSize, baseRecvSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); + if (*outRegBufUsed) { + if (sendRegRecord) { sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); sendRecord->base.fn = cleanupNvls; - sendRecord->mcHandle = sendMcHandle; - sendRecord->ptr = regSendPtr; - sendRecord->dev = comm->nvlsResources->dev; - sendRecord->size = baseSendSize; - } - - if 
(recvbuff != NULL) { - mcprop.size = baseRecvSize; - CUCHECKGOTO(cuMulticastGetGranularity(&recvGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); - - rdata[comm->localRank].offset = (uintptr_t)recvbuff - (uintptr_t)baseRecv; - rdata[comm->localRank].size = baseRecvSize; - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, rdata, sizeof(struct graphRegData)), ret, fail); - baseRecvSize = rdata[0].size; - for (int i = 1; i < comm->localRanks; ++i) { - if (rdata[0].offset != rdata[i].offset) goto fail; - if (baseRecvSize > rdata[i].size) baseRecvSize = rdata[i].size; - } - if (baseRecvSize % recvGran != 0) goto fail; - - mcprop.size = baseRecvSize; - if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &recvMcHandle, shareableHandle), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &recvMcHandle), ret, fail); - } - - CUCHECKGOTO(cuMulticastAddDevice(recvMcHandle, comm->nvlsResources->dev), ret, fail); - CUCHECKGOTO(cuMulticastBindAddr(recvMcHandle, 0, (CUdeviceptr)baseRecv, baseRecvSize, 0), ret, fail); - - // Create a VA for the NVLS - CUCHECKGOTO(cuMemAddressReserve(®RecvPtr, baseRecvSize, recvGran, 0U, 0), ret, fail); - // Map the VA locally - CUCHECKGOTO(cuMemMap(regRecvPtr, baseRecvSize, 0, recvMcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess(regRecvPtr, baseRecvSize, &comm->nvlsResources->accessDesc, 1), ret, fail); - - recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); - recvRecord->base.fn = cleanupNvls; - recvRecord->mcHandle = recvMcHandle; - recvRecord->ptr = regRecvPtr; - recvRecord->dev = comm->nvlsResources->dev; - recvRecord->size = baseRecvSize; - } - - localRegBufUsed = true; - } - -exit: - if (localRegBufUsed == false) { - if (sendRecord) { - ncclNvlsDeregBuffer(&sendRecord->mcHandle, sendRecord->ptr, sendRecord->dev, sendRecord->size); - free(sendRecord); - } - - if (recvRecord) { - // Yes, it's a dead code. That's fine... 
- // coverity[dead_error_begin] - ncclNvlsDeregBuffer(&recvRecord->mcHandle, recvRecord->ptr, recvRecord->dev, recvRecord->size); - free(recvRecord); - } - } else { - if (sendRecord) { - *outRegBufSend = (void*)((uintptr_t)regSendPtr + (uintptr_t)sendbuff - (uintptr_t)baseSend); + sendRecord->reg = sendRegRecord; + sendRecord->comm = comm; ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)sendRecord); *nCleanupQueueEltsAdded += 1; } - if (recvRecord) { - *outRegBufRecv = (void*)((uintptr_t)regRecvPtr + (uintptr_t)recvbuff - (uintptr_t)baseRecv); + if (recvRegRecord) { + recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); + recvRecord->base.fn = cleanupNvls; + recvRecord->reg = recvRegRecord; + recvRecord->comm = comm; ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)recvRecord); *nCleanupQueueEltsAdded += 1; } - - INFO(NCCL_NVLS, "rank %d successfully graph-registered sendbuff %p, recvbuff %p, sendbuff size %ld (register size %ld, sendGran %ld), recvbuff size %ld (register size %ld, recvGran %ld), reg sendbuff %p, reg recvbuff %p", comm->rank, sendbuff, recvbuff, sendbuffSize, baseSendSize, sendGran, recvbuffSize, baseRecvSize, recvGran, (void*)regSendPtr, (void*)regRecvPtr); + } else { + if (sendbuff) NCCLCHECK(ncclCommGraphDeregister(comm, sendRegRecord)); + if (recvbuff) NCCLCHECK(ncclCommGraphDeregister(comm, recvRegRecord)); } - *outRegBufUsed = localRegBufUsed; - free(regBufFlags); - free(rdata); - /* always return success. */ return ncclSuccess; -fail: - localRegBufUsed = false; - goto exit; } #else @@ -936,19 +825,19 @@ ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { ncclResult_t ncclNvlsGraphRegisterBuffer( struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, - bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, + int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueEltsAdded ) { *outRegBufUsed = false; return ncclSuccess; } -ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv) { *outRegBufUsed = false; return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 6569ae175..3ae514e45 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -91,6 +91,8 @@ struct p2pCuMemProxyInfo { #include +NCCL_PARAM(LegacyCudaRegister, "LEGACY_CUDA_REGISTER", 0); + /* Convert a PCI busId string into a local cudaDev device index (cf. 
CUDA_VISIBLE_DEVICES) */ static int busIdToCudaDev(int64_t busId) { int ndev; @@ -120,21 +122,9 @@ extern int64_t ncclParamMNNVLEnable(); ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { initCeOperation(); - // MNNVL support - if (comm->MNNVL && info1->hostHash != info2->hostHash) { - NCCLCHECK(ncclTopoCheckMNNVL(comm->topo, info1, info2, ret)); - if (*ret) return ncclSuccess; - } - - // Rule out different nodes / isolated containers - if (info1->hostHash != info2->hostHash || info1->shmDev != info2->shmDev) { - *ret = 0; - return ncclSuccess; - } - // Check topology / p2p level. int intermediateRank; - NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank)); + NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank)); if (*ret == 0) return ncclSuccess; if (intermediateRank != -1) { if (useMemcpy) *ret = 0; @@ -149,6 +139,12 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph return ncclSuccess; } + if (info1->hostHash != comm->peerInfo[comm->rank].hostHash || + info1->hostHash != info2->hostHash) { + // If either peer is non-local then we are done. + return ncclSuccess; + } + // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int cudaDev1 = busIdToCudaDev(info1->busId); int cudaDev2 = busIdToCudaDev(info2->busId); @@ -313,11 +309,11 @@ NCCL_PARAM(P2pDirectDisable, "P2P_DIRECT_DISABLE", 0); #define P2P_SAME_PID(MYINFO, PEERINFO) ((MYINFO->hostHash == PEERINFO->hostHash) && (MYINFO->pidHash == PEERINFO->pidHash)) -static ncclResult_t p2pGetInfo(struct ncclTopoSystem* topo, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) { +static ncclResult_t p2pGetInfo(struct ncclComm* comm, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* read, int* intermediateRank) { int p2p; // Queries the topology to see if the GPUs are Ampere and // connected via NVLink, if so we enable P2P Read by default - NCCLCHECK(ncclTopoCheckP2p(topo, info1->rank, info2->rank, &p2p, read, intermediateRank)); + NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, &p2p, read, intermediateRank)); int readEnable = ncclParamP2pReadEnable(); if (readEnable != -2) *read = readEnable; @@ -367,7 +363,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; int useRead, intermediateRank; - NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); + NCCLCHECK(p2pGetInfo(comm, myInfo, peerInfo, &useRead, &intermediateRank)); if (useMemcpy) useRead = 0; static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); @@ -386,7 +382,6 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = myInfo->rank; if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) { resources->type = P2P_DIRECT; - send->conn.flags |= info->read ? 
NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/direct pointer%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr); } else { @@ -402,8 +397,8 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st INFO(NCCL_INIT|NCCL_P2P,"Channel %02d/%01d : %d[%d] -> %d[%d] via P2P/IPC%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, useReadStr, useMemcpy ? "/CE" : ""); } - send->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; } + send->conn.flags |= info->read ? NCCL_P2P_READ : NCCL_P2P_WRITE; } else { resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; @@ -437,7 +432,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; int useRead, intermediateRank; - NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); + NCCLCHECK(p2pGetInfo(comm, myInfo, peerInfo, &useRead, &intermediateRank)); static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; @@ -454,7 +449,6 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = myInfo->rank; if (P2P_SAME_PID(myInfo, peerInfo) && ncclParamP2pDirectDisable() == 0 && useMemcpy == 0) { resources->type = P2P_DIRECT; - recv->conn.flags |= info->read ? NCCL_DIRECT_READ : NCCL_DIRECT_WRITE; } else { if (ncclCuMemEnable()) { // cuMem API support @@ -465,8 +459,8 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st // Legacy CUDA IPC resources->type = P2P_IPC; } - recv->conn.flags |= info->read ? NCCL_IPC_READ : NCCL_IPC_WRITE; } + recv->conn.flags |= info->read ? 
NCCL_P2P_READ : NCCL_P2P_WRITE; } else { resources->type = P2P_INTERMEDIATE; info->rank = intermediateRank; @@ -807,9 +801,8 @@ static ncclResult_t p2pSendProxyProgress(struct ncclProxyState* proxyState, stru return ncclSuccess; } -ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) { - ncclResult_t ret = ncclSuccess; - struct ncclReg *regRecord = NULL; +static ncclResult_t ipcRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, struct ncclReg* regRecord, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, bool* isLegacyIpc) { +ncclResult_t ret = ncclSuccess; struct ncclIpcRegInfo* newInfo = NULL; uintptr_t* peerRmtAddrs = NULL; bool legacyIpcCap = false; @@ -820,123 +813,125 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si *regBufFlag = 0; *offsetOut = 0; *peerRmtAddrsOut = NULL; - if (comm && userbuff && buffSize > 0 && nPeers > 0) { - NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); - if (regRecord) { - // buffer was registered by by users, we need to start to register or reuse it - int peerLocalRank; - for (int p = 0; p < nPeers; p++) { - int peerRank = peerRanks[p]; - peerLocalRank = comm->rankToLocalRank[peerRank]; - if (regRecord->ipcInfos[peerLocalRank]) { - // We already have IPC info for peerLocalRank, no need to register it, we can reuse it - *regBufFlag = 1; - INFO(NCCL_REG, "rank %d - IPC local reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr); - } else { - // Register buffer with peerLocalRank - struct ncclProxyConnector* proxyConn = NULL; - struct p2pIpcExpInfo ipcInfo; + if (isLegacyIpc) *isLegacyIpc = false; + if (regRecord) { + // buffer was registered by by users, we need to start to register or reuse it + int peerLocalRank; + for (int p = 0; p < nPeers; p++) { + int peerRank = peerRanks[p]; + peerLocalRank = comm->rankToLocalRank[peerRank]; + if (regRecord->ipcInfos[peerLocalRank]) { + // We already have IPC info for peerLocalRank, no need to register it, we can reuse it + *regBufFlag = 1; + if (isLegacyIpc) *isLegacyIpc = regRecord->ipcInfos[peerLocalRank]->impInfo.legacyIpcCap; + INFO(NCCL_REG, "rank %d - IPC reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr); + } else { + // Register buffer with peerLocalRank + struct ncclProxyConnector* proxyConn = NULL; + struct p2pIpcExpInfo ipcInfo; - if (baseAddr == NULL) { - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); - CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail); - } - if (comm->gproxyConn[peerRank].initialized == false) - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail); - proxyConn = &comm->gproxyConn[peerRank]; - - ipcInfo.legacyIpcCap = legacyIpcCap; - // Get the mem handle for that buffer. 
It may have been allocated through cudaMalloc in which case we'll - // get the CUDA legacy mem handle, or through cuMem*. - if (ipcInfo.legacyIpcCap) { - // legacy export - if (comm->directMode) goto fail; + if (baseAddr == NULL) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + CUCHECKGOTO(cuPointerGetAttribute((void*)&legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail); + } + if (comm->gproxyConn[peerRank].initialized == false) + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail); + proxyConn = &comm->gproxyConn[peerRank]; + + // Get the mem handle for that buffer. It may have been allocated through cudaMalloc in which case we'll + // get the CUDA legacy mem handle, or through cuMem*. + if (ncclCuMemEnable()) { + CUmemGenericAllocationHandle handle; + if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) { + // if cuMem* export fails, retry legacy export + if (comm->directMode || !ncclParamLegacyCudaRegister()) goto fail; CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); - } else if (ncclCuMemEnable()) { - CUmemGenericAllocationHandle handle; - if (CUPFN(cuMemRetainAllocationHandle(&handle, baseAddr)) != CUDA_SUCCESS) { - // if cuMem* export fails, retry legacy export - if (comm->directMode) goto fail; - CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); - ipcInfo.legacyIpcCap = true; + ipcInfo.legacyIpcCap = true; + if (isLegacyIpc) *isLegacyIpc = true; + } else { + ipcInfo.legacyIpcCap = false; + if (isLegacyIpc) *isLegacyIpc = false; + // cuMem* export to file descriptor or fabric handle + if (proxyConn->sameProcess) { + memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle)); } else { - // cuMem* export to file descriptor or fabric handle - if (proxyConn->sameProcess) { - memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle)); + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + int expFd = -1; + CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail); + NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail); + SYSCHECKGOTO(close(expFd), "close", ret, fail); } else { - if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - int expFd = -1; - CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail); - NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail); - SYSCHECKGOTO(close(expFd), "close", ret, fail); - } else { - // Allow this to silently fail for cases where the user buff cannot be registered - if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) { - CUCHECKGOTO(cuMemRelease(handle), ret, fail); - goto fail; - } + // Allow this to silently fail for cases where the user buff cannot be registered + if (CUPFN(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0)) != CUDA_SUCCESS) { + CUCHECKGOTO(cuMemRelease(handle), ret, fail); + goto fail; } } - CUCHECKGOTO(cuMemRelease(handle), ret, fail); } - } else { - // nothing works, just return - goto fail; + CUCHECKGOTO(cuMemRelease(handle), ret, fail); } + } else if (legacyIpcCap) { + // legacy export + if (comm->directMode || 
!ncclParamLegacyCudaRegister()) goto fail; + CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); + ipcInfo.legacyIpcCap = true; + if (isLegacyIpc) *isLegacyIpc = true; + } else { + // nothing works, just return + goto fail; + } - void* rmtRegAddr = NULL; - ipcInfo.size = baseSize; - ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; - // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side - // and get the remote register address back. - if (proxyConn) - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); - if (rmtRegAddr) { - NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail); - assert(regRecord->ipcInfos[peerLocalRank] == NULL); - regRecord->state |= IPC_REG_COMPLETE; - newInfo->peerRank = peerRank; - newInfo->baseAddr = baseAddr; - newInfo->impInfo.rmtRegAddr = rmtRegAddr; - newInfo->impInfo.offset = ipcInfo.offset; - newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap; - newInfo->ipcProxyconn = proxyConn; - regRecord->ipcInfos[peerLocalRank] = newInfo; - if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) { - NCCLCHECKGOTO(ncclCalloc(®Record->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail); - } - regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; - needUpdate = true; - *regBufFlag = 1; - INFO(NCCL_REG, "rank %d - IPC local register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); + void* rmtRegAddr = NULL; + ipcInfo.size = baseSize; + ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; + // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side + // and get the remote register address back. 
+ if (proxyConn) + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); + if (rmtRegAddr) { + NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail); + assert(regRecord->ipcInfos[peerLocalRank] == NULL); + regRecord->state |= IPC_REG_COMPLETE; + newInfo->peerRank = peerRank; + newInfo->baseAddr = baseAddr; + newInfo->impInfo.rmtRegAddr = rmtRegAddr; + newInfo->impInfo.offset = ipcInfo.offset; + newInfo->impInfo.legacyIpcCap = ipcInfo.legacyIpcCap; + newInfo->ipcProxyconn = proxyConn; + regRecord->ipcInfos[peerLocalRank] = newInfo; + if (regRecord->regIpcAddrs.hostPeerRmtAddrs == NULL) { + NCCLCHECKGOTO(ncclCalloc(®Record->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail); } + regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; + needUpdate = true; + *regBufFlag = 1; + INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); } } + } - if (*regBufFlag) { - if (type == NCCL_IPC_COLLECTIVE) { - // for collective, store registered remote buffers into dev memory for future reference - if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) { - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); - if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) - NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - if (needUpdate) - NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); - } - peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs; - } else { - assert(nPeers == 1); - // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct - peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank]; + if (*regBufFlag) { + if (type == NCCL_IPC_COLLECTIVE) { + // for collective, store registered remote buffers into dev memory for future reference + if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) { + NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) + NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + if (needUpdate) + NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); } - *offsetOut = (uintptr_t)userbuff - regRecord->addr; - *peerRmtAddrsOut = peerRmtAddrs; + peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs; + } else { + assert(nPeers == 1); + // 
p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct + peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank]; } + *offsetOut = (uintptr_t)userbuff - regRecord->addr; + *peerRmtAddrsOut = peerRmtAddrs; } } - exit: return ret; fail: @@ -944,146 +939,81 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si *offsetOut = 0; *peerRmtAddrsOut = NULL; if (newInfo) free(newInfo); + WARN("rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %p", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc); + goto exit; +} + +ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut) { + ncclResult_t ret = ncclSuccess; + struct ncclReg *regRecord = NULL; + bool isValid = false; + + *regBufFlag = 0; + *offsetOut = 0; + *peerRmtAddrsOut = NULL; + if (comm && userbuff && buffSize > 0 && nPeers > 0) { + NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); + NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); + if (isValid) + NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, NULL), ret, fail); + } + +exit: + return ret; +fail: + *regBufFlag = 0; + *offsetOut = 0; + *peerRmtAddrsOut = NULL; goto exit; } struct ncclIpcCleanupCallback { struct ncclCommCallback base; - bool isAddrs; - union { - struct ncclIpcRegInfo regInfo; - struct ncclPeerRegIpcAddr regIpcAddrs; - }; + struct ncclComm *comm; + struct ncclReg *reg; }; static ncclResult_t cleanupIpc(struct ncclComm* comm, struct ncclCommCallback* cb) { struct ncclIpcCleanupCallback* obj = (struct ncclIpcCleanupCallback*)cb; - if (obj->isAddrs) { - if (obj->regIpcAddrs.hostPeerRmtAddrs) - free(obj->regIpcAddrs.hostPeerRmtAddrs); - if (obj->regIpcAddrs.devPeerRmtAddrs) - NCCLCHECK(ncclCudaFree(obj->regIpcAddrs.devPeerRmtAddrs)); - } else { - NCCLCHECK(ncclIpcDeregBuffer(comm, &obj->regInfo)); - } + NCCLCHECK(ncclCommGraphDeregister(obj->comm, obj->reg)); free(obj); return ncclSuccess; } ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, size_t buffSize, int* peerRanks, int nPeers, ncclIpcRegType type, int* regBufFlag, uintptr_t* offsetOut, uintptr_t** peerRmtAddrsOut, void* cleanupQueuePtr, int* nCleanupQueueElts) { ncclResult_t ret = ncclSuccess; - struct ncclProxyConnector* proxyConn = NULL; - struct p2pIpcExpInfo ipcInfo; void* baseAddr; size_t baseSize; struct ncclIntruQueue* cleanupQueue = reinterpret_cast*>(cleanupQueuePtr); - uintptr_t* peerRmtAddrs = NULL; - struct ncclIpcCleanupCallback* addrsRecord = NULL; + bool isLegacyIpc = false; + struct ncclReg *regRecord = NULL; *regBufFlag = 0; - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); - CUCHECKGOTO(cuPointerGetAttribute((void*)&ipcInfo.legacyIpcCap, CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE, (CUdeviceptr)baseAddr), ret, fail); - - if (type == NCCL_IPC_COLLECTIVE) { - // collective needs host memory array to hold all remote buffer addrs. 
- // We need to put this into graph release queue - NCCLCHECKGOTO(ncclCalloc(&addrsRecord, 1), ret, fail); - addrsRecord->base.fn = cleanupIpc; - addrsRecord->isAddrs = true; - NCCLCHECKGOTO(ncclCalloc(&addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks), ret, fail); - } else { - assert(nPeers == 1); - // p2p does not need anything, just returning the remote buffer is enough, but for now, we register - // peer one by one so nPeers must be 1 - } - - for (int p = 0; p < nPeers; ++p) { - int peerRank = peerRanks[p]; - if (comm->gproxyConn[peerRank].initialized == false) - NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_P2P, 1, peerRank, &comm->gproxyConn[peerRank]), ret, fail); - proxyConn = &comm->gproxyConn[peerRank]; - // Same as local registration. Get the mem handle for that buffer. It may have been allocated through - // cudaMalloc in which case we'll get the CUDA legacy mem handle, or through cuMem*. - if (ipcInfo.legacyIpcCap) { - if (comm->directMode) goto fail; - CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); - } else if (ncclCuMemEnable()) { - // cuMem* export - CUmemGenericAllocationHandle handle; - if (pfn_cuMemRetainAllocationHandle(&handle, baseAddr) != CUDA_SUCCESS) { - if (comm->directMode) goto fail; - CUDACHECKGOTO(cudaIpcGetMemHandle(&ipcInfo.ipcDesc.devIpc, baseAddr), ret, fail); - ipcInfo.legacyIpcCap = true; - } else { - if (proxyConn->sameProcess) { - memcpy(&ipcInfo.ipcDesc.memHandle, &handle, sizeof(CUmemGenericAllocationHandle)); - } else { - if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - int expFd = -1; - CUCHECKGOTO(cuMemExportToShareableHandle(&expFd, handle, ncclCuMemHandleType, 0), ret, fail); - if (proxyConn->sameProcess) { - ipcInfo.impFd = expFd; - } else { - NCCLCHECKGOTO(ncclProxyClientQueryFdBlocking(comm, proxyConn, expFd, &ipcInfo.impFd), ret, fail); - SYSCHECKGOTO(close(expFd), "close", ret, fail); - } - } else { - CUCHECKGOTO(cuMemExportToShareableHandle(&ipcInfo.ipcDesc.cuDesc.handle, handle, ncclCuMemHandleType, 0), ret, fail); - } - } - CUCHECKGOTO(cuMemRelease(handle), ret, fail); - } - } else { - goto fail; - } - - void* rmtRegAddr = NULL; - ipcInfo.size = baseSize; - ipcInfo.offset = 0; - NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(struct p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); - if (rmtRegAddr) { + *offsetOut = 0; + *peerRmtAddrsOut = NULL; + if (comm && userbuff && buffSize > 0 && nPeers > 0) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseAddr, baseSize, (void**)®Record), ret, fail); + NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, &isLegacyIpc), ret, fail); + if (*regBufFlag) { struct ncclIpcCleanupCallback* record; NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail); record->base.fn = cleanupIpc; - record->isAddrs = false; - record->regInfo.peerRank = peerRank; - record->regInfo.baseAddr = baseAddr; - record->regInfo.impInfo.rmtRegAddr = rmtRegAddr; - record->regInfo.impInfo.offset = 0; - record->regInfo.impInfo.legacyIpcCap = ipcInfo.legacyIpcCap; - record->regInfo.ipcProxyconn = proxyConn; - // store the remote address into host addr array - if (type == NCCL_IPC_COLLECTIVE) - addrsRecord->regIpcAddrs.hostPeerRmtAddrs[comm->rankToLocalRank[peerRank]] = (uintptr_t)rmtRegAddr; - else - peerRmtAddrs = 
(uintptr_t*)rmtRegAddr; - *regBufFlag = 1; - if (ipcInfo.legacyIpcCap) - ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &record->base); - else - ncclIntruQueueEnqueue(cleanupQueue, &record->base); - if (nCleanupQueueElts) *nCleanupQueueElts += 1; - INFO(NCCL_REG, "rank %d - IPC graph register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, baseAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - (uintptr_t)baseAddr); + record->comm = comm; + record->reg = regRecord; + if (isLegacyIpc) { + ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, (struct ncclCommCallback*)record); + } else { + ncclIntruQueueEnqueue(cleanupQueue, (struct ncclCommCallback*)record); + if (nCleanupQueueElts) *nCleanupQueueElts += 1; + } + } else { + NCCLCHECKGOTO(ncclCommGraphDeregister(comm, regRecord), ret, fail); } } - if (type == NCCL_IPC_COLLECTIVE) { - // allocate the dev addr array and copy all previously stored addrs into it. - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclCudaCallocAsync(&addrsRecord->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(addrsRecord->regIpcAddrs.devPeerRmtAddrs, addrsRecord->regIpcAddrs.hostPeerRmtAddrs, comm->nRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); - peerRmtAddrs = addrsRecord->regIpcAddrs.devPeerRmtAddrs; - if (ipcInfo.legacyIpcCap) - ncclIntruQueueEnqueue(&comm->legacyRegCleanupQueue, &addrsRecord->base); - else - ncclIntruQueueEnqueue(cleanupQueue, &addrsRecord->base); - } - *offsetOut = (uintptr_t)userbuff - (uintptr_t)baseAddr; - *peerRmtAddrsOut = peerRmtAddrs; - exit: + // coverity[leaked_storage:FALSE] => normally, addrsRecord is added to the cleanupQueue return ret; fail: *regBufFlag = 0; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index 9be95fd80..d2d6906e8 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -454,6 +454,7 @@ static ncclResult_t shmRecvProxyProgress(struct ncclProxyState* proxyState, stru } static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t result = ncclSuccess; struct shmRequest* req = (struct shmRequest*)reqBuff; /* check message size */ if (reqSize != sizeof(struct shmRequest)) return ncclInternalError; @@ -463,13 +464,18 @@ static ncclResult_t shmSendProxySetup(struct ncclProxyConnection* connection, st struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr)); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; - return ncclSuccess; +exit: + return result; +fail: + free(proxyInfo); + goto exit; } static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int 
reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t result = ncclSuccess; struct shmRequest* req = (struct shmRequest*)reqBuff; /* check message size */ if (reqSize != sizeof(struct shmRequest)) return ncclInternalError; @@ -479,10 +485,14 @@ static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, st struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr)); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; - return ncclSuccess; +exit: + return result; +fail: + free(proxyInfo); + goto exit; } static void initCeOperation() { @@ -534,7 +544,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l } else { char shmPath[SHM_PATH_MAX] = { '\0' }; desc->shmli.shmSize = size; - NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle)); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, hptr, dptr, 1, &desc->shmli.handle)); memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix)); desc->legacy = true; INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr); @@ -542,7 +552,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l #else /* CUDART_VERSION >= 12020 */ char shmPath[SHM_PATH_MAX] = { '\0' }; desc->shmli.shmSize = size; - NCCLCHECK(ncclShmOpen(shmPath, size, hptr, dptr, 1, &desc->shmli.handle)); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), size, hptr, dptr, 1, &desc->shmli.handle)); memcpy(desc->shmli.shmSuffix, shmPath + sizeof("/dev/shm/nccl-") - 1, sizeof(desc->shmli.shmSuffix)); desc->legacy = true; INFO(NCCL_SHM, "MMAP allocated shareable host buffer %s size %zi ptr %p", shmPath, size, *hptr); @@ -618,15 +628,15 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_ INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity); } else { char shmPath[SHM_PATH_MAX]; - sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix); - NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); + snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); descOut->legacy = true; INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr); } #else /* CUDART_VERSION >= 12020 */ char shmPath[SHM_PATH_MAX]; - sprintf(shmPath, "/dev/shm/nccl-%s", desc->shmli.shmSuffix); - NCCLCHECK(ncclShmOpen(shmPath, desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); + snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix); + NCCLCHECK(ncclShmOpen(shmPath, sizeof(shmPath), desc->shmli.shmSize, hptr, dptr, -1, &descOut->shmli.handle)); descOut->legacy = true; INFO(NCCL_SHM, "MMAP imported shareable host buffer %s size %zi ptr %p", shmPath, desc->shmli.shmSize, *hptr); #endif From d7ccab8b7e35eb6b0df5825c2813b2b8780b6d12 Mon Sep 17 00:00:00 
2001
From: Giuseppe Congiu
Date: Tue, 10 Dec 2024 06:29:57 -0800
Subject: [PATCH 03/21] Add profiler documentation

Add the following files:
- ext-profiler/README.md: plugin writer documentation
- ext-profiler/example/README.md: example plugin user documentation
---
 ext-profiler/README.md         | 318 +++++++++++++++++++++++++++++++++
 ext-profiler/example/README.md | 239 +++++++++++++++++++++++++
 2 files changed, 557 insertions(+)
 create mode 100644 ext-profiler/README.md
 create mode 100644 ext-profiler/example/README.md

diff --git a/ext-profiler/README.md b/ext-profiler/README.md
new file mode 100644
index 000000000..7ef44b2fa
--- /dev/null
+++ b/ext-profiler/README.md
@@ -0,0 +1,318 @@
+# NCCL Profiler Plugin Documentation
+
+This page describes the NCCL Profiler plugin API and how to implement a profiler plugin for NCCL.
+
+# Overview
+
+To allow NCCL to better integrate with DL frameworks, NCCL v2.23 introduced a profiler plugin
+interface. Any NCCL user can write profiler plugins to extract performance data from NCCL and
+use it for debugging and analysis.
+
+Similarly to other plugins (e.g., the network plugin), profiler plugins come as a shared library
+called `libnccl-profiler.so`. That shared library contains one or more implementations of the
+NCCL PROFILER API, in the form of versioned structs, filled with pointers to all required
+functions.
+
+# Plugin architecture
+
+## Plugin name and supporting multiple profiler plugins
+
+When NCCL is initialized, it will look for a `libnccl-profiler.so` library and dynamically load
+it, then look for symbols inside the library.
+
+The `NCCL_PROFILER_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
+will look for a library with a name of `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. It is therefore
+advised to name the library following that pattern, with a symlink pointing `libnccl-profiler.so`
+to `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the
+path, setting `NCCL_PROFILER_PLUGIN` will allow users to select the right plugin. Alternatively,
+the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `libnccl-profiler.so` library.
+
+## Struct versioning
+
+Once a library is found, NCCL will look for a symbol named `ncclProfiler_vX`, with `X` increasing
+over time. The versioning ensures that the plugin and the NCCL core are compatible.
+
+Plugins are encouraged to provide multiple of those symbols, implementing multiple versions of the
+NCCL PROFILER API, so that the same plugin can be compiled and support a wide range of NCCL versions.
+
+Conversely, and to ease the transition, NCCL can choose to support different plugin versions, looking
+for the latest ncclProfiler struct version, but also looking for older ones so that older plugins
+would still work.
+
+## Headers management
+
+To help users build plugins effortlessly, plugins should copy the `ncclProfiler_vX` definitions
+they support to their internal includes. An example is shown in `ext-profiler/example`, where we
+keep all headers in the `nccl/` directory and provide thin layers to implement old versions on top
+of newer ones.
+
+The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions
+from old API versions. It also provides error codes in `err.h`.
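+
+Putting the pieces together, a minimal plugin skeleton could look like the sketch below. The header
+name and function names are placeholders (not part of the API); only the exported, versioned
+`ncclProfiler_v2` symbol matters to NCCL. The function bodies are sketched in the following sections.
+
+```
+#include "profiler_v2.h"  // placeholder: header providing ncclProfiler_v2_t and the related types
+
+static ncclResult_t myInit(void** context, int* eActivationMask);
+static ncclResult_t myStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+static ncclResult_t myStopEvent(void* eHandle);
+static ncclResult_t myRecordEventState(void* eHandle, ncclProfilerEventState_v2_t eState,
+                                       ncclProfilerEventStateArgs_v2_t* eStateArgs);
+static ncclResult_t myFinalize(void* context);
+
+// Versioned symbol that NCCL resolves in libnccl-profiler.so.
+ncclProfiler_v2_t ncclProfiler_v2 = {
+  .name             = "myProfiler",
+  .init             = myInit,
+  .startEvent       = myStartEvent,
+  .stopEvent        = myStopEvent,
+  .recordEventState = myRecordEventState,
+  .finalize         = myFinalize,
+};
+```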
+
+# API (v2)
+
+Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections.
+
+```
+typedef struct {
+  const char* name;
+
+  // init - initialize the profiler plugin
+  // Input
+  //  - context        : opaque profiler context object for separating profiler behavior across comms
+  // Output
+  //  - eActivationMask: bitmask of active events set by the plugin
+  ncclResult_t (*init)(void** context, int* eActivationMask);
+
+  // startEvent - initialize and start a new event for the supplied event descriptor inside the event set
+  // Input
+  //  - context: opaque profiler context object
+  //  - eDescr : pointer to ncclProfilerEventDescr_t object
+  // Output
+  //  - eHandle: return event handle for supplied event descriptor object
+  ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr);
+
+  // stopEvent - stop/finalize an event inside an event set
+  // Input
+  //  - eHandle: handle to event object
+  ncclResult_t (*stopEvent)(void* eHandle);
+
+  // recordEventState - record event state transitions and event attribute updates
+  // Input
+  //  - eHandle   : handle to event object created through startEvent
+  //  - eStateArgs: optional argument used to capture event attribute updates associated with the state transition
+  //  - eState    : event state transition
+  ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs);
+
+  // finalize - finalize the profiler plugin
+  // Input
+  //  - context: opaque profiler context object
+  ncclResult_t (*finalize)(void* context);
+} ncclProfiler_v2_t;
+```
+
+## Error codes
+
+As a rule of thumb, profiler-generated errors should not be propagated to NCCL and alter its normal
+functioning. Nevertheless, the profiler interface returns NCCL error codes, in case any need for
+them arises in the future. For now, any profiler interface call should only return `ncclSuccess`.
+The only exception is `init`, which can return an error so that NCCL can disable the plugin.
+
+## Operation overview
+
+NCCL will call the `init` function first for every new communicator that is initialized. The profiler
+returns an opaque context handle that is used to isolate profiler instances across communicators.
+Similarly, NCCL will call `finalize` to destroy the profiler context, thus freeing resources.
+
+The NCCL core code is instrumented with calls to `startEvent`, `stopEvent` and `recordEventState`.
+These are used to start, stop and update events in the profiler, respectively.
+
+## API Functions
+
+### Initialization
+
+#### name
+
+The `name` field should point to a character string with the name of the profiler plugin. This will
+be used for all logging, especially when `NCCL_DEBUG=INFO` is set.
+
+#### init
+
+As soon as NCCL finds the plugin and the correct ncclProfiler symbol, it calls its `init` function.
+This allows the plugin to initialize its internal context, used during profiling of NCCL events.
+If the `init` function does not return `ncclSuccess`, NCCL disables the plugin.
+
+#### finalize
+
+When the profiler is no longer needed, a call to `finalize` destroys the profiler context and frees
+up resources.
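+
+As a sketch (the names, the chosen event selection and the context layout are illustrative, not
+mandated by the API), an `init`/`finalize` pair could allocate a per-communicator context and
+advertise which events the plugin wants to receive:
+
+```
+#include <stdlib.h>
+
+struct myContext {
+  int activeEvents;  // copy of the event mask advertised to NCCL
+  // per-communicator profiler state (event pools, output file, ...) would go here
+};
+
+static ncclResult_t myInit(void** context, int* eActivationMask) {
+  struct myContext* ctx = (struct myContext*)calloc(1, sizeof(struct myContext));
+  if (ctx == NULL) return ncclInternalError;  // a failing init makes NCCL disable the plugin
+  // Illustrative choice: instrument group, collective and proxy op events.
+  *eActivationMask = ncclProfileGroup | ncclProfileColl | ncclProfileProxyOp;
+  ctx->activeEvents = *eActivationMask;
+  *context = ctx;
+  return ncclSuccess;
+}
+
+static ncclResult_t myFinalize(void* context) {
+  // Called once per communicator: release everything allocated in init.
+  free(context);
+  return ncclSuccess;
+}
+```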
+
+### Profiling
+
+#### startEvent
+
+When NCCL needs to start profiling a new event it calls `startEvent`. `startEvent` takes the profiler
+context, previously created by `init`, and an event descriptor of type `ncclProfilerEventDescr_t`, and
+returns an opaque profiler event handle that can be passed to other profiler functions, as discussed
+later in the document.
+
+The event descriptor contains all the event metadata. Every event type has its own descriptor. Below
+is the `ncclProfilerEventDescr_t` struct.
+
+```
+typedef struct {
+  uint8_t type;             // event type (e.g., ncclProfileGroup, ncclProfileColl, ...)
+  void* parentObj;          // pointer to parent event used to expose the event hierarchy to the profiler
+  int rank;                 // rank that generated the event
+  union {
+    struct {                // collective events metadata
+      const char* name;     // string containing name of the communicator
+      uint64_t commHash;    // unique hash/id for the communicator
+      uint64_t seqNumber;   // sequence number of this collective operation in the communicator
+      const char* func;     // string containing name of the collective
+      void const* sendBuff; // address of send buffer
+      void* recvBuff;       // address of recv buffer
+      size_t count;         // data count
+      int root;             // root rank
+      const char* datatype; // string containing the name of the datatype
+      size_t trafficBytes;  // number of transfer bytes
+      uint8_t nMaxChannels; // max number of channels for this collective
+      uint8_t nWarps;       // number of GPU warps for this collective
+      const char* algo;     // string containing name of the algorithm for this collective
+      const char* proto;    // string containing name of the protocol for this collective
+    } coll;
+
+    struct {                // point-to-point events metadata
+      const char* name;
+      uint64_t commHash;
+      const char* func;
+      void* buff;
+      const char* datatype;
+      size_t count;
+      int peer;             // peer rank for this point-to-point
+    } p2p;
+
+    struct {                // proxyOp events metadata
+      pid_t pid;            // process id that generated the associated `ncclProxyOp` object
+      uint8_t channelId;    // id of the channel used by the associated `ncclProxyOp` object
+      int peer;             // peer rank
+      int nSteps;           // number of network transfers/steps required by the `ncclProxyOp`
+      int chunkSize;        // chunk size for this `ncclProxyOp`
+      int isSend;           // set to 1 for sends and 0 for recvs
+    } proxyOp;
+
+    struct {                // proxyStep events metadata
+      int step;             // individual step in `ncclProxyOp`
+    } proxyStep;
+  };
+} ncclProfilerEventDescr_v2_t;
+```
+
+NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`,
+`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`.
+
+#### stopEvent
+
+`stopEvent` takes the event handle returned by `startEvent` to stop the event. After the event
+has been stopped the handle can no longer be used with other profiler calls. Using the event
+handle after `stopEvent` is undefined behavior.
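+
+As an illustration, a `startEvent`/`stopEvent` pair could allocate a small per-event record, timestamp
+it, and release it when the event stops. The record layout and the `nowUs()` helper are made up for
+this sketch; only the descriptor fields shown above are assumed.
+
+```
+#include <stdint.h>
+#include <stdlib.h>
+#include <time.h>
+
+struct myEvent {
+  uint8_t type;            // ncclProfileGroup, ncclProfileColl, ncclProfileProxyOp, ...
+  double startUs, stopUs;  // begin/end timestamps in microseconds
+};
+
+static double nowUs(void) {
+  struct timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  return ts.tv_sec * 1e6 + ts.tv_nsec * 1e-3;
+}
+
+static ncclResult_t myStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) {
+  (void)context;
+  struct myEvent* ev = (struct myEvent*)calloc(1, sizeof(struct myEvent));
+  // Returning a NULL handle is allowed; the plugin then simply ignores later calls for that handle.
+  if (ev == NULL) { *eHandle = NULL; return ncclSuccess; }
+  ev->type = eDescr->type;
+  ev->startUs = nowUs();
+  // A real plugin would also copy type-specific metadata, e.g. eDescr->coll.func for collectives.
+  *eHandle = ev;
+  return ncclSuccess;
+}
+
+static ncclResult_t myStopEvent(void* eHandle) {
+  struct myEvent* ev = (struct myEvent*)eHandle;
+  if (ev == NULL) return ncclSuccess;
+  ev->stopUs = nowUs();
+  // Emit or buffer the completed record here, then release it (a real plugin would typically
+  // return it to a pool). The handle must not be used after this call.
+  free(ev);
+  return ncclSuccess;
+}
+```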
+
+#### recordEventState
+
+Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl` and
+`ncclProfileP2p` cannot be updated through calls to `recordEventState`.
+
+`ncclProfileProxyOp`, `ncclProfileProxyStep` and `ncclProfileProxyCtrl` can be updated through
+calls to `recordEventState`.
+
+The state of proxy-generated events can be updated, along with event attributes, using
+`recordEventState`. These events can go through several states during their lifecycle.
+The list of supported states for the proxy-defined events is reported below.
+
+```
+typedef enum {
+  // ncclProfileProxyOp event states
+  ncclProfilerProxyOpSendPosted,        // state marks the posting of send buffer to GPU for given network transfer/step
+  ncclProfilerProxyOpSendRemFifoWait,   // state marks the waiting of CTS credits from peer rank
+  ncclProfilerProxyOpSendTransmitted,   // state marks the sending of network transfer/step to peer rank
+  ncclProfilerProxyOpSendDone,          // state marks the ending of network transfer/step
+  ncclProfilerProxyOpRecvPosted,        // state marks the posting of recv to network for given network transfer/step
+  ncclProfilerProxyOpRecvReceived,      // state marks the recving of network transfer/step from peer rank
+  ncclProfilerProxyOpRecvTransmitted,   // state marks the ending of the network transfer/step
+  ncclProfilerProxyOpRecvDone,          // state marks the consuming of data from GPU
+
+  // ncclProfileProxyStep event states
+  ncclProfilerProxyStepSendGPUWait,     // state marks the waiting of send data from GPU for given network transfer/step
+  ncclProfilerProxyStepSendWait,        // state marks the waiting of send data from network for given network transfer/step
+  ncclProfilerProxyStepRecvWait,        // state marks the waiting of recv data from network for given network transfer/step
+  ncclProfilerProxyStepRecvFlushWait,   // state marks the waiting of recv data flush to GPU for given network transfer/step
+  ncclProfilerProxyStepRecvGPUWait,     // state marks the waiting of recv data consumption from GPU for given network transfer/step
+
+  // ncclProfileProxyCtrl event states
+  ncclProfilerProxyCtrlIdle,            // state marks proxy progress thread idle
+  ncclProfilerProxyCtrlActive,          // state marks proxy progress thread active
+  ncclProfilerProxyCtrlSleep,           // state marks proxy progress thread sleeping
+  ncclProfilerProxyCtrlWakeup,          // state marks proxy progress thread waking up
+  ncclProfilerProxyCtrlAppend,          // state marks append of new network work item begin
+  ncclProfilerProxyCtrlAppendEnd,       // state marks append of new network work item end
+} ncclProfilerEventState_v2_t;
+```
+
+`ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing
+network requests for the GPU kernel. ProxyOp events are generated for every active channel and
+provide a summary of the activity of the proxy progress thread for that channel.
+
+`ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing
+network requests for the GPU kernel. ProxyStep events describe individual network transfers in
+the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events.
+
+`ncclProfileProxyCtrl` events are generated by the proxy progress thread while it is not processing
+network requests for the GPU kernel. This includes everything else that the proxy thread might be
+doing, including appending new `ncclProxyOp` objects to the list of work elements to process.
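+
+For example, a plugin could translate these transitions into per-event timelines. The record below is a
+standalone sketch (it reuses the illustrative `nowUs()` helper from the earlier sketch); which attribute
+is valid for which transition is an assumption made here purely for illustration.
+
+```
+struct myProxyEvent {
+  ncclProfilerEventState_v2_t lastState;  // most recent state transition observed
+  double lastStateUs;                     // timestamp of that transition
+  size_t transSize;                       // bytes transferred so far, for ProxyOp events
+};
+
+static ncclResult_t myRecordEventState(void* eHandle, ncclProfilerEventState_v2_t eState,
+                                       ncclProfilerEventStateArgs_v2_t* eStateArgs) {
+  struct myProxyEvent* ev = (struct myProxyEvent*)eHandle;
+  if (ev == NULL) return ncclSuccess;  // event was dropped at startEvent time
+  ev->lastState = eState;
+  ev->lastStateUs = nowUs();
+  // ProxyOp updates may carry optional attributes (see the args struct below); here we assume the
+  // transmitted transitions report the running byte count.
+  if (eStateArgs != NULL &&
+      (eState == ncclProfilerProxyOpSendTransmitted || eState == ncclProfilerProxyOpRecvTransmitted)) {
+    ev->transSize = eStateArgs->proxyOp.transSize;
+  }
+  return ncclSuccess;
+}
+```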
+
+State transitions for the events described can also come with event attribute updates. For this
+reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below.
+
+```
+typedef union {
+  struct {                 // attributes to update for ncclProfileProxyOp events
+    size_t transSize;      // data transferred thus far
+    int steps;             // network transfer/steps processed thus far
+  } proxyOp;
+
+  struct {                 // attributes to update for ncclProfileProxyCtrl
+    int appendedProxyOps;  // number of appended proxy ops thus far
+  } proxyCtrl;
+} ncclProfilerEventStateArgs_v2_t;
+```
+
+The example profiler in `ext-profiler/example` contains details on how to capture and use the events above.
+
+### Event hierarchy
+
+NCCL core events (reported above) are organized into a hierarchy as reported below:
+
+```
+Group event
+   |
+   +- Collective event
+   |      |
+   |      +- ProxyOp event
+   |             |
+   |             +- ProxyStep event
+   |
+   +- Point-to-point event
+          |
+          +- ProxyOp event
+                 |
+                 +- ProxyStep event
+
+ProxyCtrl event
+```
+
+# Profiler instrumentation and logging
+
+## Profiling of collective and p2p operations
+
+The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups,
+collective and point-to-point operations, as well as proxy progress activity. Due to the asynchronous nature
+of NCCL operations, events associated with collective and point-to-point operations are not easy to delimit
+precisely. For example, without proxy and/or kernel activity it is impossible for the profiler to
+figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to
+the profiler that the collective has been enqueued. The profiler can leverage proxy event information, if
+those events are enabled, to estimate when the collective ends. In this case, the profiler can look at the
+`stopEvent` call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective
+event. This can be achieved by reference counting the collective event and letting calls to `startEvent` and
+`stopEvent` increment and decrement the reference counter, respectively.
+
+## PXN
+
+PXN causes some proxy operations to be processed in a remote proxy thread that differs from the one that
+generated the operation. When this happens, the event hierarchy reported above breaks. Because the
+profiler can use the hierarchy information, provided by NCCL in the event descriptor, to dereference the
+parent event during `startEvent`, the remote proxy thread must be in the same address space as the proxy
+thread originating the operation. To prevent the profiler instance in the remote proxy address space from
+dereferencing a pointer that belongs to another address space, the event descriptor includes the PID of the
+originator. The profiler plugin needs to check that the originator PID matches the local PID before
+dereferencing the parent event.
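+
+A minimal sketch of that check (the helper name is illustrative; `eDescr->proxyOp.pid` and
+`eDescr->parentObj` are the descriptor fields introduced earlier):
+
+```
+#include <unistd.h>
+
+// Return 1 only if it is safe to dereference eDescr->parentObj, i.e. the ProxyOp event was
+// generated by a proxy thread running in this process.
+static int parentIsLocal(ncclProfilerEventDescr_v2_t* eDescr) {
+  if (eDescr->type != ncclProfileProxyOp) return 1;
+  return eDescr->proxyOp.pid == getpid();
+}
+```
+
+A plugin would call such a helper at the top of `startEvent` and, for remote ProxyOp events, either drop
+the event or track it in a detached pool instead of attaching it to a parent from another process.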
For example, users can
+change the size of the event window the profiler keeps track of.
+
+## Building the profiler plugin
+
+To build the example plugin, just type `make`. You will need the include directory of an NCCL build to be
+present; you can override `NCCL_HOME` to point to the NCCL installation on your system.
+
+## Using the profiler plugin
+
+1. Add the directory of this profiler plugin to your `LD_LIBRARY_PATH` or set `NCCL_PROFILER_PLUGIN`,
+   as documented in `ext-profiler/README.md`.
+
+2. Set the `NCCL_PROFILE_EVENT_MASK` bitmask to specify the NCCL events you want to instrument. By
+   default, all collectives and send/recv operations will be traced. For more details about the event
+   representation used by the profiler, refer to `ext-profiler/README.md`.
+
+   As an example, setting:
+
+   `NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`)
+
+   enables the profiling of the group, the collective and the proxy op events. The same events can be
+   expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed,
+   NCCL also captures all the events that sit above the requested one in the event hierarchy. The advantage
+   is that the profiler can easily correlate events that belong to the same NCCL operation and present
+   them accordingly.
+
+3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named
+   ${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the Chrome
+   event format (more precisely, using asynchronous events).
+
+4. If you set the dump file variable, open chrome://tracing in your Chromium browser's address bar and
+   load the created dump file to visualize the traces.
+
+# Changing the profiler memory pool sizes
+
+The example profiler uses separate memory pools for different types of events. The size of these memory
+pools (i.e., the number of event objects) determines how many events the profiler can keep track of at the
+same time. When NCCL requests a new event (e.g., a collective event) to profile a `ncclAllReduce`
+operation, by calling `startEvent`, the profiler searches in the collective pool for a free event. If it
+finds one, it marks it as in use and returns the handle to NCCL. If the pool is completely used, the
+profiler returns `NULL` to NCCL and ignores all the following NCCL profiler calls for the `NULL` event
+handle. When the `ncclAllReduce` has been processed, NCCL calls `stopEvent` with the previously returned
+event handle. The profiler has a total of 5 memory pools.
+
+The group, collective and p2p pools contain objects for the corresponding events. The `ProxyCtrl` pool
+contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events
+generated by remote proxies. A list of the pools and their sizes is reported below:
+
+- `NCCL_PROFILE_GROUP_POOL_SIZE` (16)
+- `NCCL_PROFILE_COLL_POOL_SIZE` (16)
+- `NCCL_PROFILE_P2P_POOL_SIZE` (1024)
+- `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16)
+- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128)
+
+Remote proxy operations are generated when PXN is in use. Refer to this article for more information
+about PXN and how it works:
+https://developer.nvidia.com/blog/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12/
+
+# Reported events
+
+The example profiler generates traces using the JSON format.
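+
+As a reference, a dump like the one shown next can be produced with settings along the following lines
+(the plugin path, the launcher and the application name are placeholders, not something NCCL mandates):
+
+```
+export NCCL_PROFILER_PLUGIN=/path/to/libnccl-profiler-example.so   # or add its directory to LD_LIBRARY_PATH
+export NCCL_PROFILE_EVENT_MASK=8          # ncclProfileProxyOp; parent group/collective events are captured implicitly
+export NCCL_PROFILE_DUMP_FILE=nccl_trace  # creates nccl_trace-<hostname>-<tid>.txt files
+mpirun -np 8 ./my_nccl_app                # any NCCL application; the trace below was collected on 8 GPUs
+```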
An example of trace is reported below: + +``` +[ +{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}}, +{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}}, +{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768506.941406, "args": {"Step": 2}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}}, 
+{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}}, +{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000}, +{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769622.517578}, +{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}}, +{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883}, +{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}}, +{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 
770013.848633},
+{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}},
+{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266},
+{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}},
+{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477},
+{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}},
+{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875},
+ ... [ trace truncated for brevity ]
+{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383},
+{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945},
+{}]
+```
+
+Details about the fields used in the trace can be found at this link:
+https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw
+
+The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through
+the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call.
+(Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only
+one collective and this is what is presented in the traces above).
+
+The `AllReduce` event encloses traces for the proxy operation associated with the `ncclAllReduce` operation. The `args`
+field in the traces contains NCCL-specific information (in addition to the standard Chrome trace event fields).
+
+## AllReduce trace
+
+The `AllReduce` entry presents information about the `ncclAllReduce` operation. It contains the following info in the args field:
+
+- seqNum      : sequential number of the collective in the communicator (every collective type has its own sequence number in the communicator)
+- commHash    : communicator unique identifier
+- rank        : NCCL rank for the ncclAllReduce
+- datatype    : NCCL datatype
+- algorithm   : algorithm used to process the ncclAllReduce
+- protocol    : protocol used to process the ncclAllReduce
+- nMaxChannels: max number of channels used to process the ncclAllReduce
+
+If the proxy events are not active (e.g., the `ncclAllReduce` is intranode), the end timestamp will only reflect the
+time taken by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling
+of collective and p2p operations`.
+
+### Proxy Send
+
+The `Send` entry presents information about the `ProxyOp` processing in the progress thread.
It contains the following
+info in the args field:
+
+- Channel      : id of the channel used by this proxy operation to send data to the peer
+- Peer         : peer rank
+- Steps        : number of network steps required to transfer transSize bytes to the peer
+- ChunkSize    : chunk size used by NCCL to pipeline data through the proxy thread
+- transSize    : bytes transferred across the channel by this proxy operation
+- POSTED       : struct containing the number of buffer posts to the GPU and the time stamp for the last post
+- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait
+- TRANSMITTED  : struct containing the number of network sends and the time stamp of the last send
+- DONE         : struct containing the number of network sends completed and the time stamp of the last send completed
+
+In case of a network problem, the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE entries might all have partially
+updated steps, which could help identify at which point the network problem occurred.
+
+The Proxy Send trace gives a summary of the proxy progress thread activity for the channel. If more details are
+needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
+entries below are also reported by the profiler.
+
+#### Proxy SendBufferWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available.
+
+#### Proxy SendGPUWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging
+buffer.
+
+#### Proxy SendWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete.
+
+### Proxy Recv
+
+The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following
+info in the args field:
+
+- Channel    : id of the channel used by this proxy operation to recv data from the peer
+- Peer       : peer rank
+- Steps      : number of network steps required to transfer transSize bytes from the peer
+- ChunkSize  : chunk size used by NCCL to pipeline data through the proxy thread
+- transSize  : bytes transferred across the channel by this proxy operation
+- POSTED     : struct containing the number of recvs posted and the time stamp for the last recv posted
+- RECEIVED   : struct containing the number of recvs completed and the time stamp for the last recv completed
+- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed
+- DONE       : struct containing the number of flushes completed and the time stamp for the last flush completed
+
+The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are
+needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`), in which case the trace
+entries below are also reported by the profiler.
+
+#### Proxy RecvBufferWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to
+become available.
+
+#### Proxy RecvWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete.
+
+#### Proxy RecvFlushWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU.
+
+#### Proxy RecvGPUWait
+
+Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data.

From 1672c85781ba6158d5d173d3ecac969f8796af11 Mon Sep 17 00:00:00 2001
From: Sylvain Jeaugey
Date: Fri, 17 Jan 2025 02:03:22 -0800
Subject: [PATCH 04/21] Fix packaging scripts.

Issue #1578
---
 pkg/debian/Makefile               | 2 +-
 pkg/debian/libnccl-dev.install.in | 2 +-
 pkg/debian/rules                  | 3 +++
 pkg/redhat/nccl.spec.in           | 6 ++++--
 pkg/txz/create_txz.sh.in          | 2 +-
 5 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/pkg/debian/Makefile b/pkg/debian/Makefile
index 0494f3e03..650ca4270 100644
--- a/pkg/debian/Makefile
+++ b/pkg/debian/Makefile
@@ -25,7 +25,7 @@ prep : $(DEBTARGETS)
 build : prep
 	$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
 	@printf "Building Debian package\n"
-	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
+	(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b -Zxz)
 	mkdir -p $(PKGDIR)
 	mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
 
diff --git a/pkg/debian/libnccl-dev.install.in b/pkg/debian/libnccl-dev.install.in
index 13eca26c6..45120e6de 100644
--- a/pkg/debian/libnccl-dev.install.in
+++ b/pkg/debian/libnccl-dev.install.in
@@ -1,4 +1,4 @@
+bin/ncclras /usr/bin
 include/nccl.h /usr/include
-include/nccl_net.h /usr/include
 lib/libnccl.so /usr/lib/${pkg:MultiArch}
 lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
diff --git a/pkg/debian/rules b/pkg/debian/rules
index 23b90a9e0..8005d3020 100755
--- a/pkg/debian/rules
+++ b/pkg/debian/rules
@@ -11,3 +11,6 @@ override_dh_auto_test:
 
 override_dh_auto_clean:
 	# Do not make clean
+
+override_dh_builddeb:
+	dh_builddeb -- -Zxz
diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in
index 8e5aed6f3..d62955592 100644
--- a/pkg/redhat/nccl.spec.in
+++ b/pkg/redhat/nccl.spec.in
@@ -20,6 +20,7 @@ sockets.
%package devel Summary: NVIDIA Collective Communication Library (NCCL) Runtime Group: Development/Libraries +Requires: libnccl >= ${nccl:Major}.${nccl:Minor}.${nccl:Patch} %description devel NCCL development files @@ -44,9 +45,10 @@ install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUI ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major} # devel +install -m 755 -d $RPM_BUILD_ROOT/%{_bindir} install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} +install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir} install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} -install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir} ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so # static @@ -64,8 +66,8 @@ rm -rf $RPM_BUILD_ROOT %files devel %doc LICENSE.txt %defattr(-,root,root,-) +%{_bindir}/ncclras %{_includedir}/nccl.h -%{_includedir}/nccl_net.h %{_libdir}/libnccl.so %files static diff --git a/pkg/txz/create_txz.sh.in b/pkg/txz/create_txz.sh.in index deae85483..88f961325 100644 --- a/pkg/txz/create_txz.sh.in +++ b/pkg/txz/create_txz.sh.in @@ -21,4 +21,4 @@ PKG_ARCH=${pkg:Arch} NCCLNAME="nccl_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${PKG_REVISION}+cuda${CUDA_MAJOR}.${CUDA_MINOR}_${PKG_ARCH}" -tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt +tar --transform "s/^$BUILDDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $BUILDDIR/bin $BUILDDIR/include $BUILDDIR/lib $BUILDDIR/*.txt From 80f6bda4378b99d99e82b4d76a633791cc45fef0 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Mon, 27 Jan 2025 03:30:22 -0800 Subject: [PATCH 05/21] NCCL 2.25.1-1 Add Blackwell/SM100 support * Add compilation for sm100 * Add graph search speeds for Blackwell * Optimize graph search to converge on large NVLink domains * Limit NVLS heads to 32 * Increase various limits to fit large NVLink domains * Add extra checks for IMEX setup, needed for MNNVL * Increase MAXCHANNELS to 64 Extend NVTX instrumentation to track NCCL communicators * Add communicator ID to NVTX traces to allow for correlation between ranks. 
RAS fixes --- makefiles/common.mk | 8 +- makefiles/version.mk | 4 +- src/Makefile | 2 +- src/collectives.cc | 79 ++----- src/device/Makefile | 2 +- src/device/all_reduce.h | 2 +- src/enqueue.cc | 24 ++- src/graph/connect.cc | 7 +- src/graph/paths.cc | 35 +++- src/graph/search.cc | 45 ++-- src/graph/topo.cc | 57 +++--- src/graph/topo.h | 7 +- src/graph/tuning.cc | 21 +- src/include/alloc.h | 11 +- src/include/device.h | 6 +- src/include/enqueue.h | 2 +- src/include/graph.h | 3 +- src/include/mnnvl.h | 15 ++ src/include/nvtx.h | 85 ++++++-- src/include/nvtx3/nvToolsExtPayloadHelper.h | 6 +- src/include/nvtx_payload_schemas.h | 125 ++++++++++++ src/include/proxy.h | 2 + src/init.cc | 215 ++++++++------------ src/mnnvl.cc | 82 ++++++++ src/proxy.cc | 36 ++-- src/ras/client_support.cc | 6 +- src/ras/ras_internal.h | 2 +- src/register/coll_reg.cc | 18 +- src/transport/nvls.cc | 12 +- src/transport/p2p.cc | 14 +- 30 files changed, 603 insertions(+), 330 deletions(-) create mode 100644 src/include/mnnvl.h create mode 100644 src/include/nvtx_payload_schemas.h create mode 100644 src/mnnvl.cc diff --git a/makefiles/common.mk b/makefiles/common.mk index 82164ab5c..1b1bb8674 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -39,14 +39,20 @@ endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 +CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_120,code=sm_120 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 +CUDA13_PTX = -gencode=arch=compute_120,code=compute_120 -ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) +ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0) +# Include Blackwell support if we're using CUDA12.8 or above + NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) +else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) # Include Hopper support if we're using CUDA11.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0) diff --git a/makefiles/version.mk b/makefiles/version.mk index 252300934..b02cf909c 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 24 -NCCL_PATCH := 3 +NCCL_MINOR := 25 +NCCL_PATCH := 1 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index 2c5d9e863..b66ebefa2 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,7 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc \ + init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ diff --git a/src/collectives.cc b/src/collectives.cc index 479d4c511..03122f8a7 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -8,6 +8,7 @@ #include "collectives.h" #include "enqueue.h" #include "nccl.h" +#include "nvtx_payload_schemas.h" const 
char* ncclFuncToString(ncclFunc_t fn) { switch (fn) { @@ -78,11 +79,8 @@ NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { // Just pass the size of one message and not the total bytes sent/received. - constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"} - }; - size_t msgsize = sendcount * ncclTypeSize(datatype); - NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize) + NVTX3_FUNC_WITH_PARAMS(AllGather, NcclNvtxParamsAllGather, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, sendcount * ncclTypeSize(datatype))); struct ncclInfo info = { ncclFuncAllGather, "AllGather", sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ @@ -94,18 +92,8 @@ NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsAllReduce { - size_t bytes; - ncclRedOp_t op; - }; - // Just pass the size of one message and not the total bytes sent/received. - static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsAllReduce, op)} - }; - NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op}; - NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload) + NVTX3_FUNC_WITH_PARAMS(AllReduce, NcclNvtxParamsAllReduce, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), op)); struct ncclInfo info = { ncclFuncAllReduce, "AllReduce", sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ @@ -117,16 +105,8 @@ NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsBroadcast { - size_t bytes; - int root; - }; - constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)} - }; - NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root}; - NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Broadcast, NcclNvtxParamsBroadcast, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, count * ncclTypeSize(datatype), root)); struct ncclInfo info = { ncclFuncBroadcast, "Broadcast", sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ @@ -145,19 +125,8 @@ NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - struct NvtxParamsReduce { - size_t bytes; - int root; - ncclRedOp_t op; - }; - constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduce, op)} - }; - NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op}; - NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Reduce, NcclNvtxParamsReduce, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root, op)); struct ncclInfo info = { ncclFuncReduce, "Reduce", sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ @@ -169,17 +138,8 @@ NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - struct NvtxParamsReduceScatter { - size_t bytes; - ncclRedOp_t op; - }; - constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}, - {0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0, - offsetof(NvtxParamsReduceScatter, op)} - }; - NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op}; - NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload) + NVTX3_FUNC_WITH_PARAMS(ReduceScatter, NcclNvtxParamsReduceScatter, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, recvcount * ncclTypeSize(datatype), op)); struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ @@ -187,21 +147,12 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv return ncclEnqueueCheck(&info); } -struct NvtxParamsSendRecv { - size_t bytes; - int peer; -}; -constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)} -}; - NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; - NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Send, NcclNvtxParamsSendRecv, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, count * ncclTypeSize(datatype), peer)); struct ncclInfo info = { ncclFuncSend, "Send", NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ @@ -213,8 +164,8 @@ NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t da ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) { - NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer}; - NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload) + NVTX3_FUNC_WITH_PARAMS(Recv, NcclNvtxParamsSendRecv, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), peer)); struct ncclInfo info = { ncclFuncRecv, "Recv", NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */ diff --git a/src/device/Makefile b/src/device/Makefile index 1e9311f1f..3562563fc 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -5,7 +5,7 @@ # SHELL := /usr/bin/env bash -MAKEFALGS += -r +MAKEFLAGS += -r .SUFFIXES: .SECONDARY: diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index c6c131517..216159747 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -436,7 +436,7 @@ struct RunWorkCollregUsed ? 0 : min(loopCount, channelCount - elemOffset); prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0); } - } else if (tid < tidEndReduce) { + } else if (tid < tidEndReduce && nvls->headRank != -1) { // Reduce, broadcast through NVLS using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>; Primitives, /*Direct=*/1, Proto, 0> diff --git a/src/enqueue.cc b/src/enqueue.cc index 285e17f69..23f463397 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -21,19 +21,21 @@ NCCL_PARAM(L1SharedMemoryCarveout, "L1_SHARED_MEMORY_CARVEOUT", 0); // Returns maximum kernel stack size of all CUDA kernels -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { +ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize) { ncclResult_t result = ncclSuccess; + int print = 0; if (maxStackSize) *maxStackSize = 0; int carveout = ncclParamL1SharedMemoryCarveout(); + int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); for (int k=0; k < ncclDevKernelCount; k++) { void* fn = ncclDevKernelList[k]; + cudaFuncAttributes attr = {0}; if (fn == nullptr) continue; + CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); if (maxStackSize) { - cudaFuncAttributes attr = {0}; - CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; ignore0:; } @@ -43,9 +45,17 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize) { result, ignore1); ignore1:; } - if (ncclShmemDynamicSize(cudaArch) != 0) { + if (ncclMaxSharedMem != 0) { + int sharedMemSize = ncclMaxSharedMem; + if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) { + if (print++ == 0) + INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + sharedMemSize, maxSharedMem-attr.sharedSizeBytes); + // Reduce requested MaxDynamicSharedMemorySize attribute + sharedMemSize = maxSharedMem - attr.sharedSizeBytes; + } CUDACHECKGOTO(cudaFuncSetAttribute(fn, - cudaFuncAttributeMaxDynamicSharedMemorySize, ncclShmemDynamicSize(cudaArch)), + cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), result, next_kernel); } next_kernel:; @@ -1445,7 +1455,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan 
NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); if (driverVersion >= 11080) { int compCap = comm->compCap; - unsigned int clusterSize = (compCap == 90) ? comm->config.cgaClusterSize : 0; + unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; CUlaunchAttribute launchAttrs[3]; @@ -1597,7 +1607,7 @@ static ncclResult_t updateCollCostTable( if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; /* now we only support single-node NVLS allgather and reducescatter */ - if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && comm->nNodes > 1) continue; + if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue; /* Tree reduceScatter doesn't support scaling yet */ if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 3f639a022..64fc1c5dd 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -19,7 +19,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; int localRanks = comm->topo->nodes[GPU].count; - int nvlsRanks = comm->MNNVL ? comm->clique.size : localRanks; int nChannels = comm->nChannels; topoRanks->nvlsHeadNum = 0; @@ -74,7 +73,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs // Get nvls heads and the number of heads. Duplicate head is not allowed. for (int c = 0; c < graphs[NCCL_ALGO_NVLS]->nChannels; ++c) { bool addHead = true; - int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * nvlsRanks; + int* nvlsIntra = graphs[NCCL_ALGO_NVLS]->intra + c * localRanks; for (int dup = 0; dup < topoRanks->nvlsHeadNum; dup++) { if (topoRanks->nvlsHeads[dup] == nvlsIntra[0]) { @@ -259,8 +258,6 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead channel->nvls.out = -1; // NVLS+SHARP not yet implemented. 
channel->nvls.headRank = headRank; channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; - channel->nvls.node = comm->node; - channel->nvls.nNodes = comm->nNodes; if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; } if (comm->nNodes == 1) return ncclSuccess; @@ -466,7 +463,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } // Use 4 compute channels per search channel to reach peak BW on <8 PPN - if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) { + if (comm->minCompCap >= 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) { nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 6e9356826..587a8b282 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -828,14 +828,37 @@ ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nr return ncclSuccess; } -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system) { - int minPath = PATH_DIS; +ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min) { + int minPath = PATH_SYS; for (int i=0; inodes[GPU].count; i++) { - struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[GPU]; - for (int j=0; jnodes[GPU].count; j++) { - if (i == j) continue; + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type]; + if (paths == NULL) continue; + for (int j=0; jnodes[type].count; j++) { + if (type == GPU && i == j) continue; minPath = std::min(minPath, paths[j].type); } } - return minPath >= PATH_PIX ? 0 : 1; + *min = minPath; + return ncclSuccess; +} + +ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max) { + int maxPath = PATH_LOC; + for (int i=0; inodes[GPU].count; i++) { + struct ncclTopoLinkList* paths = system->nodes[GPU].nodes[i].paths[type]; + if (paths == NULL) continue; + for (int j=0; jnodes[type].count; j++) { + if (type == GPU && i == j) continue; + maxPath = std::max(maxPath, paths[j].type); + } + } + *max = maxPath; + return ncclSuccess; +} + +ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink) { + int maxPath; + NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxPath)); + *allNvLink = maxPath >= PATH_PIX ? 0 : 1; + return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 9b72ac160..0185b3f7b 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -937,6 +937,11 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) +float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0 }; +float sm100SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +#define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) +#define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) + ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; int crossNic = (system->nodes[NET].count > 1) && @@ -946,8 +951,20 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph graph->crossNic = crossNic == 1 ? 
1 : 0; graph->bwIntra = graph->bwInter = 0; graph->latencyInter = 0; - graph->typeIntra = ngpus == 1 ? PATH_LOC : PATH_NVL; - graph->typeInter = PATH_PIX; + int minTypeIntra = PATH_LOC, minTypeInter = PATH_PIX; + int maxTypeIntra = PATH_SYS, maxTypeInter = PATH_SYS; + if (ngpus > 1) { + NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra)); + NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra)); + } + if (system->nodes[NET].count > 0) { + NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter)); + NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter)); + maxTypeIntra = maxTypeInter; + } + + graph->typeIntra = minTypeIntra; + graph->typeInter = minTypeInter; graph->nChannels = 0; int trySameChannels = graph->pattern == NCCL_TOPO_PATTERN_NVLS ? 0 : 1; graph->sameChannels = trySameChannels; @@ -972,14 +989,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoGetCompCap(system, &ccMin, NULL)); if (graph->pattern == NCCL_TOPO_PATTERN_NVLS && (system->nodes[NVS].count == 0 || ccMin < 90)) return ncclSuccess; // NVLS and COLLNET_DIRECT search must have ngpus heads at most. - if (graph->pattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) - graph->maxChannels = system->nodes[GPU].count; + if (graph->pattern == NCCL_TOPO_PATTERN_NVLS) graph->maxChannels = std::min(NCCL_MAX_NVLS_ARITY, system->nodes[GPU].count); + if (graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) graph->maxChannels = std::min(NCCL_MAX_DIRECT_ARITY+1, system->nodes[GPU].count); if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // Force intra-node NVLS algorithm to pull evenly from all GPUs. - graph->minChannels = graph->maxChannels = system->nodes[GPU].count; + graph->minChannels = graph->maxChannels; } struct ncclTopoGraph tmpGraph; @@ -989,11 +1006,11 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int nspeeds = 0; float* speedArray = NULL; if (system->nodes[NET].count == 0) { - nspeeds = ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA; - speedArray = ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra; + nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA); + speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra); } else { - nspeeds = ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER; - speedArray = ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter; + nspeeds = ccMin >= 100 ? NSPEEDSINTER_SM100 : (ccMin >= 90 ? NSPEEDSINTER_SM90 : NSPEEDSINTER); + speedArray = ccMin >= 100 ? sm100SpeedArrayInter : (ccMin >= 90 ? sm90SpeedArrayInter : speedArrayInter); } int pass = 1; int speedIndex = 0; @@ -1048,18 +1065,18 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } tmpGraph.pattern = graph->pattern; - int maxTypeIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : PATH_SYS; - if (tmpGraph.typeIntra < maxTypeIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { + int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra; + if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; goto search; } - tmpGraph.typeIntra = ngpus == 1 ? 
PATH_LOC : PATH_NVL; + tmpGraph.typeIntra = minTypeIntra; - if (system->nodes[NET].count > 0 && tmpGraph.typeInter < PATH_SYS && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { + if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; goto search; } - tmpGraph.typeInter = PATH_PIX; + tmpGraph.typeInter = minTypeInter; if (crossNic == 2 && tmpGraph.crossNic == 0 && (graph->pattern == NCCL_TOPO_PATTERN_RING || graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE)) { diff --git a/src/graph/topo.cc b/src/graph/topo.cc index d758ac989..ba82cafb7 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -1357,11 +1357,11 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy goto exit; } -ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int** locals, int* localCount, int* pathType) { +static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, + int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) { int minType = PATH_DIS; float maxBw = 0; int count = 0; - NCCLCHECK(ncclCalloc(locals, system->nodes[resultType].count)); struct ncclTopoLinkList* paths = system->nodes[type].nodes[index].paths[resultType]; if (paths == NULL) { *localCount = 0; return ncclSuccess; } for (int i=0; inodes[resultType].count; i++) { @@ -1371,7 +1371,15 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index if (pathType) *pathType = minType; count = 0; } - if (paths[i].bw == maxBw && paths[i].type == minType) (*locals)[count++] = i; + if (paths[i].bw == maxBw && paths[i].type == minType) { + if (count == NCCL_TOPO_MAX_NODES) { + WARN("Error : ran out of room to store found nodes in ncclTopoGetLocal." 
+ " Filled %d of type %d, starting from index %d of type %d.", + NCCL_TOPO_MAX_NODES, resultType, index, type); + return ncclInternalError; + } + locals[count++] = i; + } } *localCount = count; return ncclSuccess; @@ -1379,7 +1387,7 @@ ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count) { int localNetCount = 0, netCountByBw = 0; - int* localNets; + int localNets[NCCL_TOPO_MAX_NODES]; float totalNetBw = 0, gpuBw = 0; for (int l=0; lnodes[GPU].nodes[gpu].nlinks; l++) { @@ -1391,54 +1399,55 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c } } - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); for (int l=0; (l < localNetCount) && (totalNetBw < gpuBw); l++, netCountByBw++) { totalNetBw += system->nodes[GPU].nodes[gpu].paths[NET][localNets[l]].bw; } *count = netCountByBw; - free(localNets); return ncclSuccess; } ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { - ncclResult_t ret = ncclSuccess; int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); - int* localNets; + + int localNets[NCCL_TOPO_MAX_NODES]; int localNetCount; - NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, &localNets, &localNetCount, NULL)); - int* localGpus = NULL; + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNets, &localNetCount, NULL)); + if (localNetCount==0) { + WARN("Could not find any local path from gpu %d to net.", gpu); + return ncclInternalError; + } + + int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; - int net; - NCCLCHECKGOTO(ncclTopoGetLocal(system, NET, localNets[0], GPU, &localGpus, &localGpuCount, NULL), ret, fail); - net = system->nodes[GPU].nodes[gpu].gpu.dev; + NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); + + int net = system->nodes[GPU].nodes[gpu].gpu.dev; if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); net += channelId%(DIVUP(localNetCount,localGpuCount)); if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev; -exit: - free(localNets); - if (localGpus) free(localGpus); - return ret; -fail: - goto exit; + return ncclSuccess; } ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) { ncclResult_t ret = ncclSuccess; int netIndex; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &netIndex)); - int* localGpus = NULL; + + int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; + NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, localGpus, &localGpuCount, NULL)); + int foundGpu = -1; - NCCLCHECK(ncclTopoGetLocal(system, NET, netIndex, GPU, &localGpus, &localGpuCount, NULL)); for (int c=0; cnodes[GPU].nodes+g; int64_t id; - NCCLCHECKGOTO(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL), ret, fail); + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &id, NULL)); if (netId == id) { foundGpu = g; goto exit; @@ -1447,8 +1456,6 @@ ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, i } exit: *gpuIndex = foundGpu; -fail: - free(localGpus); return ret; } diff --git a/src/graph/topo.h b/src/graph/topo.h index 8e7cda5b4..2be029b88 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -16,6 +16,7 @@ #define SM80_NVLINK_BW 
20.0 #define SM90_NVLINK_BW 20.6 #define SM86_NVLINK_BW 12.0 +#define SM100_NVLINK_BW 40.0 #define PCI_BW 12.0 // PCI Gen3 x16 #define QPI_BW 6.0 #define AMD_BW 16.0 @@ -91,7 +92,8 @@ struct ncclTopoLink { float bw; struct ncclTopoNode* remNode; }; -#define NCCL_TOPO_MAX_LINKS 128 +// Allows for up to 32 NICs per node on GB200-NVL72 +#define NCCL_TOPO_MAX_LINKS 576 #define NCCL_TOPO_MAX_HOPS (NCCL_TOPO_MAX_NODES*NCCL_TOPO_NODE_TYPES) struct ncclTopoLinkList { @@ -172,6 +174,8 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode ncclResult_t ncclTopoPrintPaths(struct ncclTopoSystem* system); ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* system); ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); +ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min); +ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 @@ -230,6 +234,7 @@ static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id // Returns NVLink bw in GB/s static float ncclTopoNVLinkBw(int cudaCompCap) { return + cudaCompCap >= 100 ? SM100_NVLINK_BW : cudaCompCap >= 90 ? SM90_NVLINK_BW : cudaCompCap == 86 ? SM86_NVLINK_BW : cudaCompCap >= 80 ? SM80_NVLINK_BW : diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index f5f2e1185..8da4aeb9e 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -145,28 +145,33 @@ static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = #define VOLTA_COMPCAP_IDX 0 #define AMPERE_COMPCAP_IDX 1 #define HOPPER_COMPCAP_IDX 2 +#define BLACKWELL_COMPCAP_IDX 3 // LL128 max BW per channel -static const double llMaxBws[3][3] = { +static const double llMaxBws[][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, - /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0} + /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}, + /* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, }; -static const double perChMaxRingLL128Bws[3][3] = { +static const double perChMaxRingLL128Bws[][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7}, + /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7}, }; -static const double perChMaxTreeLL128Bws[3][3] = { +static const double perChMaxTreeLL128Bws[][3] = { /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0}, + /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0}, }; -static const double perChMaxTreeBws[3][3] = { +static const double perChMaxTreeBws[][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, + /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0}, }; NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); @@ -207,7 +212,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int nRanks = comm->nRanks; if (nRanks <= 1) return ncclSuccess; - int compCapIndex = minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX; + int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? 
HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type int index1 = nNodes == 1 ? compCapIndex : @@ -418,7 +423,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom for (int c=0; ctypeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN)); pEnable &= (graphs[a]->typeIntra <= PATH_NVB); @@ -427,6 +432,8 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom case 70: pEnable &= 1; break; case 80: pEnable &= 1; break; case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break; + case 100: pEnable &= 1; break; + case 120: pEnable &= 1; break; default: pEnable &= 0; break; } } diff --git a/src/include/alloc.h b/src/include/alloc.h index 7744119c3..021c91f77 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -204,14 +204,13 @@ static inline ncclResult_t ncclCuMemFreeAddr(void *ptr) { return result; } -static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { +static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, CUmemAllocationHandleType type, size_t size) { ncclResult_t result = ncclSuccess; size_t granularity = 0; CUdevice currentDev; CUmemAllocationProp prop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; - CUmemAllocationHandleType type = ncclCuMemHandleType; int cudaDev; int flag = 0; CUDACHECK(cudaGetDevice(&cudaDev)); @@ -260,7 +259,7 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) { extern int ncclCuMemEnable(); -static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { +static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, int type, size_t size) { WARN("CUMEM not supported prior to CUDA 11.3"); return ncclInternalError; } @@ -288,7 +287,7 @@ ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, in CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, ncclCuMemHandleType, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } @@ -312,7 +311,7 @@ ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, in cudaStream_t stream; CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, ncclCuMemHandleType, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } @@ -336,7 +335,7 @@ ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); if (nelem > 0) { if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*ncclSizeOfT()), result, finish); + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, ncclCuMemHandleType, nelem*ncclSizeOfT()), result, finish); } else { CUDACHECKGOTO(cudaMalloc(ptr, nelem*ncclSizeOfT()), result, finish); } diff --git a/src/include/device.h b/src/include/device.h index 
0c861f595..3f918ab23 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -59,8 +59,8 @@ union ncclLLFifoLine { }; #define WARP_SIZE 32 -#define MAXCHANNELS 32 -#define NCCL_MAX_LOCAL_RANKS 64 +#define MAXCHANNELS 64 +#define NCCL_MAX_LOCAL_RANKS 72 #define NCCL_MAX_NTHREADS 640 #define NCCL_MIN_NTHREADS (4*WARP_SIZE) #define NCCL_SIMPLE_MAX_NTHREADS 512 @@ -187,8 +187,6 @@ struct ncclNvls { int down; int treeUp; int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; - int node; - int nNodes; }; #if __CUDA_ARCH__ >= 900 diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 3eb6c0743..5337eeba9 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -17,7 +17,7 @@ #define NCCL_SIMPLE_ALIGNMENT (WARP_SIZE * 8LL * 16LL) #define NCCL_BYTES_ALIGNMENT 16 -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); +ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* maxStackSize); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); diff --git a/src/include/graph.h b/src/include/graph.h index 602cc8cd9..a22b62bb2 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -28,7 +28,8 @@ void ncclTopoFree(struct ncclTopoSystem* system); ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); +ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink); + ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology diff --git a/src/include/mnnvl.h b/src/include/mnnvl.h new file mode 100644 index 000000000..dedbefe43 --- /dev/null +++ b/src/include/mnnvl.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_MNNVL_H_ +#define NCCL_MNNVL_H_ + +#include "nccl.h" +#include "comm.h" + +ncclResult_t ncclMnnvlCheck(struct ncclComm* comm); + +#endif diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 14b317fdd..5d00f0792 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -30,6 +30,7 @@ #define NVTX_SID_CommInitRankConfig 11 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 +#define NVTX_SID_CommFinalize 14 // Define static schema ID for the reduction operation. #define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START @@ -38,11 +39,13 @@ extern const nvtxDomainHandle_t ncclNvtxDomainHandle; struct nccl_domain{static constexpr char const* name{"NCCL"};}; +/// @brief Register an NVTX payload schema for static-size payloads. 
class payload_schema { public: - explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept + explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, + const uint64_t schemaId, const size_t size) noexcept { - schema_attr.name = schemaName; + schema_attr.payloadStaticSize = size; schema_attr.entries = entries; schema_attr.numEntries = numEntries; schema_attr.schemaId = schemaId; @@ -63,26 +66,84 @@ class payload_schema { NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, - nullptr, + nullptr, /* schema name is not needed */ NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, NVTX_PAYLOAD_SCHEMA_FLAG_NONE, nullptr, 0, 0, 0, 0, nullptr}; }; +// Convenience macro to give the payload parameters a scope. +#define NVTX3_PAYLOAD(...) __VA_ARGS__ + // Create NVTX push/pop range with parameters -// @param name of the operation (see `NVTX_SID_*`) -// @param N schema name -// @param S schema (entries) -// @param P payload (struct) -#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ - static const payload_schema schema{S, std::extent::value, \ - NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ +// @param N NCCL API name without the `nccl` prefix. +// @param T name of the used NVTX payload schema without "Schema" suffix. +// @param P payload parameters/entries +#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \ + constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{T##Schema, std::extent::value - 1, \ + schemaId, sizeof(T)}; \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - nvtxPayloadData_t nvtx3_bpl__[] = { \ - {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ + const T _payload = {P}; \ + nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \ ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; +/// @brief Creates an NVTX range with extended payload using the RAII pattern. +/// @tparam PayloadType Data type of the payload. +template +class ncclNvtxRange { + public: + explicit ncclNvtxRange(const nvtxEventAttributes_t* evtAttr) noexcept { + nvtxDomainRangePushEx(nvtx3::domain::get(), evtAttr); + } + + ~ncclNvtxRange() noexcept { + if (payloadData.payload) { + nvtxRangePopPayload(nvtx3::domain::get(), &payloadData, 1); + } else { + nvtxDomainRangePop(nvtx3::domain::get()); + } + } + + void setPayloadData(const uint64_t schemaId) noexcept + { + payloadData = {schemaId, sizeof(PayloadType), &payload}; + } + + ncclNvtxRange() = delete; + ncclNvtxRange(ncclNvtxRange const&) = default; + ncclNvtxRange& operator=(ncclNvtxRange const&) = default; + ncclNvtxRange(ncclNvtxRange&&) = default; + ncclNvtxRange& operator=(ncclNvtxRange&&) = default; + + // Holds the payload data. + PayloadType payload{}; + + private: + nvtxPayloadData_t payloadData = {NVTX_PAYLOAD_ENTRY_TYPE_INVALID, 0, NULL}; +}; + +// Create an NVTX range with the function name as the range name. Use RAII pattern. +// @param T Type ID of the NVTX payload (pointer for variable-size payloads). 
+#define NVTX3_RANGE(T) \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + ncclNvtxRange nvtx3_range__{nvtx3_func_attr__.get()}; + +// Add static-size payload to the NVTX range created with `NVTX3_RANGE()`, +// which must be in this or an outer scope. +// @param N NCCL API name without the `nccl` prefix. +// @param S name of the used NVTX payload schema. +// @param P payload parameters/entries +#define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \ + constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema schema{S, std::extent::value - 1, schema_id, \ + sizeof(nvtx3_range__.payload)}; \ + nvtx3_range__.payload = {P}; \ + nvtx3_range__.setPayloadData(schema_id); \ +} while (0) + extern void initNvtxRegisteredEnums(); #endif diff --git a/src/include/nvtx3/nvToolsExtPayloadHelper.h b/src/include/nvtx3/nvToolsExtPayloadHelper.h index 304d5d6a5..0f0c87d6a 100644 --- a/src/include/nvtx3/nvToolsExtPayloadHelper.h +++ b/src/include/nvtx3/nvToolsExtPayloadHelper.h @@ -11,7 +11,7 @@ /* This is just an empty marker (for readability), which can be omitted. */ /* TODO: Fix issue with trailing comma at end of entry list. */ -#define NVTX_PAYLOAD_ENTRIES +#define NCCL_NVTX_PAYLOAD_ENTRIES /** @@ -32,7 +32,7 @@ * * Example: * NVTX_DEFINE_SCHEMA_FOR_STRUCT(your_struct, "SchemaName", - * NVTX_PAYLOAD_ENTRIES( + * NCCL_NVTX_PAYLOAD_ENTRIES( * (index, TYPE_INT, "integer value"), * (dpfloat, TYPE_DOUBLE, "fp64 value"), * (text, TYPE_CSTRING, "text", NULL, 24) @@ -80,7 +80,7 @@ * * Example: * NVTX_DEFINE_STRUCT_WITH_SCHEMA(your_struct_name, "Your schema name", - * NVTX_PAYLOAD_ENTRIES( + * NCCL_NVTX_PAYLOAD_ENTRIES( * (int, index, TYPE_INT, "integer value"), * (double, dpfloat, TYPE_DOUBLE, "fp64 value"), * (const char, (text, 24), TYPE_CSTRING, "text", NULL, 24) diff --git a/src/include/nvtx_payload_schemas.h b/src/include/nvtx_payload_schemas.h new file mode 100644 index 000000000..228a19275 --- /dev/null +++ b/src/include/nvtx_payload_schemas.h @@ -0,0 +1,125 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +/// Definitions of NVTX payload types and schemas used for the NVTX +/// instrumentation in init.cc and collectives.cc. + +#ifndef NVTX_PAYLOAD_SCHEMAS_H_ +#define NVTX_PAYLOAD_SCHEMAS_H_ + + +#include "nccl.h" +#include "nvtx3/nvToolsExtPayload.h" +#include "nvtx3/nvToolsExtPayloadHelper.h" + +/** + * \brief Define a C struct together with the matching schema entries. + * + * Does the same as `NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA`, but without creating the + * schema attributes. (Remove this helper when it is available in the NVTX headers.) + */ +#define NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(struct_id, prefix, entries) \ + _NVTX_PAYLOAD_TYPEDEF_STRUCT(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) \ + prefix _NVTX_PAYLOAD_SCHEMA_INIT_ENTRIES(struct_id, _NVTX_PAYLOAD_PASS_THROUGH entries) + +// C strings used as NVTX payload entry names. +static constexpr char const* nccl_nvtxCommStr = "NCCL communicator ID"; +static constexpr char const* nccl_nvtxCudaDevStr = "CUDA device"; +static constexpr char const* nccl_nvtxRankStr = "Rank"; +static constexpr char const* nccl_nvtxNranksStr = "No. 
of ranks"; +static constexpr char const* nccl_nvtxMsgSizeStr = "Message size [bytes]"; +static constexpr char const* nccl_nvtxReductionOpStrpStr = "Reduction operation"; + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitAll, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, commhash, TYPE_UINT64, nccl_nvtxCommStr), + (int, ndev, TYPE_INT, "No. of devices") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitRank, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr) + ) +) +// The typedef and payload schema for ncclCommInitRank is also used for, +// ncclCommInitRankConfig, ncclCommInitRankScalable, ncclCommDestroy, and ncclCommAbort. +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankConfig; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankScalable; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommAbort; +typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommDestroy; + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (uint64_t, parentcomm, TYPE_UINT64, "Parent NCCL communicator ID"), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr), + (int, color, TYPE_INT, "Color"), + (int, key, TYPE_INT, "Key") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllGather, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllReduce, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsBroadcast, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root") + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduce, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root"), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr) + ) +) + +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduceScatter, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (ncclRedOp_t, op, NCCL_REDOP, nccl_nvtxReductionOpStrpStr) + ) +) + +// Used in NCCL APIs `ncclSend` and `ncclRecv`. 
+NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsSendRecv, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, peer, TYPE_INT, "Peer rank") + ) +) + +#endif // end include guard diff --git a/src/include/proxy.h b/src/include/proxy.h index b6ef0fa9d..c97a4d7ce 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -363,6 +363,8 @@ ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses, uint64_t *peerAddressesUDS); ncclResult_t ncclProxyCreate(struct ncclComm* comm); ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); + +// NB: ncclProxyMsgTypeStr[] in proxy.cc needs to match enum ncclProxyMsgType { ncclProxyMsgInit = 1, ncclProxyMsgSharedInit = 2, diff --git a/src/init.cc b/src/init.cc index 5caaaae09..3e218ab07 100644 --- a/src/init.cc +++ b/src/init.cc @@ -18,6 +18,7 @@ #include "argcheck.h" #include "tuner.h" #include "ras.h" +#include "mnnvl.h" #include #include #include @@ -27,6 +28,7 @@ #include #include #include "param.h" +#include "nvtx_payload_schemas.h" #define STR2(v) #v #define STR(v) STR2(v) @@ -213,6 +215,7 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->rankToNode); free(comm->rankToLocalRank); free(comm->collNetHeads); + free(comm->clique.ranks); if (comm->bootstrap) NCCLCHECK(bootstrapClose(comm->bootstrap)); @@ -530,6 +533,7 @@ static void showVersion() { } } +NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1); NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1); static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { @@ -564,12 +568,16 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; (void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo); if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) { + if (ncclParamMNNVLUUID() != -1) { + ((long*)&info->fabricInfo.clusterUuid)[0] = ncclParamMNNVLUUID(); + ((long*)&info->fabricInfo.clusterUuid)[1] = ncclParamMNNVLUUID(); + } + if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x", info->busId, ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1], info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask); } - if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); } return ncclSuccess; @@ -638,71 +646,6 @@ NCCL_PARAM(AllocP2pNetLLBuffers, "ALLOC_P2P_NET_LL_BUFFERS", 0); // MNNVL: Flag to indicate whether to enable Multi-Node NVLink NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2); -#if CUDART_VERSION >= 11030 - -#include -#include "cudawrap.h" - -// Determine if MNNVL support is available -static int checkMNNVL(struct ncclComm* comm) { - ncclResult_t ret = ncclSuccess; - - // MNNVL requires cuMem to be enabled - if (!ncclCuMemEnable()) return 0; - - // MNNVL also requires FABRIC handle support - int cudaDev; - int flag = 0; - CUdevice currentDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported - (void) CUPFN(cuDeviceGetAttribute(&flag, 
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev));; - if (!flag) return 0; - // Check that all ranks have initialized the fabric fully - for (int i = 0; i < comm->nRanks; i++) { - if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return 0; - } - - // Determine our MNNVL domain/clique - NCCLCHECKGOTO(ncclCalloc(&comm->clique.ranks, comm->nRanks), ret, fail); - comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId; - for (int i = 0; i < comm->nRanks; i++) { - nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo; - nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; - // Check if the cluster UUID and cliqueId match - // A zero UUID means we don't have MNNVL fabric info - disable MNNVL - if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) goto fail; - if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && - (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { - if (i == comm->rank) { - comm->cliqueRank = comm->clique.size; - } - comm->clique.ranks[comm->clique.size++] = i; - } - } - // Determine whether to enable MNNVL or not - comm->MNNVL = ncclParamMNNVLEnable() == 2 ? comm->clique.size > 1 : ncclParamMNNVLEnable(); - INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d ", comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank); - - if (comm->MNNVL) { - // Force the CUMEM handle type to be FABRIC for MNNVL - ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC; - } - - return comm->MNNVL; - -fail: - if (comm->clique.ranks) free(comm->clique.ranks); - return 0; -} - -#else -static int checkMNNVL(struct ncclComm* comm) { - return 0; -} -#endif - #define TIMER_INIT_TOTAL 0 #define TIMER_INIT_KERNELS 1 #define TIMER_INIT_BOOTSTRAP 2 @@ -782,12 +725,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // AllGather1 - end timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER]; - // MNNVL support - if (nNodes > 1 && !checkMNNVL(comm) && ncclParamMNNVLEnable() == 1) { - // Return an error if the user specifically requested MNNVL support - WARN("MNNVL is not supported on this system"); - ret = ncclSystemError; - goto fail; + // Check for MNNVL support + if ((nNodes > 1 && ncclParamMNNVLEnable() != 0) || ncclParamMNNVLEnable() == 1) { + NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail); } do { @@ -1079,7 +1019,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->collNetSupport = 0; } } - comm->isAllNvlink = ncclTopoPathAllNVLink(comm->topo); + NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink)); comm->isOneRPN = (comm->maxLocalRanks == 1); NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); @@ -1406,18 +1346,20 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int cudaDev = job->cudaDev; int* parentRanks = NULL; int cudaArch; + int maxSharedMem = 0; double sum_timers = 0; uint64_t timers[TIMERS_INIT_COUNT] = {0}; unsigned long long commIdHash; timers[TIMER_INIT_TOTAL] = clockNano(); CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); + CUDACHECKGOTO(cudaDeviceGetAttribute(&maxSharedMem, cudaDevAttrMaxSharedMemoryPerBlockOptin, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMinor, cudaDevAttrComputeCapabilityMinor, cudaDev), res, fail); cudaArch = 100*archMajor + 
10*archMinor; timers[TIMER_INIT_KERNELS] = clockNano(); - NCCLCHECK(ncclInitKernelsForDevice(cudaArch, &maxLocalSizeBytes)); + NCCLCHECK(ncclInitKernelsForDevice(cudaArch, maxSharedMem, &maxLocalSizeBytes)); // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { @@ -1533,18 +1475,24 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) { comm->config.cgaClusterSize = cgaClusterSizeEnv; } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) { - WARN("NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); + INFO(NCCL_ENV, "NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE); comm->config.cgaClusterSize = NCCL_MAX_CGA_CLUSTER_SIZE; } minCTAsEnv = ncclParamMinCTAs(); if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.minCTAs = minCTAsEnv; + if (minCTAsEnv <= 0) + INFO(NCCL_ENV, "NCCL_MIN_CTAS %d is too low, leaving it set at %d", minCTAsEnv, comm->config.minCTAs); + else + comm->config.minCTAs = minCTAsEnv; } maxCTAsEnv = ncclParamMaxCTAs(); if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.maxCTAs = maxCTAsEnv; + if (maxCTAsEnv <= 0) + INFO(NCCL_ENV, "NCCL_MAX_CTAS %d is too low, leaving it set at %d", maxCTAsEnv, comm->config.maxCTAs); + else + comm->config.maxCTAs = maxCTAsEnv; } envNetName = ncclGetEnv("NCCL_NET"); @@ -1565,22 +1513,22 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { /* cap channels if needed */ if (comm->config.minCTAs > MAXCHANNELS) { - WARN("minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); + INFO(NCCL_ENV, "minCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.minCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.minCTAs = MAXCHANNELS; } if (comm->config.maxCTAs > MAXCHANNELS) { - WARN("maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); + INFO(NCCL_ENV, "maxCTAs %d is larger than #channels upper limit %d, cap it to %d", comm->config.maxCTAs, MAXCHANNELS, MAXCHANNELS); comm->config.maxCTAs = MAXCHANNELS; } if (comm->config.minCTAs > comm->config.maxCTAs) { - WARN("minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); + INFO(NCCL_ENV, "minCTAs %d is larger than maxCTAs %d, set both to %d", comm->config.minCTAs, comm->config.maxCTAs, comm->config.maxCTAs); comm->config.minCTAs = comm->config.maxCTAs; } if (comm->config.splitShare != 1 && comm->config.splitShare != 0) { - WARN("splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); + INFO(NCCL_ENV, "splitShare %d is not a valid value 0/1, set it to 0", comm->config.splitShare); comm->config.splitShare = 0; } @@ -1763,20 +1711,9 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId goto exit; } -struct NvtxParamsCommInitRank -{ - int rank; - int nranks; - int cudaDev; -}; -constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. 
of ranks", nullptr, 0, offsetof(NvtxParamsCommInitRank, nranks)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommInitRank, cudaDev)}, -}; - NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank); ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { + NVTX3_RANGE(NcclNvtxParamsCommInitRank) // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); @@ -1784,10 +1721,11 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm ncclConfig_t config = NCCL_CONFIG_INITIALIZER; CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRank, CommInitRankSchema, payload) - NCCLCHECK(ncclCommInitRankDev(newcomm, nranks, 1, &commId, myrank, cudaDev, &config, __func__)); + + NVTX3_RANGE_ADD_PAYLOAD(CommInitRank, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + return ncclSuccess; } @@ -1799,10 +1737,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { ncclConfig_t config = NCCL_CONFIG_INITIALIZER; int oldDev = 0; - constexpr nvtxPayloadSchemaEntry_t CommInitAllSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of devices"} - }; - NVTX3_FUNC_WITH_PARAMS(CommInitAll, CommInitAllSchema, ndev) + NVTX3_RANGE(NcclNvtxParamsCommInitAll); // Load the CUDA driver and dlsym hooks (can fail on old drivers) (void)ncclCudaLibraryInit(); @@ -1840,14 +1775,17 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { ncclUniqueId uniqueId; NCCLCHECKGOTO(ncclGetUniqueId(&uniqueId), ret, fail); - NCCLCHECKGOTO(ncclGroupStart(), ret, fail); + NCCLCHECKGOTO(ncclGroupStartInternal(), ret, fail); for (int i=0; icommHash, ndev)); exit: (void)cudaSetDevice(oldDev); @@ -1873,14 +1811,14 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; ncclConfig_t *internalConfigPtr = NULL; + + NVTX3_RANGE(NcclNvtxParamsCommInitRankConfig); + NCCLCHECK(ncclGroupStartInternal()); (void)ncclCudaLibraryInit(); CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRankConfig, CommInitRankSchema, payload) - if (config == NULL) internalConfigPtr = &internalConfig; else @@ -1890,7 +1828,13 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + if (newcomm && *newcomm) { + if (!(*newcomm)->config.blocking) { + (void) ncclCommGetAsyncError(*newcomm, &ret); + } + NVTX3_RANGE_ADD_PAYLOAD(CommInitRankConfig, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + } return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); @@ -1899,6 +1843,8 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config); ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, 
ncclConfig_t* config) { + NVTX3_RANGE(NcclNvtxParamsCommInitRankScalable); + int cudaDev; ncclResult_t ret = ncclSuccess; ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER; @@ -1908,9 +1854,6 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran (void)ncclCudaLibraryInit(); CUDACHECK(cudaGetDevice(&cudaDev)); - NvtxParamsCommInitRank payload{myrank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommInitRankScalable, CommInitRankSchema, payload) - if (config == NULL) internalConfigPtr = &internalConfig; else @@ -1920,7 +1863,13 @@ ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myran exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommGetAsyncError(*newcomm, &ret); + if (newcomm && *newcomm) { + if (!(*newcomm)->config.blocking) { + (void) ncclCommGetAsyncError(*newcomm, &ret); + } + NVTX3_RANGE_ADD_PAYLOAD(CommInitRankScalable, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, nranks, myrank, cudaDev)); + } return ret; fail: if (newcomm && *newcomm && !(*newcomm)->config.blocking) (void) ncclCommSetAsyncError(*newcomm, ret); @@ -1980,7 +1929,8 @@ static ncclResult_t commCleanup(ncclComm_t comm) { NCCL_API(ncclResult_t, ncclCommFinalize, ncclComm_t comm); ncclResult_t ncclCommFinalize(ncclComm_t comm) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NVTX3_RANGE(NcclNvtxParamsCommFinalize); + ncclResult_t ret = ncclSuccess; struct ncclCommFinalizeAsyncJob *job = NULL; @@ -2005,7 +1955,13 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { exit: ncclGroupErrCheck(ret); NCCLCHECK(ncclGroupEndInternal()); - if (comm && !comm->config.blocking) { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); } + if (comm) { + if (!comm->config.blocking) { + NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); + } + NVTX3_RANGE_ADD_PAYLOAD(CommFinalize, NcclNvtxParamsCommFinalizeSchema, + NVTX3_PAYLOAD(comm->commHash)); + } return ret; fail: free(job); @@ -2077,8 +2033,8 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommDestroy, CommInitRankSchema, payload) + NVTX3_FUNC_WITH_PARAMS(CommDestroy, NcclNvtxParamsCommInitRank, + NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); NCCLCHECK(ncclGroupStartInternal()); @@ -2105,8 +2061,9 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm) { + NVTX3_RANGE(NcclNvtxParamsCommAbort); + if (comm == NULL) { - NVTX3_FUNC_RANGE_IN(nccl_domain); return ncclSuccess; } NCCLCHECK(ncclGroupStartInternal()); @@ -2127,8 +2084,8 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { struct ncclCommFinalizeAsyncJob *job = NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommInitRank payload{rank, nranks, cudaDev}; - NVTX3_FUNC_WITH_PARAMS(CommAbort, CommInitRankSchema, payload) + NVTX3_RANGE_ADD_PAYLOAD(CommAbort, NcclNvtxParamsCommInitRankSchema, + NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); @@ -2144,29 +2101,13 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { goto exit; } -struct NvtxParamsCommSplit { - int rank; - int nranks; - int cudaDev; - int color; - 
int key; -}; -constexpr nvtxPayloadSchemaEntry_t CommSplitSchema[] = { - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Rank"}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "No. of ranks", nullptr, 0, offsetof(NvtxParamsCommSplit, nranks)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "CUDA device", nullptr, 0, offsetof(NvtxParamsCommSplit, cudaDev)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "color", nullptr, 0, offsetof(NvtxParamsCommSplit, color)}, - {0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "key", nullptr, 0, offsetof(NvtxParamsCommSplit, key)}, -}; - NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; - NvtxParamsCommSplit payload{comm->rank, comm->nRanks, comm->cudaDev, color, key}; - NVTX3_FUNC_WITH_PARAMS(CommSplit, CommSplitSchema, payload) + NVTX3_RANGE(NcclNvtxParamsCommSplit) int oldDev; CUDACHECK(cudaGetDevice(&oldDev)); @@ -2224,6 +2165,12 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc (void)cudaSetDevice(oldDev); (void)ncclGroupErrCheck(res); NCCLCHECK(ncclGroupEndInternal()); + + if (res == ncclSuccess && *newcomm) { + NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, + NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key)); + } + return res; fail: if (childComm) { diff --git a/src/mnnvl.cc b/src/mnnvl.cc new file mode 100644 index 000000000..07e8b21d9 --- /dev/null +++ b/src/mnnvl.cc @@ -0,0 +1,82 @@ +/************************************************************************* + * Copyright (c) 2015-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "mnnvl.h" +#include "transport.h" +#include +#include "cudawrap.h" + +// Determine if MNNVL support is available +ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { + // MNNVL requires cuMem to be enabled + if (!ncclCuMemEnable()) return ncclSuccess; + + // MNNVL also requires FABRIC handle support + int cudaDev; + int flag = 0; + CUdevice currentDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + // Ignore error if CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED is not supported + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); + if (!flag) return ncclSuccess; + // Check that all ranks have initialized the fabric fully + for (int i = 0; i < comm->nRanks; i++) { + if (comm->peerInfo[i].fabricInfo.state != NVML_GPU_FABRIC_STATE_COMPLETED) return ncclSuccess; + } + + // Determine our MNNVL domain/clique + NCCLCHECK(ncclCalloc(&comm->clique.ranks, comm->nRanks)); + comm->clique.id = comm->peerInfo[comm->rank].fabricInfo.cliqueId; + for (int i = 0; i < comm->nRanks; i++) { + nvmlGpuFabricInfoV_t *fabricInfo1 = &comm->peerInfo[comm->rank].fabricInfo; + nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; + // Check if the cluster UUID and cliqueId match + // A zero UUID means we don't have MNNVL fabric info - disable MNNVL + if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; + if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && + (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { + if (i == comm->rank) { + comm->cliqueRank = comm->clique.size; + } + comm->clique.ranks[comm->clique.size++] = i; + } + } + + // No MNNVL clique found + if (comm->clique.size <= 1) return ncclSuccess; + + // Check that FABRIC handles can be exported & imported by IMEX + { + void *ptr = NULL; + CUmemGenericAllocationHandle handle; + ncclCuDesc cuDesc; + CUresult err; + + // Allocate FABRIC handle compatible memory + ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN); + if (ret != ncclSuccess) return ncclSuccess; + err = CUPFN(cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); + if (err != CUDA_SUCCESS || + (err = CUPFN(cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC))) != CUDA_SUCCESS) { + const char *errStr; + (void) pfn_cuGetErrorString(err, &errStr); + NCCLCHECK(ncclCuMemFree(ptr)); + // Return an error if this is a MNNVL capable system but it's not working + WARN("MNNVL (cliqueSize %d) is available but not supported on this system. 
Check the IMEX configuration.", + comm->clique.size); + return ncclSystemError; + } + NCCLCHECK(ncclCuMemFree(ptr)); + + // Force the CUMEM handle type to be FABRIC for MNNVL + ncclCuMemHandleType = CU_MEM_HANDLE_TYPE_FABRIC; + comm->MNNVL = 1; + INFO(NCCL_INIT, "MNNVL %d cliqueId %x cliqueSize %d cliqueRank %d", + comm->MNNVL, comm->clique.id, comm->clique.size, comm->cliqueRank); + } + return ncclSuccess; +} diff --git a/src/proxy.cc b/src/proxy.cc index bd8188a37..5a83ef3eb 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -21,6 +21,8 @@ #include #include +#define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1) + enum { proxyRecv=0, proxySend=1 }; void* ncclProxyServiceUDS(void* _args); @@ -770,8 +772,8 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlAppend); TIME_START(2); - int freeOp[NCCL_MAX_LOCAL_RANKS]; - int freeOpEnd[NCCL_MAX_LOCAL_RANKS]; + int freeOp[NCCL_MAX_PROXY_CONNECTIONS]; + int freeOpEnd[NCCL_MAX_PROXY_CONNECTIONS]; for (int i = 0; i < proxyState->tpLocalnRanks; i++) freeOp[i] = -1; uint64_t lastOpCount = 0; @@ -1060,7 +1062,8 @@ ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, in struct ncclProxyState* sharedProxyState = comm->proxyState; int tpProxyRank = comm->topParentRanks[proxyRank]; - proxyConn->sameProcess = comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash ? 1 : 0; + proxyConn->sameProcess = ((comm->peerInfo[proxyRank].hostHash == comm->peerInfo[comm->rank].hostHash) && + (comm->peerInfo[proxyRank].pidHash == comm->peerInfo[comm->rank].pidHash)) ? 1 : 0; // Keep one connection per local rank proxyConn->connection = NULL; proxyConn->tpRank = tpProxyRank; @@ -1193,7 +1196,7 @@ ncclResult_t ncclProxyClientQueryFdBlocking(struct ncclComm* comm, struct ncclPr goto exit; } -const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd" }; +const char* ncclProxyMsgTypeStr[] = { "Unknown", "Init", "SharedInit", "Setup", "Connect", "Start", "Close", "Abort", "Stop", "GetFd", "QueryFd", "Register", "Deregister" }; ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId) { struct ncclSocket* sock; ncclResult_t ret = ncclSuccess; @@ -1552,18 +1555,18 @@ void* ncclProxyService(void* _args) { connectionPool.banks = 0; connectionPool.offset = NCCL_PROXY_CONN_POOL_SIZE; - struct pollfd pollfds[NCCL_MAX_LOCAL_RANKS+1]; - struct ncclProxyLocalPeer peers[NCCL_MAX_LOCAL_RANKS]; - memset(&peers, 0, sizeof(struct ncclProxyLocalPeer)*NCCL_MAX_LOCAL_RANKS); - for (int s=0; slistenSock, &pollfds[NCCL_MAX_LOCAL_RANKS].fd) != ncclSuccess) { + if (ncclSocketGetFd(proxyState->listenSock, &pollfds[NCCL_MAX_PROXY_CONNECTIONS].fd) != ncclSuccess) { WARN("[Proxy Service] Get listenSock fd fails"); return NULL; }; - pollfds[NCCL_MAX_LOCAL_RANKS].events = POLLIN; + pollfds[NCCL_MAX_PROXY_CONNECTIONS].events = POLLIN; int maxnpeers = 0; int npeers = 0; @@ -1577,17 +1580,19 @@ void* ncclProxyService(void* _args) { /* never let proxy service thread blocks in poll, or it cannot receive abortFlag. */ int ret; do { - ret = poll(pollfds, NCCL_MAX_LOCAL_RANKS+1, asyncOpCount ? 0 : 500); + // poll all fds including the listenSock + ret = poll(pollfds, NCCL_MAX_PROXY_CONNECTIONS+1, asyncOpCount ? 
0 : 500); } while (ret < 0 && errno == EINTR); if (ret < 0) { WARN("[Proxy Service] Poll failed: %s", strerror(errno)); return NULL; } - if (pollfds[NCCL_MAX_LOCAL_RANKS].revents) { + if (pollfds[NCCL_MAX_PROXY_CONNECTIONS].revents) { + // We got an event on the listenSock int s = 0; - while (s < NCCL_MAX_LOCAL_RANKS && pollfds[s].fd >= 0) s++; - if (s == NCCL_MAX_LOCAL_RANKS) { - WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_LOCAL_RANKS); + while (s < NCCL_MAX_PROXY_CONNECTIONS && pollfds[s].fd >= 0) s++; + if (s == NCCL_MAX_PROXY_CONNECTIONS) { + WARN("[Proxy service] Too many connections (%d max)", NCCL_MAX_PROXY_CONNECTIONS); return NULL; } if (maxnpeers < s+1) maxnpeers = s+1; @@ -1819,6 +1824,7 @@ ncclResult_t ncclProxyStop(struct ncclComm* comm) { if ((comm->proxyRefCountOld = ncclAtomicRefCountDecrement(&sharedProxyState->refCount)) == 0) { if (*comm->abortFlag == 0 && sharedProxyState->peerAddresses) { + // We need to send a ncclProxyMsgStop message to our own proxy struct ncclSocket sock; int type = ncclProxyMsgStop; NCCLCHECK(ncclSocketInit(&sock, sharedProxyState->peerAddresses + comm->topParentRanks[comm->rank], comm->sharedRes->magic, ncclSocketTypeProxy, comm->abortFlag)); diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc index 414a1ed94..3e4e9a504 100644 --- a/src/ras/client_support.cc +++ b/src/ras/client_support.cc @@ -80,7 +80,7 @@ static int rasOutBufferSize = 0; // We use them all over the place; no point in wasting the stack... static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS_CLIENT_DETAIL_THRESHOLD) rank numbers - // or for printing the local GPU devices, which can't be more than 64 (NCCL_MAX_LOCAL_RANKS) + // or for printing the local GPU devices, which can't be more than 64 // small numbers (times two if the NVML mask is different than the CUDA mask). // Still, 1024 should normally be plenty (verbose output may make things more difficult, // but we do check for overflows, so it will just be trimmed). @@ -1687,7 +1687,7 @@ static int rasCommRanksCollOpCompare(const void* p1, const void* p2) { const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size) { bool first = true; buf[0] = '\0'; - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++) + for (int i = 0; i < sizeof(cudaDevs)*8; i++) if (cudaDevs & (1UL << i)) { snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i); first = false; @@ -1695,7 +1695,7 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, if (cudaDevs != nvmlDevs) { snprintf(buf+strlen(buf), size-strlen(buf), " (NVML "); first = true; - for (int i = 0; i < NCCL_MAX_LOCAL_RANKS; i++) + for (int i = 0; i < sizeof(nvmlDevs)*8; i++) if (nvmlDevs & (1UL << i)) { snprintf(buf+strlen(buf), size-strlen(buf), "%s%d", (first ? "" : ","), i); first = false; diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 68cac0b44..715fff4a4 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -78,7 +78,7 @@ struct rasCollResponse { struct rasPeerInfo { union ncclSocketAddress addr; pid_t pid; - uint64_t cudaDevs; // Bitmask. Conveniently, NCCL_MAX_LOCAL_RANKS == 64. + uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough. uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. 
}; diff --git a/src/register/coll_reg.cc b/src/register/coll_reg.cc index 4282dc9c8..2ab7e9448 100644 --- a/src/register/coll_reg.cc +++ b/src/register/coll_reg.cc @@ -73,15 +73,19 @@ ncclResult_t ncclRegisterCollNvlsBuffers( if (nvlsReged) { *regNeedConnect = 0; - /* tweak NVLS channels usage; for registered NVLS buffer, we only need 4/5 channels to - * saturate bandwidth. */ + /* tweak NVLS channels usage; for registered NVLS buffer to saturate bandwidth. */ if (comm->nNodes == 1) { - if (info->func == ncclFuncReduceScatter) - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 5)); - else - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 4)); + if (info->func == ncclFuncReduceScatter) { + // RS: Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 6 : 5)); + } + else { + // AR/AG: Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 8 : 4)); + } } else { - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, 6)); + // Further tweaks for Blackwell with NVLS registered buffers + info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 7 : 6)); } info->regBufType |= NCCL_NVLS_REG_BUFFER; } diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 582c30a35..3fe25a324 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -141,9 +141,11 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, #include "channel.h" #define NVLS_MEM_ALIGN_SIZE (1 << 21) +#define NVLS_NCHANNELS_SM90 16 +#define NVLS_NCHANNELS_SM100 32 NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2); -NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", 16); +NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", -2); NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024); ncclResult_t ncclNvlsInit(struct ncclComm* comm) { @@ -152,7 +154,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { int gpuCount; NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount)); - if (!ncclParamNvlsEnable() || ((!comm->MNNVL && gpuCount <= 2) || (comm->MNNVL && comm->clique.size <= 2))) return ncclSuccess; + if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess; CUdevice dev; int driverVersion; @@ -170,7 +172,11 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { } INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev); - if (comm->nvlsSupport == 1) comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (int)ncclParamNvlsChannels())); + if (comm->nvlsSupport) { + int channels = (comm->compCap >= 100) ? 
NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM90; + if (ncclParamNvlsChannels() >= 0) channels = ncclParamNvlsChannels(); + comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, channels)); + } return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 3ae514e45..dac762157 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -213,7 +213,7 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, int refcount, ncclIpcDe // cuMem API support CUmemGenericAllocationHandle handle; - NCCLCHECK(ncclCuMemAlloc(ptr, &handle, size)); + NCCLCHECK(ncclCuMemAlloc(ptr, &handle, type, size)); if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // Return the native cuMem handle for later Export/Import via UDS memcpy(&ipcDesc->cuDesc.data, &handle, sizeof(handle)); @@ -816,7 +816,7 @@ ncclResult_t ret = ncclSuccess; if (isLegacyIpc) *isLegacyIpc = false; if (regRecord) { // buffer was registered by by users, we need to start to register or reuse it - int peerLocalRank; + int peerLocalRank = -1; for (int p = 0; p < nPeers; p++) { int peerRank = peerRanks[p]; peerLocalRank = comm->rankToLocalRank[peerRank]; @@ -886,8 +886,10 @@ ncclResult_t ret = ncclSuccess; ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side // and get the remote register address back. - if (proxyConn) + if (proxyConn) { + INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); + } if (rmtRegAddr) { NCCLCHECKGOTO(ncclCalloc(&newInfo, 1), ret, fail); assert(regRecord->ipcInfos[peerLocalRank] == NULL); @@ -905,7 +907,7 @@ ncclResult_t ret = ncclSuccess; regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; needUpdate = true; *regBufFlag = 1; - INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); + INFO(NCCL_REG, "rank %d - IPC registered buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); } } } @@ -1039,6 +1041,8 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str assert(sizeof(struct p2pIpcExpInfo) == reqSize); assert(sizeof(void*) == respSize); + INFO(NCCL_REG, "Proxy rank %d register reqBuff %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, reqBuff, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); + // request peer passes all necessary buffer info to import. 
The proxy thread would register // the buffer locally and return register addr back if (ipcExpInfo->legacyIpcCap) { @@ -1070,7 +1074,7 @@ static ncclResult_t p2pProxyRegister(struct ncclProxyConnection* connection, str CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)regAddr, ipcExpInfo->size, &accessDesc, 1), ret, fail); regAddr = (void*)((uintptr_t)regAddr + ipcExpInfo->offset); } - INFO(NCCL_REG, "Proxy rank %d register succeeds, regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); + INFO(NCCL_REG, "Proxy rank %d register success regAddr %p size %ld offset %ld legacyIpcCap %d sameProcess %d", proxyState->tpRank, regAddr, ipcExpInfo->size, ipcExpInfo->offset, ipcExpInfo->legacyIpcCap, connection->sameProcess); exit: memcpy(respBuff, (void*)®Addr, sizeof(void*)); From f44ac759fee12ecb3cc6891e9e739a000f66fd70 Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Wed, 12 Mar 2025 13:46:21 -0700 Subject: [PATCH 06/21] NCCL 2.26.2-1 Profiler improvements * Add events for CUDA kernel start and end. * Allow network plugins to generate profiling events * Enable profiling on a per-operation basis, rather than per-communicator. * Add support for graph capturing. Add implicit launch order * Allow to prevent deadlocks when using multiple NCCL communicators per device by implicitly ordering NCCL operations using the host program order. Disabled by default, set NCCL_LAUNCH_ORDER_IMPLICIT=1 to enable. * Add a complementary mechanism to detect host threads racing to launch to the same device. Enabled by default, set NCCL_LAUNCH_RACE_FATAL=0 to disable. Optimize the PAT algorithm * Separate the computation and execution of PAT steps on different warps, allowing to run up to 16 PAT steps in parallel to significantly accelerate PAT and reduce its linear part. Add support for setting QoS per communicator * Add a new trafficClass field to the communicator configuration, to allow the application to select a particular traffic class for a given communicator. The meaning of the traffic class is network-specific and should be set in accordance with the network configuration. * For the IB/RoCE plugin, existing config variables such as NCCL_IB_SL and NCCL_IB_TC take precedence. Allow to enable GPU Direct RDMA specifically on C2C platforms * Disabled by default, set NCCL_NET_GDR_C2C=1 to enable. Do not disable user buffer registration unless PXN is really used * Only disable UB when a communicator has more than one rank per node on any node. RAS subsystem improvements * Report operation counts separately for each collective operation type. * Provide details about missing communicator ranks and reliably distinguish ranks that are no longer a given communicator's members (now reported as NOCOMM) from those that failed to respond. Add support for timestamps to NCCL diagnostic messages * On by default for WARN messages; NCCL_DEBUG_TIMESTAMP_LEVELS can be used to enable them for other debug levels as well. * The format can be changed using the NCCL_DEBUG_TIMESTAMP_FORMAT config variable. Reduce the memory usage with NVLink SHARP (NVLS) * Potentially save hundreds of MBs of device memory, considering the multicast buffer size granularity separately from the address alignment. Update performance tuning for recent Intel CPUs * Improve algorithm/protocol selection on recent CPUs such as Emerald Rapids and Sapphire Rapids. Improve channel scheduling when mixing LL and Simple operations. 
* Make LL operations account for 4x more traffic to ensure LL and simple operations complete at the same time.

Refactor the plugin code
* Clean up and harmonize the support code across the network, tuner, and profiler plugins.

Add support for comment lines (starting with #) in the nccl.conf file
* Issue #1540.

Make user buffer registration problems print an INFO instead of a WARN.

Drop support for network plugin interface version 5.

Fix a race condition with split-shared communicators
* NCCL could hang during connection setup if multiple communicators were grouped together that share resources.

Fix a performance regression when using NCCL_CROSS_NIC=1
* NCCL would unnecessarily alternate rings, breaking the GPU-NIC associations.

Make GID index detection code more resilient
* Dynamic GID detection code was giving up too soon if the detected index was not available (e.g., wasn't mapped to the container's sysfs).
* Issues #1538, #1573.

Fix a race condition with non-blocking operation
* Fix issue when creating a non-blocking communicator after a non-blocking collective operation on another communicator.

Fix shared memory usage on recent Blackwell GPUs.
* Issues NVIDIA/nccl-tests#287, NVIDIA/nccl-tests#291, #1637.

Fix an error with NIC fusion and IB SHARP when recreating communicators
* Disable the unloading of network plugins

Make the auto-merge failures in the NIC fusion non-fatal
* This could happen when trying to merge IB and RoCE devices.

Fixes to ncclCommAbort
* Fix hangs due to the progress thread spinning indefinitely on the network progress.
* Reduce the abort time by up to two orders of magnitude.

Fix a crash when libnccl.so was dynamically unloaded
* The RAS subsystem was missing a clean-up handler.

Fix a hang if the network plugin's test() call returns an error.

Fix a hang on heterogeneous architectures
* Ensure we harmonize the tuning to avoid different tuning choices, causing a hang.

Fix double-free on failed ncclCommInitRank and ncclCommFinalize.

Fix a potential list traversal bug during a group launch of multiple communicators
* Issue #1599.

Unify the handling of NCCL configuration variables
* Under rare circumstances, some variables specified in the config file could be ignored.
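As an illustrative sketch of the per-communicator QoS item above (not part of this patch): it assumes the new trafficClass field of ncclConfig_t described in these notes, uses a placeholder value of 4, and the helper name initCommWithQos is hypothetical. For the IB/RoCE plugin, NCCL_IB_SL and NCCL_IB_TC still take precedence.

```
#include <nccl.h>

/* Sketch: request a network-specific QoS level for one communicator.
 * The value 4 is a placeholder; its meaning depends on the network
 * plugin and fabric configuration. */
ncclResult_t initCommWithQos(ncclComm_t* comm, int nRanks, ncclUniqueId id, int rank) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.trafficClass = 4;  /* placeholder QoS value */
  return ncclCommInitRankConfig(comm, nRanks, id, rank, &config);
}
```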
--- ext-net/README.md | 30 +- ext-net/example/nccl/net.h | 13 +- ext-net/example/nccl/net_device.h | 3 +- ext-net/example/nccl/net_v10.h | 101 ++ ext-net/example/nccl/net_v2.h | 4 +- ext-net/example/nccl/net_v3.h | 4 +- ext-net/example/nccl/net_v4.h | 4 +- ext-net/example/nccl/net_v5.h | 4 +- ext-net/example/nccl/net_v6.h | 6 +- ext-net/example/nccl/net_v7.h | 6 +- ext-net/example/nccl/net_v8.h | 6 +- ext-net/example/nccl/net_v9.h | 12 +- ext-net/example/plugin.c | 77 +- makefiles/common.mk | 5 + makefiles/version.mk | 4 +- src/Makefile | 11 +- src/bootstrap.cc | 16 +- src/channel.cc | 37 +- src/debug.cc | 159 ++- src/device/all_gather.h | 64 +- src/device/all_reduce.h | 10 +- src/device/broadcast.h | 2 +- src/device/common.h | 37 +- src/device/primitives.h | 16 +- src/device/prims_ll.h | 17 +- src/device/prims_ll128.h | 17 +- src/device/prims_simple.h | 367 +++--- src/device/reduce_scatter.h | 61 +- src/device/sendrecv.h | 2 +- src/enqueue.cc | 236 ++-- src/graph/connect.cc | 2 +- src/graph/paths.cc | 82 +- src/graph/search.cc | 68 +- src/graph/topo.cc | 116 +- src/graph/topo.h | 38 +- src/graph/tuning.cc | 3 +- src/group.cc | 68 +- src/include/bitops.h | 53 +- src/include/collectives.h | 446 +++---- src/include/comm.h | 10 +- src/include/device.h | 8 +- src/include/graph.h | 16 +- src/include/group.h | 6 + src/include/nccl_net.h | 604 ---------- src/include/nccl_profiler.h | 235 ---- src/include/nccl_tuner.h | 149 --- src/include/net.h | 1 - src/include/net_device.h | 3 +- src/include/nvtx.h | 3 +- src/include/plugin/nccl_net.h | 54 + src/include/plugin/nccl_profiler.h | 69 ++ src/include/plugin/nccl_tuner.h | 22 + src/include/plugin/net/net_v10.h | 158 +++ src/include/plugin/net/net_v6.h | 113 ++ src/include/plugin/net/net_v7.h | 120 ++ src/include/plugin/net/net_v8.h | 134 +++ src/include/plugin/net/net_v9.h | 152 +++ src/include/plugin/plugin.h | 18 + src/include/plugin/profiler/net_ib.h | 13 + src/include/plugin/profiler/net_ib_v1.h | 34 + src/include/plugin/profiler/net_socket.h | 13 + src/include/plugin/profiler/net_socket_v1.h | 32 + src/include/plugin/profiler/profiler_v1.h | 107 ++ src/include/plugin/profiler/profiler_v2.h | 104 ++ src/include/plugin/profiler/profiler_v3.h | 112 ++ src/include/plugin/tuner/tuner_v2.h | 53 + src/include/plugin/tuner/tuner_v3.h | 55 + src/include/plugin/tuner/tuner_v4.h | 56 + src/include/profiler.h | 20 + src/include/proxy.h | 7 +- src/include/ras.h | 2 + src/include/register.h | 2 +- src/include/shm.h | 5 +- src/include/socket.h | 2 +- src/include/strongstream.h | 98 +- src/include/transport.h | 10 +- src/init.cc | 123 +- src/misc/ipcsocket.cc | 3 +- src/misc/param.cc | 1 + src/misc/socket.cc | 11 +- src/misc/strongstream.cc | 481 ++++---- src/misc/tuner.cc | 267 ----- src/nccl.h.in | 4 +- src/net.cc | 1033 ----------------- src/plugin/net.cc | 319 +++++ src/plugin/net/net_v10.cc | 32 + src/plugin/net/net_v6.cc | 178 +++ src/plugin/net/net_v7.cc | 174 +++ src/plugin/net/net_v8.cc | 196 ++++ src/plugin/net/net_v9.cc | 121 ++ src/plugin/plugin_open.cc | 134 +++ src/{misc => plugin}/profiler.cc | 426 +++---- src/plugin/profiler/profiler_v1.cc | 133 +++ src/plugin/profiler/profiler_v2.cc | 45 + src/plugin/profiler/profiler_v3.cc | 20 + src/plugin/tuner.cc | 99 ++ src/plugin/tuner/tuner_v2.cc | 66 ++ src/plugin/tuner/tuner_v3.cc | 38 + src/plugin/tuner/tuner_v4.cc | 22 + src/proxy.cc | 69 +- src/ras/client_support.cc | 851 ++++++++------ src/ras/collectives.cc | 716 ++++++++---- src/ras/peers.cc | 194 ++-- src/ras/ras.cc | 182 +-- 
src/ras/ras_internal.h | 139 ++- src/ras/rasnet.cc | 1156 +++++++++++-------- src/register/register.cc | 4 +- src/transport.cc | 18 +- src/transport/coll_net.cc | 104 +- src/transport/net.cc | 78 +- src/transport/net_ib.cc | 186 ++- src/transport/net_socket.cc | 73 +- src/transport/nvls.cc | 147 +-- src/transport/p2p.cc | 23 +- src/transport/profiler.cc | 55 + src/transport/shm.cc | 24 +- 116 files changed, 7498 insertions(+), 5254 deletions(-) create mode 100644 ext-net/example/nccl/net_v10.h delete mode 100644 src/include/nccl_net.h delete mode 100644 src/include/nccl_profiler.h delete mode 100644 src/include/nccl_tuner.h create mode 100644 src/include/plugin/nccl_net.h create mode 100644 src/include/plugin/nccl_profiler.h create mode 100644 src/include/plugin/nccl_tuner.h create mode 100644 src/include/plugin/net/net_v10.h create mode 100644 src/include/plugin/net/net_v6.h create mode 100644 src/include/plugin/net/net_v7.h create mode 100644 src/include/plugin/net/net_v8.h create mode 100644 src/include/plugin/net/net_v9.h create mode 100644 src/include/plugin/plugin.h create mode 100644 src/include/plugin/profiler/net_ib.h create mode 100644 src/include/plugin/profiler/net_ib_v1.h create mode 100644 src/include/plugin/profiler/net_socket.h create mode 100644 src/include/plugin/profiler/net_socket_v1.h create mode 100644 src/include/plugin/profiler/profiler_v1.h create mode 100644 src/include/plugin/profiler/profiler_v2.h create mode 100644 src/include/plugin/profiler/profiler_v3.h create mode 100644 src/include/plugin/tuner/tuner_v2.h create mode 100644 src/include/plugin/tuner/tuner_v3.h create mode 100644 src/include/plugin/tuner/tuner_v4.h delete mode 100644 src/misc/tuner.cc delete mode 100644 src/net.cc create mode 100644 src/plugin/net.cc create mode 100644 src/plugin/net/net_v10.cc create mode 100644 src/plugin/net/net_v6.cc create mode 100644 src/plugin/net/net_v7.cc create mode 100644 src/plugin/net/net_v8.cc create mode 100644 src/plugin/net/net_v9.cc create mode 100644 src/plugin/plugin_open.cc rename src/{misc => plugin}/profiler.cc (57%) create mode 100644 src/plugin/profiler/profiler_v1.cc create mode 100644 src/plugin/profiler/profiler_v2.cc create mode 100644 src/plugin/profiler/profiler_v3.cc create mode 100644 src/plugin/tuner.cc create mode 100644 src/plugin/tuner/tuner_v2.cc create mode 100644 src/plugin/tuner/tuner_v3.cc create mode 100644 src/plugin/tuner/tuner_v4.cc create mode 100644 src/transport/profiler.cc diff --git a/ext-net/README.md b/ext-net/README.md index aa1a3945e..90fe89bf5 100644 --- a/ext-net/README.md +++ b/ext-net/README.md @@ -60,20 +60,20 @@ of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v9) +# API (v10) -Below is the main `ncclNet_v9` struct. Each function is explained in later sections. +Below is the main `ncclNet_v10` struct. Each function is explained in later sections. ``` typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. 
- ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. @@ -83,13 +83,13 @@ typedef struct { // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); @@ -98,10 +98,10 @@ typedef struct { ncclResult_t (*deregMr)(void* comm, void* mhandle); // Asynchronous send to a peer. // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request); // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is // visible to the GPU ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); @@ -200,6 +200,9 @@ the plugin code adding the following definitions: #define INFO(FLAGS, ...) logFunction(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) ``` +The `ncclProfilerCallback_t` argument is a NCCL core callback that allows the plugin to define and +record its own events with the NCCL profiler plugin. + `devices` Once the plugin is initialized, NCCL will query the number of devices available. It should not @@ -301,6 +304,11 @@ the `listen` call previously. If the sender did not connect yet, `accept` should should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it succeeds. +The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field. +This field can be used by the network plugin to specify the QoS level of the connection. 
By default, +`trafficClass` is set to -1 but can be configured by the application during communicator initialization +to select a plugin-supported QoS level. + `closeListen`/`closeSend`/`closeRecv` Once a `listenComm`/`sendComm`/`recvComm` is no longer needed, NCCL will call @@ -354,6 +362,9 @@ The `isend` operation returns a handle in the `request` argument for further cal the `isend` operation cannot be initiated, `request` can be set to `NULL` and NCCL will call `isend` again later. +The `pHandle` argument allows NCCL to pass an opaque handle that can be used by the network plugin +to support network defined events. + `irecv` To receive data, NCCL will call `irecv` with the `recvComm` returned by `accept`. The argument @@ -375,6 +386,9 @@ of irecv and is resilient to redundant network writes. This allows the plugin to completions on such irecvs (for example, complete the request immediately). The plugin is still expected to set a valid request pointer on return which NCCL can poll to check for completion. +The `pHandle` argument allows NCCL to pass an array of opaque handles that can be used by the +network plugin to support network defined events. + Note: for a given connection, send/receive operations should always match in the order they were posted. Tags provided for receive operations are only used to assign a given send operation to one of the buffers of the first (multi-)receive in the queue, not to allow for out-of-order tag diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 112967ab8..85ea79ef7 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -2,14 +2,15 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ +#ifndef NET_H_ +#define NET_H_ #include #include #include "common.h" #include "err.h" +#include "net_device.h" #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB @@ -22,6 +23,9 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); + +#include "net_v10.h" #include "net_v9.h" #include "net_v8.h" #include "net_v7.h" @@ -31,4 +35,9 @@ #include "net_v3.h" #include "net_v2.h" +typedef ncclNet_v10_t ncclNet_t; +typedef ncclNetProperties_v10_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; + #endif // end include guard diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index 874fb5999..d693101a3 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v10.h b/ext-net/example/nccl/net_v10.h new file mode 100644 index 000000000..809e7c001 --- /dev/null +++ b/ext-net/example/nccl/net_v10.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. 
+ // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v2.h b/ext-net/example/nccl/net_v2.h index 0d9c90619..dd9f39b69 100644 --- a/ext-net/example/nccl/net_v2.h +++ b/ext-net/example/nccl/net_v2.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V2_H_ -#define NCCL_NET_V2_H_ +#ifndef NET_V2_H_ +#define NET_V2_H_ typedef struct { // Name of the network (mainly for logs) diff --git a/ext-net/example/nccl/net_v3.h b/ext-net/example/nccl/net_v3.h index db1287b47..9002165e0 100644 --- a/ext-net/example/nccl/net_v3.h +++ b/ext-net/example/nccl/net_v3.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V3_H_ -#define NCCL_NET_V3_H_ +#ifndef NET_V3_H_ +#define NET_V3_H_ #define NCCL_NET_MAX_REQUESTS_V3 16 diff --git a/ext-net/example/nccl/net_v4.h b/ext-net/example/nccl/net_v4.h index efe482410..41cef56b4 100644 --- a/ext-net/example/nccl/net_v4.h +++ b/ext-net/example/nccl/net_v4.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V4_H_ -#define NCCL_NET_V4_H_ +#ifndef NET_V4_H_ +#define NET_V4_H_ #define NCCL_NET_HANDLE_MAXSIZE_V4 64 diff --git a/ext-net/example/nccl/net_v5.h b/ext-net/example/nccl/net_v5.h index b96b6fc6b..47f446c75 100644 --- a/ext-net/example/nccl/net_v5.h +++ b/ext-net/example/nccl/net_v5.h @@ -2,8 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V5_H_ -#define NCCL_NET_V5_H_ +#ifndef NET_V5_H_ +#define NET_V5_H_ typedef ncclNetProperties_v6_t ncclNetProperties_v5_t; typedef struct { diff --git a/ext-net/example/nccl/net_v6.h b/ext-net/example/nccl/net_v6.h index fffaf8c62..de90f297c 100644 --- a/ext-net/example/nccl/net_v6.h +++ b/ext-net/example/nccl/net_v6.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
*/ -#ifndef NCCL_NET_V6_H_ -#define NCCL_NET_V6_H_ - -#define NCCL_NET_MAX_REQUESTS_V6 8 +#ifndef NET_V6_H_ +#define NET_V6_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v7.h b/ext-net/example/nccl/net_v7.h index d607095de..3802a3d78 100644 --- a/ext-net/example/nccl/net_v7.h +++ b/ext-net/example/nccl/net_v7.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V7_H_ -#define NCCL_NET_V7_H_ - -#include "net_device.h" +#ifndef NET_V7_H_ +#define NET_V7_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v8.h b/ext-net/example/nccl/net_v8.h index 54a61f61b..74eb72dd4 100644 --- a/ext-net/example/nccl/net_v8.h +++ b/ext-net/example/nccl/net_v8.h @@ -2,10 +2,8 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V8_H_ -#define NCCL_NET_V8_H_ - -#include "net_device.h" +#ifndef NET_V8_H_ +#define NET_V8_H_ typedef struct { char* name; // Used mostly for logging. diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h index 61035ecc9..ca60ad651 100644 --- a/ext-net/example/nccl/net_v9.h +++ b/ext-net/example/nccl/net_v9.h @@ -2,18 +2,14 @@ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. */ -#ifndef NCCL_NET_V9_H_ -#define NCCL_NET_V9_H_ - -#include "net_device.h" +#ifndef NET_V9_H_ +#define NET_V9_H_ #define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 typedef struct { int ndevs; int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; } ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; typedef struct { char* name; // Used mostly for logging. @@ -35,8 +31,6 @@ typedef struct { size_t maxCollBytes; // Max transfer size for collective operations } ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - typedef struct { // Name of the network (mainly for logs) const char* name; @@ -93,7 +87,7 @@ typedef struct { // Virtual NIC APIs. 
makeVDevice will create a virtual NIC given the specified properties, and tell the caller // what index this new vNIC exists at - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); } ncclNet_v9_t; #endif // end include guard diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 285224261..97a29875d 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -11,7 +11,7 @@ int max_requests = NCCL_NET_MAX_REQUESTS; -__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction) { return ncclSuccess; } +__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } @@ -52,13 +52,13 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { } __hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginDeregMr(void* collComm, void* mhandle) { return ncclInternalError;} -__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { return ncclInternalError; } -__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) { return ncclInternalError; } __hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } __hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } @@ -70,7 +70,7 @@ __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { #define PLUGIN_NAME "Plugin" -ncclNet_v9_t ncclNetPlugin_v9 = { +const ncclNet_v10_t ncclNetPlugin_v10 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -93,6 +93,51 @@ ncclNet_v9_t ncclNetPlugin_v9 = { .makeVDevice = pluginMakeVDevice, }; +__hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) { + return pluginInit(logFunction, NULL); +} + +__hidden ncclResult_t 
pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) { + return pluginGetProperties(dev, (ncclNetProperties_t*)props); +} + +__hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){ + return pluginConnect(dev, NULL, handle, sendComm, sendDevComm); +} + +__hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { + return pluginIsend(sendComm, data, size, tag, mhandle, NULL, request); +} + +__hidden ncclResult_t pluginIrecv_v9(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { + return pluginIrecv(recvComm, n, data, sizes, tags, mhandles, NULL, request); +} + +__hidden ncclResult_t pluginMakeVDevice_v9(int* d, ncclNetVDeviceProps_v9_t* props) { return ncclInternalError; } + +const ncclNet_v9_t ncclNetPlugin_v9 = { + .name = PLUGIN_NAME, + .init = pluginInit_v9, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v9, + .listen = pluginListen, + .connect = pluginConnect_v9, + .accept = pluginAccept, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend_v9, + .irecv = pluginIrecv_v9, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, + .getDeviceMr = pluginGetDeviceMr, + .irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice_v9, +}; + __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* props_v8) { ncclNetProperties_t props; ncclResult_t ret = pluginGetProperties(dev, &props); @@ -113,22 +158,22 @@ __hidden ncclResult_t pluginGetProperties_v8(int dev, ncclNetProperties_v8_t* pr } __hidden ncclResult_t pluginIsend_v8(void* sendComm, void* data, int size, int tag, void* mhandle, void** request) { - return pluginIsend(sendComm, data, (int)size, tag, mhandle, request); + return pluginIsend(sendComm, data, (int)size, tag, mhandle, NULL, request); } __hidden ncclResult_t pluginIrecv_v8(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request) { size_t sizesOut[NCCL_PLUGIN_MAX_RECVS]; for (int i=0; i static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { diff --git a/makefiles/common.mk b/makefiles/common.mk index 1b1bb8674..545203a10 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -16,6 +16,7 @@ WERROR ?= 0 PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 +NET_PROFILER ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc @@ -137,3 +138,7 @@ endif ifneq ($(RDMA_CORE), 0) CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 endif + +ifneq ($(NET_PROFILER), 0) +CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1 +endif diff --git a/makefiles/version.mk b/makefiles/version.mk index b02cf909c..df3ee5c68 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 25 -NCCL_PATCH := 1 +NCCL_MINOR := 26 +NCCL_PATCH := 2 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index b66ebefa2..65da6300b 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,11 +10,15 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc net.cc proxy.cc transport.cc mnnvl.cc \ + init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ $(wildcard register/*.cc) \ + 
$(wildcard plugin/*.cc) \ + $(wildcard plugin/net/*.cc) \ + $(wildcard plugin/tuner/*.cc) \ + $(wildcard plugin/profiler/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -49,6 +53,7 @@ LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) BINOBJ := $(BINSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) $(BINOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl +INCPLUGIN := include/plugin DEVMANIFEST := $(BUILDDIR)/obj/device/manifest @@ -126,8 +131,8 @@ $(PKGDIR)/%.pc : %.pc $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ - @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 675bcfcd4..9e24faadf 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -153,7 +153,7 @@ static ncclResult_t netIsend(ncclNet_t* net, void* sendComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*sendReq) { - NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, sendReq)); + NCCLCHECK(net->isend(sendComm, data, (size_t)size, tag, dataHandle, NULL, sendReq)); } if (*sendReq) { NCCLCHECK(net->test(*sendReq, done, NULL)); @@ -167,8 +167,8 @@ static ncclResult_t netIrecv(ncclNet_t* net, void* recvComm, void* data, int siz int* done) { if (*done) return ncclSuccess; if (!*recvReq) { - size_t size64 = size; - NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, recvReq)); + size_t size64 = size; + NCCLCHECK(net->irecv(recvComm, 1, &data, &size64, &tag, &dataHandle, NULL, recvReq)); } if (*recvReq) { NCCLCHECK(net->test(*recvReq, done, NULL)); @@ -484,7 +484,7 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { if (devOOB < 0) { pthread_mutex_lock(&bootstrapNetLock); if (devOOB < 0) { - char* userIfEnv = getenv("NCCL_OOB_NET_IFNAME"); + const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME"); if (userIfEnv && strlen(userIfEnv) > 0) { INFO(NCCL_BOOTSTRAP | NCCL_ENV, "NCCL_OOB_NET_IFNAME set to %s", userIfEnv); bool searchNot = userIfEnv && userIfEnv[0] == '^'; @@ -540,7 +540,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis do { NCCLCHECK(checkAbort(abortFlag, &abortCounter)); if (!*sendComm) - NCCLCHECK(net->connect(listen->net.dev, peerHandle, sendComm, sendDevHandle)); + NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle)); if (!*recvComm) NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle)); } while (!*sendComm || !*recvComm); @@ -736,6 +736,8 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { rasRanks[rank].pid = getpid(); rasRanks[rank].cudaDev = comm->cudaDev; rasRanks[rank].nvmlDev = comm->nvmlDev; + rasRanks[rank].hostHash = getHostHash(); + rasRanks[rank].pidHash = getPidHash(); if (ncclRasCommInit(comm, rasRanks+rank) != ncclSuccess) { INFO(NCCL_INIT|NCCL_RAS, "Continuing in spite of a RAS initialization error"); // We should still participate in the ringAllInfo below as the peers will be waiting for us. 
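The bootstrap hunks above show NCCL core simply passing NULL for the new v10 `config` and `phandle` arguments. On the plugin side, the README changes earlier in this patch describe how those arguments are meant to be consumed: `config->trafficClass` selects a QoS level for the connection, and `phandle` lets the plugin record its own events through the `ncclProfilerCallback_t` passed to `init`. Below is a minimal sketch of that flow, not part of the patch: the helper-free bodies, the include path, and the event type/pluginId values are assumptions; only the signatures and the `NCCL_NET_TRAFFIC_CLASS_UNDEF` constant come from the `net_v10.h` definitions shown above.

```c
/* Illustrative sketch only - assumes the headers from ext-net/example/nccl/
 * are on the include path; helper logic is omitted. */
#include <stddef.h>
#include "net.h"   /* pulls in net_v10.h, net_device.h, err.h (path is an assumption) */

static ncclProfilerCallback_t profCb;   /* saved from init(); may stay NULL */

ncclResult_t sketchInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
  profCb = profFunction;                /* NULL when net profiling is disabled */
  (void)logFunction;
  return ncclSuccess;
}

ncclResult_t sketchConnect(int dev, ncclNetCommConfig_v10_t* config, void* handle,
                           void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm) {
  /* -1 (NCCL_NET_TRAFFIC_CLASS_UNDEF) means "no QoS requested"; 0 as the
   * fallback is an assumption for this sketch. */
  int tc = (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF)
           ? config->trafficClass : 0;
  /* ... establish the connection using traffic class 'tc' ... */
  (void)tc; (void)dev; (void)handle; (void)sendComm; (void)sendDevComm;
  return ncclSuccess;
}

ncclResult_t sketchIsend(void* sendComm, void* data, size_t size, int tag,
                         void* mhandle, void* phandle, void** request) {
  void* eHandle = NULL;
  /* Record a plugin-defined event against the handle NCCL passed in.
   * The event type (0) and pluginId (0) are placeholders, not real values. */
  if (profCb && phandle) profCb(&eHandle, /*type=*/0, phandle, /*pluginId=*/0, /*extData=*/NULL);
  /* ... post the send and return a request ... */
  (void)sendComm; (void)data; (void)size; (void)tag; (void)mhandle; (void)request;
  return ncclSuccess;
}
```

In this sketch, leaving `trafficClass` at `NCCL_NET_TRAFFIC_CLASS_UNDEF` falls back to a plugin default, matching the README text; a plugin that never calls the profiler callback can safely ignore both `phandle` and `profFunction`, as the example plugin's v9 compatibility wrappers in this patch do by forwarding NULL.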
@@ -967,7 +969,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int s NCCLCHECK(socketAccept(commState, peer, tag, &sock)); TRACE(NCCL_BOOTSTRAP, "Receiving tag=%d peer=%d size=%d", tag, peer, size); NCCLCHECKGOTO(socketRecv(&sock, ((char*)data), size), ret, fail); - NCCLCHECK(ncclSocketClose(&sock)); + NCCLCHECKGOTO(ncclSocketClose(&sock, /*wait*/true), ret, fail); return ret; fail: (void)ncclSocketClose(&sock); @@ -1062,7 +1064,7 @@ static ncclResult_t bootstrapP2PBarrier(void* commState, int* ranks, int rank, i * Based on the dissemination algorithm by Debra Hensgen, Raphael Finkel, and Udi Manbet, * "Two Algorithms for Barrier Synchronization," International Journal of Parallel Programming, 17(1):1-17, 1988" */ - int data[1]; + int data[1] = {0}; for (int mask = 1; mask < nranks; mask <<= 1) { int src = (rank - mask + nranks) % nranks; int dst = (rank + mask) % nranks; diff --git a/src/channel.cc b/src/channel.cc index b3a8f29b5..bc48986d8 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -20,8 +20,8 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { channel->workFifoProduced = 0; struct ncclSharedResources* sharedRes = comm->sharedRes; - - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + cudaStream_t deviceStream; + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (channel->peers == NULL) { // The extra on nRanks+1 is for collnet root (i.e. network) @@ -39,33 +39,33 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelId) { if (channel->devPeers == NULL) { if (sharedRes->devPeers[channelId] == NULL) { - NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(sharedRes->devPeers + channelId, sharedRes->tpNRanks, deviceStream)); } /* channel->devPeers is not shared, so just free it when calling commFree() */ - NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devPeers, nPeers, deviceStream)); ncclCommPushCudaFree(comm, channel->devPeers); NCCLCHECK(ncclCalloc(&channel->devPeersHostPtr, nPeers)); for (int r = 0; r < nRanks; r++) { uintptr_t addr = (uintptr_t)(comm->sharedRes->devPeers[channelId] + comm->topParentRanks[r]); - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[r] = (struct ncclDevChannelPeer*)addr; } } channel->ring.userRanks = ncclMemoryStackAlloc(&comm->memPermanent, nRanks); - NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->devRingUserRanks, nRanks, deviceStream)); ncclCommPushCudaFree(comm, channel->devRingUserRanks); /* guarantee addr has been copied into channel->devPeers */ + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); - return ncclSuccess; } ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share) { struct ncclChannel* channel = &comm->channels[channelId]; struct 
ncclSharedResources* sharedRes = comm->sharedRes; + cudaStream_t deviceStream; if (channel->nvlsPeers != NULL) return ncclSuccess; @@ -73,7 +73,7 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); int nvlsRanks = comm->localRanks; @@ -84,24 +84,24 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo int tr = comm->topParentLocalRanks[r]; uintptr_t addr = (uintptr_t)(parent->channels[channelId].nvlsDevPeers + tr); channel->peers[comm->nRanks + 1 + r] = parent->channels[channelId].nvlsPeers + tr; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].nvlsPeers[tr].refCount); } } else { NCCLCHECK(ncclCalloc(&channel->nvlsPeers, nvlsRanks)); - NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->nvlsDevPeers, nvlsRanks, deviceStream)); for (int r = 0; r < nvlsRanks; ++r) { uintptr_t addr = (uintptr_t)(channel->nvlsDevPeers + r); channel->peers[comm->nRanks + 1 + r] = channel->nvlsPeers + r; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks + 1 + r), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks + 1 + r] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->nvlsPeers[r].refCount); } } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } @@ -110,6 +110,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc struct ncclChannel* channel = &comm->channels[channelId]; struct ncclSharedResources* sharedRes = comm->sharedRes; uintptr_t addr; + cudaStream_t deviceStream; if (channel->collnetPeers != NULL) return ncclSuccess; @@ -117,28 +118,28 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc if (channel->id == -1) NCCLCHECK(initChannel(comm, channelId)); - NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream)); + NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); if (share) { channel->collnetPeers = parent->channels[channelId].collnetPeers; channel->collnetDevPeers = parent->channels[channelId].collnetDevPeers; addr = (uintptr_t)parent->channels[channelId].collnetDevPeers; channel->peers[comm->nRanks] = parent->channels[channelId].collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + 
comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&parent->channels[channelId].collnetPeers->refCount); } else { NCCLCHECK(ncclCalloc(&channel->collnetPeers, 1)); - NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaCallocAsync(&channel->collnetDevPeers, 1, deviceStream)); addr = (uintptr_t)channel->collnetDevPeers; channel->peers[comm->nRanks] = channel->collnetPeers; - NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, sharedRes->deviceStream.cudaStream)); + NCCLCHECK(ncclCudaMemcpyAsync((uintptr_t*)(channel->devPeers + comm->nRanks), (uintptr_t*)&addr, 1, deviceStream)); channel->devPeersHostPtr[comm->nRanks] = (struct ncclDevChannelPeer*)addr; ncclAtomicRefCountIncrement(&channel->collnetPeers->refCount); } + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &sharedRes->deviceStream)); return ncclSuccess; } diff --git a/src/debug.cc b/src/debug.cc index 2ea6eabde..2eb8d7749 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -6,6 +6,7 @@ #include "core.h" #include "nccl_net.h" +#include #include #include #include @@ -16,6 +17,11 @@ #include "param.h" int ncclDebugLevel = -1; +static uint32_t ncclDebugTimestampLevels = 0; // bitmaps of levels that have timestamps turned on +static char ncclDebugTimestampFormat[256]; // with space for subseconds +static int ncclDebugTimestampSubsecondsStart; // index where the subseconds starts +static uint64_t ncclDebugTimestampMaxSubseconds; // Max number of subseconds plus 1, used in duration ratio +static int ncclDebugTimestampSubsecondDigits; // Number of digits to display static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; @@ -112,6 +118,84 @@ static void ncclDebugInit() { ncclWarnSetDebugInfo = value; } + // Determine which debug levels will have timestamps. + const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS"); + if (timestamps == nullptr) { + ncclDebugTimestampLevels = (1< sizeof(ncclDebugTimestampFormat) - 1) { + // Won't fit; fall back on the default. + break; + } + ncclDebugTimestampSubsecondsStart = i; + ncclDebugTimestampMaxSubseconds = 1; + + memcpy(ncclDebugTimestampFormat, tsFormat, i); + for (int j=0; j>(delta).count()*1000; - len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", - hostname, pid, tid, cudaDev, timestamp, filefunc, line); + len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %f %s:%d NCCL TRACE ", cudaDev, timestamp, filefunc, line); } + len = std::min(len, sizeof(buffer)-1); // prevent overflows + // Add the message as given by the call site. va_list vargs; va_start(vargs, fmt); len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); va_end(vargs); // vsnprintf may return len >= sizeof(buffer) in the case of a truncated output. - // Rewind len so that we can replace the final \0 by \n - if (len >= sizeof(buffer)) len = sizeof(buffer)-1; - if (len) { - buffer[len++] = '\n'; - fwrite(buffer, 1, len, ncclDebugFile); - } + // Rewind len so that we can replace the final \0 by "\n" + len = std::min(len, sizeof(buffer)-1); // prevent overflows + + // Add a newline and write it to the debug file. 
No terminating null is + // necessary since we write bytes instead of the string. + buffer[len++] = '\n'; + fwrite(buffer, 1, len, ncclDebugFile); } NCCL_API(void, ncclResetDebugInit); diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 5d79d7357..854ebbf3a 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ -67,7 +67,7 @@ namespace { offset = dataOffset + rankDest * count; // Final wait/copy. - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else if (inputBuf != outputBuf + ringRanks[0] * count) { inputBuf = inputBuf + partOffset; @@ -111,25 +111,63 @@ struct RunWorkColl struct RunWorkColl { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { +#if __CUDA_ARCH__ >= 600 using Proto = ProtoSimple<1, 1>; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; size_t count, channelOffset, channelCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); - T *inputBuf = (T*)work->sendbuff; - T *outputBuf = (T*)work->recvbuff; - Primitives, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatAg); - - PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); - int last = 0; - while (!last) { - int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); - prims.patCopy(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend); + static constexpr int nworkers = NCCL_PAT_NWORKERS; + struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0); + uint64_t pollCount = 0; + __syncthreads(); // Don't start using shared mem until everyone arrives + for (int i=tid; ipatSteps[i].flags = 0; + if (tid == 0) shmem->localAccSize = 0; + if (tid == nworkers) shmem->parallelFactor = 0; + __syncthreads(); + + if (tid == nworkers) { // Algo computation thread + PatAGAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + int step = 0; + while (1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + patAlgo.getNextOp(ps); + int last = ps->last; + step++; + if (last == 2) break; + } + } else if (tid < nworkers) { // Worker threads + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + int parallelFactor = 0; + volatile int* pfPtr = &shmem->parallelFactor; + while (parallelFactor == 0) parallelFactor = *pfPtr; + + int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE; + int group = tid / groupSize; + int nGroups = nworkers / groupSize; + int tidInGroup = tid - group*groupSize; + // We don't use recvPeers/sendPeers so let's pass shmem structs instead + Primitives, 0, Proto, 0> prims + (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, 
nullptr, 0, primsModePatAg); + + int step = group; + while(1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread + int last = ps->last; + prims.patCopy(ps, shmem); + if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread + if (last) break; + step += nGroups; + } } +#endif } }; diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 216159747..81da55401 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -78,7 +78,7 @@ namespace { offset = gridOffset + elemOffset + chunkOffset; nelem = (int)min(chunkCount, remCount - chunkOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } @@ -132,7 +132,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -215,7 +215,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directRecv(offset, offset, nelem); + prims.directRecv(offset, nelem); } } else { @@ -710,7 +710,7 @@ struct RunWorkCollchannels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; } } diff --git a/src/device/primitives.h b/src/device/primitives.h index 73c10c264..3b9f169f7 100644 --- a/src/device/primitives.h +++ b/src/device/primitives.h @@ -12,7 +12,7 @@ #include "common_kernel.h" #include "common.h" -#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000 +#define NCCL_SPINS_BEFORE_CHECK_ABORT 10000 /* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128 * We use these as template args to the Primtiives class instead of integral @@ -115,7 +115,7 @@ struct PrimitivesWithoutDirect { __device__ void directSendFromOutput(intptr_t outIx, int eltN) { static_cast(this)->sendFromOutput(outIx, eltN); } - __device__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN) { + __device__ void directRecv(intptr_t outIx, int eltN) { static_cast(this)->recv(outIx, eltN, /*postOp=*/false); } __device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { @@ -139,6 +139,18 @@ struct PrimitivesWithoutDirect { } }; +__device__ inline int checkAbort(int &abortCache, const int abortValue, int &spins) { + if (abortCache & abortValue) return 1; + if (++spins < NCCL_SPINS_BEFORE_CHECK_ABORT) return 0; + spins = 0; + int abort = *ncclShmem.comm.abortFlag; + if (abort) { + ncclShmem.aborted = abort; + abortCache |= abortValue; + } + return abort; +} + #include "prims_simple.h" #include "prims_ll.h" #include "prims_ll128.h" diff --git a/src/device/prims_ll.h b/src/device/prims_ll.h index 3e00f3b85..2a0f5564b 100644 --- a/src/device/prims_ll.h +++ b/src/device/prims_ll.h @@ -51,23 +51,14 @@ class Primitives: } } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, 1)) 
break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes; @@ -102,7 +93,7 @@ class Primitives: int spins = 0; do { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } while ((flag1 != flag) || (flag2 != flag)); uint64_t val64 = data1 + (((uint64_t)data2) << 32); return val64; @@ -126,7 +117,7 @@ class Primitives: int spins = 0; while (line[i].flag1 != flag || line[i].flag2 != flag) { asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4) : "memory"); - if (checkAbort(spins, 0)) break; + if (checkAbort(abort, 1, spins)) break; } uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32); return val64; diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index 617b7acf3..6985e6771 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -53,23 +53,14 @@ class Primitives: barrier_sync(15-group, nthreads); } - uint32_t abort = 0; - - inline __device__ int checkAbort(int &spins, int i, int send) { - spins++; - if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - abort = *ncclShmem.comm.abortFlag; - spins = 0; - } - return abort; - } + int abort = 0; inline __device__ void waitSend(int nbytes) { if (sendConnHeadPtr) { int spins = 0; while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) { sendConnHeadCache = *sendConnHeadPtr; - if (checkAbort(spins, wid, 1)) break; + if (checkAbort(abort, 1, spins)) break; } if (sendConnFifo) { sendConnFifo[sendStep[wid]%NCCL_STEPS].size = nbytes; @@ -201,7 +192,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, 0, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll @@ -248,7 +239,7 @@ class Primitives: load128(ptr+u*WARP_SIZE, vr[u], vr[u+1]); needReload |= flagThread && (vr[u+1] != flag); } - needReload &= (0 == checkAbort(spins, i, 0)); + needReload &= (0 == checkAbort(abort, 1, spins)); } while (__any_sync(WARP_MASK, needReload)); #pragma unroll diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index 005101940..cf3ba9b55 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -52,7 +52,7 @@ class Primitives< uint64_t connStepCache; // Cache last seen value of (*connStepPtr) int connStepSize; // Connection step size void* netDeviceHandle; - uint64_t accSize; // Accumulated size. 
Used by PAT operations + uint64_t accSize; // Don't use barrier 0 as it's used by the final sync __device__ void barrier() { @@ -70,6 +70,11 @@ class Primitives< } } + // PAT uses a single barrier across all groups + __device__ void patBarrier() { + barrier_sync(15, NCCL_PAT_NWORKERS); + } + __device__ bool barrierAny(int vote) { if (nthreads == WARP_SIZE) { return __any_sync(~0u, vote); @@ -87,18 +92,6 @@ class Primitives< } } - inline __device__ bool checkAbort(int &spins) { - spins++; - if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - flags |= Aborted; - ncclShmem.aborted = 1; - } - spins = 0; - } - return flags & Aborted; - } - inline __device__ uint64_t loadStepValue(uint64_t* ptr) { #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 if (flags & NvlsMinPolling) { @@ -121,7 +114,7 @@ class Primitives< int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; //if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice)); } } @@ -338,13 +331,8 @@ class Primitives< peerPtr->recv[connIndex].step += steps; st_relaxed_sys_global(peerPtr->recv[connIndex].head, peerPtr->recv[connIndex].step); while (ld_volatile_global(peerPtr->recv[connIndex].tail) < peerPtr->recv[connIndex].step) { - if (spins++ == NCCL_SPINS_BEFORE_CHECK_ABORT) { - if (*ncclShmem.comm.abortFlag) { - ncclShmem.aborted = 1; - break; - } - spins = 0; - } + int abort = 0; + if (checkAbort(abort, 1, spins)) break; } } @@ -359,7 +347,7 @@ class Primitives< int spins = 0; while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) { connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + if (checkAbort(flags, Aborted, spins)) break; } void **ptrs = isSendNotRecv ? ncclShmem.groups[group].dsts : ncclShmem.groups[group].srcs; @@ -601,13 +589,13 @@ class Primitives< tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group), stepSize(stepSize_ == 0 ? ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T) : stepSize_) { - // For send operations, we need an extra warp to overlap the threadfence and the copy - this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); - int peer = -1; flags = 0; index = -1; if (mode == primsModeDefault) { // Connect to ranks in sendPeers/recvPeers + // For send operations, we need an extra warp to overlap the threadfence and the copy + this->nworkers = nthreads - (MaxSend > 0 && nthreads >= NCCL_SIMPLE_EXTRA_GROUP_IF_NTHREADS_GE ? WARP_SIZE : 0); + int nrecv=0, nsend=0; // Yes, for some template arguments this code will be unreachable. That's fine. // coverity[dead_error_line] @@ -637,68 +625,84 @@ class Primitives< if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index]; if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index]; + + // Coverity thinks that index could be -1 here but that's not actually the case. + // coverity[negative_returns:FALSE] + int sendIpcReg; + int recvIpcReg; + int sendNetReg; + int recvNetReg; + if (P2p) { + sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; + recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; + sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; + recvNetReg = p2pWork ? 
p2pWork->recvNetReg : 0; + } else { + recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; + recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; + } + + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); + + if (barrierAny(flags & NetDeviceUnpack)) { + flags |= AnyNetDeviceUnpack; + // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers + // have NetDeviceUnpack. + uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); + if (tid == 0) { + ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + } + } + + // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case + // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case + setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); + // coverity[uninit_member] => coverity thinks fan.n is not initialized } else if (mode == primsModePatRs || mode == primsModePatAg) { // Connect to all ranks +/- 2^n flags |= PatMode; - accSize = 0; + const int roles[5] = { RoleWaitRecv, RolePostRecv, RoleWaitSend, RolePostSend, RoleInput | RoleOutput }; + if (tid < 5) flags |= roles[tid]; + int nranks = ncclShmem.comm.nRanks; - int rank = ncclShmem.comm.rank; - // A thread is responsible for rank +/- 2 ^ (tid%32). That should be fine as long as rank is a 32-bits integer. - index = tid % 32; - uint32_t delta = 1 << index; - const int roles[4] = { RoleWaitRecv, RoleWaitSend, RolePostSend, RolePostRecv}; - int block = tid / 32; - if (block < 4 && delta < nranks) { - int role = roles[block]; - if (mode == primsModePatRs) { - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitSend|RolePostSend)) peer = (rank + delta) % nranks; - } else if (mode == primsModePatAg) { - if (role & (RoleWaitSend|RolePostSend)) peer = (rank - delta + nranks) % nranks; - if (role & (RoleWaitRecv|RolePostRecv)) peer = (rank + delta) % nranks; - } - flags |= role; - } else if (tid == 128) { - flags |= RoleInput | RoleOutput; // Only one will be used depending on the operation + if (tid < 32 && ((1UL<conn = ncclShmem.channel.peers[recvPeer]->recv+connIndexRecv; + peer->step = conn->step; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->tailPtr = conn->tail); + peer->headPtr = conn->head; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); + // Load send peer + int sendPeer = mode == primsModePatAg ? 
(rank - delta + nranks) % nranks : (rank + delta) % nranks; + peer = ((struct ncclPatPeer*)sendPeers)+tid; + conn = peer->conn = ncclShmem.channel.peers[sendPeer]->send+connIndexSend; + peer->step = conn->step; + peer->connFifo = conn->connFifo; + peer->buff = conn->buffs[NCCL_PROTO_SIMPLE]; + peer->stepCache = loadStepValue(peer->headPtr = conn->head); + peer->tailPtr = conn->tail; + peer->accSize = 0; + peer->connStepSize = conn->stepSize/sizeof(T); } - } - - // Coverity thinks that index could be -1 here but that's not actually the case. - // coverity[negative_returns:FALSE] - int sendIpcReg; - int recvIpcReg; - int sendNetReg; - int recvNetReg; - if (P2p) { - sendIpcReg = p2pWork ? p2pWork->sendIpcReg : 0; - recvIpcReg = p2pWork ? p2pWork->recvIpcReg : 0; - sendNetReg = p2pWork ? p2pWork->sendNetReg : 0; - recvNetReg = p2pWork ? p2pWork->recvNetReg : 0; - } else { - recvIpcReg = sendIpcReg = collWork ? collWork->regUsed : 0; - recvNetReg = sendNetReg = collWork ? collWork->netRegUsed : 0; - } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitRecv|RolePostRecv)) loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, collWork ? collWork->direct : 0, recvIpcReg, recvNetReg); - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - if (flags & (RoleWaitSend|RolePostSend)) loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, collWork ? collWork->direct : 0, sendIpcReg, sendNetReg); - - if (barrierAny(flags & NetDeviceUnpack)) { - flags |= AnyNetDeviceUnpack; - // RoleWaitRecv starts at tid=0, so this creates the bitmask of which recv peers - // have NetDeviceUnpack. - uint32_t mask = __ballot_sync(~0u, ((flags & RoleWaitRecv) && (flags & NetDeviceUnpack)) ? 1 : 0); - if (tid == 0) { - ncclShmem.groups[this->group].devicePlugin.unpack.unpackNetDeviceIndexMask = mask; + if (tid==0) { + ncclShmem.groups[group].userInput = (void*)inputBuf; + ncclShmem.groups[group].userOutput = (void*)outputBuf; + ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input } + patBarrier(); } - - // coverity[negative_returns:FALSE] => coverity thinks that index could be -1 but that's not actually the case - // coverity[var_deref_model] => coverity thinks work can dereferenced if NULL but this is not the case - setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclDevWorkCollReg*)collWork, sendIpcReg || recvIpcReg, peer); - // coverity[uninit_member] => coverity thinks fan.n is not initialized } __device__ ~Primitives() { + if (flags&PatMode) return; // Save steps for the next operation if (flags & (RolePostSend|RolePostRecv)) conn->step = step; if ((flags & NetRegMode) && (flags & RoleWaitSend)) { @@ -708,7 +712,7 @@ class Primitives< uint64_t prevStep = step - StepPerSlice; volatile ssize_t* ptr = &(connFifo[prevStep%NCCL_STEPS].size); int spins = 0; - while (*ptr != -1) if (checkAbort(spins)) break; + while (*ptr != -1) if (checkAbort(flags, Aborted, spins)) break; } if (flags & NetDeviceUnpack) { @@ -726,7 +730,7 @@ class Primitives< int spins = 0; volatile uint64_t* tail = conn->tail; volatile uint64_t* head = conn->head; - while (*tail > *head) if (checkAbort(spins)) break; + while (*tail > *head) if (checkAbort(flags, Aborted, spins)) break; } } @@ -749,7 +753,7 @@ class Primitives< if (slot) { T* exchgPtr; directBuff = (T*)outputBuf; - while (*slot != nullptr && !checkAbort(spins)); + while (*slot != nullptr && !checkAbort(flags, Aborted, spins)); if (P2p) { exchgPtr = (T*)outputBuf; } else { @@ -766,7 +770,7 @@ 
class Primitives< void* ptr; while (slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot) { @@ -785,7 +789,7 @@ class Primitives< // Wait for consumer to consume previous value before trampling it. if (slot && argSlot0 && argSlot1) { T* exchgPtr; - while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(spins)); + while ((*slot != nullptr || *argSlot0 != 0 || *argSlot1 != 0) && !checkAbort(flags, Aborted, spins)); // If there is no recv, then we are directly pulling from input buffer (e.g. directScatter) // Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend) directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf; @@ -815,7 +819,7 @@ class Primitives< void* ptr; while (slot) { ptr = *slot; - if (ptr != nullptr || checkAbort(spins)) break; + if (ptr != nullptr || checkAbort(flags, Aborted, spins)) break; } if (slot && argSlot0 && argSlot1) { @@ -826,7 +830,7 @@ class Primitives< while (true) { arg0 = *argSlot0; arg1 = *argSlot1; - if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break; + if ((arg0 != 0 && arg1 != 0) || checkAbort(flags, Aborted, spins)) break; } ncclShmem.redOpArgs[1 + index] = ((arg1 & 0xffffffff) << 32) | (arg0 & 0xffffffff); } @@ -866,8 +870,8 @@ class Primitives< __device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp); } - __device__ __forceinline__ void directRecv(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { - genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, postOp); + __device__ __forceinline__ void directRecv(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 0, -1, Output>(outIx, outIx, eltN, postOp); } __device__ __forceinline__ void directRecvCopy(intptr_t inpIx, intptr_t outIx, int eltN) { genericOp<1, 0, 1, 0, -1, Output>(inpIx, outIx, eltN, /*postOp=*/false); @@ -945,54 +949,65 @@ class Primitives< ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false); } - __device__ __forceinline__ void patReduce(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int sendStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patReduce(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 
0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + recvOffset; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; + } + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; + } + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; int spins = 0; - while (connStepCache < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + while (peer->stepCache < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; } - if (postRecv) step += StepPerSlice; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { + if (send && (flags & RoleWaitSend)) { int spins = 0; - while (connStepCache + NCCL_STEPS < step + sendStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + while (peer->stepCache + NCCL_STEPS < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + ((step+sendStepOffset)%NCCL_STEPS)*connStepSize) + sendOffset; - if (accSize < sendOffset + nelem + (step+sendStepOffset)*connStepSize) { + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; + if (peer->accSize < ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = sendOffset + nelem + (step+sendStepOffset)*connStepSize; - if (flags & ConnFifoEnabled) - connFifo[(step+sendStepOffset)%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); + ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; } else { // There is already data in there, accumulate instead of writing to it. ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; } - if (postSend) step += StepPerSlice; } - if (sendPow2 < 0 && (flags & RoleOutput)) { // Destination is our own local buffer - ncclShmem.groups[group].dsts[0] = userOutput + outIx; - if (accSize < outIx + nelem) { + long long int localAccSize = shmem->localAccSize; + if (ps->sendDim < 0 && (flags & RoleOutput)) { // Destination is our own local buffer + ncclShmem.groups[group].dsts[0] = userOutput + ps->outIx; + if (localAccSize < ps->outIx + nelem) { // New data, add our own data to it. - ncclShmem.groups[group].srcs[1] = userInput + inpIx; - accSize = outIx + nelem; + ncclShmem.groups[group].srcs[1] = userInput + ps->inpIx; + localAccSize = ps->outIx + nelem; } else { // There is already data in there, accumulate instead of writing to it. 
ncclShmem.groups[group].srcs[1] = ncclShmem.groups[group].dsts[0]; } } - barrier(); + patBarrier(); int nSrcs = 2; void** srcs = ncclShmem.groups[group].srcs; - if (recvPow2 < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source + if (ps->recvDim < 0) { srcs++; nSrcs--; } // No peer to receive from, remove one source int workSize = ncclShmem.aborted ? 0 : nelem; @@ -1000,59 +1015,92 @@ class Primitives< (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, nSrcs, srcs, 1, ncclShmem.groups[group].dsts, workSize); - barrier(); - if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); - if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + // Store conn step here inside the two barriers to make sure next reload will see the update. + if (postSend && (flags & RolePostSend)) { + if (peer->connFifo) { + peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T); + } + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); + } + if (postRecv && (flags & RolePostRecv)) { + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op + } + + // Update accSize + if (ps->sendDim < 0 && (flags & RoleOutput)) atomicMax(&shmem->localAccSize, localAccSize); + if (ps->sendDim >= 0 && (flags & RoleWaitSend)) atomicMax(&peer->accSize, ps->sendOffset + nelem + (step+ps->stepOffset)*peer->connStepSize); + + patBarrier(); + + if (postSend && (flags & RolePostSend)) { + if (nelem > 0 || peer->connFifo) fence_acq_rel_sys(); + st_relaxed_sys_global(peer->tailPtr, step); + } + if (postRecv && (flags & RolePostRecv)) { + st_relaxed_sys_global(peer->headPtr, step); + } } - __device__ __forceinline__ void patCopy(int recvPow2, int sendPow2, intptr_t inpIx, intptr_t outIx, int recvOffset, int sendOffset, int recvStepOffset, int nelem, int postRecv, int postSend) { - nelem = nelem < 0 ? 0 : nelem; + __device__ __forceinline__ void patCopy(struct ncclPatStep* ps, struct ncclPatShmem* shmem) { + if (ps->flags & PatSkipped) { patBarrier(); patBarrier(); return; } // Skipped + int nelem = ps->nelem < 0 ? 
0 : ps->nelem; T* userInput = (T*)ncclShmem.groups[group].userInput; T* userOutput = (T*)ncclShmem.groups[group].userOutput; - if (recvPow2 >= 0 && recvPow2 == index && (flags & RoleWaitRecv)) { - ncclShmem.groups[group].srcs[0] = (T*)(connEltsFifo + ((step+recvStepOffset)%NCCL_STEPS)*connStepSize) + recvOffset; + bool recv = ps->recvDim >= 0 && (flags & (RolePostRecv|RoleWaitRecv)); + bool send = ps->sendDim >= 0 && (flags & (RolePostSend|RoleWaitSend)); + bool postRecv = ps->postRecv && recv; + bool postSend = ps->postSend && send; + struct ncclPatPeer* peer = NULL; + if (recv) { + peer = shmem->recvDims+ps->recvDim; + step = peer->step; + } + if (send) { + peer = shmem->sendDims+ps->sendDim; + step = peer->step; + } + + if (recv && (flags & RoleWaitRecv)) { + ncclShmem.groups[group].srcs[0] = ((T*)peer->buff) + ((step+ps->stepOffset)%NCCL_STEPS)*peer->connStepSize + ps->recvOffset; int spins = 0; - while (connStepCache < step + recvStepOffset + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; + while (peer->stepCache < step + ps->stepOffset + StepPerSlice) { + peer->stepCache = loadStepValue(peer->tailPtr); + if (checkAbort(flags, Aborted, spins)) break; } - if (accSize < recvOffset + nelem + (step+recvStepOffset)*connStepSize) { + if (peer->accSize < ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize) { // New data, copy to our output buffer. - ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = recvOffset + nelem + (step+recvStepOffset)*connStepSize; + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; } else { ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done } - if (postRecv) step += StepPerSlice; } - if (sendPow2 >= 0 && sendPow2 == index && (flags & RoleWaitSend)) { + if (send && (flags & RoleWaitSend)) { int spins = 0; - while (connStepCache + NCCL_STEPS < step + StepPerSlice) { - connStepCache = loadStepValue(connStepPtr); - if (checkAbort(spins)) break; - } - ncclShmem.groups[group].dsts[0] = (T*)(connEltsFifo + (step%NCCL_STEPS)*connStepSize) + sendOffset; - if (postSend) { - if (flags & ConnFifoEnabled) - connFifo[step%NCCL_STEPS].size = (sendOffset + nelem)*sizeof(T); - step += StepPerSlice; + while (peer->stepCache + NCCL_STEPS < step + StepPerSlice) { + peer->stepCache = loadStepValue(peer->headPtr); + if (checkAbort(flags, Aborted, spins)) break; } + ncclShmem.groups[group].dsts[0] = ((T*)peer->buff) + (step%NCCL_STEPS)*peer->connStepSize + ps->sendOffset; } - if (recvPow2 < 0 && (flags & RoleInput)) { // Source is our own local buffer - ncclShmem.groups[group].srcs[0] = userInput + inpIx; - if (accSize < inpIx + nelem) { + long long int localAccSize = shmem->localAccSize; + if (ps->recvDim < 0 && (flags & RoleInput)) { // Source is our own local buffer + ncclShmem.groups[group].srcs[0] = userInput + ps->inpIx; + if (localAccSize < ps->inpIx + nelem) { // New data, copy to our output buffer. 
- ncclShmem.groups[group].dsts[1] = userOutput + outIx; - accSize = inpIx + nelem; + ncclShmem.groups[group].dsts[1] = userOutput + ps->outIx; + localAccSize = ps->inpIx + nelem; } else { - ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; // Already done + // Already done + ncclShmem.groups[group].dsts[1] = ncclShmem.groups[group].srcs[0]; } } - barrier(); + patBarrier(); int nDsts = 2; void** dsts = ncclShmem.groups[group].dsts; - if (sendPow2 < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest + if (ps->sendDim < 0) { dsts++; nDsts--; } // No peer to send to, remove one dest if (ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[1]) nDsts--; // In-place or already done. int workSize = ncclShmem.aborted ? 0 : nelem; @@ -1061,9 +1109,32 @@ class Primitives< (tid, nthreads, ncclShmem.redOpArgs[0], nullptr, /*postOp=*/false, 1, ncclShmem.groups[group].srcs, nDsts, dsts, workSize); - barrier(); - if (postRecv && recvPow2 >= 0 && recvPow2 == index && (flags & RolePostRecv)) postPeer<1, 0>(0 < nelem); - if (postSend && sendPow2 >= 0 && sendPow2 == index && (flags & RolePostSend)) postPeer<0, 1>(0 < nelem); + // Store conn step here inside the two barriers to make sure next reload will see the update. + if (postSend && (flags & RolePostSend)) { + if (peer->connFifo) { + peer->connFifo[step%NCCL_STEPS].size = (ps->sendOffset + nelem)*sizeof(T); + } + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); + } + if (postRecv && (flags & RolePostRecv)) { + peer->step = step += StepPerSlice; + st_relaxed_sys_global(&peer->conn->step, step); // Also save in global mem for next op + } + + // Update accSize + if (ps->recvDim < 0 && (flags & RoleInput)) atomicMax(&shmem->localAccSize, localAccSize); + if (ps->recvDim >= 0 && (flags & RoleWaitRecv)) atomicMax(&peer->accSize, ps->recvOffset + nelem + (step+ps->stepOffset)*peer->connStepSize); + + patBarrier(); + + if (postSend && (flags & RolePostSend)) { + if (nelem > 0 || peer->connFifo) fence_acq_rel_sys(); + st_relaxed_sys_global(peer->tailPtr, step); + } + if (postRecv && (flags & RolePostRecv)) { + st_relaxed_sys_global(peer->headPtr, step); + } } }; diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index 70538b117..5d8de2819 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -80,29 +80,66 @@ struct RunWorkColl struct RunWorkColl<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_PAT, NCCL_PROTO_SIMPLE> { __device__ __forceinline__ void run(int tid, int nthreads, struct ncclDevWorkColl* work) { +#if __CUDA_ARCH__ >= 600 using Proto = ProtoSimple<1, 1>; const int nranks = ncclShmem.comm.nRanks; const int rank = ncclShmem.comm.rank; size_t count, channelOffset, channelCount, chunkCount; ncclCollCbdPart(work, ncclShmem.channelId, Proto::Id, sizeof(T), &count, &channelOffset, &channelCount, &chunkCount); - T *inputBuf = (T*)work->sendbuff; - T *outputBuf = (T*)work->recvbuff; - Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims - (tid, nthreads, NULL, NULL, inputBuf, outputBuf, work->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, nullptr, nullptr, 0, primsModePatRs); + static constexpr int nworkers = NCCL_PAT_NWORKERS; + struct ncclPatShmem* shmem = (struct ncclPatShmem*)ncclScratchForWarp(0); + uint64_t pollCount = 0; + __syncthreads(); // Don't start using shared mem until everyone arrives + for (int i=tid; i<NCCL_SHMEM_PAT_STEPS; i+=nthreads) shmem->patSteps[i].flags = 0; + if (tid == 0) shmem->localAccSize = 0; + if (tid == nworkers) shmem->parallelFactor = 0; + __syncthreads(); - PatRSAlgorithm<T> patAlgo(chunkCount*sizeof(T), NCCL_STEPS, channelOffset, channelOffset +
channelCount, count, chunkCount, rank, nranks); - int last = 0; - while (!last) { - int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - patAlgo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last); - prims.patReduce(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend); + if (tid == nworkers) { // Algo computation thread + PatRSAlgorithm patAlgo(chunkCount*sizeof(T), NCCL_STEPS, NCCL_PAT_NWORKERS/WARP_SIZE, channelOffset, channelOffset + channelCount, count, chunkCount, rank, nranks); + int parallelFactor = shmem->parallelFactor = patAlgo.getParallelFactor(); + int step = 0; + while (1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) != 0) pollCount++; // Wait for workers to be done with step 'step-NCCL_SHMEM_PAT_STEPS' + patAlgo.getNextOp(ps); + int last = ps->last; + step++; + if (last == 2) break; + } + } else if (tid < nworkers) { // Worker threads + T *inputBuf = (T*)work->sendbuff; + T *outputBuf = (T*)work->recvbuff; + int parallelFactor = 0; + volatile int* pfPtr = &shmem->parallelFactor; + while (parallelFactor == 0) parallelFactor = *pfPtr; + + int groupSize = nworkers/(WARP_SIZE*parallelFactor) * WARP_SIZE; + int group = tid / groupSize; + int nGroups = nworkers / groupSize; + int tidInGroup = tid - group*groupSize; + // We don't use recvPeers/sendPeers so let's pass shmem structs instead + Primitives, 0, Proto, 0> prims + (tidInGroup, groupSize, (int*)shmem->recvDims, (int*)shmem->sendDims, inputBuf, outputBuf, work->redOpArg, group, 0, 0, nullptr, nullptr, 0, primsModePatRs); + + int step = group; + while(1) { + struct ncclPatStep* ps = shmem->patSteps+(step%NCCL_SHMEM_PAT_STEPS); + cuda::atomic_ref poll(ps->flags); + while (poll.load(cuda::memory_order_acquire) == 0) pollCount++; // Wait for compute thread + int last = ps->last; + prims.patReduce(ps, shmem); + if (tidInGroup == 0) poll.store(0, cuda::memory_order_release); // Return element to compute thread + if (last) break; + step += nGroups; + } } +#endif } }; - template struct RunWorkColl { __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { diff --git a/src/device/sendrecv.h b/src/device/sendrecv.h index fe3b9ca77..f36a511d8 100644 --- a/src/device/sendrecv.h +++ b/src/device/sendrecv.h @@ -41,7 +41,7 @@ struct RunWorkBatch (maxSharedMem-attr.sharedSizeBytes)) { - if (print++ == 0) - INFO(NCCL_INIT, "ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", - sharedMemSize, maxSharedMem-attr.sharedSizeBytes); - // Reduce requested MaxDynamicSharedMemorySize attribute - sharedMemSize = maxSharedMem - attr.sharedSizeBytes; + WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes); + return ncclSystemError; } CUDACHECKGOTO(cudaFuncSetAttribute(fn, cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), @@ -388,6 +385,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool struct ncclTaskColl* next = aggBeg->next; aggBeg->algorithm = agg.algorithm; aggBeg->protocol = agg.protocol; + if (aggBeg->protocol == NCCL_PROTO_LL) aggBeg->trafficBytes *= 4; aggBeg->nMaxChannels = agg.nMaxChannels; aggBeg->nWarps = agg.nWarps; aggBeg->devFuncId = agg.devFuncId; @@ -478,6 +476,14 @@ ncclResult_t 
ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool return ncclSuccess; } +static ncclResult_t addProfilerProxyOpIfNeeded(struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclProxyOp* op) { + int tmp = op->pattern; + op->pattern = ncclPatternProfiler; + ncclResult_t ret = addProxyOpIfNeeded(comm, plan, op); + op->pattern = tmp; + return ret; +} + static ncclResult_t scheduleCollTasksToPlan( struct ncclComm* comm, struct ncclKernelPlan* plan, struct ncclKernelPlanBudget* budget ) { @@ -550,11 +556,16 @@ static ncclResult_t scheduleCollTasksToPlan( proxyOp.opCount = proxyOpId; proxyOp.task.coll = task; proxyOp.rank = comm->rank; + proxyOp.eActivationMask = task->eActivationMask; + proxyOp.workCounter = ++comm->profiler.workCounter[c]; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); + // Set pattern to profiler to add a proxy profiler for kernel events NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOp)); } } else { // not task->isCollnet int trafficPerByte = ncclFuncTrafficPerByte(task->func, comm->nRanks); + if (task->protocol == NCCL_PROTO_LL) trafficPerByte *= 4; size_t cellSize = divUp(divUp(MinTrafficPerChannel, (size_t)trafficPerByte), 16) * 16; int elementsPerCell = cellSize/elementSize; size_t cells = divUp(task->count*elementSize, cellSize); @@ -669,11 +680,14 @@ static ncclResult_t scheduleCollTasksToPlan( } proxyOp->ringAlgo->incRefCount(); } + proxyOp->eActivationMask = task->eActivationMask; + proxyOp->workCounter = ++comm->profiler.workCounter[c]; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. // coverity[uninit_use_in_call:FALSE] NCCLCHECK(addProxyOpIfNeeded(comm, plan, proxyOp)); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, proxyOp)); } } @@ -797,7 +811,8 @@ static ncclResult_t addP2pToPlan( if (protocol[dir] == NCCL_PROTO_LL) chunkSize[dir] *= 2; if (network[dir]) { - if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (ncclPxnDisable(comm) || !comm->isAllNvlink)) { + bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1; + if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) { int regFlag = 0; NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax)); for (int part = 0; part < nChannelsMax; part++) { @@ -888,6 +903,7 @@ static ncclResult_t addP2pToPlan( op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; op->task.p2p = p2pTasks[dir]; op->rank = comm->rank; + op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0; // The following are modified per channel part in addWorkToChannels(): // op->buffer, op->nbytes, op->nsteps = ...; } @@ -898,7 +914,6 @@ static ncclResult_t addP2pToPlan( plan->channelMask |= uint64_t(1)<nSendChannels : work->nRecvChannels; @@ -935,9 +950,12 @@ static ncclResult_t addP2pToPlan( // equal one plus the batch index this p2p settled in. 
proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; + proxyOps[dir].workCounter = comm->profiler.workCounter[channelId]+1; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); + NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir])); } } + comm->profiler.workCounter[channelId] += (proxyOps[0].nsteps || proxyOps[1].nsteps) ? 1 : 0; } return ncclSuccess; @@ -1157,22 +1175,23 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla struct uploadWork_cleanup_t* cleanup = nullptr; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; void* fifoBufDev = nullptr; + cudaStream_t deviceStream; + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), result, fail); - // Acquire deviceStream to gain access to deviceStream.cudaStream. Since the - // user's graph will be launched later, and it also acquires the deviceStream, - // it will observe this upload. - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), result, fail); + // Acquire deviceStream. Since the user's graph will be launched later and it also + // acquires the deviceStream, it will observe this upload. + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, fail); - CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMallocAsync(&fifoBufDev, workBytes, comm->memPool, deviceStream), result, fail); plan->workBufPersistent = fifoBufDev; plan->kernelArgs->workBuf = fifoBufDev; // coverity[uninit_use_in_call:FALSE] => fifoBufHost is never NULL - CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaMemcpyAsync(fifoBufDev, fifoBufHost, workBytes, cudaMemcpyDefault, deviceStream), result, fail); cudaEvent_t memcpyDone; CUDACHECKGOTO(cudaEventCreateWithFlags(&memcpyDone, cudaEventDisableTiming), result, fail); - CUDACHECKGOTO(cudaEventRecord(memcpyDone, comm->sharedRes->deviceStream.cudaStream), result, fail); + CUDACHECKGOTO(cudaEventRecord(memcpyDone, deviceStream), result, fail); NCCLCHECKGOTO(ncclCalloc(&cleanup, 1), result, fail); cleanup->base.fn = uploadWork_cleanup_fn; @@ -1180,7 +1199,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla cleanup->hostBuf = fifoBufHost; ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream), result, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail); NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail); finish_scope: @@ -1254,14 +1273,15 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) { if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } - if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->noncapturedRefs); + if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs); return; } static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* me) { struct ncclKernelPlan* plan = (struct ncclKernelPlan*)me; // cast from first member `reclaim` if (plan->persistent) { - 
comm->persistentRefs -= 1; + comm->sharedRes->persistentRefs -= 1; + comm->localPersistentRefs -= 1; if (plan->workStorageType == ncclDevWorkStorageTypePersistent) { cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); @@ -1317,6 +1337,28 @@ static void persistentDestructor(void* plans_) { } } +NCCL_PARAM(LaunchOrderImplicit, "LAUNCH_ORDER_IMPLICIT", 0); + +namespace { + enum ncclImplicitOrder { + ncclImplicitOrderNone, + ncclImplicitOrderSerial, + ncclImplicitOrderLaunch + }; +} + +static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) { + if (ncclParamLaunchOrderImplicit()) { + // Due to an unresolved bug in CUDA ncclImplicitOrderLaunch is not supported in graphs + if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } + if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); } + *mode = 12030 <= std::min(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial; + return ncclSuccess; + } + *mode = ncclImplicitOrderNone; + return ncclSuccess; +} + ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; @@ -1364,58 +1406,60 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { if (nPlans == 0) return ncclSuccess; - // Semantically we want these dependencies for the kernels launched: - // 1. Launch host task on hostStream. - // 2. Launch kernel, depends on all of {deviceStream, hostStream, userStream[i]...} - // 3. {deviceStream, userStream[i]...} depend on kernel. - // We achieve this by: - // 1. userStream[0] waits on deviceStream - // 2. deviceStream waits on each of userStream[1...] - // 3. host task launch on hostStream - // 4. userStream[0] waits on hostStream - // 5. kernel launch on userStream[0] - // 6. deviceStream waits on userStream[0] - // 7. userStream[1...] each waits on deviceStream - // The two-level fan-in fan-out is because ncclStrongStreamWaitStream() requires - // at least one of the two streams to be strong-stream. cudaStream_t launchStream = planner->streams->stream; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream), result, failure); + cudaStream_t deviceStream, launchOrder; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), result, failure); - // Create dependency for device stream on user streams. First from extra user - // streams to deviceStream. Then deviceStream to first user stream. + // userStream[0] waits on each userStream[i]... for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, l->stream), result, failure); + CUDACHECKGOTO(cudaEventRecord(comm->sharedRes->scratchEvent, l->stream), result, failure); + CUDACHECKGOTO(cudaStreamWaitEvent(launchStream, comm->sharedRes->scratchEvent, 0), result, failure); + } + // userStream[0] waits on deviceStream + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, deviceStream, comm->sharedRes->scratchEvent), result, failure); + + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure); + + if (implicitOrder != ncclImplicitOrderNone) { + // userStream[0] waits on per-device (context) launchOrder. 
Concurrent strong stream access is + // required if this is a graph capture, non-captured cannot be concurrent because that would violate + // deterministic program order of launches. + bool concurrent = capturing; + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure); } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->deviceStream), result, failure); - if (persistent || comm->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->noncapturedRefs, __ATOMIC_ACQUIRE)) { + if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) { // We have to launch host tasks to push proxy args. We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. bool acquired = false; + cudaStream_t hostStream; for (struct ncclKernelPlan* plan=planHead; plan != nullptr; plan = plan->next) { if (plan->hasProxyOps) { if (!acquired) { acquired = true; - NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure); } - if (!persistent) ncclAtomicRefCountIncrement(&comm->noncapturedRefs); + if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs); plan->isHostCbEnq = true; - NCCLCHECKGOTO(ncclStrongStreamLaunchHost(planner->capturingGraph, &comm->sharedRes->hostStream, hostStreamPlanCallback, plan), result, failure); + CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure); } } if (acquired) { // Make to-be-launched kernels dependent on just-launched host stream tasks. 
- NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, launchStream, &comm->sharedRes->hostStream), result, failure); - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream), result, failure); + NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, hostStream, comm->sharedRes->scratchEvent), result, failure); + NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false), result, failure); } } if (persistent) { - comm->persistentRefs += nPlans; + comm->sharedRes->persistentRefs += nPlans; + comm->localPersistentRefs += nPlans; NCCLCHECKGOTO(ncclCudaGraphAddDestructor(planner->capturingGraph, persistentDestructor, (void*)planHead), result, failure); } } - failure: return result; } @@ -1434,6 +1478,7 @@ NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { + ncclResult_t ret = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; int nChannels = countOneBits(plan->channelMask); void* sym = plan->kernelFn; @@ -1447,18 +1492,19 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan CU_LAUNCH_PARAM_END }; + int driverVersion; + NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, do_return); + CUfunction fn; - CUDACHECK(cudaGetFuncBySymbol(&fn, sym)); + CUDACHECKGOTO(cudaGetFuncBySymbol(&fn, sym), ret, do_return); + if (CUDART_VERSION >= 11080 && driverVersion >= 11080) { #if CUDART_VERSION >= 11080 - int driverVersion; - NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); - if (driverVersion >= 11080) { int compCap = comm->compCap; unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; - CUlaunchAttribute launchAttrs[3]; + CUlaunchAttribute launchAttrs[4] = {}; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we @@ -1485,6 +1531,17 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchAttrs[attrs++].value.memSyncDomain = (CUlaunchMemSyncDomain) ncclParamMemSyncDomain(); } #endif + #if CUDART_VERSION >= 12030 + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return); + if (implicitOrder == ncclImplicitOrderLaunch) { + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT; + launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent; + launchAttrs[attrs].value.launchCompletionEvent.flags = 0; + attrs++; + } + #endif launchConfig.gridDimX = grid.x; launchConfig.gridDimY = grid.y; launchConfig.gridDimZ = grid.z; @@ -1496,15 +1553,15 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchConfig.numAttrs = attrs; launchConfig.hStream = launchStream; - //CUDACHECK(cudaLaunchKernelExC(&launchConfig, fnAddr, args)); - CUCHECK(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra)); - return ncclSuccess; - } + CUCHECKGOTO(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra), ret, do_return); #endif - // Standard kernel launch - CUCHECK(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra)); - //CUDACHECK(cudaLaunchKernel(fnAddr, grid, block, args, smem, launchStream)); - return ncclSuccess; + } else { + // Standard kernel launch + 
CUCHECKGOTO(cuLaunchKernel(fn, grid.x, grid.y, grid.z, block.x, block.y, block.z, smem, launchStream, nullptr, extra), ret, do_return); + } + +do_return: + return ret; } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { @@ -1524,34 +1581,39 @@ ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKern } ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { - ncclResult_t result = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; - if (!ncclIntruQueueEmpty(&planner->planQueue)) { // Reset queue to empty without destroying plans since those will be sent // back to us for reclaiming via callbackQueue. ncclIntruQueueConstruct(&planner->planQueue); + cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch - // Create dependency for deviceStream on launchStream. We know that deviceStream - // hasn't been modified since launchStream waited on it (in ncclLaunchPrepare), - // so we can say that launchStream subsumes it. - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, &comm->sharedRes->deviceStream, launchStream, /*b_subsumes_a=*/true), result, resume1); - resume1: - // Create dependency for other user streams (skip launch stream) on deviceStream. - // Again, the user streams haven't been touched since deviceStream waited on them - // so we can say they are subsumed by deviceStream. - struct ncclCudaStreamList* sl = planner->streams->next; - planner->streams = nullptr; // Reset comm->planner.streams to empty. - while (sl != nullptr) { - NCCLCHECKGOTO(ncclStrongStreamWaitStream(planner->capturingGraph, sl->stream, &comm->sharedRes->deviceStream, /*b_subsumes_a=*/true), result, resume2); - resume2: - sl = sl->next; + cudaStream_t deviceStream, launchOrder; + CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream)); + // deviceStream waits on userStream[0] + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); + CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + // Each userStream[i] waits on userStream[0] + for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { + CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0)); } - // Release device stream as acquired in ncclLaunchPrepare() - NCCLCHECKGOTO(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream), result, resume3); - resume3:; + bool capturing = ncclCudaGraphValid(planner->capturingGraph); + enum ncclImplicitOrder implicitOrder; + NCCLCHECK(getImplicitOrder(&implicitOrder, capturing)); + if (implicitOrder != ncclImplicitOrderNone) { + // As in ncclLaunchPrepare, strong stream can be non-concurrent when non-captured. + bool concurrent = capturing; + // Incorporate launch event into per-device (context) launch order. + NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder)); + // If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution). + CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? 
comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent)); + // Release launchOrder as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent)); + } + // Release deviceStream as acquired in ncclLaunchPrepare() + NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false)); } - return result; + return ncclSuccess; } /*****************************************************************************/ @@ -1655,11 +1717,11 @@ static ncclResult_t topoGetAlgoInfo( if (info->algorithm == NCCL_ALGO_UNDEF || info->protocol == NCCL_PROTO_UNDEF) { char ncclAlgoEnvStr[1024] = ""; char ncclProtoEnvStr[1024] = ""; - char* algoEnv = getenv("NCCL_ALGO"); + const char* algoEnv = ncclGetEnv("NCCL_ALGO"); if (algoEnv) { snprintf(ncclAlgoEnvStr, 1023, " NCCL_ALGO was set to %s.", algoEnv); } - char* protoEnv = getenv("NCCL_PROTO"); + const char* protoEnv = ncclGetEnv("NCCL_PROTO"); if (protoEnv) { snprintf(ncclProtoEnvStr, 1023, " NCCL_PROTO was set to %s.", protoEnv); } @@ -2007,7 +2069,7 @@ static ncclResult_t hostToDevRedOp( uint64_t allBits = uint64_t(-1)>>(64-nbits); uint64_t signBit = allBits^(allBits>>1); bool datatype_signed = false; - + switch (int(op)) { case ncclSum: opFull->op = ncclDevSum; break; case ncclProd: opFull->op = ncclDevProd; break; @@ -2097,6 +2159,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { p2p->datatype = info->datatype; p2p->root = info->root; p2p->bytes = nBytes; + p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); ncclIntruQueueEnqueue( isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, p2p); @@ -2105,6 +2168,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { // Mark channels that need pre-connect if (comm->rank != peer) { if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { + // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway. (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; int round = 0; while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank @@ -2115,12 +2179,17 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); if (isSendNotRecv) { - if (comm->channels[channelId].peers[peer]->send[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector + // the send/recv connector is shared among split shared comms. We need to set hasSeen to + // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split + // shared comms together. 
+ comm->channels[channelId].peers[peer]->send[1].hasSeen = 1; comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].connected == 0) { // P2P uses only 1 connector + if (comm->channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector + comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1; comm->connectRecv[peer] |= (1UL<opDev = opDev; // C++ struct assignment t->chunkSteps = info->chunkSteps; t->sliceSteps = info->sliceSteps; + t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); planner->nTasksColl += 1; ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 64fc1c5dd..76b508c2d 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -390,7 +390,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail); // Alternate rings to avoid crossing rails - if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) { + if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) { for (int r=0; rnRanks; r++) { if (comm->rankToNode[r] % 2 == 1) { // Exchange rings diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 587a8b282..ace4476f6 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -376,9 +376,12 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; +const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" }; -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, int* useGdr) { - *useGdr = 0; +NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0); + +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) { + *gdrMode = ncclTopoGdrModeDisable; // Get GPU and NET int n, g; @@ -418,25 +421,37 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n int distance = gpu->paths[NET][n].type; if (distance == PATH_PXN) { // In case of PXN, use the intermediate GPU distance instead - int proxyRank, g; + int proxyRank; NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank)); NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); - struct ncclTopoNode* proxyGpu = system->nodes[GPU].nodes+g; - distance = proxyGpu->paths[NET][n].type; + gpu = system->nodes[GPU].nodes+g; + distance = gpu->paths[NET][n].type; } + + int c; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) { + // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs + INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c); + distance = PATH_C2C; + } + if (distance > netGdrLevel) { INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); return ncclSuccess; } - *useGdr = 1; - INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d", rank, netId, distance, netGdrLevel, read); + // Force PCIe mapping if path goes through PCI on a C2C system + if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci; + else *gdrMode = ncclTopoGdrModeDefault; + + INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d 
mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]); return ncclSuccess; } ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail) { int netNum = system->nodes[NET].count; - int useGdr = 0; + enum ncclTopoGdrMode useGdr = ncclTopoGdrModeDisable; *avail = false; for (int n = 0; n < netNum; n++) { int64_t netId = system->nodes[NET].nodes[n].id; @@ -469,6 +484,14 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier if (gpu->gpu.cudaCompCap >= 90) *flush = 0; + // On C2C platforms, data could go through a PCI switch while completions and + // flags would go through C2C. In that case, force a flush. + int c, n; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + if (gpu->paths[NET][n].type <= PATH_PXB && gpu->paths[CPU][c].type == PATH_C2C) { + *flush = 1; + } return ncclSuccess; } @@ -538,7 +561,7 @@ NCCL_PARAM(PxnDisable, "PXN_DISABLE", 0); int ncclPxnDisable(struct ncclComm* comm) { static int pxnDisable = -1; if (pxnDisable == -1) { - if (comm && ncclNetVersion(comm) == 4) { + if (comm && comm->ncclNetVer == 4) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { @@ -561,9 +584,9 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int proxyRank; NCCLCHECK(ncclTopoGetNetDev(comm, comm->rank, NULL, 0, rank, &netId, NULL, &proxyRank)); if (proxyRank == comm->rank) continue; - int useGdr; + enum ncclTopoGdrMode useGdr; NCCLCHECK(ncclTopoCheckGdr(comm->topo, comm->rank, netId, 1, &useGdr)); - if (useGdr == 0) continue; + if (useGdr == ncclTopoGdrModeDisable) continue; int found = 0; for (int r=0; rpaths[NET][n].type < PATH_PHB) { // Update path when we dont want to / can't use GPU Direct RDMA. - int gdr; + enum ncclTopoGdrMode gdr; NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].gpu.rank, netNode->id, 0, &gdr)); if (gdr == 0) { // We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU @@ -862,3 +885,38 @@ ncclResult_t ncclTopoPathAllNVLink(struct ncclTopoSystem* system, int* allNvLink *allNvLink = maxPath >= PATH_PIX ? 0 : 1; return ncclSuccess; } + +// Check whether we are in a split NVLink situation, with two NVLink domains, not +// connected through NVLink (e.g. QPI). +ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink) { + ncclResult_t res = ncclSuccess; + int nvlDomains = 0; + int *nvlDomain = NULL, *nvlDomainCount = NULL; + // Compute NVLink domains + NCCLCHECKGOTO(ncclCalloc(&nvlDomain, system->nodes[GPU].count), res, exit); + for (int g=0; gnodes[GPU].count; g++) nvlDomain[g] = g; + for (int g=0; gnodes[GPU].count; g++) { + struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; + int domain = nvlDomain[g]; + for (int p=g+1; pnodes[GPU].count; p++) { + if (gpu->paths[GPU][p].type == PATH_NVL) { + nvlDomain[p] = domain; + } + } + } + // Compute number of GPUs per NVLink domain. + NCCLCHECKGOTO(ncclCalloc(&nvlDomainCount, system->nodes[GPU].count), res, exit); + for (int g=0; gnodes[GPU].count; g++) { + nvlDomainCount[nvlDomain[g]]++; + } + // Count the number of NVLink domains + for (int g=0; gnodes[GPU].count; g++) { + if (nvlDomainCount[g] > 1) nvlDomains++; + } + *splitNvLink = nvlDomains == 2 ? 
1 : 0; + +exit: + if(nvlDomain) free(nvlDomain); + if(nvlDomainCount) free(nvlDomainCount); + return res; +} diff --git a/src/graph/search.cc b/src/graph/search.cc index 0185b3f7b..15a01243f 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -446,12 +446,11 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. add other NETs satisfying typeInter but not already in the list. -ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int* nets, int* netCountRet) { +ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; int netCount = 0; int localNetCount; - int* localNets; - NCCLCHECK(ncclCalloc(&localNets, MAXCHANNELS)); + int localNets[MAXCHANNELS]; // First add the preferred NICs for (int g=0; gnodes[GPU].count; g++) { @@ -460,8 +459,8 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; for (int c = 0; cgpu.rank, c, &netId, NULL), ret, fail); - NCCLCHECKGOTO(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount), ret, fail); + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; localNetCount++; } @@ -469,7 +468,7 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in for (int i=0; iintra[graph->nChannels*ngpus+step] = gpu->gpu.rank; int g = gpu - system->nodes[GPU].nodes; - int* nets = NULL; + int nets[NCCL_TOPO_MAX_NODES]; if (step == backToNet) { // first get back to NIC if (system->nodes[NET].count) { @@ -533,8 +527,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; int netCount; - NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); - NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount), ret, fail); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount)); for (int i=0; inodes[NET].nodes+n; @@ -555,14 +548,14 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo graph->bwInter /= 2; } - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { graph->inter[graph->nChannels*2+1] = net->id; - NCCLCHECKGOTO(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time), ret, fail); + NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, step, nextBackToNet, backToFirstRank, forcedOrder, time)); if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) graph->bwInter /= 2; - NCCLCHECKGOTO(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net), ret, fail); + NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, -1, &net)); graph->bwInter = bwInterSave; } } @@ -601,21 +594,15 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo // Next path NCCLCHECK(ncclTopoSearchRecGpu(system, graph, saveGraph, gpu, ngpus, -1, -1, forcedOrder, time)); } 
-exit: - if (nets) free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; } ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) { - ncclResult_t ret = ncclSuccess; const int bw = graph->bwInter; - int* nets; - NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + int nets[NCCL_TOPO_MAX_NODES]; int netCount; int graphFound = 0; - NCCLCHECKGOTO(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount), ret, fail); + NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, -1, nets, &netCount)); for (int i=0; ipattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) && graphFound) break; int n = nets[(graph->nChannels+i)%netCount]; @@ -639,7 +626,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo // NVLS search only tries to find NIC:GPU combinations to compute the heads. if (graph->nChannels < netCount) { int gpu; - NCCLCHECKGOTO(ncclTopoGetLocalGpu(system, net->id, &gpu), ret, fail); + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); if (gpu != -1) { int duplicate = 0; // check whether there is duplicate head when one GPU connects with multiple NICs @@ -650,7 +637,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } if (!duplicate) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, gpu)); graphFound = 1; } } @@ -659,14 +646,14 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (graph->nChannels > 0) { // Try to replay the last channel int g; - NCCLCHECKGOTO(ncclTopoReplayGetGpu(system, graph, -1, &g), ret, fail); - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); } if (graph->nChannels == 0 || graph->sameChannels == 0) { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, 0)); if (t == -1) *time = -1; } @@ -686,7 +673,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo for (int i=0; inodes[GPU].count; i++) { int g = (graph->nChannels+i)%system->nodes[GPU].count; if (paths[g].bw == maxBw && paths[g].count == minHops) { - NCCLCHECKGOTO(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g), ret, fail); + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); } } } @@ -700,11 +687,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } -exit: - free(nets); - return ret; -fail: - goto exit; + return ncclSuccess; } /* Search Patterns @@ -999,6 +982,15 @@ ncclResult_t 
ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph graph->minChannels = graph->maxChannels; } + int splitNvLink; + NCCLCHECK(ncclTopoSplitNvLink(system, &splitNvLink)); + if (graph->pattern == NCCL_TOPO_PATTERN_RING && splitNvLink) { + // We have two sockets with NVLink and a slower link in between (typically QPI). + // Tree is likely going to work better but it needs at least 2 channels. + // Since Tree needs to have the same number of channels as Ring, also force Ring to use 2 channels. + if (graph->maxChannels >= 2 && graph->minChannels == 1) graph->minChannels = 2; + } + struct ncclTopoGraph tmpGraph; memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index ba82cafb7..9499f396d 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -22,8 +22,8 @@ #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; -const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "PCI", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -45,7 +45,7 @@ ncclResult_t pciPathToInt64(char* path, int offset, int minOffset, int64_t* id) return ncclSuccess; } -static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu) { +static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode** cpu, struct ncclTopoNode* from) { *cpu = NULL; if (node->type == CPU) { *cpu = node; @@ -54,9 +54,10 @@ static ncclResult_t findLocalCpu(struct ncclTopoNode* node, struct ncclTopoNode* for (int l=0; lnlinks; l++) { // Go up the PCI tree to find the CPU. Follow only PCI switches. if (node->links[l].type == LINK_PCI + && node->links[l].remNode != from && (node->links[l].remNode->type == PCI || node->links[l].remNode->type == CPU)) { - NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu)); + NCCLCHECK(findLocalCpu(node->links[l].remNode, cpu, node)); } if (*cpu != NULL) return ncclSuccess; } @@ -77,13 +78,17 @@ static ncclResult_t ncclTopoGetInterCpuBw(struct ncclTopoNode* cpu, float* bw) { return ncclSuccess; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_BW : QPI_BW; + *bw = + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_ERP ? ERP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SRP ? SRP_QPI_BW : + cpu->cpu.model == NCCL_TOPO_CPU_MODEL_INTEL_SKL ? SKL_QPI_BW : + BDW_QPI_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) { *bw = AMD_BW; } if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { - *bw = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_BW : ZPI_BW; + *bw = cpu->cpu.model == NCCL_TOPO_CPU_MODEL_YONGFENG ? 
YONGFENG_ZPI_BW : ZPI_BW; } return ncclSuccess; } @@ -511,12 +516,16 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); - cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; + cpu->cpu.model = + (familyId == 6 && modelId >= 0xCF) ? NCCL_TOPO_CPU_MODEL_INTEL_ERP : + (familyId == 6 && modelId >= 0x8F) ? NCCL_TOPO_CPU_MODEL_INTEL_SRP : + (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_MODEL_INTEL_SKL : + NCCL_TOPO_CPU_MODEL_INTEL_BDW; } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { int familyId, modelId; NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); - if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG; + if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_MODEL_YONGFENG; } } for (int s=0; snSubs; s++) { @@ -565,7 +574,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* NCCLCHECK(ncclTopoGetNode(system, &remote, GPU, NCCL_TOPO_ID(systemId, busId))); } else if (targetType == CPU) { // NVL connection to the local CPU - NCCLCHECK(findLocalCpu(gpu, &remote)); + NCCLCHECK(findLocalCpu(gpu, &remote, NULL)); } else { if (system->nodes[NVS].count == 0) { NCCLCHECK(ncclTopoCreateNode(system, &remote, NVS, 0)); @@ -642,10 +651,10 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys NCCLCHECK(xmlGetAttrInt(node, "bw", &bw)); double c2cBw = (bw*count)/1000.0; struct ncclTopoNode* cpu = NULL; - NCCLCHECK(findLocalCpu(gpu, &cpu)); + NCCLCHECK(findLocalCpu(gpu, &cpu, NULL)); if (cpu == NULL) return ncclSuccess; - NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_NVL, c2cBw)); - NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_NVL, c2cBw)); + NCCLCHECK(ncclTopoConnectNodes(gpu, cpu, LINK_C2C, c2cBw)); + NCCLCHECK(ncclTopoConnectNodes(cpu, gpu, LINK_C2C, c2cBw)); } else { if (strcmp(node->name, "cpu") == 0) { NCCLCHECK(ncclGetSystemId(system, node, &systemId)); @@ -961,26 +970,31 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (* // Trigger the merge, then get the new device's properties int vDevIndex = 0; ncclResult_t ret = makeVDevice(&vDevIndex, vProps); - if (ret == ncclInvalidUsage) { - WARN("TOPO/NET : Tried merging multiple devices together and failed. Try setting NCCL_NET_MERGE_LEVEL=LOC"); - NCCLCHECK(ret); + if (ret != ncclSuccess) { + INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. 
Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.", + vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]); + return ret; } INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex); return ncclSuccess; } -ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { + ncclResult_t ret = ncclSuccess; INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); + char* ncStr; + NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1)); + strcpy(ncStr, str); char* semi_token; - char* semi = strtok_r(str, ";", &semi_token); + char* semi = strtok_r(ncStr, ";", &semi_token); while (semi) { TRACE(NCCL_NET, "Fusing %s", semi); struct netIf userIfs[NCCL_NET_MAX_DEVS_PER_NIC]; int nUserIfs = parseStringList(semi, userIfs, NCCL_NET_MAX_DEVS_PER_NIC); if (nUserIfs == 0) { INFO(NCCL_NET, "NET/IB : Invalid NCCL_NET_FORCE_MERGE specified %s. Couldn't parse substring %s. Please provide a semicolon-delimited list of comma-delimited NIC groups.", - str, semi); + ncStr, semi); continue; } @@ -994,26 +1008,37 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, char* str, if (vProps.ndevs != nUserIfs) { WARN("TOPO/NET : Only matched %d devices, %d requested from %s", vProps.ndevs, nUserIfs, semi); - return ncclInvalidUsage; + ret = ncclInvalidUsage; + goto fail; } if (vProps.ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { WARN("Specified fused NIC %s which has too many devices (%d). Max %d", semi, vProps.ndevs, NCCL_NET_MAX_DEVS_PER_NIC); - return ncclInvalidUsage; + ret = ncclInvalidUsage; + goto fail; } struct ncclXmlNode* netNode; - NCCLCHECK(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice)); - - // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) - for (int i = 0; i < vProps.ndevs; i++) { - placedDevs[vProps.devs[i]] = 1; + ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + if (ret == ncclSuccess) { + // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) + for (int i = 0; i < vProps.ndevs; i++) { + placedDevs[vProps.devs[i]] = 1; + } + } else { + WARN("TOPO/NET : Could not force merge NICs %s. Please specify a valid NCCL_NET_FORCE_MERGE string.", semi); + ret = ncclInvalidUsage; + goto fail; } semi = strtok_r(NULL, ";", &semi_token);; } - return ncclSuccess; +exit: + free(ncStr); + return ret; +fail: + goto exit; } ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { @@ -1061,7 +1086,24 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe } struct ncclXmlNode* netNode; - NCCLCHECKGOTO(ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice), res, out); + ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + + // Merging failed. 
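Editorial note on the force-merge path above: NCCL_NET_FORCE_MERGE takes a semicolon-delimited list of comma-delimited NIC groups, and each group is handed to parseStringList() before being turned into one fused device. Below is a minimal standalone sketch of that string format only; the device names are hypothetical and the loop is illustrative, not the actual parseStringList() used by the patch.

    // Hypothetical example: NCCL_NET_FORCE_MERGE="mlx5_0,mlx5_1;mlx5_2,mlx5_3"
    // asks for two fused NICs of two physical devices each.
    #include <cstdio>
    #include <cstring>

    int main() {
      char spec[] = "mlx5_0,mlx5_1;mlx5_2,mlx5_3";
      char* semiSave = nullptr;
      for (char* group = strtok_r(spec, ";", &semiSave); group != nullptr;
           group = strtok_r(nullptr, ";", &semiSave)) {
        std::printf("fused NIC:");
        char* commaSave = nullptr;
        for (char* dev = strtok_r(group, ",", &commaSave); dev != nullptr;
             dev = strtok_r(nullptr, ",", &commaSave))
          std::printf(" %s", dev);
        std::printf("\n");
      }
      return 0;
    }

As the surrounding checks enforce, each group must resolve to at most NCCL_NET_MAX_DEVS_PER_NIC physical devices, otherwise the whole string is rejected as invalid usage.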
+ // Mark all as unplaced and increase their distance to disconnected (PATH_DIS) + // Set i to 0 to restart the automatic merging process and ensure all are placed + if (ret != ncclSuccess) { + INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "Marking physical devices as unplaced, increasing distance and restarting search."); + placedDevs[i] = 0; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, keeping distance -> self as PATH_LOC", i); + for (int k = 1; k < vProps.ndevs; k++) { + int dev = vProps.devs[k]; + placedDevs[dev] = 0; + paths[i*nPhysDevs + dev] = PATH_DIS; + paths[dev*nPhysDevs + i] = PATH_DIS; + TRACE(NCCL_GRAPH, "Setting dev %d as unplaced, setting distance -> %d as PATH_DIS", dev, i); + } + i = 0; + } } } @@ -1125,16 +1167,16 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ // By default, don't merge any devices int mergeLevel; mergeLevel = PATH_PORT; - char* mergeLevelEnv; - mergeLevelEnv = getenv("NCCL_NET_MERGE_LEVEL"); - if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); - char* forceMerge; - forceMerge = getenv("NCCL_NET_FORCE_MERGE"); - NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); - memset(placedDevs, 0, sizeof(int)*physicalDevs); - - if (forceMerge) { - NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + { // Avoids warnings related to jumping to "out" + const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); + if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); + const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); + NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); + memset(placedDevs, 0, sizeof(int)*physicalDevs); + + if (forceMerge) { + NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + } } NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); diff --git a/src/graph/topo.h b/src/graph/topo.h index 2be029b88..921a7f5d6 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,9 +18,11 @@ #define SM86_NVLINK_BW 12.0 #define SM100_NVLINK_BW 40.0 #define PCI_BW 12.0 // PCI Gen3 x16 -#define QPI_BW 6.0 #define AMD_BW 16.0 +#define BDW_QPI_BW 6.0 #define SKL_QPI_BW 10.0 +#define SRP_QPI_BW 22.0 +#define ERP_QPI_BW 40.0 #define ZPI_BW 6.0 #define YONGFENG_ZPI_BW 9.0 #define P9_BW 32.0 @@ -44,12 +46,13 @@ extern const char* topoNodeTypeStr[]; #define LINK_LOC 0 #define LINK_NVL 1 // Skipping 2 for PATH_NVB -#define LINK_PCI 3 -// Skipping 4 for PATH_PXB -// Skipping 5 for PATH_PXN -// Skipping 6 for PATH_PHB -#define LINK_SYS 7 -#define LINK_NET 8 +#define LINK_C2C 3 +#define LINK_PCI 4 +// Skipping 5 for PATH_PXB +// Skipping 6 for PATH_PXN +// Skipping 7 for PATH_PHB +#define LINK_SYS 8 +#define LINK_NET 9 extern const char* topoLinkTypeStr[]; // Local (myself) @@ -61,29 +64,32 @@ extern const char* topoLinkTypeStr[]; // Connection through NVLink using an intermediate GPU #define PATH_NVB 2 +// Connection through C2C +#define PATH_C2C 3 + // Connection traversing at most a single PCIe bridge -#define PATH_PIX 3 +#define PATH_PIX 4 // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) -#define PATH_PXB 4 +#define PATH_PXB 5 // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. 
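A small editorial sanity snippet (not part of the patch) for the LINK_*/PATH_* renumbering in this topo.h hunk: the values are indices into the topoLinkTypeStr[]/topoPathTypeStr[] tables updated earlier in this patch, so inserting C2C at 3 shifts every later entry by one.

    #include <cassert>
    #include <cstring>

    int main() {
      // topoPathTypeStr[] after this patch (copied from the src/graph/topo.cc hunk above).
      const char* pathStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB",
                                "PXN", "PHB", "SYS", "NET", "DIS" };
      const int PATH_C2C = 3, PATH_PIX = 4, PATH_DIS = 10;  // values after this patch
      assert(std::strcmp(pathStr[PATH_C2C], "C2C") == 0);
      assert(std::strcmp(pathStr[PATH_PIX], "PIX") == 0);   // PIX used to be index 3
      assert(std::strcmp(pathStr[PATH_DIS], "DIS") == 0);
      return 0;
    }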
-#define PATH_PXN 5 +#define PATH_PXN 6 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) -#define PATH_PHB 6 +#define PATH_PHB 7 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) -#define PATH_SYS 7 +#define PATH_SYS 8 // Connection through the network -#define PATH_NET 8 +#define PATH_NET 9 // New type of path which should precede PATH_PIX #define PATH_PORT PATH_NVL // Disconnected -#define PATH_DIS 9 +#define PATH_DIS 10 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -103,9 +109,6 @@ struct ncclTopoLinkList { int type; }; -#define NCCL_TOPO_CPU_INTEL_BDW 1 -#define NCCL_TOPO_CPU_INTEL_SKL 2 - #define NCCL_TOPO_UNDEF (-1) #define NCCL_TOPO_ID_LOCAL_ID_MASK 0x00ffffffffffffff @@ -176,6 +179,7 @@ ncclResult_t ncclTopoLoadSystem(const char* xmlTopoFile, struct ncclTopoSystem* ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank, int64_t netId, int* intermediateRank); ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* min); ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); +ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 8da4aeb9e..68085b893 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -177,6 +177,7 @@ static const double perChMaxTreeBws[][3] = { NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); static int ncclPatEnable(struct ncclComm* comm) { int patEnable = ncclParamPatEnable(); + if (comm->minCompCap < 60) return 0; // Need SM60 or higher for CUDA atomics if (patEnable != 2) return patEnable; if (comm->nNodes != comm->nRanks) return 0; // PAT only supports 1 GPU per node if (comm->netDeviceType != NCCL_NET_DEVICE_HOST) return 0; // PAT doesn't support net device offload @@ -257,7 +258,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw); if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 
7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw); - if (a == NCCL_ALGO_TREE && graphs[a]->pattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; + if (a == NCCL_ALGO_TREE && comm->maxTreePattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85; if (a == NCCL_ALGO_PAT) busBw *= .75; if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used diff --git a/src/group.cc b/src/group.cc index e387db70c..c48c0de88 100644 --- a/src/group.cc +++ b/src/group.cc @@ -193,7 +193,6 @@ ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { static ncclResult_t doLaunches(struct ncclComm* head) { ncclResult_t result = ncclSuccess; - struct ncclComm* cliqueComm0 = head->intraComm0; struct ncclComm* cliqueHead = head; struct ncclComm* cliqueNextHead; bool useBarrier = ncclParamLaunchMode == ncclLaunchModeGroup; @@ -209,7 +208,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) { NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); if (useBarrier) ncclCommIntraBarrierIn(comm, 1); comm = comm->groupNext; - } while (comm != nullptr && comm->intraComm0 == cliqueComm0); + } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); cliqueNextHead = comm; if (capturingYes && capturingNo) { @@ -424,38 +423,47 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf /* Connect channels at runtime if cumem is supported */ if (groupCommHeadMain != nullptr) { - struct ncclComm* comm = groupCommHeadMain; + struct ncclComm* cliqueHead = groupCommHeadMain; + struct ncclComm* comm = NULL; struct ncclIntruQueue asyncCollJobs; ncclIntruQueueConstruct(&asyncCollJobs); do { - bool needConnect = false; - bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; - memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); - - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); - - if (comm->cuMemSupport && needConnect) { - struct ncclPreconnectJob* job; - NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); - job->base.func = ncclCollPreconnectFunc; - job->base.undo = nullptr; - job->base.destructor = free; - job->base.state = ncclGroupJobRunning; - job->base.abortFlag = comm->abortFlag; - job->base.abortFlagDev = comm->abortFlagDev; - job->comm = comm; - NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); - memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); - ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); + // We need to preconnect connections for collectives clique by clique to avoid + // race condition for split shared comms which can connect the same connections + // at the same time. 
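The clique-by-clique preconnect described in the comment above follows a simple pattern: walk the ordered comm list one clique at a time, where a clique is a maximal run of comms sharing the same intraComm0, and finish each clique's work before moving to the next. A minimal standalone sketch of that traversal, with illustrative names rather than the NCCL structs:

    #include <cstdio>

    struct Node { int clique; Node* next; };

    // Walk an ordered list one clique at a time: the inner loop stops at the first
    // node whose clique key differs from the current clique head's.
    void walkByClique(Node* head) {
      Node* cliqueHead = head;
      while (cliqueHead != nullptr) {
        Node* n = cliqueHead;
        do {
          std::printf("process node in clique %d\n", n->clique);
          n = n->next;
        } while (n != nullptr && n->clique == cliqueHead->clique);
        // ... per-clique work goes here (in the patch: launch and drain the preconnect jobs) ...
        cliqueHead = n;  // first node of the next clique, or nullptr when done
      }
    }

    int main() {
      Node c = {2, nullptr}, b = {1, &c}, a = {1, &b};
      walkByClique(&a);  // processes clique 1 (two nodes), then clique 2
      return 0;
    }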
+ comm = cliqueHead; + do { + bool needConnect = false; + bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; + memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); + + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); + + if (comm->cuMemSupport && needConnect) { + struct ncclPreconnectJob* job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->base.func = ncclCollPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); + memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); + ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); + } + comm = comm->groupNext; + } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); + // connect + NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); + while (!ncclIntruQueueEmpty(&asyncCollJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs); + if (job->destructor) job->destructor((void*)job); } - comm = comm->groupNext; - } while (comm); - NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); - while (!ncclIntruQueueEmpty(&asyncCollJobs)) { - struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncCollJobs); - if (job->destructor) job->destructor((void*)job); - } + cliqueHead = comm; + } while (cliqueHead != nullptr); // done with all buffer allocation, start registration and enqueue comm = groupCommHeadMain; diff --git a/src/include/bitops.h b/src/include/bitops.h index a650aa7f4..dcf0e2e09 100644 --- a/src/include/bitops.h +++ b/src/include/bitops.h @@ -8,6 +8,7 @@ #define NCCL_BITOPS_H_ #include +#include #if !__NVCC__ #ifndef __host__ @@ -276,13 +277,53 @@ inline __host__ __device__ uint32_t u32fp8Decode(uint8_t x) { return u32fpDecode(x, 3); } -inline __host__ __device__ uint64_t getHash(const char* string, int n) { - // Based on DJB2a, result = result * 33 ^ char - uint64_t result = 5381; - for (int c = 0; c < n; c++) { - result = ((result << 5) + result) ^ string[c]; +// The hash isn't just a function of the bytes but also where the bytes are split +// into different calls to eatHash(). +inline __host__ __device__ void eatHash(uint64_t acc[2], const void* bytes, size_t size) { + char const* ptr = (char const*)bytes; + acc[0] ^= size; + while (size != 0) { + // Mix the accumulator bits. + acc[0] += acc[1]; + acc[1] ^= acc[0]; + acc[0] ^= acc[0] >> 31; + acc[0] *= 0x9de62bbc8cef3ce3; + acc[1] ^= acc[1] >> 32; + acc[1] *= 0x485cd6311b599e79; + // Read in a chunk of input. + size_t chunkSize = size < sizeof(uint64_t) ? size : sizeof(uint64_t); + uint64_t x = 0; + memcpy(&x, ptr, chunkSize); + ptr += chunkSize; + size -= chunkSize; + // Add to accumulator. 
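Editorial usage sketch for the eatHash()/digestHash()/getHash() helpers added in this bitops.h hunk (the remainder of the hunk follows below). It only demonstrates the property stated in the header's own comment, namely that the digest depends on how the bytes are split across eatHash() calls. It assumes the patched src/include/bitops.h is on the include path and is compiled host-side.

    #include <cstdint>
    #include <cstdio>
    #include "bitops.h"   // the header modified in this hunk

    int main() {
      uint64_t acc[2] = {1, 1};
      eatHash(acc, "ab", 2);                // feed the same bytes...
      eatHash(acc, "c", 1);                 // ...split across two calls
      uint64_t split = digestHash(acc);
      uint64_t whole = getHash("abc", 3);   // one-shot helper: accumulate then digest
      std::printf("split=%016llx whole=%016llx\n",
                  (unsigned long long)split, (unsigned long long)whole);
      // Per the comment above, the two digests generally differ because the split
      // points are mixed into the accumulator along with the data.
      return 0;
    }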
+ acc[0] += x; } - return result; +} + +template +inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) { + eatHash(acc, (const void*)bytes, sizeof(T)); +} + +inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { + uint64_t h = acc[0]; + h ^= h >> 31; + h *= 0xbac3bd562846de6b; + h += acc[1]; + h ^= h >> 32; + h *= 0x995a187a14e7b445; + return h; +} + +inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) { + uint64_t acc[2] = {1, 1}; + eatHash(acc, bytes, size); + return digestHash(acc); +} +template +inline __host__ __device__ uint64_t getHash(const T* bytes) { + return getHash((const void*)bytes, sizeof(T)); } #endif diff --git a/src/include/collectives.h b/src/include/collectives.h index c82ebce6f..c68b0418c 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "nccl_common.h" #include "device.h" + #define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. // CHUNKSIZE must be a multiple of SLICESIZE @@ -382,6 +383,42 @@ class RingBCAlgorithm : public RingAlgorithm { ~RingBCAlgorithm() {} }; +#if !defined (__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 +#include +#endif + +// Need a power of two to ensure it divides by parallelFactor (which is also a power of two) +#define NCCL_PAT_NWORKERS 512 + +static constexpr int PatUsed = 0x1, + PatSkipped = 0x2; + +struct ncclPatStep { + int recvDim, sendDim, recvOffset, sendOffset, stepOffset, postRecv, postSend, nelem, last, flags; + size_t inpIx, outIx; +}; + +struct ncclPatPeer { + uint64_t step; + struct ncclConnInfo* conn; + struct ncclConnFifo* connFifo; + void* buff; + uint64_t *headPtr; + uint64_t *tailPtr; + uint64_t stepCache; + long long int accSize; + int connStepSize; +}; + +#define NCCL_SHMEM_PAT_STEPS 32 +struct ncclPatShmem { + struct ncclPatStep patSteps[NCCL_SHMEM_PAT_STEPS]; + int parallelFactor; + long long int localAccSize; + struct ncclPatPeer sendDims[32]; // Should cover 2^32 ranks + struct ncclPatPeer recvDims[32]; +}; + template class PatRSAlgorithm{ size_t offset; @@ -394,18 +431,17 @@ class PatRSAlgorithm{ int nrPow2; int postFreq; int lastA; - + int parallelFactor; int aggFactor; int as; // aggregated steps int a; // step inside aggregated step int sendSkipped; // number of skipped steps during aggregation - int recvSkipped; // number of skipped steps during aggregation - int phase2recv; // receive offset for phase 2 + int stepOffset; int aggDelta; int scale; int phase; - __device__ __host__ int min(int a, int b) { + __device__ __host__ ssize_t min(ssize_t a, ssize_t b) { return (a= 2) lastA /= 2*scale; + if (phase == 4) lastA = 1; } __device__ __host__ void reset() { nelem = getNelem(); phase = 0; scale = 1; - phase2recv = 0; as = aggDelta - 1; resetA(); } @@ -465,8 +501,9 @@ class PatRSAlgorithm{ } public: - __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks): + __device__ __host__ PatRSAlgorithm(int stepSize, int stepDepth, int maxParallelFactor, size_t offset, size_t end, size_t count, int chunkCount, int rank, int nranks): offset(offset), end(end), count(count), chunkCount(chunkCount), rank(rank), nranks(nranks) { + parallelFactor = maxParallelFactor; aggDelta = nrPow2 = (1< 1 && aggFactor < nranks/2) { d /= 2; @@ -486,160 +524,151 @@ class PatRSAlgorithm{ reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, 
int &recvOffset, int &sendOffset, int &sendStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - last = 0; - nelemOut = nelem; - outIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->outIx = offset; + ps->stepOffset = stepOffset; int skip = 0; - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; int sendDataRank = (rank + s) % nranks; - inpIx = sendDataRank * count + offset; - recvDim = -1; - sendDim = 0; - outIx = 0; - recvOffset = -1; - sendOffset = ((a - sendSkipped)%postFreq) * nelem; - sendStepOffset = 0; - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postSend = 1; + ps->inpIx = sendDataRank * count + offset; + ps->recvDim = -1; + ps->sendDim = 0; + ps->outIx = 0; + ps->recvOffset = -1; + ps->sendOffset = (a%postFreq) * nelem; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postSend = 1; } else { - postSend = 0; - } - postRecv = 0; - if (skip) sendSkipped++; - if (++a == lastA) { - phase = as == 1 ? (aggFactor > 1 ? 2 : 4) : 1; // If as == 1, switch to phase 2 - resetA(); + ps->postSend = 0; } - if (skip == 0) return; + ps->postRecv = 0; } else if (phase == 1) { int s = mirrorInvert(a, lastA)*aggDelta + as; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - sendOffset = ((a - sendSkipped)%postFreq)*nelem; - recvOffset = ((a - recvSkipped)%postFreq)*nelem; - postSend = 0; - if (recvDim == 0) { - if ((((a - sendSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) postSend = 1; - sendStepOffset = 0; + ps->recvDim = firstBitSet(s, nrPow2); + ps->sendOffset = (a%postFreq)*nelem; + ps->recvOffset = (a%postFreq)*nelem; + ps->postSend = 0; + if (ps->recvDim == 0 && (((a%postFreq) + 1 >= postFreq) || (a == lastA-1))) ps->postSend = 1; + if (((a%postFreq) + 1 >= postFreq) || (a == lastA-1)) { + ps->postRecv = 1; } else { - sendStepOffset = (a - sendSkipped)/postFreq; + ps->postRecv = 0; } - if ((((a - recvSkipped)%postFreq) + 1 >= postFreq) || (a == lastA-1)) { - postRecv = 1; - } else { - postRecv = 0; - } - s -= (1<recvDim); int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - if (sendDim == -1) { - sendOffset = -1; - sendStepOffset = 0; - } else if (as - (1<inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + if (ps->sendDim == -1) { + ps->sendOffset = -1; + } else if (as - (1<recvDim) == 0) { + if (newPeer(a, aggFactor)) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } int foffset = a - sendSkipped; - sendStepOffset = recvDim == 0 ? 0 : foffset/postFreq; - sendOffset = (foffset%postFreq)*nelem; + ps->sendOffset = (foffset%postFreq)*nelem; } + int recvDim = ps->recvDim; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - as--; - phase = as % 2 == 1 ? 
0 : 1; - resetA(); - } - if (skip == 0) return; + if (recvDim > 0 && (((a-sendSkipped)%postFreq) + 1 >= postFreq) && skip == 0) stepOffset++; } else if (phase == 2) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta + 1; - postRecv = 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - recvDim = 0; - postSend = a == lastA-1 ? 1 : 0; + ps->recvDim = 0; + ps->postSend = a == lastA-1 ? 1 : 0; s -= 1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; + ps->recvDim = -1; + ps->recvOffset = -1; skip = 0; } else if (!skip) { - int foffset = phase2recv; - phase2recv++; - postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - recvOffset = (foffset%postFreq) * nelem; + int foffset = a + aggFactor - aggFactor/scale; + ps->postRecv |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; } int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; - int foffset = a - sendSkipped; - postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; - sendStepOffset = 0; - sendOffset = (foffset%postFreq) * nelem; - if (skip || sendDim == -1) sendSkipped++; - if (++a == lastA) { - phase = 3; - resetA(); - } - if (skip == 0) return; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; + int foffset = a; + ps->postSend |= ((foffset+1)%postFreq) == 0 ? 1 : 0; + ps->sendOffset = (foffset%postFreq) * nelem; } else if (phase == 3) { int s = (2*mirrorInvert(a, lastA)+1)*scale*aggDelta; - postRecv = a == lastA-1 ? 1 : 0; + ps->postRecv = a == lastA-1 ? 1 : 0; if (s >= nranks) skip = 1; - recvDim = firstBitSet(s, nrPow2); - postSend = 0; - s -= (1<recvDim = firstBitSet(s, nrPow2); + ps->postSend = 0; + s -= (1<recvDim); + int foffset = a; + ps->postRecv |= (foffset+1)%postFreq == 0 ? 1 : 0; + ps->recvOffset = (foffset%postFreq) * nelem; int recvDataRank = (rank + nranks + s) % nranks; - inpIx = recvDataRank * count + offset; - sendDim = s ? firstBitSet(s, nrPow2) : -1; + ps->inpIx = recvDataRank * count + offset; + ps->sendDim = s ? firstBitSet(s, nrPow2) : -1; if (s < nranks && skip) { - recvDim = -1; - recvOffset = -1; - postRecv = 0; + ps->recvDim = -1; + ps->recvOffset = -1; + ps->postRecv = 0; skip = 0; } - if (newPeer(a, aggFactor/(2*scale))) sendSkipped = a; + if (newPeer(a, aggFactor/(2*scale))) { sendSkipped = a; ps->stepOffset = stepOffset = 0; } foffset = a - sendSkipped; - sendStepOffset = foffset / postFreq; // Accumulate on next steps - sendOffset = sendDim >= 0 ? (foffset%postFreq) * nelem : -1; - if (skip || recvDim == -1) recvSkipped++; - if (skip) sendSkipped++; - if (++a == lastA) { - scale *= 2; - phase = scale < aggFactor ? 2 : 4; - resetA(); - } - if (skip == 0) return; + if ((foffset%postFreq) + 1 >= postFreq && skip == 0) stepOffset++; + ps->sendOffset = ps->sendDim >= 0 ? (foffset%postFreq) * nelem : -1; } else if (phase == 4) { - recvDim = 0; - sendDim = -1; - inpIx = rank * count + offset; - recvOffset = (phase2recv%postFreq) * nelem; - sendStepOffset = 0; - sendOffset = -1; - postRecv = 1; - postSend = 0; + ps->recvDim = 0; + ps->sendDim = -1; + ps->inpIx = rank * count + offset; + ps->recvOffset = ((aggFactor-1)%postFreq) * nelem; + ps->sendOffset = -1; + ps->postRecv = 1; + ps->postSend = 0; offset += chunkCount; - if (offset >= end) { - last = 1; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 1) as--; + if (p == 3) scale *= 2; + phase = + p == 0 ? as == 1 ? (aggFactor > 1 ? 2 : 4) : 1 : + p == 1 ? as % 2 == 1 ? 
0 : 1 : + p == 2 ? 3 : + p == 3 ? scale < aggFactor ? 2 : 4 : + 5; + if (p == 4) { + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } } else { - reset(); + resetA(); } - return; + } else if (phase == 4 && offset >= end) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ? PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } }; @@ -655,14 +684,12 @@ class PatAGAlgorithm{ int nrPow2; int postFreq; int lastA; - + int parallelFactor; int aggFactor; int as; // aggregated steps int a; // step inside aggregated step int aggDelta; - int scale; - int phase; // AS computation @@ -671,7 +698,7 @@ class PatAGAlgorithm{ int bitCount[32]; int bitZeroStep[32]; - __device__ __host__ int min(int a, int b) { + __device__ __host__ ssize_t min(ssize_t a, ssize_t b) { return (a 1 && aggFactor < nranks/2) { d /= 2; aggFactor *= 2; aggDelta /= 2; } - //printf("AggFactor %d PostFreq %d AggDelta %d\n", aggFactor, postFreq, aggDelta); asDim = log2Up(aggDelta); reset(); } - __device__ __host__ void getNextOp(int &recvDim, int &sendDim, size_t &inpIx, size_t &outIx, int &recvOffset, int &sendOffset, int &recvStepOffset, int &nelemOut, int &postRecv, int &postSend, int &last) { -restart: - //printf("Phase %d as %d/%d a %d/%d scale %d\n", phase, as, aggDelta, a, lastA, scale); - last = 0; - nelemOut = nelem; - inpIx = offset; + __device__ __host__ int getParallelFactor() { + return parallelFactor; + } + + __device__ __host__ void getNextOp(struct ncclPatStep* ps) { + ps->last = 0; + ps->nelem = nelem; + ps->inpIx = offset; int skip = 0; - if (phase == 0) { + if (a >= lastA) { + skip = 1; + } else if (phase == 0) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - int nextSkip = (a+1)*aggDelta + as >= nranks ? 1 : 0; int recvDataRank = (rank + s) % nranks; - outIx = recvDataRank * count + offset; - sendDim = -1; - recvDim = 0; - inpIx = 0; - sendOffset = -1; - recvOffset = (a % postFreq) * nelem; - recvStepOffset = 0; - postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postSend = 0; - a++; - if (nextSkip) { - as = nextAs(); - if (as == aggDelta/2) { - offset += chunkCount; - if (offset >= end) { - last = 1; - } else { - reset(); - } - return; - } - phase = 1; - resetA(); - } - if (skip == 0) return; + ps->outIx = recvDataRank * count + offset; + ps->sendDim = -1; + ps->recvDim = 0; + ps->inpIx = 0; + ps->sendOffset = -1; + ps->recvOffset = (a % postFreq) * nelem; + ps->stepOffset = 0; + ps->postRecv = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postSend = 0; } else if (phase == 1) { int s = a*aggDelta + as; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<sendDim = firstBitSet(s, nrPow2); + s -= (1<sendDim); int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; - sendOffset = recvOffset = (a % postFreq) * nelem; - postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; - postRecv = (sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; - recvStepOffset = (sendDim == 0) ? 0 : a/postFreq; - if (recvDim == -1) { - recvOffset = -1; - postRecv = 0; - } else if (as - (1<> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - postRecv = (sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<= nranks) ? 
1 : 0; - recvStepOffset = (sendDim == 0) ? 0 : foffset/postFreq; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; + ps->sendOffset = ps->recvOffset = (a % postFreq) * nelem; + ps->postSend = (a % postFreq == postFreq-1) || ((a+1)*aggDelta+as >= nranks) ? 1 : 0; + ps->postRecv = (ps->sendDim == 0) && ((a % postFreq == postFreq-1) || ((a+1)*aggDelta+as-1 >= nranks)) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : a/postFreq; + if (ps->recvDim == -1) { + ps->recvOffset = -1; + ps->postRecv = 0; + } else if (as - (1<sendDim) == 0) { + int foffset = (a*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->postRecv = (ps->sendDim == 0) && ((foffset % postFreq == postFreq-1) || ((((foffset+1)*2)+1)<recvDim) >= nranks) ? 1 : 0; + ps->stepOffset = (ps->sendDim == 0) ? 0 : foffset/postFreq; } - if (s < nranks && sendDim == 0 && skip) { + if (s < nranks && ps->sendDim == 0 && skip) { // Don't forget to receive at least once even if we don't send afterwards - sendDim = -1; - sendOffset = -1; - postSend = 0; + ps->sendDim = -1; + ps->sendOffset = -1; + ps->postSend = 0; skip = 0; } - if (++a == lastA) { - if (as % 2 == 1) { - phase = 0; - } else { - as = nextAs(); - } - resetA(); - } - if (skip == 0) return; } else if (phase == 2) { int s = (2*a+1)*scale*aggDelta; - postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; - postRecv = 0; + ps->postSend = (a % postFreq == postFreq-1) || ((2*(a+1)+1)*scale*aggDelta >= nranks) ? 1 : 0; + ps->postRecv = 0; if (s >= nranks) skip = 1; - sendDim = firstBitSet(s, nrPow2); - s -= (1<sendDim = firstBitSet(s, nrPow2); + s -= (1<sendDim); + ps->sendOffset = (a%postFreq) * nelem; + ps->stepOffset = a / postFreq; int sendDataRank = (rank + nranks + s) % nranks; - outIx = sendDataRank * count + offset; - recvDim = s ? firstBitSet(s, nrPow2) : -1; - if (recvDim == -1) { - recvOffset = -1; + ps->outIx = sendDataRank * count + offset; + ps->recvDim = s ? firstBitSet(s, nrPow2) : -1; + if (ps->recvDim == -1) { + ps->recvOffset = -1; } else { - s -= (1<> (recvDim+1); - recvOffset = (foffset%postFreq)*nelem; - recvStepOffset = foffset / postFreq; + s -= (1<recvDim); + int foffset = (a*2*scale*aggDelta) >> (ps->recvDim+1); + ps->recvOffset = (foffset%postFreq)*nelem; + ps->stepOffset = foffset / postFreq; } - if (++a == lastA) { - scale /= 2; - phase = scale ? 2 : 1; + } + a++; + if (a >= lastA && a >= parallelFactor) { + int p = phase; + if (p == 2) scale /= 2; + phase = + p == 2 ? scale ? 2 : 1 : + p == 1 ? as % 2 == 1 ? 0 : 1 : + 1; + if (p == 0 || (p == 1 && as % 2 == 0)) as = nextAs(); + if (p == 0 && as == aggDelta/2) { + offset += chunkCount; + if (offset >= end) { + ps->last = 2; + } else { + reset(); + } + } else { resetA(); } - if (skip == 0) return; + } else if (phase == 0 && as == 1 && offset + chunkCount >= end && a-1 >= ((lastA-1) / parallelFactor) * parallelFactor) { + ps->last = 1; } - goto restart; + int flags = PatUsed | (skip ? 
PatSkipped : 0); +#if __CUDA_ARCH__ >= 600 + cuda::atomic_ref a(ps->flags); + a.store(flags, cuda::memory_order_release); +#else + ps->flags = flags; +#endif } }; #endif diff --git a/src/include/comm.h b/src/include/comm.h index c3f4eb49f..409518713 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -131,6 +131,9 @@ struct ncclSharedResources { int* tpRankToLocalRank; // Internal streams struct ncclStrongStream deviceStream, hostStream; + int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int persistentRefs; + cudaEvent_t launchEvent, scratchEvent; /* proxy related shared res */ struct ncclProxyState* proxyState; @@ -407,6 +410,7 @@ struct ncclComm { // List of destructors to run when comm is destructed struct ncclDestructor* destructorHead; + struct ncclCudaContext* context; struct ncclSharedResources* sharedRes; /* map to top parent ranks. */ int* topParentRanks; @@ -419,6 +423,7 @@ struct ncclComm { int netPluginLoaded; ncclNet_t* ncclNet; + int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; void* bootstrap; @@ -426,6 +431,7 @@ struct ncclComm { uint64_t* connectSend; uint64_t* connectRecv; struct ncclTopoGraph graphs[NCCL_NUM_ALGORITHMS]; + int maxTreePattern; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported bool directMode; @@ -565,8 +571,7 @@ struct ncclComm { struct ncclComm* groupNext; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. struct ncclComm* preconnectNext; - int persistentRefs; // number of persistent plan-lists capturing this comm - int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream + int localPersistentRefs; // number of persistent plan-lists capturing this comm struct P2pSchedulePair { int sendRank; int recvRank; } *p2pSchedule; struct ncclKernelPlanner planner; @@ -603,6 +608,7 @@ struct ncclComm { // Profiler plugin void* profilerContext; uint64_t seqNumber[NCCL_NUM_FUNCTIONS]; + struct ncclProfilerProxy profiler; // buffer registration cache struct ncclRegCache regCache; diff --git a/src/include/device.h b/src/include/device.h index 3f918ab23..0763a579a 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -133,6 +133,7 @@ struct ncclProxyConnector { struct ncclConnector { int connected; + int hasSeen; struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; @@ -374,6 +375,7 @@ struct alignas(16) ncclDevChannel { struct ncclDirect collnetDirect; struct ncclNvls nvls; uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed + uint64_t workCounter; }; struct ncclDevComm { @@ -396,6 +398,10 @@ struct ncclDevComm { // Channels, device side struct ncclDevChannel* channels/*[MAXCHANNELS]*/; int* rankToLocalRank; + + // Profiler counters + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; }; struct alignas(16) ncclDevCommAndChannels { @@ -468,7 +474,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int __host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { // Our collective unroll should move to the same bytes&insns model as NVLS. - return cudaArch >= 800 ? 8 : 4; + return cudaArch >= 800 ? (cudaArch == 1200 ? 
6 : 8) : 4; } __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } diff --git a/src/include/graph.h b/src/include/graph.h index a22b62bb2..b779773da 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -36,7 +36,13 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, int* useGdr); +enum ncclTopoGdrMode { + ncclTopoGdrModeDisable = 0, + ncclTopoGdrModeDefault = 1, + ncclTopoGdrModePci = 2, + ncclTopoGdrModeNum = 3 +}; +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode); ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); @@ -55,9 +61,11 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #define NCCL_TOPO_CPU_VENDOR_AMD 2 #define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 #define NCCL_TOPO_CPU_VENDOR_MIXED 4 -#define NCCL_TOPO_CPU_TYPE_BDW 1 -#define NCCL_TOPO_CPU_TYPE_SKL 2 -#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_BDW 1 +#define NCCL_TOPO_CPU_MODEL_INTEL_SKL 2 +#define NCCL_TOPO_CPU_MODEL_INTEL_SRP 3 +#define NCCL_TOPO_CPU_MODEL_INTEL_ERP 4 +#define NCCL_TOPO_CPU_MODEL_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); diff --git a/src/include/group.h b/src/include/group.h index 91bc19068..c06d1ef1b 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -112,6 +112,12 @@ inline void ncclGroupCommJoin(struct ncclComm* comm) { struct ncclComm** pp = &ncclGroupCommHead; while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) pp = &(*pp)->groupNext; + + // didn't find its clique, we need to insert it with ascending order based on commHash + if (*pp == nullptr) { + pp = &ncclGroupCommHead; + while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext; + } comm->groupNext = *pp; *pp = comm; // Comms gets a new memory stack scope upon joining. Each task batched for diff --git a/src/include/nccl_net.h b/src/include/nccl_net.h deleted file mode 100644 index f165aa1bf..000000000 --- a/src/include/nccl_net.h +++ /dev/null @@ -1,604 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "net_device.h" -#include - -#define NCCL_NET_HANDLE_MAXSIZE 128 -//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties -#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) -#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 - -#define NCCL_PTR_HOST 0x1 -#define NCCL_PTR_CUDA 0x2 -#define NCCL_PTR_DMABUF 0x4 - -// Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 32 - -// Max number of ncclNet objects which can live in the same process -#define NCCL_NET_MAX_PLUGINS 3 - -#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V9 - -typedef struct { - int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; -} ncclNetVDeviceProps_v9_t; -typedef ncclNetVDeviceProps_v9_t ncclNetVDeviceProps_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int forceFlush; // Force a flush on receives - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload - ncclNetVDeviceProps_v9_t vProps; - size_t maxP2pBytes; // Max transfer size for point-to-point operations - size_t maxCollBytes; // Max transfer size for collective operations -} ncclNetProperties_v9_t; -typedef ncclNetProperties_v9_t ncclNetProperties_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
- // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclNet_v9_t; - -typedef ncclNet_v9_t ncclNet_t; - -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v9 - -typedef struct { - void* mhandle; - void* address; - size_t size; -} ncclNetSGE_v9_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. 
Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Create a virtual NIC given the specified properties, which can be accessed at device index d - ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_t* props); -} ncclCollNet_v9_t; - -typedef ncclCollNet_v9_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v9 - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int regIsGlobal; // regMr is not tied to a particular comm - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v8_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v8_t; - -typedef struct { - void* mhandle; - void* address; - uint32_t size; -} ncclNetSGE_v8_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. 
handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request); - ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v8_t; - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v7_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v7_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. 
rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v7_t; - -#define NCCL_NET_MAX_REQUESTS_V6 8 - -// v6 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. -} ncclNetProperties_v6_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
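/*
 * Editor's illustration (not part of the patch): a minimal sketch of the
 * non-blocking connect()/accept() contract documented in the comments above,
 * driving both ends of a loopback connection the way a plugin self-test might.
 * The ncclNet_v6_t instance `net`, the exchanged `handle` and the `listenComm`
 * are assumed to already exist; error handling is reduced to early returns.
 */
static ncclResult_t exampleEstablishPair(ncclNet_v6_t* net, int dev, void* handle,
                                         void* listenComm, void** sendComm, void** recvComm) {
  *sendComm = NULL;
  *recvComm = NULL;
  ncclResult_t res;
  // Both calls return ncclSuccess with a NULL comm until the connection is
  // actually established, so the caller keeps polling instead of blocking
  // inside the plugin.
  while (*sendComm == NULL || *recvComm == NULL) {
    if (*sendComm == NULL) { res = net->connect(dev, handle, sendComm); if (res != ncclSuccess) return res; }
    if (*recvComm == NULL) { res = net->accept(listenComm, recvComm);   if (res != ncclSuccess) return res; }
  }
  return ncclSuccess;
}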
- ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). 
- ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v6_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
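/*
 * Editor's illustration (not part of the patch): the asynchronous request
 * model shared by all ncclNet_t versions, as documented above. isend() may
 * hand back request == NULL when it cannot make progress yet, in which case
 * the caller retries; once a request exists, test() is polled until done != 0.
 * `net`, `sendComm`, `mhandle` and the buffer are assumed to exist.
 */
static ncclResult_t exampleSendBlocking(ncclNet_v5_t* net, void* sendComm,
                                        void* data, int size, int tag, void* mhandle) {
  void* request = NULL;
  ncclResult_t res;
  while (request == NULL) {                 // retry until the send is posted
    res = net->isend(sendComm, data, size, tag, mhandle, &request);
    if (res != ncclSuccess) return res;
  }
  int done = 0, sentSize = 0;
  while (!done) {                           // poll the request for completion
    res = net->test(request, &done, &sentSize);
    if (res != ncclSuccess) return res;
  }
  return ncclSuccess;
}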
- ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v5_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v5_t; - -#endif // end include guard diff --git a/src/include/nccl_profiler.h b/src/include/nccl_profiler.h deleted file mode 100644 index a8164d075..000000000 --- a/src/include/nccl_profiler.h +++ /dev/null @@ -1,235 +0,0 @@ -/************************************************************************* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ - -#include - -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type -}; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - const char* func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - const char* datatype; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - const char* algo; - const char* proto; - } coll; - - struct { - const char* name; - uint64_t commHash; - const char* func; - void* buff; - const char* datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v2_t; - -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v2_t; - -typedef union { - struct { - size_t transSize; - int steps; - } proxyOp; - - struct { - int appendedProxyOps; - } proxyCtrl; -} ncclProfilerEventStateArgs_v2_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - 
// - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v2_t; - -typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v2_t ncclProfiler_t; - -typedef struct { - uint8_t type; // event type descriptor: ncclProfileColl, ... - void* parentObj; // pointer to the profiler parent object (for coll is the group) - int rank; // originating rank - union { - struct { - const char* name; - uint64_t commHash; - uint64_t seqNumber; - uint8_t func; - void const* sendBuff; - void* recvBuff; - size_t count; - int root; - uint8_t datatype; - uint32_t op; - size_t trafficBytes; - uint8_t nMaxChannels; - uint8_t nWarps; - uint8_t algo; - uint8_t proto; - int isCollnet; - int isNvls; - } coll; - - struct { - const char* name; - uint64_t commHash; - uint8_t func; - void* buff; - uint8_t datatype; - size_t count; - int peer; - } p2p; - - struct { - pid_t pid; // pid of the originating process - uint8_t channelId; // channel id for this proxy operation - int peer; // remote rank for send/recv - int nSteps; // number of steps for this proxy operation - int chunkSize; // amount of data transferred by this proxy operation - int isSend; - } proxyOp; - - struct { - int step; - } proxyStep; - }; -} ncclProfilerEventDescr_v1_t; - -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; - -typedef struct { - const char* name; - - // init - initialize the profiler plugin - // Input - // - context : opaque profiler context object for separating profiler behavior across comms - // Output - // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); - - // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset - // Input - // - context: opaque profiler context object - // - eDescr : pointer to ncclProfilerEventDescr_t object - // Output - // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); - - // stopEvent - stop/finalize an event inside and event set - // Input - // - eHandle: handle to event object - ncclResult_t (*stopEvent)(void* eHandle); - - // recordEventState - record event state transitions and event attribute updates - // Input - // - eHandle : handle to event object created through startEvent - // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition - // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); - - // finalize - finalize the profiler plugin - // Input - // - context: opaque profiler context object - ncclResult_t (*finalize)(void* context); -} ncclProfiler_v1_t; - -#endif diff --git a/src/include/nccl_tuner.h 
b/src/include/nccl_tuner.h deleted file mode 100644 index 6e61118b9..000000000 --- a/src/include/nccl_tuner.h +++ /dev/null @@ -1,149 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TUNER_H_ -#define NCCL_TUNER_H_ - -#include "nccl.h" -#include "nccl_common.h" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - regBuff: can register user buffer - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int regBuff, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v4_t; - -typedef ncclTuner_v4_t ncclTuner_t; - -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. 
- // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - numPipeOps: number of operations in the group - // - numAlgo: number of algorithms in collCostTable - // - numProto: number of protocols in collCostTable - // - // Outputs: - // - nChannels: number of channels (hence SMs) to be used. - // - // InOut: - // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. - // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int numPipeOps, float** collCostTable, int numAlgo, int numProto, - int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v3_t; - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // Inputs: - // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // - nNodes: number of nodes in current communicator. - // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. - // Outputs: - // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - context: tuner context object - // - collType: collective type , e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - collNetTypeSupport: whether collnet supports this type - // - nvlsTypeSupport: whether nvlink sharp supports this time - // - numPipeOps: number of operations in the group - // - // Outputs: - // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the give collective - // - nChannels: number of channels (hence SMs) to be used. - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int* algorithm, int* protocol, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. 
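/*
 * Editor's illustration (not part of the patch): a trivial getCollInfo() for
 * the cost-table tuner API (v4 signature documented earlier in this header).
 * Following the pattern of the ext-tuner example plugin, the float** table is
 * viewed as a 2D [algo][proto] array; entries equal to NCCL_ALGO_PROTO_IGNORE
 * (-1.0) were marked by NCCL as unavailable and must be left alone. The
 * constants NCCL_NUM_PROTOCOLS, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE and
 * NCCL_ALGO_PROTO_IGNORE are assumed to come from nccl_common.h.
 */
static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                       int numPipeOps, float** collCostTable,
                                       int numAlgo, int numProto, int regBuff, int* nChannels) {
  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
  // Favor Ring/Simple whenever NCCL did not mark that combination as ignored.
  if (NCCL_ALGO_RING < numAlgo && NCCL_PROTO_SIMPLE < numProto &&
      table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
    table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0f;
  }
  *nChannels = 1;                       // a single channel is always a safe choice
  (void)context; (void)collType; (void)nBytes; (void)numPipeOps; (void)regBuff;
  return ncclSuccess;
}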
- // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v2_t; - -#endif diff --git a/src/include/net.h b/src/include/net.h index d1926ccd8..afc2d160e 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -18,7 +18,6 @@ ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); ncclResult_t ncclNetInit(struct ncclComm* comm); ncclResult_t ncclNetFinalize(struct ncclComm* comm); -int ncclNetVersion(struct ncclComm* comm); // Test whether the current GPU support GPU Direct RDMA. ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); diff --git a/src/include/net_device.h b/src/include/net_device.h index 5fae9b542..c3a79e35c 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -26,6 +26,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; -typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 5d00f0792..2c18b36b9 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -31,9 +31,10 @@ #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 #define NVTX_SID_CommFinalize 14 +// When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below! // Define static schema ID for the reduction operation. -#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 14 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 15 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h new file mode 100644 index 000000000..d57aad5a9 --- /dev/null +++ b/src/include/plugin/nccl_net.h @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_H_ +#define NCCL_NET_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "net_device.h" +#include + +#define NCCL_NET_HANDLE_MAXSIZE 128 +//Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties +#define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) +#define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 + +#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 
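/*
 * Editor's illustration (not part of the patch): how the size caps defined
 * above might be applied. NCCL_MAX_NET_SIZE_BYTES is the largest value NCCL
 * accepts for the maxP2pBytes/maxCollBytes properties, and MAX_NET_SIZE is a
 * conservative per-message cap for plugins that do not report a limit. The
 * helper name and the clamping policy are illustrative assumptions only.
 */
static size_t exampleClampMessageSize(size_t requested, size_t pluginMaxP2pBytes) {
  size_t cap = pluginMaxP2pBytes ? pluginMaxP2pBytes : (size_t)MAX_NET_SIZE;
  if (cap > (size_t)NCCL_MAX_NET_SIZE_BYTES) cap = (size_t)NCCL_MAX_NET_SIZE_BYTES;
  return requested < cap ? requested : cap;   // send at most one capped message at a time
}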
+#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collent plugins when size was not dynamically queried + +#define NCCL_PTR_HOST 0x1 +#define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 + +// Maximum number of requests per comm object +#define NCCL_NET_MAX_REQUESTS 32 + +// Max number of ncclNet objects which can live in the same process +#define NCCL_NET_MAX_PLUGINS 3 + +// NCCL core profiler callback for network defined events instrumentation +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); + +#include "net/net_v10.h" +#include "net/net_v9.h" +#include "net/net_v8.h" +#include "net/net_v7.h" +#include "net/net_v6.h" + +typedef ncclNet_v10_t ncclNet_t; +typedef ncclCollNet_v10_t ncclCollNet_t; +typedef ncclNetSGE_v10_t ncclNetSGE_t; +typedef ncclNetProperties_v10_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; + +#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10 + +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10 +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10 + +#endif // end include guard diff --git a/src/include/plugin/nccl_profiler.h b/src/include/plugin/nccl_profiler.h new file mode 100644 index 000000000..34cf9a927 --- /dev/null +++ b/src/include/plugin/nccl_profiler.h @@ -0,0 +1,69 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_H_ +#define NCCL_PROFILER_H_ + +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events +}; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, + ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_t; + +typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; + +#include +#include "profiler/profiler_v3.h" +#include "profiler/profiler_v2.h" +#include "profiler/profiler_v1.h" + +typedef ncclProfiler_v3_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; + +#define NCCL_PROFILER_NET_VER_BITS (16) +#define 
NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#endif diff --git a/src/include/plugin/nccl_tuner.h b/src/include/plugin/nccl_tuner.h new file mode 100644 index 000000000..f2401890d --- /dev/null +++ b/src/include/plugin/nccl_tuner.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include "nccl.h" +#include "nccl_common.h" + +#include "tuner/tuner_v4.h" +#include "tuner/tuner_v3.h" +#include "tuner/tuner_v2.h" + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +#endif diff --git a/src/include/plugin/net/net_v10.h b/src/include/plugin/net/net_v10.h new file mode 100644 index 000000000..ada6d482e --- /dev/null +++ b/src/include/plugin/net/net_v10.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V10_H_ +#define NET_V10_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; +} ncclNetVDeviceProps_v10_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v10_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v10_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v10_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclNet_v10_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v10_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
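/*
 * Editor's illustration (not part of the patch): using the new v10
 * makeVDevice() entry point, defined just above, to fuse two physical NICs
 * into one virtual NIC (the "NIC fusion" feature from the release notes).
 * `net` is an ncclNet_v10_t instance and the two device indices are assumed
 * to refer to ports the caller already knows belong together, e.g. under the
 * same PCI switch.
 */
static ncclResult_t exampleFuseNics(ncclNet_v10_t* net, int dev0, int dev1, int* vDev) {
  ncclNetVDeviceProps_v10_t props;
  props.ndevs = 2;                 // fuse exactly two ports (max is NCCL_NET_MAX_DEVS_PER_NIC_V10)
  props.devs[0] = dev0;
  props.devs[1] = dev1;
  // On success, *vDev holds the index of the newly created virtual NIC, which
  // can then be used like any other device index (getProperties, listen, ...).
  return net->makeVDevice(vDev, &props);
}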
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v10_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v10_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v10_t* props); +} ncclCollNet_v10_t; + +#endif // end include guard diff --git a/src/include/plugin/net/net_v6.h b/src/include/plugin/net/net_v6.h new file mode 100644 index 000000000..99445ce17 --- /dev/null +++ b/src/include/plugin/net/net_v6.h @@ -0,0 +1,113 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V6_H_ +#define NET_V6_H_ + +#define NCCL_NET_MAX_REQUESTS_V6 8 + +// v6 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). 
+ int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. +} ncclNetProperties_v6_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +#endif diff --git a/src/include/plugin/net/net_v7.h b/src/include/plugin/net/net_v7.h new file mode 100644 index 000000000..e9b19dec8 --- /dev/null +++ b/src/include/plugin/net/net_v7.h @@ -0,0 +1,120 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V7_H_ +#define NET_V7_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. 
+ ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v7_t; + +#endif diff --git a/src/include/plugin/net/net_v8.h b/src/include/plugin/net/net_v8.h new file mode 100644 index 000000000..a178132fe --- /dev/null +++ b/src/include/plugin/net/net_v8.h @@ -0,0 +1,134 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V8_H_ +#define NET_V8_H_ + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v8_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. 
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v8_t; + +typedef struct { + void* mhandle; + void* address; + uint32_t size; +} ncclNetSGE_v8_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. 
+ // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v8_t; + +#endif diff --git a/src/include/plugin/net/net_v9.h b/src/include/plugin/net/net_v9.h new file mode 100644 index 000000000..ce9d91748 --- /dev/null +++ b/src/include/plugin/net/net_v9.h @@ -0,0 +1,152 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_V9_H_ +#define NET_V9_H_ + +#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; +} ncclNetVDeviceProps_v9_t; + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. 
Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v9_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations +} ncclNetProperties_v9_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v9_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v9_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. 
If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); +} ncclNet_v9_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v9_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v9_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). 
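A minimal caller-side sketch of this retry-and-poll convention, assuming an already connected collComm, buffers registered through regMr, and float32 summation (the wrapper name is illustrative):

static ncclResult_t exampleAllReduceBlocking(ncclCollNet_v9_t* collNet, void* collComm,
                                             void* sendBuf, void* recvBuf, size_t count,
                                             void* sendMhandle, void* recvMhandle) {
  void* request = NULL;
  // iallreduce may return request == NULL when it cannot be posted yet; retry.
  while (request == NULL) {
    ncclResult_t res = collNet->iallreduce(collComm, sendBuf, recvBuf, count, ncclFloat32, ncclSum,
                                           sendMhandle, recvMhandle, &request);
    if (res != ncclSuccess) return res;
  }
  int done = 0, size = 0;
  while (!done) {
    // test() polls for completion; size reports the bytes transferred when non-NULL.
    ncclResult_t res = collNet->test(request, &done, &size);
    if (res != ncclSuccess) return res;
  }
  return ncclSuccess;
}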
+ ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v9_t* props); +} ncclCollNet_v9_t; + +#endif // end include guard diff --git a/src/include/plugin/plugin.h b/src/include/plugin/plugin.h new file mode 100644 index 000000000..7336c34d9 --- /dev/null +++ b/src/include/plugin/plugin.h @@ -0,0 +1,18 @@ +/************************************************************************* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PLUGIN_H_ +#define NCCL_PLUGIN_H_ + +#include "nccl.h" + +void* ncclOpenNetPluginLib(const char* name); +void* ncclOpenTunerPluginLib(const char* name); +void* ncclOpenProfilerPluginLib(const char* name); +void* ncclGetNetPluginLib(void); +ncclResult_t ncclClosePluginLib(void* handle); + +#endif diff --git a/src/include/plugin/profiler/net_ib.h b/src/include/plugin/profiler/net_ib.h new file mode 100644 index 000000000..2ac6d5c97 --- /dev/null +++ b/src/include/plugin/profiler/net_ib.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_H_ +#define NET_IB_H_ + +#include "nccl_profiler.h" +#include "net_ib_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_ib_v1.h b/src/include/plugin/profiler/net_ib_v1.h new file mode 100644 index 000000000..f142de5f5 --- /dev/null +++ b/src/include/plugin/profiler/net_ib_v1.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_V1_H_ +#define NET_IB_V1_H_ + +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. 
NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/net_socket.h b/src/include/plugin/profiler/net_socket.h new file mode 100644 index 000000000..9f5749633 --- /dev/null +++ b/src/include/plugin/profiler/net_socket.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_H_ +#define NET_SOCKET_H_ + +#include "nccl_profiler.h" +#include "net_socket_v1.h" + +#endif diff --git a/src/include/plugin/profiler/net_socket_v1.h b/src/include/plugin/profiler/net_socket_v1.h new file mode 100644 index 000000000..0cb664f20 --- /dev/null +++ b/src/include/plugin/profiler/net_socket_v1.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_V1_H_ +#define NET_SOCKET_V1_H_ + +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; + +#endif diff --git a/src/include/plugin/profiler/profiler_v1.h b/src/include/plugin/profiler/profiler_v1.h new file mode 100644 index 000000000..3b6710240 --- /dev/null +++ b/src/include/plugin/profiler/profiler_v1.h @@ -0,0 +1,107 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V1_H_ +#define PROFILER_V1_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
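As a sketch of how an IB transport might fill the QP descriptor defined above before handing it to the profiler (the plugin id would encode NCCL_PROFILER_NET_IB_VER; the callback wiring and the libibverbs objects wr and qp used here are assumptions):

#include <infiniband/verbs.h>

static void exampleFillQpDescr(ncclProfilerNetIbDescr_v1_t* eDescr, int dev,
                               struct ibv_send_wr* wr, struct ibv_qp* qp, size_t bytes) {
  eDescr->type = ncclProfileQp;        // plugin-defined event type from the enum above
  eDescr->qp.device = dev;             // network device id
  eDescr->qp.wr_id = wr->wr_id;        // work request id
  eDescr->qp.opcode = (int)wr->opcode; // ibv opcode
  eDescr->qp.qpNum = (int)qp->qp_num;  // QP number
  eDescr->qp.length = bytes;           // work request data length
}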
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + uint8_t func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + uint8_t datatype; + uint32_t op; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + uint8_t algo; + uint8_t proto; + int isCollnet; + int isNvls; + } coll; + + struct { + const char* name; + uint64_t commHash; + uint8_t func; + void* buff; + uint8_t datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v1_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v1_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v1_t; + +#endif diff --git a/src/include/plugin/profiler/profiler_v2.h b/src/include/plugin/profiler/profiler_v2.h new file mode 100644 index 000000000..146152a7a --- /dev/null +++ b/src/include/plugin/profiler/profiler_v2.h @@ -0,0 +1,104 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V2_H_ +#define PROFILER_V2_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v2_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v2_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v2_t; + +#endif diff --git a/src/include/plugin/profiler/profiler_v3.h b/src/include/plugin/profiler/profiler_v3.h new file mode 100644 index 000000000..10c50594f --- /dev/null +++ b/src/include/plugin/profiler/profiler_v3.h @@ -0,0 +1,112 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V3_H_ +#define PROFILER_V3_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
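A hedged sketch of the plugin side of the v2 interface above: an init/startEvent/stopEvent trio that only counts events (the context layout and function names are illustrative assumptions):

#include <stdlib.h>

static ncclResult_t examplePluginInit(void** context, int* eActivationMask) {
  int* counters = (int*)calloc(2, sizeof(int)); // [0]=started, [1]=stopped
  if (counters == NULL) return ncclSystemError;
  *context = counters;
  *eActivationMask = ~0; // subscribe to every event type
  return ncclSuccess;
}

static ncclResult_t examplePluginStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr) {
  (void)eDescr;
  ((int*)context)[0]++;
  *eHandle = context; // dummy handle: point back at the context
  return ncclSuccess;
}

static ncclResult_t examplePluginStopEvent(void* eHandle) {
  ((int*)eHandle)[1]++;
  return ncclSuccess;
}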
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v3_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v3_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v3_t; + +#endif diff --git a/src/include/plugin/tuner/tuner_v2.h b/src/include/plugin/tuner/tuner_v2.h new file mode 100644 index 000000000..ec96f6057 --- /dev/null +++ b/src/include/plugin/tuner/tuner_v2.h @@ -0,0 +1,53 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef TUNER_V2_H_ +#define TUNER_V2_H_ + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // Inputs: + // - nRanks: number of ranks in current communicator. 
Each communicator initializes its own tuner.
+ // - nNodes: number of nodes in current communicator.
+ // - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+ // Outputs:
+ // - context: tuner context object
+ ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+ // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+ // Inputs:
+ // - context: tuner context object
+ // - collType: collective type, e.g., allreduce, allgather…
+ // - nBytes: collective size in bytes
+ // - collNetTypeSupport: whether collnet supports this type
+ // - nvlsTypeSupport: whether nvlink sharp supports this type
+ // - numPipeOps: number of operations in the group
+ //
+ // Outputs:
+ // - algorithm: selected algorithm to be used for the given collective
+ // - protocol: selected protocol to be used for the given collective
+ // - nChannels: number of channels (hence SMs) to be used.
+ //
+ // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+ // default tuning for the given collective.
+ // Also, the plugin is allowed to not set any output, or set only the
+ // algorithm and protocol, but not only the algorithm or only the protocol.
+ // Unset fields will be set automatically by NCCL.
+ ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+ int collNetSupport, int nvlsSupport, int numPipeOps,
+ int* algorithm, int* protocol, int* nChannels);
+
+ // Terminates the plugin and cleans up any resources that the plugin allocated.
+ // context: tuner context object
+ ncclResult_t (*destroy)(void* context);
+} ncclTuner_v2_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v3.h b/src/include/plugin/tuner/tuner_v3.h
new file mode 100644
index 000000000..4fa10e825
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v3.h
@@ -0,0 +1,55 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V3_H_
+#define TUNER_V3_H_
+
+// API to be implemented by external tuner
+typedef struct {
+ // Name of the tuner
+ const char* name;
+
+ // Initializes tuner states.
+ // Inputs:
+ // - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+ // - nNodes: number of nodes in current communicator.
+ // - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+ // Outputs:
+ // - context: tuner context object
+ ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+ // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+ // Inputs:
+ // - context: tuner context object
+ // - collType: collective type, e.g., allreduce, allgather…
+ // - nBytes: collective size in bytes
+ // - numPipeOps: number of operations in the group
+ // - numAlgo: number of algorithms in collCostTable
+ // - numProto: number of protocols in collCostTable
+ //
+ // Outputs:
+ // - nChannels: number of channels (hence SMs) to be used.
+ //
+ // InOut:
+ // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+ // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+ //
+ // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+ // default tuning for the given collective.
+ // Also, the plugin is allowed to not set any output, or set only the
+ // algorithm and protocol, but not only the algorithm or only the protocol.
+ // Unset fields will be set automatically by NCCL.
+ ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+ int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+ int* nChannels);
+
+ // Terminates the plugin and cleans up any resources that the plugin allocated.
+ // context: tuner context object
+ ncclResult_t (*destroy)(void* context);
+} ncclTuner_v3_t;
+
+#endif
diff --git a/src/include/plugin/tuner/tuner_v4.h b/src/include/plugin/tuner/tuner_v4.h
new file mode 100644
index 000000000..a4b38a0a3
--- /dev/null
+++ b/src/include/plugin/tuner/tuner_v4.h
@@ -0,0 +1,56 @@
+/*************************************************************************
+ * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2023, Meta Platforms, Inc. and affiliates.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef TUNER_V4_H_
+#define TUNER_V4_H_
+
+// API to be implemented by external tuner
typedef struct {
+ // Name of the tuner
+ const char* name;
+
+ // Initializes tuner states.
+ // Inputs:
+ // - nRanks: number of ranks in current communicator. Each communicator initializes its own tuner.
+ // - nNodes: number of nodes in current communicator.
+ // - logFunction: a logFunction can be useful to integrate logging together with NCCL core.
+ // Outputs:
+ // - context: tuner context object
+ ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context);
+
+ // Gets info (algo, protocol, number of ctas and threads) for a given collective.
+ // Inputs:
+ // - context: tuner context object
+ // - collType: collective type, e.g., allreduce, allgather…
+ // - nBytes: collective size in bytes
+ // - numPipeOps: number of operations in the group
+ // - numAlgo: number of algorithms in collCostTable
+ // - numProto: number of protocols in collCostTable
+ // - regBuff: can register user buffer
+ //
+ // Outputs:
+ // - nChannels: number of channels (hence SMs) to be used.
+ //
+ // InOut:
+ // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
+ // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
+ //
+ // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
+ // default tuning for the given collective.
+ // Also, the plugin is allowed to not set any output, or set only the
+ // algorithm and protocol, but not only the algorithm or only the protocol.
+ // Unset fields will be set automatically by NCCL.
+ ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
+ int numPipeOps, float** collCostTable, int numAlgo, int numProto,
+ int regBuff, int* nChannels);
+
+ // Terminates the plugin and cleans up any resources that the plugin allocated.
+ // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v4_t; + +#endif diff --git a/src/include/profiler.h b/src/include/profiler.h index 2b7efe0f6..8d4107963 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -17,6 +17,18 @@ struct ncclTaskP2p; struct ncclInfo; struct ncclComm; struct ncclProxyOp; +struct ncclProxyConnector; + +struct ncclProfilerProxy { + bool initialized; + uint64_t* workStarted/*[MAXCHANNELS]*/; + uint64_t* workCompleted/*[MAXCHANNELS]*/; + uint64_t workCounter[MAXCHANNELS]; // host work counter + struct ncclProxyConnector sendProxyConn[MAXCHANNELS]; + struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; +}; + +extern int ncclProfilerEventMask; // Plugin Init/Finalize Wrappers ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); @@ -44,6 +56,10 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int sub, struct ncclProxyArgs* args, ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHandle); ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); +// Kernel Channel Start/Stop Event Wrappers +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s); +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s); + // Record Event Wrappers ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); @@ -51,5 +67,9 @@ ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, n // Profiler utility functions ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); +bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op); + +// Profiler callback for network plugin +ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); #endif diff --git a/src/include/proxy.h b/src/include/proxy.h index c97a4d7ce..225acb22d 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -32,7 +32,8 @@ typedef enum : uint8_t { ncclPatternPatUp, ncclPatternPatDown, ncclPatternSend, - ncclPatternRecv + ncclPatternRecv, + ncclPatternProfiler, } ncclPattern_t; enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; @@ -93,6 +94,7 @@ struct ncclProxyOp { int peer; pid_t pid; void* profilerContext; + uint64_t workCounter; struct ncclProxyOp *enqNext; }; @@ -129,12 +131,15 @@ struct ncclProxySubArgs { // Profiler plugin int eActivationMask; int rank; + uint64_t profilerSteps; pid_t pid; void* profilerContext; void* taskEventHandle; void* opEventHandle; + void* kernelEventHandle; void* stepEventHandles[NCCL_STEPS]; size_t transSize; + uint64_t workCounter; void* recvRequestsCache[NCCL_STEPS]; int recvRequestsSubCount; diff --git a/src/include/ras.h b/src/include/ras.h index 7909b3dc8..d27a543e2 100644 --- a/src/include/ras.h +++ b/src/include/ras.h @@ -15,6 +15,8 @@ struct rasRankInit { pid_t pid; int cudaDev; int nvmlDev; + uint64_t hostHash; + uint64_t pidHash; }; ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank); diff --git a/src/include/register.h b/src/include/register.h index 740a645f4..143f41bc9 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -42,7 +42,7 @@ struct ncclReg { uintptr_t baseAddr; size_t baseSize; CUdeviceptr regAddr; - size_t regSize; + size_t regUCSize, regMCSize; int 
dev; CUmemGenericAllocationHandle mcHandle; uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */ diff --git a/src/include/shm.h b/src/include/shm.h index b519e5dc9..223d87346 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -14,7 +14,6 @@ struct shmCuIpc { CUmemFabricHandle handle; CUmemGenericAllocationHandle data; }; - int tpProxyRank; void *ptr; size_t size; }; @@ -30,8 +29,8 @@ struct shmIpcDesc { typedef struct shmIpcDesc ncclShmIpcDesc_t; -ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr); -ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut); +ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *descOut, void **hptr, void **dptr); +ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut); ncclResult_t ncclShmIpcClose(ncclShmIpcDesc_t *desc); #endif diff --git a/src/include/socket.h b/src/include/socket.h index f0a3237ce..ffa148091 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -96,5 +96,5 @@ ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize); ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how); -ncclResult_t ncclSocketClose(struct ncclSocket* sock); +ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait = false); #endif diff --git a/src/include/strongstream.h b/src/include/strongstream.h index 0984dfe57..c56d5aca5 100644 --- a/src/include/strongstream.h +++ b/src/include/strongstream.h @@ -10,13 +10,24 @@ #include "nccl.h" #include "checks.h" +#include +#include #include +// ncclCudaContext: wraps a CUDA context with per-context state. +struct ncclCudaContext; + +// Get a ncclCudaContext to track the currently active CUDA context. +ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out); +// Drop reference. +void ncclCudaContextDrop(struct ncclCudaContext* cxt); + /* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes * easily. */ struct ncclCudaGraph { #if CUDART_VERSION >= 11030 + cudaStream_t origin; cudaGraph_t graph; unsigned long long graphId; #endif @@ -25,6 +36,7 @@ struct ncclCudaGraph { inline struct ncclCudaGraph ncclCudaGraphNone() { struct ncclCudaGraph tmp; #if CUDART_VERSION >= 11030 + tmp.origin = nullptr; tmp.graph = nullptr; tmp.graphId = ULLONG_MAX; #endif @@ -33,7 +45,7 @@ inline struct ncclCudaGraph ncclCudaGraphNone() { inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { #if CUDART_VERSION >= 11030 - return graph.graph != nullptr; + return graph.graphId != ULLONG_MAX; #else return false; #endif @@ -57,60 +69,37 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t * streams unfit for the use of serializing access to a persistent resource. * Strong streams have been introduced to address this need. * - * - All updates to a strong stream must be enclosed by a Acquire/Release pair. + * All updates to a strong stream must be enclosed by a Acquire/Release pair. 
* - * - The Acquire, Release, and all updates take a ncclCudaGraph parameter - * indicating the currently capturing graph (or none). This parameter must be - * the same for the entire sequence of {Acquire; ...; Release}. + * Acquire retrieves a "work" stream (cudaStream_t) which may be used to add + * work. * - * - An {Acquire; ...; Release} sequence must not be concurrent with any - * other operations against the strong stream including graph launches which - * reference this stream. + * Release publishes the work streams work into the strong stream. The Release + * must be issued by the same thread that did the Acquire. */ struct ncclStrongStream; ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); -// Acquire-fence the strong stream. +// Acquire the strong stream. Upon return `*workStream` will be usable to add work. +// `concurrent` indicates if other threads may be using the strong stream. ncclResult_t ncclStrongStreamAcquire( - struct ncclCudaGraph graph, struct ncclStrongStream* ss + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream ); -// Acquire-fence the strong stream assuming no graph is capturing. This permits -// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA -// calls. Strong stream still must be released via: -// ncclStrongStreamRelease(ncclCudaGraphNone(), ss); -ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); - -// Release-fence of the strong stream. -ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); - -// Add a host launch to the stream. -ncclResult_t ncclStrongStreamLaunchHost( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - cudaHostFn_t fn, void* arg -); -// Add a kernel launch to the stream. -ncclResult_t ncclStrongStreamLaunchKernel( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes +// Get the workStream for an already acquired strong stream. +// `concurrent` indicates if other threads may be using the strong stream. +ncclResult_t ncclStrongStreamAcquiredWorkStream( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, cudaStream_t* workStream ); -// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired. -// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus -// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the -// implementation to induce few graph dependencies. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false -); -// `b` must be capturing within `graph`. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false -); -// `a` must be capturing within `graph`. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false +// Release of the strong stream. +// `concurrent` indicates if other threads may be using the strong stream. 
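A short sketch of the Acquire/Release pattern described above, outside of graph capture; it mirrors the devCommSetup changes later in this patch and assumes an already constructed strong stream and the NCCLCHECK macro:

static ncclResult_t exampleUseStrongStream(struct ncclStrongStream* ss) {
  cudaStream_t workStream;
  NCCLCHECK(ncclStrongStreamAcquire(ncclCudaGraphNone(), ss, /*concurrent=*/false, &workStream));
  // ... enqueue kernels/copies on workStream with regular CUDA calls ...
  NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), ss, /*concurrent=*/false));
  return ncclSuccess;
}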
+ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent); + +ncclResult_t ncclStreamWaitStream( + cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent ); // Synchrnoization does not need the strong stream to be acquired. @@ -118,23 +107,28 @@ ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); //////////////////////////////////////////////////////////////////////////////// -struct ncclStrongStreamGraph; // internal to ncclStrongStream +struct ncclStrongStreamCapture; // internal to ncclStrongStream struct ncclStrongStream { - // Used when not graph capturing. - cudaStream_t cudaStream; + // The stream to use for non-captured work. + cudaStream_t liveStream; + void* liveAcquiredBy; #if CUDART_VERSION >= 11030 + // This stream ever appeared in a graph capture. + bool everCaptured; + pthread_mutex_t lock; + struct ncclStrongStreamCapture* captureHead; // The event used to establish order between graphs and streams. During acquire // this event is waited on, during release it is recorded to. cudaEvent_t serialEvent; - // This stream ever appeared in a graph capture. - bool everCaptured; - // Tracks whether serialEvent needs to be recorded to upon Release(). - bool serialEventNeedsRecord; - struct ncclStrongStreamGraph* graphHead; -#else - cudaEvent_t scratchEvent; #endif }; +struct ncclCudaContext { + struct ncclCudaContext* next; + CUcontext hcontext; + int refCount; + struct ncclStrongStream launchOrder; +}; + #endif diff --git a/src/include/transport.h b/src/include/transport.h index 37187f69e..c563fbbd6 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -18,6 +18,7 @@ #define TRANSPORT_SHM 1 #define TRANSPORT_NET 2 #define TRANSPORT_COLLNET 3 +#define TRANSPORT_PROFILER 4 #include "proxy.h" #include "comm.h" @@ -26,6 +27,7 @@ extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; extern struct ncclTransport netTransport; extern struct ncclTransport collNetTransport; +extern struct ncclTransport profilerTransport; extern struct ncclTransport* ncclTransports[]; // Forward declarations @@ -65,8 +67,10 @@ struct ncclNvlsSharedRes { CUmulticastObjectProp signalProp; CUmemAccessDesc accessDesc; int dev; - size_t buffSize; - size_t creditSize; + size_t creditUCSize; + size_t creditMCSize; + size_t buffUCSize; + size_t buffMCSize; CUmemGenericAllocationHandle mcBuffHandle; // Multicast handle for NVLS buffer CUmemGenericAllocationHandle mcCreditHandle; // Multicast handle for NVLS credit buffer char* mcBuff; // Multicast NVLS buffer address @@ -123,7 +127,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm); ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm); ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv, struct ncclIntruQueue* cleanupQueue, int* nCleanupQueueElts); ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, int *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); -ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize); ncclResult_t ncclNvlsFree(struct ncclComm* 
comm); enum { collNetRecv=0, collNetSend=1 }; diff --git a/src/init.cc b/src/init.cc index 3e218ab07..46b02e65e 100644 --- a/src/init.cc +++ b/src/init.cc @@ -51,17 +51,6 @@ NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); static ncclResult_t commReclaim(ncclComm_t comm); -static uint64_t hashUniqueId(ncclUniqueId const &id) { - char const *bytes = (char const*)&id; - uint64_t h = 0xdeadbeef; - for(int i=0; i < (int)sizeof(ncclUniqueId); i++) { - h ^= h >> 32; - h *= 0x8db3db47fa2994ad; - h += bytes[i]; - } - return h; -} - // GDRCOPY support: Off by default NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); @@ -111,7 +100,7 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { memset(out, 0, sizeof(*out)); // copy to avoid alignment mismatch memcpy(out, &handle, sizeof(handle)); - TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)hashUniqueId(*out)); + TRACE_CALL("ncclGetUniqueId(0x%llx)", (unsigned long long)getHash(out->internal, NCCL_UNIQUE_ID_BYTES)); return ncclSuccess; } @@ -232,6 +221,8 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->sharedRes->tpRankToLocalRank); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->hostStream)); NCCLCHECK(ncclStrongStreamDestruct(&comm->sharedRes->deviceStream)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->launchEvent)); + CUDACHECK(cudaEventDestroy(comm->sharedRes->scratchEvent)); NCCLCHECK(ncclProxyDestroy(comm)); free(comm->sharedRes); } @@ -268,6 +259,9 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); NCCLCHECK(ncclNetPluginUnload(comm)); + + ncclCudaContextDrop(comm->context); + free(comm); return ncclSuccess; @@ -309,17 +303,12 @@ ncclResult_t ncclCommEnsureReady(ncclComm_t comm) { ncclGroupJobAbort(comm->groupJob); } else { NCCLCHECK(ncclCommGetAsyncError(comm, &ret)); - if (ret != ncclSuccess) { - /* if ret is not ncclInProgress, we just keep it. */ + if (ret == ncclInProgress) { WARN("Attempt to use communicator before the previous operation returned ncclSuccess"); - if (ret == ncclInProgress) ret = ncclInvalidArgument; + ret = ncclInvalidArgument; goto exit; } - /* if there is linked group job, we should complete it. */ - if (comm->groupJob) { - NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); - comm->groupJob = NULL; - } + /* if ret is not ncclInProgress, we just keep it. */ } exit: @@ -357,6 +346,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in // the device we're on (failure cause #1) , better know it early. 
CUDACHECK(cudaGetDevice(&comm->cudaDev)); + NCCLCHECK(ncclCudaContextTrack(&comm->context)); + NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); nvmlDevice_t nvmlDev; char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; @@ -396,6 +387,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream)); NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->launchEvent, cudaEventDisableTiming)); + CUDACHECK(cudaEventCreateWithFlags(&sharedRes->scratchEvent, cudaEventDisableTiming)); comm->sharedRes = sharedRes; sharedRes->refCount = 1; } else { @@ -437,13 +430,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { struct ncclDevCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; bool ccEnable; + cudaStream_t deviceStream; - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->deviceStream), ret, fail); - NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.rankToLocalRank, comm->nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.rankToLocalRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.rankToLocalRank, comm->rankToLocalRank, comm->nRanks, deviceStream), ret, fail); comm->devComm = &devCommAndChans->comm; tmpCommAndChans.comm.rank = comm->rank; tmpCommAndChans.comm.nRanks = nRanks; @@ -494,10 +488,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workFifoConsumedLeast = 0; tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; + // Alloc profiler counters for the kernel + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail); + NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workCompleted, MAXCHANNELS), ret, fail); + tmpCommAndChans.comm.workStarted = comm->profiler.workStarted; + tmpCommAndChans.comm.workCompleted = comm->profiler.workCompleted; + ncclCommPushCudaHostFree(comm, comm->profiler.workStarted); + ncclCommPushCudaHostFree(comm, comm->profiler.workCompleted); + if (comm->collNetDenseToUserRank != nullptr) { - NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(&tmpCommAndChans.comm.collNetDenseToUserRank, nRanks, deviceStream), ret, fail); ncclCommPushCudaFree(comm, tmpCommAndChans.comm.collNetDenseToUserRank); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.comm.collNetDenseToUserRank, comm->collNetDenseToUserRank, nRanks, deviceStream), ret, fail); } for (int 
c=0; c < MAXCHANNELS; c++) { @@ -510,14 +512,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { tmpCommAndChans.channels[c].nvls = comm->channels[c].nvls; if (comm->channels[c].ring.userRanks != nullptr) { - NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(tmpCommAndChans.channels[c].ring.userRanks, comm->channels[c].ring.userRanks, nRanks, deviceStream), ret, fail); } } - NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, comm->sharedRes->deviceStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(devCommAndChans, &tmpCommAndChans, 1, deviceStream), ret, fail); exit: + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); NCCLCHECK(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream)); return ret; fail: goto exit; @@ -1000,6 +1002,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p graphs[a]->typeInter = std::max(allGather3Data[i].graphInfo[a].typeInter, graphs[a]->typeInter); graphs[a]->crossNic = std::max(allGather3Data[i].graphInfo[a].crossNic, graphs[a]->crossNic); } + comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern); } if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; @@ -1376,12 +1379,12 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; - // obtain a unique hash for the comm, re-using part of the parent's hash, commHash is a 64bit struct (=16 hex), - // add unique split counter and the color - ncclUniqueId tmpId; - memset(&tmpId,0,sizeof(ncclUniqueId));// must set 0 here to avoid undefined bits - snprintf((char*)&tmpId, NCCL_UNIQUE_ID_BYTES, "%016lx-%d-%d", job->parent->commHash, job->splitCount, job->color); - comm->commHash = getHash(tmpId.internal, NCCL_UNIQUE_ID_BYTES); + // child hash obtained from (parent hash, split count, color) + uint64_t hacc[2] = {1, 1}; + eatHash(hacc, &job->parent->commHash); + eatHash(hacc, &job->splitCount); + eatHash(hacc, &job->color); + comm->commHash = digestHash(hacc); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1394,8 +1397,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; // obtain a unique hash using the first commId - comm->commHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); - commIdHash = hashUniqueId(job->commId[0]); + comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName, 
comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1610,6 +1612,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { NCCL_CONFIG_DEFAULT(internalConfigPtr, maxCTAs, NCCL_CONFIG_UNDEF_INT, MAXCHANNELS, "Max CTAs", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; @@ -1618,6 +1621,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { comm->config.maxCTAs = internalConfigPtr->maxCTAs; comm->config.netName = internalConfigPtr->netName; comm->config.splitShare = internalConfigPtr->splitShare; + comm->config.trafficClass = internalConfigPtr->trafficClass; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); @@ -1642,6 +1646,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId const char* commIdEnv = NULL; ncclComm_t comm = NULL; struct ncclCommInitRankAsyncJob* job = NULL; + bool launchedJob = false; // first call ncclInit, this will setup the environment NCCLCHECKGOTO(ncclInit(), res, fail); @@ -1695,12 +1700,13 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId // start the bootstrap root before bootstrapping, use only the first handle NCCLCHECKGOTO(bootstrapCreateRoot((struct ncclBootstrapHandle*)&job->commId[0], true), res, fail); } + launchedJob = true; NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, ncclCommInitJobFree, comm), res, fail); exit: return ncclGroupErrCheck(res); fail: - if (job) ncclCommInitJobFree(job); + if (job && !launchedJob) ncclCommInitJobFree(job); if (comm) { free(comm->abortFlag); if (comm->abortFlagDev) (void)ncclCudaHostFree((void*)comm->abortFlagDev); @@ -1896,7 +1902,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail); NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // And keep polling until all graphs referencing us die. - while (comm->persistentRefs != 0) { + while (comm->localPersistentRefs != 0) { NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/true), ret, fail); } while (!ncclIntruQueueEmpty(&comm->legacyRegCleanupQueue)) { @@ -1964,7 +1970,6 @@ ncclResult_t ncclCommFinalize(ncclComm_t comm) { } return ret; fail: - free(job); if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, ret); goto exit; } @@ -2215,6 +2220,11 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE); if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE); + /* if there is linked group job, we should complete it. 
*/ + if (*asyncError == ncclSuccess && comm->groupJob) { + NCCLCHECK(ncclGroupJobComplete(comm->groupJob)); + comm->groupJob = NULL; + } return ncclSuccess; } @@ -2265,16 +2275,13 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { #if CUDART_VERSION >= 12010 size_t memGran = 0; - size_t mcGran = 0; CUdevice currentDev; CUmemAllocationProp memprop = {}; - CUmulticastObjectProp mcprop = {}; CUmemAccessDesc accessDesc = {}; CUmemGenericAllocationHandle handle; int cudaDev; int flag; int dcnt; - int mcSupport = 0; if (ptr == NULL || size == 0) goto fallback; @@ -2284,6 +2291,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { CUCHECK(cuDeviceGet(&currentDev, cudaDev)); if (ncclCuMemEnable()) { + size_t handleSize = size; int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // Query device to see if FABRIC handle support is available flag = 0; @@ -2299,40 +2307,25 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); CUDACHECK(cudaGetDeviceCount(&dcnt)); - - if (CUPFN(cuMulticastCreate) != NULL) CUCHECK(cuDeviceGetAttribute(&mcSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, currentDev)); - if (mcSupport) { - /* mc property */ - mcprop.size = size; - /* device cnt is a dummy value right now, it might affect mc granularity in the future. */ - mcprop.numDevices = dcnt; - mcprop.handleTypes = requestedHandleTypes; - mcprop.flags = 0; - CUCHECK(cuMulticastGetGranularity(&mcGran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - /* only size needs to be aligned to mcGran */ - ALIGN_SIZE(size, mcGran); - } else { - ALIGN_SIZE(size, memGran); - } + ALIGN_SIZE(handleSize, memGran); if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ - CUresult err = CUPFN(cuMemCreate(&handle, size, &memprop, 0)); + CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0)); if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } } else { /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &memprop, 0)); + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); } /* Reserve a virtual address range */ - CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, size, memGran, 0, 0)); + CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0)); /* Map the virtual address range to the physical allocation */ - CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0)); /* Now allow RW access to the newly mapped memory */ for (int i = 0; i < dcnt; ++i) { int p2p = 0; @@ -2340,7 +2333,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = i; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1)); } if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); } diff --git
a/src/misc/ipcsocket.cc b/src/misc/ipcsocket.cc index 23746b3c5..3e9dfcdb8 100644 --- a/src/misc/ipcsocket.cc +++ b/src/misc/ipcsocket.cc @@ -169,7 +169,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, } control_un; struct cmsghdr *cmptr; - char dummy_buffer[1]; + char dummy_buffer[1] = {'\0'}; struct sockaddr_un cliaddr; // Construct client address to send this shareable handle to @@ -190,6 +190,7 @@ ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d fd %d to UDS socket %s", hdr, hdrLen, sendFd, temp); if (sendFd != -1) { + memset(&control_un, '\0', sizeof(control_un)); msg.msg_control = control_un.control; msg.msg_controllen = sizeof(control_un.control); diff --git a/src/misc/param.cc b/src/misc/param.cc index eb50cfeed..d7c324fe9 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -32,6 +32,7 @@ void setEnvFile(const char* fileName) { size_t n = 0; ssize_t read; while ((read = getline(&line, &n, file)) != -1) { + if (line[0] == '#') continue; if (line[read-1] == '\n') line[read-1] = '\0'; int s=0; // Env Var Size while (line[s] != '\0' && line[s] != '=') s++; diff --git a/src/misc/socket.cc b/src/misc/socket.cc index dfb4e6888..731dbcee1 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -171,6 +171,7 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); // Store the IP address int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + memset(addrs+found, '\0', sizeof(*addrs)); memcpy(addrs+found, interface->ifa_addr, salen); found++; } @@ -905,9 +906,17 @@ ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) { return ncclSuccess; } -ncclResult_t ncclSocketClose(struct ncclSocket* sock) { +ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) { if (sock != NULL) { if (sock->state > ncclSocketStateNone && sock->state < ncclSocketStateNum && sock->fd >= 0) { + if (wait) { + char data; + int closed = 0; + do { + int offset = 0; + if (ncclSocketProgress(NCCL_SOCKET_RECV, sock, &data, sizeof(char), &offset, &closed) != ncclSuccess) break; + } while (closed == 0); + } /* shutdown() is needed to send FIN packet to proxy thread; shutdown() is not affected * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 61b0e4b5b..e6cce9807 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -9,28 +9,61 @@ #include "checks.h" #include "param.h" -// Tracks the chain of graph nodes for a given graph captured identified by -// its graph id. This state has to live for as long as captured work is being -// submitted. CUDA doesn't have mechanism to inform us when the user ends capture -// so the best we can do is get notified when the graph is destroyed. -struct ncclStrongStreamGraph { - struct ncclStrongStreamGraph* next; - // Atomically exchanged to false by both the main thread or the graph destructor - // callback. The last to arrive deletes the node. - bool alive; +// Tracks the captured work a given graph captured identified by its graph id. 
+struct ncclStrongStreamCapture { + struct ncclStrongStreamCapture* next; + cudaGraph_t graph; unsigned long long graphId; - // For each graph we track the "tip" of the chain of graph nodes. A linear - // chain would always have just one node at its tip, but since we have to merge - // in chains from other streams (via ncclStrongStreamWaitStream) some spots - // in the chain can be wider than a single node and thus need a list, so we - // maintain a dynamically sized array of tip nodes. - int tipCount, tipCapacity; - cudaGraphNode_t* tipNodes; + cudaStream_t captureStream; + cudaGraphNode_t lastRecord; + void* acquiredBy; }; -static void ncclStrongStreamGraphDelete(struct ncclStrongStreamGraph* g) { - free(g->tipNodes); - free(g); +//////////////////////////////////////////////////////////////////////////////// + +static ncclCudaContext* cxtListHead = nullptr; +static pthread_mutex_t cxtListLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { + ncclResult_t result = ncclSuccess; + CUcontext hcontext; + CUCHECK(cuCtxGetCurrent(&hcontext)); + + pthread_mutex_lock(&cxtListLock); + struct ncclCudaContext* p = cxtListHead; + while (1) { + if (p == nullptr) { + p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext)); + p->refCount = 1; + p->hcontext = hcontext; + p->next = cxtListHead; + cxtListHead = p; + NCCLCHECKGOTO(ncclStrongStreamConstruct(&p->launchOrder), result, leave); + break; + } + if (p->hcontext == hcontext) { + p->refCount += 1; + break; + } + p = p->next; + } +leave: + pthread_mutex_unlock(&cxtListLock); + *out = p; + return ncclSuccess; +} + +void ncclCudaContextDrop(struct ncclCudaContext* cxt) { + pthread_mutex_lock(&cxtListLock); + if (0 == --cxt->refCount) { + struct ncclCudaContext** pp = &cxtListHead; + while (*pp != cxt) pp = &(*pp)->next; + *pp = cxt->next; // remove from list + // Destroy resources held in cxt + ncclStrongStreamDestruct(&cxt->launchOrder); + free(cxt); + } + pthread_mutex_unlock(&cxtListLock); } //////////////////////////////////////////////////////////////////////////////// @@ -43,9 +76,9 @@ ncclResult_t ncclCudaGetCapturingGraph( NCCLCHECK(ncclCudaDriverVersion(&driver)); if (CUDART_VERSION < 11030 || driver < 11030) { cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, &gid)); + CUDACHECK(cudaStreamGetCaptureInfo(stream, &status, nullptr)); #if CUDART_VERSION >= 11030 + graph->origin = nullptr; graph->graph = nullptr; graph->graphId = ULLONG_MAX; #endif @@ -56,13 +89,14 @@ ncclResult_t ncclCudaGetCapturingGraph( } else { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus status; - unsigned long long gid; - CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &gid, &graph->graph, nullptr, nullptr)); + CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr)); if (status != cudaStreamCaptureStatusActive) { + graph->origin = nullptr; graph->graph = nullptr; - gid = ULLONG_MAX; + graph->graphId = ULLONG_MAX; + } else { + graph->origin = stream; } - graph->graphId = gid; #endif } #endif @@ -86,315 +120,218 @@ ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t //////////////////////////////////////////////////////////////////////////////// ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamCreateWithFlags(&ss->cudaStream, cudaStreamNonBlocking)); + CUDACHECK(cudaStreamCreateWithFlags(&ss->liveStream, 
cudaStreamNonBlocking)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); ss->everCaptured = false; - ss->serialEventNeedsRecord = false; - ss->graphHead = nullptr; - #else - CUDACHECK(cudaEventCreateWithFlags(&ss->scratchEvent, cudaEventDisableTiming)); + ss->captureHead = nullptr; + pthread_mutex_init(&ss->lock, nullptr); + CUDACHECK(cudaEventCreateWithFlags(&ss->serialEvent, cudaEventDisableTiming)); #endif return ncclSuccess; } -static void graphDestructor(void* arg) { - struct ncclStrongStreamGraph* g = (struct ncclStrongStreamGraph*)arg; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } -} - ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss) { - CUDACHECK(cudaStreamDestroy(ss->cudaStream)); + CUDACHECK(cudaStreamDestroy(ss->liveStream)); #if CUDART_VERSION >= 11030 - CUDACHECK(cudaEventDestroy(ss->serialEvent)); - // Delete list of per-graph chains. - struct ncclStrongStreamGraph* g = ss->graphHead; - while (g != nullptr) { - struct ncclStrongStreamGraph* next = g->next; - if (false == __atomic_exchange_n(&g->alive, false, __ATOMIC_ACQ_REL)) { - // Last to arrive deletes list node. - ncclStrongStreamGraphDelete(g); - } - g = next; + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap) { + struct ncclStrongStreamCapture* next = cap->next; + CUDACHECK(cudaStreamDestroy(cap->captureStream)); + free(cap); + cap = next; } - #else - CUDACHECK(cudaEventDestroy(ss->scratchEvent)); + CUDACHECK(cudaEventDestroy(ss->serialEvent)); + pthread_mutex_destroy(&ss->lock); #endif return ncclSuccess; } NCCL_PARAM(GraphMixingSupport, "GRAPH_MIXING_SUPPORT", 1) +NCCL_PARAM(LaunchRaceFatal, "LAUNCH_RACE_FATAL", 1); +constexpr char const* launchRaceFatalMsg = "Fatal: host threads racing to launch NCCL on same device."; -static void ensureTips(struct ncclStrongStreamGraph* g, int n) { - if (g->tipCapacity < n) { - g->tipNodes = (cudaGraphNode_t*)realloc(g->tipNodes, n*sizeof(cudaGraphNode_t)); - g->tipCapacity = n; - } -} +static __thread char threadIdMarker; +static void* localThreadId() { return &threadIdMarker; } ncclResult_t ncclStrongStreamAcquire( - struct ncclCudaGraph graph, struct ncclStrongStream* ss + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream ) { #if CUDART_VERSION >= 11030 bool mixing = ncclParamGraphMixingSupport(); - if (graph.graph == nullptr) { - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; + ss->liveAcquiredBy = localThreadId(); + if (mixing && __atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); } } else { - ss->everCaptured = true; - // Find the current graph in our list of graphs if it exists. - struct ncclStrongStreamGraph** pg = &ss->graphHead; - struct ncclStrongStreamGraph* g; - while (*pg != nullptr) { - g = *pg; - if (g->graphId == graph.graphId) { - // Move to front of list so that operations after acquire don't have to search the list. 
- *pg = g->next; - g->next = ss->graphHead; - ss->graphHead = g; + bool firstCapture = !ss->everCaptured; + __atomic_store_n(&ss->everCaptured, true, __ATOMIC_RELAXED); + + ncclResult_t ret = ncclSuccess; + if (concurrent) pthread_mutex_lock(&ss->lock); + + // Look for capture in our list of active captures. + struct ncclStrongStreamCapture** pcap = &ss->captureHead; + struct ncclStrongStreamCapture* cap; + struct ncclStrongStreamCapture* spare = nullptr; + while (*pcap != nullptr) { + cap = *pcap; + if (cap->graphId == graph.graphId) { // Capture node already exists. + *workStream = cap->captureStream; + cap->acquiredBy = localThreadId(); + if (concurrent) pthread_mutex_unlock(&ss->lock); return ncclSuccess; - } else if (false == __atomic_load_n(&g->alive, __ATOMIC_ACQUIRE)) { - // Unrelated graph that has been destroyed. Remove and delete. - *pg = g->next; - ncclStrongStreamGraphDelete(g); } else { - pg = &g->next; + cudaStreamCaptureStatus status; + CUDACHECKGOTO(cudaStreamIsCapturing(cap->captureStream, &status), ret, do_unlock); + if (status == cudaStreamCaptureStatusActive) { + pcap = &cap->next; // Active capture doesn't match, on to next. + } else { // Capture no longer active + *pcap = cap->next; // Remove from current list + if (spare == nullptr) { // Keep one spare to reuse below. + spare = cap; + } else { + cudaStreamDestroy(cap->captureStream); + free(cap); + } + } } } - - // This is a new graph so add to the list. - g = (struct ncclStrongStreamGraph*)malloc(sizeof(struct ncclStrongStreamGraph)); - g->graphId = graph.graphId; - g->tipNodes = nullptr; - g->tipCapacity = 0; - g->tipCount = 0; - g->next = ss->graphHead; - ss->graphHead = g; - g->alive = true; - NCCLCHECK(ncclCudaGraphAddDestructor(graph, graphDestructor, (void*)g)); - - if (mixing && ss->serialEventNeedsRecord) { - // Can only be here if previous release was for uncaptured work that - // elided updating the event because no capture had yet occurred. - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); + // No matching capture, need a new entry. + cap = spare; + if (cap == nullptr) { + cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture)); + CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock); } - ss->serialEventNeedsRecord = false; + cap->graphId = graph.graphId; + cap->lastRecord = nullptr; + cap->acquiredBy = localThreadId(); + // Push to capturing list. + cap->next = ss->captureHead; + ss->captureHead = cap; - // First node in the chain must be a wait on the serialEvent. - if (mixing) { - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventWaitNode(&g->tipNodes[0], graph.graph, nullptr, 0, ss->serialEvent)); - g->tipCount = 1; - } else { - g->tipCount = 0; - } - } - #endif - return ncclSuccess; -} + do_unlock: + if (concurrent) pthread_mutex_unlock(&ss->lock); + if (ret != ncclSuccess) return ret; -ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss) { - #if CUDART_VERSION >= 11030 - bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->everCaptured) { - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - } - ss->serialEventNeedsRecord = true; // Assume the caller is going to add work to stream. 
- #endif - return ncclSuccess; -} + *workStream = cap->captureStream; -static ncclResult_t checkGraphId(struct ncclStrongStreamGraph* g, unsigned long long id) { - if (g == nullptr || g->graphId != id) { - WARN("Expected graph id=%llu was not at head of strong stream's internal list.", id); - return ncclInternalError; - } - return ncclSuccess; -} + // Bring captureStream into the graph but without any dependencies. + cudaEvent_t scratch; + CUDACHECK(cudaEventCreateWithFlags(&scratch, cudaEventDisableTiming)); + CUDACHECK(cudaEventRecord(scratch, graph.origin)); + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0)); + CUDACHECK(cudaEventDestroy(scratch)); + CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies)); -ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss) { - #if CUDART_VERSION >= 11030 - bool mixing = ncclParamGraphMixingSupport(); - if (mixing && ss->serialEventNeedsRecord) { - if (graph.graph == nullptr) { - if (ss->everCaptured) { - CUDACHECK(cudaEventRecord(ss->serialEvent, ss->cudaStream)); - ss->serialEventNeedsRecord = false; - } - } else { - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddEventRecordNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, ss->serialEvent)); - g->tipCount = 1; - ss->serialEventNeedsRecord = false; + if (mixing && firstCapture) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } + if (mixing) { + // First dependency is to wait on serialEvent + CUDACHECK(cudaStreamWaitEvent(cap->captureStream, ss->serialEvent, cudaEventWaitExternal)); } } #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamLaunchHost( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, cudaHostFn_t fn, void* arg +ncclResult_t ncclStrongStreamAcquiredWorkStream( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent, + cudaStream_t* workStream ) { #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); + if (graph.graphId == ULLONG_MAX) { + *workStream = ss->liveStream; } else { - cudaHostNodeParams p; - p.fn = fn; - p.userData = arg; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddHostNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + *workStream = cap->captureStream; + if (concurrent) pthread_mutex_unlock(&ss->lock); } - ss->serialEventNeedsRecord = true; #else - CUDACHECK(cudaLaunchHostFunc(ss->cudaStream, fn, arg)); + *workStream = ss->liveStream #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamLaunchKernel( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - void* fn, dim3 grid, dim3 block, void* args[], size_t sharedMemBytes +ncclResult_t ncclStrongStreamRelease( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, bool concurrent ) { #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - } else { - cudaKernelNodeParams p; - p.func = fn; - p.gridDim = grid; - p.blockDim = block; - p.kernelParams = args; - p.sharedMemBytes = sharedMemBytes; - p.extra = 
nullptr; - struct ncclStrongStreamGraph* g = ss->graphHead; - NCCLCHECK(checkGraphId(g, graph.graphId)); - ensureTips(g, 1); - CUDACHECK(cudaGraphAddKernelNode(&g->tipNodes[0], graph.graph, g->tipNodes, g->tipCount, &p)); - g->tipCount = 1; - } - ss->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaLaunchKernel(fn, grid, block, args, sharedMemBytes, ss->cudaStream)); - #endif - return ncclSuccess; -} + bool mixing = ncclParamGraphMixingSupport(); + if (mixing) { + if (graph.graphId == ULLONG_MAX) { + if (__atomic_load_n(&ss->everCaptured, __ATOMIC_RELAXED)) { + CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); + } + if (ss->liveAcquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; + } + } else { + if (concurrent) pthread_mutex_lock(&ss->lock); + struct ncclStrongStreamCapture* cap = ss->captureHead; + while (cap->graphId != graph.graphId) cap = cap->next; + if (concurrent) pthread_mutex_unlock(&ss->lock); -// Merge node list `b` into list `a` but don't add duplicates. -static void mergeTips(struct ncclStrongStreamGraph* a, cudaGraphNode_t const* bNodes, int bn) { - int an = a->tipCount; - ensureTips(a, an + bn); - for (int bi=0; bi < bn; bi++) { - for (int ai=0; ai < an; ai++) { - if (a->tipNodes[ai] == bNodes[bi]) goto next_b; - } - a->tipNodes[a->tipCount++] = bNodes[bi]; - next_b:; - } -} + // Add event record node with dependencies added further down. + cudaGraphNode_t recordNode; + CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent)); -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bg->tipNodes, bg->tipCount); - } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, b->scratchEvent, 0)); - #endif - return ncclSuccess; -} + // Make this record order after previous record on this stream. + if (cap->lastRecord != nullptr) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1)); + } + cap->lastRecord = recordNode; -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - // It is ok to use a->serialEvent to record b since we'll be setting - // a->serialEventNeedsRecord so the event won't be considered accurate - // until re-recorded. 
- CUDACHECK(cudaEventRecord(a->serialEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->serialEvent, 0)); - } else { - cudaStreamCaptureStatus status; - unsigned long long bGraphId; - cudaGraphNode_t const* bNodes; - size_t bCount = 0; - CUDACHECK(cudaStreamGetCaptureInfo_v2(b, &status, &bGraphId, nullptr, &bNodes, &bCount)); - if (status != cudaStreamCaptureStatusActive || graph.graphId != bGraphId) { - WARN("Stream is not being captured by the expected graph."); - return ncclInvalidUsage; + // Get current nodes from work stream so we can add them as dependencies. + cudaStreamCaptureStatus status; + cudaGraphNode_t const* nodes; + size_t count = 0; + cudaError_t res = cudaStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count); + + #if CUDART_VERSION >= 12030 + if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. + cudaGraphEdgeData const* edges; + CUDACHECK(cudaStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, &edges, &count)); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, &edges[i], 1)); + } + } + #else + if (false) {} + #endif + else { + CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); + for (int i=0; i < (int)count; i++) { + CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1)); + } + } + + if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { + WARN("%s", launchRaceFatalMsg); + return ncclInvalidUsage; + } } - struct ncclStrongStreamGraph* ag = a->graphHead; - NCCLCHECK(checkGraphId(ag, graph.graphId)); - if (b_subsumes_a) ag->tipCount = 0; - mergeTips(ag, bNodes, bCount); } - a->serialEventNeedsRecord = true; - #else - CUDACHECK(cudaEventRecord(a->scratchEvent, b)); - CUDACHECK(cudaStreamWaitEvent(a->cudaStream, a->scratchEvent, 0)); #endif return ncclSuccess; } -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, - bool b_subsumes_a - ) { - #if CUDART_VERSION >= 11030 - if (graph.graph == nullptr) { - if (b->serialEventNeedsRecord) { - b->serialEventNeedsRecord = false; - CUDACHECK(cudaEventRecord(b->serialEvent, b->cudaStream)); - } - CUDACHECK(cudaStreamWaitEvent(a, b->serialEvent, 0)); - } else { - struct ncclStrongStreamGraph* bg = b->graphHead; - NCCLCHECK(checkGraphId(bg, graph.graphId)); - CUDACHECK(cudaStreamUpdateCaptureDependencies(a, bg->tipNodes, bg->tipCount, - b_subsumes_a ? 
cudaStreamSetCaptureDependencies : cudaStreamAddCaptureDependencies - )); - } - #else - CUDACHECK(cudaEventRecord(b->scratchEvent, b->cudaStream)); - CUDACHECK(cudaStreamWaitEvent(a, b->scratchEvent, 0)); - #endif +ncclResult_t ncclStreamWaitStream(cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent) { + CUDACHECK(cudaEventRecord(scratchEvent, b)); + CUDACHECK(cudaStreamWaitEvent(a, scratchEvent, 0)); return ncclSuccess; } ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 - CUDACHECK(cudaStreamWaitEvent(ss->cudaStream, ss->serialEvent, 0)); - ss->serialEventNeedsRecord = false; + CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); #endif - CUDACHECK(cudaStreamSynchronize(ss->cudaStream)); + CUDACHECK(cudaStreamSynchronize(ss->liveStream)); return ncclSuccess; } diff --git a/src/misc/tuner.cc b/src/misc/tuner.cc deleted file mode 100644 index 267e12a03..000000000 --- a/src/misc/tuner.cc +++ /dev/null @@ -1,267 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include -#include -#include - -#include "checks.h" -#include "debug.h" -#include "tuner.h" - -pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int tunerPluginRefCount; -static void* tunerPluginLib = nullptr; -static ncclTuner_v4_t* tunerSymbol = nullptr; -static ncclTuner_v3_t* ncclTuner_v3 = nullptr; -static ncclTuner_v2_t* ncclTuner_v2 = nullptr; -static ncclTuner_v4_t ncclTuner_v2_as_v4; -static ncclTuner_v4_t ncclTuner_v3_as_v4; - -static int hasNvlsSupport(float** collCostTable) { - // Requirements for support of different algorithms: - // - // - NVLS intra-node: nvlsSupport - // - NVLS intra+inter-node: collNetSupport - // - NVLSTree intra-node: always disabled - // - NVLSTree inter-node: nvlsSupport - // - Collnet* inter-node: collNetSupport - // - // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; -} - -static int hasCollNetSupport(float** collCostTable) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 
0 : 1; -} - -static ncclResult_t ncclTuner_v3_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { - NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v3_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - ncclTuner_v3_as_v4.getCollInfo = ncclTuner_v3_as_v4_getCollInfo; - ncclTuner_v3_as_v4.destroy = ncclTuner_v3->destroy; - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { - int algorithm = NCCL_ALGO_UNDEF; - int protocol = NCCL_PROTO_UNDEF; - int nvlsSupport = hasNvlsSupport(collCostTable); - int collNetSupport = hasCollNetSupport(collCostTable); - NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); - // set time to 0 below to make sure this algorithm/protocol is selected later on - if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; - } - return ncclSuccess; -} - -static ncclResult_t ncclTuner_v2_as_v4_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) { - NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logFunction, context)); - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - ncclTuner_v2_as_v4.getCollInfo = ncclTuner_v2_as_v4_getCollInfo; - ncclTuner_v2_as_v4.destroy = ncclTuner_v2->destroy; - return ncclSuccess; -} - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(const char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: %s", openErrStr); - return nameList; -} - -static void* openTunerPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char tunerPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envTunerPluginName = getenv("NCCL_TUNER_PLUGIN"); - if (envTunerPluginName && strlen(envTunerPluginName)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: NCCL_TUNER_PLUGIN set to %s", envTunerPluginName); - snprintf(tunerPluginLibName, PATH_MAX, "%s", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner-%s.so", envTunerPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-tuner.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - // Users are allowed to pack tuner into the net plugin - snprintf(tunerPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Plugin name set by env to %s", tunerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } else { - snprintf(tunerPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(tunerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, tunerPluginLibName); - } - tunerPluginLibName[0] = '\0'; - return nullptr; -} - -enum { - tunerPluginLoadFailed = -1, - tunerPluginLoadReady = 0, - tunerPluginLoadSuccess = 1, -}; - -#define MAX_PLUGIN_LOAD 4 - -static int status = tunerPluginLoadReady; - -ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { - // Initialize to nullptr by default 
if plugin tuner cannot be loaded. - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - comm->tuner = nullptr; - if (tunerPluginLoadFailed == status) { - return ncclSuccess; - } - - pthread_mutex_lock(&tunerPluginLock); - if (tunerPluginLoadFailed == status) { - goto exit; - } - - if (tunerPluginLoadSuccess == status) { - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - goto exit; - } - - tunerPluginLib = openTunerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (nullptr == tunerPluginLib) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Could not find:%s. Using internal tuner plugin.", couldNotFindNames); - } else { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using internal tuner plugin."); - } - goto fail; - } - - tunerSymbol = (ncclTuner_v4_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v4"); - if (tunerSymbol == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); - ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v3"); - if (ncclTuner_v3 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); - ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(tunerPluginLib, "ncclTunerPlugin_v2"); - if (ncclTuner_v2 == nullptr) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); - dlclose(tunerPluginLib); - goto fail; - } else { - ncclTuner_v2_as_v4.init = ncclTuner_v2_as_v4_init; - ncclTuner_v2_as_v4.name = ncclTuner_v2->name; - tunerSymbol = &ncclTuner_v2_as_v4; - } - } else { - ncclTuner_v3_as_v4.init = ncclTuner_v3_as_v4_init; - ncclTuner_v3_as_v4.name = ncclTuner_v3->name; - tunerSymbol = &ncclTuner_v3_as_v4; - } - } - - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", tunerSymbol->name); - comm->tuner = tunerSymbol; - ++tunerPluginRefCount; - status = tunerPluginLoadSuccess; - comm->tunerPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -fail: - tunerPluginLib = nullptr; - status = tunerPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&tunerPluginLock); - if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { - INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); - dlclose(tunerPluginLib); - tunerPluginLib = nullptr; - tunerSymbol = nullptr; - comm->tuner = nullptr; - status = tunerPluginLoadReady; - comm->tunerPluginLoaded = 0; - } - pthread_mutex_unlock(&tunerPluginLock); - return ncclSuccess; -} diff --git a/src/nccl.h.in b/src/nccl.h.in index 8a6f94e24..f3ab5344f 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -66,6 +66,7 @@ typedef struct ncclConfig_v21700 { int maxCTAs; const char *netName; int splitShare; + int trafficClass; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -79,7 +80,8 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \ NCCL_CONFIG_UNDEF_PTR, /* netName */ \ - NCCL_CONFIG_UNDEF_INT /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* splitShare */ \ + NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. 
*/ diff --git a/src/net.cc b/src/net.cc deleted file mode 100644 index 13e8c2b51..000000000 --- a/src/net.cc +++ /dev/null @@ -1,1033 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "net.h" -#include "bootstrap.h" -#include "checks.h" - -#include -#include -#include -//#include -//#include -//#include - -static ncclNet_v9_t ncclNet_v5_as_v9; -static ncclNet_v9_t ncclNet_v6_as_v9; -static ncclNet_v9_t ncclNet_v7_as_v9; -static ncclNet_v9_t ncclNet_v8_as_v9; -static ncclNet_v5_t *ncclNet_v5; -static ncclNet_v6_t *ncclNet_v6; -static ncclNet_v7_t *ncclNet_v7; -static ncclNet_v8_t *ncclNet_v8; -static ncclCollNet_v9_t ncclCollNet_v5_as_v9; -static ncclCollNet_v9_t ncclCollNet_v6_as_v9; -static ncclCollNet_v9_t ncclCollNet_v7_as_v9; -static ncclCollNet_v9_t ncclCollNet_v8_as_v9; -static ncclCollNet_v5_t *ncclCollNet_v5; -static ncclCollNet_v6_t *ncclCollNet_v6; -static ncclCollNet_v7_t *ncclCollNet_v7; -static ncclCollNet_v8_t *ncclCollNet_v8; - -#define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. -#define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collent plugins when size was not dynamically queried - -static ncclResult_t ncclNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = p8.netDeviceType; - props->netDeviceVersion = p8.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v8_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v8->init(logfn)); - ncclNet_v8_as_v9.name = ncclNet_v8->name; - ncclNet_v8_as_v9.devices = ncclNet_v8->devices; - ncclNet_v8_as_v9.getProperties = ncclNet_v8_as_v9_getProperties; - ncclNet_v8_as_v9.listen = ncclNet_v8->listen; - ncclNet_v8_as_v9.connect = ncclNet_v8->connect; - ncclNet_v8_as_v9.accept = ncclNet_v8->accept; - ncclNet_v8_as_v9.regMr = ncclNet_v8->regMr; - 
ncclNet_v8_as_v9.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; - ncclNet_v8_as_v9.deregMr = ncclNet_v8->deregMr; - ncclNet_v8_as_v9.isend = ncclNet_v8_as_v9_isend; - ncclNet_v8_as_v9.irecv = ncclNet_v8_as_v9_irecv; - ncclNet_v8_as_v9.iflush = ncclNet_v8->iflush; - ncclNet_v8_as_v9.test = ncclNet_v8->test; - ncclNet_v8_as_v9.closeSend = ncclNet_v8->closeSend; - ncclNet_v8_as_v9.closeRecv = ncclNet_v8->closeRecv; - ncclNet_v8_as_v9.closeListen = ncclNet_v8->closeListen; - ncclNet_v8_as_v9.getDeviceMr = ncclNet_v8->getDeviceMr; - ncclNet_v8_as_v9.irecvConsumed = ncclNet_v8->irecvConsumed; - ncclNet_v8_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = p7.netDeviceType; - props->netDeviceVersion = p7.netDeviceVersion; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v7_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v7->init(logfn)); - ncclNet_v7_as_v9.name = ncclNet_v7->name; - ncclNet_v7_as_v9.devices = ncclNet_v7->devices; - ncclNet_v7_as_v9.getProperties = ncclNet_v7_as_v9_getProperties; // ncclNet_v5->getProperties; - ncclNet_v7_as_v9.listen = ncclNet_v7->listen; - ncclNet_v7_as_v9.connect = ncclNet_v7->connect; - ncclNet_v7_as_v9.accept = ncclNet_v7->accept; - ncclNet_v7_as_v9.regMr = ncclNet_v7_as_v9_regMr; - ncclNet_v7_as_v9.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; - ncclNet_v7_as_v9.deregMr = ncclNet_v7->deregMr; - ncclNet_v7_as_v9.isend = ncclNet_v7_as_v9_isend; - ncclNet_v7_as_v9.irecv = ncclNet_v7_as_v9_irecv; - ncclNet_v7_as_v9.iflush = ncclNet_v7->iflush; - ncclNet_v7_as_v9.test = ncclNet_v7->test; - ncclNet_v7_as_v9.closeSend = ncclNet_v7->closeSend; - ncclNet_v7_as_v9.closeRecv = ncclNet_v7->closeRecv; - ncclNet_v7_as_v9.closeListen = ncclNet_v7->closeListen; - ncclNet_v7_as_v9.getDeviceMr = ncclNet_v7->getDeviceMr; - 
ncclNet_v7_as_v9.irecvConsumed = ncclNet_v7->irecvConsumed; - ncclNet_v7_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v6_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v6->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v6_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v6->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v6_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -static ncclResult_t ncclNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v6->init(logfn)); - ncclNet_v6_as_v9.name = ncclNet_v6->name; - ncclNet_v6_as_v9.devices = ncclNet_v6->devices; - ncclNet_v6_as_v9.getProperties = ncclNet_v6_as_v9_getProperties; - ncclNet_v6_as_v9.listen = ncclNet_v6->listen; - ncclNet_v6_as_v9.connect = ncclNet_v6_as_v9_connect; - ncclNet_v6_as_v9.accept = ncclNet_v6_as_v9_accept; - ncclNet_v6_as_v9.regMr = ncclNet_v6_as_v9_regMr; - ncclNet_v6_as_v9.regMrDmaBuf = ncclNet_v6->regMrDmaBuf; - ncclNet_v6_as_v9.deregMr = ncclNet_v6->deregMr; - ncclNet_v6_as_v9.isend = ncclNet_v6_as_v9_isend; - ncclNet_v6_as_v9.irecv = ncclNet_v6_as_v9_irecv; - ncclNet_v6_as_v9.iflush = ncclNet_v6->iflush; - ncclNet_v6_as_v9.test = ncclNet_v6->test; - ncclNet_v6_as_v9.closeSend = ncclNet_v6->closeSend; - ncclNet_v6_as_v9.closeRecv = ncclNet_v6->closeRecv; - ncclNet_v6_as_v9.closeListen = ncclNet_v6->closeListen; - ncclNet_v6_as_v9.getDeviceMr = NULL; - ncclNet_v6_as_v9.irecvConsumed = NULL; - ncclNet_v6_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* 
props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclNet_v5_as_v9_connect(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { - return ncclNet_v5->connect(dev, handle, sendComm); -} - -static ncclResult_t ncclNet_v5_as_v9_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { - return ncclNet_v5->accept(listenComm, recvComm); -} - -static ncclResult_t ncclNet_v5_as_v9_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { - int sizeInt; - if (size > MAX_NET_SIZE) return ncclInternalError; - sizeInt = (int)size; - ncclResult_t ans = ncclNet_v5->isend(sendComm, data, sizeInt, tag, mhandle, request); - return ans; -} - -static ncclResult_t ncclNet_v5_as_v9_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { - int sizesInt[NCCL_PROXY_MAX_SUBS]; - //reset to NULL if optional receive completion is set - if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = NULL; - for (int i=0; i MAX_NET_SIZE) return ncclInternalError; - sizesInt[i] = (int) sizes[i]; - } - ncclResult_t ans = ncclNet_v5->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclNet_v5->init(logfn)); - ncclNet_v5_as_v9.name = ncclNet_v5->name; - ncclNet_v5_as_v9.devices = ncclNet_v5->devices; - ncclNet_v5_as_v9.getProperties = ncclNet_v5_as_v9_getProperties; - ncclNet_v5_as_v9.listen = ncclNet_v5->listen; - ncclNet_v5_as_v9.connect = ncclNet_v5_as_v9_connect; - ncclNet_v5_as_v9.accept = ncclNet_v5_as_v9_accept; - ncclNet_v5_as_v9.regMr = ncclNet_v5_as_v9_regMr; - ncclNet_v5_as_v9.regMrDmaBuf = NULL; - ncclNet_v5_as_v9.deregMr = ncclNet_v5->deregMr; - ncclNet_v5_as_v9.isend = ncclNet_v5_as_v9_isend; - ncclNet_v5_as_v9.irecv = ncclNet_v5_as_v9_irecv; - ncclNet_v5_as_v9.iflush = ncclNet_v5->iflush; - ncclNet_v5_as_v9.test = ncclNet_v5->test; - ncclNet_v5_as_v9.closeSend = ncclNet_v5->closeSend; - ncclNet_v5_as_v9.closeRecv = ncclNet_v5->closeRecv; - ncclNet_v5_as_v9.closeListen = ncclNet_v5->closeListen; - ncclNet_v5_as_v9.getDeviceMr = NULL; - ncclNet_v5_as_v9.irecvConsumed = NULL; - ncclNet_v5_as_v9.makeVDevice = NULL; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v5->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v5_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v5->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v5_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v5->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v5 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v5_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v5->init(logfn)); - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - ncclCollNet_v5_as_v9.devices = ncclCollNet_v5->devices; - ncclCollNet_v5_as_v9.getProperties = ncclCollNet_v5_as_v9_getProperties; - ncclCollNet_v5_as_v9.listen = ncclCollNet_v5->listen; - ncclCollNet_v5_as_v9.connect = ncclCollNet_v5->connect; - ncclCollNet_v5_as_v9.reduceSupport = ncclCollNet_v5->reduceSupport; - ncclCollNet_v5_as_v9.regMr = ncclCollNet_v5_as_v9_regMr; - ncclCollNet_v5_as_v9.regMrDmaBuf = NULL; - ncclCollNet_v5_as_v9.deregMr = ncclCollNet_v5->deregMr; - ncclCollNet_v5_as_v9.iallreduce = ncclCollNet_v5_as_v9_iallreduce; - ncclCollNet_v5_as_v9.iallgather = nullptr; - ncclCollNet_v5_as_v9.ireducescatter = nullptr; - ncclCollNet_v5_as_v9.iflush = ncclCollNet_v5->iflush; - ncclCollNet_v5_as_v9.test = ncclCollNet_v5->test; - ncclCollNet_v5_as_v9.closeColl = ncclCollNet_v5->closeColl; - ncclCollNet_v5_as_v9.closeListen = ncclCollNet_v5->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v6_t p6; - ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); - if (ans != ncclSuccess) return ans; - props->name = p6.name; - props->pciPath = p6.pciPath; - props->guid = p6.guid; - props->ptrSupport = p6.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p6.speed; - props->port = p6.port; - props->maxComms = p6.maxComms; - props->maxRecvs = p6.maxRecvs; - props->latency = p6.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v6_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v6_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v6 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v6_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v6->init(logfn)); - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - ncclCollNet_v6_as_v9.devices = ncclCollNet_v6->devices; - ncclCollNet_v6_as_v9.getProperties = ncclCollNet_v6_as_v9_getProperties; - ncclCollNet_v6_as_v9.listen = ncclCollNet_v6->listen; - ncclCollNet_v6_as_v9.connect = ncclCollNet_v6->connect; - ncclCollNet_v6_as_v9.reduceSupport = ncclCollNet_v6->reduceSupport; - ncclCollNet_v6_as_v9.regMr = ncclCollNet_v6_as_v9_regMr; - ncclCollNet_v6_as_v9.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf; - ncclCollNet_v6_as_v9.deregMr = ncclCollNet_v6->deregMr; - ncclCollNet_v6_as_v9.iallreduce = ncclCollNet_v6_as_v9_iallreduce; - ncclCollNet_v6_as_v9.iallgather = nullptr; - ncclCollNet_v6_as_v9.ireducescatter = nullptr; - ncclCollNet_v6_as_v9.iflush = ncclCollNet_v6->iflush; - ncclCollNet_v6_as_v9.test = ncclCollNet_v6->test; - ncclCollNet_v6_as_v9.closeColl = ncclCollNet_v6->closeColl; - ncclCollNet_v6_as_v9.closeListen = ncclCollNet_v6->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v7_t p7; - ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); - if (ans != ncclSuccess) return ans; - props->name = p7.name; - props->pciPath = p7.pciPath; - props->guid = p7.guid; - props->ptrSupport = p7.ptrSupport; - props->regIsGlobal = 0; - props->forceFlush = 0; - props->speed = p7.speed; - props->port = p7.port; - props->maxComms = p7.maxComms; - props->maxRecvs = p7.maxRecvs; - props->latency = p7.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v7_as_v9_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { - if (size >= 1UL<<31) return ncclInternalError; - return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); -} - -static ncclResult_t ncclCollNet_v7_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -// We use a wrapper around the v7 init to copy over the struct contents -// post-init since they may not be initialized before hand. 
-static ncclResult_t ncclCollNet_v7_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v7->init(logfn)); - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - ncclCollNet_v7_as_v9.devices = ncclCollNet_v7->devices; - ncclCollNet_v7_as_v9.getProperties = ncclCollNet_v7_as_v9_getProperties; - ncclCollNet_v7_as_v9.listen = ncclCollNet_v7->listen; - ncclCollNet_v7_as_v9.connect = ncclCollNet_v7->connect; - ncclCollNet_v7_as_v9.reduceSupport = ncclCollNet_v7->reduceSupport; - ncclCollNet_v7_as_v9.regMr = ncclCollNet_v7_as_v9_regMr; - ncclCollNet_v7_as_v9.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; - ncclCollNet_v7_as_v9.deregMr = ncclCollNet_v7->deregMr; - ncclCollNet_v7_as_v9.iallreduce = ncclCollNet_v7_as_v9_iallreduce; - ncclCollNet_v7_as_v9.iallgather = nullptr; - ncclCollNet_v7_as_v9.ireducescatter = nullptr; - ncclCollNet_v7_as_v9.iflush = ncclCollNet_v7->iflush; - ncclCollNet_v7_as_v9.test = ncclCollNet_v7->test; - ncclCollNet_v7_as_v9.closeColl = ncclCollNet_v7->closeColl; - ncclCollNet_v7_as_v9.closeListen = ncclCollNet_v7->closeListen; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_getProperties(int dev, ncclNetProperties_v9_t* props) { - ncclNetProperties_v8_t p8; - ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8); - if (ans != ncclSuccess) return ans; - props->name = p8.name; - props->pciPath = p8.pciPath; - props->guid = p8.guid; - props->ptrSupport = p8.ptrSupport; - props->regIsGlobal = p8.regIsGlobal; - props->forceFlush = 0; - props->speed = p8.speed; - props->port = p8.port; - props->maxComms = p8.maxComms; - props->maxRecvs = p8.maxRecvs; - props->latency = p8.latency; - props->netDeviceType = NCCL_NET_DEVICE_HOST; - props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; - props->vProps.ndevs = 1; - props->vProps.devs[0] = dev; - props->maxP2pBytes = MAX_NET_SIZE; - props->maxCollBytes = MAX_COLLNET_SIZE; - return ncclSuccess; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - int countInt; - if (count > MAX_NET_SIZE) return ncclInternalError; - countInt = (int)count; - ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, - sendMhandle, recvMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v9_t* recvParts, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - void* sendMhandle, void** request) { - ncclNetSGE_v8_t recvPartsInt; - if (nRecvParts > 1) return ncclInternalError; - if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - recvPartsInt.mhandle = recvParts->mhandle; - recvPartsInt.address = recvParts->address; - recvPartsInt.size = (int)recvParts->size; - ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, - bytesPerRank, windowOffset, windowBytes, - sendMhandle, request); - return ans; -} - -static ncclResult_t ncclCollNet_v8_as_v9_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_v9_t* sendParts, void* recvData, - size_t bytesPerRank, size_t windowOffset, size_t windowBytes, - ncclDataType_t dataType, ncclRedOp_t redOp, - void* recvMhandle, void** request) { - ncclNetSGE_v8_t sendPartsInt; - if (nSendParts > 1) return ncclInternalError; - if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; - sendPartsInt.mhandle = 
sendParts->mhandle; - sendPartsInt.address = sendParts->address; - sendPartsInt.size = (int)sendParts->size; - ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, - recvData, bytesPerRank, windowOffset, windowBytes, - dataType, redOp, - recvMhandle, request); - return ans; -} - -// We use a wrapper around the v8 init to copy over the struct contents -// post-init since they may not be initialized before hand. -static ncclResult_t ncclCollNet_v8_as_v9_init(ncclDebugLogger_t logfn) { - NCCLCHECK(ncclCollNet_v8->init(logfn)); - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - ncclCollNet_v8_as_v9.devices = ncclCollNet_v8->devices; - ncclCollNet_v8_as_v9.getProperties = ncclCollNet_v8_as_v9_getProperties; - ncclCollNet_v8_as_v9.listen = ncclCollNet_v8->listen; - ncclCollNet_v8_as_v9.connect = ncclCollNet_v8->connect; - ncclCollNet_v8_as_v9.reduceSupport = ncclCollNet_v8->reduceSupport; - ncclCollNet_v8_as_v9.regMr = ncclCollNet_v8->regMr; - ncclCollNet_v8_as_v9.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; - ncclCollNet_v8_as_v9.deregMr = ncclCollNet_v8->deregMr; - ncclCollNet_v8_as_v9.iallreduce = ncclCollNet_v8_as_v9_iallreduce; - ncclCollNet_v8_as_v9.iallgather = ncclCollNet_v8_as_v9_iallgather; - ncclCollNet_v8_as_v9.ireducescatter = ncclCollNet_v8_as_v9_ireducescatter; - ncclCollNet_v8_as_v9.iflush = ncclCollNet_v8->iflush; - ncclCollNet_v8_as_v9.test = ncclCollNet_v8->test; - ncclCollNet_v8_as_v9.closeColl = ncclCollNet_v8->closeColl; - ncclCollNet_v8_as_v9.closeListen = ncclCollNet_v8->closeListen; - return ncclSuccess; -} - -static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; -ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; -enum ncclNetState { - ncclNetStateInit = 0, - ncclNetStateEnabled = 1, - ncclNetStateDisabled = 2 -}; -enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; - -#define MAX_STR_LEN 255 - -static void* tryOpenLib(char* name, int* err, char* errStr) { - *err = 0; - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = '\0'; - // "handle" and "name" won't be NULL at the same time. 
- // coverity[var_deref_model] - if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = ENOENT; - } - } - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: %s", openErrStr); - return nameList; -} - -static void* openNetPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char netPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - const char *envNetPluginName = getenv("NCCL_NET_PLUGIN"); - if (envNetPluginName && strlen(envNetPluginName)) { - snprintf(netPluginLibName, PATH_MAX, "%s", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - - snprintf(netPluginLibName, PATH_MAX, "libnccl-net-%s.so", envNetPluginName); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Plugin name set by env to %s", netPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } else { - snprintf(netPluginLibName, PATH_MAX, "libnccl-net.so"); - pluginLib = tryOpenLib(netPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, netPluginLibName); - } - return nullptr; -} - -static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; -static int netPluginRefCount; -static void* netPluginLib; - -enum { - netPluginLoadFailed = -1, - netPluginLoadReady = 0, - netPluginLoadSuccess = 1, -}; - -static int netPluginStatus = netPluginLoadReady; - -#define MAX_PLUGIN_LOAD 2 - -ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; - pthread_mutex_lock(&netPluginLock); - if (netPluginLoadFailed == netPluginStatus) { - goto exit; - } - if (netPluginLoadSuccess == netPluginStatus) { - ++netPluginRefCount; - goto exit; - } - - netPluginLib = openNetPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); - if (netPluginLib == nullptr) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Could not find:%s. 
Using internal network plugin.", couldNotFindNames); - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Using internal network plugin."); - } - goto fail; - } - - ncclNets[0] = (ncclNet_v9_t*)dlsym(netPluginLib, "ncclNetPlugin_v9"); - if (ncclNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); - ncclNet_v8 = (ncclNet_v8_t*)dlsym(netPluginLib, "ncclNetPlugin_v8"); - if (ncclNet_v8 == nullptr) { - // Try v7 plugin - ncclNet_v7 = (ncclNet_v7_t*)dlsym(netPluginLib, "ncclNetPlugin_v7"); - if (ncclNet_v7 == nullptr) { - // Try v6 plugin - ncclNet_v6 = (ncclNet_v6_t*)dlsym(netPluginLib, "ncclNetPlugin_v6"); - if (ncclNet_v6 == nullptr) { - // Try v5 plugin - ncclNet_v5 = (ncclNet_v5_t*)dlsym(netPluginLib, "ncclNetPlugin_v5"); - if (ncclNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin symbol (>= v5). ncclNetPlugin symbols v4 and lower are not supported."); - goto fail; - } else { - ncclNets[0] = &ncclNet_v5_as_v9; - ncclNet_v5_as_v9.init = ncclNet_v5_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v5_as_v9.name = ncclNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v5)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v6_as_v9; - ncclNet_v6_as_v9.init = ncclNet_v6_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v6_as_v9.name = ncclNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v7_as_v9; - ncclNet_v7_as_v9.init = ncclNet_v7_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v7_as_v9.name = ncclNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNets[0]->name); - } - } else { - ncclNets[0] = &ncclNet_v8_as_v9; - ncclNet_v8_as_v9.init = ncclNet_v8_as_v9_init; - // Set the name right away to allow for NCCL_NET=... to work - ncclNet_v8_as_v9.name = ncclNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNets[0]->name); - } - - // Check for CollNet - ncclCollNets[0] = (ncclCollNet_v9_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v9"); - if (ncclCollNets[0] == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); - ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v8"); - if (ncclCollNet_v8 == nullptr) { - ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v7"); - if (ncclCollNet_v7 == nullptr) { - ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v6"); - if (ncclCollNet_v6 == nullptr) { - ncclCollNet_v5 = (ncclCollNet_v5_t*)dlsym(netPluginLib, "ncclCollNetPlugin_v5"); - if (ncclCollNet_v5 == nullptr) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin symbol (>= v5). 
ncclCollNetPlugin symbols v4 and lower are not supported."); - } else { - ncclCollNets[0] = &ncclCollNet_v5_as_v9; - ncclCollNet_v5_as_v9.init = ncclCollNet_v5_as_v9_init; - ncclCollNet_v5_as_v9.name = ncclCollNet_v5->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v5)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v6_as_v9; - ncclCollNet_v6_as_v9.init = ncclCollNet_v6_as_v9_init; - ncclCollNet_v6_as_v9.name = ncclCollNet_v6->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v7_as_v9; - ncclCollNet_v7_as_v9.init = ncclCollNet_v7_as_v9_init; - ncclCollNet_v7_as_v9.name = ncclCollNet_v7->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNets[0]->name); - } - } else { - ncclCollNets[0] = &ncclCollNet_v8_as_v9; - ncclCollNet_v8_as_v9.init = ncclCollNet_v8_as_v9_init; - ncclCollNet_v8_as_v9.name = ncclCollNet_v8->name; - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNets[0]->name); - } - } else { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNets[0]->name); - } - - ++netPluginRefCount; - netPluginStatus = netPluginLoadSuccess; - comm->netPluginLoaded = 1; - -exit: - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -fail: - if (netPluginLib) dlclose(netPluginLib); - netPluginStatus = netPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&netPluginLock); - if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { - if (ncclNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); - } - if (ncclCollNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); - } - dlclose(netPluginLib); - netPluginLib = nullptr; - ncclNets[0] = nullptr; - ncclCollNets[0] = nullptr; - netPluginStatus = netPluginLoadReady; - comm->netPluginLoaded = 0; - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) - ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; - } - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; -} - -ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { - ncclNetProperties_t props; - - NCCLCHECK(net->getProperties(dev, &props)); - ncclNetDeviceType type = props.netDeviceType; - if (type) switch (type) { - case NCCL_NET_DEVICE_UNPACK: - if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { - INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", - props.netDeviceVersion); - return ncclSuccess; - } else { - WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", - props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); - return ncclInternalError; - } - default: - WARN("Unknown device code index %d \n", type); - return ncclInternalError; - } - - return ncclSuccess; -} - -static ncclResult_t netGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclNets[i]->init(ncclDebugLog) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; - else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; - else ncclNetStates[i] = ncclNetStateEnabled; - } - *state = ncclNetStates[i]; - pthread_mutex_unlock(&netLock); - return ncclSuccess; -} - -static ncclResult_t 
collNetGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclCollNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; - else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; - else ncclCollNetStates[i] = ncclNetStateEnabled; - } - *state = ncclCollNetStates[i]; - pthread_mutex_unlock(&netLock); - return ncclSuccess; -} - -ncclResult_t ncclNetInit(struct ncclComm* comm) { - // Initialize main communication network - const char* netName; - bool ok = false; - - netName = comm->config.netName; - for (int i=0; i<3; i++) { - if (ncclNets[i] == nullptr) continue; - enum ncclNetState state; - NCCLCHECK(netGetState(i, &state)); - if (state != ncclNetStateEnabled) continue; - if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; - if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { - // Mismatched device plugin version - continue; - } - - comm->ncclNet = ncclNets[i]; - ok = true; - - if (ncclCollNets[i]) { - NCCLCHECK(collNetGetState(i, &state)); - if (state == ncclNetStateEnabled) { - comm->ncclCollNet = ncclCollNets[i]; - } - } - break; - } - - if (!ok) { - WARN("Error: network %s not found.", netName ? netName : ""); - return ncclInvalidUsage; - } - return ncclSuccess; -} - -ncclResult_t ncclNetFinalize(struct ncclComm* comm) { - comm->ncclNet = nullptr; - comm->ncclCollNet = nullptr; - return ncclSuccess; -} - -ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { - constexpr int GPU_BUF_SIZE = 2*1024*1024; -#if CUDART_VERSION >= 11030 - // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute - int driverVersion; - CUDACHECK(cudaDriverGetVersion(&driverVersion)); - if (driverVersion >= 11030) { - int cudaDev, attr = 0; - CUDACHECK(cudaGetDevice(&cudaDev)); - CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev)); - *gdrSupport = attr; - return ncclSuccess; - } -#endif - static int gdrSupportMatrix[32] = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; - if (gdrSupportMatrix[comm->cudaDev] == -1) { - int netDevs; - NCCLCHECK(comm->ncclNet->devices(&netDevs)); - gdrSupportMatrix[comm->cudaDev] = 0; - for (int dev=0; devncclNet->getProperties(dev, &props)); - if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; - - // Allocate memory on the GPU and try to register it on the NIC. 
- void *lComm = NULL, *sComm = NULL, *rComm = NULL; - ncclNetHandle_t handle; - char* gpuPtr = NULL; - void* mHandle = NULL; - ncclResult_t ret; - ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); - - bool connected; - connected = false; - while (!connected) { - - // If we're aborting now, skip to cleanup - if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) { - goto cleanup2; - } - - if (sComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, &handle, &sComm, NULL), ret, cleanup2); - - if (rComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); - - connected = (rComm != NULL) && (sComm != NULL); - } - - NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2); - if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { - NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle)); - NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); - NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle)); - gdrSupportMatrix[comm->cudaDev] = 1; - } - ncclDebugNoWarn = 0; - NCCLCHECK(ncclCudaFree(gpuPtr)); -cleanup2: - if (rComm != NULL) - NCCLCHECK(comm->ncclNet->closeRecv(rComm)); - if (sComm != NULL) - NCCLCHECK(comm->ncclNet->closeSend(sComm)); - NCCLCHECK(comm->ncclNet->closeListen(lComm)); -cleanup1: - break; - } - } - *gdrSupport = gdrSupportMatrix[comm->cudaDev]; - return ncclSuccess; -} - -int ncclNetVersion(struct ncclComm* comm) { - return - (comm->ncclNet == &ncclNet_v5_as_v9) ? 5 : - (comm->ncclNet == &ncclNet_v6_as_v9) ? 6 : - (comm->ncclNet == &ncclNet_v7_as_v9) ? 7 : - (comm->ncclNet == &ncclNet_v8_as_v9) ? 8 : - 9; -} diff --git a/src/plugin/net.cc b/src/plugin/net.cc new file mode 100644 index 000000000..9257d7786 --- /dev/null +++ b/src/plugin/net.cc @@ -0,0 +1,319 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "net.h" +#include "bootstrap.h" +#include "checks.h" +#include "plugin.h" + +#include +#include +//#include +//#include +//#include + +extern ncclNet_t* getNcclNet_v6(void* netPluginLib); +extern ncclNet_t* getNcclNet_v7(void* netPluginLib); +extern ncclNet_t* getNcclNet_v8(void* netPluginLib); +extern ncclNet_t* getNcclNet_v9(void* netPluginLib); +extern ncclNet_t* getNcclNet_v10(void* netPluginLib); + +extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib); +extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib); + +static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; +ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; +static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 }; +ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; +enum ncclNetState { + ncclNetStateInit = 0, + ncclNetStateEnabled = 1, + ncclNetStateDisabled = 2 +}; +enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; + +NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1); +static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; +static void* netPluginLib; + +static int netPluginRefCount; +static void initNetPluginRefCountOnce(void) { netPluginRefCount = ncclParamNetPluginRefCount();} + +enum { + netPluginLoadFailed = -1, + netPluginLoadReady = 0, + netPluginLoadSuccess = 1, +}; + +static int netPluginStatus = netPluginLoadReady; + +ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { + static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT; + pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce); + + pthread_mutex_lock(&netPluginLock); + if (netPluginLoadFailed == netPluginStatus) { + goto exit; + } + if (netPluginLoadSuccess == netPluginStatus) { + ++netPluginRefCount; + goto exit; + } + + netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN")); + if (netPluginLib == nullptr) { + goto fail; + } + + ncclNets[0] = getNcclNet_v10(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 10; + if (ncclNets[0] == nullptr) { + // Try v9 plugin + ncclNets[0] = getNcclNet_v9(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 9; + } + if (ncclNets[0] == nullptr) { + // Try v8 plugin + ncclNets[0] = getNcclNet_v8(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 8; + } + if (ncclNets[0] == nullptr) { + // Try v7 plugin + ncclNets[0] = getNcclNet_v7(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 7; + } + if (ncclNets[0] == nullptr) { + // Try v6 plugin + ncclNets[0] = getNcclNet_v6(netPluginLib); + if (ncclNets[0]) ncclNetsVer[0] = 6; + } + if (ncclNets[0] == nullptr) { + goto fail; + } + + // Check for CollNet + ncclCollNets[0] = getNcclCollNet_v10(netPluginLib); + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v9(netPluginLib); + } + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v8(netPluginLib); + } + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = getNcclCollNet_v7(netPluginLib); + } + if (ncclCollNets[0] == nullptr) { + ncclCollNets[0] = 
getNcclCollNet_v6(netPluginLib); + } + + ++netPluginRefCount; + netPluginStatus = netPluginLoadSuccess; + comm->netPluginLoaded = 1; + +exit: + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +fail: + if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginStatus = netPluginLoadFailed; + goto exit; +} + +ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { + pthread_mutex_lock(&netPluginLock); + if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { + if (ncclNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); + } + if (ncclCollNets[0]) { + INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); + } + NCCLCHECK(ncclClosePluginLib(netPluginLib)); + netPluginLib = nullptr; + ncclNets[0] = nullptr; + ncclCollNets[0] = nullptr; + netPluginStatus = netPluginLoadReady; + comm->netPluginLoaded = 0; + for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) + ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; + } + pthread_mutex_unlock(&netPluginLock); + return ncclSuccess; +} + +ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { + ncclNetProperties_t props; + + NCCLCHECK(net->getProperties(dev, &props)); + ncclNetDeviceType type = props.netDeviceType; + if (type) switch (type) { + case NCCL_NET_DEVICE_UNPACK: + if (props.netDeviceVersion == NCCL_NET_DEVICE_UNPACK_VERSION) { + INFO(NCCL_INIT, "Using NCCL_NET_DEVICE_UNPACK net plugin version %d", + props.netDeviceVersion); + return ncclSuccess; + } else { + WARN("NCCL_DEVICE_UNPACK plugin has incompatible version %d, this NCCL build is compatible with %d, not using it", + props.netDeviceVersion, NCCL_NET_DEVICE_UNPACK_VERSION); + return ncclInternalError; + } + default: + WARN("Unknown device code index %d \n", type); + return ncclInternalError; + } + + return ncclSuccess; +} + +static ncclResult_t netGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; + else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; + else ncclNetStates[i] = ncclNetStateEnabled; + } + *state = ncclNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { + pthread_mutex_lock(&netLock); + if (ncclCollNetStates[i] == ncclNetStateInit) { + int ndev; + if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; + else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; + else ncclCollNetStates[i] = ncclNetStateEnabled; + } + *state = ncclCollNetStates[i]; + pthread_mutex_unlock(&netLock); + return ncclSuccess; +} + +ncclResult_t ncclNetInit(struct ncclComm* comm) { + // Initialize main communication network + const char* netName; + bool ok = false; + + netName = comm->config.netName; + for (int i=0; i<3; i++) { + if (ncclNets[i] == nullptr) continue; + enum ncclNetState state; + NCCLCHECK(netGetState(i, &state)); + if (state != ncclNetStateEnabled) continue; + if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; + if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { + // Mismatched device plugin version + continue; + } + + comm->ncclNet = ncclNets[i]; + comm->ncclNetVer = 
ncclNetsVer[i];
+    ok = true;
+
+    if (ncclCollNets[i]) {
+      NCCLCHECK(collNetGetState(i, &state));
+      if (state == ncclNetStateEnabled) {
+        comm->ncclCollNet = ncclCollNets[i];
+      }
+    }
+    break;
+  }
+
+  if (!ok) {
+    WARN("Error: network %s not found.", netName ? netName : "");
+    return ncclInvalidUsage;
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
+  comm->ncclNet = nullptr;
+  comm->ncclCollNet = nullptr;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
+  constexpr int GPU_BUF_SIZE = 2*1024*1024;
+#if CUDART_VERSION >= 11030
+  // In CUDA 11.3 and later we can now query the cudaDevAttrGPUDirectRDMASupported attribute
+  int driverVersion;
+  CUDACHECK(cudaDriverGetVersion(&driverVersion));
+  if (driverVersion >= 11030) {
+    int cudaDev, attr = 0;
+    CUDACHECK(cudaGetDevice(&cudaDev));
+    CUDACHECK(cudaDeviceGetAttribute(&attr, cudaDevAttrGPUDirectRDMASupported, cudaDev));
+    *gdrSupport = attr;
+    return ncclSuccess;
+  }
+#endif
+  static int gdrSupportMatrix[32] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+  if (gdrSupportMatrix[comm->cudaDev] == -1) {
+    int netDevs;
+    NCCLCHECK(comm->ncclNet->devices(&netDevs));
+    gdrSupportMatrix[comm->cudaDev] = 0;
+    for (int dev=0; dev<netDevs; dev++) {
+      ncclNetProperties_t props;
+      NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
+      if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
+
+      // Allocate memory on the GPU and try to register it on the NIC.
+      void *lComm = NULL, *sComm = NULL, *rComm = NULL;
+      ncclNetHandle_t handle;
+      char* gpuPtr = NULL;
+      void* mHandle = NULL;
+      ncclResult_t ret;
+      ncclDebugNoWarn = NCCL_NET;
+      NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1);
+
+      bool connected;
+      connected = false;
+      while (!connected) {
+
+        // If we're aborting now, skip to cleanup
+        if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE)) {
+          goto cleanup2;
+        }
+
+        if (sComm == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2);
+
+        if (rComm == NULL)
+          NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2);
+
+        connected = (rComm != NULL) && (sComm != NULL);
+      }
+
+      NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
+      if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
+        NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle));
+        NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
+        NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle));
+        gdrSupportMatrix[comm->cudaDev] = 1;
+      }
+      ncclDebugNoWarn = 0;
+      NCCLCHECK(ncclCudaFree(gpuPtr));
+cleanup2:
+      if (rComm != NULL)
+        NCCLCHECK(comm->ncclNet->closeRecv(rComm));
+      if (sComm != NULL)
+        NCCLCHECK(comm->ncclNet->closeSend(sComm));
+      NCCLCHECK(comm->ncclNet->closeListen(lComm));
+cleanup1:
+      break;
+    }
+  }
+  *gdrSupport = gdrSupportMatrix[comm->cudaDev];
+  return ncclSuccess;
+}
diff --git a/src/plugin/net/net_v10.cc b/src/plugin/net/net_v10.cc
new file mode 100644
index 000000000..682f239f7
--- /dev/null
+++ b/src/plugin/net/net_v10.cc
@@ -0,0 +1,32 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" + +static ncclNet_v10_t* ncclNet_v10; +static ncclCollNet_v10_t* ncclCollNet_v10; + +ncclNet_t* getNcclNet_v10(void* lib) { + ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10"); + if (ncclNet_v10) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name); + return ncclNet_v10; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol."); + return nullptr; +} + +ncclCollNet_t* getNcclCollNet_v10(void* lib) { + ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10"); + if (ncclCollNet_v10) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclNet_v10->name); + return ncclCollNet_v10; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol."); + return nullptr; +} diff --git a/src/plugin/net/net_v6.cc b/src/plugin/net/net_v6.cc new file mode 100644 index 000000000..baff67935 --- /dev/null +++ b/src/plugin/net/net_v6.cc @@ -0,0 +1,178 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" +#include "checks.h" + +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; +static ncclNet_v6_t* ncclNet_v6; +static ncclCollNet_v6_t* ncclCollNet_v6; + +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v6_t p6; + ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); + if (ans != ncclSuccess) return ans; + props->name = p6.name; + props->pciPath = p6.pciPath; + props->guid = p6.guid; + props->ptrSupport = p6.ptrSupport; + props->regIsGlobal = 0; + props->forceFlush = 0; + props->speed = p6.speed; + props->port = p6.port; + props->maxComms = p6.maxComms; + props->maxRecvs = p6.maxRecvs; + props->latency = p6.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { + if (size >= 1UL<<31) return ncclInternalError; + return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); +} + +static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { + return ncclNet_v6->connect(dev, handle, sendComm); +} + +static ncclResult_t ncclNet_accept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** /*recvDevComm*/) { + return ncclNet_v6->accept(listenComm, recvComm); +} + +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { + int sizeInt; + if (size > MAX_NET_SIZE) return ncclInternalError; + sizeInt = (int)size; + ncclResult_t ans = ncclNet_v6->isend(sendComm, data, sizeInt, tag, mhandle, request); + return ans; +} + +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** 
request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v6->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v6_t p6;
+  ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6);
+  if (ans != ncclSuccess) return ans;
+  props->name = p6.name;
+  props->pciPath = p6.pciPath;
+  props->guid = p6.guid;
+  props->ptrSupport = p6.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p6.speed;
+  props->port = p6.port;
+  props->maxComms = p6.maxComms;
+  props->maxRecvs = p6.maxRecvs;
+  props->latency = p6.latency;
+  props->netDeviceType = NCCL_NET_DEVICE_HOST;
+  props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count,
+    ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
+  int countInt;
+  if (count > MAX_NET_SIZE) return ncclInternalError;
+  countInt = (int)count;
+  ncclResult_t ans = ncclCollNet_v6->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp,
+    sendMhandle, recvMhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) {
+  NCCLCHECK(ncclNet_v6->init(logfn));
+  ncclNet.devices = ncclNet_v6->devices;
+  ncclNet.getProperties = ncclNet_getProperties;
+  ncclNet.listen = ncclNet_v6->listen;
+  ncclNet.connect = ncclNet_connect;
+  ncclNet.accept = ncclNet_accept;
+  ncclNet.regMr = ncclNet_regMr;
+  ncclNet.regMrDmaBuf = ncclNet_v6->regMrDmaBuf;
+  ncclNet.deregMr = ncclNet_v6->deregMr;
+  ncclNet.isend = ncclNet_isend;
+  ncclNet.irecv = ncclNet_irecv;
+  ncclNet.iflush = ncclNet_v6->iflush;
+  ncclNet.test = ncclNet_v6->test;
+  ncclNet.closeSend = ncclNet_v6->closeSend;
+  ncclNet.closeRecv = ncclNet_v6->closeRecv;
+  ncclNet.closeListen = ncclNet_v6->closeListen;
+  ncclNet.getDeviceMr = NULL;
+  ncclNet.irecvConsumed = NULL;
+  ncclNet.makeVDevice = NULL;
+  return ncclSuccess;
+}
+
+ncclNet_t* getNcclNet_v6(void* lib) {
+  ncclNet_v6 = (ncclNet_v6_t*)dlsym(lib, "ncclNetPlugin_v6");
+  if (ncclNet_v6) {
+    ncclNet.name = ncclNet_v6->name;
+    ncclNet.init = ncclNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNet_v6->name);
+    return &ncclNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol.");
+  return nullptr;
+}
+
+static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) {
+  NCCLCHECK(ncclCollNet_v6->init(logfn));
+  ncclCollNet.devices = ncclCollNet_v6->devices;
+  ncclCollNet.getProperties = ncclCollNet_getProperties;
+  ncclCollNet.listen = ncclCollNet_v6->listen;
+  ncclCollNet.connect = ncclCollNet_v6->connect;
+  ncclCollNet.reduceSupport = ncclCollNet_v6->reduceSupport;
+  ncclCollNet.regMr = ncclCollNet_regMr;
+  ncclCollNet.regMrDmaBuf = ncclCollNet_v6->regMrDmaBuf;
+  ncclCollNet.deregMr = ncclCollNet_v6->deregMr;
+  ncclCollNet.iallreduce = ncclCollNet_iallreduce;
+  ncclCollNet.iallgather = nullptr;
+  ncclCollNet.ireducescatter = nullptr;
+  ncclCollNet.iflush = ncclCollNet_v6->iflush;
+  ncclCollNet.test = ncclCollNet_v6->test;
+  ncclCollNet.closeColl = ncclCollNet_v6->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v6->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v6(void* lib) {
+  ncclCollNet_v6 = (ncclCollNet_v6_t*)dlsym(lib, "ncclCollNetPlugin_v6");
+  if (ncclCollNet_v6) {
+    ncclCollNet.name = ncclCollNet_v6->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNet_v6->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc
new file mode 100644
index 000000000..4bad5ec26
--- /dev/null
+++ b/src/plugin/net/net_v7.cc
@@ -0,0 +1,174 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v7_t* ncclNet_v7;
+static ncclCollNet_v7_t* ncclCollNet_v7;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v7_t p7;
+  ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7);
+  if (ans != ncclSuccess) return ans;
+  props->name = p7.name;
+  props->pciPath = p7.pciPath;
+  props->guid = p7.guid;
+  props->ptrSupport = p7.ptrSupport;
+  props->regIsGlobal = 0;
+  props->forceFlush = 0;
+  props->speed = p7.speed;
+  props->port = p7.port;
+  props->maxComms = p7.maxComms;
+  props->maxRecvs = p7.maxRecvs;
+  props->latency = p7.latency;
+  props->netDeviceType = p7.netDeviceType;
+  props->netDeviceVersion = p7.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v7->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) {
+  if (size >= 1UL<<31) return ncclInternalError;
+  return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v7->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+
sizesInt[i] = (int) sizes[i]; + } + ncclResult_t ans = ncclNet_v7->irecv(recvComm, n, data, sizesInt, tags, mhandles, request); + return ans; +} + +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v7_t p7; + ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); + if (ans != ncclSuccess) return ans; + props->name = p7.name; + props->pciPath = p7.pciPath; + props->guid = p7.guid; + props->ptrSupport = p7.ptrSupport; + props->regIsGlobal = 0; + props->forceFlush = 0; + props->speed = p7.speed; + props->port = p7.port; + props->maxComms = p7.maxComms; + props->maxRecvs = p7.maxRecvs; + props->latency = p7.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { + if (size >= 1UL<<31) return ncclInternalError; + return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); +} + +static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v7->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v7->init(logfn)); + ncclNet.devices = ncclNet_v7->devices; + ncclNet.getProperties = ncclNet_getProperties; // ncclNet_v5->getProperties; + ncclNet.listen = ncclNet_v7->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v7->accept; + ncclNet.regMr = ncclNet_regMr; + ncclNet.regMrDmaBuf = ncclNet_v7->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v7->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v7->iflush; + ncclNet.test = ncclNet_v7->test; + ncclNet.closeSend = ncclNet_v7->closeSend; + ncclNet.closeRecv = ncclNet_v7->closeRecv; + ncclNet.closeListen = ncclNet_v7->closeListen; + ncclNet.getDeviceMr = ncclNet_v7->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v7->irecvConsumed; + ncclNet.makeVDevice = NULL; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v7(void* lib) { + ncclNet_v7 = (ncclNet_v7_t*)dlsym(lib, "ncclNetPlugin_v7"); + if (ncclNet_v7) { + ncclNet.name = ncclNet_v7->name; + ncclNet.init = ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNet_v7->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v7->init(logfn)); + ncclCollNet.devices = ncclCollNet_v7->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v7->listen; + ncclCollNet.connect = ncclCollNet_v7->connect; + ncclCollNet.reduceSupport = ncclCollNet_v7->reduceSupport; + ncclCollNet.regMr = ncclCollNet_regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v7->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v7->deregMr; + ncclCollNet.iallreduce = ncclCollNet_iallreduce; + 
ncclCollNet.iallgather = nullptr;
+  ncclCollNet.ireducescatter = nullptr;
+  ncclCollNet.iflush = ncclCollNet_v7->iflush;
+  ncclCollNet.test = ncclCollNet_v7->test;
+  ncclCollNet.closeColl = ncclCollNet_v7->closeColl;
+  ncclCollNet.closeListen = ncclCollNet_v7->closeListen;
+  return ncclSuccess;
+}
+
+ncclCollNet_t* getNcclCollNet_v7(void* lib) {
+  ncclCollNet_v7 = (ncclCollNet_v7_t*)dlsym(lib, "ncclCollNetPlugin_v7");
+  if (ncclCollNet_v7) {
+    ncclCollNet.name = ncclCollNet_v7->name;
+    ncclCollNet.init = ncclCollNet_init;
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNet_v7->name);
+    return &ncclCollNet;
+  }
+  INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol.");
+  return nullptr;
+}
diff --git a/src/plugin/net/net_v8.cc b/src/plugin/net/net_v8.cc
new file mode 100644
index 000000000..b43bb895e
--- /dev/null
+++ b/src/plugin/net/net_v8.cc
@@ -0,0 +1,196 @@
+/*************************************************************************
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "nccl_net.h"
+#include "net_device.h"
+#include "proxy.h"
+#include "checks.h"
+
+static ncclNet_t ncclNet;
+static ncclCollNet_t ncclCollNet;
+static ncclNet_v8_t* ncclNet_v8;
+static ncclCollNet_v8_t* ncclCollNet_v8;
+
+static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+  props->pciPath = p8.pciPath;
+  props->guid = p8.guid;
+  props->ptrSupport = p8.ptrSupport;
+  props->regIsGlobal = p8.regIsGlobal;
+  props->forceFlush = 0;
+  props->speed = p8.speed;
+  props->port = p8.port;
+  props->maxComms = p8.maxComms;
+  props->maxRecvs = p8.maxRecvs;
+  props->latency = p8.latency;
+  props->netDeviceType = p8.netDeviceType;
+  props->netDeviceVersion = p8.netDeviceVersion;
+  props->vProps.ndevs = 1;
+  props->vProps.devs[0] = dev;
+  props->maxP2pBytes = MAX_NET_SIZE;
+  props->maxCollBytes = MAX_COLLNET_SIZE;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) {
+  return ncclNet_v8->connect(dev, handle, sendComm, sendDevComm);
+}
+
+static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) {
+  int sizeInt;
+  if (size > MAX_NET_SIZE) return ncclInternalError;
+  sizeInt = (int)size;
+  ncclResult_t ans = ncclNet_v8->isend(sendComm, data, sizeInt, tag, mhandle, request);
+  return ans;
+}
+
+static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) {
+  int sizesInt[NCCL_PROXY_MAX_SUBS];
+  //reset to nullptr if optional receive completion is set
+  if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr;
+  for (int i=0; i<n; i++) {
+    if (sizes[i] > MAX_NET_SIZE) return ncclInternalError;
+    sizesInt[i] = (int) sizes[i];
+  }
+  ncclResult_t ans = ncclNet_v8->irecv(recvComm, n, data, sizesInt, tags, mhandles, request);
+  return ans;
+}
+
+static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) {
+  ncclNetProperties_v8_t p8;
+  ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8);
+  if (ans != ncclSuccess) return ans;
+  props->name = p8.name;
+
props->pciPath = p8.pciPath; + props->guid = p8.guid; + props->ptrSupport = p8.ptrSupport; + props->regIsGlobal = p8.regIsGlobal; + props->forceFlush = 0; + props->speed = p8.speed; + props->port = p8.port; + props->maxComms = p8.maxComms; + props->maxRecvs = p8.maxRecvs; + props->latency = p8.latency; + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + props->maxP2pBytes = MAX_NET_SIZE; + props->maxCollBytes = MAX_COLLNET_SIZE; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + int countInt; + if (count > MAX_NET_SIZE) return ncclInternalError; + countInt = (int)count; + ncclResult_t ans = ncclCollNet_v8->iallreduce(collComm, sendData, recvData, countInt, dataType, redOp, + sendMhandle, recvMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_iallgather (void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + ncclNetSGE_v8_t recvPartsInt; + if (nRecvParts > 1) return ncclInternalError; + if (recvParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + recvPartsInt.mhandle = recvParts->mhandle; + recvPartsInt.address = recvParts->address; + recvPartsInt.size = (int)recvParts->size; + ncclResult_t ans = ncclCollNet_v8->iallgather(collComm, sendData, nRecvParts, &recvPartsInt, + bytesPerRank, windowOffset, windowBytes, + sendMhandle, request); + return ans; +} + +static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + ncclNetSGE_v8_t sendPartsInt; + if (nSendParts > 1) return ncclInternalError; + if (sendParts->size > MAX_COLLNET_SIZE) return ncclInternalError; + sendPartsInt.mhandle = sendParts->mhandle; + sendPartsInt.address = sendParts->address; + sendPartsInt.size = (int)sendParts->size; + ncclResult_t ans = ncclCollNet_v8->ireducescatter(collComm, nSendParts, &sendPartsInt, + recvData, bytesPerRank, windowOffset, windowBytes, + dataType, redOp, + recvMhandle, request); + return ans; +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v8->init(logfn)); + ncclNet.devices = ncclNet_v8->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_v8->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v8->accept; + ncclNet.regMr = ncclNet_v8->regMr; + ncclNet.regMrDmaBuf = ncclNet_v8->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v8->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v8->iflush; + ncclNet.test = ncclNet_v8->test; + ncclNet.closeSend = ncclNet_v8->closeSend; + ncclNet.closeRecv = ncclNet_v8->closeRecv; + ncclNet.closeListen = ncclNet_v8->closeListen; + ncclNet.getDeviceMr = ncclNet_v8->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v8->irecvConsumed; + ncclNet.makeVDevice = NULL; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v8(void* lib) { + ncclNet_v8 = (ncclNet_v8_t*)dlsym(lib, "ncclNetPlugin_v8"); + if (ncclNet_v8) { + ncclNet.name = ncclNet_v8->name; + ncclNet.init 
= ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNet_v8->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v8->init(logfn)); + ncclCollNet.devices = ncclCollNet_v8->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v8->listen; + ncclCollNet.connect = ncclCollNet_v8->connect; + ncclCollNet.reduceSupport = ncclCollNet_v8->reduceSupport; + ncclCollNet.regMr = ncclCollNet_v8->regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v8->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v8->deregMr; + ncclCollNet.iallreduce = ncclCollNet_iallreduce; + ncclCollNet.iallgather = ncclCollNet_iallgather; + ncclCollNet.ireducescatter = ncclCollNet_ireducescatter; + ncclCollNet.iflush = ncclCollNet_v8->iflush; + ncclCollNet.test = ncclCollNet_v8->test; + ncclCollNet.closeColl = ncclCollNet_v8->closeColl; + ncclCollNet.closeListen = ncclCollNet_v8->closeListen; + return ncclSuccess; +} + +ncclCollNet_t* getNcclCollNet_v8(void* lib) { + ncclCollNet_v8 = (ncclCollNet_v8_t*)dlsym(lib, "ncclCollNetPlugin_v8"); + if (ncclCollNet_v8) { + ncclCollNet.name = ncclCollNet_v8->name; + ncclCollNet.init = ncclCollNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNet_v8->name); + return &ncclCollNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol."); + return nullptr; +} diff --git a/src/plugin/net/net_v9.cc b/src/plugin/net/net_v9.cc new file mode 100644 index 000000000..34e039332 --- /dev/null +++ b/src/plugin/net/net_v9.cc @@ -0,0 +1,121 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" +#include "checks.h" + +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; +static ncclNet_v9_t* ncclNet_v9; +static ncclCollNet_v9_t* ncclCollNet_v9; + +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + return ncclNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); +} + +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { + return ncclNet_v9->isend(sendComm, data, size, tag, mhandle, request); +} + +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { + return ncclNet_v9->irecv(recvComm, n, data, sizes, tags, mhandles, request); +} + +static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { + return ncclNet_v9->connect(dev, handle, sendComm, sendDevComm); +} + +static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { + return ncclNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t*)props); +} + +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + return ncclCollNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); +} + +static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + return ncclCollNet_v9->iallgather(collComm, sendData, nRecvParts, (ncclNetSGE_v9_t*)recvParts, bytesPerRank, + windowOffset, windowBytes, sendMhandle, request); +} + +static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + return ncclCollNet_v9->ireducescatter(collComm, nSendParts, (ncclNetSGE_v9_t*)sendParts, recvData, bytesPerRank, + windowOffset, windowBytes, dataType, redOp, recvMhandle, request); +} + +static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + NCCLCHECK(ncclNet_v9->init(logfn)); + ncclNet.devices = ncclNet_v9->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_v9->listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v9->accept; + ncclNet.regMr = ncclNet_v9->regMr; + ncclNet.regMrDmaBuf = ncclNet_v9->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v9->deregMr; + ncclNet.isend = ncclNet_isend; + ncclNet.irecv = ncclNet_irecv; + ncclNet.iflush = ncclNet_v9->iflush; + ncclNet.test = ncclNet_v9->test; + ncclNet.closeSend = ncclNet_v9->closeSend; + ncclNet.closeRecv = ncclNet_v9->closeRecv; + ncclNet.closeListen = ncclNet_v9->closeListen; + ncclNet.getDeviceMr = ncclNet_v9->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v9->irecvConsumed; + ncclNet.makeVDevice = (ncclNet_v9->makeVDevice) ? 
ncclNet_makeVDevice : nullptr; + return ncclSuccess; +} + +ncclNet_t* getNcclNet_v9(void* lib) { + ncclNet_v9 = (ncclNet_v9_t*)dlsym(lib, "ncclNetPlugin_v9"); + if (ncclNet_v9) { + ncclNet.name = ncclNet_v9->name; + ncclNet.init = ncclNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNet_v9->name); + return &ncclNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); + return nullptr; +} + +static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { + NCCLCHECK(ncclCollNet_v9->init(logfn)); + ncclCollNet.devices = ncclCollNet_v9->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_v9->listen; + ncclCollNet.connect = ncclCollNet_v9->connect; + ncclCollNet.reduceSupport = ncclCollNet_v9->reduceSupport; + ncclCollNet.regMr = ncclCollNet_v9->regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v9->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v9->deregMr; + ncclCollNet.iallreduce = ncclCollNet_v9->iallreduce; + ncclCollNet.iallgather = ncclCollNet_iallgather; + ncclCollNet.ireducescatter = ncclCollNet_ireducescatter; + ncclCollNet.iflush = ncclCollNet_v9->iflush; + ncclCollNet.test = ncclCollNet_v9->test; + ncclCollNet.closeColl = ncclCollNet_v9->closeColl; + ncclCollNet.closeListen = ncclCollNet_v9->closeListen; + return ncclSuccess; +} + +ncclCollNet_t* getNcclCollNet_v9(void* lib) { + ncclCollNet_v9 = (ncclCollNet_v9_t*)dlsym(lib, "ncclCollNetPlugin_v9"); + if (ncclCollNet_v9) { + ncclCollNet.name = ncclCollNet_v9->name; + ncclCollNet.init = ncclCollNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNet_v9->name); + return &ncclCollNet; + } + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); + return nullptr; +} diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc new file mode 100644 index 000000000..a43df28d3 --- /dev/null +++ b/src/plugin/plugin_open.cc @@ -0,0 +1,134 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include +#include + +#include "debug.h" + +#define MAX_STR_LEN 255 + +enum ncclPluginType { + ncclPluginTypeNet, + ncclPluginTypeTuner, + ncclPluginTypeProfiler, +}; + +#define NUM_LIBS 3 +static void *libHandles[NUM_LIBS]; +static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" }; +static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" }; +static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" }; +static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT }; + +static void* tryOpenLib(char* name, int* err, char* errStr) { + *err = 0; + if (nullptr == name || strlen(name) == 0) { + return nullptr; + } + + if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { + name = nullptr; + } + + void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); + if (nullptr == handle) { + strncpy(errStr, dlerror(), MAX_STR_LEN); + errStr[MAX_STR_LEN] = '\0'; + // "handle" and "name" won't be NULL at the same time. 
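For reference, the lookup performed by openPluginLib() below reduces to a short list of candidate shared-object names derived from the corresponding environment variable (NCCL_NET_PLUGIN, NCCL_TUNER_PLUGIN or NCCL_PROFILER_PLUGIN). The stand-alone sketch below is illustrative only and is not part of this patch; the helper name printNetPluginCandidates and the printf output are made up for the example.

#include <stdio.h>

/* Print the shared-object names tried for the NET plugin, in order,
 * mirroring the candidate list built by openPluginLib() (sketch only). */
static void printNetPluginCandidates(const char* env) {
  char name[256];
  if (env && env[0]) {
    printf("1) %s\n", env);                                  /* exact name from the env var */
    snprintf(name, sizeof(name), "libnccl-net-%s.so", env);
    printf("2) %s\n", name);                                 /* prefixed form */
  } else {
    printf("1) libnccl-net.so\n");                           /* default plugin name */
  }
}

int main(void) {
  printNetPluginCandidates("mynet");   /* tries "mynet", then "libnccl-net-mynet.so" */
  return 0;
}

So with NCCL_NET_PLUGIN=mynet the loader first dlopen()s "mynet" verbatim and, failing that, "libnccl-net-mynet.so"; with the variable unset it falls back to "libnccl-net.so".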
+ // coverity[var_deref_model] + if (strstr(errStr, name) && strstr(errStr, "No such file or directory")) { + *err = ENOENT; + } + } + return handle; +} + +static void appendNameToList(char* nameList, int *nameListLen, char* name) { + snprintf(nameList, *nameListLen, " %s", name); + nameList += strlen(name) + 1; + *nameListLen -= strlen(name) + 1; +} + +static void* openPluginLib(enum ncclPluginType type, const char* libName) { + int openErr, len = PATH_MAX; + char libName_[MAX_STR_LEN] = { 0 }; + char openErrStr[MAX_STR_LEN + 1] = { 0 }; + char eNoEntNameList[PATH_MAX] = { 0 }; + + if (libName && strlen(libName)) { + snprintf(libName_, MAX_STR_LEN, "%s", libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } + + snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } + } else { + snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } + } + + if (strlen(eNoEntNameList)) { + INFO(subsys[type], "%s/Plugin: Could not find:%s. 
%s", pluginNames[type], eNoEntNameList, pluginFallback[type]); + } else if (strlen(pluginFallback[type])) { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], pluginFallback[type]); + } + return nullptr; +} + +void* ncclOpenNetPluginLib(const char* name) { + return openPluginLib(ncclPluginTypeNet, name); +} + +void* ncclOpenTunerPluginLib(const char* name) { + return openPluginLib(ncclPluginTypeTuner, name); +} + +void* ncclOpenProfilerPluginLib(const char* name) { + return openPluginLib(ncclPluginTypeProfiler, name); +} + +void* ncclGetNetPluginLib(void) { + return libHandles[ncclPluginTypeNet]; +} + +ncclResult_t ncclClosePluginLib(void* handle) { + for (int l=0; ltype; - eDescr_v1.parentObj = eDescr->parentObj; - eDescr_v1.rank = eDescr->rank; - switch(eDescr->type) { - case ncclProfileGroup: break; - case ncclProfileColl: { - eDescr_v1.coll.name = eDescr->coll.name; - eDescr_v1.coll.commHash = eDescr->coll.commHash; - eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; - eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); - eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; - eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff; - eDescr_v1.coll.count = eDescr->coll.count; - eDescr_v1.coll.root = eDescr->coll.root; - eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); - eDescr_v1.coll.op = 0; // removed in v2 - eDescr_v1.coll.trafficBytes = eDescr->coll.trafficBytes; - eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels; - eDescr_v1.coll.nWarps = eDescr->coll.nWarps; - eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); - eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); - } break; - case ncclProfileP2p: { - eDescr_v1.p2p.name = eDescr->p2p.name; - eDescr_v1.p2p.commHash = eDescr->p2p.commHash; - eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); - eDescr_v1.p2p.buff = eDescr->p2p.buff; - eDescr_v1.p2p.count = eDescr->p2p.count; - eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype); - eDescr_v1.p2p.peer = eDescr->p2p.peer; - } break; - case ncclProfileProxyOp: { - eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid; - eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId; - eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer; - eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps; - eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; - eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend; - } break; - case ncclProfileProxyStep: { - eDescr_v1.proxyStep.step = eDescr->proxyStep.step; - } break; - case ncclProfileProxyCtrl: break; - default:; - } - return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); -} - -static ncclResult_t ncclProfiler_v1_as_v2_init(void** context, int* eActivationMask) { - ncclProfiler_v1->init(context, eActivationMask); - ncclProfiler_v1_as_v2.startEvent = ncclProfiler_v1_as_v2_startEvent; - ncclProfiler_v1_as_v2.stopEvent = ncclProfiler_v1->stopEvent; - ncclProfiler_v1_as_v2.recordEventState = ncclProfiler_v1->recordEventState; - ncclProfiler_v1_as_v2.finalize = ncclProfiler_v1->finalize; - return ncclSuccess; -} #define MAX_STR_LEN 256 -static void* tryOpenLib(char* name, int *err, char* errStr) { - if (nullptr == name || strlen(name) == 0) { - return nullptr; - } - - if (strncasecmp(name, "STATIC_PLUGIN", strlen(name)) == 0) { - name = nullptr; - } - - void *handle = dlopen(name, RTLD_NOW | RTLD_LOCAL); - if (nullptr == handle) { - strncpy(errStr, dlerror(), MAX_STR_LEN); - errStr[MAX_STR_LEN] = 0; - if (name && strstr(errStr, name) && strstr(errStr, "No such file or directory")) { - *err = 
ENOENT; - } - } - - return handle; -} - -static char* tryOpenLibCheck(int openErr, char* openErrStr, char* nameList, int *nameListLen, char* name) { - if (openErr == ENOENT) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; - return nameList; - } - INFO(NCCL_ENV, "PROFILER/Plugin: %s", openErrStr); - return nameList; -} - -static void* openProfilerPluginLib(char* couldNotFindNames, int len) { - int openErr; - void *pluginLib; - char profilerPluginLibName[PATH_MAX]; - char openErrStr[MAX_STR_LEN + 1] = { 0 }; - - const char *envProfilerPluginName = getenv("NCCL_PROFILER_PLUGIN"); - if (envProfilerPluginName && strlen(envProfilerPluginName)) { - snprintf(profilerPluginLibName, PATH_MAX, "%s", envProfilerPluginName); - pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName); - return pluginLib; - } - - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); - pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: Plugin name set by env to %s", profilerPluginLibName); - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); - } else { - snprintf(profilerPluginLibName, PATH_MAX, "libnccl-profiler.so"); - pluginLib = tryOpenLib(profilerPluginLibName, &openErr, openErrStr); - if (pluginLib) { - return pluginLib; - } - couldNotFindNames = tryOpenLibCheck(openErr, openErrStr, couldNotFindNames, &len, profilerPluginLibName); - } - - return nullptr; -} - enum { profilerPluginLoadFailed = -1, profilerPluginLoadReady = 0, @@ -195,43 +33,31 @@ enum { static int profilerPluginStatus = profilerPluginLoadReady; static pid_t pid; -#define MAX_PLUGIN_LOAD 2 - static ncclResult_t ncclProfilerPluginLoad(void) { if (profilerPluginLoadFailed == profilerPluginStatus) { return ncclSuccess; } - char couldNotFindNames[MAX_PLUGIN_LOAD * PATH_MAX] = { 0 }; pthread_mutex_lock(&profilerLock); if (profilerPluginLoadSuccess == profilerPluginStatus) { ++profilerPluginRefCount; goto exit; } - profilerPluginLib = openProfilerPluginLib(couldNotFindNames, MAX_PLUGIN_LOAD * PATH_MAX); + profilerPluginLib = ncclOpenProfilerPluginLib(ncclGetEnv("NCCL_PROFILER_PLUGIN")); if (profilerPluginLib == nullptr) { - if (strlen(couldNotFindNames)) { - INFO(NCCL_ENV, "PROFILER/Plugin: Could not find:%s.", couldNotFindNames); - } goto fail; } - ncclProfiler = (ncclProfiler_v2_t*)dlsym(profilerPluginLib, "ncclProfiler_v2"); + ncclProfiler = getNcclProfiler_v3(profilerPluginLib); if (ncclProfiler == nullptr) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2."); - ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(profilerPluginLib, "ncclProfiler_v1"); - if (ncclProfiler_v1 == nullptr) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); - goto fail; - } else { - ncclProfiler = &ncclProfiler_v1_as_v2; - ncclProfiler_v1_as_v2.name = ncclProfiler_v1->name; - ncclProfiler_v1_as_v2.init = ncclProfiler_v1_as_v2_init; - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v1."); - } - } else { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded ncclProfiler_v2."); + ncclProfiler = getNcclProfiler_v2(profilerPluginLib); + } + if (ncclProfiler == NULL) { + ncclProfiler = 
getNcclProfiler_v1(profilerPluginLib); + } + if (ncclProfiler == NULL) { + goto fail; } ++profilerPluginRefCount; @@ -247,7 +73,7 @@ static ncclResult_t ncclProfilerPluginLoad(void) { pthread_mutex_unlock(&profilerLock); return ncclSuccess; fail: - if (profilerPluginLib) dlclose(profilerPluginLib); + if (profilerPluginLib) NCCLCHECK(ncclClosePluginLib(profilerPluginLib)); profilerPluginStatus = profilerPluginLoadFailed; goto exit; } @@ -256,7 +82,7 @@ static ncclResult_t ncclProfilerPluginUnload(void) { pthread_mutex_lock(&profilerLock); if (0 == (--profilerPluginRefCount)) { INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name); - dlclose(profilerPluginLib); + NCCLCHECK(ncclClosePluginLib(profilerPluginLib)); profilerPluginLib = nullptr; ncclProfiler = nullptr; profilerPluginStatus = profilerPluginLoadReady; @@ -269,6 +95,11 @@ static ncclResult_t ncclProfilerPluginUnload(void) { #include "timer.h" #if ENABLE_TIMER +// These counters are used to measure profiler overheads for different part of the code +// These counters are only useful/meaningful in controlled test environments where there +// is only one thread updating each set of counters, i.e., every communicator has its +// own proxy thread and the network uses only one thread to make progress (this is true +// for net_ib plugin but might not be true for net_socket plugin). static int64_t elapsedCount; static int64_t initCount, finalizeCount; static int64_t groupStartCount, groupStopCount; @@ -324,15 +155,14 @@ static double proxyOpRecordTs[2], proxyStepRecordTs[2], proxyCtrlRecordTs[2]; #endif -static int eActivationMask; // Set by profiler -static int eActivationMaskGroup; // Cached for current group +int ncclProfilerEventMask; // Set by profiler ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) { TIME_START_EVENT(elapsed); TIME_START_EVENT(init); ncclProfilerPluginLoad(); if (__builtin_expect(ncclProfiler != NULL, 0)) { - int err = ncclProfiler->init(&comm->profilerContext, &eActivationMask); + int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask); if (err) { WARN("Profiler init failed with error (%d). 
Continue without profiler.", err); ncclProfiler = NULL; @@ -356,9 +186,29 @@ ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) { ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) { TIME_START_EVENT(groupStart); - eActivationMaskGroup = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (eActivationMaskGroup & (ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep)) { + // Check if any collective in the plan has a set event activation mask + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + int eActivationMask_ = 0; + while (ct) { + if (ct->eActivationMask) { + eActivationMask_ = ct->eActivationMask; + goto startGroup; + } + ct = ct->next; + } + // Check if any pt2pt in the plan has a set event activation mask + while (pt) { + if (pt->eActivationMask) { + eActivationMask_ = pt->eActivationMask; + goto startGroup; + } + pt = pt->next; + } + + startGroup: + if (eActivationMask_ & (ncclProfileGroup | ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileGroup; ncclProfiler->startEvent(plan->comm->profilerContext, &plan->groupEventHandle, &eDescr); @@ -379,52 +229,63 @@ ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan) { ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { TIME_START_EVENT(taskStart); - if (__builtin_expect(ncclProfiler != NULL, 0)) { - int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl); - if (plan->groupEventHandle && enable) { - struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); - while (ct) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileColl; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.coll.name = plan->comm->commName; - eDescr.coll.commHash = plan->comm->commHash; - eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]++; - eDescr.coll.func = ncclFuncToString(ct->func); - eDescr.coll.sendBuff = ct->sendbuff; - eDescr.coll.recvBuff = ct->recvbuff; - eDescr.coll.count = ct->count; - eDescr.coll.root = ct->root; - eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); - eDescr.coll.trafficBytes = ct->trafficBytes; - eDescr.coll.nMaxChannels = ct->nMaxChannels; - eDescr.coll.nWarps = ct->nWarps; - eDescr.coll.algo = ncclAlgoToString(ct->algorithm); - eDescr.coll.proto = ncclProtoToString(ct->protocol); - ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); - - // update collective task with group event activation mask - ct->eActivationMask = eActivationMaskGroup; - ct = ct->next; + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (plan->groupEventHandle) { + int enable = ct->eActivationMask & (ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileColl; + eDescr.parentObj = plan->groupEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.coll.name = plan->comm->commName; + eDescr.coll.commHash = plan->comm->commHash; + eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]; + eDescr.coll.func = 
ncclFuncToString(ct->func); + eDescr.coll.sendBuff = ct->sendbuff; + eDescr.coll.recvBuff = ct->recvbuff; + eDescr.coll.count = ct->count; + eDescr.coll.root = ct->root; + eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); + eDescr.coll.nMaxChannels = ct->nMaxChannels; + eDescr.coll.nWarps = ct->nWarps; + eDescr.coll.algo = ncclAlgoToString(ct->algorithm); + eDescr.coll.proto = ncclProtoToString(ct->protocol); + ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); + } } + } + // comm->seqNumber values are updated even if the plugin is not active, since they are used by RAS as well. + // The test for "persistent" is a workaround for graph-captured collectives. In their case this function may not be + // consistently invoked on all the ranks, which would lead to mismatched counter values and thus false-positive + // reports from RAS. Instead, we choose not to include graph-captured collectives in our counts. An exception is + // made if ncclProfileKernelCh profiler events are active, as they result in proxy events always being added, which + // gives the consistency. + if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle && + (ct->eActivationMask & ncclProfileKernelCh))) + plan->comm->seqNumber[ct->func]++; + ct = ct->next; + } + if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (plan->groupEventHandle) { struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); while (pt) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileP2p; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.p2p.name = plan->comm->commName; - eDescr.p2p.commHash = plan->comm->commHash; - eDescr.p2p.func = ncclFuncToString(pt->func); - eDescr.p2p.buff = pt->buff; - eDescr.p2p.count = pt->count; - eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); - eDescr.p2p.peer = pt->root; - ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); - - // update collective task with group event activation mask - pt->eActivationMask = eActivationMaskGroup; + int enable = pt->eActivationMask & (ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileP2p; + eDescr.parentObj = plan->groupEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.p2p.name = plan->comm->commName; + eDescr.p2p.commHash = plan->comm->commHash; + eDescr.p2p.func = ncclFuncToString(pt->func); + eDescr.p2p.buff = pt->buff; + eDescr.p2p.count = pt->count; + eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); + eDescr.p2p.peer = pt->root; + ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); + } pt = pt->next; } } @@ -436,16 +297,15 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { TIME_START_EVENT(taskStop); if (__builtin_expect(ncclProfiler != NULL, 0)) { - int enable = eActivationMaskGroup & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileColl); - if (plan->groupEventHandle && enable) { + if (plan->groupEventHandle) { struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); while (ct) { - ncclProfiler->stopEvent(ct->eventHandle); + if (ct->eventHandle) ncclProfiler->stopEvent(ct->eventHandle); ct = ct->next; } struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); while (pt) { - ncclProfiler->stopEvent(pt->eventHandle); 
+ if (pt->eventHandle) ncclProfiler->stopEvent(pt->eventHandle); pt = pt->next; } } @@ -463,7 +323,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) { + if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; @@ -485,7 +345,7 @@ ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileProxyOp)) { + if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyOp; eDescr.parentObj = sub->taskEventHandle; @@ -518,7 +378,7 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { + if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { int step_ = DIVUP(stepId, args->sliceSteps); ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; @@ -536,7 +396,7 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & ncclProfileProxyStep)) { + if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { int step_ = DIVUP(stepId, args->sliceSteps); ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; @@ -568,7 +428,7 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand TIME_START_EVENT(proxyCtrlStart); if (__builtin_expect(ncclProfiler != NULL, 0)) { // for proxy control events we allow profiling mode to change on a per event basis - int eActivationMaskProxy = __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED); + int eActivationMaskProxy = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); if (eActivationMaskProxy & ncclProfileProxyCtrl) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyCtrl; @@ -591,6 +451,30 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) { return ncclSuccess; } +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + struct ncclProxySubArgs* sub = &args->subs[s]; + if (sub->eActivationMask & ncclProfileKernelCh) { + ncclProfilerEventDescr_t eDescr = { }; + eDescr.type = ncclProfileKernelCh; + eDescr.parentObj = sub->taskEventHandle; + eDescr.kernelCh.channelId = sub->channelId; + ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr); + } + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + struct ncclProxySubArgs* sub = 
&args->subs[s]; + if (sub->kernelEventHandle) { + ncclProfiler->stopEvent(sub->kernelEventHandle); + } + } + return ncclSuccess; +} + ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyOpRecord); struct ncclProxySubArgs* sub = &args->subs[s]; @@ -619,7 +503,7 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs* ncclResult_t ncclProfilerRecordProxyCtrlEventState(void* eHandle, int appended, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyCtrlRecord); - if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&eActivationMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) { + if (__builtin_expect(ncclProfiler != NULL, 0) && eHandle && __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED) & ncclProfileProxyCtrl) { ncclProfilerEventStateArgs_t args = { }; args.proxyCtrl.appendedProxyOps = appended; ncclProfiler->recordEventState(eHandle, eState, &args); @@ -632,3 +516,47 @@ ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) { op->pid = pid; return ncclSuccess; } + +static pthread_mutex_t proxyProfilerConnectLock = PTHREAD_MUTEX_INITIALIZER; + +static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxyOp* op) { + ncclResult_t ret = ncclSuccess; + pthread_mutex_lock(&proxyProfilerConnectLock); + if (comm->profiler.initialized) goto exit; + for (int c = 0; c < MAXCHANNELS; c++) { + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.sendProxyConn[c]), ret, exit); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.sendProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit); + NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.recvProxyConn[c]), ret, exit); + NCCLCHECKGOTO(ncclProxyCallBlocking(comm, &comm->profiler.recvProxyConn[c], ncclProxyMsgConnect, NULL, 0, NULL, 0), ret, exit); + } + comm->profiler.initialized = true; +exit: + pthread_mutex_unlock(&proxyProfilerConnectLock); + return ret; +} + +bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op) { + bool enabled = (__builtin_expect(ncclProfiler != NULL, 0) && (op->eActivationMask & ncclProfileKernelCh)); + if (enabled && !comm->profiler.initialized) (void)proxyProfilerConnect(comm, op); + return enabled; +} + +ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) { + if (__builtin_expect(ncclProfiler != NULL, 0)) { + struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle; + if (type == 0) { // start + if (sub->eActivationMask & ncclProfileNetPlugin) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileNetPlugin; + eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS]; + eDescr.rank = sub->rank; + eDescr.netPlugin.id = pluginId; + eDescr.netPlugin.data = extData; + ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr); + } + } else { // stop + ncclProfiler->stopEvent(*eHandle); + } + } + return ncclSuccess; +} diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc new file mode 100644 index 000000000..139742942 --- /dev/null +++ b/src/plugin/profiler/profiler_v1.cc @@ -0,0 +1,133 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" +#include "checks.h" + +static ncclProfiler_t ncclProfiler; +static ncclProfiler_v1_t* ncclProfiler_v1; + +static uint8_t ncclStringToFunc(const char* func) { + if (0 == strcmp(func, "AllGather")) return ncclFuncAllGather; + if (0 == strcmp(func, "AllReduce")) return ncclFuncAllReduce; + if (0 == strcmp(func, "Broadcast")) return ncclFuncBroadcast; + if (0 == strcmp(func, "Recv")) return ncclFuncRecv; + if (0 == strcmp(func, "Reduce")) return ncclFuncReduce; + if (0 == strcmp(func, "ReduceScatter")) return ncclFuncReduceScatter; + if (0 == strcmp(func, "SendRecv")) return ncclFuncSendRecv; + return ncclFuncSend; +} + +static uint8_t ncclStringToAlgo(const char* algo) { + if (0 == strcmp(algo, "TREE")) return NCCL_ALGO_TREE; + if (0 == strcmp(algo, "RING")) return NCCL_ALGO_RING; + if (0 == strcmp(algo, "COLLNET_DIRECT")) return NCCL_ALGO_COLLNET_DIRECT; + if (0 == strcmp(algo, "COLLNET_CHAIN")) return NCCL_ALGO_COLLNET_CHAIN; + if (0 == strcmp(algo, "NVLS")) return NCCL_ALGO_NVLS; + if (0 == strcmp(algo, "NVLS_TREE")) return NCCL_ALGO_NVLS_TREE; + return NCCL_ALGO_PAT; +} + +static uint8_t ncclStringToProto(const char* proto) { + if (0 == strcmp(proto, "LL")) return NCCL_PROTO_LL; + if (0 == strcmp(proto, "LL128")) return NCCL_PROTO_LL128; + return NCCL_PROTO_SIMPLE; +} + +static uint8_t ncclStringToDatatype(const char* dt) { + if (0 == strcmp(dt, "ncclInt8")) return ncclInt8; + if (0 == strcmp(dt, "ncclInt32")) return ncclInt32; + if (0 == strcmp(dt, "ncclUint32")) return ncclUint32; + if (0 == strcmp(dt, "ncclInt64")) return ncclInt64; + if (0 == strcmp(dt, "ncclUint64")) return ncclUint64; + if (0 == strcmp(dt, "ncclFloat16")) return ncclFloat16; + if (0 == strcmp(dt, "ncclFloat32")) return ncclFloat32; +#if defined(__CUDA_BF16_TYPES_EXIST__) + if (0 == strcmp(dt, "ncclBfloat16")) return ncclBfloat16; +#endif + return ncclFloat64; +} + +static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 }; + eDescr_v1.type = eDescr->type; + eDescr_v1.parentObj = eDescr->parentObj; + eDescr_v1.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v1.coll.name = eDescr->coll.name; + eDescr_v1.coll.commHash = eDescr->coll.commHash; + eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); + eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v1.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v1.coll.count = eDescr->coll.count; + eDescr_v1.coll.root = eDescr->coll.root; + eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); + eDescr_v1.coll.op = 0; // removed in v2 + eDescr_v1.coll.trafficBytes = 0; // removed in v3 + eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v1.coll.nWarps = eDescr->coll.nWarps; + eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); + eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); + } break; + case ncclProfileP2p: { + eDescr_v1.p2p.name = eDescr->p2p.name; + eDescr_v1.p2p.commHash = eDescr->p2p.commHash; + eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); + eDescr_v1.p2p.buff = eDescr->p2p.buff; + eDescr_v1.p2p.count = eDescr->p2p.count; + eDescr_v1.p2p.datatype = ncclStringToDatatype(eDescr->p2p.datatype); + eDescr_v1.p2p.peer 
= eDescr->p2p.peer; + } break; + case ncclProfileProxyOp: { + eDescr_v1.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v1.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v1.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v1.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v1.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v1.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v1.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + case ncclProfileKernelCh: + case ncclProfileNetPlugin: { + *eHandle = NULL; + return ncclSuccess; + } + default:; + } + return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); +} + +static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { + return ncclProfiler_v1->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v1_t*)eStateArgs); +} + +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) { + NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask)); + ncclProfiler.startEvent = ncclProfiler_startEvent; + ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent; + ncclProfiler.recordEventState = ncclProfiler_recordEventState; + ncclProfiler.finalize = ncclProfiler_v1->finalize; + return ncclSuccess; +} + +ncclProfiler_t* getNcclProfiler_v1(void* lib) { + ncclProfiler_v1 = (ncclProfiler_v1_t*)dlsym(lib, "ncclProfiler_v1"); + if (ncclProfiler_v1) { + ncclProfiler.name = ncclProfiler_v1->name; + ncclProfiler.init = ncclProfiler_init; + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v1->name); + return &ncclProfiler; + } + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); + return NULL; +} diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc new file mode 100644 index 000000000..3d00008a6 --- /dev/null +++ b/src/plugin/profiler/profiler_v2.cc @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" +#include "checks.h" + +static ncclProfiler_t ncclProfiler; +static ncclProfiler_v2_t* ncclProfiler_v2; + +static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + if (eDescr->type == ncclProfileKernelCh || eDescr->type == ncclProfileNetPlugin) { + *eHandle = NULL; + return ncclSuccess; + } + return ncclProfiler_v2->startEvent(context, eHandle, (ncclProfilerEventDescr_v2_t *)eDescr); +} + +static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { + return ncclProfiler_v2->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v2_t *)eStateArgs); +} + +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) { + NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask)); + ncclProfiler.startEvent = ncclProfiler_startEvent; + ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent; + ncclProfiler.recordEventState = ncclProfiler_recordEventState; + ncclProfiler.finalize = ncclProfiler_v2->finalize; + return ncclSuccess; +} + +ncclProfiler_t* getNcclProfiler_v2(void* lib) { + ncclProfiler_v2 = (ncclProfiler_v2_t*)dlsym(lib, "ncclProfiler_v2"); + if (ncclProfiler_v2) { + ncclProfiler.name = ncclProfiler_v2->name; + ncclProfiler.init = ncclProfiler_init; + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v2->name); + return &ncclProfiler; + } + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2"); + return NULL; +} diff --git a/src/plugin/profiler/profiler_v3.cc b/src/plugin/profiler/profiler_v3.cc new file mode 100644 index 000000000..322bea57a --- /dev/null +++ b/src/plugin/profiler/profiler_v3.cc @@ -0,0 +1,20 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" + +static ncclProfiler_v3_t* ncclProfiler_v3; + +ncclProfiler_t* getNcclProfiler_v3(void* lib) { + ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3"); + if (ncclProfiler_v3) { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name); + return ncclProfiler_v3; + } + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3"); + return NULL; +} diff --git a/src/plugin/tuner.cc b/src/plugin/tuner.cc new file mode 100644 index 000000000..443bf78c4 --- /dev/null +++ b/src/plugin/tuner.cc @@ -0,0 +1,99 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include + +#include "checks.h" +#include "debug.h" +#include "tuner.h" +#include "plugin.h" + +extern ncclTuner_t* getNcclTuner_v2(void* lib); +extern ncclTuner_t* getNcclTuner_v3(void* lib); +extern ncclTuner_t* getNcclTuner_v4(void* lib); + +pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; +static int tunerPluginRefCount; +static void* tunerPluginLib = nullptr; +static ncclTuner_t* tunerSymbol = nullptr; + +enum { + tunerPluginLoadFailed = -1, + tunerPluginLoadReady = 0, + tunerPluginLoadSuccess = 1, +}; + +#define MAX_PLUGIN_LOAD 4 + +static int status = tunerPluginLoadReady; + +ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { + // Initialize to nullptr by default if plugin tuner cannot be loaded. + comm->tuner = nullptr; + if (tunerPluginLoadFailed == status) { + return ncclSuccess; + } + + pthread_mutex_lock(&tunerPluginLock); + if (tunerPluginLoadFailed == status) { + goto exit; + } + + if (tunerPluginLoadSuccess == status) { + comm->tuner = tunerSymbol; + ++tunerPluginRefCount; + goto exit; + } + + tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN")); + if (nullptr == tunerPluginLib) { + tunerPluginLib = ncclGetNetPluginLib(); + if (nullptr == tunerPluginLib) { + goto fail; + } + } + + tunerSymbol = getNcclTuner_v4(tunerPluginLib); + if (tunerSymbol == NULL) { + tunerSymbol = getNcclTuner_v3(tunerPluginLib); + } + if (tunerSymbol == NULL) { + tunerSymbol = getNcclTuner_v2(tunerPluginLib); + } + if (tunerSymbol == NULL) { + goto fail; + } + + comm->tuner = tunerSymbol; + ++tunerPluginRefCount; + status = tunerPluginLoadSuccess; + comm->tunerPluginLoaded = 1; + +exit: + pthread_mutex_unlock(&tunerPluginLock); + return ncclSuccess; +fail: + tunerPluginLib = nullptr; + status = tunerPluginLoadFailed; + goto exit; +} + +ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { + pthread_mutex_lock(&tunerPluginLock); + if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { + INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); + NCCLCHECK(ncclClosePluginLib(tunerPluginLib)); + tunerPluginLib = nullptr; + tunerSymbol = nullptr; + comm->tuner = nullptr; + status = tunerPluginLoadReady; + comm->tunerPluginLoaded = 0; + } + pthread_mutex_unlock(&tunerPluginLock); + return ncclSuccess; +} diff --git a/src/plugin/tuner/tuner_v2.cc b/src/plugin/tuner/tuner_v2.cc new file mode 100644 index 000000000..005638f01 --- /dev/null +++ b/src/plugin/tuner/tuner_v2.cc @@ -0,0 +1,66 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "debug.h" +#include "checks.h" +#include "nccl_tuner.h" + +static ncclTuner_v2_t* ncclTuner_v2; +static ncclTuner_t ncclTuner; + +static int hasNvlsSupport(float** collCostTable) { + // Requirements for support of different algorithms: + // + // - NVLS intra-node: nvlsSupport + // - NVLS intra+inter-node: collNetSupport + // - NVLSTree intra-node: always disabled + // - NVLSTree inter-node: nvlsSupport + // - Collnet* inter-node: collNetSupport + // + // nvlsSupport = 1 if either NVLS or NVLS_TREE entries in the cost table are not -1 + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + return (table[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE || table[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) ? 1 : 0; +} + +static int hasCollNetSupport(float** collCostTable) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + return (table[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] == NCCL_ALGO_PROTO_IGNORE) ? 0 : 1; +} + +static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo __attribute__((unused)), int numProto __attribute__((unused)), int regBuff __attribute__((unused)), int* nChannels) { + int algorithm = NCCL_ALGO_UNDEF; + int protocol = NCCL_PROTO_UNDEF; + int nvlsSupport = hasNvlsSupport(collCostTable); + int collNetSupport = hasCollNetSupport(collCostTable); + NCCLCHECK(ncclTuner_v2->getCollInfo(context, collType, nBytes, collNetSupport, nvlsSupport, numPipeOps, &algorithm, &protocol, nChannels)); + // set time to 0 below to make sure this algorithm/protocol is selected later on + if (algorithm >= 0 && algorithm < NCCL_NUM_ALGORITHMS && protocol >= 0 && protocol < NCCL_NUM_PROTOCOLS) { + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[algorithm][protocol] != NCCL_ALGO_PROTO_IGNORE) table[algorithm][protocol] = 0.0; + } + return ncclSuccess; +} + +static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) { + NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, context)); + ncclTuner.getCollInfo = ncclTuner_getCollInfo; + ncclTuner.destroy = ncclTuner_v2->destroy; + return ncclSuccess; +} + +ncclTuner_t* getNcclTuner_v2(void* lib) { + ncclTuner_v2 = (ncclTuner_v2_t*)dlsym(lib, "ncclTunerPlugin_v2"); + if (ncclTuner_v2) { + ncclTuner.name = ncclTuner_v2->name; + ncclTuner.init = ncclTuner_init; + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v2->name); + return &ncclTuner; + } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); + return NULL; +} diff --git a/src/plugin/tuner/tuner_v3.cc b/src/plugin/tuner/tuner_v3.cc new file mode 100644 index 000000000..3898243bc --- /dev/null +++ b/src/plugin/tuner/tuner_v3.cc @@ -0,0 +1,38 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "debug.h" +#include "checks.h" +#include "nccl_tuner.h" + +static ncclTuner_v3_t* ncclTuner_v3; +static ncclTuner_t ncclTuner; + +static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff __attribute__((unused)), int* nChannels) { + NCCLCHECK(ncclTuner_v3->getCollInfo(context, collType, nBytes, numPipeOps, collCostTable, numAlgo, numProto, nChannels)); + return ncclSuccess; +} + +static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) { + NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logfn, context)); + ncclTuner.getCollInfo = ncclTuner_getCollInfo; + ncclTuner.destroy = ncclTuner_v3->destroy; + return ncclSuccess; +} + +ncclTuner_t* getNcclTuner_v3(void* lib) { + ncclTuner_v3 = (ncclTuner_v3_t*)dlsym(lib, "ncclTunerPlugin_v3"); + if (ncclTuner_v3) { + ncclTuner.name = ncclTuner_v3->name; + ncclTuner.init = ncclTuner_init; + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v3->name); + return &ncclTuner; + } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); + return NULL; +} diff --git a/src/plugin/tuner/tuner_v4.cc b/src/plugin/tuner/tuner_v4.cc new file mode 100644 index 000000000..4bfd116bb --- /dev/null +++ b/src/plugin/tuner/tuner_v4.cc @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "debug.h" +#include "nccl_tuner.h" + +static ncclTuner_v4_t* ncclTuner_v4; + +ncclTuner_t* getNcclTuner_v4(void* lib) { + ncclTuner_v4 = (ncclTuner_v4_t*)dlsym(lib, "ncclTunerPlugin_v4"); + if (ncclTuner_v4) { + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v4->name); + return ncclTuner_v4; + } + INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); + return NULL; +} diff --git a/src/proxy.cc b/src/proxy.cc index 5a83ef3eb..7e8021e47 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -383,6 +383,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->pid = op->pid; sub->profilerContext = op->profilerContext; sub->ringAlgo = op->ringAlgo; + sub->workCounter = op->workCounter; args->nsubs = subIndex+1; if (subIndex) { if ((args->sliceSteps != op->sliceSteps) || @@ -532,6 +533,19 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon return ncclSuccess; } +static ncclResult_t SaveProxyProfiler(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) { + struct ncclProxyConnector* proxyConn = (op->coll == ncclFuncRecv) ? 
&comm->profiler.recvProxyConn[op->channelId] : &comm->profiler.sendProxyConn[op->channelId]; + if (justInquire) *justInquire = true; + else { + op->sendbuff = (uint8_t *)comm->profiler.workStarted; + op->recvbuff = (uint8_t *)comm->profiler.workCompleted; + NCCLCHECK(ncclLocalOpAppend(comm, proxyConn, op)); + // Ensure that in graph capturing the proxy workCounter is incremented to keep up with kernel workCounter + op->workCounter += comm->profiler.workCounter[op->channelId]; + } + return ncclSuccess; +} + static ncclResult_t SaveProxy(struct ncclComm* comm, struct ncclChannel* channel, int type, int peer, struct ncclProxyOp* op, int connIndex, bool* justInquire) { if (peer < 0) return ncclSuccess; @@ -612,20 +626,19 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool // Run full algorithm to count the number of steps for each peer. ncclResult_t result = ncclSuccess; const ssize_t size = op->nbytes/comm->nRanks; - int last = 0; - int *nstepsSend = NULL, *nstepsRecv = NULL; const int rank = comm->rank, nranks = comm->nRanks; - PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + int *nstepsSend = NULL, *nstepsRecv = NULL; + PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up); - while (last == 0) { - int recvDim, sendDim, recvOffset, sendOffset, sendStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, sendStepOffset, nelem, postRecv, postSend, last); - if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++; - if (sendDim != -1 && postSend) nstepsSend[sendDim]++; - } + struct ncclPatStep ps; + do { + algo.getNextOp(&ps); + if (ps.flags & PatSkipped) continue; + if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++; + if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++; + } while (ps.last != 2); for (int i=0; inbytes/comm->nRanks; - int last = 0; - int *nstepsSend = NULL, *nstepsRecv = NULL; const int rank = comm->rank, nranks = comm->nRanks; - PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 0, size, size, op->chunkSize, rank, nranks); + int *nstepsSend = NULL, *nstepsRecv = NULL; + PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down); - while (last == 0) { - int recvDim, sendDim, recvOffset, sendOffset, recvStepOffset, postRecv, postSend, nelem; - size_t inpIx, outIx; - algo.getNextOp(recvDim, sendDim, inpIx, outIx, recvOffset, sendOffset, recvStepOffset, nelem, postRecv, postSend, last); - if (recvDim != -1 && postRecv) nstepsRecv[recvDim]++; - if (sendDim != -1 && postSend) nstepsSend[sendDim]++; - } + struct ncclPatStep ps; + do { + algo.getNextOp(&ps); + if (ps.flags & PatSkipped) continue; + if (ps.recvDim != -1 && ps.postRecv) nstepsRecv[ps.recvDim]++; + if (ps.sendDim != -1 && ps.postSend) nstepsSend[ps.sendDim]++; + } while (ps.last != 2); for (int i=0; iroot == comm->rank) return ncclSuccess; NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); } break; + case ncclPatternProfiler: { + if (ncclProfilerNeedsProxy(comm, op)) { + NCCLCHECK(SaveProxyProfiler(comm, op, justInquire)); + } + } break; } return ncclSuccess; } @@ -725,10 +742,10 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; TIME_START(0); TIME_START(1); - NCCLCHECK(op->progress(proxyState, op)); + ncclResult_t ret = op->progress(proxyState, op); if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; - if (op->state == ncclProxyOpNone) { + if (op->state == ncclProxyOpNone || ret != ncclSuccess) { TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); TIME_STOP(2); @@ -910,7 +927,7 @@ void* ncclProxyProgress(void *proxyState_) { if (ret != ncclSuccess) { __atomic_store_n(&proxyState->asyncResult, ret, __ATOMIC_RELEASE); INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); - continue; + break; } void* eHandle; ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); @@ -932,7 +949,7 @@ void* ncclProxyProgress(void *proxyState_) { } } lastIdle = idle; - } while (state->stop == 0 || (state->stop == 1 && state->active)); + } while ((state->stop == 0 || (state->stop == 1 && state->active)) && __atomic_load_n(proxyState->abortFlag, __ATOMIC_ACQUIRE) == 0); return NULL; } @@ -1140,6 +1157,7 @@ ncclResult_t ncclProxyCallBlockingUDS(struct ncclComm* comm, struct ncclProxyCon } ncclIpcHdr hdr; + memset(&hdr, '\0', sizeof(hdr)); hdr.type = type; hdr.rank = rank; hdr.reqSize = reqSize; @@ -1323,9 +1341,12 @@ static ncclResult_t proxyProgressInit(struct ncclProxyState* proxyState) { pthread_mutexattr_init(&mutexAttr); pthread_mutexattr_setpshared(&mutexAttr, PTHREAD_PROCESS_SHARED); pthread_mutex_init(&pool->mutex, &mutexAttr); + pthread_mutexattr_destroy(&mutexAttr); pthread_condattr_t condAttr; + pthread_condattr_init(&condAttr); pthread_condattr_setpshared(&condAttr, PTHREAD_PROCESS_SHARED); pthread_cond_init(&pool->cond, &condAttr); + pthread_condattr_destroy(&condAttr); state->opsPool = pool; memcpy(state->opsPoolShmSuffix, shmPath+sizeof("/dev/shm/nccl-")-1, sizeof("XXXXXX")-1); diff --git a/src/ras/client_support.cc b/src/ras/client_support.cc index 3e4e9a504..3eafe1b79 100644 --- a/src/ras/client_support.cc +++ b/src/ras/client_support.cc @@ -4,8 +4,6 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out duriyng development only! -#include #include #include @@ -26,26 +24,26 @@ #define STR2(v) #v #define STR(v) STR2(v) -// The RAS client listening socket of this RAS thread (normally port 28028). -int rasClientListeningSocket = -1; - -// Auxiliary structure used when processing the results. Helps with statistics gathering and sorting. +// Generic auxiliary structure used when processing the results. Helps with statistics gathering and sorting, +// e.g., for the calculation of the distribution of the number of peers per node, of the number of GPUs per peer, +// of the communicator sizes, or of the counts of collective operations. struct rasValCount { uint64_t value; // The observed value. int count; // The number of occurences of this value in the results. int firstIdx; // The index of the first occurence of this value in the results. }; -// Used in rasAuxComm below. The values are bitmasks so that they can be combined. 
+// Communicator status, used in rasAuxComm below. The values are bitmasks so that they can be combined. typedef enum { - RAS_ACS_UNKNOWN = 1, // Set if a peer did not provide info about a given communicator. + RAS_ACS_NOCOMM = 1, // Set if the peer claims not to be a member of a given communicator. RAS_ACS_INIT = 2, RAS_ACS_RUNNING = 4, RAS_ACS_FINALIZE = 8, RAS_ACS_ABORT = 16 } rasACStatus; -// Used in rasAuxComm below. The values are bitmasks so that they can be combined (with the exception of RAS_ACE_OK). +// Communicator errors, used in rasAuxComm below. The values are bitmasks so that they can be combined (with the +// exception of RAS_ACE_OK). typedef enum { RAS_ACE_OK = 0, RAS_ACE_MISMATCH = 1, @@ -53,22 +51,45 @@ typedef enum { RAS_ACE_INCOMPLETE = 4 } rasACError; -// Auxiliary structure used when processing the results. Helps with sorting and includes additional statistics -// on the number of peers and nodes for a communicator. +// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query. For each communicator, caches +// statistics extracted from the results, such as the number of peers and nodes or the communicator status. Includes +// a pointer to the communicator data in the results, making it easy to sort the communicators by a different key +// without altering the results buffer, or just to iterate over the communicators, given that the communicator data +// in the resuls is of variable length. struct rasAuxComm { - struct rasCollComms::comm* comm; + struct rasCollComms::comm* comm; // Points to the results buffer. int nPeers; int nNodes; int ranksPerNodeMin; int ranksPerNodeMax; unsigned int status; // Bitmask of rasACStatus values. unsigned int errors; // Bitmask of rasACError values. - uint64_t firstCollOpCount; // collOpCount of the first rank, to compare against. + uint64_t firstCollOpCounts[NCCL_NUM_FUNCTIONS]; // collOpCounts of the first rank, to compare against. + int nIncompleteRanks; // Number of ranks that we didn't get any response from. }; +// Auxiliary structure used when processing the rasPeerInfo data stored in the global rasPeers array. Makes it possible +// to extract a subset of peers (e.g., the dead ones), to sort by a different key without altering the original array, +// and also has room for extracted temporary data such as the number of peers per node or the number of GPUs per peer. +struct rasAuxPeerInfo { + struct rasPeerInfo* peer; // Points to an element in rasPeers. + int value; +}; + +// Auxiliary structure used when processing the results of the RAS_COLL_COMMS query, specifically when iterating over +// each communicator's ranks. Makes it possible to sort by a different key without altering the original array, and +// also has room for extracted temporary data such as the rank's status or a count of collective operations. +struct rasAuxCommRank { + struct rasCollComms::comm::rank* rank; // Points to the results buffer. + uint64_t value; +}; + +// The RAS client listening socket of this RAS thread (normally port 28028). +int rasClientListeningSocket = -1; + // Connected RAS clients. -struct rasClient* rasClients; -int nRasClients; +struct rasClient* rasClientsHead; +struct rasClient* rasClientsTail; // Minimum byte count to increment the output buffer size by if it's too small. 
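The rasClients bookkeeping above changes from a realloc-grown array indexed by clientIdx into an intrusive doubly-linked list anchored at rasClientsHead/rasClientsTail, so entries can be allocated individually with ncclCalloc and unlinked on termination without invalidating the other clients. A minimal standalone sketch of the append and unlink steps, using a simplified Client type rather than the real rasClient:

#include <stdio.h>
#include <stdlib.h>

// Simplified stand-in for struct rasClient; only the intrusive list links matter here.
struct Client {
  int sock;
  struct Client* prev;
  struct Client* next;
};

static struct Client* clientsHead = nullptr;
static struct Client* clientsTail = nullptr;

// Append a zero-initialized entry at the tail (the getNewClientEntry pattern).
static struct Client* clientAppend(int sock) {
  struct Client* c = (struct Client*)calloc(1, sizeof(*c));
  if (c == nullptr) return nullptr;
  c->sock = sock;
  if (clientsHead) {
    clientsTail->next = c;
    c->prev = clientsTail;
    clientsTail = c;
  } else {
    clientsHead = clientsTail = c;
  }
  return c;
}

// Unlink and free an entry (the rasClientTerminate pattern): fix up head/tail first,
// then the neighbors, then release the node.
static void clientRemove(struct Client* c) {
  if (c == clientsHead) clientsHead = clientsHead->next;
  if (c == clientsTail) clientsTail = clientsTail->prev;
  if (c->prev) c->prev->next = c->next;
  if (c->next) c->next->prev = c->prev;
  free(c);
}

int main() {
  struct Client* a = clientAppend(3);
  struct Client* b = clientAppend(4);
  clientAppend(5);
  clientRemove(b);  // remove from the middle
  clientRemove(a);  // remove the old head
  for (struct Client* c = clientsHead; c; c = c->next) printf("client with sock %d\n", c->sock);
  return 0;
}

One visible consequence in the hunks below is that clients are identified by pointer from then on: rasClientResume matches client->coll against the completed collective instead of comparing a stored collIdx.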
#define RAS_OUT_INCREMENT 4096 @@ -85,6 +106,7 @@ static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS // Still, 1024 should normally be plenty (verbose output may make things more difficult, // but we do check for overflows, so it will just be trimmed). + static ncclResult_t getNewClientEntry(struct rasClient** pClient); static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen); static void rasClientTerminate(struct rasClient* client); @@ -101,15 +123,13 @@ static void rasOutExtract(char* buffer); static int rasOutLength(); static void rasOutReset(); -static int rasPeersNGpuCompare(const void* e1, const void* e2); -static int rasPeersNProcsCompare(const void* e1, const void* e2); -static int rasPeersHostPidCompare(const void* e1, const void* e2); +static int rasAuxPeersValueCompare(const void* e1, const void* e2); static int ncclSocketsHostCompare(const void* p1, const void* p2); static int rasValCountsCompareRev(const void* p1, const void* p2); static int rasAuxCommsCompareRev(const void* p1, const void* p2); -static int rasCommRanksPeerCompare(const void* p1, const void* p2); -static int rasCommRanksCollOpCompare(const void* p1, const void* p2); +static int rasAuxCommRanksValueCompare(const void* p1, const void* p2); +static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size); static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size); static const char* ncclErrorToString(ncclResult_t err); static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size); @@ -181,21 +201,20 @@ ncclResult_t rasClientAcceptNewSocket() { // Returns the index of the first available entry in the rasClients array, enlarging the array if necessary. static ncclResult_t getNewClientEntry(struct rasClient** pClient) { struct rasClient* client; - int i; - for (i = 0; i < nRasClients; i++) - if (rasClients[i].status == RAS_CLIENT_CLOSED) - break; - if (i == nRasClients) { - NCCLCHECK(ncclRealloc(&rasClients, nRasClients, nRasClients+RAS_INCREMENT)); - nRasClients += RAS_INCREMENT; - } - client = rasClients+i; - memset(client, '\0', sizeof(*client)); + NCCLCHECK(ncclCalloc(&client, 1)); + client->sock = client->pfd = -1; ncclIntruQueueConstruct(&client->sendQ); client->timeout = RAS_COLLECTIVE_LEG_TIMEOUT; - client->collIdx = -1; + + if (rasClientsHead) { + rasClientsTail->next = client; + client->prev = rasClientsTail; + rasClientsTail = client; + } else { + rasClientsHead = rasClientsTail = client; + } *pClient = client; return ncclSuccess; @@ -219,22 +238,32 @@ static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgL struct rasMsgMeta* meta = (struct rasMsgMeta*)((char*)msg - offsetof(struct rasMsgMeta, msg)); meta->offset = 0; meta->length = msgLen; - ncclIntruQueueEnqueue(&client->sendQ, meta); - assert(client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED); - rasPfds[client->pfd].events |= POLLOUT; + if (client->status != RAS_CLIENT_CLOSED && client->status < RAS_CLIENT_FINISHED) { + ncclIntruQueueEnqueue(&client->sendQ, meta); + rasPfds[client->pfd].events |= POLLOUT; + } else { + INFO(NCCL_RAS, "RAS invalid client status %d -- internal error?", client->status); + } } // Terminates a connection with a RAS client. 
static void rasClientTerminate(struct rasClient* client) { (void)close(client->sock); - client->sock = -1; - client->status = RAS_CLIENT_CLOSED; rasPfds[client->pfd].fd = -1; rasPfds[client->pfd].events = rasPfds[client->pfd].revents = 0; - client->pfd = -1; while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&client->sendQ)) { free(meta); } + + if (client == rasClientsHead) + rasClientsHead = rasClientsHead->next; + if (client == rasClientsTail) + rasClientsTail = rasClientsTail->prev; + if (client->prev) + client->prev->next = client->next; + if (client->next) + client->next->prev = client->prev; + free(client); } @@ -245,16 +274,12 @@ static void rasClientTerminate(struct rasClient* client) { // Invoked when an asynchronous operation that a client was waiting on completes. Finds the right client and // reinvokes rasClientRun. ncclResult_t rasClientResume(struct rasCollective* coll) { - int collIdx = coll-rasCollectives; - int i; - struct rasClient* client = nullptr; - for (i = 0; i < nRasClients; i++) { - client = rasClients+i; - if (client->status != RAS_CLIENT_CLOSED && client->collIdx == collIdx) { + struct rasClient* client; + + for (client = rasClientsHead; client; client = client->next) + if (client->coll == coll) break; - } - } - if (i == nRasClients) { + if (client == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching client!"); rasCollFree(coll); goto exit; @@ -266,8 +291,7 @@ ncclResult_t rasClientResume(struct rasCollective* coll) { } // Handles a ready client FD from the main event loop. -void rasClientEventLoop(int clientIdx, int pollIdx) { - struct rasClient* client = rasClients+clientIdx; +void rasClientEventLoop(struct rasClient* client, int pollIdx) { bool closed = false; if (client->status == RAS_CLIENT_CONNECTED) { @@ -431,7 +455,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_CONNS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunConns(client), ret, exit); #endif client->status = RAS_CLIENT_COMMS; @@ -440,7 +463,6 @@ static ncclResult_t rasClientRun(struct rasClient* client) { break; } case RAS_CLIENT_COMMS: - assert(client->collIdx != -1); NCCLCHECKGOTO(rasClientRunComms(client), ret, exit); client->status = RAS_CLIENT_FINISHED; break; @@ -459,7 +481,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasPeerInfo* peersReSorted = nullptr; + struct rasAuxPeerInfo* auxRasPeers = nullptr; int totalGpus, totalNodes, firstNGpusNode, firstNGpusGlobal, firstNPeersGlobal; bool consistentNGpusNode, consistentNGpusGlobal, consistentNPeersGlobal; int firstIdx, nPeers; @@ -467,6 +489,8 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { int nValCounts; static int cudaDriver = -1, cudaRuntime = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunInit: starting"); + rasOutReset(); rasOutAppend("NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX " compiled with CUDA " STR(CUDA_MAJOR) "." STR(CUDA_MINOR) "\n"); @@ -481,7 +505,6 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; - rasOutReset(); totalGpus = totalNodes = 0; firstNGpusNode = 0; // #GPUs on the first peer of a node. firstNGpusGlobal = 0; // #GPUs on peerIdx 0. @@ -489,7 +512,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { consistentNGpusGlobal = true; // Whether #GPUs/peer is consistent between the peers *on all nodes*. 
consistentNPeersGlobal = true; // Whether #peers/node is consistent between all nodes. nPeers = 0; // #peers on a node. - firstNPeersGlobal = 0; + firstNPeersGlobal = 0; // #peers on the first node. for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { int nGpus = __builtin_popcountll(rasPeers[peerIdx].cudaDevs); totalGpus += nGpus; @@ -522,6 +545,11 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } } // for (peerIdx) + TRACE(NCCL_RAS, "RAS: totalNodes %d, nRasPeers %d, totalGpus %d", totalNodes, nRasPeers, totalGpus); + TRACE(NCCL_RAS, "RAS: consistentNPeersGlobal %d, consistentNGpusGlobal %d, consistentNGpusNode %d", + consistentNPeersGlobal, consistentNGpusGlobal, consistentNGpusNode); + TRACE(NCCL_RAS, "RAS: firstNPeersGlobal %d, firstNGpusGlobal %d", firstNPeersGlobal, firstNGpusGlobal); + rasOutAppend("Job summary\n" "===========\n\n"); @@ -532,22 +560,24 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus); } else { // Gather the stats on the number of processes per node. However, that number is not a property of a peer, - // but of a group of peers, so calculating it is more involved. We make a copy of rasPeers and creatively - // misuse it: cudaDevs of each element will be repurposed to store the number of processes on the node. - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // but of a group of peers, so calculating it is more involved. We store the value in a temporary auxRasPeers + // array. + NCCLCHECKGOTO(ncclCalloc(&auxRasPeers, nRasPeers), ret, fail); firstIdx = 0; nPeers = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].peer = rasPeers+peerIdx; if (peerIdx == 0) { nPeers = 1; firstIdx = 0; } else { // peerIdx > 0 - if (!ncclSocketsSameNode(&peersReSorted[peerIdx].addr, &peersReSorted[peerIdx-1].addr)) { + if (!ncclSocketsSameNode(&auxRasPeers[peerIdx].peer->addr, &auxRasPeers[peerIdx-1].peer->addr)) { + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < peerIdx; i++) { // Go back and update the number of processes of all the elements of that node. - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } nPeers = 1; firstIdx = peerIdx; @@ -557,21 +587,23 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { } // peerIdx > 0 if (peerIdx == nRasPeers-1) { // Last iteration of the loop. + TRACE(NCCL_RAS, "RAS: node %s: nPeers %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), nPeers); for (int i = firstIdx; i < nRasPeers; i++) { - peersReSorted[i].cudaDevs = nPeers; + auxRasPeers[i].value = nPeers; } } } // for (peerIdx) - // Re-sort it now using the number of processes on the node (cudaDevs) as the primary key, host IP as the + // Re-sort it now using the number of processes on the node (value) as the primary key, host IP as the // secondary, and process id as the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNProcsCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of peers per node. 
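The peers-per-node pass just above relies on rasPeers being sorted by address: it walks the array, and whenever ncclSocketsSameNode reports a node boundary it goes back and writes the accumulated count into every auxRasPeers entry of the node that just ended, with a final back-fill after the last element. A simplified, self-contained version of that back-fill loop, with a plain int host id standing in for the socket address:

#include <stdio.h>

int main() {
  // Peers sorted by node (rasPeers is kept sorted by address), one entry per process.
  int host[]  = { 10, 10, 10, 20, 20, 30 };
  int value[] = {  0,  0,  0,  0,  0,  0 };  // peers-per-node, filled in below
  const int n = 6;

  int firstIdx = 0, nPeers = 0;
  for (int i = 0; i < n; i++) {
    if (i == 0) {
      nPeers = 1; firstIdx = 0;
    } else if (host[i] != host[i-1]) {         // node boundary: back-fill the finished node
      for (int j = firstIdx; j < i; j++) value[j] = nPeers;
      nPeers = 1; firstIdx = i;
    } else {
      nPeers++;
    }
    if (i == n-1) {                            // last iteration: back-fill the final node
      for (int j = firstIdx; j < n; j++) value[j] = nPeers;
    }
  }
  for (int i = 0; i < n; i++)
    printf("peer %d on host %d: %d peers on that node\n", i, host[i], value[i]);
  return 0;
}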
nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers;) { - if (peerIdx == 0 || peersReSorted[peerIdx].cudaDevs != peersReSorted[peerIdx-1].cudaDevs) { - valCounts[nValCounts].value = peersReSorted[peerIdx].cudaDevs; + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; @@ -579,14 +611,15 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { valCounts[nValCounts-1].count++; } // Advance peerIdx to the next node. - peerIdx += peersReSorted[peerIdx].cudaDevs; - } + peerIdx += auxRasPeers[peerIdx].value; + } // for (peerIdx) // valCounts is currently sorted by value (the number of peers per node). Sort it by the count (most frequent // number of peers first). qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); // Print it out, the most frequent peer counts first. if (consistentNGpusNode && consistentNGpusGlobal) { + // consistentNPeersGlobal must be false rasOutAppend(" Nodes Processes GPUs\n" " per node per process\n"); for (int i = 0; i < nValCounts; i++) { @@ -594,7 +627,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend("%7d %9ld %11d\n", vc->count, vc->value, firstNGpusGlobal); } - } else { + } else { // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend(" Nodes Processes\n" " per node\n"); for (int i = 0; i < nValCounts; i++) { @@ -606,24 +639,29 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // We calculate and print the GPUs/process separately. This is required for !consistentNGpusNode and // it also makes our life easier above for !consistentNGpusGlobal (which could require a larger valCounts). - // Sort peers by the GPU count, to simplify data extraction. - memcpy(peersReSorted, rasPeers, nRasPeers * sizeof(*peersReSorted)); + // Sort peers by the GPU count, to simplify data extraction. Not sure how fast __builtin_popcountll is so we + // may just as well cache it... + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { + auxRasPeers[peerIdx].value = __builtin_popcountll(auxRasPeers[peerIdx].peer->cudaDevs); + TRACE(NCCL_RAS, "RAS: node %s pid %d: nGpus %d", + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), + auxRasPeers[peerIdx].peer->pid, auxRasPeers[peerIdx].value); + } // GPU count is the primary key, host IP is the secondary, and process id is the tertiary. - qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), rasPeersNGpuCompare); + qsort(auxRasPeers, nRasPeers, sizeof(*auxRasPeers), rasAuxPeersValueCompare); // Calculate the distribution of different numbers of GPUs per peer. nValCounts = 0; for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) { - if (peerIdx == 0 || __builtin_popcountll(peersReSorted[peerIdx].cudaDevs) != - __builtin_popcountll(peersReSorted[peerIdx-1].cudaDevs)) { - valCounts[nValCounts].value = __builtin_popcountll(peersReSorted[peerIdx].cudaDevs); + if (peerIdx == 0 || auxRasPeers[peerIdx].value != auxRasPeers[peerIdx-1].value) { + valCounts[nValCounts].value = auxRasPeers[peerIdx].value; valCounts[nValCounts].count = 1; valCounts[nValCounts].firstIdx = peerIdx; nValCounts++; } else { valCounts[nValCounts-1].count++; } - } + } // for (peerIdx) // valCounts is currently sorted by value (number of GPUs per peer). Sort it by the count (most frequent // GPU counts first). 
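Both distribution passes above follow the same sequence that replaces the old trick of copying rasPeers and overloading cudaDevs: fill an auxiliary array of {pointer into the original data, scratch value}, qsort it by value (host address and pid as tie-breakers), run-length-count equal values into rasValCount entries, and finally sort those by count so the most frequent value prints first. A self-contained sketch of that sequence; Peer, AuxPeer and ValCount are simplified stand-ins for the RAS structures:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct Peer     { int pid; int nGpus; };                       // stand-in for rasPeerInfo
struct AuxPeer  { const struct Peer* peer; uint64_t value; };  // stand-in for rasAuxPeerInfo
struct ValCount { uint64_t value; int count; int firstIdx; };  // stand-in for rasValCount

// Primary key: value; tie-breaker: pid (the real comparator also uses the host address).
static int auxPeerValueCompare(const void* e1, const void* e2) {
  const struct AuxPeer* p1 = (const struct AuxPeer*)e1;
  const struct AuxPeer* p2 = (const struct AuxPeer*)e2;
  if (p1->value != p2->value) return (p1->value < p2->value) ? -1 : 1;
  return (p1->peer->pid > p2->peer->pid) - (p1->peer->pid < p2->peer->pid);
}

// Most frequent value first.
static int valCountsCompareRev(const void* e1, const void* e2) {
  return ((const struct ValCount*)e2)->count - ((const struct ValCount*)e1)->count;
}

int main() {
  constexpr int nPeers = 5;
  struct Peer peers[nPeers] = { {101, 8}, {102, 4}, {103, 8}, {104, 8}, {105, 4} };

  // Step 1: build the auxiliary array; peers[] itself is never reordered.
  struct AuxPeer aux[nPeers];
  for (int i = 0; i < nPeers; i++) { aux[i].peer = peers + i; aux[i].value = peers[i].nGpus; }
  qsort(aux, nPeers, sizeof(*aux), auxPeerValueCompare);

  // Step 2: run-length-count equal values (valid because aux[] is now sorted by value).
  struct ValCount vc[nPeers];
  int nVc = 0;
  for (int i = 0; i < nPeers; i++) {
    if (i == 0 || aux[i].value != aux[i-1].value) {
      vc[nVc].value = aux[i].value; vc[nVc].count = 1; vc[nVc].firstIdx = i; nVc++;
    } else {
      vc[nVc-1].count++;
    }
  }

  // Step 3: print the most common value first, as in the summary tables.
  qsort(vc, nVc, sizeof(*vc), valCountsCompareRev);
  for (int i = 0; i < nVc; i++)
    printf("%d processes with %llu GPUs each (first at aux index %d)\n",
           vc[i].count, (unsigned long long)vc[i].value, vc[i].firstIdx);
  return 0;
}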
qsort(valCounts, nValCounts, sizeof(*valCounts), rasValCountsCompareRev); @@ -637,7 +675,7 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasOutAppend(" %9d %11ld\n", vc->count, vc->value); } - } + } // !consistentNGpusNode || !consistentNGpusGlobal rasOutAppend("\n" " Nodes Processes GPUs\n" "(total) (total) (total)\n" @@ -652,16 +690,16 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { // provided that they meet our definition of an outlier. if (rasCountIsOutlier(vc->count, client->verbose, totalNodes)) { rasOutAppend("\nThe outlier node%s:\n", (vc->count > 1 ? "s" : "")); - // peersReSorted is sorted by the node IP address (not port!) as the secondary key and the pid as + // auxRasPeers is sorted by the node IP address (not port!) as the secondary key and the pid as // the tertiary, which comes in handy when printing... for (int peerIdx = vc->firstIdx; peerIdx < vc->count*vc->value + vc->firstIdx; peerIdx += vc->value) { lineBuf[0] = '\0'; for (int j = 0; j < vc->value; j++) { snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (j > 0 ? "," : ""), peersReSorted[j].pid); + (j > 0 ? "," : ""), auxRasPeers[j].peer->pid); } rasOutAppend(" Node %s running process%s %s\n", - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), + ncclSocketToHost(&auxRasPeers[peerIdx].peer->addr, rasLine, sizeof(rasLine)), (vc->value > 1 ? "es" : ""), lineBuf); } // for (peerIdx) } // if (rasCountIsOutlier(vc->count)) @@ -678,13 +716,12 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_CONNS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_CONNS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; // We need to wait for async. responses. 
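The request scheduling above is the asynchronous half of the client state machine: rasNetSendCollReq only initiates the collective, the handle is stashed in client->coll, and returning ncclInProgress parks the client until rasClientResume is invoked with the completed collective and re-enters rasClientRun. A rough sketch of that control flow; Query, Client, startQuery, runStep and resume are invented names for illustration, not the RAS API:

#include <stdio.h>

enum Result { Success, InProgress };
enum State  { StateInit, StateComms, StateFinished };

struct Query  { bool done; };                 // stand-in for an in-flight collective
struct Client { State state; Query* pending; };

// Kick off an asynchronous query; it may or may not complete immediately.
static Result startQuery(Query* q, bool completesImmediately) {
  q->done = completesImmediately;
  return q->done ? Success : InProgress;
}

// One step of the client state machine (the rasClientRun pattern).
static Result runStep(Client* c, Query* q) {
  switch (c->state) {
    case StateInit:
      if (startQuery(q, /*completesImmediately=*/false) == InProgress) {
        c->pending = q;          // remember what we are waiting for (client->coll)
        return InProgress;       // park the client; resume() will re-enter later
      }
      c->state = StateComms;
      // fall through to process the immediately available results
    case StateComms:
      printf("processing results\n");
      c->state = StateFinished;
      return Success;
    default:
      return Success;
  }
}

// Completion callback (the rasClientResume pattern): match by handle, then re-enter.
static Result resume(Client* c, Query* q) {
  if (c->pending != q) return Success;  // not the client we are looking for
  c->pending = nullptr;
  c->state = StateComms;
  return runStep(c, q);
}

int main() {
  Client client = { StateInit, nullptr };
  Query query = { false };
  if (runStep(&client, &query) == InProgress) {
    query.done = true;                  // simulate the asynchronous responses arriving
    resume(&client, &query);
  }
  return 0;
}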
} @@ -696,18 +733,18 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_COMMS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; } + TRACE(NCCL_RAS, "RAS: rasClientRunInit: scheduling RAS_COLL_COMMS and finishing"); exit: - free(peersReSorted); + free(auxRasPeers); return ret; fail: goto exit; @@ -721,13 +758,16 @@ static ncclResult_t rasClientRunConns(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollective* coll = client->coll; struct rasCollConns* connsData = (struct rasCollConns*)coll->data; int expected; struct rasPeerInfo* peersBuf = nullptr; - assert(coll->nFwdSent == coll->nFwdRecv); - client->collIdx = -1; + if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) { + INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status); + return ncclInternalError; + } + client->coll = nullptr; rasOutReset(); rasOutAppend(" obtained a result in %.2fs\n", (clockNano()-coll->startTime)/1e9); @@ -822,13 +862,12 @@ static ncclResult_t rasClientRunConns(struct rasClient* client) { rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; { - struct rasCollRequest collReq; + struct rasCollRequest collReq = {}; bool allDone = false; rasCollReqInit(&collReq); collReq.timeout = client->timeout; collReq.type = RAS_COLL_COMMS; - NCCLCHECKGOTO(rasNetSendCollReq(&collReq, rasCollDataLength(RAS_COLL_COMMS), &allDone, &client->collIdx), - ret, fail); + NCCLCHECKGOTO(rasNetSendCollReq(&collReq, &allDone, &client->coll), ret, fail); if (!allDone) ret = ncclInProgress; } @@ -847,10 +886,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { ncclResult_t ret = ncclSuccess; char* msg = nullptr; int msgLen; - struct rasCollective* coll = rasCollectives+client->collIdx; + struct rasCollective* coll = client->coll; struct rasCollComms* commsData = (struct rasCollComms*)coll->data; struct rasCollComms::comm* comm; - struct rasCollComms::comm::rank* ranksReSorted = nullptr; + struct rasAuxCommRank* auxCommRanks = nullptr; struct rasValCount* valCounts = nullptr; int nValCounts; struct rasValCount* collOpCounts = nullptr; @@ -860,7 +899,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { int vcIdx; int nPeersMissing; uint64_t* peerNvmlDevs = nullptr; - const char*const statusStr[] = { "UNKNOWN", "INIT", "RUNNING", "FINALIZE", "ABORT" }; + const char*const statusStr[] = { "NOCOMM", "INIT", "RUNNING", "FINALIZE", "ABORT" }; const char*const errorStr[] = { // Listing them all like this, while a bit of a hassle, is less effort than formatting in a temporary buffer. 
"OK", @@ -873,14 +912,22 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { "INCOMPLETE,ERROR,MISMATCH" }; - assert(coll->nFwdSent == coll->nFwdRecv); - client->collIdx = -1; + TRACE(NCCL_RAS, "RAS: rasClientRunComms: starting"); + TRACE(NCCL_RAS, "RAS: coll nLegTimeouts %d, nPeers %d, nData %d; commsData nComms %d", + coll->nLegTimeouts, coll->nPeers, coll->nData, commsData->nComms); + + if (coll == nullptr || coll->nFwdSent != coll->nFwdRecv) { + INFO(NCCL_RAS, "RAS invalid collective operation status; client status %d -- internal error?", client->status); + return ncclInternalError; + } + client->coll = nullptr; rasOutReset(); rasOutAppend(" (%.2fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9); // Calculate the number of missing peers early as we rely on it for other things. nPeersMissing = nRasPeers - nRasDeadPeers - coll->nPeers; + TRACE(NCCL_RAS, "RAS: nRasPeers %d, nRasDeadPeers %d, nPeersMissing %d", nRasPeers, nRasDeadPeers, nPeersMissing); // Sort the communicators by size. As the structure is inconvenient to move around due to the elements being // of variable length, we create an auxiliary array that includes pointers to individual elements and simply sort @@ -896,12 +943,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComms[commIdx].comm = comm; comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); } - NCCLCHECKGOTO(ncclCalloc(&ranksReSorted, maxCommSize), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxCommRanks, maxCommSize), ret, fail); + TRACE(NCCL_RAS, "RAS: maxCommSize %d", maxCommSize); // For convenience, create a translation table from rasCollective's peerIdx to rasPeers peerIdx. NCCLCHECKGOTO(ncclCalloc(&peerIdxConv, coll->nPeers), ret, fail); - for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) + for (int peerIdx = 0; peerIdx < coll->nPeers; peerIdx++) { peerIdxConv[peerIdx] = rasPeerFind(coll->peers+peerIdx); + TRACE(NCCL_RAS, "RAS: coll peers[%d] -> rasPeers[%d]", peerIdx, peerIdxConv[peerIdx]); + } // Sort coll->peers to match the ordering of rasPeers -- we may need it later... qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare); @@ -910,42 +960,75 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { struct rasAuxComm* auxComm = auxComms+commIdx; int nRanks = 0; comm = auxComm->comm; + TRACE(NCCL_RAS, "RAS: coll comms[%d]: commId (0x%lx, 0x%lx, 0x%lx), commNRanks %d, nRanks %d, nMissingRanks %d", + commIdx, comm->commId.commHash, comm->commId.hostHash, comm->commId.pidHash, + comm->commNRanks, comm->nRanks, comm->nMissingRanks); - if (comm->commNRanks > comm->nRanks) { + if (comm->nMissingRanks > 0) { // There are two possibilities here. Either we are missing the data on some ranks because the processes are // unreachable, or the processes _are_ reachable but didn't report to be part of this communicator (which - // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). Because we - // currently don't collect data about missing ranks, we can't reliably distinguish these two cases. - // For now we rely on an approximation: if we _know_ that some peers failed to respond, we mark this - // as an INCOMPLETE error; otherwise as a MISMATCH warning. - if (nPeersMissing > 0 || nRasDeadPeers > 0) - auxComm->errors |= RAS_ACE_INCOMPLETE; - else { + // could definitely happen if some processes have already called ncclCommDestroy or ncclCommAbort). 
+ if (nPeersMissing == 0 && nRasDeadPeers == 0) { + // We received data from _all_ processes. That's an easy case. auxComm->errors |= RAS_ACE_MISMATCH; - auxComm->status |= RAS_ACS_UNKNOWN; - } + auxComm->status |= RAS_ACS_NOCOMM; + } else { + // We failed to receive data from some processes but we don't know if that's why we don't have the info about + // some ranks of this communicator. We need to check all the missing ranks one-by-one as different ranks may + // have different reason. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + void* found; + if ((found = bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), + ncclSocketsCompare)) != nullptr) { + // We did receive the data from that process, but not about this communicator. + auxComm->errors |= RAS_ACE_MISMATCH; + auxComm->status |= RAS_ACS_NOCOMM; + } else { + // We failed to receive data from that process. + auxComm->errors |= RAS_ACE_INCOMPLETE; + auxComm->nIncompleteRanks++; + } + TRACE(NCCL_RAS, "RAS: comm missingRank[%d] commRank %d, addr %td (-> %d), cudaDev %d, nvmlDev %d", + rankIdx, missingRank->commRank, (found ? ((union ncclSocketAddress*)found) - coll->peers: -1), + rasPeerFind(&missingRank->addr), missingRank->cudaDev, missingRank->nvmlDev); + } // for (rankIdx) + } // nPeersMissing > 0 || nRasDeadPeers > 0 + } // if (comm->nMissingRanks > 0) + + // Initialize auxCommRanks from comm->rank, converting peerIdx to rasPeers, then sort by it -- that way we will + // have the ranks sorted by node and process, which makes counting easy. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + auxCommRanks[rankIdx].rank = rank; + auxCommRanks[rankIdx].value = peerIdxConv[rank->peerIdx]; + TRACE(NCCL_RAS, "RAS: comm rank[%d] commRank %d, peerIdx %d (-> %d), cudaDev %d, nvmlDev %d", + rankIdx, rank->commRank, rank->peerIdx, peerIdxConv[rank->peerIdx], rank->cudaDev, rank->nvmlDev); + TRACE(NCCL_RAS, "RAS: comm rank[%d] collOpCounts (%ld, %ld, %ld, %ld, %ld)", + rankIdx, rank->collOpCounts[0], rank->collOpCounts[1], rank->collOpCounts[2], rank->collOpCounts[3], + rank->collOpCounts[4]); + TRACE(NCCL_RAS, "RAS: comm rank[%d] status initState %d, asyncError %d, finalizeCalled %d, destroyFlag %d, " + "abortFlag %d", rankIdx, rank->status.initState, rank->status.asyncError, rank->status.finalizeCalled, + rank->status.destroyFlag, rank->status.abortFlag); /**/ } - - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); - // Convert ranksReSorted' peerIdx to rasPeers and sort by it -- that way we will have the ranks sorted - // by process _and_ node, which makes counting easy. - for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) - ranksReSorted[rankIdx].peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksPeerCompare); + // This also sorts by the commRank, which we don't care about here, but it won't hurt. + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); // Count the peers and nodes, get the status/error indicators. 
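The missing-rank classification above works because coll->peers has been sorted with ncclSocketsCompare, so a bsearch per missing rank answers whether that peer responded at all: found means the process replied but not for this communicator (NOCOMM, reported as a MISMATCH), not found means no response at all (INCOMPLETE). A small sketch of the same classification with plain integers standing in for ncclSocketAddress and the comparator:

#include <stdio.h>
#include <stdlib.h>

static int intCompare(const void* a, const void* b) {
  int x = *(const int*)a, y = *(const int*)b;
  return (x > y) - (x < y);
}

int main() {
  // Peers that responded to the query, kept sorted so bsearch can be used
  // (coll->peers is sorted the same way with ncclSocketsCompare).
  int responders[] = { 2, 5, 7, 11 };
  const int nResponders = sizeof(responders)/sizeof(responders[0]);
  qsort(responders, nResponders, sizeof(int), intCompare);

  // Ranks for which no communicator data was reported, identified by their peer id.
  int missingRankPeer[] = { 5, 9 };
  const int nMissing = sizeof(missingRankPeer)/sizeof(missingRankPeer[0]);
  for (int i = 0; i < nMissing; i++) {
    int key = missingRankPeer[i];
    void* found = bsearch(&key, responders, nResponders, sizeof(int), intCompare);
    if (found) printf("peer %d responded but is not in this communicator (NOCOMM/MISMATCH)\n", key);
    else       printf("peer %d did not respond at all (INCOMPLETE)\n", key);
  }
  return 0;
}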
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; if (rankIdx == 0) { auxComm->nPeers = auxComm->nNodes = 1; auxComm->ranksPerNodeMin = NCCL_MAX_LOCAL_RANKS; auxComm->ranksPerNodeMax = 0; - auxComm->firstCollOpCount = rank->collOpCount; + memcpy(auxComm->firstCollOpCounts, auxRank->rank->collOpCounts, sizeof(auxComm->firstCollOpCounts)); nRanks = 1; } else { // rankIdx > 0 - if (rank->peerIdx != rank[-1].peerIdx) { + if (auxRank->value != auxRank[-1].value) { auxComm->nPeers++; - if (!ncclSocketsSameNode(&rasPeers[rank->peerIdx].addr, &rasPeers[rank[-1].peerIdx].addr)) { + if (!ncclSocketsSameNode(&rasPeers[auxRank->value].addr, &rasPeers[auxRank[-1].value].addr)) { auxComm->nNodes++; if (auxComm->ranksPerNodeMin > nRanks) auxComm->ranksPerNodeMin = nRanks; @@ -953,7 +1036,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; nRanks = 0; } - } // if (rank->peerIdx != rank[-1].peerIdx) + } // if (auxRank->value != auxRank[-1].value) nRanks++; } // rankIdx > 0 if (rankIdx == comm->nRanks-1) { @@ -964,25 +1047,27 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { auxComm->ranksPerNodeMax = nRanks; } - if (rank->status.abortFlag) + if (auxRank->rank->status.abortFlag) auxComm->status |= RAS_ACS_ABORT; - else if (rank->status.finalizeCalled || rank->status.destroyFlag) { + else if (auxRank->rank->status.finalizeCalled || auxRank->rank->status.destroyFlag) { // destroyFlag is set by ncclCommDestroy and ncclCommAbort. finalizeCalled appears to be set by // ncclCommFinalize only. According to the docs, ncclCommDestroy *can* be called without calling // ncclCommFinalize first. The code structure here ensures that we attribute destroyFlag properly // as a finalize state indicator (and ignore it in case of ncclCommAbort). auxComm->status |= RAS_ACS_FINALIZE; } - else if (rank->status.initState == ncclSuccess) + else if (auxRank->rank->status.initState == ncclSuccess) auxComm->status |= RAS_ACS_RUNNING; - else // rank->initState != ncclSuccess + else // auxRank->rank->initState != ncclSuccess auxComm->status |= RAS_ACS_INIT; - if (rank->collOpCount != auxComm->firstCollOpCount) - auxComm->errors |= RAS_ACE_MISMATCH; - if (rank->status.initState != ncclSuccess && rank->status.initState != ncclInProgress) + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS && !(auxComm->errors & RAS_ACE_MISMATCH); collIdx++) { + if (auxRank->rank->collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) + auxComm->errors |= RAS_ACE_MISMATCH; + } + if (auxRank->rank->status.initState != ncclSuccess && auxRank->rank->status.initState != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; - if (rank->status.asyncError != ncclSuccess && rank->status.asyncError != ncclInProgress) + if (auxRank->rank->status.asyncError != ncclSuccess && auxRank->rank->status.asyncError != ncclInProgress) auxComm->errors |= RAS_ACE_ERROR; } // for (rankIdx) @@ -990,9 +1075,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // We've got a status mismatch between ranks. 
auxComm->errors |= RAS_ACE_MISMATCH; } + TRACE(NCCL_RAS, "RAS: auxComm nPeers %d, nNodes %d, nIncompleteRanks %d", + auxComm->nPeers, auxComm->nNodes, auxComm->nIncompleteRanks); + TRACE(NCCL_RAS, "RAS: auxComm ranksPerNodeMin %d, ranksPerNodeMax %d, status 0x%x, errors 0x%x", + auxComm->ranksPerNodeMin, auxComm->ranksPerNodeMax, auxComm->status, auxComm->errors); } // for (commIdx) // Sort it by size/nNodes/status/errors/missing ranks. - qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); + if (auxComms) + qsort(auxComms, commsData->nComms, sizeof(*auxComms), &rasAuxCommsCompareRev); // Calculate the distribution of different communicator sizes. NCCLCHECKGOTO(ncclCalloc(&valCounts, commsData->nComms), ret, fail); @@ -1014,10 +1104,14 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } } - rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" - " # in group per comm per node per comm in group\n"); - if (commsData->nComms == 0) + TRACE(NCCL_RAS, "RAS: rasClientRunComms: done with initial data processing"); + + if (commsData->nComms > 0) { + rasOutAppend("Group Comms Nodes Ranks Ranks Ranks Status Errors\n" + " # in group per comm per node per comm in group\n"); + } else { rasOutAppend("No communicator data collected!\n"); + } // Allocate an auxiliary structure used for counting the number of ranks (unique GPUs) in a group. NCCLCHECKGOTO(ncclCalloc(&peerNvmlDevs, coll->nPeers), ret, fail); @@ -1058,6 +1152,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { // status (which is a bitmask) into an array index. statusStr[(sizeof(unsigned int)*8-1)-__builtin_clz(auxComm->status)], errorStr[auxComm->errors]); } + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("\nErrors\n" "======\n\n"); @@ -1068,12 +1167,12 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (rasCountIsOutlier(nPeersMissing, client->verbose)) { // Extract a list of missing peers. We don't want to print it right away because it would be sorted // by address (including port, which isn't meaningful to end users). - struct rasPeerInfo* peersBuf = nullptr; + struct rasAuxPeerInfo* auxPeersBuf = nullptr; int nPeersBuf; // Both rasPeers and coll->peers are sorted by address (the latter we sorted above) which makes comparing // them much easier. - NCCLCHECKGOTO(ncclCalloc(&peersBuf, nPeersMissing), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nPeersMissing), ret, fail); nPeersBuf = 0; for (int rasPeerIdx = 0, collPeerIdx = 0; rasPeerIdx < nRasPeers || collPeerIdx < coll->nPeers;) { int cmp; @@ -1088,30 +1187,42 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } else if (cmp < 0) { // Process missing from coll->peers. Don't report dead ones though, as they are not included // in nPeersMissing and are reported separately below. 
- if (!rasPeerIsDead(&rasPeers[rasPeerIdx].addr)) { - assert(nPeersBuf < nPeersMissing); - memcpy(peersBuf+(nPeersBuf++), rasPeers+rasPeerIdx, sizeof(*peersBuf)); + bool dead; + if (!(dead = rasPeerIsDead(&rasPeers[rasPeerIdx].addr))) { + if (nPeersBuf < nPeersMissing) { + auxPeersBuf[nPeersBuf++].peer = rasPeers+rasPeerIdx; + } else { + INFO(NCCL_RAS, "RAS overflow of auxPeersBuf: nPeersBuf %d, rasPeerIdx %d (%s), collPeerIdx %d -- " + "internal error?", + nPeersBuf, rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), collPeerIdx); + } } + TRACE(NCCL_RAS, "RAS rasPeerIdx %d (%s) is missing from coll->peers; dead %d", + rasPeerIdx, ncclSocketToString(&rasPeers[rasPeerIdx].addr, rasLine), dead); rasPeerIdx++; } else { // cmp > 0 // Process not found in rasPeers -- shouldn't happen, unless during a race? + INFO(NCCL_RAS, "RAS failed to find coll->peer[%d] (%s) in rasPeers -- internal error?", + collPeerIdx, ncclSocketToString(coll->peers+collPeerIdx, rasLine)); collPeerIdx++; } // cmp > 0 } // for (rasPeerIdx, collPeerIdx) - // Sort the output by host and pid. - qsort(peersBuf, nPeersBuf, sizeof(*peersBuf), rasPeersHostPidCompare); + // Sort the output by host and pid. rasAuxPeersValueCompare uses value as the primary key, which is 0 for + // all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersBuf[peerIdx].pid, - ncclSocketToHost(&peersBuf[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersBuf[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersBuf[peerIdx].cudaDevs, peersBuf[peerIdx].nvmlDevs, lineBuf, + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } if (nPeersBuf != nPeersMissing) rasOutAppend(" [could not find information on %d process%s]\n", nPeersMissing-nPeersBuf, (nPeersMissing-nPeersBuf > 1 ? "es" : "")); - free(peersBuf); + free(auxPeersBuf); } // if (rasCountIsOutlier(nPeersMissing)) rasOutAppend("\n"); } @@ -1121,31 +1232,35 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { " %d job process%s considered dead (unreachable via the RAS network)\n", nRasDeadPeers, (nRasDeadPeers > 1 ? "es are" : " is")); if (rasCountIsOutlier(nRasDeadPeers, client->verbose)) { - struct rasPeerInfo* peersReSorted = nullptr; - int nPeersReSorted = 0; - NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasDeadPeers), ret, fail); + // rasDeadPeers contains only addresses, whereas we want a complete rasPeerInfo, and sorted differently. + struct rasAuxPeerInfo* auxPeersBuf = nullptr; + int nPeersBuf = 0; + NCCLCHECKGOTO(ncclCalloc(&auxPeersBuf, nRasDeadPeers), ret, fail); for (int i = 0; i < nRasDeadPeers; i++) { int peerIdx = rasPeerFind(rasDeadPeers+i); if (peerIdx != -1) - memcpy(peersReSorted+(nPeersReSorted++), rasPeers+peerIdx, sizeof(*peersReSorted)); + auxPeersBuf[nPeersBuf++].peer = rasPeers+peerIdx; } - // Sort the output by host and pid, not host and port. 
- qsort(peersReSorted, nPeersReSorted, sizeof(*peersReSorted), rasPeersHostPidCompare); - for (int peerIdx = 0; peerIdx < nPeersReSorted; peerIdx++) { - rasOutAppend(" Process %d on node %s managing GPU%s %s\n", peersReSorted[peerIdx].pid, - ncclSocketToHost(&peersReSorted[peerIdx].addr, rasLine, sizeof(rasLine)), - (__builtin_popcountll(peersReSorted[peerIdx].cudaDevs) > 1 ? "s" : ""), - rasGpuDevsToString(peersReSorted[peerIdx].cudaDevs, peersReSorted[peerIdx].nvmlDevs, lineBuf, + // Sort the output by host and pid, not host and port. rasAuxPeersValueCompare uses value as the primary key, + // which is 0 for all auxPeersBuf elements here, so it will do. + qsort(auxPeersBuf, nPeersBuf, sizeof(*auxPeersBuf), rasAuxPeersValueCompare); + for (int peerIdx = 0; peerIdx < nPeersBuf; peerIdx++) { + struct rasAuxPeerInfo* auxPeer = auxPeersBuf+peerIdx; + rasOutAppend(" Process %d on node %s managing GPU%s %s\n", auxPeer->peer->pid, + ncclSocketToHost(&auxPeer->peer->addr, rasLine, sizeof(rasLine)), + (__builtin_popcountll(auxPeer->peer->cudaDevs) > 1 ? "s" : ""), + rasGpuDevsToString(auxPeer->peer->cudaDevs, auxPeer->peer->nvmlDevs, lineBuf, sizeof(lineBuf))); } - if (nPeersReSorted != nRasDeadPeers) + if (nPeersBuf != nRasDeadPeers) rasOutAppend(" [could not find information on %d process%s]\n", - nRasDeadPeers-nPeersReSorted, (nRasDeadPeers-nPeersReSorted > 1 ? "es" : "")); - free(peersReSorted); + nRasDeadPeers-nPeersBuf, (nRasDeadPeers-nPeersBuf > 1 ? "es" : "")); + free(auxPeersBuf); } // if (rasCountIsOutlier(nRasDeadPeers) rasOutAppend("\n"); } + // Continue printing the largest communicators first, as in the summary table. for (vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc; vc = valCounts+vcIdx; @@ -1154,23 +1269,28 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { comm = auxComm->comm; if (auxComm->errors & RAS_ACE_INCOMPLETE) { - int nRanksMissing = comm->commNRanks - comm->nRanks; rasOutAppend("#%d-%d (%016lx) INCOMPLETE\n" " Missing communicator data from %d rank%s\n", vcIdx, commIdx - vc->firstIdx, - comm->commHash, nRanksMissing, (nRanksMissing > 1 ? "s" : "")); - if (rasCountIsOutlier(nRanksMissing, client->verbose)) { - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - rasOutAppend(" The missing rank%s: %s\n", (nRanksMissing > 1 ? "s" : ""), lineBuf); + comm->commId.commHash, auxComm->nIncompleteRanks, (auxComm->nIncompleteRanks > 1 ? "s" : "")); + if (rasCountIsOutlier(auxComm->nIncompleteRanks, client->verbose)) { + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that provided a response but not for this communicator. 
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), ncclSocketsCompare) == + nullptr) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } + } // if rank did not respond + } // for (rankIdx) } // if (rasCountIsOutlier(nRanksMissing)) rasOutAppend("\n"); } // if (auxComm->errors & RAS_ACE_INCOMPLETE) @@ -1178,7 +1298,7 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { if (auxComm->errors & RAS_ACE_ERROR) { int ncclErrors[ncclNumResults]; int nErrors; - rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) ERROR\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); memset(ncclErrors, '\0', sizeof(ncclErrors)); for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) @@ -1203,6 +1323,11 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { } // if (auxComm->errors & RAS_ACE_ERROR) } // for (commIdx) } // for (vcIdx) + msgLen = rasOutLength(); + NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail); + rasOutExtract(msg); + rasClientEnqueueMsg(client, msg, msgLen); + msg = nullptr; rasOutAppend("Warnings\n" "========\n\n"); @@ -1213,15 +1338,15 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { coll->nLegTimeouts, (coll->nLegTimeouts > 1 ? "s" : "")); } + // Continue printing the largest communicators first, as in the summary table. for (int vcIdx = 0; vcIdx < nValCounts; vcIdx++) { struct rasValCount* vc = valCounts+vcIdx; for (int commIdx = vc->firstIdx; commIdx < vc->count + vc->firstIdx; commIdx++) { - bool inconsistent; struct rasAuxComm* auxComm = auxComms+commIdx; comm = auxComm->comm; if (auxComm->errors & RAS_ACE_MISMATCH) { - rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commHash); + rasOutAppend("#%d-%d (%016lx) MISMATCH\n", vcIdx, commIdx - vc->firstIdx, comm->commId.commHash); if (collOpCounts == nullptr) { // Allocating comm->commNRanks elements ensures that we won't need to reallocate, because the valCounts @@ -1234,28 +1359,31 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" Communicator ranks have different status\n"); // We need to sort the ranks by status. However, status is normally calculated from other fields. - // We will copy the ranks and reuse collOpCount to store it. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); + // We will store it in the auxCommRanks' value. 
for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - struct rasCollComms::comm::rank* rank = ranksReSorted+rankIdx; + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; if (rank->status.abortFlag) - rank->collOpCount = RAS_ACS_ABORT; + auxRank->value = RAS_ACS_ABORT; else if (rank->status.finalizeCalled || rank->status.destroyFlag) - rank->collOpCount = RAS_ACS_FINALIZE; + auxRank->value = RAS_ACS_FINALIZE; else if (rank->status.initState == ncclSuccess) - rank->collOpCount = RAS_ACS_RUNNING; + auxRank->value = RAS_ACS_RUNNING; else - rank->collOpCount = RAS_ACS_INIT; + auxRank->value = RAS_ACS_INIT; } - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); // Calculate the frequency of different status values. int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { // __builtin_clz returns the number of leading 0-bits. This makes it possible to translate the - // status (which is a bitmask) into an array index. - collOpCounts[nCollOpCounts].value = (sizeof(unsigned int)*8-1) - __builtin_clz(ranksReSorted[rankIdx].collOpCount); + // status (which is a bitmask) into an array index. The argument is an unsigned int (there is no + // 64-bit version seemingly, but we don't actually need one here). + collOpCounts[nCollOpCounts].value = + (sizeof(unsigned int)*8-1) - __builtin_clz((unsigned int)auxCommRanks[rankIdx].value); collOpCounts[nCollOpCounts].count = 1; collOpCounts[nCollOpCounts].firstIdx = rankIdx; nCollOpCounts++; @@ -1263,11 +1391,10 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { collOpCounts[nCollOpCounts-1].count++; } } - if (comm->nRanks < comm->commNRanks) { - // Add a "fake" element corresponding to the missing entries. The statusStr array contains the "UNKNOWN" - // string at index 0. - collOpCounts[nCollOpCounts].value = 0; - collOpCounts[nCollOpCounts].count = comm->commNRanks - comm->nRanks; + if (comm->nMissingRanks - auxComm->nIncompleteRanks > 0) { + // Add a "fake" element corresponding to the NOCOMM entries, since they are not in the ranks array. + collOpCounts[nCollOpCounts].value = 0; // The index of "NOCOMM" in statusStr. + collOpCounts[nCollOpCounts].count = comm->nMissingRanks - auxComm->nIncompleteRanks; collOpCounts[nCollOpCounts].firstIdx = -1; // "Fake" entry identifier. nCollOpCounts++; } @@ -1280,114 +1407,159 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutAppend(" %d ranks have status %s\n", vcc->count, statusStr[vcc->value]); if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { if (vcc->firstIdx != -1) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
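The (sizeof(unsigned int)*8-1) - __builtin_clz(x) expression used above turns a status bitmask into the index of its highest set bit, which is what allows statusStr to be indexed directly (NOCOMM is bit 0, INIT bit 1, and so on). A quick standalone check of that mapping; the enum mirrors the rasACStatus bitmask values:

#include <stdio.h>

// Mirrors the rasACStatus bitmask values.
enum Status { NOCOMM = 1, INIT = 2, RUNNING = 4, FINALIZE = 8, ABORT = 16 };

static const char* const statusStr[] = { "NOCOMM", "INIT", "RUNNING", "FINALIZE", "ABORT" };

// Highest set bit position: for a single-bit mask this is exactly the array index.
static int statusIndex(unsigned int status) {
  return (int)(sizeof(unsigned int)*8 - 1) - __builtin_clz(status);
}

int main() {
  unsigned int masks[] = { NOCOMM, INIT, RUNNING, FINALIZE, ABORT };
  for (int i = 0; i < 5; i++)
    printf("0x%02x -> index %d -> %s\n", masks[i], statusIndex(masks[i]), statusStr[statusIndex(masks[i])]);
  // For a combined bitmask the highest set bit wins, e.g.:
  printf("combined 0x%02x -> %s\n", (unsigned)(INIT|RUNNING), statusStr[statusIndex(INIT|RUNNING)]);
  return 0;
}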
for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; + int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx]; if (peerIdx != -1) { if (vcc->count > 1) rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); else rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value], - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value], + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); } else { // peerIdx == -1 if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); + rasOutAppend(" Rank %d -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank); else rasOutAppend(" Rank %d has status %s -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, statusStr[vcc->value]); + auxCommRanks[rankIdx].rank->commRank, statusStr[vcc->value]); } // peerIdx == -1 } // for (rankIdx) } else { - // UNKNOWN ranks. Format a string with their rank numbers (we don't know anything more). - lineBuf[0] = '\0'; - // rankIdx indexes the comm->ranks array; in principle it should be the same as commRank, with the - // exception of the missing ranks... - for (int commRank = 0, rankIdx = 0; commRank < comm->commNRanks; commRank++) { - if (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == commRank) { - rankIdx++; - } else { - snprintf(lineBuf+strlen(lineBuf), sizeof(lineBuf)-strlen(lineBuf), "%s%d", - (rankIdx == commRank ? "" : ","), commRank); - } - } // for (commRank) - if (vcc->count > 1) { - rasOutAppend(" The unknown ranks: %s\n", lineBuf); - } else { - rasOutAppend(" Rank %s has status %s\n", lineBuf, statusStr[vcc->value]); - } - } + // NOCOMM ranks are in a different array. + struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks + + comm->nRanks); + for (int rankIdx = 0; rankIdx < comm->nMissingRanks; rankIdx++) { + struct rasCollCommsMissingRank* missingRank = missingRanks + rankIdx; + // Filter out ranks that did not respond at all. 
+ if (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers), + ncclSocketsCompare)) { + int peerIdx = rasPeerFind(&missingRank->addr); + if (peerIdx != -1) { + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d has status %s -- GPU %s managed by process %d on node %s\n", + missingRank->commRank, statusStr[vcc->value], + rasGpuToString(missingRank->cudaDev, missingRank->nvmlDev, + lineBuf, sizeof(lineBuf)), rasPeers[peerIdx].pid, + ncclSocketToHost(&missingRank->addr, rasLine, sizeof(rasLine))); + } + } else { // peerIdx == -1 + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- [process information not found]\n", missingRank->commRank); + } else { + rasOutAppend(" Rank %d has status %s -- [process information not found]\n", + missingRank->commRank, statusStr[vcc->value]); + } + } // peerIdx == -1 + } // if rank responded + } // for (rankIdx) + } // vcc->firstIdx == -1 } // if (rasCountIsOutlier(vcc->count)) } // for (coc) } // if (__builtin_popcount(auxComm->status) > 1) - inconsistent = false; - for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (comm->ranks[rankIdx].collOpCount != auxComm->firstCollOpCount) { - inconsistent = true; - break; - } - } - if (inconsistent) { - rasOutAppend(" Communicator ranks have different collective operation counts\n"); + for (int collIdx = 0; collIdx < NCCL_NUM_FUNCTIONS; collIdx++) { + bool inconsistent = false; - // Sort the ranks by collOpCount and rank for easy counting. - memcpy(ranksReSorted, comm->ranks, comm->nRanks * sizeof(*ranksReSorted)); - qsort(ranksReSorted, comm->nRanks, sizeof(*ranksReSorted), rasCommRanksCollOpCompare); - // Calculate the frequency of different collOpCount values. - int nCollOpCounts = 0; for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { - if (rankIdx == 0 || ranksReSorted[rankIdx].collOpCount != ranksReSorted[rankIdx-1].collOpCount) { - collOpCounts[nCollOpCounts].value = ranksReSorted[rankIdx].collOpCount; - collOpCounts[nCollOpCounts].count = 1; - collOpCounts[nCollOpCounts].firstIdx = rankIdx; - nCollOpCounts++; - } else { - collOpCounts[nCollOpCounts-1].count++; + if (comm->ranks[rankIdx].collOpCounts[collIdx] != auxComm->firstCollOpCounts[collIdx]) { + inconsistent = true; + break; } } - // Sort by that frequency (most frequent first). - qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); - for (int coc = 0; coc < nCollOpCounts; coc++) { - struct rasValCount* vcc = collOpCounts+coc; - if (vcc->count > 1) - rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); - if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { - // ranksReSorted is sorted by rank as the secondary key, which comes in handy when printing... 
- for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { - int peerIdx = peerIdxConv[ranksReSorted[rankIdx].peerIdx]; - if (peerIdx != -1) { - if (vcc->count > 1) - rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- GPU %s managed by process %d on node %s\n", - ranksReSorted[rankIdx].commRank, vcc->value, - rasCommRankGpuToString(ranksReSorted+rankIdx, lineBuf, sizeof(lineBuf)), - rasPeers[peerIdx].pid, - ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); - } else { // peerIdx == -1 - if (vcc->count > 1) - rasOutAppend(" Rank %d -- [process information not found]\n", ranksReSorted[rankIdx].commRank); - else - rasOutAppend(" Rank %d has launched up to operation %ld -- [process information not found]\n", - ranksReSorted[rankIdx].commRank, vcc->value); - } // peerIdx == -1 - } // for (rankIdx) - } // if (rasCountIsOutlier(vcc->count)) - } // for (coc) - } // if (inconsistent) - rasOutAppend("\n"); + if (inconsistent) { + rasOutAppend(" Communicator ranks have different %s operation counts\n", ncclFuncStr[collIdx]); + + // Sort the ranks by collOpCounts[collIdx] and commRank for easy counting. + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + struct rasCollComms::comm::rank* rank = comm->ranks+rankIdx; + struct rasAuxCommRank* auxRank = auxCommRanks+rankIdx; + auxRank->rank = rank; + auxRank->value = rank->collOpCounts[collIdx]; + } + qsort(auxCommRanks, comm->nRanks, sizeof(*auxCommRanks), rasAuxCommRanksValueCompare); + // Calculate the frequency of different collOpCounts[collIdx] values. + int nCollOpCounts = 0; + for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) { + if (rankIdx == 0 || auxCommRanks[rankIdx].value != auxCommRanks[rankIdx-1].value) { + collOpCounts[nCollOpCounts].value = auxCommRanks[rankIdx].value; + collOpCounts[nCollOpCounts].count = 1; + collOpCounts[nCollOpCounts].firstIdx = rankIdx; + nCollOpCounts++; + } else { + collOpCounts[nCollOpCounts-1].count++; + } + } + // Sort by that frequency (most frequent first). + qsort(collOpCounts, nCollOpCounts, sizeof(*collOpCounts), rasValCountsCompareRev); + + for (int coc = 0; coc < nCollOpCounts; coc++) { + struct rasValCount* vcc = collOpCounts+coc; + if (vcc->count > 1) { + if (vcc->value > 0) + rasOutAppend(" %d ranks have launched up to operation %ld\n", vcc->count, vcc->value); + else + rasOutAppend(" %d ranks have not launched any operations\n", vcc->count); + } + if (rasCountIsOutlier(vcc->count, client->verbose, comm->commNRanks)) { + // auxCommRanks is sorted by commRank as the secondary key, which comes in handy when printing... 
+ for (int rankIdx = vcc->firstIdx; rankIdx < vcc->count+vcc->firstIdx; rankIdx++) { + int peerIdx = peerIdxConv[auxCommRanks[rankIdx].rank->peerIdx]; + if (peerIdx != -1) { + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- GPU %s managed by process %d on node %s\n", + auxCommRanks[rankIdx].rank->commRank, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { + if (vcc->value > 0) { + rasOutAppend(" Rank %d has launched up to operation %ld -- GPU %s managed by process %d " + "on node %s\n", auxCommRanks[rankIdx].rank->commRank, vcc->value, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } else { + rasOutAppend(" Rank %d has not launched any operations -- GPU %s managed by process %d " + "on node %s\n", auxCommRanks[rankIdx].rank->commRank, + rasCommRankGpuToString(auxCommRanks[rankIdx].rank, lineBuf, sizeof(lineBuf)), + rasPeers[peerIdx].pid, + ncclSocketToHost(&rasPeers[peerIdx].addr, rasLine, sizeof(rasLine))); + } + } + } else { // peerIdx == -1 + if (vcc->count > 1) { + rasOutAppend(" Rank %d -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank); + } else { + if (vcc->value > 0) + rasOutAppend(" Rank %d has launched up to operation %ld -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank, vcc->value); + else + rasOutAppend(" Rank %d has not launched any operations -- [process information not found]\n", + auxCommRanks[rankIdx].rank->commRank); + } + } // peerIdx == -1 + } // for (rankIdx) + } // if (rasCountIsOutlier(vcc->count)) + } // for (coc) + rasOutAppend("\n"); + } // if (inconsistent) + } // for (collIdx) } // if (auxComm->errors & RAS_ACE_MISMATCH) } // for (commIdx) } // for (vcIdx) @@ -1398,20 +1570,26 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) { rasOutExtract(msg); rasClientEnqueueMsg(client, msg, msgLen); msg = nullptr; + + TRACE(NCCL_RAS, "RAS: rasClientRunComms: finishing"); exit: free(peerNvmlDevs); free(collOpCounts); free(valCounts); free(peerIdxConv); - free(ranksReSorted); + free(auxCommRanks); free(auxComms); return ret; fail: goto exit; } +// Generates detailed info about encountered errors, be it initialization ones or asynchronous ones. static void rasClientBreakDownErrors(struct rasClient* client, struct rasCollComms::comm* comm, const int* peerIdxConv, int ncclErrors[ncclNumResults], bool isAsync) { + // Because the number of possible error kinds is finite and small, we don't bother in this case with allocating + // temporary data structures, counting the errors, sorting arrays, etc. Instead, in each iteration we pick the most + // numerous error kind, we iterate through the ranks in search for this error, and immediately add it to the output. for (;;) { int maxCount = 0; ncclResult_t maxCountIdx = ncclSuccess; @@ -1489,17 +1667,20 @@ static void rasOutAppend(const char* format, ...) { } nRasOutBuffer += needed; - assert(nRasOutBuffer <= rasOutBufferSize); + if (nRasOutBuffer >= rasOutBufferSize) + nRasOutBuffer = rasOutBufferSize - 1; // Should never happen, but just to be extra sure... exit: ; } // Copies the output data from an internal buffer to a user-supplied one, including the terminating '\0'. 
// The user buffer must already be allocated and be at least rasOutLength() bytes long (which includes -// the terminating '\0'). +// the terminating '\0'). Resets the output buffer when done. static void rasOutExtract(char* buffer) { - if (rasOutBuffer) + if (rasOutBuffer) { memcpy(buffer, rasOutBuffer, rasOutLength()); + rasOutReset(); + } } // Returns the current length of the used portion of the output buffer, *not* including the terminating '\0'. @@ -1524,60 +1705,25 @@ static void rasOutReset() { // Various sorting callbacks used when grouping/formatting data. // /////////////////////////////////////////////////////////////////// -// Sorting callback for rasPeerInfo elements. Sorts by the number of bits set in cudaDevs. Uses the host IP as the -// secondary key and the process id as the tertiary key. -static int rasPeersNGpuCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - int c1 = __builtin_popcountll(p1->cudaDevs); - int c2 = __builtin_popcountll(p2->cudaDevs); - - if (c1 == c2) { - // Host IP address is the secondary key. - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); - if (cmp == 0) { - // Process ID is the tertiary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); - } - return cmp; - } else { - return (c1 < c2 ? -1 : 1); - } -} - -// Sorting callback for rasPeerInfo elements. Sorts by the number of peers per node, which we store in cudaDevs. -// Uses the host IP as the secondary key and the process id as the tertiary key. -static int rasPeersNProcsCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; +// Sorting callback for rasAuxPeerInfo elements. Sorts by value, with the peers host IP as the secondary key and +// the process id as the tertiary key. +static int rasAuxPeersValueCompare(const void* e1, const void* e2) { + const struct rasAuxPeerInfo* p1 = (const struct rasAuxPeerInfo*)e1; + const struct rasAuxPeerInfo* p2 = (const struct rasAuxPeerInfo*)e2; - if (p1->cudaDevs == p2->cudaDevs) { + if (p1->value == p2->value) { // Host IP address is the secondary key. - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); + int cmp = ncclSocketsHostCompare(&p1->peer->addr, &p2->peer->addr); if (cmp == 0) { // Process ID is the tertiary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); + cmp = (p1->peer->pid < p2->peer->pid ? -1 : (p1->peer->pid > p2->peer->pid ? 1 : 0)); } return cmp; } else { - return (p1->cudaDevs < p2->cudaDevs ? -1 : 1); + return (p1->value < p2->value ? -1 : 1); } } -// Sorting callback for rasPeerInfo elements. Sorts by the host IP and the process id as the secondary key (rather -// than the port). -static int rasPeersHostPidCompare(const void* e1, const void* e2) { - const struct rasPeerInfo* p1 = (const struct rasPeerInfo*)e1; - const struct rasPeerInfo* p2 = (const struct rasPeerInfo*)e2; - - int cmp = ncclSocketsHostCompare(&p1->addr, &p2->addr); - if (cmp == 0) { - // Process ID is the secondary key. - cmp = (p1->pid < p2->pid ? -1 : (p1->pid > p2->pid ? 1 : 0)); - } - return cmp; -} - // Sorting callback for ncclSocketAddress. Unlike the ncclSocketsCompare, it ignores the port. 
static int ncclSocketsHostCompare(const void* p1, const void* p2) { const union ncclSocketAddress* a1 = (const union ncclSocketAddress*)p1; @@ -1599,7 +1745,8 @@ static int ncclSocketsHostCompare(const void* p1, const void* p2) { cmp = memcmp(&a1->sin6.sin6_addr, &a2->sin6.sin6_addr, sizeof(a1->sin6.sin6_addr)); } else { // The only remaining valid case are empty addresses. - assert(family == 0); + if (family != 0) + INFO(NCCL_RAS, "RAS invalid address family %d -- internal error?", family); cmp = 0; // Two empty addresses are equal... } @@ -1657,24 +1804,16 @@ static int rasAuxCommsCompareRev(const void* p1, const void* p2) { } } -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the peerIdx. -static int rasCommRanksPeerCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; - - return (r1->peerIdx < r2->peerIdx ? -1 : (r1->peerIdx > r2->peerIdx ? 1 : 0)); -} +// Sorting callback for rasAuxCommRank elements. Sorts by value, with rank's commRank as the secondary key. +static int rasAuxCommRanksValueCompare(const void* p1, const void* p2) { + const struct rasAuxCommRank* r1 = (const struct rasAuxCommRank*)p1; + const struct rasAuxCommRank* r2 = (const struct rasAuxCommRank*)p2; -// Sorting callback for rasCollComms::comm::rank elements. Sorts by the collOpCount, with rank as the secondary key. -static int rasCommRanksCollOpCompare(const void* p1, const void* p2) { - const struct rasCollComms::comm::rank* r1 = (const struct rasCollComms::comm::rank*)p1; - const struct rasCollComms::comm::rank* r2 = (const struct rasCollComms::comm::rank*)p2; - - if (r1->collOpCount == r2->collOpCount) { - // Use the rank as the secondary key. - return (r1->commRank < r2->commRank ? -1 : (r1->commRank > r2->commRank ? 1 : 0)); + if (r1->value == r2->value) { + // Use the commRank as the secondary key. + return (r1->rank->commRank < r2->rank->commRank ? -1 : (r1->rank->commRank > r2->rank->commRank ? 1 : 0)); } else { - return (r1->collOpCount < r2->collOpCount ? -1 : 1); + return (r1->value < r2->value ? -1 : 1); } } @@ -1705,16 +1844,22 @@ const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, return buf; } -// Formats a GPU string based on the rasCollComms's rank. If the CUDA id is different from the NVML id, both are +// Formats a GPU string based on the CUDA/NVML ids provided. If the CUDA id is different from the NVML id, both are // printed. -static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) { - snprintf(buf, size, "%d", rank->cudaDev); - if (rank->cudaDev != rank->nvmlDev) { - snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", rank->nvmlDev); +static const char* rasGpuToString(int cudaDev, int nvmlDev, char* buf, size_t size) { + snprintf(buf, size, "%d", cudaDev); + if (cudaDev != nvmlDev) { + snprintf(buf+strlen(buf), size-strlen(buf), " (NVML %d)", nvmlDev); } return buf; } +// Formats a GPU string based on the rasCollComms's rank. If the CUDA id is different from the NVML id, both are +// printed. +static const char* rasCommRankGpuToString(const struct rasCollComms::comm::rank* rank, char* buf, size_t size) { + return rasGpuToString(rank->cudaDev, rank->nvmlDev, buf, size); +} + // Converts a NCCL error result to a string. 
static const char* ncclErrorToString(ncclResult_t err) { switch (err) { @@ -1753,3 +1898,21 @@ static bool rasCountIsOutlier(int count, bool verbose, int totalCount) { (totalCount == -1 || count <= totalCount * RAS_CLIENT_OUTLIER_FRACTION); } } + +// Invoked during RAS termination to release all the allocated resources. +void rasClientSupportTerminate() { + (void)close(rasClientListeningSocket); + rasClientListeningSocket = -1; + + free(rasOutBuffer); + rasOutBuffer = nullptr; + nRasOutBuffer = rasOutBufferSize = 0; + + for (struct rasClient* client = rasClientsHead; client;) { + struct rasClient* clientNext = client->next; + rasClientTerminate(client); + client = clientNext; + } + + // rasClientsHead and rasClientsTail are taken care of by rasClientTerminate(). +} diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc index 201144f1a..72833604f 100644 --- a/src/ras/collectives.cc +++ b/src/ras/collectives.cc @@ -4,7 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out duriyng development only! +#define NDEBUG // Comment out during development only! #include #include @@ -12,6 +12,7 @@ #include "checks.h" #include "comm.h" #include "nccl.h" +#include "transport.h" #include "utils.h" #include "ras_internal.h" @@ -32,14 +33,14 @@ static int nRasCollHistory, rasCollHistNextIdx; // Monotonically increased to ensure that each collective originating locally has a unique Id. static uint64_t rasCollLastId; -// Array keeping track of ongoing collective operations (apart from broadcasts, which have no response so require +// Keeping track of ongoing collective operations (apart from broadcasts, which have no response so require // no such tracking). -struct rasCollective* rasCollectives; -static int nRasCollectives; +struct rasCollective* rasCollectivesHead; +struct rasCollective* rasCollectivesTail; static ncclResult_t getNewCollEntry(struct rasCollective** pColl); static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll, - const struct rasCollRequest* req, size_t reqLen, int fromConnIdx); + const struct rasCollRequest* req, size_t reqLen, struct rasConnection* fromConn); static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct rasCollRequest* req, size_t reqLen); static ncclResult_t rasCollReadyResp(struct rasCollective* coll); static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, @@ -47,12 +48,17 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, const union ncclSocketAddress* peers, int nPeers, const char* data, int nData, int nLegTimeouts); -static ncclResult_t rasCollConnsInit(char** pData, int* pNData); +static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData); static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* msg); -static ncclResult_t rasCollCommsInit(char** pData, int* pNData); +static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData); static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg); +static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm); static int ncclCommsCompare(const void* p1, const void* p2); +static int peersHashesCompare(const void* p1, const void* p2); +static int peersHashesSearch(const void* k, const void* e); +static int rasCommIdCompare(const void* p1, const void* p2); 
+static int rasCollCommsMissingRankSearch(const void* k, const void* e); /////////////////////////////////////////////////////////////////////////////////////// @@ -62,22 +68,26 @@ static int ncclCommsCompare(const void* p1, const void* p2); // Returns the index of the first available entry in the rasCollectives array, enlarging the array if necessary. static ncclResult_t getNewCollEntry(struct rasCollective** pColl) { struct rasCollective* coll; - int i; - for (i = 0; i < nRasCollectives; i++) - if (rasCollectives[i].type == RAS_MSG_NONE) - break; - if (i == nRasCollectives) { - NCCLCHECK(ncclRealloc(&rasCollectives, nRasCollectives, nRasCollectives+RAS_INCREMENT)); - nRasCollectives += RAS_INCREMENT; - } + int nRasConns; + + NCCLCHECK(ncclCalloc(&coll, 1)); - coll = rasCollectives+i; - memset(coll, '\0', sizeof(*coll)); coll->startTime = clockNano(); - coll->fromConnIdx = -1; + coll->fromConn = nullptr; // We are unlikely to use the whole array, but at least we won't need to realloc. + nRasConns = 0; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) + nRasConns++; NCCLCHECK(ncclCalloc(&coll->fwdConns, nRasConns)); + if (rasCollectivesHead) { + rasCollectivesTail->next = coll; + coll->prev = rasCollectivesTail; + rasCollectivesTail = coll; + } else { + rasCollectivesHead = rasCollectivesTail = coll; + } + *pColl = coll; return ncclSuccess; } @@ -95,21 +105,23 @@ void rasCollReqInit(struct rasCollRequest* req) { // in preparation for collective response messages. // pAllDone indicates on return if the collective operation is already finished, which is unusual, but possible // in scenarios such as a total of two peers. -// pCollIdx provides on return an index of the allocated rasCollective structure to track this collective (unless +// pColl provides on return a pointer to the allocated rasCollective structure to track this collective (unless // it's a broadcast, which require no such tracking). -ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone, int* pCollIdx, - int fromConnIdx) { +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone, + struct rasCollective** pColl, struct rasConnection* fromConn) { struct rasCollective* coll = nullptr; + struct rasCollRequest* reqMod = (struct rasCollRequest*)req; + size_t reqLen = 0; if (req->type >= RAS_COLL_CONNS) { // Keep track of this collective operation so that we can handle the responses appropriately. NCCLCHECK(getNewCollEntry(&coll)); - if (pCollIdx) - *pCollIdx = coll-rasCollectives; + if (pColl) + *pColl = coll; memcpy(&coll->rootAddr, &req->rootAddr, sizeof(coll->rootAddr)); coll->rootId = req->rootId; coll->type = req->type; coll->timeout = req->timeout; - coll->fromConnIdx = fromConnIdx; + coll->fromConn = fromConn; if (ncclCalloc(&coll->peers, 1) == ncclSuccess) { memcpy(coll->peers, &rasNetListeningSocket.addr, sizeof(*coll->peers)); coll->nPeers = 1; @@ -117,9 +129,9 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, // Collective-specific initialization of accumulated data (using local data for now). if (req->type == RAS_COLL_CONNS) - (void)rasCollConnsInit(&coll->data, &coll->nData); + (void)rasCollConnsInit(&reqMod, &reqLen, &coll->data, &coll->nData); else if (req->type == RAS_COLL_COMMS) - (void)rasCollCommsInit(&coll->data, &coll->nData); + (void)rasCollCommsInit(&reqMod, &reqLen, &coll->data, &coll->nData); } else { // req->type < RAS_COLL_CONNS // Add the info to the collective message history. 
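+    // (The history is a small fixed-size buffer capped at COLL_HISTORY_SIZE entries; it lets us recognize a
+    // request that reaches us again over another connection so that it isn't processed a second time.)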
     nRasCollHistory = std::min(nRasCollHistory+1, COLL_HISTORY_SIZE);
@@ -131,42 +143,42 @@ ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen,
     // Collective-specific message handling.
     if (req->type == RAS_BC_DEADPEER) {
       bool done = false;
-      rasMsgHandleBCDeadPeer(req, &done);
+      rasMsgHandleBCDeadPeer(&reqMod, &reqLen, &done);
       if (done)
         goto exit;
     }
   } // req->type < RAS_COLL_CONNS
 
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
-    rasConns[connIdx].linkFlag = false;
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next)
+    conn->linkFlag = false;
 
-  (void)rasLinkSendCollReq(&rasNextLink, coll, req, reqLen, fromConnIdx);
-  (void)rasLinkSendCollReq(&rasPrevLink, coll, req, reqLen, fromConnIdx);
+  (void)rasLinkSendCollReq(&rasNextLink, coll, reqMod, reqLen, fromConn);
+  (void)rasLinkSendCollReq(&rasPrevLink, coll, reqMod, reqLen, fromConn);
 
   if (coll && pAllDone)
     *pAllDone = (coll->nFwdSent == coll->nFwdRecv);
 
 exit:
+  if (reqMod != req)
+    free(reqMod);
   return ncclSuccess;
 }
 
 // Sends the collective message through all connections associated with this link (with the exception of the one
 // the message came from, if any).
 static ncclResult_t rasLinkSendCollReq(struct rasLink* link, struct rasCollective* coll,
-                                       const struct rasCollRequest* req, size_t reqLen, int fromConnIdx) {
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-    if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) {
-      struct rasConnection* conn = rasConns+linkConn->connIdx;
-      if (!conn->linkFlag) {
-        // We send collective messages through fully established and operational connections only.
-        if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) {
-          if (rasConnSendCollReq(conn, req, reqLen) == ncclSuccess && coll != nullptr)
-            coll->fwdConns[coll->nFwdSent++] = linkConn->connIdx;
-        } // if (conn->sockIdx != -1 && RAS_SOCK_READY)
-        conn->linkFlag = true;
-      } // if (!conn->linkFlag)
-    } // if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx)
-  } // for (i)
+                                       const struct rasCollRequest* req, size_t reqLen,
+                                       struct rasConnection* fromConn) {
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) {
+    if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag) {
+      // We send collective messages through fully established and operational connections only.
+      if (linkConn->conn->sock && linkConn->conn->sock->status == RAS_SOCK_READY &&
+          !linkConn->conn->experiencingDelays) {
+        if (rasConnSendCollReq(linkConn->conn, req, reqLen) == ncclSuccess && coll != nullptr)
+          coll->fwdConns[coll->nFwdSent++] = linkConn->conn;
+      } // linkConn->conn is fully established and operational.
+      linkConn->conn->linkFlag = true;
+    } // if (linkConn->conn && linkConn->conn != fromConn && !linkConn->conn->linkFlag)
+  } // for (linkConn)
 
   return ncclSuccess;
 }
 
@@ -190,8 +202,8 @@ static ncclResult_t rasConnSendCollReq(struct rasConnection* conn, const struct
 // in which case it can immediately send the response.
 ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) {
   bool allDone = false;
-  int collIdx = -1;
-  assert(sock->connIdx != -1);
+  struct rasCollective* coll = nullptr;
+  assert(sock->conn);
 
   // First check if we've already handled this request (through another connection).
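+  // (Duplicates are expected: every peer re-broadcasts a request over both of its RAS links, minus the link it
+  // came from, so the same request can reach us along more than one path.)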
for (int i = 0; i < nRasCollHistory; i++) { @@ -202,7 +214,7 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { if (msg->collReq.type >= RAS_COLL_CONNS) { // Send an empty response so that the sender can account for it. The non-empty response has already been // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); } goto exit; @@ -211,31 +223,29 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { if (msg->collReq.type >= RAS_COLL_CONNS) { // Check if we're currently handling this collective request. - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collReq.rootAddr, &coll->rootAddr, sizeof(msg->collReq.rootAddr)) == 0 && msg->collReq.rootId == coll->rootId) { assert(msg->collReq.type == coll->type); // Send an empty response so that the sender can account for it. The non-empty response will be // sent through the connection that we received the request through first. - NCCLCHECK(rasConnSendCollResp(rasConns+sock->connIdx, &msg->collReq.rootAddr, msg->collReq.rootId, + NCCLCHECK(rasConnSendCollResp(sock->conn, &msg->collReq.rootAddr, msg->collReq.rootId, /*peers*/nullptr, /*nPeers*/0, /*data*/nullptr, /*nData*/0, /*nLegTimeouts*/0)); goto exit; } // if match - } // for (i) + } // for (coll) } // if (msg->collReq.type >= RAS_COLL_CONNS) // Re-broadcast the message to my peers (minus the one it came from) and handle it locally. - NCCLCHECK(rasNetSendCollReq(&msg->collReq, rasCollDataLength(msg->collReq.type), &allDone, &collIdx, sock->connIdx)); + NCCLCHECK(rasNetSendCollReq(&msg->collReq, &allDone, &coll, sock->conn)); if (msg->collReq.type >= RAS_COLL_CONNS && allDone) { - assert(collIdx != -1); + assert(coll); // We are a leaf process -- send the response right away. This can probably trigger only for the case of a total // of two peers, and hence just one RAS connection, or during communication issues, because normally every peer // has more than one connection so there should always be _some_ other peer to forward the request to. - NCCLCHECK(rasCollReadyResp(rasCollectives+collIdx)); + NCCLCHECK(rasCollReadyResp(coll)); } exit: return ncclSuccess; @@ -245,9 +255,9 @@ ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock) { // Invoked when we are finished waiting for the collective responses from other peers (i.e., either there weren't // any peers (unlikely), the peers sent their responses (likely), or we timed out. static ncclResult_t rasCollReadyResp(struct rasCollective* coll) { - if (coll->fromConnIdx != -1) { + if (coll->fromConn) { // For remotely-initiated collectives, send the response back. - NCCLCHECK(rasConnSendCollResp(rasConns+coll->fromConnIdx, &coll->rootAddr, coll->rootId, + NCCLCHECK(rasConnSendCollResp(coll->fromConn, &coll->rootAddr, coll->rootId, coll->peers, coll->nPeers, coll->data, coll->nData, coll->nLegTimeouts)); // Add the identifying info to the collective message history. 
@@ -302,18 +312,15 @@ static ncclResult_t rasConnSendCollResp(struct rasConnection* conn, // the data from the response into the accumulated data. If all the responses have been accounted for, sends the // accumulated response back. ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { - int collIdx; - struct rasCollective* coll = nullptr; + struct rasCollective* coll; char line[SOCKET_NAME_MAXLEN+1]; - for (collIdx = 0; collIdx < nRasCollectives; collIdx++) { - coll = rasCollectives+collIdx; - if (coll->type != RAS_MSG_NONE && - memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && + for (coll = rasCollectivesHead; coll; coll = coll->next) { + if (memcmp(&msg->collResp.rootAddr, &coll->rootAddr, sizeof(msg->collResp.rootAddr)) == 0 && msg->collResp.rootId == coll->rootId) break; } - if (collIdx == nRasCollectives) { + if (coll == nullptr) { INFO(NCCL_RAS, "RAS failed to find a matching ongoing collective for response %s:%ld from %s!", ncclSocketToString(&msg->collResp.rootAddr, line), msg->collResp.rootId, ncclSocketToString(&sock->sock.addr, rasLine)); @@ -321,11 +328,11 @@ ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { } coll->nLegTimeouts += msg->collResp.nLegTimeouts; - assert(sock->connIdx != -1); - // Account for the received response in our collective operation tracking. + assert(sock->conn); + // Account for the received response in our collective operations tracking. for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] == sock->connIdx) { - coll->fwdConns[i] = -1; + if (coll->fwdConns[i] == sock->conn) { + coll->fwdConns[i] = nullptr; break; } } @@ -353,46 +360,53 @@ ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock) { // Removes a connection from all ongoing collectives. Called when a connection is experiencing a delay or is being // terminated. 
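+// For the connection that the collective was received from, the whole collective is dropped; for a connection that
+// we forwarded the request to, we merely stop waiting for its response and count it as a leg timeout.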
-void rasCollsPurgeConn(int connIdx) { - for (int i = 0; i < nRasCollectives; i++) { - struct rasCollective* coll = rasCollectives+i; - if (coll->type != RAS_MSG_NONE) { - char line[SOCKET_NAME_MAXLEN+1]; - if (coll->fromConnIdx == connIdx) { - INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", - ncclSocketToString(&coll->rootAddr, line), coll->rootId, - ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - rasCollFree(coll); - } else { - for (int j = 0; j < coll->nFwdSent; j++) { - if (coll->fwdConns[j] == connIdx) { - coll->fwdConns[j] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&rasConns[connIdx].addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - if (coll->nFwdSent == coll->nFwdRecv) - (void)rasCollReadyResp(coll); - break; - } - } // for (j) - } // coll->fromConnIdx != connIdx - } // !RAS_MSG_NONE - } // for (i) +void rasCollsPurgeConn(struct rasConnection* conn) { + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + char line[SOCKET_NAME_MAXLEN+1]; + if (coll->fromConn == conn) { + INFO(NCCL_RAS, "RAS purging collective %s:%ld because it comes from %s", + ncclSocketToString(&coll->rootAddr, line), coll->rootId, + ncclSocketToString(&conn->addr, rasLine)); + rasCollFree(coll); + } else { + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i] == conn) { + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), coll->rootId, + coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + if (coll->nFwdSent == coll->nFwdRecv) + (void)rasCollReadyResp(coll); + break; + } + } // for (i) + } // coll->fromConn != conn + coll = collNext; + } // for (coll) } // Frees a rasCollective entry and any memory associated with it. void rasCollFree(struct rasCollective* coll) { + if (coll == nullptr) + return; + free(coll->fwdConns); - coll->fwdConns = nullptr; free(coll->peers); - coll->peers = nullptr; free(coll->data); - coll->data = nullptr; - coll->fromConnIdx = -1; - coll->type = RAS_MSG_NONE; + + if (coll == rasCollectivesHead) + rasCollectivesHead = rasCollectivesHead->next; + if (coll == rasCollectivesTail) + rasCollectivesTail = rasCollectivesTail->prev; + if (coll->prev) + coll->prev->next = coll->next; + if (coll->next) + coll->next->prev = coll->prev; + free(coll); } // Invoked from the main RAS thread loop to handle timeouts of the collectives. @@ -407,64 +421,64 @@ void rasCollFree(struct rasCollective* coll) { // and send back whatever we have. Unfortunately, the peer that the RAS client is connected to will in all likelihood // time out first, so at that point any delayed responses that eventually arrive are likely to be too late... void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int collIdx = 0; collIdx < nRasCollectives; collIdx++) { - struct rasCollective* coll = rasCollectives+collIdx; - if (coll->type == RAS_MSG_NONE || coll->timeout == 0) - continue; - - if (now - coll->startTime > coll->timeout) { - // We've exceeded the leg timeout. For all outstanding responses, check their connections. 
- if (!coll->timeoutWarned) { - INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", - ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, - (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->timeoutWarned = true; - } - for (int i = 0; i < coll->nFwdSent; i++) { - if (coll->fwdConns[i] != -1) { - struct rasConnection* conn = rasConns+coll->fwdConns[i]; - char line[SOCKET_NAME_MAXLEN+1]; - if (!conn->experiencingDelays && conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - // Ensure that the connection is fully established and operational, and that the socket hasn't been - // re-created during the handling of the collective (which would suggest that the request may have been - // lost). - if (sock->status == RAS_SOCK_READY && sock->createTime < coll->startTime) - continue; - } - // In all other cases we declare a timeout so that we can (hopefully) recover. - INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " - "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", - ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), - coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); - coll->fwdConns[i] = -1; - coll->nFwdRecv++; - coll->nLegTimeouts++; - } // if (coll->fwdConns[i] != -1) - } // for (i) - if (coll->nFwdSent == coll->nFwdRecv) { - (void)rasCollReadyResp(coll); - } else { - // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they - // must be due to delays at other processes. Presumably those processes will give up waiting soon and the - // (incomplete) responses will arrive shortly, so we should wait a little longer. - if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { - // We've exceeded even the longer timeout, which is unexpected. Try to return whatever we have (though - // the originator of the collective, if it's not us, may have timed out already anyway). - INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + if (coll->timeout > 0) { + if (now - coll->startTime > coll->timeout) { + // We've exceeded the leg timeout. For all outstanding responses, check their connections. + if (!coll->timeoutWarned) { + INFO(NCCL_RAS, "RAS collective %s:%ld timeout warning (%lds) -- %d responses missing", ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); - coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; - coll->nFwdRecv = coll->nFwdSent; + coll->timeoutWarned = true; + } + for (int i = 0; i < coll->nFwdSent; i++) { + if (coll->fwdConns[i]) { + struct rasConnection* conn = coll->fwdConns[i]; + char line[SOCKET_NAME_MAXLEN+1]; + if (!conn->experiencingDelays && conn->sock) { + // Ensure that the connection is fully established and operational, and that the socket hasn't been + // re-created during the handling of the collective (which would suggest that the request may have been + // lost). + if (conn->sock->status == RAS_SOCK_READY && conn->sock->createTime < coll->startTime) + continue; + } + // In all other cases we declare a timeout so that we can (hopefully) recover. 
+ INFO(NCCL_RAS, "RAS not waiting for response from %s to collective %s:%ld " + "(nFwdSent %d, nFwdRecv %d, nLegTimeouts %d)", + ncclSocketToString(&conn->addr, rasLine), ncclSocketToString(&coll->rootAddr, line), + coll->rootId, coll->nFwdSent, coll->nFwdRecv, coll->nLegTimeouts); + coll->fwdConns[i] = nullptr; + coll->nFwdRecv++; + coll->nLegTimeouts++; + } // if (coll->fwdConns[i]) + } // for (i) + if (coll->nFwdSent == coll->nFwdRecv) { (void)rasCollReadyResp(coll); } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); - } - } // conn->nFwdRecv < conn->nFwdSent - } else { - *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); - } - } // for (collIdx) + // At least some of the delays are *not* due to this process' connections experiencing delays, i.e., they + // must be due to delays at other processes. Presumably those processes will give up waiting soon and the + // (incomplete) responses will arrive shortly, so we should wait a little longer. + if (now - coll->startTime > coll->timeout + RAS_COLLECTIVE_EXTRA_TIMEOUT) { + // We've exceeded even the longer timeout, which is unexpected. Try to return whatever we have (though + // the originator of the collective, if it's not us, may have timed out already anyway). + INFO(NCCL_RAS, "RAS collective %s:%ld timeout error (%lds) -- giving up on %d missing responses", + ncclSocketToString(&coll->rootAddr, rasLine), coll->rootId, + (now - coll->startTime) / CLOCK_UNITS_PER_SEC, coll->nFwdSent - coll->nFwdRecv); + coll->nLegTimeouts += coll->nFwdSent - coll->nFwdRecv; + coll->nFwdRecv = coll->nFwdSent; + (void)rasCollReadyResp(coll); + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout+RAS_COLLECTIVE_EXTRA_TIMEOUT); + } + } // conn->nFwdRecv < conn->nFwdSent + } else { + *nextWakeup = std::min(*nextWakeup, coll->startTime+coll->timeout); + } + } // if (coll->timeout > 0) + + coll = collNext; + } // for (coll) } @@ -476,15 +490,16 @@ void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup) { // For this particular collective, we keep some reduced statistical data (min/max/avg travel time) as well // as connection-specific info in case we observed a negative min travel time (which, ideally, shouldn't happen, // but the system clocks may not be perfectly in sync). -static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { +static ncclResult_t rasCollConnsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) { struct rasCollConns connsData = {.travelTimeMin = INT64_MAX, .travelTimeMax = INT64_MIN}; struct rasCollConns* pConnsData; + *pReqLen = rasCollDataLength(RAS_COLL_CONNS); + // Update the statistical data first and in the process also calculate how much connection-specific space we // will need. 
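+  // (A negative minimum travel time means that a message appeared to arrive before it was sent, i.e., the clocks
+  // of the two peers disagree; for every such connection we also record the source and destination addresses below.)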
- for (int i = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && conn->travelTimeCount > 0) { + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (conn->travelTimeCount > 0) { if (connsData.travelTimeMin > conn->travelTimeMin) connsData.travelTimeMin = conn->travelTimeMin; if (connsData.travelTimeMax < conn->travelTimeMax) @@ -502,9 +517,9 @@ static ncclResult_t rasCollConnsInit(char** pData, int* pNData) { pConnsData = (struct rasCollConns*)*pData; memcpy(pConnsData, &connsData, sizeof(*pConnsData)); if (connsData.nNegativeMins > 0) { - for (int i = 0, negMinsIdx = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && conn->travelTimeMin < 0) { + int negMinsIdx = 0; + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (conn->travelTimeMin < 0) { struct rasCollConns::negativeMin* negativeMin = pConnsData->negativeMins+negMinsIdx; memcpy(&negativeMin->source, &rasNetListeningSocket.addr, sizeof(negativeMin->source)); memcpy(&negativeMin->dest, &conn->addr, sizeof(negativeMin->dest)); @@ -560,10 +575,26 @@ static ncclResult_t rasCollConnsMerge(struct rasCollective* coll, struct rasMsg* // Initializes the accumulated data with just the local data for now. // For this particular collective, we keep for every communicator information about every rank, to help identify // the missing ones and the discrepancies between the ones that did respond. -static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { +// For any new (previously unseen) communicator we also save the basic identification data about every rank that is +// "missing" (i.e., not part of this process). During merging, this should be replaced by the actual data from +// those ranks, if they are responsive. We want to provide this information to the user (so that we can say more +// than "rank xyz missing"). +// Every "new" communicator is also recorded in the (updated) request, so that when that request is forwarded to our +// peers, those peers don't needlessly send us the same data. +static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqLen, char** pData, int* pNData) { + ncclResult_t ret = ncclSuccess; struct rasCollComms* commsData; - int nComms = 0, nRanks = 0; + int nComms = 0, nRanks = 0, nMissingRanks = 0; + bool skipMissing = false; std::lock_guard lock(ncclCommsMutex); + struct rasCollComms::comm* comm; + struct rasCollRequest* req = nullptr; + struct rasPeerInfo** peersReSorted = nullptr; + int firstNewSkipMissingIdx = -1; + + *pReqLen = rasCollDataLength(RAS_COLL_COMMS) + + (*pReq)->comms.nSkipMissingRanksComms * sizeof(*(*pReq)->comms.skipMissingRanksComms); + *pData = nullptr; // Start by counting the communicators so that we know how much space to allocate. // We also need to sort the comms array, to make the subsequent merging easier, both between the ranks (in case @@ -572,77 +603,152 @@ static ncclResult_t rasCollCommsInit(char** pData, int* pNData) { qsort(ncclComms, nNcclComms, sizeof(*ncclComms), &ncclCommsCompare); ncclCommsSorted = true; } - for (int i = 0; i < nNcclComms; i++) { - if (ncclComms[i] == nullptr) // nullptr's are always at the end after sorting. + for (int commIdx = 0; commIdx < nNcclComms; commIdx++) { + if (ncclComms[commIdx] == nullptr) // nullptr's are always at the end after sorting. 
break; - if (i == 0) { - nComms = 1; - } else if (ncclComms[i]->commHash != ncclComms[i-1]->commHash) { + // A process may manage multiple GPUs and thus have multiple communicators with the same commHash. + // Comparing just the commHash is OK though within communicators that are part of the same process. + if (commIdx == 0 || ncclComms[commIdx]->commHash != ncclComms[commIdx-1]->commHash) { + skipMissing = rasCollCommsSkipMissing(*pReq, ncclComms[commIdx]); + if (!skipMissing) { + // Add this communicator to the request so that the processes we forward the request to know not to fill in + // the missing rank info. + struct rasCommId* skipComm; + if (req == nullptr) { + // We pessimistically allocate space for all the remaining communicators so that we don't need to reallocate. + int newSize = *pReqLen + (nNcclComms-commIdx) * sizeof(*req->comms.skipMissingRanksComms); + NCCLCHECKGOTO(ncclCalloc((char**)&req, newSize), ret, fail); + memcpy(req, *pReq, *pReqLen); + *pReq = req; + firstNewSkipMissingIdx = req->comms.nSkipMissingRanksComms; + } + skipComm = req->comms.skipMissingRanksComms + req->comms.nSkipMissingRanksComms++; + skipComm->commHash = ncclComms[commIdx]->commHash; + skipComm->hostHash = ncclComms[commIdx]->peerInfo->hostHash; + skipComm->pidHash = ncclComms[commIdx]->peerInfo->pidHash; + + nMissingRanks += ncclComms[commIdx]->nRanks; + } // if (!skipMissing) nComms++; - } + } // if encountered a new communicator nRanks++; - } + if (!skipMissing) + nMissingRanks--; + } // for (commIdx) - // rasNetCollCommsData has nested variable-length arrays, which makes the size calculation and subsequent + // rasCollComms has nested variable-length arrays, which makes the size calculation and subsequent // pointer manipulations somewhat unwieldy... - *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks); - NCCLCHECK(ncclCalloc(pData, *pNData)); + // This is extra complicated because of the "hidden" array of struct rasCollCommsMissingRank following the + // ranks array for each communicator. + *pNData = sizeof(*commsData) + nComms * sizeof(*commsData->comms) + nRanks * sizeof(*commsData->comms[0].ranks) + + nMissingRanks * sizeof(struct rasCollCommsMissingRank); + NCCLCHECKGOTO(ncclCalloc(pData, *pNData), ret, fail); commsData = (struct rasCollComms*)*pData; commsData->nComms = nComms; // comm points at the space in the accumulated data where the info about the current communicator is to be stored. - struct rasCollComms::comm* comm = commsData->comms; - for (int i = 0; i < nNcclComms; i++) { - struct rasCollComms::comm::rank* rank; - ncclResult_t asyncError; - if (ncclComms[i] == nullptr) - break; - if (i == 0 || ncclComms[i]->commHash != ncclComms[i-1]->commHash) { - if (i > 0) - comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks)); - comm->commHash = ncclComms[i]->commHash; - comm->commNRanks = ncclComms[i]->nRanks; - comm->nRanks = 0; - } else if (ncclComms[i]->nRanks != ncclComms[i-1]->nRanks) { - INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", ncclComms[i-1]->nRanks, ncclComms[i]->nRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... 
- } else if (ncclComms[i]->rank == ncclComms[i-1]->rank) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - ncclComms[i]->rank, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - if (comm->nRanks == comm->commNRanks) { - INFO(NCCL_RAS, - "RAS encountered more ranks than the communicator size (%d) -- possible commHash collision (0x%lx)", - comm->commNRanks, comm->commHash); - continue; // Short of failing, the best we can do is skip... - } - rank = comm->ranks+comm->nRanks; - rank->commRank = ncclComms[i]->rank; - // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially - // always 0. It will increase after we send this response back to the peer we got the request from. - rank->peerIdx = 0; - rank->collOpCount = ncclComms[i]->collOpCount; - rank->status.initState = ncclComms[i]->initState; - if (ncclCommGetAsyncError(ncclComms[i], &asyncError) == ncclSuccess) - rank->status.asyncError = asyncError; - rank->status.finalizeCalled = (ncclComms[i]->finalizeCalled != 0); - rank->status.destroyFlag = (ncclComms[i]->destroyFlag != 0); - rank->status.abortFlag = (__atomic_load_n(ncclComms[i]->abortFlag, __ATOMIC_ACQUIRE) != 0); - rank->cudaDev = ncclComms[i]->cudaDev; - rank->nvmlDev = ncclComms[i]->nvmlDev; - comm->nRanks++; - } - assert(nComms == 0 || ((char*)(comm->ranks+comm->nRanks)) - (char*)commsData <= *pNData); + comm = commsData->comms; + // collCommIdx counts rasCollComms::comm (comm); commIdx indexes ncclComms. + for (int collCommIdx = 0, commIdx = 0; collCommIdx < nComms; collCommIdx++) { + struct ncclComm* ncclComm = ncclComms[commIdx]; + + comm->commId.commHash = ncclComm->commHash; + comm->commId.hostHash = ncclComm->peerInfo->hostHash; + comm->commId.pidHash = ncclComm->peerInfo->pidHash; + comm->commNRanks = ncclComm->nRanks; + comm->nRanks = comm->nMissingRanks = 0; + + // Fill in the comm->ranks array. + for (; commIdx < nNcclComms && ncclComms[commIdx] && ncclComms[commIdx]->commHash == comm->commId.commHash; + commIdx++) { + ncclComm = ncclComms[commIdx]; + struct rasCollComms::comm::rank* rank = comm->ranks+comm->nRanks; + ncclResult_t asyncError; + rank->commRank = ncclComm->rank; + // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially + // always 0. It will increase after we send this response back to the peer we got the request from. + rank->peerIdx = 0; + memcpy(rank->collOpCounts, ncclComm->seqNumber, sizeof(rank->collOpCounts)); + rank->status.initState = ncclComm->initState; + if (ncclCommGetAsyncError(ncclComm, &asyncError) == ncclSuccess) + rank->status.asyncError = asyncError; + rank->status.finalizeCalled = (ncclComm->finalizeCalled != 0); + rank->status.destroyFlag = (ncclComm->destroyFlag != 0); + rank->status.abortFlag = (__atomic_load_n(ncclComm->abortFlag, __ATOMIC_ACQUIRE) != 0); + rank->cudaDev = ncclComm->cudaDev; + rank->nvmlDev = ncclComm->nvmlDev; + comm->nRanks++; + } // for (commIdx) + + if (firstNewSkipMissingIdx != -1 && + memcmp(req->comms.skipMissingRanksComms+firstNewSkipMissingIdx, &comm->commId, sizeof(comm->commId)) == 0) { + // Fill in the missingRanks array that follows the comm->ranks. 
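+      // (Per-communicator layout of the accumulated buffer: the comm header, then nRanks rank entries, then
+      // nMissingRanks rasCollCommsMissingRank entries -- hence the pointer arithmetic below and in the merge code.)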
+ struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); + + if (peersReSorted == nullptr) { + // Create a lookup table to rasPeers that is sorted by hostHash and pidHash, to reduce the complexity of the + // lookups in the missingRankIdx loop below. + NCCLCHECKGOTO(ncclCalloc(&peersReSorted, nRasPeers), ret, fail); + for (int peerIdx = 0; peerIdx < nRasPeers; peerIdx++) + peersReSorted[peerIdx] = rasPeers+peerIdx; + qsort(peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesCompare); + } - return ncclSuccess; + comm->nMissingRanks = comm->commNRanks - comm->nRanks; + for (int missingRankIdx = 0, rankIdx = 0; missingRankIdx < comm->nMissingRanks; missingRankIdx++) { + struct rasCollCommsMissingRank* missingRank; + struct ncclPeerInfo* info; + struct rasPeerInfo** peer; + uint64_t key[2]; + // Look for the next "hole" in the ranks array. + while (rankIdx < comm->nRanks && comm->ranks[rankIdx].commRank == rankIdx+missingRankIdx) + rankIdx++; + + missingRank = missingRanks + missingRankIdx; + missingRank->commRank = rankIdx + missingRankIdx; + info = ncclComm->peerInfo + missingRank->commRank; + key[0] = info->hostHash - ncclComm->commHash; + key[1] = info->pidHash - ncclComm->commHash; + peer = (struct rasPeerInfo**)bsearch(key, peersReSorted, nRasPeers, sizeof(*peersReSorted), peersHashesSearch); + if (peer) + memcpy(&missingRank->addr, &(*peer)->addr, sizeof(missingRank->addr)); + missingRank->cudaDev = info->cudaDev; + missingRank->nvmlDev = info->nvmlDev; + } // for (missingRankIdx) + + if (++firstNewSkipMissingIdx == req->comms.nSkipMissingRanksComms) + firstNewSkipMissingIdx = -1; + } // if need to fill in the missingRanks + + comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks) + + comm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + } // for (collCommIdx) + assert(((char*)comm) - (char*)commsData <= *pNData); + + if (req) { + // Finish updating the request. + *pReqLen = rasCollDataLength(RAS_COLL_COMMS) + + req->comms.nSkipMissingRanksComms * sizeof(*req->comms.skipMissingRanksComms); + qsort(req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare); + } +ret: + free(peersReSorted); + return ret; +fail: + if (req) { + free(req); + *pReq = nullptr; + } + free(*pData); + *pData = nullptr; + goto ret; } // Merges incoming collective RAS_COLL_COMMS response message into the local accumulated data. static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* msg) { - struct rasCollComms* collData; - struct rasCollComms* msgData; + struct rasCollComms* collData; // Data previously stored (locally) by our process. + struct rasCollComms* msgData; // Data just received from another process. int dataOffset = rasMsgLength(RAS_MSG_COLLRESP) + msg->collResp.nPeers*sizeof(*msg->collResp.peers); ALIGN_SIZE(dataOffset, alignof(int64_t)); @@ -650,7 +756,7 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* collData = (struct rasCollComms*)coll->data; if (msgData->nComms > 0) { - struct rasCollComms* newData = nullptr; + struct rasCollComms* newData = nullptr; // Destination buffer for the merged data. // Allocate the new buffer pessimistically (sized as the sum of the two old ones). 
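+    // (Pessimistic, because communicators present in both inputs are merged into a single entry, so the merged
+    // data can only ever be smaller than or equal to the sum of the two inputs.)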
NCCLCHECK(ncclCalloc((char**)&newData, coll->nData + msg->collResp.nData)); @@ -661,25 +767,28 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* for (int collIdx = 0, msgIdx = 0; collIdx < collData->nComms || msgIdx < msgData->nComms; newData->nComms++) { int cmp; if (collIdx < collData->nComms && msgIdx < msgData->nComms) - cmp = (collComm->commHash < msgComm->commHash ? -1 : (collComm->commHash > msgComm->commHash ? 1 : 0)); + cmp = rasCommIdCompare(&collComm->commId, &msgComm->commId); else cmp = (collIdx < collData->nComms ? -1 : 1); if (cmp == 0 && collComm->commNRanks != msgComm->commNRanks) { INFO(NCCL_RAS, "RAS encountered inconsistent communicator data: size %d != %d -- " - "possible commHash collision (0x%lx)", collComm->commNRanks, msgComm->commNRanks, collComm->commHash); + "possible hash collision (0x%lx, 0x%lx, 0x%lx)", collComm->commNRanks, msgComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); cmp = (collComm->commNRanks < msgComm->commNRanks ? -1 : 1); - // We try to preserve both separately, although the input data might already be messed up anyway... + // We try to preserve them both separately... } if (cmp == 0) { // Merge the comms. - newComm->commHash = collComm->commHash; + memcpy(&newComm->commId, &collComm->commId, sizeof(newComm->commId)); newComm->commNRanks = collComm->commNRanks; if (collComm->nRanks + msgComm->nRanks > collComm->commNRanks) { INFO(NCCL_RAS, - "RAS encountered more ranks (%d) than the communicator size (%d) -- possible commHash collision (0x%lx)", - collComm->nRanks + msgComm->nRanks, newComm->commNRanks, newComm->commHash); + "RAS encountered more ranks (%d) than the communicator size (%d) -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->nRanks + msgComm->nRanks, newComm->commNRanks, + collComm->commId.commHash, collComm->commId.hostHash, collComm->commId.pidHash); + newComm->nRanks = newComm->commNRanks; // We'll skip the extras in the loop below. } else { newComm->nRanks = collComm->nRanks + msgComm->nRanks; @@ -691,16 +800,18 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* int cmpRank; if (newRankIdx == newComm->commNRanks) break; // Short of failing, the best we can do is skip... - if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) + if (collRankIdx < collComm->nRanks && msgRankIdx < msgComm->nRanks) { cmpRank = (collComm->ranks[collRankIdx].commRank < msgComm->ranks[msgRankIdx].commRank ? -1 : (collComm->ranks[collRankIdx].commRank > msgComm->ranks[msgRankIdx].commRank ? 1 : 0)); - else + } else { cmpRank = (collRankIdx < collComm->nRanks ? -1 : 1); + } // There shouldn't be any overlaps in ranks between different sources. if (cmpRank == 0) { - INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible commHash collision (0x%lx)", - collComm->ranks[collRankIdx].commRank, newComm->commHash); + INFO(NCCL_RAS, "RAS encountered duplicate data for rank %d -- possible hash collision " + "(0x%lx, 0x%lx, 0x%lx)", collComm->ranks[collRankIdx].commRank, + newComm->commId.commHash, newComm->commId.hostHash, newComm->commId.pidHash); msgRankIdx++; // Short of failing, the best we can do is skip... } memcpy(newComm->ranks+newRankIdx, (cmpRank <= 0 ? collComm->ranks+collRankIdx++ : @@ -708,23 +819,63 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* if (cmpRank > 0) { // peerIdx values from msgComm need to shift after merge. 
newComm->ranks[newRankIdx].peerIdx += coll->nPeers; - } + + if (collComm->nMissingRanks > 0) { + // Remove the corresponding entry from missingRanks. + struct rasCollCommsMissingRank* missingRank; + missingRank = (struct rasCollCommsMissingRank*)bsearch(&newComm->ranks[newRankIdx].commRank, + collComm->ranks+collComm->nRanks, + collComm->nMissingRanks, + sizeof(struct rasCollCommsMissingRank), + rasCollCommsMissingRankSearch); + if (missingRank) { + // Mark the entry as no longer needed. + memset(&missingRank->addr, '\0', sizeof(missingRank->addr)); + } else { + INFO(NCCL_RAS, "RAS failed to find missingRank data -- internal error?"); + } + } // if (collComm->nMissingRanks > 0) + } // if (cmpRank > 0) } // for (newRankIdx) - newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks)); - collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks)); + if (collComm->nMissingRanks > 0) { + // Copy the missingRanks to newComm, skipping over any no longer needed entries. + union ncclSocketAddress emptyAddr; + struct rasCollCommsMissingRank* collMissingRanks; + struct rasCollCommsMissingRank* newMissingRanks; + int newRankIdx; + + memset(&emptyAddr, '\0', sizeof(emptyAddr)); + collMissingRanks = (struct rasCollCommsMissingRank*)(collComm->ranks+collComm->nRanks); + newMissingRanks = (struct rasCollCommsMissingRank*)(newComm->ranks+newComm->nRanks); + newRankIdx = 0; + for (int collRankIdx = 0; collRankIdx < collComm->nMissingRanks; collRankIdx++) { + if (memcmp(&collMissingRanks[collRankIdx].addr, &emptyAddr, sizeof(emptyAddr))) { + memcpy(newMissingRanks + newRankIdx++, collMissingRanks + collRankIdx, sizeof(*newMissingRanks)); + } + } + newComm->nMissingRanks = newRankIdx; + assert(newComm->nRanks + newComm->nMissingRanks == newComm->commNRanks); + } + newComm = (struct rasCollComms::comm*)(((char*)(newComm+1)) + newComm->nRanks * sizeof(*newComm->ranks) + + newComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); + collComm = (struct rasCollComms::comm*)(((char*)(collComm+1)) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); collIdx++; - msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks)); + msgComm = (struct rasCollComms::comm*)(((char*)(msgComm+1)) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank)); msgIdx++; } else if (cmp < 0) { // Copy from collComm. - int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks); + int commSize = sizeof(*collComm) + collComm->nRanks * sizeof(*collComm->ranks) + + collComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, collComm, commSize); newComm = (struct rasCollComms::comm*)(((char*)(newComm)) + commSize); collComm = (struct rasCollComms::comm*)(((char*)(collComm)) + commSize); collIdx++; } else { // cmp > 0 // Copy from msgComm. - int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks); + int commSize = sizeof(*msgComm) + msgComm->nRanks * sizeof(*msgComm->ranks) + + msgComm->nMissingRanks * sizeof(struct rasCollCommsMissingRank); memcpy(newComm, msgComm, commSize); for (int i = 0; i < newComm->nRanks; i++) { // peerIdx values from msgComm need to shift after merge. 
@@ -745,18 +896,87 @@ static ncclResult_t rasCollCommsMerge(struct rasCollective* coll, struct rasMsg* return ncclSuccess; } +// Checks if a given communicator is in the skipMissingRanksComms array of the request. +static bool rasCollCommsSkipMissing(const struct rasCollRequest* req, struct ncclComm* comm) { + struct rasCommId id; + id.commHash = comm->commHash; + id.hostHash = comm->peerInfo->hostHash; + id.pidHash = comm->peerInfo->pidHash; + return (bsearch(&id, req->comms.skipMissingRanksComms, req->comms.nSkipMissingRanksComms, + sizeof(*req->comms.skipMissingRanksComms), rasCommIdCompare) != nullptr); +} + // Sorting callback for the ncclComms array. static int ncclCommsCompare(const void* p1, const void* p2) { - const ncclComm** pc1 = (const ncclComm**)p1; - const ncclComm** pc2 = (const ncclComm**)p2; + const ncclComm* comm1 = *(const ncclComm**)p1; + const ncclComm* comm2 = *(const ncclComm**)p2; // Put nullptr's at the end. - if (*pc1 == nullptr || *pc2 == nullptr) - return (*pc1 != nullptr ? -1 : (*pc2 != nullptr ? 1 : 0)); + if (comm1 == nullptr || comm2 == nullptr) + return (comm1 != nullptr ? -1 : (comm2 != nullptr ? 1 : 0)); - if ((*pc1)->commHash == (*pc2)->commHash) { - return ((*pc1)->rank < (*pc2)->rank ? -1 : ((*pc1)->rank > (*pc2)->rank ? 1 : 0)); + if (comm1->commHash == comm2->commHash) { + return (comm1->rank < comm2->rank ? -1 : (comm1->rank > comm2->rank ? 1 : 0)); } else { - return ((*pc1)->commHash < (*pc2)->commHash ? -1 : 1); + return (comm1->commHash < comm2->commHash ? -1 : 1); } } + +// Sorting callback for a lookup table to rasPeers. Sorts by the hostHash (primary) and pidHash (secondary). +static int peersHashesCompare(const void* p1, const void* p2) { + const struct rasPeerInfo* pi1 = *(const struct rasPeerInfo**)p1; + const struct rasPeerInfo* pi2 = *(const struct rasPeerInfo**)p2; + + if (pi1->hostHash == pi2->hostHash) { + return (pi1->pidHash < pi2->pidHash ? -1 : (pi1->pidHash > pi2->pidHash ? 1 : 0)); + } else { + return (pi1->hostHash < pi2->hostHash ? -1 : 1); + } +} + +// Search callback for a lookup table to rasPeers. Searches by the hostHash and pidHash. The key is an array +// containing the hostHash at index 0 and the pidHash at index 1. +static int peersHashesSearch(const void* k, const void* e) { + const uint64_t* key = (const uint64_t*)k; + const struct rasPeerInfo* elem = *(const struct rasPeerInfo**)e; + + if (key[0] == elem->hostHash) { + return (key[1] < elem->pidHash ? -1 : (key[1] > elem->pidHash ? 1 : 0)); + } else { + return (key[0] < elem->hostHash ? -1 : 1); + } +} + +// Sorting/searching callback for struct rasCommId. Sorts by commHash, then hostHash, then pidHash. +static int rasCommIdCompare(const void* p1, const void* p2) { + const struct rasCommId* i1 = (const struct rasCommId*)p1; + const struct rasCommId* i2 = (const struct rasCommId*)p2; + if (i1->commHash == i2->commHash) { + if (i1->hostHash == i2->hostHash) { + return (i1->pidHash < i2->pidHash ? -1 : (i1->pidHash > i2->pidHash ? 1 : 0)); + } else { + return (i1->hostHash < i2->hostHash ? -1 : 1); + } + } else { + return (i1->commHash < i2->commHash ? -1 : 1); + } +} + +// Search callback for rasCollComms::comm rasCollCommsMissingRank array. The key is the commRank. +static int rasCollCommsMissingRankSearch(const void* k, const void* e) { + int key = *(const int*)k; + const struct rasCollCommsMissingRank* elem = (const struct rasCollCommsMissingRank*)e; + + return (key < elem->commRank ? -1 : (key > elem->commRank ? 
1 : 0)); +} + +// Invoked during RAS termination to release all the allocated resources. +void rasCollectivesTerminate() { + for (struct rasCollective* coll = rasCollectivesHead; coll;) { + struct rasCollective* collNext = coll->next; + rasCollFree(coll); + coll = collNext; + } + + // rasCollectivesHead and rasCollectivesTail are taken care of by rasCollFree(). +} diff --git a/src/ras/peers.cc b/src/ras/peers.cc index f2692d3e1..8573209f1 100644 --- a/src/ras/peers.cc +++ b/src/ras/peers.cc @@ -40,10 +40,11 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks static ncclResult_t rasPeersUpdate(struct rasPeerInfo* rankPeers, int* nRankPeers, int newNRasPeers = -1); static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, - struct rasRankInit* ranks = nullptr, int nranks = 0, int fromConnIdx = -1); + struct rasRankInit* ranks = nullptr, int nranks = 0, + struct rasConnection* fromConn = nullptr); static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks, - int fromConnIdx); + struct rasConnection* fromConn); static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks); ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock); @@ -146,6 +147,8 @@ static ncclResult_t rasRanksConvertToPeers(struct rasRankInit* ranks, int nranks rankPeer->pid = rank->pid; rankPeer->cudaDevs = (1UL << rank->cudaDev); rankPeer->nvmlDevs = (1UL << rank->nvmlDev); + rankPeer->hostHash = rank->hostHash; + rankPeer->pidHash = rank->pidHash; rankPeerIdx++; // Also check if there is already an entry with that address in the global rasPeers so that the caller can know how @@ -357,12 +360,12 @@ int rasPeerFind(const union ncclSocketAddress* addr) { // ranks -- if provided -- lists all the peers who are already aware of this update (because they are the members // of the new communicator being established), and who thus don't need to be notified. updatedDeadPeers can // be used, however, to request at least the propagation of rasDeadPeers to such peers. -// fromConnIdx -- if provided -- identified the connection used to receive this update; there's no need to +// fromConn -- if provided -- identifies the connection used to receive this update; there's no need to // propagate the update back through it. // Reconfigures the RAS network to accommodate the newly added peers, by modifying the links and establishing new // connections as needed. static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, - struct rasRankInit* ranks, int nranks, int fromConnIdx) { + struct rasRankInit* ranks, int nranks, struct rasConnection* fromConn) { ncclResult_t ret = ncclSuccess; // Do we actually have anything to do? @@ -371,8 +374,8 @@ static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nN // Start by propagating the update through the RAS network links. We consider any errors during this process // to be non-fatal (we can re-sync later around a keep-alive exchange). 
- (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); - (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConnIdx); + (void)rasLinkPropagateUpdate(&rasNextLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn); + (void)rasLinkPropagateUpdate(&rasPrevLink, newPeers, nNewPeers, updateDeadPeers, ranks, nranks, fromConn); // Calculate new link peers and open new connections if needed. NCCLCHECKGOTO(rasLinkReinitConns(&rasNextLink), ret, fail); @@ -388,15 +391,13 @@ static ncclResult_t rasNetUpdatePeers(const struct rasPeerInfo* newPeers, int nN // for the explanation of the function arguments. static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks, - int fromConnIdx) { - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; + struct rasConnection* fromConn) { + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) { // Note that we don't send the update via the connection that we received this notification from in the first // place (while it wouldn't loop indefinitely, it would add a needless extra exchange). - if (linkConn->connIdx != -1 && linkConn->connIdx != fromConnIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; + if (linkConn->conn && linkConn->conn != fromConn) { // Failed propagations are not considered fatal (we will retry after a keep-alive). - (void)rasConnPropagateUpdate(conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); + (void)rasConnPropagateUpdate(linkConn->conn, newPeers, nNewPeers, updateDeadPeers, ranks, nranks); } } @@ -407,7 +408,7 @@ static ncclResult_t rasLinkPropagateUpdate(struct rasLink* link, const struct ra // arguments. static ncclResult_t rasConnPropagateUpdate(struct rasConnection* conn, const struct rasPeerInfo* newPeers, int nNewPeers, bool updateDeadPeers, struct rasRankInit* ranks, int nranks) { - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + if (conn->sock && conn->sock->status == RAS_SOCK_READY) { // If we have the rank info, check if the peer on the other side of this connection has participated in the new // communicator. 
int connRank = -1; @@ -462,7 +463,8 @@ ncclResult_t rasConnSendPeersUpdate(struct rasConnection* conn, const struct ras msg->peersUpdate.deadPeersHash = rasDeadPeersHash; msg->peersUpdate.nDeadPeers = nDeadPeers; memcpy(msg->peersUpdate.peers, peers, nPeers * sizeof(msg->peersUpdate.peers[0])); - memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); + if (nDeadPeers > 0) + memcpy(((char*)msg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); if (nPeers > 0) conn->lastSentPeersHash = rasPeersHash; @@ -485,8 +487,7 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) ncclResult_t ret = ncclSuccess; struct rasMsg* newMsg = nullptr; int newMsgLen = 0; - assert(sock->connIdx != -1); - struct rasConnection* conn = rasConns+sock->connIdx; + assert(sock->conn); int nPeers, nDeadPeers; int deadPeersOffset = 0; bool updatePeers, updateDeadPeers; @@ -496,8 +497,8 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) msg->peersUpdate.nPeers, msg->peersUpdate.nDeadPeers); INFO(NCCL_RAS, "RAS my old rasPeersHash 0x%lx, rasDeadPeersHash 0x%lx, nRasPeers %d, nRasDeadPeers %d", rasPeersHash, rasDeadPeersHash, nRasPeers, nRasDeadPeers); - conn->lastRecvPeersHash = msg->peersUpdate.peersHash; - conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; + sock->conn->lastRecvPeersHash = msg->peersUpdate.peersHash; + sock->conn->lastRecvDeadPeersHash = msg->peersUpdate.deadPeersHash; // Prepare ours to send back. We don't enqueue it right away because we want to make sure first that we need // to send it. We'll find out by comparing the hash values after the merge. @@ -545,15 +546,15 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) rasDeadPeersDump(); // If post-merge the hashes are still different, send our (dead) peers back. - updatePeers = (conn->lastSentPeersHash != rasPeersHash && conn->lastRecvPeersHash != rasPeersHash); - updateDeadPeers = (conn->lastSentDeadPeersHash != rasDeadPeersHash && - conn->lastRecvDeadPeersHash != rasDeadPeersHash); + updatePeers = (sock->conn->lastSentPeersHash != rasPeersHash && sock->conn->lastRecvPeersHash != rasPeersHash); + updateDeadPeers = (sock->conn->lastSentDeadPeersHash != rasDeadPeersHash && + sock->conn->lastRecvDeadPeersHash != rasDeadPeersHash); if (updatePeers || updateDeadPeers) { newMsg->peersUpdate.peersHash = rasPeersHash; newMsg->peersUpdate.deadPeersHash = rasDeadPeersHash; if (updatePeers) { assert(nPeers > 0); - conn->lastSentPeersHash = rasPeersHash; + sock->conn->lastSentPeersHash = rasPeersHash; } else { // If hashes match, make sure that we don't send the rasPeers back. 
newMsg->peersUpdate.nPeers = 0; @@ -564,14 +565,14 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) if (updateDeadPeers) { assert(nRasDeadPeers > 0); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; ALIGN_SIZE(newMsgLen, alignof(union ncclSocketAddress)); deadPeersOffset = newMsgLen; newMsgLen += nRasDeadPeers*sizeof(*rasDeadPeers); memcpy(((char*)newMsg)+deadPeersOffset, rasDeadPeers, nDeadPeers * sizeof(*rasDeadPeers)); - conn->lastSentDeadPeersHash = rasDeadPeersHash; + sock->conn->lastSentDeadPeersHash = rasDeadPeersHash; newMsg->peersUpdate.nDeadPeers = nRasDeadPeers; } else { newMsg->peersUpdate.nDeadPeers = 0; @@ -580,13 +581,13 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) INFO(NCCL_RAS, "RAS sending back a peersUpdate (nPeers %d, nDeadPeers %d)", newMsg->peersUpdate.nPeers, newMsg->peersUpdate.nDeadPeers); - rasConnEnqueueMsg(conn, newMsg, newMsgLen); + rasConnEnqueueMsg(sock->conn, newMsg, newMsgLen); newMsg = nullptr; } // if (updatePeers || updateDeadPeers) // Propagate the changes through our RAS network links. NCCLCHECKGOTO(rasNetUpdatePeers(msg->peersUpdate.peers, msg->peersUpdate.nPeers, updateDeadPeers, nullptr, 0, - sock->connIdx), ret, fail); + sock->conn), ret, fail); } exit: @@ -603,7 +604,7 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) // Reinitializes the connection(s) of a particular link, following a peers update. // Adding new peers can affect the calculation of the link's primary connection and also the fallbacks. -// The newly added peers could also shift all the existing peerIdx values, invalidating the values in RasLinkConn +// The newly added peers could also shift all the existing peerIdx values, invalidating the values in rasLinkConn // structures, so it's better to drop it all and recalculate from scratch. // We recalculate the primary peer; if an active connection to it already exists, then we're done. If there // is no connection, we create one. If a connection exists but is experiencing delays then we add a fallback and @@ -611,77 +612,51 @@ ncclResult_t rasMsgHandlePeersUpdate(struct rasMsg* msg, struct rasSocket* sock) // External conns are dropped from the links as well (they will be re-created via keepAlive messages as needed). static ncclResult_t rasLinkReinitConns(struct rasLink* link) { struct rasLinkConn* linkConn; - struct rasConnection* conn = nullptr; int newPeerIdx = myPeerIdx; - if (link->connsSize == 0) { - link->connsSize = RAS_INCREMENT; - NCCLCHECK(ncclCalloc(&link->conns, link->connsSize)); - } - link->nConns = 0; - - // Establish a connection for this link. We iterate as long as the connections we find are experiencing delays. - while (newPeerIdx != -1) { - if (link->nConns == link->connsSize) { - NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); - link->connsSize += RAS_INCREMENT; + if (link->conns) { + // Free the old contents but keep the first entry for convenience (though wipe it). 
+ for (struct rasLinkConn* linkConn = link->conns->next; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; } + memset(link->conns, '\0', sizeof(*link->conns)); + link->lastUpdatePeersTime = 0; + } else { // link->conns == nullptr + NCCLCHECK(ncclCalloc(&link->conns, 1)); + } - newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/link->nConns > 1); - if (newPeerIdx == -1) { - INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); - if (link->nConns > 0) - break; - } - linkConn = link->conns+link->nConns; - linkConn->peerIdx = newPeerIdx; - linkConn->connIdx = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : -1); - linkConn->external = false; - - // If the calculated connection does not exist, then we are at the end of the chain and this is the last iteration. - // Depending on the circumstances, we may first need to create that connection. - if (linkConn->connIdx == - 1) { - if (link->nConns == 0) { - if (linkConn->peerIdx != -1) { - INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", - link->direction, (myPeerIdx < linkConn->peerIdx ? "opening new" : "calculated deferred"), - ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) - // to avoid races and the creation of duplicate connections. - if (myPeerIdx < linkConn->peerIdx) { - NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx)); - } - else { // If we didn't initiate the connection, start the timeout. - link->lastUpdatePeersTime = clockNano(); - } - } // if (linkConn->peerIdx != -1) - } else { // link->nConns > 0 - INFO(NCCL_RAS, "RAS link %d: opening new fallback connection %d with %s", - link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &linkConn->connIdx)); - } // link->nConns > 0 - } else { // linkConn->connIdx != -1 - if (link->nConns == 0) { - INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", - link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); - } else { - INFO(NCCL_RAS, "RAS link %d: calculated existing fallback connection %d with %s", - link->direction, link->nConns, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + // Fill in the entry for the primary connection. + linkConn = link->conns; + linkConn->peerIdx = newPeerIdx = rasLinkCalculatePeer(link, myPeerIdx, /*isFallback*/false); + linkConn->conn = (newPeerIdx != -1 ? rasConnFind(&rasPeers[newPeerIdx].addr) : nullptr); + linkConn->external = false; + + if (linkConn->conn == nullptr) { + if (linkConn->peerIdx != -1) { + // We try to initiate primary connections from the side with a lower address (and thus an earlier peer index) + // to avoid races and the creation of duplicate connections. + INFO(NCCL_RAS, "RAS link %d: %s primary connection with %s", + link->direction, (myPeerIdx < linkConn->peerIdx ? 
"opening new" : "calculated deferred"), + ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + if (myPeerIdx < linkConn->peerIdx) { + NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn)); } - } - link->nConns++; - if (linkConn->connIdx == -1) - break; - conn = rasConns+linkConn->connIdx; - - // We check if the connection already went through the fallback calculation; if so, we'll need to create a new - // fallback in the next iteration, to ensure that RAS will keep retrying. - if (!conn->experiencingDelays) - break; + else { // If we didn't initiate the connection, start the timeout. + link->lastUpdatePeersTime = clockNano(); + } + } // if (linkConn->peerIdx != -1) + } else { // linkConn->conn + INFO(NCCL_RAS, "RAS link %d: calculated existing primary connection with %s", + link->direction, ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine)); + } // linkConn->conn + if (linkConn->conn && linkConn->conn->experiencingDelays) { INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", - conn->experiencingDelays, (clockNano()-conn->startRetryTime)/1e9, - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + linkConn->conn->experiencingDelays, (clockNano()-linkConn->conn->startRetryTime)/1e9, + (linkConn->conn->sock ? linkConn->conn->sock->status : - 1)); + NCCLCHECK(rasLinkAddFallback(link, linkConn->conn)); } return ncclSuccess; @@ -701,39 +676,37 @@ int rasLinkCalculatePeer(const struct rasLink* link, int peerIdx, bool isFallbac if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) { // peerIdx is a fallback and it is not running on the same node as us. int tryPeerIdx = newPeerIdx; - int tryConnIdx = -1; + struct rasConnection* tryConn = nullptr; // Try to skip the remaining peers on the same node as peerIdx. We may end up skipping over some peers that // are alive, which is fine -- they will still have connectivity with the rest of the RAS network, just a // little suboptimal one. while (ncclSocketsSameNode(&rasPeers[tryPeerIdx].addr, &rasPeers[peerIdx].addr)) { if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) { - tryConnIdx = rasConnFind(&rasPeers[tryPeerIdx].addr); - if (tryConnIdx != -1) { - struct rasConnection* tryConn = rasConns+tryConnIdx; + tryConn = rasConnFind(&rasPeers[tryPeerIdx].addr); + if (tryConn) { // Check if the connection is fully established and operational, i.e., if the underlying socket // is ready and there's been recent communication on it. - if (tryConn->sockIdx != -1 && rasSockets[tryConn->sockIdx].status == RAS_SOCK_READY && - !tryConn->experiencingDelays) { + if (tryConn->sock && tryConn->sock->status == RAS_SOCK_READY && !tryConn->experiencingDelays) { // We convinced ourselves that the node is not down. We don't adjust newPeerIdx in // this case. This is the only case when tryConnIdx != -1 after this loop. 
break; } - } // if (tryConnIdx != -1) + } // if (tryConn) } // if (!rasPeerIsDead(&rasPeers[tryPeerIdx].addr)) - tryConnIdx = -1; - tryPeerIdx = (tryPeerIdx + nRasPeers + link->direction) % nRasPeers; + tryConn = nullptr; + tryPeerIdx = (tryPeerIdx + link->direction + nRasPeers) % nRasPeers; if (tryPeerIdx == myPeerIdx) break; } - if (tryConnIdx == -1) + if (tryConn == nullptr) newPeerIdx = tryPeerIdx; if (tryPeerIdx == myPeerIdx) break; } // if (isFallback && !ncclSocketsSameNode(&rasPeers[peerIdx].addr, &rasNetListeningSocket.addr)) - + if (rasPeerIsDead(&rasPeers[newPeerIdx].addr)) { newPeerIdx = (newPeerIdx + nRasPeers + link->direction) % nRasPeers; } @@ -932,7 +905,8 @@ bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSock static void rasPeersDump() { for (int p = 0; p < nRasPeers; p++) { const struct rasPeerInfo* peer = rasPeers+p; - INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), (p == myPeerIdx ? " [this process]" : "")); + INFO(NCCL_RAS, "RAS peer %d: %s%s", p, rasPeerDump(peer, rasLine, sizeof(rasLine)), + (p == myPeerIdx ? " [this process]" : "")); } if (nRasPeers > 0) INFO(NCCL_RAS, "RAS peersHash 0x%lx", rasPeersHash); @@ -958,3 +932,17 @@ static char* rasPeerDump(const struct rasPeerInfo* peer, char* result, size_t nr rasGpuDevsToString(peer->cudaDevs, peer->nvmlDevs, line2, sizeof(line2))); return result; } + +// Invoked during RAS termination to release all the allocated resources. +void rasPeersTerminate() { + free(rasPeers); + rasPeers = nullptr; + nRasPeers = 0; + rasPeersHash = 0; + myPeerIdx = -1; + + free(rasDeadPeers); + rasDeadPeers = nullptr; + nRasDeadPeers = rasDeadPeersSize = 0; + rasDeadPeersHash = 0; +} diff --git a/src/ras/ras.cc b/src/ras/ras.cc index 4905d7a69..8ef551c64 100644 --- a/src/ras/ras.cc +++ b/src/ras/ras.cc @@ -4,8 +4,10 @@ * See LICENSE.txt for license information ************************************************************************/ -#define NDEBUG // Comment out during development only! -#include +// Workaround for libstdc++ trying to force public visibility of std:: symbols. We don't want to do that in libnccl.so. +#include +#undef _GLIBCXX_VISIBILITY +#define _GLIBCXX_VISIBILITY(V) #include #include #include @@ -65,8 +67,8 @@ int nNcclComms = 0; bool ncclCommsSorted = false; // Whether the array is currently sorted. We sort by the comms' commHash and rank. 
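For context on the ncclCommsSorted flag above: the RAS thread sorts the ncclComms array lazily, using the ncclCommsCompare callback shown earlier in this patch, so that later lookups by (commHash, rank) can use binary search. The following is a rough, standalone sketch of what that comparator produces, not a verbatim copy of the patch's code; the two-field ncclComm is a stand-in with only the members the comparator reads.

#include <cstdint>
#include <cstdlib>

// Stand-in with only the fields the comparator reads; the real ncclComm is much larger.
struct ncclComm { uint64_t commHash; int rank; };

// Mirrors ncclCommsCompare: nullptr entries sort last, then by commHash, then by rank.
static int commsCompare(const void* p1, const void* p2) {
  const ncclComm* c1 = *(const ncclComm* const*)p1;
  const ncclComm* c2 = *(const ncclComm* const*)p2;
  if (c1 == nullptr || c2 == nullptr) return (c1 != nullptr ? -1 : (c2 != nullptr ? 1 : 0));
  if (c1->commHash != c2->commHash) return (c1->commHash < c2->commHash ? -1 : 1);
  return (c1->rank < c2->rank ? -1 : (c1->rank > c2->rank ? 1 : 0));
}

int main() {
  ncclComm a{42, 1}, b{42, 0}, c{7, 3};
  ncclComm* comms[] = { &a, nullptr, &b, &c };
  // In the RAS thread this sort happens on demand, guarded by ncclCommsSorted.
  qsort(comms, 4, sizeof(comms[0]), commsCompare);   // resulting order: c, b, a, nullptr
  return 0;
}

Pushing nullptr entries to the end lets freed communicator slots be ignored without compacting the array on every change.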
static ncclResult_t rasLocalNotify(const struct rasNotification* msg); -static ncclResult_t rasLocalHandle(); -static void rasLocalHandleTerminate(); +static ncclResult_t rasLocalHandle(bool* terminate); +static void rasThreadCleanup(); static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock); static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct rasSocket* sock); @@ -74,6 +76,8 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock); static void* rasThreadMain(void*); +static void rasTerminate() __attribute__((destructor)); + NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); ////////////////////////////////////////////////// @@ -105,7 +109,6 @@ ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) PTHREADCHECKGOTO(pthread_create(&rasThread, nullptr, &rasThreadMain, nullptr), "pthread_create", ret, fail); ncclSetThreadName(rasThread, "NCCL RAS"); - (void)pthread_detach(rasThread); rasInitialized = true; } @@ -157,18 +160,27 @@ ncclResult_t ncclRasCommFini(const struct ncclComm* comm) { } } } - if (ncclAtomicRefCountDecrement(&rasInitRefCount) == 0) { - struct rasNotification msg; - msg.type = RAS_TERMINATE; - NCCLCHECK(rasLocalNotify(&msg)); - } + ncclAtomicRefCountDecrement(&rasInitRefCount); return ncclSuccess; } +// Global destructor. Notifies the RAS thread to release all the resources +// and terminate. Waits for the thread to terminate. +static void rasTerminate() { + struct rasNotification msg; + if (!rasInitialized) + return; + memset(&msg, '\0', sizeof(msg)); + msg.type = RAS_TERMINATE; + if (rasLocalNotify(&msg) == ncclSuccess) + (void)pthread_join(rasThread, nullptr); +} + // Invoked by regular NCCL threads on every (non-split) comm initialization. Provides info on all the ranks within // the communicator. ncclResult_t ncclRasAddRanks(struct rasRankInit* ranks, int nranks) { struct rasNotification msg; + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_ADD_RANKS; msg.addRanks.ranks = ranks; msg.addRanks.nranks = nranks; @@ -199,7 +211,7 @@ static ncclResult_t rasLocalNotify(const struct rasNotification* msg) { ///////////////////////////////////////////////////////////////////////////////// // Handles asynchronous local notifications arriving from regular NCCL threads. -static ncclResult_t rasLocalHandle() { +static ncclResult_t rasLocalHandle(bool* terminate) { struct rasNotification msg; size_t done = 0; @@ -212,9 +224,11 @@ static ncclResult_t rasLocalHandle() { } if (msg.type == RAS_ADD_RANKS) { - NCCLCHECK(rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks)); + (void)rasLocalHandleAddRanks(msg.addRanks.ranks, msg.addRanks.nranks); + // Not great if the above fails, but it shouldn't be critical; better to keep going. } else if (msg.type == RAS_TERMINATE) { - rasLocalHandleTerminate(); + INFO(NCCL_RAS, "RAS handling local termination request"); + *terminate = true; } else { WARN("RAS received unknown notification type %d", msg.type); return ncclInternalError; @@ -223,10 +237,35 @@ static ncclResult_t rasLocalHandle() { return ncclSuccess; } -// Handles local RAS_TERMINATE notification. -static void rasLocalHandleTerminate() { - INFO(NCCL_RAS, "RAS handling local termination request"); - // For now we don't do anything. +// Cleans up local RAS state, normally in response to a RAS_TERMINATE notification. 
+static void rasThreadCleanup() { + rasClientSupportTerminate(); + rasNetTerminate(); + rasCollectivesTerminate(); + rasPeersTerminate(); + + { + std::lock_guard lock(rasInitMutex); + (void)close(rasNotificationPipe[1]); + (void)close(rasNotificationPipe[0]); + // rasClientListeningSocket is taken care of by rasClientSupportTerminate(). + rasNotificationPipe[0] = rasNotificationPipe[1] = -1; + (void)ncclSocketClose(&rasNetListeningSocket); + rasInitRefCount = 0; + rasInitialized = false; + } + + { + std::lock_guard lock(ncclCommsMutex); + free(ncclComms); + ncclComms = nullptr; + nNcclComms = 0; + ncclCommsSorted = false; + } + + free(rasPfds); + rasPfds = nullptr; + nRasPfds = 0; } @@ -270,10 +309,10 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms else ncclIntruQueueEnqueue(&conn->sendQ, meta); - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - if (sock->status == RAS_SOCK_READY || (sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { - rasPfds[sock->pfd].events |= POLLOUT; + if (conn->sock) { + if (conn->sock->status == RAS_SOCK_READY || + (conn->sock->status == RAS_SOCK_HANDSHAKE && msg->type == RAS_MSG_CONNINIT)) { + rasPfds[conn->sock->pfd].events |= POLLOUT; ready = true; } } @@ -283,31 +322,31 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", msg->type, ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + (conn->sock ? conn->sock->status : -1)); } } // Attempts to send the queued RAS messages to another RAS thread. ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent) { - struct ncclSocket* sock = &rasSockets[conn->sockIdx].sock; struct rasMsgMeta* meta; *closed = 0; while ((meta = ncclIntruQueueHead(&conn->sendQ)) != nullptr) { - if (rasSockets[conn->sockIdx].status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { + if (conn->sock->status == RAS_SOCK_HANDSHAKE && meta->msg.type != RAS_MSG_CONNINIT) { // We don't send anything beyond the handshake at this point. meta = nullptr; break; } if (meta->offset < sizeof(meta->length)) { // Send the length of the message. - NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &meta->length, sizeof(meta->length), &meta->offset, closed)); + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, &meta->length, sizeof(meta->length), + &meta->offset, closed)); if (*closed) return ncclSuccess; if (meta->offset < sizeof(meta->length)) break; } // Send the body of the message. 
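As the message definitions later in this patch note, every RAS message on the wire is a 32-bit length followed by the message body, and meta->offset tracks progress across both parts. A simplified, blocking illustration of that framing is sketched below; it uses plain POSIX write() for clarity, whereas rasConnSendMsg goes through ncclSocketProgress so a partial send can be resumed later from the saved offset.

#include <unistd.h>
#include <cstdint>

// Illustrative blocking sender; 'length' is the message size, excluding the length field itself.
static bool sendFramedBlocking(int fd, const void* msg, int32_t length) {
  if (write(fd, &length, sizeof(length)) != (ssize_t)sizeof(length)) return false;  // length prefix
  if (write(fd, msg, (size_t)length) != (ssize_t)length) return false;              // message body
  return true;
}

The receiving side reads the length first and then knows exactly how many more bytes make up the rasMsg that follows.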
- NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, ((char*)&meta->msg)-sizeof(meta->length), + NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, &conn->sock->sock, ((char*)&meta->msg)-sizeof(meta->length), meta->length+sizeof(meta->length), &meta->offset, closed)); if (*closed) return ncclSuccess; @@ -377,7 +416,7 @@ ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock) { static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSocket* sock) { ncclResult_t ret = ncclSuccess; struct rasConnection* conn = nullptr; - int connIdx, peerIdx; + int peerIdx; struct rasMsg* newMsg = nullptr; int newMsgLen; char line[SOCKET_NAME_MAXLEN+1]; @@ -406,19 +445,16 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc } // Check for any existing connection with that RAS thread (could happen due to a network issue, or possibly a race). - connIdx = rasConnFind(&msg->connInit.listeningAddr); - if (connIdx != -1) { - conn = rasConns+connIdx; - + conn = rasConnFind(&msg->connInit.listeningAddr); + if (conn) { INFO(NCCL_RAS, "RAS found a matching existing connection (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0)); - if (conn->sockIdx != -1) { - struct rasSocket* connSock = rasSockets+conn->sockIdx; + if (conn->sock) { INFO(NCCL_RAS, "RAS found an alternative existing socket (status %d, createTime %.2fs)", - connSock->status, (clockNano()-connSock->createTime)/1e9); + conn->sock->status, (clockNano()-conn->sock->createTime)/1e9); // In general we prefer to keep the newer connection, but "newer" can be a relative term: we may have // a race where both sides attempt to establish a connection at roughly the same time, so the other side's // incoming connection ends up looking newer than the locally-initiated one -- for *both* of them. @@ -433,21 +469,19 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc goto exit; } else { INFO(NCCL_RAS, "RAS keeping the new socket and terminating the existing one"); - rasSocketTerminate(connSock); + rasSocketTerminate(conn->sock); } } - } - if (!conn) { + } else { // conn == nullptr NCCLCHECK(getNewConnEntry(&conn)); memcpy(&conn->addr, &msg->connInit.listeningAddr, sizeof(conn->addr)); - connIdx = conn - rasConns; } sock->status = RAS_SOCK_READY; // rasConnResume will reset any experiencingDelays, startRetryTime, etc. - conn->sockIdx = sock-rasSockets; - sock->connIdx = connIdx; + conn->sock = sock; + sock->conn = conn; memcpy(&sock->sock.addr, &msg->connInit.listeningAddr, sizeof(sock->sock.addr)); // Make sure that the connection is part of the right links forming the RAS network. At this point we only @@ -456,8 +490,8 @@ static ncclResult_t rasMsgHandleConnInit(const struct rasMsg* msg, struct rasSoc // Note: it's possible for peerIdx to be -1 at this point if, due to races, the connInit arrives before // the peers update. if (peerIdx != -1) { - (void)rasLinkUpdateConn(&rasNextLink, connIdx, peerIdx); - (void)rasLinkUpdateConn(&rasPrevLink, connIdx, peerIdx); + (void)rasLinkConnUpdate(&rasNextLink, conn, peerIdx); + (void)rasLinkConnUpdate(&rasPrevLink, conn, peerIdx); } // Send a confirmation to the server that requested the connection (so that the resilience code can mark @@ -504,12 +538,13 @@ static ncclResult_t rasMsgHandleConnInitAck(const struct rasMsg* msg, struct ras } // Handles the deadPeer broadcast. 
-void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone) { - INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&req->deadPeer.addr, rasLine)); +void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone) { + INFO(NCCL_RAS, "RAS handling deadPeer (addr %s)", ncclSocketToString(&(*pReq)->deadPeer.addr, rasLine)); - if (!rasPeerIsDead(&req->deadPeer.addr)) { - rasConnDisconnect(&req->deadPeer.addr); - (void)rasPeerDeclareDead(&req->deadPeer.addr); + *pReqLen = rasCollDataLength(RAS_BC_DEADPEER); + if (!rasPeerIsDead(&(*pReq)->deadPeer.addr)) { + rasConnDisconnect(&(*pReq)->deadPeer.addr); + (void)rasPeerDeclareDead(&(*pReq)->deadPeer.addr); *pDone = false; } else { INFO(NCCL_RAS, "RAS already knew it was dead"); @@ -530,6 +565,7 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock) { INFO(NCCL_RAS, "RAS sending NACK to %s", ncclSocketToString(&sock->sock.addr, rasLine)); + memset(&msg, '\0', sizeof(msg)); msg.type = RAS_MSG_CONNINITACK; msg.connInitAck.nack = 1; offset = 0; @@ -557,16 +593,16 @@ static void* rasThreadMain(void*) { INFO(NCCL_RAS, "RAS thread started"); // Initialize the global pollfd with the file descriptors we already have (the pipe and the listening socket). - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasNotificationPipe[0]; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); - NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); + NCCLCHECKGOTO(ncclSocketGetFd(&rasNetListeningSocket, &rasNetListeningSocketFd), ret, exit); rasPfds[pfd].fd = rasNetListeningSocketFd; rasPfds[pfd].events = POLLIN; - NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, fail); + NCCLCHECKGOTO(rasGetNewPollEntry(&pfd), ret, exit); rasPfds[pfd].fd = rasClientListeningSocket; rasPfds[pfd].events = POLLIN; @@ -595,32 +631,37 @@ static void* rasThreadMain(void*) { if (rasPfds[pollIdx].revents) { nEvents--; if (rasPfds[pollIdx].fd == rasNotificationPipe[0]) { - (void)rasLocalHandle(); + bool terminate = false; + NCCLCHECKGOTO(rasLocalHandle(&terminate), ret, exit); + if (terminate) + goto exit; } else if (rasPfds[pollIdx].fd == rasNetListeningSocketFd) { (void)rasNetAcceptNewSocket(); } else if (rasPfds[pollIdx].fd == rasClientListeningSocket) { (void)rasClientAcceptNewSocket(); } else { // Check if it's one of the RAS sockets. - int sockIdx; - for (sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { - struct rasSocket* sock = rasSockets+sockIdx; - if (sock->status != RAS_SOCK_CLOSED && rasPfds[pollIdx].fd == sock->sock.fd) { - rasSockEventLoop(sockIdx, pollIdx); + struct rasSocket* sock; + for (sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + if (rasPfds[pollIdx].fd == sock->sock.fd) { + rasSockEventLoop(sock, pollIdx); break; } - } // for (sockIdx) + sock = sockNext; + } // for (sock) - if (sockIdx == nRasSockets) { + if (sock == nullptr) { // Try a client socket instead. 
- for (int clientIdx = 0; clientIdx < nRasClients; clientIdx++) { - struct rasClient* client = rasClients+clientIdx; - if (client->status != RAS_CLIENT_CLOSED && rasPfds[pollIdx].fd == client->sock) { - rasClientEventLoop(clientIdx, pollIdx); + for (struct rasClient* client = rasClientsHead; client;) { + struct rasClient* clientNext = client->next; + if (rasPfds[pollIdx].fd == client->sock) { + rasClientEventLoop(client, pollIdx); break; } - } // for (clientIdx) - } // if (sockIdx == nRasSockets) + client = clientNext; + } // for (client) + } // if (sock == nullptr) } // dynamic fds } // if (revents) } // for (pollIdx) @@ -636,14 +677,9 @@ static void* rasThreadMain(void*) { rasCollsHandleTimeouts(now, &nextWakeup); } // for (;;) -fail: - WARN("fatal error - RAS thread terminating"); - std::lock_guard lock(rasInitMutex); - (void)close(rasNotificationPipe[1]); - (void)close(rasNotificationPipe[0]); - (void)close(rasClientListeningSocket); - (void)ncclSocketClose(&rasNetListeningSocket); - rasInitialized = false; +exit: + rasThreadCleanup(); + INFO(NCCL_RAS, "RAS thread terminating"); return nullptr; } diff --git a/src/ras/ras_internal.h b/src/ras/ras_internal.h index 715fff4a4..17326c342 100644 --- a/src/ras/ras_internal.h +++ b/src/ras/ras_internal.h @@ -42,6 +42,14 @@ typedef enum { RAS_COLL_COMMS = 1002, // Collect data about all communicators. } rasCollectiveType; +// Unique communicator identifier. commHash by itself is definitely not guaranteed to be unique. +// Combined with the two other hashes, the chance is much better... +// All three fields are used for sorting. +struct rasCommId { + uint64_t commHash; + uint64_t hostHash, pidHash; // These are the hashes of the *first* rank (comm->peerInfo[0]). +}; + // Payload of a collective request message (RAS_MSG_COLLREQ). struct rasCollRequest { union ncclSocketAddress rootAddr; @@ -56,6 +64,10 @@ struct rasCollRequest { struct { } conns; struct { + int nSkipMissingRanksComms; // Number of elements in the array below. + // Communicators for which we do *not* need the missingRanks data in the responses + // (see struct rasCollCommsMissingRank later). + struct rasCommId skipMissingRanksComms[0]; // Variable length, sorted. } comms; }; }; @@ -69,8 +81,8 @@ struct rasCollResponse { int nPeers; int nData; // Size of data in bytes. union ncclSocketAddress peers[0]; // Variable length. - // The peersAddrs array is followed by: - //alignas(int64_t) char data[0]; // Variable length, collective-dependent. + // The peers array is followed by: + // alignas(int64_t) char data[0]; // Variable length, collective-dependent. }; // Describes a peer NCCL process. Every RAS thread keeps an (identical) array of them, one entry for each @@ -80,6 +92,8 @@ struct rasPeerInfo { pid_t pid; uint64_t cudaDevs; // Bitmask. This is for local devices so 64 bits is enough. uint64_t nvmlDevs; // Same, but not affected by CUDA_VISIBLE_DEVICES. + uint64_t hostHash, pidHash; // Taken from ncclComm, but with the commHash subtracted to make it + // communicator-independent. }; // Describes a RAS message. Every message is preceded by a (32-bit) message length. All data in the host @@ -112,7 +126,7 @@ struct rasMsg { int nPeers; int nDeadPeers; struct rasPeerInfo peers[0]; // Variable length. - // The peers array is followed by the following: + // The peers array is followed by: //union ncclSocketAddress deadPeers[0]; // Variable length. 
} peersUpdate; struct { @@ -218,6 +232,9 @@ struct rasMsgMeta { // Describes an ongoing collective RAS operation (apart from broadcasts, which don't need a response). // For every collective operation, each participating RAS thread will create its own. struct rasCollective { + struct rasCollective* next; + struct rasCollective* prev; + union ncclSocketAddress rootAddr; uint64_t rootId; @@ -227,15 +244,16 @@ struct rasCollective { bool timeoutWarned; int64_t startTime; // For timeout calculations. - int fromConnIdx; // The connection we received the request from. + struct rasConnection* fromConn; // The connection we received the request from. - int* fwdConns; // Indices of the connections we forwarded the request to; replaced by -1 as the responses arrive. + struct rasConnection** fwdConns; // Connections we forwarded the request to; replaced by nullptr's as the + // responses arrive. int nFwdSent; // Count of the above (local process only). int nFwdRecv; // Count of the responses received or timeouts (local process only). int nLegTimeouts; // Collective (from this process and the responses we received). - union ncclSocketAddress* peers; // Collective (from this process and the responses we received). + union ncclSocketAddress* peers; // Collective (from this process and the responses we received). Unsorted. int nPeers; char* data; // Collective (from this process and the responses we received). @@ -261,13 +279,14 @@ struct rasCollConns { struct rasCollComms { int nComms; struct comm { - uint64_t commHash; - int commNRanks; - int nRanks; // number of elements in the array below, *not* in the communicator. + struct rasCommId commId; + int commNRanks; // >= nRanks + nMissingRanks + int nRanks; // Number of elements in the ranks array below, *not* in the communicator. + int nMissingRanks; // Number of elements in the missingRanks array below. struct rank { int commRank; int peerIdx; // Index within rasCollective->peers, *not* rasPeers. - uint64_t collOpCount; + uint64_t collOpCounts[NCCL_NUM_FUNCTIONS]; struct { ncclResult_t initState:4; ncclResult_t asyncError:4; @@ -278,34 +297,47 @@ struct rasCollComms { char cudaDev; char nvmlDev; } ranks[0]; // Variable length. Sorted by commRank. Optimized for 1 GPU/process. - } comms[0]; // Variable length. Sorted by commHash. + // The ranks array is followed by: + // struct rasCollCommsMissingRank missingRanks[0]; // Variable length. Sorted by commRank. + } comms[0]; // Variable length. Sorted by commId. +}; + +// Provides info about missing ranks. An array of these structures can be part of struct rasCollComms above. +// Because the arrays are of variable length, we can't describe them in C. To ensure that adding +// rasCollCommsMissingRank structures doesn't mess up the alignment, we explicitly request one. +struct alignas(struct rasCollComms) rasCollCommsMissingRank { + int commRank; + union ncclSocketAddress addr; + // We don't need pid here as we can look it up in rasPeers via addr. + char cudaDev; + char nvmlDev; }; // Holds data needed to keep track of a connection belonging to a RAS network link (either the primary one // or one of the fallbacks). struct rasLinkConn { + struct rasLinkConn* next; int peerIdx; // Index in the rasPeers array of the peer this entry describes. Could be -1 (an entry initiated // by an as of yet unknown peer -- should be a temporary situation that resolves via peer updates). - int connIdx; // Index in the rasConns array of the connection to the above peer. 
Could be -1 (a placeholder - // for a connection to be started by the remote peer). + struct rasConnection* conn; // The connection to the above peer. Could be nullptr (a placeholder for a connection + // to be started by the remote peer). bool external; // true if the entry exists only due to an external request (requested by a remote peer, most // likely as part of fault recovery). Such connections are kept as fallbacks even if there's a // valid primary connection, in order to ensure that keep-alive messages are sent. }; // Describes a link that forms the backbone of the RAS network. Links focus on direction (previous/next in -// case of 1-D topology) rather than a particular destination. The are implemented using rasConnections, but +// case of 1-D topology) rather than a particular destination. They are implemented using rasConnections, but // they are persistent through the life of the RAS threads, whereas rasConnections can be terminated if the RAS // network is reconfigured or a peer dies. struct rasLink { int direction; // 1 for nextLink, -1 for prevLink. - // Index 0 is the primary connection; any additional ones are fallbacks (that get created if we are having - // problems with the primary connection). The elements are de-facto ordered (highest-preference ones have - // the lowest indices). + // First element is the primary connection; any additional ones are fallbacks (that get created if we are having + // problems with the primary connection). The highest-preference elements come first; the list is de-facto sorted + // by peerIdx, though peerIdx values can wrap around (given the ring/torus topology) and they can also be -1 + // (the latter are stored at the end). struct rasLinkConn* conns; - int nConns; - int connsSize; // Array size; could be larger than nConns. // Keep track of a timeout in case we did not create a connection during the last peers update (because we expect // the peer on the other side to do so) but that peer failed to initiate. @@ -315,15 +347,15 @@ struct rasLink { // Describes a connection to another peer on the RAS network. It is meant to be more persistent than a volatile // socket (described by the rasSocket structure), which can be affected by transient network issues. struct rasConnection { - bool inUse; + struct rasConnection* next; + struct rasConnection* prev; union ncclSocketAddress addr; - // Index of the current rasSocket in the rasSockets array. Note that multiple rasSocket entries may point back + // Pointer to the current rasSocket. Note that multiple rasSocket entries may point back // to a single entry here, for sockets that are in the process of being terminated and re-established. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. - // -1 if there is no such socket. - int sockIdx; + // nullptr if there is no such socket. + struct rasSocket* sock; // We keep the rasPeersHash of remote connections to minimize the number of needless exchanges. // There is a subtle difference in the meaning of lastSentPeersHash and lastRecvPeersHash. @@ -371,16 +403,18 @@ typedef enum { // Describes a socket implementing communication between two peers. struct rasSocket { + struct rasSocket* next; + struct rasSocket* prev; + struct ncclSocket sock; rasSocketStatus status; int pfd; // Index in the rasPfds array. - // Index of the corresponding entry in the rasConns array. - // We use indices, not pointers, because the arrays holding these structures can be re-alloced at run time. 
- // -1 if there is no connection (normal condition on the accept side before the connInit message). - int connIdx; + // Pointer to the corresponding entry in the rasConns array. + // nullptr if there is no connection (a normal condition on the accept side before the connInit message). + struct rasConnection* conn; int64_t createTime; int64_t lastSendTime; @@ -404,7 +438,10 @@ typedef enum { // Describes a RAS client. struct rasClient { - int sock; + struct rasClient* next; + struct rasClient* prev; + + int sock; // File descriptor rasClientStatus status; @@ -420,7 +457,7 @@ struct rasClient { int64_t timeout; // State stored during asynchronous operations such as collectives. - int collIdx; // Index to the onging rasCollective. + struct rasCollective* coll; }; @@ -440,31 +477,33 @@ void rasConnEnqueueMsg(struct rasConnection* conn, struct rasMsg* msg, size_t ms ncclResult_t rasConnSendMsg(struct rasConnection* conn, int* closed, bool* allSent); ncclResult_t rasMsgRecv(struct rasSocket* sock, struct rasMsg** msg, int* closed); ncclResult_t rasMsgHandle(struct rasMsg* msg, struct rasSocket* sock); -void rasMsgHandleBCDeadPeer(const struct rasCollRequest* req, bool* pDone); +void rasMsgHandleBCDeadPeer(struct rasCollRequest** pReq, size_t* pReqLen, bool* pDone); ncclResult_t rasGetNewPollEntry(int* index); // rasnet.cc extern struct rasLink rasNextLink, rasPrevLink; -extern struct rasConnection* rasConns; -extern int nRasConns; -extern struct rasSocket *rasSockets; -extern int nRasSockets; +extern struct rasConnection* rasConnsHead; +extern struct rasConnection* rasConnsTail; +extern struct rasSocket *rasSocketsHead; +extern struct rasSocket *rasSocketsTail; ncclResult_t getNewConnEntry(struct rasConnection** pConn); -ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx); -int rasConnFind(const union ncclSocketAddress* addr); +ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn); +struct rasConnection* rasConnFind(const union ncclSocketAddress* addr); void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup); void rasConnDisconnect(const union ncclSocketAddress* addr); ncclResult_t rasNetAcceptNewSocket(); void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup); void rasSocketTerminate(struct rasSocket* sock, bool finalize = false, uint64_t startRetryOffset = 0, bool retry = true); -void rasSockEventLoop(int sockIdx, int pollIdx); +void rasSockEventLoop(struct rasSocket* sock, int pollIdx); void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup); ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* sock); -ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external = false, - bool insert = false, bool pretend = false, int* pLinkIdx = nullptr); +ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn); +ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx); +void rasNetTerminate(); + // peers.cc extern struct rasPeerInfo* rasPeers; @@ -483,29 +522,35 @@ ncclResult_t rasPeerDeclareDead(const union ncclSocketAddress* addr); bool rasPeerIsDead(const union ncclSocketAddress* addr); int ncclSocketsCompare(const void* p1, const void* p2); bool ncclSocketsSameNode(const union ncclSocketAddress* a1, const union ncclSocketAddress* a2); +void rasPeersTerminate(); // collectives.cc -extern struct rasCollective* rasCollectives; +extern struct rasCollective* rasCollectivesHead; +extern struct 
rasCollective* rasCollectivesTail; void rasCollReqInit(struct rasCollRequest* req); -ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, size_t reqLen, bool* pAllDone = nullptr, - int* pCollIdx = nullptr, int fromConnIdx = -1); +ncclResult_t rasNetSendCollReq(const struct rasCollRequest* req, bool* pAllDone = nullptr, + struct rasCollective** pColl = nullptr, struct rasConnection* fromConn = nullptr); ncclResult_t rasMsgHandleCollReq(struct rasMsg* msg, struct rasSocket* sock); ncclResult_t rasMsgHandleCollResp(struct rasMsg* msg, struct rasSocket* sock); -void rasCollsPurgeConn(int connIdx); +void rasCollsPurgeConn(struct rasConnection* conn); void rasCollFree(struct rasCollective* coll); void rasCollsHandleTimeouts(int64_t now, int64_t* nextWakeup); +void rasCollectivesTerminate(); + // client_support.cc extern int rasClientListeningSocket; -extern struct rasClient* rasClients; -extern int nRasClients; +extern struct rasClient* rasClientsHead; +extern struct rasClient* rasClientsTail; + ncclResult_t rasClientInitSocket(); ncclResult_t rasClientAcceptNewSocket(); ncclResult_t rasClientResume(struct rasCollective* coll); -void rasClientEventLoop(int clientIdx, int pollIdx); +void rasClientEventLoop(struct rasClient* client, int pollIdx); const char* rasGpuDevsToString(uint64_t cudaDevs, uint64_t nvmlDevs, char* buf, size_t size); +void rasClientSupportTerminate(); #endif // !NCCL_RAS_CLIENT diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc index 441ad192c..43aa042a7 100644 --- a/src/ras/rasnet.cc +++ b/src/ras/rasnet.cc @@ -13,90 +13,106 @@ struct rasLink rasNextLink = {1}, rasPrevLink = {-1}; // Connections on the RAS network. -struct rasConnection* rasConns; -int nRasConns; +struct rasConnection* rasConnsHead; +struct rasConnection* rasConnsTail; // Sockets implementing the RAS network. -struct rasSocket *rasSockets; -int nRasSockets; +struct rasSocket *rasSocketsHead; +struct rasSocket *rasSocketsTail; // Magic file descriptor number when we want poll() to ignore an entry. Anything negative would do, but // I didn't want to use -1 because it has a special meaning for us. 
#define POLL_FD_IGNORE -2 +static void freeConnEntry(struct rasConnection* conn); static void rasConnOpen(struct rasConnection* conn); static ncclResult_t rasConnPrepare(struct rasConnection* conn); static void rasConnTerminate(struct rasConnection* conn); static ncclResult_t getNewSockEntry(struct rasSocket** pSock); +static void freeSockEntry(struct rasSocket* sock); static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup); -static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup); +static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup); static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack = false); -static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx); static void rasConnResume(struct rasConnection* conn); static void rasLinkSanitizeFallbacks(struct rasLink* link); -static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx = -1); -static int rasLinkFindConn(const struct rasLink* link, int connIdx); +static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend = false, + int* pLinkIdx = nullptr, struct rasLinkConn** pLinkConn = nullptr, + bool insert = true); +static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx); +static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external = false); +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx = nullptr); /////////////////////////////////////////////// // Functions related to the RAS connections. // /////////////////////////////////////////////// -// Allocates an entry in the rasConns array, enlarging the array if necessary. +// Allocates a new entry in the rasConnections list. ncclResult_t getNewConnEntry(struct rasConnection** pConn) { struct rasConnection* conn; - int i; - for (i = 0; i < nRasConns; i++) - if (!rasConns[i].inUse) - break; - if (i == nRasConns) { - NCCLCHECK(ncclRealloc(&rasConns, nRasConns, nRasConns+RAS_INCREMENT)); - nRasConns += RAS_INCREMENT; - } - conn = rasConns+i; - memset(conn, '\0', sizeof(*conn)); - conn->inUse = true; - conn->sockIdx = -1; + NCCLCHECK(ncclCalloc(&conn, 1)); + ncclIntruQueueConstruct(&conn->sendQ); conn->travelTimeMin = INT64_MAX; conn->travelTimeMax = INT64_MIN; + if (rasConnsHead) { + rasConnsTail->next = conn; + conn->prev = rasConnsTail; + rasConnsTail = conn; + } else { + rasConnsHead = rasConnsTail = conn; + } + *pConn = conn; return ncclSuccess; } +// Frees an entry from the rasConns list. +static void freeConnEntry(struct rasConnection* conn) { + if (conn == nullptr) + return; + + if (conn == rasConnsHead) + rasConnsHead = rasConnsHead->next; + if (conn == rasConnsTail) + rasConnsTail = rasConnsTail->prev; + if (conn->prev) + conn->prev->next = conn->next; + if (conn->next) + conn->next->prev = conn->prev; + free(conn); +} + // Creates a new RAS network connection to a remote peer address. -ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) { +ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, struct rasConnection** pConn) { ncclResult_t ret = ncclSuccess; - struct rasConnection* conn = nullptr; + struct rasConnection* conn; // First check if a connection entry for this peer already exists. 
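The getNewConnEntry()/freeConnEntry() pair above shows the pattern this patch applies throughout the RAS code: the realloc-ed arrays indexed by connIdx/sockIdx are replaced with doubly-linked lists anchored by head/tail pointers, so every entry keeps a stable address and can be freed individually. A generic, self-contained sketch of the same append/unlink logic follows; the names are illustrative and not taken from the patch.

#include <cstdlib>

struct node { node* next; node* prev; int payload; };
static node* listHead;
static node* listTail;

// Append a zero-initialized entry at the tail, as getNewConnEntry does for rasConns.
static node* listAppend() {
  node* n = (node*)calloc(1, sizeof(node));
  if (n == nullptr) return nullptr;
  if (listHead) { listTail->next = n; n->prev = listTail; listTail = n; }
  else { listHead = listTail = n; }
  return n;
}

// Unlink and free a single entry, as freeConnEntry/freeSockEntry do.
static void listRemove(node* n) {
  if (n == nullptr) return;
  if (n == listHead) listHead = n->next;
  if (n == listTail) listTail = n->prev;
  if (n->prev) n->prev->next = n->next;
  if (n->next) n->next->prev = n->prev;
  free(n);
}

Because entries are individually heap-allocated, pointers such as conn->sock and sock->conn stay valid as other entries come and go, which is exactly the problem the removed "indices, not pointers" comments were working around.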
- int connIdx = rasConnFind(addr); - if (connIdx != -1) { - conn = rasConns+connIdx; - } + conn = rasConnFind(addr); - if (conn && conn->sockIdx != -1) { + if (conn && conn->sock) { // An entry exists and has a socket associated with it -- nothing left for us to do. - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; goto exit; } - if (!conn) { + if (conn == nullptr) { NCCLCHECKGOTO(getNewConnEntry(&conn), ret, exit); memcpy(&conn->addr, addr, sizeof(conn->addr)); // We are establishing a new connection -- start the timeout. conn->startRetryTime = clockNano(); - connIdx = conn - rasConns; } - if (pConnIdx) - *pConnIdx = connIdx; + if (pConn) + *pConn = conn; rasConnOpen(conn); @@ -107,7 +123,7 @@ ncclResult_t rasConnCreate(const union ncclSocketAddress* addr, int* pConnIdx) { // Opens a connection to a remote peer. static void rasConnOpen(struct rasConnection* conn) { ncclResult_t ret; // Not used. - struct rasSocket* sock; + struct rasSocket* sock = nullptr; bool closeSocketOnFail = false; int ready; @@ -120,10 +136,8 @@ static void rasConnOpen(struct rasConnection* conn) { NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); - // We delay the initialization of sockIdx, connIdx and status until this point so that in case of failures - // we don't need to clean them up. - conn->sockIdx = sock-rasSockets; - sock->connIdx = conn-rasConns; + conn->sock = sock; + sock->conn = conn; rasPfds[sock->pfd].fd = sock->sock.fd; // We ignore the possibly ready status of the socket at this point and consider it CONNECTING because @@ -141,6 +155,7 @@ static void rasConnOpen(struct rasConnection* conn) { fail: if (closeSocketOnFail) (void)ncclSocketClose(&sock->sock); + freeSockEntry(sock); goto exit; } @@ -166,16 +181,13 @@ static ncclResult_t rasConnPrepare(struct rasConnection* conn) { } // Searches through rasConns for a connection with a provided address. -int rasConnFind(const union ncclSocketAddress* addr) { - // rasConns is not sorted (given the number of indices, it would be a massive hassle to keep it that way) - // so binary search won't do... - for (int i = 0; i < nRasConns; i++) { - struct rasConnection* conn = rasConns+i; - if (conn->inUse && memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) - return i; +struct rasConnection* rasConnFind(const union ncclSocketAddress* addr) { + for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next) { + if (memcmp(&conn->addr, addr, sizeof(conn->addr)) == 0) + return conn; } - return -1; + return nullptr; } // Handles any connection-related timeouts. Many timeouts affect the underlying sockets and thus have been handled @@ -184,58 +196,56 @@ int rasConnFind(const union ncclSocketAddress* addr) { // This is also where we declare peers as dead, etc. // Invoked from the main RAS event loop. void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int connIdx = 0; connIdx < nRasConns; connIdx++) { - struct rasConnection* conn = rasConns+connIdx; - - if (!conn->inUse) - continue; - - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + if (conn->sock) { bool sockTerminated = false; // Retry the socket connections that have been refused. 
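One detail worth calling out in the rewritten rasConnsHandleTimeouts() loop above: handling a timeout can free the current list entry (for example via rasConnTerminate), so the loop captures conn->next before doing any work on conn. A minimal, standalone sketch of that save-next-then-process idiom, with hypothetical names:

#include <cstdlib>

struct item { item* next; };

// process() may free 'it', so the caller must not touch it afterwards.
static void process(item* it) { free(it); }

static void walkAndMaybeFree(item* head) {
  for (item* it = head; it;) {
    item* itNext = it->next;   // capture before process(), which may free 'it'
    process(it);
    it = itNext;
  }
}

The same idiom appears in rasConnTerminate and rasSocksHandleTimeouts later in this patch, wherever iteration and termination of the same list are interleaved.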
- if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) { - if (now - sock->lastSendTime > RAS_CONNECT_RETRY) { + if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) { + if (now - conn->sock->lastSendTime > RAS_CONNECT_RETRY) { int ready; - if (ncclSocketReady(&sock->sock, &ready) != ncclSuccess) { + if (ncclSocketReady(&conn->sock->sock, &ready) != ncclSuccess) { INFO(NCCL_RAS, "Unexpected error from ncclSocketReady; terminating the socket connection with %s", - ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/true); + ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/true); // We will retry below in the same loop. sockTerminated = true; } else { // We update lastSendTime even if !ready because we need it up-to-date for timeout calculations. - sock->lastSendTime = clockNano(); - if (!ready && sock->sock.state == ncclSocketStateConnecting) - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + conn->sock->lastSendTime = clockNano(); + if (!ready && conn->sock->sock.state == ncclSocketStateConnecting) + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); else - rasPfds[sock->pfd].fd = sock->sock.fd; // Enable the handling via the main loop. + rasPfds[conn->sock->pfd].fd = conn->sock->sock.fd; // Enable the handling via the main loop. } // if (ncclSocketReady) } else { - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_CONNECT_RETRY); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_CONNECT_RETRY); } - } // if (sock->status == RAS_SOCK_CONNECTING && sock->sock.state == ncclSocketStateConnecting) + } // if (conn->sock->status == RAS_SOCK_CONNECTING && conn->sock->sock.state == ncclSocketStateConnecting) // For connections that have data to send but that we've been unable to send a message on for a while, // consider their sockets lost and terminate them. - if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) { - if (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { + if (!sockTerminated && !ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) { + if (now - std::max(conn->sock->lastSendTime, + ncclIntruQueueHead(&conn->sendQ)->enqueueTime) > RAS_STUCK_TIMEOUT) { INFO(NCCL_RAS, "RAS send stuck timeout error (%lds) on socket connection with %s", - (now - std::max(sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / - CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/false, RAS_STUCK_TIMEOUT); + (now - std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)) / + CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/false, RAS_STUCK_TIMEOUT); // We will retry below in the same loop. 
} else { - *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, - ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+RAS_STUCK_TIMEOUT); + *nextWakeup = std::min(*nextWakeup, + std::max(conn->sock->lastSendTime, ncclIntruQueueHead(&conn->sendQ)->enqueueTime)+ + RAS_STUCK_TIMEOUT); } - } // if (!ncclIntruQueueEmpty(&conn->sendQ) && sock->status == RAS_SOCK_READY) - } // if (conn->sockIdx != -1) + } // if (!ncclIntruQueueEmpty(&conn->sendQ) && conn->sock->status == RAS_SOCK_READY) + } // if (conn->sock) // For connections that are being (re-)established, irrespective of whether there's a valid socket associated - // with them (conn->startIdx != -1), we need to check if any connection-level timeout has expired. + // with them, we need to check if any connection-level timeout has expired. if (conn->startRetryTime) { + bool connTerminated = false; // If we've been trying to open a connection for too long (60s), give up and mark the peer as dead // so that we don't try again. if (now - conn->startRetryTime > RAS_PEER_DEAD_TIMEOUT) { @@ -248,82 +258,83 @@ void rasConnsHandleTimeouts(int64_t now, int64_t* nextWakeup) { rasCollReqInit(&bCast); bCast.type = RAS_BC_DEADPEER; memcpy(&bCast.deadPeer.addr, &conn->addr, sizeof(bCast.deadPeer.addr)); - (void)rasNetSendCollReq(&bCast, rasCollDataLength(RAS_BC_DEADPEER)); + (void)rasNetSendCollReq(&bCast); - continue; + connTerminated = true; } else { *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_PEER_DEAD_TIMEOUT); } // RAS_STUCK_TIMEOUT has already been handled in the socket function (we'll pick it up later via - // the conn->sockIdx == -1 test). - - // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try - // to establish fallback connections. - if (now - conn->startRetryTime > RAS_CONNECT_WARN) { - if (!conn->experiencingDelays) { - INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", - (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); - - // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback - // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish - // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. - conn->experiencingDelays = true; - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - // rasConns may have been reallocated by the above calls. - conn = rasConns+connIdx; - - // Stop collectives from waiting for a response over it. - rasCollsPurgeConn(connIdx); - } // if (!conn->experiencingDelays) - } else { - *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN); - } + // the conn->sock == nullptr test). + + if (!connTerminated) { + // We print warnings after the same time as with keep-alive (5s), and we pessimistically immediately try + // to establish fallback connections. + if (now - conn->startRetryTime > RAS_CONNECT_WARN) { + if (!conn->experiencingDelays) { + INFO(NCCL_RAS, "RAS connect timeout warning (%lds) on socket connection with %s", + (now-conn->startRetryTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + + // See if the connection was meant to be a part of a RAS link and if so, try to initiate fallback + // connection(s). At this point, it's mostly just a precaution; we will continue trying to establish + // the primary connection until RAS_PEER_DEAD_TIMEOUT expires. 
+ conn->experiencingDelays = true; + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); + + // Stop collectives from waiting for a response over it. + rasCollsPurgeConn(conn); + } // if (!conn->experiencingDelays) + } else { + *nextWakeup = std::min(*nextWakeup, conn->startRetryTime+RAS_CONNECT_WARN); + } - // If a socket was terminated (or never opened, due to some error), try to open it now. - // We retry once a second. - if (conn->sockIdx == -1) { - if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) { - INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)", - ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, - (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0)); - rasConnOpen(conn); + // If a socket was terminated (or never opened, due to some error), try to open it now. + // We retry once a second. + if (conn->sock == nullptr) { + if (now - conn->lastRetryTime > RAS_CONNECT_RETRY) { + INFO(NCCL_RAS, "RAS trying to reconnect with %s (experiencingDelays %d, startRetryTime %.2fs)", + ncclSocketToString(&conn->addr, rasLine), conn->experiencingDelays, + (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0)); + rasConnOpen(conn); + } + if (conn->sock == nullptr) + *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY); } - if (conn->sockIdx == -1) - *nextWakeup = std::min(*nextWakeup, conn->lastRetryTime+RAS_CONNECT_RETRY); - } + } // if (!connTerminated) } // if (conn->startRetryTime) - } // for (connIdx) + + conn = connNext; + } // for (conn) } // Checks if we have a connection to a given peer and if so, terminates it. The connection is removed from the // RAS links, though fallbacks are initiated if necessary. Typically called just before declaring a peer dead. void rasConnDisconnect(const union ncclSocketAddress* addr) { - int connIdx = rasConnFind(addr); - if (connIdx != -1) { - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - rasLinkDropConn(&rasNextLink, connIdx); - rasLinkDropConn(&rasPrevLink, connIdx); - - rasConnTerminate(rasConns+connIdx); + struct rasConnection* conn = rasConnFind(addr); + if (conn) { + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); + rasLinkConnDrop(&rasNextLink, conn); + rasLinkConnDrop(&rasPrevLink, conn); + + rasConnTerminate(conn); } } // Terminates a connection and frees the rasConns entry. static void rasConnTerminate(struct rasConnection* conn) { - int connIdx = conn - rasConns; - // Make sure there are no lingering rasSockets pointing to it. - for (int i = 0; i < nRasSockets; i++) { - struct rasSocket* sock = rasSockets+i; - if (sock->status != RAS_SOCK_CLOSED && sock->connIdx == connIdx) + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + if (sock->conn == conn) rasSocketTerminate(sock, /*finalize*/true); + sock = sockNext; } // Also check any ongoing collectives. - rasCollsPurgeConn(connIdx); + rasCollsPurgeConn(conn); while (struct rasMsgMeta* meta = ncclIntruQueueTryDequeue(&conn->sendQ)) { free(meta); @@ -331,8 +342,7 @@ static void rasConnTerminate(struct rasConnection* conn) { INFO(NCCL_RAS, "RAS terminating a connection with %s", ncclSocketToString(&conn->addr, rasLine)); - conn->inUse = false; - conn->sockIdx = -1; // Should be that way already, but just to be extra sure... 
+ freeConnEntry(conn); } @@ -344,7 +354,7 @@ static void rasConnTerminate(struct rasConnection* conn) { // corresponding rasConnection can't be established without knowing the peer's address. ncclResult_t rasNetAcceptNewSocket() { ncclResult_t ret = ncclSuccess; - struct rasSocket* sock; + struct rasSocket* sock = nullptr; int ready; bool socketInitialized = false; NCCLCHECKGOTO(getNewSockEntry(&sock), ret, fail); @@ -370,91 +380,98 @@ ncclResult_t rasNetAcceptNewSocket() { fail: if (socketInitialized) NCCLCHECK(ncclSocketClose(&sock->sock)); + freeSockEntry(sock); goto exit; } -// Returns the index of the first available entry in the rasConns array, enlarging the array if necessary. +// Allocates a new entry in the rasSockets list. static ncclResult_t getNewSockEntry(struct rasSocket** pSock) { struct rasSocket* sock; - int i; - for (i = 0; i < nRasSockets; i++) - if (rasSockets[i].status == RAS_SOCK_CLOSED) - break; - if (i == nRasSockets) { - NCCLCHECK(ncclRealloc(&rasSockets, nRasSockets, nRasSockets+RAS_INCREMENT)); - nRasSockets += RAS_INCREMENT; - } - sock = rasSockets+i; - memset(sock, '\0', sizeof(*sock)); + NCCLCHECK(ncclCalloc(&sock, 1)); + sock->pfd = -1; - sock->connIdx = -1; sock->createTime = sock->lastSendTime = sock->lastRecvTime = clockNano(); + if (rasSocketsHead) { + rasSocketsTail->next = sock; + sock->prev = rasSocketsTail; + rasSocketsTail = sock; + } else { + rasSocketsHead = rasSocketsTail = sock; + } + *pSock = sock; return ncclSuccess; } +// Frees an entry from the rasSockets list. +static void freeSockEntry(struct rasSocket* sock) { + if (sock == nullptr) + return; + + if (sock == rasSocketsHead) + rasSocketsHead = rasSocketsHead->next; + if (sock == rasSocketsTail) + rasSocketsTail = rasSocketsTail->prev; + if (sock->prev) + sock->prev->next = sock->next; + if (sock->next) + sock->next->prev = sock->prev; + free(sock); +} + // Invoked from the main RAS event loop to handle RAS socket timeouts. void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) { - for (int sockIdx = 0; sockIdx < nRasSockets; sockIdx++) { - struct rasSocket* sock = rasSockets+sockIdx; + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; - if (sock->status == RAS_SOCK_CLOSED) - continue; - - // For socket connections that are still being established, give up on the ones that take too long to initialize. if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) { + // For socket connections that are still being established, give up on the ones that take too long to initialize. if (now - sock->createTime > RAS_STUCK_TIMEOUT) { - if (sock->connIdx == -1) { + if (sock->conn == nullptr) { INFO(NCCL_RAS, "RAS init timeout error (%lds) on incoming socket connection from %s", (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); } else { - struct rasConnection* conn = rasConns+sock->connIdx; INFO(NCCL_RAS, "RAS init timeout error (%lds) on socket connection with %s " "(experiencingDelays %d, startRetryTime %.2fs, socket status %d)", (now-sock->createTime)/CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine), - conn->experiencingDelays, (conn->startRetryTime ? (now-conn->startRetryTime)/1e9 : 0.0), - sock->status); + sock->conn->experiencingDelays, + (sock->conn->startRetryTime ? (now-sock->conn->startRetryTime)/1e9 : 0.0), sock->status); } rasSocketTerminate(sock, /*finalize*/true); // We may retry later. 
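
getNewSockEntry and freeSockEntry above replace the old index-based rasSockets array with a head/tail doubly-linked list (the rasConns list freed via freeConnEntry follows the same scheme), so entries keep stable addresses and can be released individually. The list manipulation, reduced to a generic sketch with a hypothetical Node type:

    #include <cstdlib>

    struct Node { Node* prev; Node* next; };   // stand-in for rasSocket/rasConnection
    static Node* head = nullptr;
    static Node* tail = nullptr;

    // Append a freshly calloc'ed node at the tail, as getNewSockEntry does.
    static void listAppend(Node* n) {
      n->prev = tail; n->next = nullptr;
      if (tail) tail->next = n; else head = n;
      tail = n;
    }

    // Unlink a node and free it, mirroring freeSockEntry above.
    static void listRemove(Node* n) {
      if (n == nullptr) return;
      if (n == head) head = n->next;
      if (n == tail) tail = n->prev;
      if (n->prev) n->prev->next = n->next;
      if (n->next) n->next->prev = n->prev;
      free(n);
    }
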
- continue; } else { *nextWakeup = std::min(*nextWakeup, sock->createTime+RAS_STUCK_TIMEOUT); } - } // if (sock->status == RAS_SOCK_CONNECTING || sock->status == RAS_SOCK_HANDSHAKE) - - // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long. - if (sock->status == RAS_SOCK_TERMINATING) { + } else if (sock->status == RAS_SOCK_TERMINATING) { + // For sockets that are being terminated, force finalization of the ones that haven't made progress in too long. if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_STUCK_TIMEOUT) { INFO(NCCL_RAS, "RAS termination stuck timeout error (%lds) on socket connection with %s", (now-std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock, /*finalize*/true); // This socket is presumably already being re-established, if needed. - continue; } else { *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_STUCK_TIMEOUT); } - } // if (sock->status == RAS_SOCK_TERMINATING) - - // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything - // important due to shorter timeouts on RAS network connections, but in case of weird situations like process - // suspend, rasSocketTerminate will do additional checking. - if (sock->status == RAS_SOCK_READY) { + } else if (sock->status == RAS_SOCK_READY) { + // Terminate sockets that haven't been used in a good while. In principle this shouldn't trigger for anything + // important due to shorter timeouts on RAS network connections, but in case of weird situations like process + // suspend, rasSocketTerminate will do additional checking. if (now - std::max(sock->lastSendTime, sock->lastRecvTime) > RAS_IDLE_TIMEOUT) { INFO(NCCL_RAS, "RAS idle timeout (%lds) on socket connection with %s", (now - std::max(sock->lastSendTime, sock->lastRecvTime)) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock, /*finalize*/false, /*startRetryOffset*/0, /*retry*/false); - continue; // The RAS network timeout handler will terminate the conn it was associated with, if any. } else { *nextWakeup = std::min(*nextWakeup, std::max(sock->lastSendTime, sock->lastRecvTime)+RAS_IDLE_TIMEOUT); } } // if (sock->status == RAS_SOCK_READY) - } // for (sockIdx) + + sock = sockNext; + } // for (sock) } // Handles the termination of a RAS socket. @@ -464,19 +481,19 @@ void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) { // For not fully established sockets, we can terminate immediately as there's no useful data to extract. void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) { assert(sock->status != RAS_SOCK_CLOSED); - if (sock->connIdx != -1) { - struct rasConnection* conn = rasConns+sock->connIdx; - // If the sockIdx of the connection points back to us, it means that we are the current socket of this + if (sock->conn) { + struct rasConnection* conn = sock->conn; + // If the sock of the connection points back to us, it means that we are the current socket of this // connection, so we have additional work to do before we can terminate it. - if (conn->sockIdx == sock-rasSockets) { + if (conn->sock == sock) { // Reset it to indicate there's no valid socket associated with that connection anymore. 
- conn->sockIdx = -1; + conn->sock = nullptr; // Don't attempt to retry on sockets that have been unused for so long that the remote peer probably // deliberately closed them. Make an exception for sockets that are part of the RAS network links. if ((retry && clockNano() - std::max(sock->lastSendTime, sock->lastRecvTime) < RAS_IDLE_TIMEOUT - RAS_IDLE_GRACE_PERIOD) || - rasLinkFindConn(&rasNextLink, sock->connIdx) != -1 || rasLinkFindConn(&rasPrevLink, sock->connIdx) != -1) { + rasLinkConnFind(&rasNextLink, sock->conn) || rasLinkConnFind(&rasPrevLink, sock->conn)) { // For connections that were fine until now, the connection-level timeout starts at termination, and possibly // even earlier, depending on what event trigerred the termination -- if it was another timeout expiring, then // we need to include that timeout as well. @@ -507,11 +524,11 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet } // if (retry) // Stop collectives from waiting for a response over this connection. - rasCollsPurgeConn(sock->connIdx); - } // if (conn->sockIdx == sock-rasSockets) - } // if (sock->connIdx != -1) + rasCollsPurgeConn(sock->conn); + } // if (conn->sock == sock) + } // if (sock->conn) - if (sock->status != RAS_SOCK_CONNECTING && sock->connIdx != -1 && !finalize && (rasPfds[sock->pfd].events & POLLIN)) { + if (sock->status != RAS_SOCK_CONNECTING && sock->conn && !finalize && (rasPfds[sock->pfd].events & POLLIN)) { if (sock->status != RAS_SOCK_TERMINATING) { // The receiving side is still open -- close just the sending side. (void)ncclSocketShutdown(&sock->sock, SHUT_WR); @@ -525,20 +542,15 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet } else { // Either the caller requested finalization or we cannot receive on it. (void)ncclSocketClose(&sock->sock); - sock->status = RAS_SOCK_CLOSED; rasPfds[sock->pfd].fd = -1; rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0; - sock->pfd = sock->connIdx = -1; - sock->recvOffset = sock->recvLength = 0; free(sock->recvMsg); - sock->recvMsg = nullptr; + freeSockEntry(sock); } } // Handles a ready socket FD from the main event loop. -void rasSockEventLoop(int sockIdx, int pollIdx) { - struct rasSocket* sock = rasSockets+sockIdx; - +void rasSockEventLoop(struct rasSocket* sock, int pollIdx) { if (sock->status == RAS_SOCK_CONNECTING) { int ready; // Socket is not yet fully established. Continue the OS or NCCL-level handshake. @@ -554,15 +566,15 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { (connectSide ? sock->lastSendTime : sock->lastRecvTime) = clockNano(); sock->status = RAS_SOCK_HANDSHAKE; if (connectSide) { - assert(sock->connIdx != -1); - if (rasConns[sock->connIdx].sockIdx == sockIdx) { - if (rasConnPrepare(rasConns+sock->connIdx) != ncclSuccess) { + assert(sock->conn); + if (sock->conn->sock == sock) { + if (rasConnPrepare(sock->conn) != ncclSuccess) { INFO(NCCL_RAS, "RAS unexpected error from rasConnPrepare; terminating the socket connection with %s", ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock); // We may retry further down. } - } else { + } else { // sock->conn->sock != sock // The connection this socket is associated with no longer considers it to be the current one. // This could possibly happen due to a race condition. Simply terminate it. 
INFO(NCCL_RAS, "RAS connected with %s via a socket that's no longer current!", @@ -581,10 +593,9 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { if (sock->status != RAS_SOCK_TERMINATING && (rasPfds[pollIdx].revents & POLLOUT)) { int closed = 0; bool allSent = false; - assert(sock->connIdx != -1); - struct rasConnection* conn = rasConns+sock->connIdx; - assert(conn->sockIdx == sockIdx); - if (rasConnSendMsg(conn, &closed, &allSent) != ncclSuccess) { + assert(sock->conn); + assert(sock->conn->sock == sock); + if (rasConnSendMsg(sock->conn, &closed, &allSent) != ncclSuccess) { INFO(NCCL_RAS, "RAS unexpected error from rasConnSendMsg; terminating the socket connection with %s", ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock); @@ -612,9 +623,9 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { // We may retry further down. } else if (closed) { const char* socketType; - if (sock->connIdx == -1) + if (sock->conn == nullptr) socketType = "incoming"; - else if (rasConns[sock->connIdx].sockIdx != sockIdx) + else if (sock->conn->sock != sock) socketType = "old"; else if (sock->status == RAS_SOCK_HANDSHAKE) socketType = "new"; @@ -624,25 +635,21 @@ void rasSockEventLoop(int sockIdx, int pollIdx) { socketType, ncclSocketToString(&sock->sock.addr, rasLine)); rasSocketTerminate(sock, /*finalize*/true); // We may retry further down. - } else { + } else { // !closed sock->lastRecvTime = clockNano(); if (msg) { (void)rasMsgHandle(msg, sock); free(msg); - // Message handlers can terminate a socket in certain cases; we need to check for - // that here so that we don't try to receive from a closed socket. - // No handlers are currently believed to create new sockets but better to be safe than sorry - // and re-init the sock variable. - sock = rasSockets+sockIdx; - if (sock->status == RAS_SOCK_CLOSED) + // Message handlers can terminate a socket in various cases. We re-check rasPfds.events to ensure that + // this hasn't happened here (rasSocketTerminate will reset it when finalizing a socket). + if (!(rasPfds[pollIdx].revents & POLLIN)) break; } - if (sock->connIdx != -1) { - struct rasConnection* conn = rasConns+sock->connIdx; - if (conn->sockIdx == sockIdx && (conn->startRetryTime || conn->experiencingDelays)) - rasConnResume(conn); + if (sock->conn) { + if (sock->conn->sock == sock && (sock->conn->startRetryTime || sock->conn->experiencingDelays)) + rasConnResume(sock->conn); } - } + } // !closed } while (msg); } // if (POLLIN) } // RAS_SOCK_HANDSHAKE || RAS_SOCK_READY || RAS_SOCK_TERMINATING @@ -658,109 +665,95 @@ void rasNetHandleTimeouts(int64_t now, int64_t* nextWakeup) { // A connection can belong to multiple links but, when it comes to various timeouts, we want to handle each // connection just once. We solve that with a simple flag within a connection. This also allows us to distinguish // connections that are part of a link from those that are not. 
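
That flag-based pass boils down to: clear linkFlag on every connection, let the two link walks mark the connections they own, then terminate whatever is left unmarked once it has nothing to send. A simplified sketch (Conn is a stand-in for rasConnection; the link walks are only hinted at):

    // Stand-in type; only the fields needed for the sketch.
    struct Conn { Conn* next; bool linkFlag; bool idle; };

    static void handleNetTimeouts(Conn* connsHead) {
      // 1. Clear the per-connection flag.
      for (Conn* c = connsHead; c; c = c->next) c->linkFlag = false;
      // 2. Walking rasNextLink/rasPrevLink sets linkFlag on every connection they reference
      //    (done by rasLinkHandleNetTimeouts in the code below).
      // 3. Anything still unflagged belongs to no link; drop it once it is idle.
      for (Conn* c = connsHead; c;) {
        Conn* next = c->next;            // save the successor before the node may be freed
        if (!c->linkFlag && c->idle) { /* rasConnTerminate(c) would free c here */ }
        c = next;
      }
    }
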
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++)
-    rasConns[connIdx].linkFlag = false;
+  for (struct rasConnection* conn = rasConnsHead; conn; conn = conn->next)
+    conn->linkFlag = false;
 
   (void)rasLinkHandleNetTimeouts(&rasNextLink, now, nextWakeup);
   (void)rasLinkHandleNetTimeouts(&rasPrevLink, now, nextWakeup);
 
-  for (int connIdx = 0; connIdx < nRasConns; connIdx++) {
-    struct rasConnection* conn = rasConns+connIdx;
-    if (conn->inUse && !conn->linkFlag) {
+  for (struct rasConnection* conn = rasConnsHead; conn;) {
+    struct rasConnection* connNext = conn->next;
+    if (!conn->linkFlag) {
       // The connection is not part of any link. Check if it should be terminated.
-      if (conn->sockIdx == -1 && ncclIntruQueueEmpty(&conn->sendQ)) {
+      if (conn->sock == nullptr && ncclIntruQueueEmpty(&conn->sendQ))
         rasConnTerminate(conn);
-        continue;
-      }
     }
+    conn = connNext;
   }
 }
 
 // Checks for and handles timeouts at the link level; primarily the keep-alives for link connections.
 static ncclResult_t rasLinkHandleNetTimeouts(struct rasLink* link, int64_t now, int64_t* nextWakeup) {
-  for (int i = 0; i < link->nConns; i++) {
-    struct rasLinkConn* linkConn = link->conns+i;
-    if (linkConn->connIdx != -1) {
-      if (!rasConns[linkConn->connIdx].linkFlag) {
-        rasConnHandleNetTimeouts(linkConn->connIdx, now, nextWakeup);
-        // rasConns may have been reallocated by the above call, which is why we don't have a conn variable here.
-        // For the same reason we re-init linkConn.
-        linkConn = link->conns+i;
-        rasConns[linkConn->connIdx].linkFlag = true;
+  for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) {
+    if (linkConn->conn) {
+      if (!linkConn->conn->linkFlag) {
+        rasConnHandleNetTimeouts(linkConn->conn, now, nextWakeup);
+        linkConn->conn->linkFlag = true;
       }
-    } else if (i == 0 && link->lastUpdatePeersTime != 0) {
+    } else if (linkConn == link->conns && link->lastUpdatePeersTime != 0) {
       // This triggers when rasLinkReinitConns didn't create the primary connection because we have a higher address
       // than the peer. If that peer fails to initiate within RAS_CONNECT_WARN, we need to take action.
       if (now - link->lastUpdatePeersTime > RAS_CONNECT_WARN) {
         INFO(NCCL_RAS, "RAS peer connect timeout warning (%lds) on socket connection from %s",
              (now-link->lastUpdatePeersTime) / CLOCK_UNITS_PER_SEC,
              ncclSocketToString(&rasPeers[linkConn->peerIdx].addr, rasLine));
-        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->connIdx));
-        if (linkConn->connIdx != -1) {
-          rasConns[linkConn->connIdx].linkFlag = true;
+        NCCLCHECK(rasConnCreate(&rasPeers[linkConn->peerIdx].addr, &linkConn->conn));
+        if (linkConn->conn) {
+          linkConn->conn->linkFlag = true;
         }
-        // We used to connect to the first fallback but I think trying to connect to the calculated primary first
-        // in this case is more intuitive.
-        //(void)rasLinkTryFallback(link, -1);
         link->lastUpdatePeersTime = 0;
       } else {
         *nextWakeup = std::min(*nextWakeup, link->lastUpdatePeersTime+RAS_CONNECT_WARN);
       }
-    } // if (i == 0 && link->lastUpdatePeerTime != 0)
-  } // for (i)
+    } // if (linkConn == link->conns && link->lastUpdatePeersTime != 0)
+  } // for (linkConn)
   return ncclSuccess;
 }
 
 // Handles the sending of keep-alive messages and related timeouts for connections that are part of the RAS links.
-static void rasConnHandleNetTimeouts(int connIdx, int64_t now, int64_t* nextWakeup) { - struct rasConnection* conn = rasConns+connIdx; - if (conn->sockIdx != -1) { - struct rasSocket* sock = rasSockets+conn->sockIdx; - - if (sock->status == RAS_SOCK_READY) { +static void rasConnHandleNetTimeouts(struct rasConnection* conn, int64_t now, int64_t* nextWakeup) { + if (conn->sock) { + if (conn->sock->status == RAS_SOCK_READY) { // Send a regular keep-alive message if we haven't sent anything in a while and we don't have anything queued. if (ncclIntruQueueEmpty(&conn->sendQ)) { - if (now - sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) { + if (now - conn->sock->lastSendTime > RAS_KEEPALIVE_INTERVAL) { rasConnSendKeepAlive(conn); } else { - *nextWakeup = std::min(*nextWakeup, sock->lastSendTime+RAS_KEEPALIVE_INTERVAL); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastSendTime+RAS_KEEPALIVE_INTERVAL); } } // For short timeouts print a warning but also pessimistically immediately try to establish fallback connections. - if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) { + if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_WARN) { if (!conn->experiencingDelays) { INFO(NCCL_RAS, "RAS keep-alive timeout warning (%lds) on socket connection with %s", - (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); + (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); // At this point, it's mostly just a precaution; we will continue with the primary connection until // RAS_PEER_DEAD_TIMEOUT expires. conn->experiencingDelays = true; - (void)rasLinkAddFallback(&rasNextLink, connIdx); - (void)rasLinkAddFallback(&rasPrevLink, connIdx); - // rasConns and rasSockets may have been reallocated by the above calls. - conn = rasConns+connIdx; - sock = rasSockets+conn->sockIdx; - - // Stop collectives from waiting for a response over it. - rasCollsPurgeConn(connIdx); + (void)rasLinkAddFallback(&rasNextLink, conn); + (void)rasLinkAddFallback(&rasPrevLink, conn); + + // Stop ongoing collectives from waiting for a response over this connection. + rasCollsPurgeConn(conn); } } else { - *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_WARN); } // For long timeouts we need to act. - if (now - sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) { + if (now - conn->sock->lastRecvTime > RAS_KEEPALIVE_TIMEOUT_ERROR) { INFO(NCCL_RAS, "RAS keep-alive timeout error (%lds) on socket connection with %s", - (now-sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&sock->sock.addr, rasLine)); - rasSocketTerminate(sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR); + (now-conn->sock->lastRecvTime) / CLOCK_UNITS_PER_SEC, ncclSocketToString(&conn->addr, rasLine)); + rasSocketTerminate(conn->sock, /*finalize*/true, RAS_KEEPALIVE_TIMEOUT_ERROR); *nextWakeup = now; // Retry will be in the next iteration of the main loop so ensure we don't wait. } else { - *nextWakeup = std::min(*nextWakeup, sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR); + *nextWakeup = std::min(*nextWakeup, conn->sock->lastRecvTime+RAS_KEEPALIVE_TIMEOUT_ERROR); } - } // if (sock->status == RAS_SOCK_READY) - } // if (conn->sockIdx != -1) + } // if (conn->sock->status == RAS_SOCK_READY) + } // if (conn->sock) } // Sends a keep-alive message to a peer on the RAS network. 
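
The keep-alive machinery above works off three thresholds: after RAS_KEEPALIVE_INTERVAL of send-idle time a keep-alive is sent, after RAS_KEEPALIVE_TIMEOUT_WARN without received traffic fallbacks are prepared, and after RAS_KEEPALIVE_TIMEOUT_ERROR the socket is terminated. Collapsed into a single decision for brevity (the real handler evaluates each threshold independently and uses the actual RAS constants):

    #include <cstdint>

    enum class KeepAliveAction { None, SendKeepAlive, StartFallbacks, TerminateSocket };

    // Placeholder thresholds in nanoseconds; stand-ins for the RAS_KEEPALIVE_* constants.
    constexpr int64_t kIntervalNs = 1, kWarnNs = 2, kErrorNs = 3;

    static KeepAliveAction nextKeepAliveAction(int64_t now, int64_t lastSend, int64_t lastRecv,
                                               bool sendQueueEmpty) {
      if (now - lastRecv > kErrorNs) return KeepAliveAction::TerminateSocket;  // hard timeout
      if (now - lastRecv > kWarnNs)  return KeepAliveAction::StartFallbacks;   // warn + fallbacks
      if (sendQueueEmpty && now - lastSend > kIntervalNs) return KeepAliveAction::SendKeepAlive;
      return KeepAliveAction::None;
    }
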
@@ -768,17 +761,17 @@ static void rasConnSendKeepAlive(struct rasConnection* conn, bool nack) {
   struct rasMsg* msg = nullptr;
   int msgLen = rasMsgLength(RAS_MSG_KEEPALIVE);
   if (rasMsgAlloc(&msg, msgLen) == ncclSuccess) {
-    int linkIdx;
+    struct rasLinkConn* linkConn;
     msg->type = RAS_MSG_KEEPALIVE;
     msg->keepAlive.peersHash = rasPeersHash;
     msg->keepAlive.deadPeersHash = rasDeadPeersHash;
     msg->keepAlive.nack = (nack ? 1 : 0);
 
-    linkIdx = rasLinkFindConn(&rasNextLink, conn-rasConns);
-    if (linkIdx != -1 && !rasNextLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasNextLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 2; // Our rasNextLink should be the peer's rasPrevLink.
-    linkIdx = rasLinkFindConn(&rasPrevLink, conn-rasConns);
-    if (linkIdx != -1 && !rasPrevLink.conns[linkIdx].external)
+    linkConn = rasLinkConnFind(&rasPrevLink, conn);
+    if (linkConn && !linkConn->external)
       msg->keepAlive.linkMask |= 1; // Our rasPrevLink should be the peer's rasNextLink.
 
     (void)clock_gettime(CLOCK_REALTIME, &msg->keepAlive.realTime);
@@ -793,46 +786,51 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* s
   int64_t travelTime;
   int peerIdx;
 
-  assert(sock->connIdx != -1);
-  struct rasConnection* conn = rasConns+sock->connIdx;
+  assert(sock->conn);
   SYSCHECK(clock_gettime(CLOCK_REALTIME, &currentTime), "clock_gettime");
   travelTime = (currentTime.tv_sec-msg->keepAlive.realTime.tv_sec)*1000*1000*1000 +
                (currentTime.tv_nsec-msg->keepAlive.realTime.tv_nsec);
 
-  if (msg->keepAlive.peersHash != conn->lastRecvPeersHash) {
-    conn->lastRecvPeersHash = msg->keepAlive.peersHash;
+  if (msg->keepAlive.peersHash != sock->conn->lastRecvPeersHash) {
+    sock->conn->lastRecvPeersHash = msg->keepAlive.peersHash;
   }
-  if (msg->keepAlive.deadPeersHash != conn->lastRecvDeadPeersHash) {
-    conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
+  if (msg->keepAlive.deadPeersHash != sock->conn->lastRecvDeadPeersHash) {
+    sock->conn->lastRecvDeadPeersHash = msg->keepAlive.deadPeersHash;
   }
 
   // Make sure that the connection is part of the appropriate links forming the RAS network. In particular, this
   // will add any externally-requested connections to the appropriate links (or remove existing ones, if no longer
   // needed).
-  peerIdx = rasPeerFind(&conn->addr);
+  peerIdx = rasPeerFind(&sock->conn->addr);
   // Note: it's possible for peerIdx to be -1 at this point if, due to races, the keepAlive arrives before
   // the peers update.
-  (void)rasLinkUpdateConn(&rasNextLink, (msg->keepAlive.linkMask & 1) ? sock->connIdx : -1, peerIdx, /*external*/true);
-  (void)rasLinkUpdateConn(&rasPrevLink, (msg->keepAlive.linkMask & 2) ? sock->connIdx : -1, peerIdx, /*external*/true);
+  if (msg->keepAlive.linkMask & 1)
+    (void)rasLinkConnAddExternal(&rasNextLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasNextLink, sock->conn, /*external*/true);
+  if (msg->keepAlive.linkMask & 2)
+    (void)rasLinkConnAddExternal(&rasPrevLink, sock->conn, peerIdx);
+  else
+    rasLinkConnDrop(&rasPrevLink, sock->conn, /*external*/true);
 
   // If the keep-alive message is from a peer that doesn't actually need this connection (i.e., for that peer the
   // connection is just an external fallback), we should check if *we* still need it. It might be that we don't,
-  // and because we stopped sending the keep-alives, our peer doesn't know about it. rasLinkUpdateConn calls above
-  // will have wiped any external fallbacks, so anything that remains must be needed.
+ // and because we stopped sending the keep-alives, our peer doesn't know about it. The rasLinkConnDrop calls + // above will have wiped any external fallbacks, so anything that remains must be needed. if (!msg->keepAlive.nack && msg->keepAlive.linkMask == 0) { - if (rasLinkFindConn(&rasNextLink, sock->connIdx) == -1 && rasLinkFindConn(&rasPrevLink, sock->connIdx) == -1) { + if (rasLinkConnFind(&rasNextLink, sock->conn) == nullptr && rasLinkConnFind(&rasPrevLink, sock->conn) == nullptr) { // We don't need this connection either. Notify the peer about it. To avoid an infinite loop, we set the // special nack flag in the message to distinguish it from regular keep-alives. - rasConnSendKeepAlive(conn, /*nack*/true); + rasConnSendKeepAlive(sock->conn, /*nack*/true); } } - if (conn->travelTimeMin > travelTime) - conn->travelTimeMin = travelTime; - if (conn->travelTimeMax < travelTime) - conn->travelTimeMax = travelTime; - conn->travelTimeSum += travelTime; - conn->travelTimeCount++; + if (sock->conn->travelTimeMin > travelTime) + sock->conn->travelTimeMin = travelTime; + if (sock->conn->travelTimeMax < travelTime) + sock->conn->travelTimeMax = travelTime; + sock->conn->travelTimeSum += travelTime; + sock->conn->travelTimeCount++; if (msg->keepAlive.peersHash != rasPeersHash || msg->keepAlive.deadPeersHash != rasDeadPeersHash) { // This could happen due to a short-lived race condition between the peers propagation @@ -842,7 +840,7 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* s INFO(NCCL_RAS, "RAS keepAlive hash mismatch from %s (peersHash 0x%lx, deadPeersHash 0x%lx)", ncclSocketToString(&sock->sock.addr, rasLine), msg->keepAlive.peersHash, msg->keepAlive.deadPeersHash); INFO(NCCL_RAS, "RAS my peersHash 0x%lx, deadPeersHash 0x%lx", rasPeersHash, rasDeadPeersHash); - NCCLCHECK(rasConnSendPeersUpdate(conn, rasPeers, nRasPeers)); + NCCLCHECK(rasConnSendPeersUpdate(sock->conn, rasPeers, nRasPeers)); } return ncclSuccess; } @@ -857,100 +855,104 @@ ncclResult_t rasMsgHandleKeepAlive(const struct rasMsg* msg, struct rasSocket* s // External connections are generally ignored by this whole process: in particular, we don't add fallbacks for // timing out external connections. However, we will use an active external connection if it would be a better // option than whatever we can come up with. -static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) { - int peerIdx = -1; - int linkIdx = -1; +ncclResult_t rasLinkAddFallback(struct rasLink* link, const struct rasConnection* conn) { + struct rasLinkConn* foundLinkConn = nullptr; + struct rasLinkConn* firstExtLinkConn = nullptr; int firstExtLinkIdx = -1; - int newPeerIdx; + int newPeerIdx, i; // First check if the connection is part of this link. In the process also check if any of the link's connections // might be active -- if so, there's no need to initiate any more fallbacks and we can bail out. - for (int i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; - + i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) { if (linkConn->peerIdx == -1) { - // Such elements are always at the very end of the array and we can't use them so we can just as well break. + // Such elements are always at the end and we can't use them so we can just as well break. break; } // Check for any other connection that might be a viable fallback (basically, anything that is not experiencing // delays). 
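
The linkMask bits exchanged in the keep-alives above mirror the two links across the connection: the sender sets bit 2 when the connection is a non-external member of its rasNextLink and bit 1 for its rasPrevLink, and the receiver maps bit 1 to its rasNextLink and bit 2 to its rasPrevLink, adding or dropping external entries accordingly. The same mapping as a small sketch:

    #include <cstdint>

    constexpr uint32_t kSenderPrevLinkBit = 1;  // "our rasPrevLink should be the peer's rasNextLink"
    constexpr uint32_t kSenderNextLinkBit = 2;  // "our rasNextLink should be the peer's rasPrevLink"

    // Sender side: advertise which links this connection (non-externally) belongs to.
    static uint32_t encodeLinkMask(bool inNextLink, bool inPrevLink) {
      uint32_t mask = 0;
      if (inNextLink) mask |= kSenderNextLinkBit;
      if (inPrevLink) mask |= kSenderPrevLinkBit;
      return mask;
    }

    // Receiver side: a set bit keeps/adds the connection in the mirrored link,
    // a cleared bit drops it from that link if it was only there as an external entry.
    static void decodeLinkMask(uint32_t mask, bool* keepInNextLink, bool* keepInPrevLink) {
      *keepInNextLink = (mask & kSenderPrevLinkBit) != 0;
      *keepInPrevLink = (mask & kSenderNextLinkBit) != 0;
    }
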
- if (linkConn->connIdx != -1 && linkConn->connIdx != connIdx) { - struct rasConnection* conn = rasConns+linkConn->connIdx; - if (!conn->experiencingDelays) { - if (!linkConn->external) + if (linkConn->conn && linkConn->conn != conn) { + if (!linkConn->conn->experiencingDelays) { + if (!linkConn->external) { goto exit; // We don't need to do anything if there's a non-external connection. - else if (linkConn->peerIdx != -1) { + } else if (linkConn->peerIdx != -1) { // Record the location of the first potentially viable external connection in the chain; we may prefer it // over anything we can come up with. - if (firstExtLinkIdx == -1) + if (firstExtLinkConn == nullptr) { + firstExtLinkConn = linkConn; firstExtLinkIdx = i; - if (linkIdx != -1) + } + if (foundLinkConn) break; // Break out of the loop if we already have all the data we might need. } // linkConn->external && linkConn->peerIdx != -1 - } // if (!conn->experiencingDelays) - } // if (linkConn->connIdx != -1) + } // if (!linkConn->conn->experiencingDelays) + } // if (linkConn->conn && linkConn->conn != conn) - if (linkConn->connIdx == connIdx) { + if (linkConn->conn == conn) { if (linkConn->external) goto exit; // We don't add fallbacks for external connections... - peerIdx = linkConn->peerIdx; - linkIdx = i; + foundLinkConn = linkConn; // We are not breaking out of the loop here because we want to check for active connections on *all* potentially // viable elements (in particular, there could be some external ones beyond this one). } } - if (linkIdx == -1) + if (foundLinkConn == nullptr) goto exit; // We found an existing element so the connection is part of the link. No existing non-external connections of this // link are active, so a fallback is needed. - assert(peerIdx != -1); - newPeerIdx = rasLinkCalculatePeer(link, peerIdx, /*isFallback*/linkIdx > 0); + assert(foundLinkConn->peerIdx != -1); + newPeerIdx = rasLinkCalculatePeer(link, foundLinkConn->peerIdx, /*isFallback*/(foundLinkConn != link->conns)); // In principle we want to add (at most) one fallback. However, if the found fallback connection already exists // and is also experiencing delays, we need to keep iterating. while (newPeerIdx != -1) { - int newConnIdx = rasConnFind(&rasPeers[newPeerIdx].addr); + struct rasConnection* newConn = rasConnFind(&rasPeers[newPeerIdx].addr); + int linkIdx; + struct rasLinkConn* newLinkConn; // If we previously found a potential external fallback connection, check if it's better than what we just found. - if (firstExtLinkIdx != -1) { + if (firstExtLinkConn) { linkIdx = -1; // Calculate the index that the newly found fallback would have (pretend mode). - NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/true, - &linkIdx)); + NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/true, &linkIdx)); assert(linkIdx != -1); if (firstExtLinkIdx < linkIdx) { // The external connection *is* better -- use it as a fallback instead and be done. - link->conns[firstExtLinkIdx].external = false; + firstExtLinkConn->external = false; goto exit; } } - NCCLCHECK(rasLinkUpdateConn(link, newConnIdx, newPeerIdx, /*external*/false, /*insert*/true, /*pretend*/false, - &linkIdx)); - if (firstExtLinkIdx != -1 && linkIdx <= firstExtLinkIdx) - firstExtLinkIdx++; // Adjust if we inserted a new conn at a lower index. 
+ NCCLCHECK(rasLinkConnAdd(link, newConn, newPeerIdx, /*pretend*/false, &linkIdx, &newLinkConn)); + if (firstExtLinkConn && linkIdx <= firstExtLinkIdx) + firstExtLinkIdx++; // Adjust if we inserted a new entry ahead of this one. INFO(NCCL_RAS, "RAS link %d: %s fallback connection %d with %s", - link->direction, (newConnIdx == -1 ? "opening new" : "calculated existing"), + link->direction, (newConn == nullptr ? "opening new" : "calculated existing"), linkIdx, ncclSocketToString(&rasPeers[newPeerIdx].addr, rasLine)); // Note that we don't follow here our convention of "lower address is the one establishing connections" -- // that convention is for optimizing regular operations, but we don't want to take chances during fault // recovery. It may temporarily result in duplicate connections, but we have a mechanism to deal with those. - if (newConnIdx == -1) - NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &link->conns[linkIdx].connIdx)); + if (newConn == nullptr) { + NCCLCHECK(rasConnCreate(&rasPeers[newPeerIdx].addr, &newConn)); + newLinkConn->conn = newConn; + } - struct rasConnection* conn = rasConns+link->conns[linkIdx].connIdx; // If the fallback connection is also experiencing delays, we need to keep trying. - if (!conn->experiencingDelays) + if (!newConn->experiencingDelays) break; INFO(NCCL_RAS, "RAS connection experiencingDelays %d, startRetryTime %.2fs, socket status %d", - conn->experiencingDelays, (conn->startRetryTime ? (clockNano()-conn->startRetryTime)/1e9 : 0.0), - (conn->sockIdx == -1 ? -1 : rasSockets[conn->sockIdx].status)); + newConn->experiencingDelays, (newConn->startRetryTime ? (clockNano()-newConn->startRetryTime)/1e9 : 0.0), + (newConn->sock ? newConn->sock->status : -1)); newPeerIdx = rasLinkCalculatePeer(link, newPeerIdx, /*isFallback*/true); } - if (newPeerIdx == -1) - INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (nConns %d)", link->direction, link->nConns); + if (newPeerIdx == -1) { + int nConns = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next) + nConns++; + INFO(NCCL_RAS, "RAS link %d: no more fallbacks to add (total %d)", link->direction, nConns); + } exit: return ncclSuccess; } @@ -958,7 +960,7 @@ static ncclResult_t rasLinkAddFallback(struct rasLink* link, int connIdx) { // Invoked when we receive a message over a connection that was just activated or was experiencing delays. // Cleans up the fallbacks, timers, etc, as appropriate. static void rasConnResume(struct rasConnection* conn) { - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY) { + if (conn->sock && conn->sock->status == RAS_SOCK_READY) { INFO(NCCL_RAS, "RAS %s connection with %s (sendQ %sempty, experiencingDelays %d, startRetryTime %.2fs)", (conn->experiencingDelays && conn->startRetryTime == 0 ? "recovered" : "established"), ncclSocketToString(&conn->addr, rasLine), (ncclIntruQueueEmpty(&conn->sendQ) ? "" : "not "), @@ -972,218 +974,362 @@ static void rasConnResume(struct rasConnection* conn) { rasLinkSanitizeFallbacks(&rasPrevLink); if (!ncclIntruQueueEmpty(&conn->sendQ)) - rasPfds[rasSockets[conn->sockIdx].pfd].events |= POLLOUT; + rasPfds[conn->sock->pfd].events |= POLLOUT; } } // Checks if the primary connection is fully established and if so, purges the fallbacks (as they are no longer needed). 
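
rasLinkAddFallback above keeps walking candidate peers until it finds one whose connection is not itself experiencing delays, reusing an existing connection when there is one and creating a new one otherwise; an already-present external entry wins if it would rank ahead of the computed candidate. The core of that walk as an illustrative skeleton (the peer selection and connection lookup are stubbed out, not the real rasLinkCalculatePeer/rasConnFind):

    #include <vector>

    struct Conn { bool experiencingDelays = false; };        // stand-in for rasConnection

    static std::vector<Conn> conns(8);                       // pretend one connection per peer

    // Stand-in for rasLinkCalculatePeer: next candidate peer, -1 when none is left.
    static int calculateNextPeer(int peerIdx, bool /*isFallback*/) {
      return (peerIdx + 1 < (int)conns.size()) ? peerIdx + 1 : -1;
    }

    // Stand-in for rasConnFind/rasConnCreate: reuse or create the connection to a peer.
    static Conn* findOrCreateConn(int peerIdx) { return &conns[peerIdx]; }

    static void addFallback(int startPeerIdx, bool startIsFallback) {
      for (int peerIdx = calculateNextPeer(startPeerIdx, startIsFallback); peerIdx != -1;
           peerIdx = calculateNextPeer(peerIdx, /*isFallback*/true)) {
        Conn* conn = findOrCreateConn(peerIdx);
        if (!conn->experiencingDelays) break;   // usable fallback found; at most one is added per call
        // This candidate is struggling too; keep walking to the next peer.
      }
    }
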
static void rasLinkSanitizeFallbacks(struct rasLink* link) { - if (link->nConns > 0 && link->conns[0].connIdx != -1) { - struct rasConnection* conn = rasConns+link->conns[0].connIdx; - if (conn->sockIdx != -1 && rasSockets[conn->sockIdx].status == RAS_SOCK_READY && !conn->experiencingDelays) { + if (link->conns && link->conns->conn) { + struct rasConnection* conn = link->conns->conn; + if (conn->sock && conn->sock->status == RAS_SOCK_READY && !conn->experiencingDelays) { // We have a good primary. Simply drop all the fallbacks (the external ones will get recreated via the // keepAlive messages). - for (int i = 1; i < link->nConns; i++) { + int i = 1; + for (struct rasLinkConn* linkConn = link->conns->next; linkConn; i++) { + struct rasLinkConn* linkConnNext = linkConn->next; INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", - link->direction, (link->conns[i].external ? "external " : ""), i, - ncclSocketToString(&rasConns[link->conns[i].connIdx].addr, rasLine)); + link->direction, (linkConn->external ? "external " : ""), i, + ncclSocketToString(&linkConn->conn->addr, rasLine)); + free(linkConn); + linkConn = linkConnNext; } - link->nConns = 1; + link->conns->next = nullptr; link->lastUpdatePeersTime = 0; } } } -// Attempt to drop a connection from a link. -static void rasLinkDropConn(struct rasLink* link, int connIdx, int linkIdx) { - if (linkIdx == -1) - linkIdx = rasLinkFindConn(link, connIdx); - if (linkIdx != -1) { - if (linkIdx == 0) { - INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s", - link->direction, ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - } else { - INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", - link->direction, (link->conns[linkIdx].external ? "external " : ""), linkIdx, - ncclSocketToString(&rasConns[connIdx].addr, rasLine)); - } - memmove(link->conns+linkIdx, link->conns+linkIdx+1, (link->nConns-(linkIdx+1))*sizeof(*link->conns)); - if (link->nConns > 1) - link->nConns--; - else { - link->conns[0].peerIdx = link->conns[0].connIdx = -1; +// Adds an entry to a RAS network link (or updates one, if it already exists). +// conn can be nullptr if the connection doesn't exist (yet). +// peerIdx *cannot* be -1 when this function is invoked. +// If pretend is true, the function will not modify the list and will just set *pLinkIdx and *pLinkConn as appropriate. +// pLinkIdx and pLinkConn are (optional) pointers to the results; the index/address of the added/updated entry are +// stored there. +// insert (true by default) determines whether this is an "add" function (as implied by the name) or an "update" -- +// if set to false, it will refuse to add a new entry (but will update an existing one as needed). +// Note: there is some code duplication between this function and rasLinkConnAddExternal so changes to one of them +// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the +// logic was extremely difficult to follow then. +static ncclResult_t rasLinkConnAdd(struct rasLink* link, struct rasConnection* conn, int peerIdx, bool pretend, + int* pLinkIdx, struct rasLinkConn** pLinkConn, bool insert) { + struct rasLinkConn* oldLinkConn = nullptr; + struct rasLinkConn* linkConnPrev = nullptr; + int i, oldLinkIdx = -1; + + assert(peerIdx != -1); + if (conn) { + // Start by checking if we already have an element with this conn. 
+ oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx); + if (oldLinkConn) { + if (pLinkConn) + *pLinkConn = oldLinkConn; + if (oldLinkConn->peerIdx != -1) { + assert(oldLinkConn->peerIdx == peerIdx); + + if (!pretend) + oldLinkConn->external = false; // Ensure that external is cleared. + if (pLinkIdx) + *pLinkIdx = oldLinkIdx; + goto exit; // Nothing more to do if both conn and peerIdx are up to date. + } // if (oldLinkConn->peerIdx != -1) + + // Otherwise oldLinkConn->peerIdx == -1. The oldLinkConn is in a wrong place in the list -- we need to find + // the right spot. This can happen only for external connections. + } // if (oldLinkConn) + } // if (conn) + + // Search for the right spot in the conns list. + i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) { + if (linkConn->peerIdx == peerIdx) { + // The exact linkConn element already exists. + if (linkConn->conn) + assert(linkConn->conn == conn); + if (!pretend) { + if (linkConn->conn == nullptr) + linkConn->conn = conn; + linkConn->external = false; // Ensure that external is cleared. + if (linkConn == link->conns) { + // We received a connection from the remote peer that matches the primary connection we've been + // waiting for. + rasLinkSanitizeFallbacks(link); + } + } // if (!pretend) + if (pLinkIdx) + *pLinkIdx = i; + if (pLinkConn) + *pLinkConn = linkConn; + goto exit; + } // if (linkConn->peerIdx == peerIdx) + + // Ensure that the previous element is valid. + if (linkConnPrev == nullptr) + continue; + // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done. + if (linkConn->peerIdx == -1) + break; + // Detect a roll-over and handle it specially. + if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) { + if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 || + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; + } else { // Regular, monotonic case with the peerIdx value between two existing elements. + if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 && + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; } + } // for (linkConn) - if (linkIdx == 0) { - // First ensure that the conn becoming the primary is not marked as external (we don't want to lose it if - // the remote peer loses interest in it). - link->conns[0].external = false; - if (link->conns[0].connIdx != -1) { - INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary", - link->direction, ncclSocketToString(&rasConns[link->conns[0].connIdx].addr, rasLine)); - } - rasLinkSanitizeFallbacks(link); + // The new element should be inserted after linkConnPrev (which is at index i-1). + if (pLinkIdx) + *pLinkIdx = i; + if (pretend) + goto exit; + + if (oldLinkConn) { + if (i != oldLinkIdx) { + // We already have the entry, but we need to move it to a new spot (which must be earlier in the list). + assert(i < oldLinkIdx); + // Remove oldLinkConn from its old spot. + for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) { + if (linkConn->next == oldLinkConn) { + linkConn->next = oldLinkConn->next; + break; + } + } // for (linkConn) + // Insert it at its new spot. 
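
The search loop above keeps link->conns ordered by preference in the link's direction, with peer indices that may wrap around the ring and with unknown (-1) peers parked at the end; the direction-scaled differences plus the roll-over branch decide where a new peerIdx belongs. The placement test in isolation (direction is +1 or -1, as for the real links):

    // Returns true if 'candidate' belongs between 'prevPeer' and 'curPeer' in a list ordered
    // in 'direction' (+1 or -1), allowing for a single wrap-around point in the ring of peers.
    static bool belongsBetween(int direction, int prevPeer, int curPeer, int candidate) {
      if (direction * (prevPeer - curPeer) > 0) {
        // prev..cur spans the roll-over: the candidate fits if it lies past prev or before cur.
        return direction * (candidate - prevPeer) > 0 || direction * (candidate - curPeer) < 0;
      }
      // Monotonic case: the candidate fits strictly between the two neighbors.
      return direction * (candidate - prevPeer) > 0 && direction * (candidate - curPeer) < 0;
    }
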
+ oldLinkConn->next = linkConnPrev->next; + linkConnPrev->next = oldLinkConn; + } // if (i != oldLinkIdx) + oldLinkConn->peerIdx = peerIdx; + oldLinkConn->external = false; + } else if (insert) { + struct rasLinkConn* linkConn; + NCCLCHECK(ncclCalloc(&linkConn, 1)); + if (linkConnPrev) { + linkConn->next = linkConnPrev->next; + linkConnPrev->next = linkConn; + } else { + assert(link->conns == nullptr); // We never add an element that would replace an existing primary. + link->conns = linkConn; + // linkConn->next is already nullptr. } - } -} + linkConn->peerIdx = peerIdx; + linkConn->conn = conn; + linkConn->external = false; + if (pLinkConn) + *pLinkConn = linkConn; + } // oldLinkConn == nullptr && insert -// Checks if a given connection is a member of this link and if so, returns its entry index. -// Returns -1 if connection not found. -static int rasLinkFindConn(const struct rasLink* link, int connIdx) { - for (int i = 0; i < link->nConns; i++) { - if (link->conns[i].connIdx == connIdx) - return i; - } - return -1; +exit: + return ncclSuccess; } -// Note: the behavior of this function has become super-complex and so it should be considered for refactoring. -// Searches for and updates an entry in a RAS network link. The conns array is de-facto sorted by peerIdx: it is -// ordered by preference, though peerIdx values can wrap around (given the ring/torus topology) and they can also -// be -1 (the latter are stored at the end). -// external provides an updated value for the entry's external field. A false value, if requested, is always set; -// a true value, however, is only set if a new entry is added (external == true implies insert), i.e., if an entry -// already exists and the function is invoked with external == true, the new value will be ignored. -// If insert is set, it will, if necessary, insert a new entry if one is not already there. -// If pretend is set, it will not modify the array and will just set *pLinkIdx as appropriate. -// pLinkIdx is a pointer to an (optional) result where the index of the added/updated entry is stored. -// -1 can be passed as peerIdx if unknown (possible in case of race conditions, and only if external). -// -1 can be passed as connIdx if unknown or, if insert is *not* set, to indicate that the entry is to be removed -// (the entry's external must match the argument external for it to be removed). -ncclResult_t rasLinkUpdateConn(struct rasLink* link, int connIdx, int peerIdx, bool external, bool insert, - bool pretend, int* pLinkIdx) { +// Adds an external entry in a RAS network link (or updates one, if already exists). +// conn *cannot* be nullptr when this function is invoked. +// peerIdx can be -1 if unknown (possible in case of a race condition between keepAlive and peers update). +// Note: there is some code duplication between this function and rasLinkConnAdd so changes to one of them +// may need to be sync'ed to the other one as well. They used to be a single function that could do it all but the +// logic was extremely difficult to follow then. +static ncclResult_t rasLinkConnAddExternal(struct rasLink* link, struct rasConnection* conn, int peerIdx) { + struct rasLinkConn* oldLinkConn = nullptr; + struct rasLinkConn* linkConnPrev = nullptr; int i, oldLinkIdx = -1; - if (external && connIdx != -1) - insert = true; - - if (connIdx != -1) { - // Start by checking if we already have an element with this connIdx. 
- oldLinkIdx = rasLinkFindConn(link, connIdx); - if (oldLinkIdx != -1) { - struct rasLinkConn* linkConn = link->conns+oldLinkIdx; - if (linkConn->peerIdx != -1) - assert(linkConn->peerIdx == peerIdx); - - if (linkConn->peerIdx == peerIdx) { - if (!external && !pretend) - linkConn->external = false; // Ensure that external is cleared if so requested. - if (pLinkIdx) - *pLinkIdx = oldLinkIdx; - goto exit; // Nothing more to do if both connIdx and peerIdx are up to date. + assert(conn); + oldLinkConn = rasLinkConnFind(link, conn, &oldLinkIdx); + if (oldLinkConn) { + if (oldLinkConn->peerIdx != -1) + assert(oldLinkConn->peerIdx == peerIdx); + + if (oldLinkConn->peerIdx == peerIdx) + goto exit; // Nothing more to do if both conn and peerIdx are up to date. Note that we neither check nor + // update the value of external here. + + // Otherwise (oldLinkConn->peerIdx == -1 && peerIdx != -1) oldLinkConn, due to its -1 peerIdx, is in + // a wrong place in the array -- we need to find the right spot. oldLinkConn->peerIdx == -1 can only happen for + // external connections. + } // if (oldLinkConn) + + // Search for the right spot in the conns list. + i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) { + if (peerIdx == -1) { + // We simply want to find the end of the list so that we can insert a new entry with -1 peerIdx there. + continue; + } + if (linkConn->peerIdx == peerIdx) { + // The exact linkConn element already exists. + if (linkConn->conn) + assert(linkConn->conn == conn); + if (linkConn->conn == nullptr) + linkConn->conn = conn; + if (linkConn == link->conns) { + // We received a connection from the remote peer that matches the primary connection we've been + // waiting for. This shouldn't trigger for external connections (rasLinkConnUpdate should be invoked first, + // which will update the entry's conn, so rasLinkConnFind invoked at the top of this function should succeed), + // but better safe than sorry... + rasLinkSanitizeFallbacks(link); } + goto exit; + } // if (linkConn->peerIdx == peerIdx) - // Otherwise (linkConn->peerIdx == -1 && peerIdx != -1) we have a conn that, due to -1 peerIdx, is in a wrong - // place in the array -- we need to find the right spot. linkConn->peerIdx == -1 can only happen for external - // connections. - assert(external); + // Ensure that the previous element is valid. + if (linkConnPrev == nullptr) + continue; + // linkConns with peerIdx == -1 are stored at the end, so if we reach one of them, we are done. + if (linkConn->peerIdx == -1) + break; + // Detect a roll-over and handle it specially. + if (link->direction * (linkConnPrev->peerIdx - linkConn->peerIdx) > 0) { + if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 || + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; + } else { // Regular, monotonic case with the peerIdx value between two existing elements. + if (link->direction * (peerIdx - linkConnPrev->peerIdx) > 0 && + link->direction * (peerIdx - linkConn->peerIdx) < 0) + break; } - } - - if (peerIdx != -1) { - // Search for the right spot in the conns array. - for (i = 0; i < link->nConns; i++) { - struct rasLinkConn* linkConn = link->conns+i; - if (peerIdx != -1 && linkConn->peerIdx == peerIdx) { - // The exact conn element already exists. - if (connIdx == -1 && !insert) { - // Drop the connection from the link. 
- if (linkConn->external == external) { - if (!pretend) - rasLinkDropConn(link, linkConn->connIdx, i); - else if (pLinkIdx) - *pLinkIdx = i; - } - } else { // connIdx != -1 || insert - if (!pretend) { - if (linkConn->connIdx != -1) - assert(linkConn->connIdx == connIdx); - else - linkConn->connIdx = connIdx; - if (!external) - linkConn->external = false; // Ensure that external is cleared if so requested. - if (i == 0) { - // We received a connection from the remote peer that matches the primary connection we've been - // waiting for. - rasLinkSanitizeFallbacks(link); - } - } // if (!pretend) - if (pLinkIdx) - *pLinkIdx = i; - } // connIdx != -1 || insert + } // for (linkConn) - goto exit; - } // if (peerIdx != -1 && linkConn->peerIdx == peerIdx) - if (!insert) - continue; - // Ensure that the i-1 index is also valid. - if (i == 0) - continue; - // Conns with peerIdx == -1 are stored at the end, so anything else needs to go before them. - if (peerIdx != -1 && linkConn->peerIdx == -1) - break; - // Detect a roll-over and handle it specially. - if (link->direction * (link->conns[i-1].peerIdx - linkConn->peerIdx) > 0) { - if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 || - link->direction * (peerIdx - linkConn->peerIdx) < 0) - break; - } else { // Regular, monotonic case with the peerIdx value between two existing elements. - if (link->direction * (peerIdx - link->conns[i-1].peerIdx) > 0 && - link->direction * (peerIdx - linkConn->peerIdx) < 0) + // The new element should be inserted after linkConnPrev (which is at index i-1). + if (oldLinkConn) { + if (i != oldLinkIdx) { + // We already have the entry, but we need to move it to a new spot (which must be earlier in the list). + assert(i < oldLinkIdx); + INFO(NCCL_RAS, "RAS link %d: moving %sfallback connection with %s from %d to %d", link->direction, + (oldLinkConn->external ? "external " : ""), ncclSocketToString(&conn->addr, rasLine), oldLinkIdx, i); + // Remove oldLinkConn from its old spot. + for (struct rasLinkConn* linkConn = linkConnPrev; linkConn->next; linkConn = linkConn->next) { + if (linkConn->next == oldLinkConn) { + linkConn->next = oldLinkConn->next; break; - } - } // for (i) - } else { - // If peerIdx == -1, insert the new element at the very end. This can only happen for external connections. - assert(external && oldLinkIdx == -1); - i = link->nConns; - } - if (!insert) - goto exit; - - // i holds the index at which to insert a new element. - if (pretend) { - if (pLinkIdx) - *pLinkIdx = i; - goto exit; - } - - if (oldLinkIdx == -1) { + } + } // for (linkConn) + // Insert it at its new spot. 
+ oldLinkConn->next = linkConnPrev->next; + linkConnPrev->next = oldLinkConn; + } // if (i != oldLinkIdx) + oldLinkConn->peerIdx = peerIdx; + oldLinkConn->external = false; + } else { // oldLinkConn == nullptr struct rasLinkConn* linkConn; - if (link->nConns == link->connsSize) { - NCCLCHECK(ncclRealloc(&link->conns, link->connsSize, link->connsSize+RAS_INCREMENT)); - link->connsSize += RAS_INCREMENT; + NCCLCHECK(ncclCalloc(&linkConn, 1)); + if (linkConnPrev) { + INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i, + ncclSocketToString(&conn->addr, rasLine)); + linkConn->next = linkConnPrev->next; + linkConnPrev->next = linkConn; + linkConn->external = true; + } else { + INFO(NCCL_RAS, "RAS link %d: adding external fallback with %s as a new primary connection", link->direction, + ncclSocketToString(&conn->addr, rasLine)); + linkConn->next = link->conns; + link->conns = linkConn; + linkConn->external = false; // Primary connections are never external. } - linkConn = link->conns+i; - // Shift existing conns with indices >= i to make room for the new one. - memmove(linkConn+1, linkConn, (link->nConns-i)*sizeof(*link->conns)); linkConn->peerIdx = peerIdx; - linkConn->connIdx = connIdx; - linkConn->external = external; - if (external) { - INFO(NCCL_RAS, "RAS link %d: adding external fallback connection %d with %s", link->direction, i, - ncclSocketToString((connIdx != -1 ? &rasConns[connIdx].addr : &rasPeers[peerIdx].addr), rasLine)); + linkConn->conn = conn; + } // oldLinkConn == nullptr + +exit: + return ncclSuccess; +} + +// Updates an existing entry in a RAS network link, if any. +// Basically an easy-to-use variant of rasLinkConnAdd. +// For this function, conn cannot be a nullptr and peerIdx cannot be -1. +ncclResult_t rasLinkConnUpdate(struct rasLink* link, struct rasConnection* conn, int peerIdx) { + assert(conn && peerIdx != -1); + + NCCLCHECK(rasLinkConnAdd(link, conn, peerIdx, /*pretend*/false, /*pLinkIdx*/nullptr, /*pLinkConn*/nullptr, + /*insert*/false)); + return ncclSuccess; +} + +// Attempts to drop a connection from a link. +// If the optional external argument is true, it will drop a connection only if its external flag is set +// (otherwise the flag is ignored and a connection is always dropped if found). +static void rasLinkConnDrop(struct rasLink* link, const struct rasConnection* conn, bool external) { + struct rasLinkConn* linkConnPrev = nullptr; + int i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConnPrev = linkConn, linkConn = linkConn->next, i++) { + if (linkConn->conn == conn && (!external || linkConn->external)) { + if (linkConnPrev) { + INFO(NCCL_RAS, "RAS link %d: dropping %sfallback connection %d with %s", + link->direction, (linkConn->external ? "external " : ""), i, + ncclSocketToString(&conn->addr, rasLine)); + linkConnPrev->next = linkConn->next; + free(linkConn); + } else { // linkConnPrev == nullptr + INFO(NCCL_RAS, "RAS link %d: dropping primary connection with %s", + link->direction, ncclSocketToString(&conn->addr, rasLine)); + if (linkConn->next) { + link->conns = linkConn->next; + // Ensure that the conn becoming the primary is not marked as external (we don't want to lose it if + // the remote peer loses interest in it). 
+ link->conns->external = false; + if (link->conns->conn) + INFO(NCCL_RAS, "RAS link %d: former fallback connection 1 with %s is the new primary", + link->direction, ncclSocketToString(&link->conns->conn->addr, rasLine)); + rasLinkSanitizeFallbacks(link); + free(linkConn); + } else { // linkConn->next == nullptr + // We prefer the primary entry to always be present, even if empty. + linkConn->peerIdx = -1; + linkConn->conn = nullptr; + } // linkConn->next == nullptr + } // linkConnPrev == nullptr + break; + } // if (linkConn->conn == conn) + } // for (linkConn) +} + +// Checks if a given connection is a member of this link and if so, returns its link entry. +// Optionally returns the position of the connection in the conns list. +// Returns nullptr if connection not found. +static struct rasLinkConn* rasLinkConnFind(const struct rasLink* link, const struct rasConnection* conn, + int* pLinkIdx) { + int i = 0; + for (struct rasLinkConn* linkConn = link->conns; linkConn; linkConn = linkConn->next, i++) { + if (linkConn->conn == conn) { + if (pLinkIdx) + *pLinkIdx = i; + return linkConn; } - link->nConns++; } - else { // oldLinkIdx > -1 - // We already have the conn, we just need to move it to a new spot. - struct rasLinkConn* linkConn = link->conns+i; - assert(i <= oldLinkIdx); // We can only get here if linkConn->peerIdx == -1 && peerIdx != -1. - if (i != oldLinkIdx) { - struct rasLinkConn tmp; - struct rasLinkConn* linkConnNext = link->conns+i+1; // Just to silence the compiler. - // Move the existing conn from index oldLinkIdx to a (lower) index i, shifting the existing conns - // with indices in the range [i, oldLinkIdx). - memcpy(&tmp, link->conns+oldLinkIdx, sizeof(tmp)); - memmove(linkConnNext, linkConn, (oldLinkIdx-i)*sizeof(*linkConn)); - memcpy(linkConn, &tmp, sizeof(*linkConn)); - } - if (!external) - linkConn->external = false; // Ensure that external is cleared if so requested. - } // oldLinkIdx > -1 if (pLinkIdx) - *pLinkIdx = i; -exit: - return ncclSuccess; + *pLinkIdx = -1; + return nullptr; +} + +// Invoked during RAS termination to release all the allocated resources. +void rasNetTerminate() { + for (struct rasLinkConn* linkConn = rasNextLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + for (struct rasLinkConn* linkConn = rasPrevLink.conns; linkConn;) { + struct rasLinkConn* linkConnNext = linkConn->next; + free(linkConn); + linkConn = linkConnNext; + } + rasNextLink.conns = rasPrevLink.conns = nullptr; + rasNextLink.lastUpdatePeersTime = rasPrevLink.lastUpdatePeersTime = 0; + + for (struct rasConnection* conn = rasConnsHead; conn;) { + struct rasConnection* connNext = conn->next; + rasConnTerminate(conn); + conn = connNext; + } + // rasConnsHead and rasConnsTail are taken care of by rasConnTerminate(). + + for (struct rasSocket* sock = rasSocketsHead; sock;) { + struct rasSocket* sockNext = sock->next; + rasSocketTerminate(sock); + sock = sockNext; + } + // rasSocketsHead and rasSocketsTail are taken care of by rasSocketTerminate(). 
}
diff --git a/src/register/register.cc b/src/register/register.cc
index 9e8f6eaaf..930367a97 100644
--- a/src/register/register.cc
+++ b/src/register/register.cc
@@ -92,8 +92,8 @@ static ncclResult_t regCleanup(struct ncclComm* comm, struct ncclReg* reg) {
 }
 }
 if (reg->state & NVLS_REG_COMPLETE) {
- if (ncclNvlsDeregBuffer(comm, &reg->mcHandle, reg->regAddr, reg->dev, reg->regSize) != ncclSuccess) {
- WARN("rank %d deregister NVLS buffer %p dev %d size %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regSize);
+ if (ncclNvlsDeregBuffer(comm, &reg->mcHandle, reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize) != ncclSuccess) {
+ WARN("rank %d deregister NVLS buffer %p dev %d ucsize %ld mcsize %ld failed", comm->rank, (void*)reg->regAddr, reg->dev, reg->regUCSize, reg->regMCSize);
 }
 reg->regAddr = (CUdeviceptr)NULL;
 }
diff --git a/src/transport.cc b/src/transport.cc
index 5629ce7a2..f98b77a43 100644
--- a/src/transport.cc
+++ b/src/transport.cc
@@ -11,11 +11,12 @@
 #include "timer.h"
 #include "transport.h"
-struct ncclTransport* ncclTransports[NTRANSPORTS] = {
+struct ncclTransport* ncclTransports[NTRANSPORTS+1] = {
 &p2pTransport,
 &shmTransport,
 &netTransport,
- &collNetTransport
+ &collNetTransport,
+ &profilerTransport // Not really used for transport, only to create proxy ops polling on profiler counters.
 };
 template <int type>
@@ -111,12 +112,14 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
 gettimeofday(&timeStart, NULL);
 timeLast = timeStart; // struct copy
 bool timeReported = false;
+ cudaStream_t hostStream, deviceStream;
 NCCLCHECK(ncclCalloc(&data, maxPeers));
 NCCLCHECKGOTO(ncclCalloc(&recvData, maxPeers), ret, fail);
 NCCLCHECKGOTO(ncclCalloc(&sendData, maxPeers), ret, fail);
- NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
+ NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail);
+ NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail);
 // First time initialization
 for (int i=1; i<comm->nRanks; i++) {
 int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0);
@@ -195,7 +198,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
 if (ret == ncclSuccess) {
 conn->connected = 1;
 /* comm->channels[c].devPeers[sendPeer]->send[connIndex] is a device memory access. */
- CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail);
+ CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[sendPeer]->send[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail);
 } else if (ret == ncclInProgress) {
 allChannelsConnected = false;
 }
@@ -214,7 +217,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
 if (ret == ncclSuccess) {
 conn->connected = 1;
 /* comm->channels[c].devPeers[recvPeer]->recv[connIndex] is a device memory access.
*/ - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), ret, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[recvPeer]->recv[connIndex], &conn->conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), ret, fail); } else if (ret == ncclInProgress) { allChannelsConnected = false; } @@ -286,8 +289,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* if (sendData) free(sendData); if (recvData) free(recvData); - NCCLCHECK(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream)); - NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream)); + NCCLCHECK(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false)); + NCCLCHECK(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false)); return ret; fail: goto exit; diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 67180123f..c1ccfcaa8 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -103,7 +103,7 @@ struct sendResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; uint64_t* gdcSync; void* gdrDesc; @@ -124,7 +124,7 @@ struct recvResources { int rank; int nranks; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; uint64_t* gdcSync; @@ -143,9 +143,19 @@ static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoG return ncclSuccess; } +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + struct setupReq { int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; struct ncclCollNetSharedRes* collNet; }; @@ -168,8 +178,8 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? "/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [send] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -192,8 +202,8 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.collNet = comm->collNetSharedRes; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), &info->collNetHandle, sizeof(collNetHandle_t))); - INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, - req.useGdr ? 
"/GDRDMA" : ""); + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d/%1d : %d [receive] via COLLNET/%s/%d%s%s", channelId, connIndex, myInfo->rank, collNetName(comm), req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : ""); return ncclSuccess; } @@ -454,6 +464,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc } static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("sendProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; static_assert(sizeof(collNetSendConnectInfo) <= sizeof(struct ncclConnect), "Collnet Send Connect info is too big"); @@ -505,16 +516,17 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->sendMhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -525,10 +537,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + ncclResult_t ret = ncclSuccess; if (reqSize != sizeof(struct collNetConnectArgs)) { WARN("recvProxyConnect: reqSize is %d != %ld", reqSize, sizeof(struct collNetConnectArgs)); return ncclInternalError; } struct collNetConnectArgs* args = (struct collNetConnectArgs*)reqBuff; @@ -574,16 +594,17 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str int bank = resources->useGdr ? 
NCCL_NET_MAP_SHARED_DEVMEM : NCCL_NET_MAP_SHARED_HOSTMEM; struct connectMapMem* mapMem = map->mems+bank; NCCLCHECK(sharedBuffersInit(connection->collNet, resources->useGdr, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size)); - NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]); + int dmabuf_fd = -1; #if CUDA_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0)); - NCCLCHECK(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, - NCCL_PTR_CUDA, 0ULL, dmabuf_fd, - &resources->mhandles[NCCL_PROTO_SIMPLE])); + CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)mapMem->cpuPtr, mapMem->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr))); + NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, mapMem->cpuPtr, mapMem->size, + NCCL_PTR_CUDA, 0ULL, dmabuf_fd, + &resources->mhandles[NCCL_PROTO_SIMPLE]), + ret, fail); (void)close(dmabuf_fd); } else // FALL-THROUGH to nv_peermem GDR path #endif @@ -600,7 +621,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str if (respSize != sizeof(struct connectMap*)) { WARN("recvProxyConnect: respSize is %d != %ld", respSize, sizeof(void*)); return ncclInternalError; } *((struct connectMap**)respBuff) = &resources->map; - return ncclSuccess; + +exit: + return ret; +fail: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + } + goto exit; } static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState) { @@ -737,7 +765,7 @@ static ncclResult_t collNetIallreduce(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t recvBeg, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -779,7 +807,7 @@ static ncclResult_t collNetRegIallgather(struct ncclProxyState* proxyState, stru } static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t recvParts; + ncclNetSGE_t recvParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); recvParts.mhandle = recvMhandle; @@ -796,7 +824,7 @@ static ncclResult_t collNetIallgather(struct ncclProxyState* proxyState, struct } static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytesIn, ssize_t allBeg, ssize_t sendBeg, void *sendMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = 
args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); ssize_t nBytes; @@ -835,7 +863,7 @@ static ncclResult_t collNetRegIreducescatter(struct ncclProxyState* proxyState, } static ncclResult_t collNetIreducescatter(struct ncclProxyState* proxyState, struct sendResources *resources, struct ncclProxyArgs *args, struct ncclProxySubArgs *sub, ssize_t nBytes, ssize_t allBeg, ssize_t sendBeg, ssize_t recvBeg, void *sendMhandle, void *recvMhandle, void **request) { - ncclNetSGE_v9_t sendParts; + ncclNetSGE_t sendParts; ssize_t sizePerRank = args->specifics.collnetDirect.sizePerRank; char *region = NCCL_NET_MAP_GET_POINTER(&resources->map, gpu, buffs[NCCL_PROTO_SIMPLE]); sendParts.mhandle = sendMhandle; @@ -1150,6 +1178,7 @@ struct collnetRegInfo { static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* userbuff, size_t buffSize, int type, struct ncclReg* regRecord, int* outRegBufFlag, void** outHandle) { ncclResult_t ret = ncclSuccess; + int gdrEnable = -1; if (regRecord) { if (regRecord->state & COLLNET_REG_COMPLETE) { // reuse previous registration @@ -1165,6 +1194,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use if (conn->flags & NCCL_DIRECT_NIC) { struct ncclProxyConnector* proxyconn = (type == collNetRecv) ? &comm->channels[0].peers[comm->nRanks]->recv[type].proxyConn : &comm->channels[0].peers[comm->nRanks]->send[type].proxyConn; + gdrEnable = 1; NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyconn, ncclProxyMsgRegister, &info, sizeof(struct collnetRegInfo), &handle, sizeof(void*)), ret, fail); if (handle) { regRecord->state |= COLLNET_REG_COMPLETE; @@ -1174,7 +1204,8 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use INFO(NCCL_REG, "rank %d - COLLNET register userbuff %p (handle %p), buffSize %ld, type %s", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); } } else { - WARN("rank %d - COLLNET failed to register userbuff %p (handle %p), buffSize %ld, type %s, GDR is not enabled", comm->rank, userbuff, handle, buffSize, type == collNetRecv ? "Recv" : "Send"); + gdrEnable = 0; + goto fail; } } } @@ -1183,6 +1214,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use fail: *outRegBufFlag = 0; *outHandle = NULL; + INFO(NCCL_REG, "rank %d - COLLNET failed to register userbuff %p, buffSize %ld, type %s, GDR %d", comm->rank, userbuff, buffSize, type == collNetRecv ? 
"Recv" : "Send", gdrEnable); goto exit; } @@ -1268,17 +1300,20 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1301,17 +1336,20 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s assert(reqSize == sizeof(struct collnetRegInfo)); assert(respSize == sizeof(void*)); + int dmabuf_fd = -1; #if CUDART_VERSION >= 11070 /* DMA-BUF support */ if (resources->useGdr && resources->useDmaBuf) { - int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclCollNet->regMrDmaBuf(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); - (void)close(dmabuf_fd); needReg = false; } #endif peermem: + if (dmabuf_fd != -1) { + (void)close(dmabuf_fd); + dmabuf_fd = -1; + } if (needReg) { NCCLCHECKGOTO(proxyState->ncclCollNet->regMr(resources->collNetComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, &handle), ret, fail); } @@ -1600,4 +1638,4 @@ struct ncclTransport collNetTransport = { canConnect, { sendSetup, sendConnect, sendFree, NULL, sendProxySetup, sendProxyConnect, sendProxyFree, sendProxyProgress, sendProxyRegBuffer, sendProxyDeregBuffer }, { recvSetup, recvConnect, recvFree, NULL, recvProxySetup, recvProxyConnect, recvProxyFree, recvProxyProgress, recvProxyRegBuffer, recvProxyDeregBuffer } -}; \ No newline at end of file +}; diff --git a/src/transport/net.cc b/src/transport/net.cc index 8760b4258..40d334fa7 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -92,7 +92,7 @@ struct sendNetResources { int tpLocalRank; int tpRemoteRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int maxRecvs; uint64_t* gdcSync; @@ -123,7 +123,7 @@ struct recvNetResources { int tpRemoteRank; int tpRemoteProxyRank; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int useDmaBuf; int needFlush; int maxRecvs; @@ -168,7 +168,7 @@ struct setupReq { int tpRemoteRank; int shared; int netDev; - int useGdr; + enum ncclTopoGdrMode useGdr; int needFlush; int channelId; int connIndex; @@ -180,6 +180,16 @@ static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large // 
Forward declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); +// Returns the flags to be used by a call to cuMemGetHandleForAddressRange. +static inline int getHandleForAddressRangeFlags(ncclTopoGdrMode useGdr) { + int flags = 0; +#if CUDA_VERSION >= 12080 + // Force mapping on PCIe on systems with both PCI and C2C attachments. + if (useGdr == ncclTopoGdrModePci) flags = CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE; +#endif + return flags; +} + /* Determine if we will use this transport for this peer and return connect * information for this peer */ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { @@ -204,11 +214,14 @@ static ncclResult_t sendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), NULL, 0)); if (proxyRank == myInfo->rank) { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } else { - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, - proxyRank, req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [send] via NET/%s/%d(%d)%s%s%s", channelId, connIndex, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev, comm->ncclNet->name, req.netDev, + proxyRank, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? "/Shared" : ""); } *((int*)connectInfo) = comm->topParentRanks[proxyRank]; memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); @@ -247,18 +260,19 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph req.tpRemoteRank = comm->topParentRanks[peerInfo->rank]; NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, &req, sizeof(req), connectInfo, sizeof(ncclNetHandle_t))); memcpy((uint8_t*)connectInfo + sizeof(ncclNetHandle_t), &req.useGdr, sizeof(int)); - INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, - req.useGdr ? "/GDRDMA" : "", req.shared ? "/Shared" : ""); + INFO(NCCL_INIT|NCCL_NET,"Channel %02d/%d : %d[%d] -> %d[%d] [receive] via NET/%s/%d%s%s%s", channelId, connIndex, peerInfo->rank, peerInfo->nvmlDev, myInfo->rank, myInfo->nvmlDev, comm->ncclNet->name, req.netDev, + req.useGdr ? "/GDRDMA" : "", req.useGdr==ncclTopoGdrModePci ? "(PCI)" : "", + req.shared ? 
"/Shared" : ""); return ncclSuccess; } -static ncclResult_t netMapShm(struct ncclComm *comm, struct connectMapMem* mem) { - NCCLCHECK(ncclShmImportShareableBuffer(comm, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); +static ncclResult_t netMapShm(struct ncclComm *comm, struct ncclProxyConnector* proxyConn, struct connectMapMem* mem) { + NCCLCHECK(ncclShmImportShareableBuffer(comm, proxyConn->rank, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr, &mem->attachDesc)); return ncclSuccess; } static ncclResult_t netCreateShm(struct ncclProxyState* proxyState, struct connectMapMem* mem) { - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); + NCCLCHECK(ncclShmAllocateShareableBuffer(mem->size, false, &mem->createDesc, (void**)&mem->cpuPtr, (void**)&mem->gpuPtr)); return ncclSuccess; } @@ -292,6 +306,7 @@ static ncclResult_t netDumpMap(struct connectMap* map) { struct netSendConnectArgs { ncclNetHandle_t handle; + int trafficClass; }; struct netRecvConnectArgs { @@ -315,6 +330,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); netSendConnectArgs args = {0}; memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); + args.trafficClass = comm->config.trafficClass; NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); } else { opId = send; @@ -343,7 +359,7 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne } } } else if (!(map->sameProcess && map->cudaDev == comm->cudaDev)) { - if (!map->sameProcess) NCCLCHECK(netMapShm(comm, map->mems + NCCL_NET_MAP_HOSTMEM)); + if (!map->sameProcess) NCCLCHECK(netMapShm(comm, &send->proxyConn, map->mems + NCCL_NET_MAP_HOSTMEM)); if (map->mems[NCCL_NET_MAP_DEVMEM].size) { map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr = NULL; NCCLCHECK(ncclP2pImportShareableBuffer(comm, send->proxyConn.rank, @@ -692,9 +708,11 @@ static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); + ncclNetCommConfig_t commConfig = {0}; if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError; ncclResult_t ret = ncclSuccess; netSendConnectArgs* req = (netSendConnectArgs*) reqBuff; + commConfig.trafficClass = req->trafficClass == NCCL_CONFIG_UNDEF_INT ? 
NCCL_NET_TRAFFIC_CLASS_UNDEF : req->trafficClass;
 NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle));
 if (resources->shared) {
 // Shared buffers
@@ -714,15 +732,15 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks));
 }
 struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank;
- if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle);
+ if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle);
 resources->netSendComm = comms->sendComm[resources->channelId];
 if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++;
 } else {
- ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
+ ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
 }
 } else {
 // Connect to remote peer
- ret = proxyState->ncclNet->connect(resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
+ ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle);
 connection->proxyAppendPtr = &connection->proxyAppend;
 }
@@ -748,7 +766,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
 for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr, proxyState->buffSizes[p], buffs[p]);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, p!= NCCL_PROTO_LL && resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]);
 resources->buffSizes[p] = proxyState->buffSizes[p];
 }
 } else {
@@ -765,7 +783,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
 }
- NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
 }
 NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
@@ -820,7 +838,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
 int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
 if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
 int dmabuf_fd;
- CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+ CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)));
 NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
 (void)close(dmabuf_fd);
 } else // FALL-THROUGH to nv_peermem GDR path
@@ -904,7 +922,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
 if (resources->shared == 0) { // Only allocate dedicated buffers for ring/tree, not for p2p
 for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
- NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr, proxyState->buffSizes[p], buffs[p]);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, resources->useGdr ? 1 : 0, proxyState->buffSizes[p], buffs[p]);
 resources->buffSizes[p] = proxyState->buffSizes[p];
 }
 } else {
@@ -915,14 +933,14 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
 proxyState, resources->useGdr, resources->tpLocalRank, 1, 1, proxyState->p2pnChannels, &mapMem->gpuPtr, &mapMem->cpuPtr, &mapMem->size, NULL));
 resources->buffSizes[NCCL_PROTO_SIMPLE] = mapMem->size;
- NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
+ NCCL_NET_MAP_ADD_POINTER(map, 1, resources->useGdr ? 1 : 0, mapMem->size, buffs[NCCL_PROTO_SIMPLE]);
 }
 NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclSendMem), sendMem);
 NCCL_NET_MAP_ADD_POINTER(map, 0, 0, sizeof(struct ncclRecvMem), recvMem);
 if (proxyState->allocP2pNetLLBuffers) {
- NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*resources->useGdr*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
+ NCCL_NET_MAP_ADD_POINTER(map, 0, 0 /*devMem*/, proxyState->buffSizes[NCCL_PROTO_LL], buffs[NCCL_PROTO_LL]);
 resources->buffSizes[NCCL_PROTO_LL] = proxyState->buffSizes[NCCL_PROTO_LL];
 }
@@ -964,7 +982,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
 int type = NCCL_NET_MAP_DEV_MEM(map, buffs[p]) ? NCCL_PTR_CUDA : NCCL_PTR_HOST;
 if (type == NCCL_PTR_CUDA && resources->useDmaBuf) {
 int dmabuf_fd;
- CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+ CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)resources->buffers[p], resources->buffSizes[p], CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)));
 NCCLCHECK(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, resources->buffers[p], resources->buffSizes[p], type, 0ULL, dmabuf_fd, &resources->mhandles[p]));
 (void)close(dmabuf_fd);
 } else // FALL-THROUGH to nv_peermem GDR path
@@ -1175,11 +1193,12 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct
 // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense,
 // since size is a plain integer.
// coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub->requests+buffSlot)); + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); sub->transSize += size; sub->transmitted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait); args->idle = 0; @@ -1280,6 +1299,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct size_t sizes[NCCL_PROXY_MAX_SUBS]; int tags[NCCL_PROXY_MAX_SUBS]; void* mhandles[NCCL_PROXY_MAX_SUBS]; + void* phandles[NCCL_PROXY_MAX_SUBS]; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; int postedStepId = sub->posted; @@ -1323,6 +1343,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->recvMhandle; + phandles[subCount] = sub; subCount++; } } @@ -1332,7 +1353,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct void** requestPtr = subGroup->requests+(step%NCCL_STEPS); bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; - NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, requestPtr)); + NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr)); if (*requestPtr) { subGroup->recvRequestsCache[step%NCCL_STEPS] = *requestPtr; subGroup->recvRequestsSubCount = subCount; @@ -1341,6 +1362,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int postedStepId = sub->posted; TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]); sub->posted += args->sliceSteps; + sub->profilerSteps++; ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait); } @@ -1558,7 +1580,7 @@ static ncclResult_t netRegisterBuffer(ncclComm* comm, const void* userbuff, size return ret; fail: *outRegBufFlag = 0; - WARN("rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); + INFO(NCCL_REG, "rank %d failed to NET register userbuff %p buffSize %ld GDR flag %d", comm->rank, userbuff, buffSize, gdrFlag); goto exit; } @@ -1639,7 +1661,7 @@ static ncclResult_t sendProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; 
- CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netSendComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; @@ -1673,7 +1695,7 @@ static ncclResult_t recvProxyRegBuffer(struct ncclProxyConnection* connection, s /* DMA-BUF support */ if (resources->useDmaBuf) { int dmabuf_fd; - CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0), ret, peermem); + CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&dmabuf_fd, (CUdeviceptr)info->buffer, info->size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, getHandleForAddressRangeFlags(resources->useGdr)), ret, peermem); NCCLCHECKGOTO(proxyState->ncclNet->regMrDmaBuf(resources->netRecvComm, (void*)info->buffer, info->size, NCCL_PTR_CUDA, 0ULL, dmabuf_fd, &handle), ret, peermem); (void)close(dmabuf_fd); needReg = false; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index bc54133d3..bfff6e555 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -11,6 +11,7 @@ #include "graph.h" #include "utils.h" #include "param.h" +#include "profiler/net_ib.h" #include #include @@ -85,6 +86,11 @@ struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; static int ncclIbRelaxedOrderingEnabled = 0; +#define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED")) + +#define NCCL_IB_SL_DEFAULT 0 +#define NCCL_IB_TC_DEFAULT 0 + NCCL_PARAM(IbGidIndex, "IB_GID_INDEX", -1); NCCL_PARAM(IbRoutableFlidIbGidIndex, "IB_ROUTABLE_FLID_GID_INDEX", 1); NCCL_PARAM(IbRoceVersionNum, "IB_ROCE_VERSION_NUM", 2); @@ -92,8 +98,8 @@ NCCL_PARAM(IbTimeout, "IB_TIMEOUT", 20); NCCL_PARAM(IbRetryCnt, "IB_RETRY_CNT", 7); NCCL_PARAM(IbPkey, "IB_PKEY", 0); NCCL_PARAM(IbUseInline, "IB_USE_INLINE", 0); -NCCL_PARAM(IbSl, "IB_SL", 0); -NCCL_PARAM(IbTc, "IB_TC", 0); +NCCL_PARAM(IbSl, "IB_SL", -1); +NCCL_PARAM(IbTc, "IB_TC", -1); NCCL_PARAM(IbArThreshold, "IB_AR_THRESHOLD", 8192); NCCL_PARAM(IbPciRelaxedOrdering, "IB_PCI_RELAXED_ORDERING", 2); NCCL_PARAM(IbAdaptiveRouting, "IB_ADAPTIVE_ROUTING", -2); @@ -327,6 +333,9 @@ static ncclResult_t ncclIbRoceGetVersionNum(const char* deviceName, int portNum, close(fd); if (ret == -1) { + // In containerized environments, read could return EINVAL if the GID index is not mapped to the + // container sysfs. In this case return ncclSuccess and let the caller move to next GID index. 
+ if (errno == EINVAL) return ncclSuccess; WARN("NET/IB: read failed in ncclIbRoceGetVersionNum: %s", strerror(errno)); return ncclSystemError; } @@ -359,7 +368,7 @@ static ncclResult_t ncclUpdateGidIndex(struct ibv_context* context, uint8_t port return ncclSuccess; } int usrRoceVer = roceVer; - int gidRoceVerNum, gidRoceVerNumCandidate; + int gidRoceVerNum, gidRoceVerNumCandidate = -1; const char* deviceName = wrap_ibv_get_device_name(context->device); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, *gidIndex, &gidRoceVerNum)); NCCLCHECK(ncclIbRoceGetVersionNum(deviceName, portNum, gidIndexCandidate, &gidRoceVerNumCandidate)); @@ -530,8 +539,8 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { } ncclIbDev* dev = ncclIbDevs + props->devs[i]; if (dev->link != dev0->link) { - WARN("NET/IB : Trying to merge multiple devices together with different link_layer properties %s -> %d, %s -> %d. Try only selecting NICs with one type of link using NCCL_IB_HCA", - dev0->devName, dev0->link, dev->devName, dev->link); + WARN("NET/IB : Attempted to merge incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + props->devs[0], dev0->devName, dev0->portNum, NCCL_IB_LLSTR(dev0->link), props->devs[i], dev->devName, dev->portNum, NCCL_IB_LLSTR(dev->link)); return ncclInvalidUsage; } } @@ -548,8 +557,11 @@ ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return res; } -ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { +static ncclProfilerCallback_t ncclProfilerFunction; + +ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { ncclResult_t ret = ncclSuccess; + ncclProfilerFunction = profFunction; if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } @@ -571,7 +583,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { struct ibv_device** devices; // Check if user defined which IB device:port to use - char* userIbEnv = getenv("NCCL_IB_HCA"); + const char* userIbEnv = ncclGetEnv("NCCL_IB_HCA"); if (userIbEnv != NULL && shownIbHcaEnv++ == 0) INFO(NCCL_NET|NCCL_ENV, "NCCL_IB_HCA set to %s", userIbEnv); struct netIf userIfs[MAX_IB_DEVS]; bool searchNot = userIbEnv && userIbEnv[0] == '^'; @@ -634,7 +646,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? 
"IB" : "RoCE", ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); @@ -666,7 +678,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { ncclIbRelaxedOrderingEnabled = ncclIbRelaxedOrderingCapable(); for (int d = 0; d < ncclNIbDevs; d++) { snprintf(line+strlen(line), sizeof(line)-strlen(line), " [%d]%s:%d/%s", d, ncclIbDevs[d].devName, - ncclIbDevs[d].portNum, ncclIbDevs[d].link == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + ncclIbDevs[d].portNum, NCCL_IB_LLSTR(ncclIbDevs[d].link)); } char addrline[SOCKET_NAME_MAXLEN+1]; INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? "[RO]" : "", @@ -832,6 +844,8 @@ struct ncclIbConnectionMetadata { char devName[MAX_MERGED_DEV_NAME]; uint64_t fifoAddr; int ndevs; + int tc; + int sl; }; enum ncclIbCommState { @@ -873,12 +887,23 @@ struct ncclIbGidInfo { #define NCCL_NET_IB_REQ_FLUSH 3 const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush" }; +#define MAX_QPS_PER_REQ 8 +struct ncclProfilerInfo { + void* qpEventHandles[MAX_QPS_PER_REQ]; + int qpIndex[MAX_QPS_PER_REQ]; + int nEventHandles; + ncclProfilerNetIbDescr_v1_t data; +}; + struct ncclIbRequest { struct ncclIbNetCommBase* base; int type; struct ncclSocket* sock; int events[NCCL_IB_MAX_DEVS_PER_NIC]; struct ncclIbNetCommDevBase* devBases[NCCL_IB_MAX_DEVS_PER_NIC]; +#ifdef NCCL_ENABLE_NET_PROFILING + struct ncclProfilerInfo pInfo[NCCL_NET_IB_MAX_RECVS]; +#endif int nreqs; union { struct { @@ -1084,7 +1109,7 @@ ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, return ncclSuccess; } -ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc) { +ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint32_t dest_qp_num, struct ncclIbDevInfo* info, bool fifoTc, int tc, int sl) { struct ibv_qp_attr qpAttr; memset(&qpAttr, 0, sizeof(struct ibv_qp_attr)); qpAttr.qp_state = IBV_QPS_RTR; @@ -1100,7 +1125,7 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.flow_label = 0; qpAttr.ah_attr.grh.sgid_index = sGidInfo->localGidIndex; qpAttr.ah_attr.grh.hop_limit = 255; - qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? ncclParamIbFifoTc() : ncclParamIbTc(); + qpAttr.ah_attr.grh.traffic_class = fifoTc && ncclParamIbFifoTc() != -1 ? 
ncclParamIbFifoTc() : tc; } else { //pick lid if subnet prefixs are same, FLID if they are not if (ncclIbExtractLocalSubnetPrefix(sGidInfo->localGid.global.subnet_prefix) == @@ -1122,10 +1147,10 @@ ncclResult_t ncclIbRtrQp(struct ibv_qp* qp, struct ncclIbGidInfo* sGidInfo, uint qpAttr.ah_attr.grh.hop_limit = 255; } } - qpAttr.ah_attr.sl = ncclParamIbSl(); + qpAttr.ah_attr.sl = sl; qpAttr.ah_attr.src_path_bits = 0; qpAttr.ah_attr.port_num = info->ib_port; - TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port); + TRACE(NCCL_NET, "NET/IB : ncclIbRtrQp qpn=%u mtu=%d dst=%u ll=%u port=%u sl: %d tc: %d", qp->qp_num, info->mtu, dest_qp_num, info->link_layer, info->ib_port, qpAttr.ah_attr.sl, qpAttr.ah_attr.grh.traffic_class); NCCLCHECK(wrap_ibv_modify_qp(qp, &qpAttr, IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)); return ncclSuccess; } @@ -1164,12 +1189,13 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { goto exit; } -ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { ncclResult_t ret = ncclSuccess; struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; struct ncclIbSendComm* comm = (struct ncclIbSendComm*)stage->comm; int ready; + uint8_t link_layer = IBV_LINK_LAYER_UNSPECIFIED; *sendComm = NULL; if (stage->state == ncclIbCommStateConnect) goto ib_connect_check; @@ -1199,7 +1225,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet // IB Setup struct ncclIbMergedDev* mergedDev; if (dev >= ncclNMergedIbDevs) { - WARN("NET/IB : Trying to use non-existant virtual device %d", dev); + WARN("NET/IB : Trying to use non-existent virtual device %d", dev); return ncclInternalError; } @@ -1305,8 +1331,17 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); } } + if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = devInfo->link_layer; + if (link_layer != devInfo->link_layer) { + int ibDev0 = comm->devs[0].base.ibDevN; + WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + commDev->base.ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } meta.fifoAddr = (uint64_t)comm->fifo; + meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT; + meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? 
config->trafficClass : NCCL_IB_TC_DEFAULT; strncpy(meta.devName, mergedDev->devName, MAX_MERGED_DEV_NAME); stage->state = ncclIbCommStateSend; @@ -1332,13 +1367,16 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet comm->base.nRemDevs = remMeta.ndevs; - int link_layer; - link_layer = remMeta.devs[0].link_layer; - for (int i = 1; i < remMeta.ndevs; i++) { - if (remMeta.devs[i].link_layer != link_layer) { - WARN("NET/IB : Can't connect net devices with different link_layer. i=%d remMeta.ndevs=%d link_layer=%d rem_link_layer=%d", - i, remMeta.ndevs, link_layer, remMeta.devs[i].link_layer); - return ncclInternalError; + // ensure that the remote devices have the same link layer than the local devices used in the connection. + if (comm->base.vProps.ndevs > 0) { + int ibDev0 = comm->devs[0].base.ibDevN; + link_layer = ncclIbDevs[ibDev0].portAttr.link_layer; + for (int i = 0; i < remMeta.ndevs; i++) { + if (remMeta.devs[i].link_layer != link_layer) { + WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } } @@ -1373,7 +1411,7 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm, ncclNet ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu); - NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp, &commDev->base.gidInfo, remQpInfo->qpn, remDevInfo, false, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail); } @@ -1459,6 +1497,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle struct ncclIbCommStage* stage = &lComm->stage; struct ncclIbRecvComm* rComm = (struct ncclIbRecvComm*)stage->comm; int ready; + int link_layer = IBV_LINK_LAYER_UNSPECIFIED; *recvComm = NULL; if (stage->state == ncclIbCommStateAccept) goto ib_accept_check; @@ -1497,7 +1536,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle ncclNetVDeviceProps_t remoteVProps; memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t)); if (lComm->dev >= ncclNMergedIbDevs) { - WARN("NET/IB : Trying to use non-existant virtual device %d", lComm->dev); + WARN("NET/IB : Trying to use non-existent virtual device %d", lComm->dev); return ncclInternalError; } @@ -1555,6 +1594,13 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle ibDev = ncclIbDevs + ibDevN; NCCLCHECKGOTO(ncclIbGetGidIndex(ibDev->context, ibDev->portNum, &ibDev->portAttr, &rCommDev->base.gidInfo.localGidIndex), ret, fail); NCCLCHECKGOTO(wrap_ibv_query_gid(ibDev->context, ibDev->portNum, rCommDev->base.gidInfo.localGidIndex, &rCommDev->base.gidInfo.localGid), ret, fail); + if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = ibDev->portAttr.link_layer; + if (link_layer != ibDev->portAttr.link_layer) { + int ibDev0 = rComm->devs[0].base.ibDevN; + WARN("NET/IB : Attempted to connect incompatible devices: [%d]%s:%d/%s and [%d]%s:%d/%s. 
Try selecting NICs of only one link type using NCCL_IB_HCA", + ibDevN, ibDev->devName, ibDev->portNum, NCCL_IB_LLSTR(ibDev->portAttr.link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } // Copy remDevInfo for things like remGidInfo, remFifoAddr, etc. @@ -1562,6 +1608,12 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle rComm->base.remDevs[i] = remMeta.devs[i]; rComm->base.remDevs[i].remoteGid.global.interface_id = rComm->base.remDevs[i].gid.global.interface_id; rComm->base.remDevs[i].remoteGid.global.subnet_prefix = rComm->base.remDevs[i].gid.global.subnet_prefix; + if (remMeta.devs[i].link_layer != link_layer) { + int ibDev0 = rComm->devs[0].base.ibDevN; + WARN("NET/IB : Remote %s device is incompatible with the local [%d]%s:%d/%s. Try selecting NICs of only one link type using NCCL_IB_HCA", + NCCL_IB_LLSTR(remMeta.devs[i].link_layer), ibDev0, ncclIbDevs[ibDev0].devName, ncclIbDevs[ibDev0].portNum, NCCL_IB_LLSTR(link_layer)); + return ncclInternalError; + } } // Stripe QP creation across merged devs @@ -1598,7 +1650,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle meta.qpInfo[q].ece_supported = 0; } - NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); } @@ -1629,7 +1681,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle devInfo.gid.global.subnet_prefix = rCommDev->base.gidInfo.localGid.global.subnet_prefix; devInfo.gid.global.interface_id = rCommDev->base.gidInfo.localGid.global.interface_id; devInfo.mtu = ibDev->portAttr.active_mtu; - NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false), ret, fail); + NCCLCHECKGOTO(ncclIbRtrQp(rCommDev->gpuFlush.qp.qp, &rCommDev->base.gidInfo, rCommDev->gpuFlush.qp.qp->qp_num, &devInfo, false, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(rCommDev->gpuFlush.qp.qp), ret, fail); } @@ -1646,6 +1698,8 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle meta.devs[i].fifoRkey = rComm->devs[i].sizesFifoMr->rkey; } meta.fifoAddr = (uint64_t)rComm->sizesFifo; + meta.sl = remMeta.sl; + meta.tc = remMeta.tc; for (int q = 0; q < rComm->base.nqps; q++) { meta.qpInfo[q].qpn = rComm->base.qps[q].qp->qp_num; @@ -1842,7 +1896,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); -ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { +ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandle) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; int nreqs = slots[0].nreqs; @@ -1860,6 +1914,9 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { wr->wr.rdma.remote_addr = slots[r].addr; wr->next = wr + 1; wr_id += (reqs[r] - comm->base.reqs) << (r*8); +#ifdef NCCL_ENABLE_NET_PROFILING + reqs[r]->pInfo[0].nEventHandles = 0; +#endif } // Write size as immediate data. 
In the case of multi-send, only write @@ -1929,6 +1986,24 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { } struct ibv_send_wr* bad_wr; +#ifdef NCCL_ENABLE_NET_PROFILING + // QP profiling loop + for (int r=0; rpInfo[0].nEventHandles; + reqs[r]->pInfo[0].qpIndex[nEventHandles%MAX_QPS_PER_REQ] = qpIndex; + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + reqs[r]->pInfo[0].data.type = ncclProfileQp; + reqs[r]->pInfo[0].data.qp.device = devIndex; + reqs[r]->pInfo[0].data.qp.wr_id = comm->wrs[r].wr_id; + reqs[r]->pInfo[0].data.qp.opcode = comm->wrs[r].opcode; + reqs[r]->pInfo[0].data.qp.qpNum = qp->qp->qp_num; + reqs[r]->pInfo[0].data.qp.length = comm->sges[r].length; + NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles%MAX_QPS_PER_REQ], 0, pHandle, pluginId, &reqs[r]->pInfo[0].data)); + reqs[r]->pInfo[0].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_send(qp->qp, comm->wrs, &bad_wr)); for (int r=0; rbase.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2018,7 +2093,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* } TIME_START(0); - NCCLCHECK(ncclIbMultiSend(comm, slot)); + NCCLCHECK(ncclIbMultiSend(comm, slot, phandle)); // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); @@ -2109,7 +2184,7 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } @@ -2121,6 +2196,9 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* req->type = NCCL_NET_IB_REQ_RECV; req->sock = &comm->base.sock; req->nreqs = n; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int r = 0; r < n && phandles; r++) req->pInfo[r].nEventHandles = 0; +#endif for (int i = 0; i < comm->base.vProps.ndevs; i++) { req->devBases[i] = &comm->devs[i].base; @@ -2141,6 +2219,19 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* for (int i = 0; i < nqps; i++) { struct ncclIbQp* qp = comm->base.qps + comm->base.qpIndex; ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base); +#ifdef NCCL_ENABLE_NET_PROFILING + // Start a QP event for every request in the multirecv and every qp + for (int r = 0; r < n && phandles; r++) { + // Store info for profiler + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + req->pInfo[r].data.type = ncclProfileQp; + req->pInfo[r].data.qp.device = qp->devIndex; + req->pInfo[r].data.qp.wr_id = wr.wr_id; + req->pInfo[r].data.qp.qpNum = qp->qp->qp_num; + NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[i], 0, phandles[r], pluginId, &req->pInfo[r].data)); + req->pInfo[r].nEventHandles++; + } +#endif NCCLCHECK(wrap_ibv_post_recv(qp->qp, &wr, &bad_wr)); comm->base.qpIndex = (comm->base.qpIndex+1)%comm->base.nqps; 
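For illustration only: the per-QP profiler events added in ncclIbMultiSend, ncclIbIrecv and ncclIbTest above follow a start/stop bracket around each posted work request. The typedef and helpers below (demoProfilerCb, demoStartQpEvent, demoStopQpEvent) are hypothetical stand-ins that only mirror the calling convention visible here: type 0 starts an event and passes a descriptor, type 1 stops it; the real signature is defined by nccl_profiler.h.

// Hypothetical stand-ins; the real callback type comes from nccl_profiler.h.
typedef int (*demoProfilerCb)(void** eHandle, int type, void* pHandle, int pluginId, void* descr);

struct DemoQpDescr { int device; unsigned qpNum; };

// Post time (ncclIbMultiSend/ncclIbIrecv): start one event per QP the request is striped over.
static int demoStartQpEvent(demoProfilerCb cb, void** eHandle, void* pHandle, int pluginId, DemoQpDescr* d) {
  return cb(eHandle, 0 /*start*/, pHandle, pluginId, d);
}

// Completion time (ncclIbTest polling the CQ): stop the matching event through the same callback.
static int demoStopQpEvent(demoProfilerCb cb, void** eHandle) {
  return cb(eHandle, 1 /*stop*/, nullptr, 0, nullptr);
}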
} @@ -2196,6 +2287,16 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** #define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name) +#ifdef NCCL_ENABLE_NET_PROFILING +static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) { + for (int i = 0; i < MAX_QPS_PER_REQ; i++) { + int qpIndex = req->pInfo[request].qpIndex[i]; + if (req->base->qps[qpIndex].qp->qp_num == qpNumber) return i; + } + return 0; +} +#endif + ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { struct ncclIbRequest *r = (struct ncclIbRequest*)request; *done = 0; @@ -2205,11 +2306,24 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { TRACE(NCCL_NET, "r=%p done", r); *done = 1; if (sizes && r->type == NCCL_NET_IB_REQ_RECV) { - for (int i=0; inreqs; i++) sizes[i] = r->recv.sizes[i]; + for (int i=0; inreqs; i++) { + sizes[i] = r->recv.sizes[i]; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int j = 0; j < r->pInfo[i].nEventHandles; j++) { + NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], 1, NULL, 0, NULL)); + } +#endif + } } if (sizes && r->type == NCCL_NET_IB_REQ_SEND) { sizes[0] = r->send.size; +#ifdef NCCL_ENABLE_NET_PROFILING + for (int j = 0; j < r->pInfo[0].nEventHandles; j++) { + NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], 1, NULL, 0, NULL)); + } +#endif } + // Stop all remaining Qp events for this event NCCLCHECK(ncclIbFreeRequest(r)); return ncclSuccess; } @@ -2264,6 +2378,10 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { return ncclInternalError; } sendReq->events[i]--; +#ifdef NCCL_ENABLE_NET_PROFILING + // Stop Qp event for sendReq + NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[getReqQpIndex(sendReq, j, wc->qp_num)], 1, NULL, 0, NULL)); +#endif } } else { if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { @@ -2276,6 +2394,12 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { } } req->events[i]--; +#ifdef NCCL_ENABLE_NET_PROFILING + // Stop Qp event for workFifo + for (int j = 0; j < req->nreqs; j++) { + NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[getReqQpIndex(req, j, wc->qp_num)], 1, NULL, 0, NULL)); + } +#endif } } // Once the IB fatal event is reported in the async thread, we want to propagate this error diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 235dee865..8034d95fe 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -9,6 +9,7 @@ #include "socket.h" #include "net.h" #include "param.h" +#include "profiler/net_socket.h" #include #include @@ -35,7 +36,10 @@ static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) { return ncclSuccess; } -ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction) { +static ncclProfilerCallback_t ncclProfilerFunction; + +ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { + ncclProfilerFunction = profFunction; if (ncclNetIfs == -1) { pthread_mutex_lock(&ncclNetSocketLock); if (ncclNetIfs == -1) { @@ -158,6 +162,11 @@ struct ncclNetSocketTask { ncclResult_t result; }; +struct ncclProfilerInfo { + void* eHandle; + void* pHandle; +}; + struct ncclNetSocketRequest { int op; void* data; @@ -168,6 +177,7 @@ struct ncclNetSocketRequest { struct ncclNetSocketComm* comm; struct ncclNetSocketTask* tasks[MAX_SOCKETS]; int nSubs; + struct ncclProfilerInfo pInfo; }; struct ncclNetSocketTaskQueue { @@ -180,6 +190,7 @@ struct ncclNetSocketThreadResources { struct 
ncclNetSocketTaskQueue threadTaskQueue; int stop; struct ncclNetSocketComm* comm; + struct ncclProfilerInfo* pInfo; pthread_mutex_t threadLock; pthread_cond_t threadCond; }; @@ -210,6 +221,9 @@ void* persistentSocketThread(void *args_) { struct ncclNetSocketComm* comm = resource->comm; struct ncclNetSocketTaskQueue* myQueue = &resource->threadTaskQueue; int nSocksPerThread = comm->nSocks / comm->nThreads; +#ifdef NCCL_ENABLE_NET_PROFILING + void* eHandle[MAX_REQUESTS*MAX_SOCKETS] = { 0 }; +#endif while (1) { int idle = 1; int mark = myQueue->next; // mark newest task seen @@ -220,13 +234,33 @@ void* persistentSocketThread(void *args_) { for (int j=0; jtasks+i+j; if (r != NULL && r->used == 1 && r->offset < r->size) { +#ifdef NCCL_ENABLE_NET_PROFILING + if (!eHandle[i+j]) { + ncclProfilerNetSockDescr_v1_t data; + data.type = ncclProfileSocket; + data.sock.fd = r->sock->fd; + data.sock.op = r->op; + data.sock.length = r->size; + ncclProfilerFunction(&eHandle[i+j], 0, resource->pInfo->pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + } +#endif r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset); if (r->result != ncclSuccess) { +#ifdef NCCL_ENABLE_NET_PROFILING + ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + eHandle[i+j] = NULL; +#endif WARN("NET/Socket : socket progress error"); return NULL; } idle = 0; if (r->offset < r->size) repeat = 1; +#ifdef NCCL_ENABLE_NET_PROFILING + if (repeat == 0) { + ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + eHandle[i+j] = NULL; + } +#endif } } } while (repeat); @@ -326,7 +360,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) goto exit; } -ncclResult_t ncclNetSocketConnect(int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } @@ -444,7 +478,7 @@ ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, voi return ncclInternalError; } -ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* data, int size, struct ncclNetSocketTask** req) { +ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclProfilerInfo* pInfo, int op, void* data, int size, struct ncclNetSocketTask** req) { int tid = comm->nextSock % comm->nThreads; struct ncclNetSocketThreadResources* res = comm->threadResources+tid; struct ncclNetSocketTaskQueue* queue = &res->threadTaskQueue; @@ -457,6 +491,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, int op, void* NCCLCHECK(ncclCalloc(&queue->tasks, queue->len)); queue->next = 0; res->comm = comm; +#ifdef NCCL_ENABLE_NET_PROFILING + res->pInfo = pInfo; +#endif pthread_mutex_init(&res->threadLock, NULL); pthread_cond_init(&res->threadCond, NULL); PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create"); @@ -520,7 +557,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); while (chunkOffset < r->size) { int chunkSize = std::min(taskSize, r->size-chunkOffset); - NCCLCHECK(ncclNetSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); + NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, 
(char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); chunkOffset += chunkSize; } } @@ -544,6 +581,16 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { } } } else { // progress request using main thread +#ifdef NCCL_ENABLE_NET_PROFILING + if (!r->pInfo.eHandle) { + ncclProfilerNetSockDescr_v1_t data; + data.type = ncclProfileSocket; + data.sock.fd = r->ctrlSock->fd; + data.sock.op = r->op; + data.sock.length = r->size; + ncclProfilerFunction(&r->pInfo.eHandle, 0, r->pInfo.pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + } +#endif if (r->offset < r->size) { NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, r->data, r->size, &r->offset)); } @@ -551,6 +598,10 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { if (size) *size = r->size; *done = 1; r->used = 0; +#ifdef NCCL_ENABLE_NET_PROFILING + ncclProfilerFunction(&r->pInfo.eHandle, 1, NULL, 0, NULL); + r->pInfo.eHandle = NULL; +#endif } } } @@ -562,16 +613,26 @@ ncclResult_t ncclNetSocketRegMr(void* comm, void* data, size_t size, int type, v } ncclResult_t ncclNetSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } -ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { +ncclResult_t ncclNetSocketIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)sendComm; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_SEND, data, (int) size, (struct ncclNetSocketRequest**)request)); +#ifdef NCCL_ENABLE_NET_PROFILING + // NCCL core profiler callback + struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request; + req->pInfo.pHandle = phandle; +#endif return ncclSuccess; } -ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** request) { +ncclResult_t ncclNetSocketIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclNetSocketComm* comm = (struct ncclNetSocketComm*)recvComm; if (n != 1) return ncclInternalError; NCCLCHECK(ncclNetSocketGetRequest(comm, NCCL_SOCKET_RECV, data[0], (int)sizes[0], (struct ncclNetSocketRequest**)request)); +#ifdef NCCL_ENABLE_NET_PROFILING + // NCCL core profiler callback + struct ncclNetSocketRequest* req = *(struct ncclNetSocketRequest **)request; + if (phandles) req->pInfo.pHandle = phandles[0]; +#endif return ncclSuccess; } diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index 3fe25a324..d99f7cb3e 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -108,29 +108,29 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { - CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, size)); - CUCHECK(cuMemUnmap(ptr, size)); - CUCHECK(cuMemAddressFree(ptr, size)); +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) { + CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, ucsize)); + CUCHECK(cuMemUnmap(ptr, mcsize)); + CUCHECK(cuMemAddressFree(ptr, mcsize)); CUCHECK(cuMemRelease(*mcHandler)); - INFO(NCCL_NVLS, "rank %d - NVLS deregistered buffer %p on device %d, size %ld", comm->rank, (void*)ptr, dev, size); + INFO(NCCL_NVLS, 
"rank %d - NVLS deregistered buffer %p on device %d ucsize %ld mcsize %ld", comm->rank, (void*)ptr, dev, ucsize, mcsize); return ncclSuccess; } -ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t size, void* ucptr, CUmemGenericAllocationHandle* ucHandle, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { - INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) MC handle 0x%llx(%p)", *ucHandle, ucptr, *mcHandle, mcptr); +ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t ucsize, void* ucptr, CUmemGenericAllocationHandle* ucHandle, size_t mcsize, void* mcptr, CUmemGenericAllocationHandle* mcHandle) { + INFO(NCCL_NVLS, "NVLS Unmap mem UC handle 0x%llx(%p) ucsize %zu MC handle 0x%llx(%p) mcsize %zd", *ucHandle, ucptr, ucsize, *mcHandle, mcptr, mcsize); // Release the UC memory and mapping if (ucptr) { - CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, size)); - CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, size)); + CUCHECK(cuMemUnmap((CUdeviceptr)ucptr, ucsize)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ucptr, ucsize)); CUCHECK(cuMemRelease(*ucHandle)); } // Release the MC memory and mapping if (mcptr) { - CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, size)); - CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, size)); + CUCHECK(cuMemUnmap((CUdeviceptr)mcptr, mcsize)); + CUCHECK(cuMemAddressFree((CUdeviceptr)mcptr, mcsize)); CUCHECK(cuMemRelease(*mcHandle)); } @@ -197,25 +197,27 @@ ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { goto exit; } -static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularity_flags mcOption, const CUmemAccessDesc* desc, size_t* sizePtr, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr) { +static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc* desc, size_t size, CUmemGenericAllocationHandle* ucHandle, CUmemGenericAllocationHandle* mcHandle, void** ucptr, void** mcptr, size_t* ucsizePtr, size_t* mcsizePtr) { char shareableHandle[NVLS_HANDLE_SIZE]; CUmulticastObjectProp mcprop; CUmemAllocationProp ucprop; ncclResult_t ret = ncclSuccess; - size_t size = *sizePtr; - size_t originSize = size; + size_t mcsize; + size_t ucsize; size_t ucgran, mcgran; int allocMcHandle = 0; + mcsize = ucsize = size; *ucptr = *mcptr = NULL; + memset(shareableHandle, '\0', sizeof(shareableHandle)); memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); mcprop.numDevices = comm->localRanks; mcprop.handleTypes = ncclCuMemHandleType; mcprop.flags = 0; mcprop.size = size; - CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, mcOption), ret, fail); - ALIGN_SIZE(size, mcgran); - *sizePtr = mcprop.size = size; + CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + ALIGN_SIZE(mcsize, mcgran); + mcprop.size = mcsize; if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); @@ -235,26 +237,29 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, CUmulticastGranularit ucprop.location.id = comm->cudaDev; ucprop.requestedHandleTypes = ncclCuMemHandleType; CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); - // Map a VA for UC memory - CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, size, ucgran, 0U, 0), ret, fail); + ALIGN_SIZE(ucsize, ucgran); + // Map a VA for UC memory with MC alignment and size + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, ucsize, ucgran, 0U, 0), 
ret, fail); // Alloc local physical mem for this NVLS group - CUCHECKGOTO(cuMemCreate(ucHandle, size, &ucprop, 0), ret, fail); - CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, size, 0, *ucHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, size, desc, 1), ret, fail); - CUDACHECKGOTO(cudaMemset(*ucptr, 0, size), ret, fail); + CUCHECKGOTO(cuMemCreate(ucHandle, ucsize, &ucprop, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, ucsize, 0, *ucHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, ucsize, desc, 1), ret, fail); + CUDACHECKGOTO(cudaMemset(*ucptr, 0, ucsize), ret, fail); // intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); // Bind physical memory to the Multicast group // NB: It will block until all ranks have been added to the Group - CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, size, 0/*flags*/), ret, fail); + CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, ucsize, 0/*flags*/), ret, fail); // Map mc virtual address - CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, size, mcgran, 0U, 0), ret, fail); - CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, size, 0, *mcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, size, desc, 1), ret, fail); - INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld size %ld (%ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, size, originSize); + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, mcsize, mcgran, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*mcptr, mcsize, 0, *mcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*mcptr, mcsize, desc, 1), ret, fail); + *ucsizePtr = ucsize; + *mcsizePtr = mcsize; + INFO(NCCL_NVLS, "NVLS rank %d (dev %d) alloc done, ucptr %p ucgran %ld mcptr %p mcgran %ld ucsize %ld mcsize %ld (inputsize %ld)", comm->rank, comm->cudaDev, *ucptr, ucgran, *mcptr, mcgran, ucsize, mcsize, size); exit: return ret; @@ -273,6 +278,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { size_t nvlsTotalSize = 0; struct ncclNvlsSharedRes* resources = NULL; int nChannels = -1; + cudaStream_t deviceStream, hostStream; if (comm->nvlsSupport == 0 || comm->nvlsResources->inited) return ncclSuccess; // initialize after checking comm->nvlsSupport @@ -288,10 +294,10 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize); - NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_RECOMMENDED, &resources->accessDesc, &nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff), res, fail); - resources->buffSize = nvlsTotalSize; + NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff, &resources->buffUCSize, &resources->buffMCSize), res, fail); - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, 
&hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { @@ -306,15 +312,16 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { peer->recv[1].conn.buffs[NCCL_PROTO_SIMPLE] = resources->ucBuff + ((h * 2 + 1) * nChannels + c) * buffSize; peer->send[0].conn.buffs[NCCL_PROTO_SIMPLE] = resources->mcBuff + ((h * 2 + 1) * nChannels + c) * buffSize; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); } } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail); // For now, the barrier is a must that guarantees all buffers are mc-mapped before accessing peer's buffer NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), res, fail); comm->nvlsResources->inited = true; @@ -374,6 +381,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { size_t memSize = 64; size_t creditSize = nChannels * 2 * memSize * nHeads; int nvlsStepSize = comm->nvlsChunkSize; + cudaStream_t hostStream, deviceStream; NCCLCHECKGOTO(ncclCalloc(&comm->nvlsResources, 1), res, fail); comm->nvlsResources->inited = false; @@ -398,11 +406,11 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { 
resources->accessDesc.location.id = comm->cudaDev; resources->dev = comm->cudaDev; - NCCLCHECKGOTO(nvlsAllocateMem(comm, CU_MULTICAST_GRANULARITY_MINIMUM, &resources->accessDesc, &creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit), res, fail); - resources->creditSize = creditSize; + NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, creditSize, &resources->ucCreditHandle, &resources->mcCreditHandle, (void**)&resources->ucCredit, (void**)&resources->mcCredit, &resources->creditUCSize, &resources->creditMCSize), res, fail); // Set up head and tail only for now - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), res, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), res, fail); for (int h = 0; h < nHeads; h++) { int nvlsPeer = comm->nRanks + 1 + h; for (int c = 0; c < nChannels; c++) { @@ -440,14 +448,15 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { peer->send[0].conn.stepSize = nvlsStepSize; peer->send[0].conn.flags |= NCCL_NVLS_MIN_POLL; - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); - CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, comm->sharedRes->hostStream.cudaStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[0], &peer->send[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[0], &peer->recv[0].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->send[1], &peer->send[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); + CUDACHECKGOTO(cudaMemcpyAsync(&comm->channels[c].devPeersHostPtr[nvlsPeer]->recv[1], &peer->recv[1].conn, sizeof(struct ncclConnInfo), cudaMemcpyHostToDevice, hostStream), res, fail); } } - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), res, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), res, fail); + NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), res, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), res, fail); } // MNNVL does not 
support NVLS buffer registration @@ -488,13 +497,13 @@ ncclResult_t ncclNvlsFree(struct ncclComm* comm) { NCCLCHECK(ncclShmClose(resources->nvlsShmemHandle)); if (resources->ucCredit || resources->mcCredit) { - NCCLCHECK(nvlsGroupUnbind(comm, resources->creditSize, &resources->mcCreditHandle)); - NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditSize, resources->ucCredit, &resources->ucCreditHandle, resources->mcCredit, &resources->mcCreditHandle)); + NCCLCHECK(nvlsGroupUnbind(comm, resources->creditUCSize, &resources->mcCreditHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->creditUCSize, resources->ucCredit, &resources->ucCreditHandle, resources->creditMCSize, resources->mcCredit, &resources->mcCreditHandle)); } if (comm->nvlsResources->inited) { - NCCLCHECK(nvlsGroupUnbind(comm, resources->buffSize, &resources->mcBuffHandle)); - NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffSize, resources->ucBuff, &resources->ucBuffHandle, resources->mcBuff, &resources->mcBuffHandle)); + NCCLCHECK(nvlsGroupUnbind(comm, resources->buffUCSize, &resources->mcBuffHandle)); + NCCLCHECK(nvlsGroupUnmapMem(comm, resources->buffUCSize, resources->ucBuff, &resources->ucBuffHandle, resources->buffMCSize, resources->mcBuff, &resources->mcBuffHandle)); } free(resources); comm->nvlsResources = NULL; @@ -513,7 +522,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t size_t minSize = SIZE_MAX; struct localRegData* regData = NULL; cudaPointerAttributes attr; - size_t ucgran, mcgran; + size_t ucgran, mcgran, ucsize, mcsize; NCCLCHECKGOTO(ncclCalloc(®Data, comm->localRanks), ret, fail); @@ -538,13 +547,12 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)®Record->baseAddr, ®Record->baseSize, (CUdeviceptr)regRecord->addr), ret, fail); - if (regSize % mcgran == 0) { - regRecord->regSize = regSize; - } else { - regRecord->regSize = regRecord->baseSize - (regRecord->addr - regRecord->baseAddr); - } - - if (regRecord->addr % ucgran == 0 && regRecord->regSize % mcgran == 0) { + if (regRecord->addr % ucgran == 0) { + if (regSize % ucgran != 0) { + regRecord->regUCSize = ALIGN_SIZE(regSize, ucgran); + } else { + regRecord->regUCSize = regSize; + } regRecord->state |= NVLS_REG_POSSIBLE; memcpy(®Data[comm->localRank].reg, regRecord, sizeof(struct ncclReg)); regData[comm->localRank].offset = userBuff - regRecord->addr; @@ -564,13 +572,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t goto fail; } /* get minimal reg size of nvls buffers */ - if (minSize > regData[i].reg.regSize) - minSize = regData[i].reg.regSize; + if (minSize > regData[i].reg.regUCSize) + minSize = regData[i].reg.regUCSize; } /* start registration */ + mcsize = ucsize = minSize; mcprop.size = minSize; CUCHECKGOTO(cuMulticastGetGranularity(&mcgran, &mcprop, CU_MULTICAST_GRANULARITY_RECOMMENDED), ret, fail); + ALIGN_SIZE(mcsize, mcgran); + mcprop.size = mcsize; + if (comm->localRank == 0) { NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); @@ -583,16 +595,17 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t // Coverity complains 
that regRecord could be NULL. That won't in practice be the case because we've already checked // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out. // coverity[var_deref_op] - CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, minSize, 0), ret, fail); + CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, ucsize, 0), ret, fail); // Create a VA for the NVLS - CUCHECKGOTO(cuMemAddressReserve(®Ptr, minSize, mcgran, 0U, 0), ret, fail); + CUCHECKGOTO(cuMemAddressReserve(®Ptr, mcsize, mcgran, 0U, 0), ret, fail); // Map the VA locally - CUCHECKGOTO(cuMemMap(regPtr, minSize, 0, mcHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess(regPtr, minSize, &comm->nvlsResources->accessDesc, 1), ret, fail); + CUCHECKGOTO(cuMemMap(regPtr, mcsize, 0, mcHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(regPtr, mcsize, &comm->nvlsResources->accessDesc, 1), ret, fail); regRecord->regAddr = regPtr; - regRecord->regSize = minSize; + regRecord->regUCSize = ucsize; + regRecord->regMCSize = mcsize; regRecord->dev = comm->nvlsResources->dev; regRecord->mcHandle = mcHandle; regRecord->state |= NVLS_REG_COMPLETE; @@ -706,7 +719,7 @@ static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbu return ncclSuccess; fail: regBufUsed = 0; - WARN("rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize); + INFO(NCCL_REG, "rank %d failed to NVLS register sendbuff %p sendbuffSize %ld recvbuff %p recvbuffSize %ld", comm->rank, sendbuff, sendbuffSize, recvbuff, recvbuffSize); goto exit; } @@ -843,7 +856,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send return ncclSuccess; } -ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size) { +ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) { return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index dac762157..aed84c588 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -407,6 +407,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st comm->peerInfo[intermediateRank].nvmlDev, useReadStr); } + memset(&req, '\0', sizeof(req)); req.size = sendSize; req.refcount = 0; if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++; @@ -466,6 +467,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st info->rank = intermediateRank; } + memset(&req, '\0', sizeof(req)); req.size = recvSize; req.refcount = 0; if (P2P_SAME_PID((comm->peerInfo + info->rank), peerInfo) && (comm->peerInfo[info->rank].cudaDev != peerInfo->cudaDev)) req.refcount++; @@ -527,7 +529,7 @@ ncclResult_t p2pRecvConnect(struct ncclComm* comm, struct ncclConnect* connectIn if (useMemcpy) { // Attach to peer's SHM segment - NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->shm, (void**)&resources->devShm, &resources->desc)); recv->conn.tail = &resources->devShm->recvMem.tail; recv->conn.head = &resources->devShm->sendMem.head; @@ -634,7 +636,7 @@ static ncclResult_t 
p2pSendProxySetup(struct ncclProxyConnection* connection, st // Create a SHM segment for the peer to attach to shmSize = sizeof(struct ncclSendMem) + sizeof(struct ncclRecvMem); - NCCLCHECK(ncclShmAllocateShareableBuffer(proxyState->tpRank, shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm)); + NCCLCHECK(ncclShmAllocateShareableBuffer(shmSize, false, &proxyInfo->desc, (void**)&proxyInfo->shm, (void**)&proxyInfo->devShm)); NCCLCHECK(ncclCudaHostCalloc(&proxyInfo->ceRecvMem, 1)); memcpy(respBuff, proxyInfo, sizeof(struct p2pShmProxyInfo)); @@ -805,7 +807,7 @@ static ncclResult_t ipcRegisterBuffer(ncclComm* comm, const void* userbuff, size ncclResult_t ret = ncclSuccess; struct ncclIpcRegInfo* newInfo = NULL; uintptr_t* peerRmtAddrs = NULL; - bool legacyIpcCap = false; + int legacyIpcCap = 0; size_t baseSize = 0; void* baseAddr = NULL; bool needUpdate = false; @@ -916,13 +918,16 @@ ncclResult_t ret = ncclSuccess; if (type == NCCL_IPC_COLLECTIVE) { // for collective, store registered remote buffers into dev memory for future reference if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL || needUpdate) { - NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); + cudaStream_t hostStream, deviceStream; + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); if (regRecord->regIpcAddrs.devPeerRmtAddrs == NULL) - NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); + NCCLCHECKGOTO(ncclCudaCallocAsync(®Record->regIpcAddrs.devPeerRmtAddrs, comm->localRanks, hostStream), ret, fail); if (needUpdate) - NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, comm->sharedRes->hostStream.cudaStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamWaitStream(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, &comm->sharedRes->hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream), ret, fail); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(regRecord->regIpcAddrs.devPeerRmtAddrs, regRecord->regIpcAddrs.hostPeerRmtAddrs, comm->localRanks, hostStream), ret, fail); + NCCLCHECKGOTO(ncclStreamWaitStream(deviceStream, hostStream, comm->sharedRes->scratchEvent), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), ret, fail); } peerRmtAddrs = regRecord->regIpcAddrs.devPeerRmtAddrs; } else { @@ -941,7 +946,7 @@ ncclResult_t ret = ncclSuccess; *offsetOut = 0; *peerRmtAddrsOut = NULL; if (newInfo) free(newInfo); - WARN("rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %p", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc); + INFO(NCCL_REG, "rank %d failed to IPC register userbuff %p buffSize %ld nPeers %d isLegacyIpc %d type %s", comm->rank, userbuff, buffSize, nPeers, isLegacyIpc ? *isLegacyIpc : -1, ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR ? 
"POSIX_FD" : "FABRIC"); goto exit; } diff --git a/src/transport/profiler.cc b/src/transport/profiler.cc new file mode 100644 index 000000000..3e32843aa --- /dev/null +++ b/src/transport/profiler.cc @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#include "transport.h" +#include "proxy.h" +#include "profiler.h" + +static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { + connection->proxyAppendPtr = &connection->proxyAppend; + connection->shared = 1; + return ncclSuccess; +} + +// The following ncclProxySubArgs are overloaded by the profiler progress function: +// - base : is set to the current value of workCounter[channelId] +// - posted : is set to sub->nsteps to indicate that the profiler has started the event +// - transmitted: is set to sub->nsteps to indicate that the profiler has stopped the event +static ncclResult_t profilerProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { + if (args->state == ncclProxyOpReady) { + for (int s = 0; s < args->nsubs; s++) { + struct ncclProxySubArgs* sub = args->subs + s; + sub->base = sub->workCounter; + sub->posted = sub->transmitted = 0; + } + args->state = ncclProxyOpProgress; + } + if (args->state == ncclProxyOpProgress) { + for (int s = 0; s < args->nsubs; s++) { + struct ncclProxySubArgs* sub = args->subs + s; + uint64_t* workStarted = (uint64_t *)sub->sendbuff; + uint64_t* workCompleted = (uint64_t *)sub->recvbuff; + if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId]) { + ncclProfilerStartKernelChEvent(args, s); + sub->posted = sub->nsteps; + continue; // allow events on every channel to start + } + if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId]) { + ncclProfilerStopKernelChEvent(args, s); + sub->transmitted = sub->nsteps; + args->done++; + } + } + if (args->done == args->nsubs) args->state = ncclProxyOpNone; + } + return ncclSuccess; +} + +struct ncclTransport profilerTransport = { + "Prof", + NULL, + { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL, profilerProxyConnect, NULL, profilerProxyProgress, NULL, NULL } +}; diff --git a/src/transport/shm.cc b/src/transport/shm.cc index d2d6906e8..aa3e6c41b 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -18,6 +18,7 @@ struct shmBuffInfo { }; struct shmConnectInfo { + int rank; ncclShmIpcDesc_t desc; struct shmBuffInfo buf; }; @@ -120,6 +121,7 @@ static ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* gr NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 1, myInfo->rank, &send->proxyConn)); NCCLCHECK(ncclProxyCallBlocking(comm, &send->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct shmRequest), (void*)info, sizeof(struct shmConnectInfo))); + info->rank = comm->rank; resources->hostMem = (struct ncclSendMem*)info->buf.hptr; resources->devHostMem = (struct ncclSendMem*)info->buf.dptr; @@ -150,6 +152,7 @@ static ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* gr NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_SHM, 0, myInfo->rank, &recv->proxyConn)); NCCLCHECK(ncclProxyCallBlocking(comm, &recv->proxyConn, ncclProxyMsgSetup, (void*)&req, sizeof(struct 
shmRequest), (void*)info, sizeof(struct shmConnectInfo))); + info->rank = comm->rank; resources->hostMem = (struct ncclRecvMem*)info->buf.hptr; resources->devHostMem = (struct ncclRecvMem*)info->buf.dptr; @@ -163,7 +166,7 @@ static ncclResult_t shmSendConnect(struct ncclComm* comm, struct ncclConnect* co struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; char* buff; - NCCLCHECK(ncclShmImportShareableBuffer(comm, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); buff = shmLocality == SHM_SEND_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1); for (int p=0; pdesc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); + NCCLCHECK(ncclShmImportShareableBuffer(comm, info->rank, &info->desc, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, &resources->remDesc)); buff = shmLocality == SHM_RECV_SIDE ? (char*)(resources->devHostMem + 1) : (char*)(resources->devRemHostMem + 1); for (int p=0; ptpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; exit: @@ -485,7 +488,7 @@ static ncclResult_t shmRecvProxySetup(struct ncclProxyConnection* connection, st struct shmProxyInfo* proxyInfo; NCCLCHECK(ncclCalloc(&proxyInfo, 1)); - NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(proxyState->tpRank, req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); + NCCLCHECKGOTO(ncclShmAllocateShareableBuffer(req->size, req->legacy, &proxyInfo->desc, &info->buf.hptr, &info->buf.dptr), result, fail); memcpy(&info->desc, &proxyInfo->desc, sizeof(ncclShmIpcDesc_t)); connection->transportResources = proxyInfo; exit: @@ -517,9 +520,9 @@ static void initCeOperation() { } } -ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) { - if (desc == NULL || hptr == NULL || tpProxyRank < -1) { - WARN("Invalid argument desc %p, hptr %p, tpProxyRank %d", desc, hptr, tpProxyRank); +ncclResult_t ncclShmAllocateShareableBuffer(size_t size, bool legacy, ncclShmIpcDesc_t *desc, void **hptr, void **dptr) { + if (desc == NULL || hptr == NULL) { + WARN("Invalid argument desc %p, hptr %p", desc, hptr); return ncclInvalidArgument; } #if CUDART_VERSION >= 12020 @@ -532,7 +535,6 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l if (type == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { // Return the native cuMem handle for later Export/Import via UDS memcpy(&desc->shmci.data, &handle, sizeof(handle)); - desc->shmci.tpProxyRank = tpProxyRank; } else { CUCHECK(cuMemExportToShareableHandle(&desc->shmci.handle, handle, type, 0)); } @@ -560,7 +562,7 @@ ncclResult_t ncclShmAllocateShareableBuffer(int tpProxyRank, size_t size, bool l return ncclSuccess; } -ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) { +ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, int proxyRank, 
ncclShmIpcDesc_t *desc, void **hptr, void **dptr, ncclShmIpcDesc_t *descOut) { if (comm == NULL || desc == NULL || hptr == NULL || descOut == NULL) { WARN("Invalid argument comm %p, desc %p, hptr %p, descOut %p", comm, desc, hptr, descOut); return ncclInvalidArgument; @@ -584,7 +586,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_ // UDS fd support int fd = -1; // Send cuMem handle to remote for conversion to an fd - NCCLCHECK(ncclProxyClientGetFdBlocking(comm, desc->shmci.tpProxyRank, &desc->shmci.data, &fd)); + NCCLCHECK(ncclProxyClientGetFdBlocking(comm, proxyRank, &desc->shmci.data, &fd)); CUCHECK(cuMemImportFromShareableHandle(&handle, (void *)(uintptr_t)fd, type)); (void) close(fd); } else { @@ -625,7 +627,7 @@ ncclResult_t ncclShmImportShareableBuffer(struct ncclComm *comm, ncclShmIpcDesc_ descOut->shmci.ptr = *hptr = (void *)hostptr; descOut->legacy = false; if (dptr) *dptr = (void *)hostptr; - INFO(NCCL_SHM, "CUMEM imported shareable host buffer from tpProxyRank %d size %zi ptr %p, granularity %ld", desc->shmci.tpProxyRank, desc->shmci.size, descOut->shmci.ptr, granularity); + INFO(NCCL_SHM, "CUMEM imported shareable host buffer from proxyRank %d size %zi ptr %p, granularity %ld", proxyRank, desc->shmci.size, descOut->shmci.ptr, granularity); } else { char shmPath[SHM_PATH_MAX]; snprintf(shmPath, sizeof(shmPath), "/dev/shm/nccl-%s", desc->shmli.shmSuffix); From 145e67e70745c5f78f18334f82de29dbe59bde63 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Wed, 9 Apr 2025 09:02:40 -0700 Subject: [PATCH 07/21] Update ext-profiler example Sync ext-profiler example with 2.26.2. --- ext-profiler/README.md | 142 +++++++++++++++++++--- ext-profiler/example/Makefile | 2 +- ext-profiler/example/event.h | 41 ++++++- ext-profiler/example/nccl/net_ib_v1.h | 34 ++++++ ext-profiler/example/nccl/net_socket_v1.h | 32 +++++ ext-profiler/example/nccl/profiler.h | 51 +++++++- ext-profiler/example/nccl/profiler_net.h | 22 ++++ ext-profiler/example/nccl/profiler_v1.h | 16 ++- ext-profiler/example/nccl/profiler_v2.h | 44 +------ ext-profiler/example/nccl/profiler_v3.h | 119 ++++++++++++++++++ ext-profiler/example/plugin.c | 105 +++++++++++++++- ext-profiler/example/plugin.h | 13 ++ ext-profiler/example/print_event.c | 78 +++++++++++- 13 files changed, 621 insertions(+), 78 deletions(-) create mode 100644 ext-profiler/example/nccl/net_ib_v1.h create mode 100644 ext-profiler/example/nccl/net_socket_v1.h create mode 100644 ext-profiler/example/nccl/profiler_net.h create mode 100644 ext-profiler/example/nccl/profiler_v3.h create mode 100644 ext-profiler/example/plugin.h diff --git a/ext-profiler/README.md b/ext-profiler/README.md index 7ef44b2fa..2a4018c07 100644 --- a/ext-profiler/README.md +++ b/ext-profiler/README.md @@ -49,9 +49,9 @@ of newer ones. The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v2) +# API (v3) -Below is the main `ncclProfiler_v2` struct. Each function is explained in later sections. +Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections. 
``` typedef struct { @@ -70,7 +70,7 @@ typedef struct { // - eDescr : pointer to ncclProfilerEventDescr_t object // Output // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); // stopEvent - stop/finalize an event inside and event set // Input @@ -82,13 +82,13 @@ typedef struct { // - eHandle : handle to event object created through startEvent // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); // finalize - finalize the profiler plugin // Input // - context: opaque profiler context object ncclResult_t (*finalize)(void* context); -} ncclProfiler_v2_t; +} ncclProfiler_v3_t; ``` ## Error codes @@ -156,7 +156,6 @@ typedef struct { size_t count; // data count int root; // root rank const char* datatype; // string containing the name of the datatype - size_t trafficBytes; // number of transfer bytes uint8_t nMaxChannels; // max number of channels for this collective uint8_t nWarps; // number of GPU warps for this collective const char* algo; // string containing name of the algorithm for this collective @@ -185,12 +184,22 @@ typedef struct { struct { // proxyStep events metadata int step; // individual step in `ncclProxyOp` } proxyStep; + + struct { + uint8_t channelId; // id of the channel used by the kernel + } kernelCh; + + struct { + int64_t id; // net plugin id (used by net and profiler plugins to agree on event definitions) + void* data; // pointer to network plugin defined event + } netPlugin; }; -} ncclProfilerEventDescr_v2_t; +} ncclProfilerEventDescr_v3_t; ``` NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, -`ncclProfileProxyOp`, `ncclProfileProxyStep`, and `ncclProfileProxyCtrl`. +`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and +`ncclProfileNetPlugin`. #### stopEvent @@ -236,7 +245,7 @@ typedef enum { ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end -} ncclProfilerEventState_v2_t; +} ncclProfilerEventState_v3_t; ``` `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing @@ -251,6 +260,89 @@ the channel. Thus, they provide a more fine-grained view w.r.t. ProxyOp events. network requests for the GPU kernel. This includes everything else that the proxy thread might be doing, including appending new `ncclProxyOp` objects to the list of work elements to process. +`ncclProfileKernelCh` events are generated by the profiler proxy progress function while the kernel +processes work items for the enqueued NCCL operations. + +`ncclProfileNetPlugin` events are generated by the network plugin. Network plugins are free to define +their own set of events and communicate them to the profiler plugin using `ncclProfileNetPlugin` and +the `ncclProfilerCallback\_t` NCCL core callback. 
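For orientation, below is a minimal sketch of how a profiler's `startEvent` might dispatch on the v3 descriptor. This is not the bundled example plugin; it assumes the v3 headers shown above are on the include path, keeps no per-event state, and `sketchStartEvent` is just an illustrative name.

```C
#include <stdio.h>
#include "profiler.h"   // v3 types shown above (include path is an assumption)

static ncclResult_t sketchStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr) {
  (void)context;
  *eHandle = NULL;                             // this sketch keeps no per-event state
  switch (eDescr->type) {
    case ncclProfileColl:                      // host collective call
      printf("coll: count=%zu datatype=%s algo=%s\n",
             eDescr->coll.count, eDescr->coll.datatype, eDescr->coll.algo);
      break;
    case ncclProfileKernelCh:                  // per-channel kernel activity (new event type)
      printf("kernelCh: channel %u\n", (unsigned)eDescr->kernelCh.channelId);
      break;
    default:                                   // group/p2p/proxy/netPlugin events ignored here
      break;
  }
  return ncclSuccess;
}
```

A real plugin would instead allocate a handle per event and return it through `eHandle`, so that `stopEvent` and `recordEventState` can update it later.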
The network and profiler plugin can agree on the +network defined event definition using the plugin id in the event descriptor. The plugin identifier +is a 64-bit integer that has two parts: the 16 LSB are assigned to the plugin event version, the next +16 bits are assigned to the plugin type (NCCL\_PROFILER\_NET\_TYPE\_IB, ...). The rest of the bits are +unused and available for future extensions. + +A network IB plugin can use this infrastructure to define a QP event as: + +```C +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; +``` + +The network event infrastructure is network agnostic. A different network socket plugin can +use it to define a socket event as: + +```C +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; +``` + +The network plugin creates an event (descriptor) and passes it to the profiler callback, +along with the network type and version (plugin id). NCCL then creates a `ncclProfileNetPlugin` +event descriptor, attaches the network plugin defined event as external data, and calls +the profiler `startEvent` function. + +```C +ncclResult_t isend(..., void* phandle, ...) { + ... + int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + ncclProfilerNetIbDescr_v1_t eDescr = { }; + eDescr.type = ncclProfileQp; + eDescr.qp = { ... }; + ncclProfilerCallback(&eHandle, 0 /* start net event */, phandle, pluginId, &eDescr); + ... +} +``` + State transitions for the events described can also come with event attribute updates. For this reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported below. @@ -264,7 +356,7 @@ typedef union { struct { // attributes to update for ncclProfileProxyCtrl int appendedProxyOps; // number of appended proxy ops thus far } proxyCtrl; -} ncclProfilerEventStateArgs_v2_t; +} ncclProfilerEventStateArgs_v3_t; ``` The example profiler in `ext-profiler/example` contains details on how to capture and use the events above. 
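On the receiving side, the profiler can recover the originating network plugin and the event struct version from `netPlugin.id` before interpreting `netPlugin.data`. The sketch below is illustrative only, assuming the v3 and network event headers above; `sketchHandleNetPlugin` is a hypothetical helper a profiler would call from its `startEvent`.

```C
#include <stdio.h>
#include "profiler.h"   // also pulls in profiler_net.h, net_ib_v1.h, net_socket_v1.h (assumed include path)

static void sketchHandleNetPlugin(ncclProfilerEventDescr_v3_t* eDescr) {
  if (eDescr->type != ncclProfileNetPlugin) return;
  int64_t type = eDescr->netPlugin.id & NCCL_PROFILER_NET_TYPE_MASK;  // which plugin produced the event
  int64_t ver  = eDescr->netPlugin.id & NCCL_PROFILER_NET_VER_MASK;   // version of the attached struct
  if (type == NCCL_PROFILER_NET_TYPE_IB && ver == 1) {
    ncclProfilerNetIbDescr_v1_t* ib = (ncclProfilerNetIbDescr_v1_t*)eDescr->netPlugin.data;
    if (ib->type == ncclProfileQp)
      printf("IB QP %d: wr_id=%llu length=%zu\n", ib->qp.qpNum, (unsigned long long)ib->qp.wr_id, ib->qp.length);
  } else if (type == NCCL_PROFILER_NET_TYPE_SOCK && ver == 1) {
    ncclProfilerNetSockDescr_v1_t* sk = (ncclProfilerNetSockDescr_v1_t*)eDescr->netPlugin.data;
    if (sk->type == ncclProfileSocket)
      printf("socket fd %d: op=%d length=%zu\n", sk->sock.fd, sk->sock.op, sk->sock.length);
  }
}
```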
@@ -279,14 +371,22 @@ Group event +- Collective event | | | +- ProxyOp event - | | - | +- ProxyStep event + | | | + | | +- ProxyStep event + | | | + | | +- NetPlugin event + | | + | +- KernelCh event | +- Point-to-point event | +- ProxyOp event - | - +- ProxyStep event + | | + | +- ProxyStep event + | | + | +- NetPlugin event + | + +- KernelCh event ProxyCtrl event ``` @@ -316,3 +416,17 @@ thread originating the operation. To avoid the profiler instance in the remote p dereference a pointer from another address space the event descriptor includes the PID of the originator. The profiler plugin needs to check that the originator PID matches the local PID before dereferencing the parent event. + +# Known Limitations + +In intra-node communication, or whenever a rank does not have any network activity for which proxy events +are unavailable, the profiler will only report the enqueue events (e.g., ncclAllReduce). The events from +enqueue can be time stamped by the profiler (at start and stop) to reconstruct the execution time of the +collective. However, this time only represents the launch time of the collective and not the actual +execution time. To reconstruct the execution time more accurately proxy and kernel events are provided. + +Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress +thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If +the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of +accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is +delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events. diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile index ee8e0cf08..f5cc9f1d8 100644 --- a/ext-profiler/example/Makefile +++ b/ext-profiler/example/Makefile @@ -10,7 +10,7 @@ PLUGIN_SO := libnccl-profiler.so default: $(PLUGIN_SO) $(PLUGIN_SO): plugin.c event.c print_event.c - $(CC) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + $(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ clean: rm -f $(PLUGIN_SO) diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h index 1486a2248..0638f2df1 100644 --- a/ext-profiler/example/event.h +++ b/ext-profiler/example/event.h @@ -33,10 +33,42 @@ #define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES) #define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? 
NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES) - -#define MAX_COMM_CLIQUES (32 * 8) +#define MAX_EVENTS_PER_REQ (8) struct proxyOp; +struct proxyStep; + +struct netPlugin { + uint8_t type; + int pluginType; + int pluginVer; + uint8_t pluginEvent; + union { + struct { + int device; + int qpNum; + int opcode; + uint64_t wr_id; + size_t length; + } qp; + struct { + int fd; + int op; + size_t length; + } sock; + }; + double startTs; + double stopTs; + struct proxyStep* parent; +}; + +struct kernelCh { + uint8_t type; + uint8_t channelId; + struct taskEventBase* parent; + double startTs; + double stopTs; +}; struct proxyStep { uint8_t type; // type of event: network transfer @@ -46,6 +78,8 @@ struct proxyStep { double startTs; double stopTs; struct proxyOp* parent; + struct netPlugin net[MAX_EVENTS_PER_REQ]; + int nNetEvents; }; struct proxyOp { @@ -101,7 +135,6 @@ struct collective { void const* sendBuff; void* recvBuff; size_t count; - size_t trafficBytes; int root; const char* datatype; uint8_t nMaxChannels; @@ -111,6 +144,7 @@ struct collective { struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events int nProxyOps[MAX_CHANNELS]; + struct kernelCh kernel[MAX_CHANNELS]; }; struct p2p { @@ -121,6 +155,7 @@ struct p2p { const char* datatype; int peer; struct proxyOp op[MAX_CHANNELS]; + struct kernelCh kernel[MAX_CHANNELS]; }; struct group { diff --git a/ext-profiler/example/nccl/net_ib_v1.h b/ext-profiler/example/nccl/net_ib_v1.h new file mode 100644 index 000000000..f142de5f5 --- /dev/null +++ b/ext-profiler/example/nccl/net_ib_v1.h @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_IB_V1_H_ +#define NET_IB_V1_H_ + +#define NCCL_PROFILER_NET_IB_VER 1 + +enum { + ncclProfileQp = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int device; // network device id + uint64_t wr_id; // work request id + int opcode; // ibv opcode + int qpNum; // QP number + size_t length; // work request data length + } qp; + }; +} ncclProfilerNetIbDescr_v1_t; + +#endif diff --git a/ext-profiler/example/nccl/net_socket_v1.h b/ext-profiler/example/nccl/net_socket_v1.h new file mode 100644 index 000000000..0cb664f20 --- /dev/null +++ b/ext-profiler/example/nccl/net_socket_v1.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NET_SOCKET_V1_H_ +#define NET_SOCKET_V1_H_ + +#define NCCL_PROFILER_NET_SOCKET_VER 1 + +enum { + ncclProfileSocket = (1 << 0), +}; + +// The data structure version is encoded in the plugin identifier bitmask and +// passed to NCCL core through the profiler callback. 
NCCL copies the plugin +// identifier in the event descriptor before calling the profiler startEvent +// function. The profiler should inspect the plugin id to find out the source +// plugin as well as the version of the event struct +typedef struct { + uint8_t type; // event type (plugin defined) + union { + struct { + int fd; + int op; + size_t length; + } sock; + }; +} ncclProfilerNetSockDescr_v1_t; + +#endif diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h index 6680cfece..d02202d51 100644 --- a/ext-profiler/example/nccl/profiler.h +++ b/ext-profiler/example/nccl/profiler.h @@ -4,8 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ +#ifndef PROFILER_H_ +#define PROFILER_H_ #include #include @@ -13,7 +13,54 @@ #include "common.h" #include "err.h" +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events +}; + +typedef enum { + ncclProfilerProxyOpSendPosted, + ncclProfilerProxyOpSendRemFifoWait, + ncclProfilerProxyOpSendTransmitted, + ncclProfilerProxyOpSendDone, + ncclProfilerProxyOpRecvPosted, + ncclProfilerProxyOpRecvReceived, + ncclProfilerProxyOpRecvTransmitted, + ncclProfilerProxyOpRecvDone, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait, + ncclProfilerProxyStepSendWait, + ncclProfilerProxyStepRecvWait, + ncclProfilerProxyStepRecvFlushWait, + ncclProfilerProxyStepRecvGPUWait, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle, + ncclProfilerProxyCtrlActive, + ncclProfilerProxyCtrlSleep, + ncclProfilerProxyCtrlWakeup, + ncclProfilerProxyCtrlAppend, + ncclProfilerProxyCtrlAppendEnd, +} ncclProfilerEventState_t; + +typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; + +#include "profiler_v3.h" #include "profiler_v2.h" #include "profiler_v1.h" +#include "profiler_net.h" + +typedef ncclProfiler_v3_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; #endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_net.h b/ext-profiler/example/nccl/profiler_net.h new file mode 100644 index 000000000..2d087ca54 --- /dev/null +++ b/ext-profiler/example/nccl/profiler_net.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_NET_H_ +#define PROFILER_NET_H_ + +#define NCCL_PROFILER_NET_VER_BITS (16) +#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#include "net_ib_v1.h" +#include "net_socket_v1.h" + +#endif diff --git a/ext-profiler/example/nccl/profiler_v1.h b/ext-profiler/example/nccl/profiler_v1.h index 7d34bed57..e7d316d48 100644 --- a/ext-profiler/example/nccl/profiler_v1.h +++ b/ext-profiler/example/nccl/profiler_v1.h @@ -4,8 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_V1_H_ -#define NCCL_PROFILER_V1_H_ +#ifndef PROFILER_V1_H_ +#define PROFILER_V1_H_ #include @@ -59,8 +59,16 @@ typedef struct { }; } ncclProfilerEventDescr_v1_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_v1_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_v1_t; +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v1_t; typedef struct { const char* name; diff --git a/ext-profiler/example/nccl/profiler_v2.h b/ext-profiler/example/nccl/profiler_v2.h index aab4ccf86..4be600d52 100644 --- a/ext-profiler/example/nccl/profiler_v2.h +++ b/ext-profiler/example/nccl/profiler_v2.h @@ -4,20 +4,11 @@ * See LICENSE.txt for license information ************************************************************************/ -#ifndef NCCL_PROFILER_V2_H_ -#define NCCL_PROFILER_V2_H_ +#ifndef PROFILER_V2_H_ +#define PROFILER_V2_H_ #include -enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type -}; - typedef struct { uint8_t type; // event type descriptor: ncclProfileColl, ... 
void* parentObj; // pointer to the profiler parent object (for coll is the group) @@ -65,32 +56,6 @@ typedef struct { }; } ncclProfilerEventDescr_v2_t; -typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, - - /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, - - /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, -} ncclProfilerEventState_v2_t; - typedef union { struct { size_t transSize; @@ -138,9 +103,4 @@ typedef struct { ncclResult_t (*finalize)(void* context); } ncclProfiler_v2_t; -typedef ncclProfilerEventDescr_v2_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v2_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v2_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v2_t ncclProfiler_t; - #endif diff --git a/ext-profiler/example/nccl/profiler_v3.h b/ext-profiler/example/nccl/profiler_v3.h new file mode 100644 index 000000000..c1f1b919f --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v3.h @@ -0,0 +1,119 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V3_H_ +#define PROFILER_V3_H_ + +#include + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v3_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v3_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v3_t; + +typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t; +typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v3_t ncclProfiler_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index 64d5d8be1..08408dba7 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -58,6 +58,7 @@ __hidden double gettime(void) { static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static pid_t pid; +static int* eActivationMaskPtr; __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) { pthread_mutex_lock(&lock); @@ -65,7 +66,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) // first thread 
initializes event mask, environment and detach pool const char* str; str = getenv("NCCL_PROFILE_EVENT_MASK"); - __atomic_store_n(eActivationMask, str ? atoi(str) : defaultEActivationMask, __ATOMIC_RELAXED); + __atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED); str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE"); groupPoolSize = str ? atoi(str) : defaultGroupPoolSize; @@ -100,6 +101,9 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) } pthread_mutex_unlock(&lock); + // store pointer to activation mask globally + eActivationMaskPtr = eActivationMask; + // pre-allocate memory for event object pools in dedicated profiler context struct context* ctx = (struct context *)calloc(1, sizeof(*ctx)); ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool)); @@ -199,8 +203,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n if (base->type == ncclProfileColl) { struct collective* c = (struct collective *)base; // reset event proxyOps & proxySteps - memset(c->send, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); - memset(c->recv, 0, sizeof(struct proxyOp)*MAX_CHANNELS*MAX_OPS); memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS); // release collective events in the group and return them to the collective pool __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED); @@ -252,7 +254,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->count = eDescr->coll.count; event->root = eDescr->coll.root; event->datatype = eDescr->coll.datatype; - event->trafficBytes = eDescr->coll.trafficBytes; event->nMaxChannels = eDescr->coll.nMaxChannels; event->nWarps = eDescr->coll.nWarps; event->algo = eDescr->coll.algo; @@ -373,7 +374,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); } - } else if (eDescr->type == ncclProfileProxyStep) { + } else if (eDescr->type == ncclProfileProxyStep) { // the parent might be null if we run out of events struct proxyOp* parent = (struct proxyOp *)eDescr->parentObj; if (parent == NULL) return ncclSuccess; @@ -385,8 +386,77 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->isSend = parent->isSend; event->parent = parent; event->startTs = gettime() - startTime; + event->nNetEvents = 0; *eHandle = event; debugEvent(event, "ProxyStepStart"); + } else if (eDescr->type == ncclProfileKernelCh) { + struct taskEventBase* eventBase = (struct taskEventBase *)eDescr->parentObj; + if (eventBase == NULL) return ncclSuccess; + if (eventBase->type == ncclProfileColl) { + struct collective* parent = (struct collective *)eDescr->parentObj; + struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; + event->type = ncclProfileKernelCh; + event->channelId = eDescr->kernelCh.channelId; + event->parent = eventBase; + event->startTs = gettime() - startTime; + *eHandle = event; + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); + debugEvent(event, "KernelChStart"); + } else { // ncclProfileP2p + struct p2p* parent = (struct p2p *)eDescr->parentObj; + struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; + event->type = ncclProfileKernelCh; + event->channelId = eDescr->kernelCh.channelId; + event->parent = eventBase; + event->startTs = gettime() - startTime; + *eHandle = event; + __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); + 
debugEvent(event, "KernelChStart"); + } + } else if (eDescr->type == ncclProfileNetPlugin) { + struct proxyStep* parent = (struct proxyStep *)eDescr->parentObj; + if (parent == NULL) return ncclSuccess; + + int64_t pluginId = eDescr->netPlugin.id; + int64_t type = pluginId & NCCL_PROFILER_NET_TYPE_MASK; + int64_t ver = pluginId & NCCL_PROFILER_NET_VER_MASK; + if (type == NCCL_PROFILER_NET_TYPE_IB) { + if (ver == 1) { + ncclProfilerNetIbDescr_v1_t* descr = (ncclProfilerNetIbDescr_v1_t *)eDescr->netPlugin.data; + struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED); + event->type = ncclProfileNetPlugin; + event->pluginType = type; + event->pluginVer = ver; + if (descr->type == ncclProfileQp) { + event->pluginEvent = ncclProfileQp; + event->qp.device = descr->qp.device; + event->qp.wr_id = descr->qp.wr_id; + event->qp.opcode = descr->qp.opcode; + event->qp.qpNum = descr->qp.qpNum; + event->qp.length = descr->qp.length; + } + event->startTs = gettime() - startTime; + *eHandle = event; + debugEvent(event, "NetPluginStart"); + } + } else if (type == NCCL_PROFILER_NET_TYPE_SOCK) { + if (ver == 1) { + ncclProfilerNetSockDescr_v1_t* descr = (ncclProfilerNetSockDescr_v1_t *)eDescr->netPlugin.data; + struct netPlugin* event = parent->net + __atomic_fetch_add(&parent->nNetEvents, 1, __ATOMIC_RELAXED); + event->type = ncclProfileNetPlugin; + event->pluginType = type; + event->pluginVer = ver; + if (descr->type == ncclProfileSocket) { + event->pluginEvent = ncclProfileSocket; + event->sock.fd = descr->sock.fd; + event->sock.op = descr->sock.op; + event->sock.length = descr->sock.length; + } + event->startTs = gettime() - startTime; + *eHandle = event; + debugEvent(event, "NetPluginStart"); + } + } } return ncclSuccess; } @@ -445,6 +515,15 @@ void updateEvent(void* handle) { struct proxyCtrl* event = (struct proxyCtrl *)handle; event->stopTs = gettime() - startTime; debugEvent(event, "ProxyCtrlStop"); + } else if (type == ncclProfileKernelCh) { + struct kernelCh* event = (struct kernelCh *)handle; + event->stopTs = gettime() - startTime; + updateEvent(event->parent); + debugEvent(event, "KernelChStop"); + } else if (type == ncclProfileNetPlugin) { + struct netPlugin* event = (struct netPlugin *)handle; + event->stopTs = gettime() - startTime; + debugEvent(event, "NetPluginStop"); } } @@ -506,7 +585,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile return ncclSuccess; } -ncclProfiler_t ncclProfiler_v2 = { +ncclProfiler_t ncclProfiler_v3 = { "Example-profiler", exampleProfilerInit, exampleProfilerStartEvent, @@ -514,3 +593,17 @@ ncclProfiler_t ncclProfiler_v2 = { exampleProfilerRecordEventState, exampleProfilerFinalize, }; + +int exampleProfilerStart(int eActivationMask) { + if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { + __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED); + } + return ncclSuccess; +} + +int exampleProfilerStop(void) { + if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { + __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED); + } + return ncclSuccess; +} diff --git a/ext-profiler/example/plugin.h b/ext-profiler/example/plugin.h new file mode 100644 index 000000000..b4d07060a --- /dev/null +++ b/ext-profiler/example/plugin.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PLUGIN_H_ +#define PLUGIN_H_ + +int exampleProfilerStart(int eActivationMask); +int exampleProfilerStop(void); + +#endif diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c index f26a9eeb2..43f719045 100644 --- a/ext-profiler/example/print_event.c +++ b/ext-profiler/example/print_event.c @@ -72,7 +72,7 @@ __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) { } static __thread int proxyStepId; -__hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { +__hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) { if (event->isSend) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); @@ -84,8 +84,6 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "SendWait", proxyStepId++, getpid(), 1, event->stopTs); } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); @@ -93,6 +91,14 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { "RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step); + } +} + +__hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) { + if (event->isSend) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "SendWait", proxyStepId++, getpid(), 1, event->stopTs); + } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", @@ -106,6 +112,19 @@ __hidden void printProxyStepEvent(FILE* fh, struct proxyStep* event) { } } +static __thread int kernelId; +__hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) { + if (event->type != ncclProfileKernelCh) return; + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n", + "KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId); +} + +__hidden void 
printKernelChEventTrailer(FILE* fh, struct kernelCh* event) { + if (event->type != ncclProfileKernelCh) return; + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "KernelCh", kernelId, getpid(), 1, event->stopTs); +} + static __thread int proxyCtrlId; __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) { const char* str; @@ -127,6 +146,29 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) { str, proxyCtrlId++, getpid(), 1, event->stopTs); } +static __thread int ibQpId, sockId; +__hidden void printNetPluginEvent(FILE* fh, struct netPlugin* event) { + if (event->pluginType == NCCL_PROFILER_NET_TYPE_IB) { + if (event->pluginVer == 1) { + if (event->pluginEvent == ncclProfileQp) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"device\": %d, \"qp_num\": %d, \"opcode\": %d, \"wr_id\": %lu, \"size\": %lu}},\n", + "Qp", ibQpId, getpid(), 1, event->startTs, event->qp.device, event->qp.qpNum, event->qp.opcode, event->qp.wr_id, event->qp.length); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_IB\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "Qp", ibQpId++, getpid(), 1, event->stopTs); + } + } + } else if (event->pluginType == NCCL_PROFILER_NET_TYPE_SOCK) { + if (event->pluginVer == 1) { + if (event->pluginEvent == ncclProfileSocket) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"sock\": %d, \"op\": %d, \"size\": %lu}},\n", + "Sock", sockId, getpid(), 1, event->startTs, event->sock.fd, event->sock.op, event->sock.length); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET_SOCK\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "Sock", sockId++, getpid(), 1, event->stopTs); + } + } + } +} + //#define DEBUG_EVENTS void debugEvent(void* eHandle, const char* tag) { #ifdef DEBUG_EVENTS @@ -146,8 +188,10 @@ void debugEvent(void* eHandle, const char* tag) { fprintf(fh, "Collective event %p tag = %s {\n", event, tag); fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED)); fprintf(fh, " parent = %p\n", event->base.parent); - for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->send[i].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]); - for (int i = 0; i < MAX_CHANNELS; i++ ) if (event->recv[i].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]); + for (int j = 0; j < MAX_OPS; j++) { + for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]); + for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]); + } fprintf(fh, " startTs = %f\n", event->base.startTs); fprintf(fh, " stopTs = %f\n", event->base.stopTs); fprintf(fh, "}\n"); @@ -178,6 +222,20 @@ void debugEvent(void* eHandle, const char* tag) { fprintf(fh, " startTs = %f\n", event->startTs); fprintf(fh, " stopTs = %f\n", event->stopTs); fprintf(fh, "}\n"); + } else if (type == ncclProfileKernelCh) { + struct kernelCh* event = (struct kernelCh *)eHandle; + fprintf(fh, "KernelCh event %p tag = %s {\n", event, tag); + fprintf(fh, " parent = %p\n", event->parent); + fprintf(fh, " channel = %d\n", event->channelId); + } else if (type == ncclProfileNetPlugin) { + struct 
netPlugin* event = (struct netPlugin *)eHandle; + fprintf(fh, "NetPlugin event %p tag = %s {\n", event, tag); + fprintf(fh, " pluginType = %d\n", event->pluginType); + fprintf(fh, " pluginVer = %d\n", event->pluginVer); + fprintf(fh, " pluginEvent = %d\n", event->pluginEvent); + fprintf(fh, " startTs = %f\n", event->startTs); + fprintf(fh, " stopTs = %f\n", event->stopTs); + fprintf(fh, "}\n"); } fclose(fh); #endif @@ -200,17 +258,21 @@ void printEvent(FILE* fh, void* handle) { struct collective* c = (struct collective *)handle; printCollEventHeader(fh, c); for (int i = 0; i < MAX_CHANNELS; i++) { + printKernelChEventHeader(fh, &c->kernel[i]); for (int j = 0; j < c->nProxyOps[i]; j++) { printEvent(fh, &c->send[i][j]); printEvent(fh, &c->recv[i][j]); } + printKernelChEventTrailer(fh, &c->kernel[i]); } printCollEventTrailer(fh, c); } else if (type == ncclProfileP2p) { struct p2p* p = (struct p2p *)handle; printP2pEventHeader(fh, p); for (int i = 0; i < MAX_CHANNELS; i++) { + printKernelChEventHeader(fh, &p->kernel[i]); printEvent(fh, &p->op[i]); + printKernelChEventTrailer(fh, &p->kernel[i]); } printP2pEventTrailer(fh, p); } else if (type == ncclProfileProxyOp) { @@ -222,7 +284,11 @@ void printEvent(FILE* fh, void* handle) { printProxyOpEventTrailer(fh, p); } else if (type == ncclProfileProxyStep) { struct proxyStep* p = (struct proxyStep *)handle; - printProxyStepEvent(fh, p); + printProxyStepEventHeader(fh, p); + for (int q = 0; q < p->nNetEvents; q++) { + printNetPluginEvent(fh, &p->net[q]); + } + printProxyStepEventTrailer(fh, p); } else if (type == ncclProfileProxyCtrl) { struct proxyCtrl* p = (struct proxyCtrl *)handle; printProxyCtrlEvent(fh, p); From 0524aef7a0333bc79d885e392812519087eab71f Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Tue, 22 Apr 2025 13:50:40 -0700 Subject: [PATCH 08/21] NCCL 2.26.3-1 Minimize the performance impact of the device kernel profiling support when the profiler plugin is not loaded. Reduce the overheads of CUDA graph capturing, which increased in NCCL 2.26.2 for large graphs. Fix the exchange of enhanced connection establishment (ECE) options to address potential slowdowns on networks utilizing RoCE. Test if cuMem host allocations work and if not, disable them. Enabled by default since NCCL 2.24 if the CUDA driver version is at least 12.6, such allocations rely on NUMA support, which is by default not available under Docker. We recommend invoking Docker with "--cap-add SYS_NICE" to enable it. Fix an initialization error when running with NCCL_NET_GDR_C2C=1 on multiple MNNVL domains with non-uniform network configurations across nodes. Fix the printing of sub-seconds in the debug log when using a custom NCCL_DEBUG_TIMESTAMP_FORMAT setting. 
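
Regarding the sub-second timestamp item above, the arithmetic boils down to deriving the divisor applied to tv_nsec from a full second in nanoseconds; otherwise the printed fraction overflows its digit field. The following is a standalone sketch (not NCCL code; maxSubseconds and digits are illustrative values chosen for three millisecond digits):

```c
#include <stdio.h>
#include <time.h>

int main(void) {
  struct timespec ts;
  clock_gettime(CLOCK_REALTIME, &ts);          // tv_nsec is in [0, 999999999]
  unsigned long maxSubseconds = 1000;          // 10^digits, here 3 sub-second digits
  int digits = 3;
  // Correct divisor: 1000000000UL / 1000 = 1000000, so the result fits in 3 digits.
  printf("good: %0*ld\n", digits, (long)(ts.tv_nsec / (1000000000UL / maxSubseconds)));
  // Buggy divisor (pre-fix): 1000000UL / 1000 = 1000, the result can be up to 6 digits wide.
  printf("bad:  %0*ld\n", digits, (long)(ts.tv_nsec / (1000000UL / maxSubseconds)));
  return 0;
}
```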
--- makefiles/version.mk | 2 +- src/debug.cc | 6 +++- src/device/common.h | 57 +++++++++++++++++++++++++++++++++----- src/enqueue.cc | 27 ++++++++++++++---- src/graph/paths.cc | 6 ++-- src/include/device.h | 2 ++ src/include/graph.h | 2 +- src/include/profiler.h | 1 + src/include/proxy.h | 6 ++++ src/include/strongstream.h | 3 ++ src/misc/cudawrap.cc | 31 +++++++++++++++++++++ src/misc/strongstream.cc | 32 +++++++++++++++++++++ src/plugin/profiler.cc | 6 +++- src/proxy.cc | 20 +++++++------ src/transport/coll_net.cc | 2 +- src/transport/net.cc | 2 +- src/transport/net_ib.cc | 11 ++++---- 17 files changed, 182 insertions(+), 34 deletions(-) diff --git a/makefiles/version.mk b/makefiles/version.mk index df3ee5c68..93a71d49d 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 26 -NCCL_PATCH := 2 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/debug.cc b/src/debug.cc index 2eb8d7749..e2cc4f810 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -195,6 +195,10 @@ static void ncclDebugInit() { } } + // Replace underscore with spaces... it is hard to put spaces in command line parameters. + for (int i=0; ncclDebugTimestampFormat[i] != '\0'; ++i) { + if (ncclDebugTimestampFormat[i]=='_') ncclDebugTimestampFormat[i] = ' '; + } // Cache pid and hostname getHostName(hostname, 1024, '.'); @@ -301,7 +305,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file snprintf(localTimestampFormat + ncclDebugTimestampSubsecondsStart, ncclDebugTimestampSubsecondDigits+1, "%0*ld", ncclDebugTimestampSubsecondDigits, - ts.tv_nsec / (1000000UL/ncclDebugTimestampMaxSubseconds)); + ts.tv_nsec / (1000000000UL/ncclDebugTimestampMaxSubseconds)); strcpy( localTimestampFormat+ncclDebugTimestampSubsecondsStart+ncclDebugTimestampSubsecondDigits, ncclDebugTimestampFormat+ncclDebugTimestampSubsecondsStart+ncclDebugTimestampSubsecondDigits); } diff --git a/src/device/common.h b/src/device/common.h index 2dca70dc2..855db730f 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -54,6 +54,7 @@ struct ncclShmemData { int workSize; uint32_t workConsumed; uint64_t workCounter; + bool profilerEnabled; struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1]; @@ -291,6 +292,48 @@ struct RunWorkBatch { } }; +#define START 0 +#define STOP 1 +#define FINI 2 + +__device__ __forceinline__ bool profilerEnabled(void) { + // Check if any of the workItems in the batch is profiled. If so, there is an equivalent + // profiler ProxyOp waiting for the counter update in the host thread. If this check was + // done only for the first workItem the profiler counter for other workItems in the batch + // could never be updated, leaving the host thread spinning forever for the counter update + // and causing a hang. 
+ bool enabled = false; + for (int i = 0; i < ncclShmem.nWorks && !enabled; i++) { + if (ncclShmem.workType == ncclDevWorkTypeP2p) + enabled = ((struct ncclDevWorkP2p*)ncclShmem.workStorage)[i].profilerEnabled; + else + enabled = ((struct ncclDevWorkColl*)ncclShmem.workStorage)[i].profilerEnabled; + } + return enabled; +} + +__device__ __forceinline__ void profiler(int action) { + if (action == START) { + if (threadIdx.x == 0) { + // increment workCounter regardless of the profiler being active or not + ncclShmem.channel.workCounter += ncclShmem.nWorks; + if(!profilerEnabled()) return; + ncclShmem.comm.workStarted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + } + } else if (action == STOP) { + if (threadIdx.x == 0 && profilerEnabled()) { + ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + } + } else { // FINI + if (threadIdx.x == 0) { + // store the workCounter back to vidmem regardless of the profiler being active or not + ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; + if (!profilerEnabled()) return; + ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + } + } +} + template __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* args) { int tid = threadIdx.x; @@ -312,7 +355,10 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a } __syncthreads(); // publish ncclShmem.{args, channelId} /* set abort flag to 0 */ - if (tid == 0) ncclShmem.aborted = 0; + if (tid == 0) { + ncclShmem.aborted = 0; + ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter; + } // Use first 2 warps to load comm and channel, and remaining load work batch. 
switch (tid/WARP_SIZE) { @@ -348,7 +394,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a } while (ncclShmem.aborted == 0) { - if (tid == 0) ncclShmem.comm.workStarted[ncclShmem.channelId] = (ncclShmem.channel.workCounter += ncclShmem.nWorks); + profiler(START); if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) { SpecializedRunWorkBatch().run(); } else { @@ -358,7 +404,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a if (ncclShmem.nextBatchIx == -1) break; int batchIx = ncclShmem.nextBatchIx; __syncthreads(); - if (tid == 0) ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + profiler(STOP); loadWorkBatchToShmem(tid, tn, args, batchIx); __syncthreads(); @@ -367,10 +413,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; } } - if (tid == 0) { - ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; - ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; - } + profiler(FINI); } __global__ void ncclDevKernel_Generic(ncclDevKernelArgs4K NCCL_GRID_CONSTANT const args4K); diff --git a/src/enqueue.cc b/src/enqueue.cc index 5e0b213fc..4e8a211fc 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -288,6 +288,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { devWork.oneNode = (comm->nNodes == 1); devWork.isOneRPN = comm->isOneRPN; devWork.netRegUsed = devWork.regUsed = 0; + devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh); if (task->regBufType & NCCL_NET_REG_BUFFER) devWork.netRegUsed = 1; if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER)) @@ -445,6 +446,7 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool devWork.redOpArgIsPtr = task->opDev.scalarArgIsPtr; devWork.oneNode = (comm->nNodes == 1); devWork.netRegUsed = devWork.regUsed = 0; + devWork.profilerEnabled = ncclProfilerPluginLoaded() && (task->eActivationMask & ncclProfileKernelCh); if (task->regBufType & NCCL_NET_REG_BUFFER) devWork.netRegUsed = 1; if (task->regBufType & (NCCL_IPC_REG_BUFFER | NCCL_NVLS_REG_BUFFER)) @@ -557,7 +559,7 @@ static ncclResult_t scheduleCollTasksToPlan( proxyOp.task.coll = task; proxyOp.rank = comm->rank; proxyOp.eActivationMask = task->eActivationMask; - proxyOp.workCounter = ++comm->profiler.workCounter[c]; + proxyOp.incWorkCounter = true; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Set pattern to profiler to add a proxy profiler for kernel events NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOp)); @@ -681,7 +683,7 @@ static ncclResult_t scheduleCollTasksToPlan( proxyOp->ringAlgo->incRefCount(); } proxyOp->eActivationMask = task->eActivationMask; - proxyOp->workCounter = ++comm->profiler.workCounter[c]; + proxyOp->incWorkCounter = true; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. @@ -886,6 +888,7 @@ static ncclResult_t addP2pToPlan( work->recvRank = recvRank; work->recvAddr = recvAddr; work->recvBytes = recvBytes==-1 ? 
0 : recvBytes; + work->profilerEnabled = ncclProfilerPluginLoaded() && ((p2pTasks[0] ? p2pTasks[0] : p2pTasks[1])->eActivationMask & ncclProfileKernelCh); struct ncclProxyOp proxyOps[2] = {}; int nProxyOps = selfSend ? 0 : 2; @@ -910,6 +913,7 @@ static ncclResult_t addP2pToPlan( nChannelsMax = std::max(nChannels[0], nChannels[1]); for (int part=0; part < nChannelsMax; part++) { + int incWorkCounter = -1; int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); plan->channelMask |= uint64_t(1)< pair rather than individual p2p + if (proxyOps[dir].nsteps && incWorkCounter < 0) { + proxyOps[dir].incWorkCounter = true; + incWorkCounter = dir; + } + if (proxyOps[dir].nsteps != 0) { // Calculate the opCount after adding batch since then the batch count will // equal one plus the batch index this p2p settled in. proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; - proxyOps[dir].workCounter = comm->profiler.workCounter[channelId]+1; NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir])); } } - comm->profiler.workCounter[channelId] += (proxyOps[0].nsteps || proxyOps[1].nsteps) ? 1 : 0; } return ncclSuccess; @@ -1592,7 +1600,16 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream)); // deviceStream waits on userStream[0] NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); - CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + + // We know that deviceStream is strictly behind the launchStream because launchStream + // synced with it before kernel launch. This allows us to to see deviceStream waiting + // on launchStream as a fast-forward. When building CUDA graphs fast forwards should + // be handled specially so as not to create graphs with a blowup in the number of edges. 
+ // So we could do this: + // CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + // But instead we do: + NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, comm->sharedRes->scratchEvent)); + // Each userStream[i] waits on userStream[0] for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0)); diff --git a/src/graph/paths.cc b/src/graph/paths.cc index ace4476f6..998371247 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -367,7 +367,7 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { - INFO(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x", + TRACE(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x", info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId); *ret = 1; } @@ -473,7 +473,7 @@ ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *a NCCL_PARAM(NetForceFlush, "NET_FORCE_FLUSH", 0); // Determine whether we need to flush the GDR recv buffers -ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush) { +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int rank, int* flush) { *flush = 1; ncclNetProperties_t props; NCCLCHECK(comm->ncclNet->getProperties(netDev, &props)); @@ -488,7 +488,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* // flags would go through C2C. In that case, force a flush. int c, n; NCCLCHECK(ncclGetLocalCpu(system, g, &c)); - NCCLCHECK(ncclTopoIdToIndex(system, NET, netDev, &n)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); if (gpu->paths[NET][n].type <= PATH_PXB && gpu->paths[CPU][c].type == PATH_C2C) { *flush = 1; } diff --git a/src/include/device.h b/src/include/device.h index 0763a579a..f6ca51b75 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -221,6 +221,7 @@ struct alignas(16) ncclDevWorkP2p { uint8_t sendProtoLL:1, recvProtoLL:1; uint8_t sendNetReg:1, recvNetReg:1; uint8_t sendIpcReg:1, recvIpcReg:1; + uint8_t profilerEnabled:1; }; // Compute the subset of the data transfer corresponding to the given part index. 
@@ -259,6 +260,7 @@ struct alignas(16) ncclDevWorkColl { uint32_t channelLo:8, channelHi:8; uint32_t nWarps:8; uint32_t redOpArgIsPtr:1, regUsed:1, netRegUsed:1, oneNode:1, direct:2, isOneRPN:1; + uint32_t profilerEnabled:1; uint32_t root; void* recvbuff; void* sendbuff; diff --git a/src/include/graph.h b/src/include/graph.h index b779773da..a06556e37 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -43,7 +43,7 @@ enum ncclTopoGdrMode { ncclTopoGdrModeNum = 3 }; ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode); -ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int netDev, int rank, int* flush); +ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, int rank, int* flush); ncclResult_t ncclTopoIsGdrAvail(struct ncclTopoSystem* system, int rank, bool *avail); ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank2, int* net); int ncclPxnDisable(struct ncclComm* comm); diff --git a/src/include/profiler.h b/src/include/profiler.h index 8d4107963..bae0501bb 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -68,6 +68,7 @@ ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, n // Profiler utility functions ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op); bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op); +bool ncclProfilerPluginLoaded(void); // Profiler callback for network plugin ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); diff --git a/src/include/proxy.h b/src/include/proxy.h index 225acb22d..f90c80275 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -88,6 +88,12 @@ struct ncclProxyOp { struct ncclTaskP2p* p2p; } task; + // Profiler work counter increment flag. Set to 'true' if the profiler work counter for this channel needs increment. + // Always 'true' for collective operations. Grouped p2p operations are fused into one pair in the GPU kernel, + // meaning the GPU profiler code increments the work counter for the pair rather than the individual p2p. For this + // reason, the incWorkCounter flag is used to avoid incrementing the work counter twice in the host code. This is done + // by setting incWorkCounter to 'true' only for one of the p2ps in the pair during enqueue. + bool incWorkCounter; int eActivationMask; void* taskEventHandle; int rank; diff --git a/src/include/strongstream.h b/src/include/strongstream.h index c56d5aca5..393a1f0b1 100644 --- a/src/include/strongstream.h +++ b/src/include/strongstream.h @@ -102,6 +102,9 @@ ncclResult_t ncclStreamWaitStream( cudaStream_t a, cudaStream_t b, cudaEvent_t scratchEvent ); +// Like cudaStreamWaitEvent except `e` must be strictly ahead of everything in `s`. +ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cudaEvent_t e); + // Synchrnoization does not need the strong stream to be acquired. 
ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index e5fec1e46..64a84f556 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -4,6 +4,7 @@ * See LICENSE.txt for license information ************************************************************************/ +#include "alloc.h" #include "nccl.h" #include "debug.h" #include "param.h" @@ -67,6 +68,36 @@ int ncclCuMemHostEnable() { ncclCumemHostEnable = paramValue; else ncclCumemHostEnable = (cudaDriverVersion >= 12060) ? 1 : 0; + if (ncclCumemHostEnable) { + // Verify that host allocations actually work. Docker in particular is known to disable "get_mempolicy", + // causing such allocations to fail (this can be fixed by invoking Docker with "--cap-add SYS_NICE"). + int cudaDev; + CUdevice currentDev; + int cpuNumaNodeId = -1; + CUmemAllocationProp prop = {}; + size_t granularity = 0; + size_t size; + CUmemGenericAllocationHandle handle; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + CUCHECK(cuDeviceGetAttribute(&cpuNumaNodeId, CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID, currentDev)); + if (cpuNumaNodeId < 0) cpuNumaNodeId = 0; + prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.requestedHandleTypes = ncclCuMemHandleType; + prop.location.id = cpuNumaNodeId; + CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + size = 1; + ALIGN_SIZE(size, granularity); + if (CUPFN(cuMemCreate(&handle, size, &prop, 0)) != CUDA_SUCCESS) { + INFO(NCCL_INIT, "cuMem host allocations do not appear to be working; falling back to a /dev/shm/ based " + "implementation. This could be due to the container runtime disabling NUMA support. " + "To disable this warning, set NCCL_CUMEM_HOST_ENABLE=0"); + ncclCumemHostEnable = 0; + } else { + CUCHECK(cuMemRelease(handle)); + } + } } return ncclCumemHostEnable; error: diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index e6cce9807..7d957d432 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -328,6 +328,38 @@ ncclResult_t ncclStreamWaitStream(cudaStream_t a, cudaStream_t b, cudaEvent_t sc return ncclSuccess; } +ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cudaEvent_t e) { + if (g.graphId == ULLONG_MAX) { + CUDACHECK(cudaStreamWaitEvent(s, e, 0)); + } else { + cudaStream_t tmp; + CUDACHECK(cudaStreamCreateWithFlags(&tmp, cudaStreamNonBlocking)); + CUDACHECK(cudaStreamWaitEvent(tmp, e, 0)); + + cudaStreamCaptureStatus status; + cudaGraphNode_t const* nodes; + size_t count = 0; + cudaError_t res = cudaStreamGetCaptureInfo_v2(tmp, &status, nullptr, nullptr, &nodes, &count); + + #if CUDART_VERSION >= 12030 + if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. 
+ cudaGraphEdgeData const* edges; + CUDACHECK(cudaStreamGetCaptureInfo_v3(tmp, &status, nullptr, nullptr, &nodes, &edges, &count)); + CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(s, (cudaGraphNode_t*)nodes, edges, count, cudaStreamSetCaptureDependencies)); + } + #else + if (false) {} + #endif + else { + CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); + CUDACHECK(cudaStreamUpdateCaptureDependencies(s, (cudaGraphNode_t*)nodes, count, cudaStreamSetCaptureDependencies)); + } + + CUDACHECK(cudaStreamDestroy(tmp)); + } + return ncclSuccess; +} + ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss) { #if CUDART_VERSION >= 11030 CUDACHECK(cudaStreamWaitEvent(ss->liveStream, ss->serialEvent, 0)); diff --git a/src/plugin/profiler.cc b/src/plugin/profiler.cc index 023a704f4..18b9b5c4f 100644 --- a/src/plugin/profiler.cc +++ b/src/plugin/profiler.cc @@ -536,11 +536,15 @@ static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxy } bool ncclProfilerNeedsProxy(struct ncclComm* comm, struct ncclProxyOp* op) { - bool enabled = (__builtin_expect(ncclProfiler != NULL, 0) && (op->eActivationMask & ncclProfileKernelCh)); + bool enabled = ncclProfilerPluginLoaded() && (op->eActivationMask & ncclProfileKernelCh); if (enabled && !comm->profiler.initialized) (void)proxyProfilerConnect(comm, op); return enabled; } +bool ncclProfilerPluginLoaded(void) { + return (__builtin_expect(ncclProfiler != NULL, 0)); +} + ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) { if (__builtin_expect(ncclProfiler != NULL, 0)) { struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle; diff --git a/src/proxy.cc b/src/proxy.cc index 7e8021e47..c27d23455 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -9,7 +9,6 @@ #include "collectives.h" #include "socket.h" #include "shmutils.h" -#include "profiler.h" #define ENABLE_TIMER 0 #include "timer.h" #include "profiler.h" @@ -533,15 +532,21 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon return ncclSuccess; } +static void incWorkCounter(struct ncclComm* comm, struct ncclProxyOp* op) { + op->workCounter = (op->incWorkCounter) ? ++comm->profiler.workCounter[op->channelId] : comm->profiler.workCounter[op->channelId]; +} + static ncclResult_t SaveProxyProfiler(struct ncclComm* comm, struct ncclProxyOp* op, bool* justInquire) { struct ncclProxyConnector* proxyConn = (op->coll == ncclFuncRecv) ? &comm->profiler.recvProxyConn[op->channelId] : &comm->profiler.sendProxyConn[op->channelId]; - if (justInquire) *justInquire = true; - else { + if (justInquire) { + *justInquire = true; + if (!comm->planner.persistent) incWorkCounter(comm, op); + } else { op->sendbuff = (uint8_t *)comm->profiler.workStarted; op->recvbuff = (uint8_t *)comm->profiler.workCompleted; - NCCLCHECK(ncclLocalOpAppend(comm, proxyConn, op)); // Ensure that in graph capturing the proxy workCounter is incremented to keep up with kernel workCounter - op->workCounter += comm->profiler.workCounter[op->channelId]; + if (comm->planner.persistent) incWorkCounter(comm, op); + NCCLCHECK(ncclLocalOpAppend(comm, proxyConn, op)); } return ncclSuccess; } @@ -696,9 +701,8 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool NCCLCHECK(SaveProxy(comm, channel, op->pattern == ncclPatternSend ? 
proxySend : proxyRecv, op->root, op, 1, justInquire)); } break; case ncclPatternProfiler: { - if (ncclProfilerNeedsProxy(comm, op)) { - NCCLCHECK(SaveProxyProfiler(comm, op, justInquire)); - } + if (ncclProfilerNeedsProxy(comm, op)) NCCLCHECK(SaveProxyProfiler(comm, op, justInquire)); + else incWorkCounter(comm, op); } break; } return ncclSuccess; diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index c1ccfcaa8..84e1f84a0 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -192,7 +192,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->rank, netId, 0, &req.useGdr)); recv->conn.flags |= req.useGdr ? NCCL_DIRECT_NIC : 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, netId, req.netDev, myInfo->rank, &req.needFlush)); recv->proxyConn.tpLocalRank = comm->topParentLocalRanks[comm->localRank]; NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_COLLNET, 0, myInfo->rank, &recv->proxyConn)); diff --git a/src/transport/net.cc b/src/transport/net.cc index 40d334fa7..61b15ce20 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -250,7 +250,7 @@ static ncclResult_t recvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph if (!req.useGdr && connIndex == 0) comm->useGdr = 0; // Determine whether we need to flush the GDR buffer on recv or not - if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, req.netDev, myInfo->rank, &req.needFlush)); + if (req.useGdr) NCCLCHECK(ncclTopoNeedFlush(comm, netId, req.netDev, myInfo->rank, &req.needFlush)); // We don't support PXN on receive yet NCCLCHECK(ncclProxyConnect(comm, TRANSPORT_NET, 0, myInfo->rank, &recv->proxyConn)); diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index bfff6e555..c049531f8 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -1641,17 +1641,18 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle // However, this has been confirmed to be intentional. 
// coverity[copy_paste_error] NCCLCHECKGOTO(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); - - // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) - // Store this in our own qpInfo for returning to the requestor - if (meta.qpInfo[q].ece_supported) - NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); } else { meta.qpInfo[q].ece_supported = 0; } NCCLCHECKGOTO(ncclIbRtrQp(qp->qp, &rCommDev->base.gidInfo, remMeta.qpInfo[q].qpn, remDevInfo, true, remMeta.tc, remMeta.sl), ret, fail); NCCLCHECKGOTO(ncclIbRtsQp(qp->qp), ret, fail); + + // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) + // Store this in our own qpInfo for returning to the requestor + if (remMeta.qpInfo[q].ece_supported && meta.qpInfo[q].ece_supported) { + NCCLCHECKGOTO(wrap_ibv_query_ece(qp->qp, &meta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported), ret, fail); + } } rComm->flushEnabled = ((ncclIbGdrSupport() == ncclSuccess || ncclIbDmaBufSupport(lComm->dev) == ncclSuccess) From 3000e3c797b4b236221188c07aa09c1f3a0170d4 Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Tue, 22 Apr 2025 13:55:13 -0700 Subject: [PATCH 09/21] NCCL 2.26.5-1 Work around a potential hang in alltoall-like communication patterns on MNNVL systems at a scale of over 80 ranks. --- makefiles/version.mk | 2 +- src/init.cc | 22 ++++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/makefiles/version.mk b/makefiles/version.mk index 93a71d49d..c5ed6ab70 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 26 -NCCL_PATCH := 3 +NCCL_PATCH := 5 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/init.cc b/src/init.cc index 46b02e65e..47d7fa3c6 100644 --- a/src/init.cc +++ b/src/init.cc @@ -271,7 +271,7 @@ NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); #define NCCL_WORK_FIFO_BYTES_DEFAULT (1<<20) -NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", NCCL_WORK_FIFO_BYTES_DEFAULT); +NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", -1); NCCL_PARAM(WorkArgsBytes, "WORK_ARGS_BYTES", INT64_MAX); enum ncclLaunchMode ncclParamLaunchMode; @@ -458,12 +458,22 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { if (ccEnable) { comm->workFifoBytes = 0; } else { - comm->workFifoBytes = ncclParamWorkFifoBytes(); - if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { - WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes); - comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + int64_t workFifoBytesParam = ncclParamWorkFifoBytes(); + if (workFifoBytesParam == -1) { + if (comm->MNNVL && (comm->compCap >= 100)) { + // WAR: Disable work fifo for Blackwell all2all hang issue on MNNVL + INFO(NCCL_INIT, "Disabling work fifo"); + comm->workFifoBytes = 0; + } else { + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + } + } else { + if (0 != (workFifoBytesParam & (workFifoBytesParam-1))) { + WARN("NCCL_WORK_FIFO_BYTES=%ld is being ignored because it is not a power of 2.", workFifoBytesParam); + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; + } + comm->workFifoBytes = std::min(workFifoBytesParam, 1ul<<30); } - comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30); } if (comm->rank == 0) { From 
8171af656bb3c47c8fc60b7cd49ae0c7494de664 Mon Sep 17 00:00:00 2001 From: Giuseppe Congiu Date: Mon, 19 May 2025 09:15:40 -0700 Subject: [PATCH 10/21] NCCL 2.26.6-1 Fix profiler_v2 compatibility layer * Removing trafficBytes in profiler_v3 breaks casting to ncclProfilerEventDescr_v2_t in the compatibility layer for profiler_v2 interface. This patch fixes the issue by making the conversion between the two descriptors explicit. --- makefiles/version.mk | 2 +- src/plugin/profiler/profiler_v2.cc | 50 +++++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/makefiles/version.mk b/makefiles/version.mk index c5ed6ab70..5c0b0de9a 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 26 -NCCL_PATCH := 5 +NCCL_PATCH := 6 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc index 3d00008a6..52907d6e3 100644 --- a/src/plugin/profiler/profiler_v2.cc +++ b/src/plugin/profiler/profiler_v2.cc @@ -12,11 +12,53 @@ static ncclProfiler_t ncclProfiler; static ncclProfiler_v2_t* ncclProfiler_v2; static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { - if (eDescr->type == ncclProfileKernelCh || eDescr->type == ncclProfileNetPlugin) { - *eHandle = NULL; - return ncclSuccess; + *eHandle = nullptr; + ncclProfilerEventDescr_v2_t eDescr_v2 = { }; + eDescr_v2.type = eDescr->type; + eDescr_v2.parentObj = eDescr->parentObj; + eDescr_v2.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v2.coll.name = eDescr->coll.name; + eDescr_v2.coll.commHash = eDescr->coll.commHash; + eDescr_v2.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v2.coll.func = eDescr->coll.func; + eDescr_v2.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v2.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v2.coll.count = eDescr->coll.count; + eDescr_v2.coll.root = eDescr->coll.root; + eDescr_v2.coll.datatype = eDescr->coll.datatype; + eDescr_v2.coll.trafficBytes = 0; // removed in v3 + eDescr_v2.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v2.coll.nWarps = eDescr->coll.nWarps; + eDescr_v2.coll.algo = eDescr->coll.algo; + eDescr_v2.coll.proto = eDescr->coll.proto; + } break; + case ncclProfileP2p: { + eDescr_v2.p2p.name = eDescr->p2p.name; + eDescr_v2.p2p.commHash = eDescr->p2p.commHash; + eDescr_v2.p2p.func = eDescr->p2p.func; + eDescr_v2.p2p.buff = eDescr->p2p.buff; + eDescr_v2.p2p.count = eDescr->p2p.count; + eDescr_v2.p2p.datatype = eDescr->p2p.datatype; + eDescr_v2.p2p.peer = eDescr->p2p.peer; + } break; + case ncclProfileProxyOp: { + eDescr_v2.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v2.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v2.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v2.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v2.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v2.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v2.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + default: return ncclSuccess; } - return ncclProfiler_v2->startEvent(context, eHandle, (ncclProfilerEventDescr_v2_t *)eDescr); + return ncclProfiler_v2->startEvent(context, eHandle, &eDescr_v2); } static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { From 
72d2432094d6ae36abd6e511c3a16a2d052dbf94 Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Thu, 29 May 2025 20:56:40 -0700 Subject: [PATCH 11/21] NCCL 2.27.3-1 Symmetric memory API and symmetric kernels * Redesign from the ground up, enabling major latency and bandwidth improvements. * Add new API calls to register user-allocated memory among communicator ranks into a NCCL window: ncclCommWindowRegister() and ncclCommWindowDeregister(). The calls currently support symmetric registration for P2P and NVLS, and require VMM memory buffers (i.e., CUMEM must be operational). * Implement specialized kernels taking advantage of symmetrically registered memory, with performance gains expected particularly for small to medium message sizes. * The kernels support 32 bit floating point types and smaller, and sum as the reduction operator, with no more than one collective operation per group. * Floating point summation is always done in fp32 accumulators (with the exception of fp8 on NVLS, where it uses fp16 inside the switch). Thus, the accuracy with fp8 and fp16 data types should be much improved. * This initial implementation supports non-network communicators only (P2P and NVLS transports). * To explore this functionality users need to use the new memory registration API calls with the NCCL_WIN_COLL_SYMMETRIC flag and all ranks of a communicator must pass buffers at the same offset in the same registration when invoking a collective NCCL operation. Add support for DGX Spark. Add support for DirectNIC (CX8) to the internal IB plugin. Add a new ncclCommShrink() API call * It is a non-collective call similar to ncclCommSplit(), which makes it possible to exclude some (possibly unresponsive) ranks from the parent communicator. Add support for loading multiple network plugins * This enables the creation of generic containers that can work across a range of providers. * Allow NCCL_NET_PLUGIN to accept a comma-separated list of plugins to load. NVLink SHARP (NVLS) improvements * Implement NVLS+IB SHARP support for AllGather and ReduceScatter with user buffer registration. This improves performance and reduces the number of CTAs needed to achieve peak bandwidth. * Gracefully fall back by default to other transports if NVLS initialization fails (the old behavior of returning an error code from a NCCL call can be preserved by setting NCCL_NVLS_ENABLE=1). * Decrease the NVLS channel count to 24 on Blackwell systems with multiple NVLink domains per communicator. * Enable fine-tuning of NCCL behavior per communicator using new "ncclConfig_t" members "collnetEnable", "CTAPolicy", and "nvlsCTAs". Profiler improvements * Extend the init function by adding communicator name, comm id (hash), rank, number of ranks, number of nodes, and the NCCL log function to the argument list. This makes the name and the comm id available to all events in the communicator without explicitly passing them to each individual event. Add the communicator id and rank to the profiler trace filename. Now, the communicator name can be set via a new "ncclConfig_t" member "commName". * Improve the accuracy of the GPU kernel events by providing GPU-generated timestamps for the start and stop of every NCCL operation. * Harmonize proxy events, removing overlaps between ProxyOp and ProxyStep states. * Add support for network-defined event updates (through "recordEventState"). * Report the correct number of channels used by every collective/p2p operation (used to be set to nMaxChannels for collectives and absent for p2ps). 
* Fix the logic on proxyCtrl Idle/Active events (Issue #1162). * Fix an issue where the network proxy profiler could lose track of an event identifier (Issue #1682). * Improve the backward compatibility with plugins older than v4. * Ensure that the work counters are 0-initialized. * Fix a potential race condition in the network profiler that could result in an event being linked to a wrong parent. MNNVL improvements * Increase to 16 the number of NICs used to communicate between MNNVL domains on GB200 systems, to optimize the performance of collective operations. * Add support for more complex MNNVL topologies with up to 32 NICs per node. * If the MNNVL fabric initialization was unsuccessful, NCCL will now fail by default, so as to avoid inadvertently falling back to a potentially much slower network transport. Such failures are typically due to misconfigured IMEX support on the system. To continue without MNNVL, restart the job with NCCL_MNNVL_ENABLE=0. * Fix a potential hang in alltoall-like communication patterns at a scale of over 80 ranks. * Make NCCL_P2P_DISABLE=1 imply NCCL_MNNVL_ENABLE=0 (so the latter no longer needs to be specified on MNNVL systems). * Fix an initialization failure when NCCL_TOPO_FILE is used on MNNVL systems. * Fix the graph search to exclude non-local NICs. * Fix the SHM transport to use fabric handles on MNNVL systems. NIC Fusion improvements * Disable the creation of fused NICs for physical devices that haven't been merged. * Flatten multiple ports to a single PCI device within the internal IB plugin and reparent dual-port NICs under the first PCI parent. If the parent is not a PCI switch, PCI devices for fused NICs won't be duplicated. * Route traffic on GB200-CX8 systems through DirectNIC, not the host interface. Improve support for platforms with C2C connectivity (e.g., GB200) * Enable GPUDirect RDMA for the NICs by default. * Add support for P2C (PXN over C2C) and the LL128 protocol. Extend NCCL fault tolerance in multithreaded scenarios * Support the creation of multiple nonblocking communicators within a single group and polling in parallel for the completion using multiple threads (one per communicator). Enable ncclImplicitOrderLaunch for CUDA 12.9+ * This can potentially speed up NCCL_IMPLICIT_LAUNCH_ORDER. Improve the netSocket transport latency and control * Provide finer control over the size of the socket send/receive buffers, the task size, and the number of sockets that a single peer can open. * Add support for the inlining of small messages behind the header when using multiple sockets per connection. Improve the readability of the CPU affinity in the debug output * Print it as a range string rather than a bitmask. Fix a potential race condition in graph execution * A contention could arise when mixing graph and non-graph execution. Improve PXN connection code * Avoid duplicate and unused connections. RAS fixes * Fix a memory corruption at job termination time in case of a previously failed initialization of a RAS socket connection. * Fix a race condition leading to a crash when generating a RAS report during communicator initialization (Issues #1669, #1718). * Fix a potential race condition when gathering data for a RAS status report. Fix a potential memory corruption in ncclCommSplit() * Memory could get corrupted when resource sharing was in use and the size of the NVLink domain in the new communicator was smaller than in the old one. Fix asynchronous graph upload * Fix a small memory leak. * Fix oversynchronization.
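Returning to the symmetric memory API introduced at the top of these notes: the sketch below illustrates the intended registration flow. It is a minimal, hedged example, not part of this patch; the exact ncclCommWindowRegister()/ncclCommWindowDeregister() prototypes and the ncclWindow_t handle name are assumptions based on the description above, so consult nccl.h from this release for the shipped signatures. The buffer is allocated with ncclMemAlloc() so the CUMEM/VMM requirement is met, and every rank must pass its buffer at the same offset of the same registration when calling the collective.
```
/* Hedged sketch of symmetric window registration (assumed prototypes,
 * see the release notes above). Not taken from this patch. */
#include <nccl.h>
#include <cuda_runtime.h>

#define CHECK(cmd) do { ncclResult_t r = (cmd); \
  if (r != ncclSuccess) return r; } while (0)

ncclResult_t symmetricAllReduce(ncclComm_t comm, size_t count, cudaStream_t stream) {
  float* buf = NULL;
  ncclWindow_t win;                 /* window handle type name assumed */
  size_t bytes = count * sizeof(float);

  /* VMM/CUMEM-backed allocation, required for symmetric registration. */
  CHECK(ncclMemAlloc((void**)&buf, bytes));

  /* Collective registration with the NCCL_WIN_COLL_SYMMETRIC flag named
   * in the notes; all ranks register a buffer of the same size. */
  CHECK(ncclCommWindowRegister(comm, buf, bytes, &win, NCCL_WIN_COLL_SYMMETRIC));

  /* In-place fp32 sum; the symmetric kernels accumulate in fp32. */
  CHECK(ncclAllReduce(buf, buf, count, ncclFloat, ncclSum, comm, stream));
  cudaStreamSynchronize(stream);

  CHECK(ncclCommWindowDeregister(comm, win));
  CHECK(ncclMemFree(buf));
  return ncclSuccess;
}
```
As stated above, this initial implementation covers non-network communicators only (P2P and NVLS transports), so the symmetric path applies to NVLink-connected communicators.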
Add a check for out-of-memory conditions in ncclMemAlloc() Clean up the NCCL socket code * accept() will retry also if just reading the magic failed (Issue #1613). * connect() will retry also if poll() did not return a POLLOUT event (Issue #1618). * Add error checking in a few instances (Issue #1539). * Fix the loop condition in ncclFindInterfaceMatchSubnet() (Issue #1574). * Clean up the debug output, downgrading WARN messages to INFO in non-critical cases, and printing the peer's address where relevant. Switch NCCL_DEBUG_FILE to line buffering * This should help avoid mixed-up partial output lines in multithreaded cases. Other minor fixes * Improve the checks for buffer overflows in the graph code (Issue #1585). * Extend logging and state clearing to all four events in the internal IB plugin (Issue #1650). * Fix the error path in case IB communication is not ready (Issue #1489). * Add ECE logging for IB fabric. * Fix various minor issues in the graph module (Issue #1635). * Clean up the debug output in the graph code, downgrading WARN messages to INFO in non-critical cases. * Add a missing argument to a directSend() call (Issue #1628). * Remove duplicate code in sendProxySetup() (Issue #1420). * Fix the order of arguments of cudaDeviceCanAccessPeer() (Issue #1507). * Fix compiler warnings with GCC 14. * Fix a typo in a comment (Issue #1236). --- ext-net/example/nccl/common.h | 6 + ext-net/example/nccl/net.h | 4 +- ext-profiler/README.md | 127 +++--- ext-profiler/example/event.h | 49 +-- ext-profiler/example/nccl/profiler.h | 54 ++- ext-profiler/example/nccl/profiler_v3.h | 5 - ext-profiler/example/nccl/profiler_v4.h | 123 ++++++ ext-profiler/example/plugin.c | 82 +++- ext-profiler/example/print_event.c | 77 ++-- ext-profiler/example/print_event.h | 3 + makefiles/common.mk | 31 +- makefiles/version.mk | 4 +- src/Makefile | 2 +- src/allocator.cc | 196 +++++++++ src/bootstrap.cc | 9 +- src/channel.cc | 2 +- src/debug.cc | 58 ++- src/device/Makefile | 45 ++- src/device/all_gather.h | 260 +++++++++--- src/device/all_reduce.h | 2 +- src/device/common.h | 68 ++-- src/device/generate.py | 2 +- src/device/op128.h | 99 ++++- src/device/prims_simple.h | 18 +- src/device/reduce_kernel.h | 445 ++++++++++++++------ src/device/reduce_scatter.h | 246 ++++++++--- src/device/symmetric/all_gather.cuh | 367 +++++++++++++++++ src/device/symmetric/all_reduce.cuh | 432 ++++++++++++++++++++ src/device/symmetric/generate.py | 294 ++++++++++++++ src/device/symmetric/kernel.cuh | 27 ++ src/device/symmetric/primitives.cuh | 420 +++++++++++++++++++ src/device/symmetric/reduce_scatter.cuh | 387 ++++++++++++++++++ src/enqueue.cc | 392 +++++++++++------- src/graph/connect.cc | 10 +- src/graph/paths.cc | 98 +++-- src/graph/search.cc | 43 +- src/graph/topo.cc | 134 +++--- src/graph/topo.h | 30 +- src/graph/tuning.cc | 92 +++-- src/graph/xml.cc | 37 +- src/graph/xml.h | 11 +- src/group.cc | 356 ++++++++++------ src/include/allocator.h | 13 + src/include/bitops.h | 186 +++++++-- src/include/comm.h | 59 ++- src/include/cpuset.h | 25 ++ src/include/cudawrap.h | 70 ++-- src/include/device.h | 49 ++- src/include/graph.h | 6 +- src/include/group.h | 68 ++-- src/include/mlx5/mlx5dvcore.h | 18 + src/include/mlx5/mlx5dvsymbols.h | 23 ++ src/include/mlx5/mlx5dvwrap.h | 41 ++ src/include/nccl_common.h | 14 +- src/include/net.h | 2 - src/include/nvtx.h | 3 +- src/include/nvtx_payload_schemas.h | 10 + src/include/plugin/nccl_net.h | 7 +- src/include/plugin/nccl_profiler.h | 54 ++- src/include/plugin/profiler/profiler_v4.h | 123 ++++++ 
src/include/profiler.h | 13 +- src/include/proxy.h | 12 +- src/include/register.h | 24 +- src/include/register_inline.h | 33 ++ src/include/socket.h | 6 +- src/include/symmetric.h | 90 +++++ src/include/transport.h | 18 +- src/include/utils.h | 6 + src/init.cc | 470 ++++++++++++---------- src/misc/cudawrap.cc | 145 +++---- src/misc/ibvwrap.cc | 4 + src/misc/mlx5dvsymbols.cc | 74 ++++ src/misc/mlx5dvwrap.cc | 75 ++++ src/misc/socket.cc | 168 +++++--- src/misc/strongstream.cc | 34 ++ src/mnnvl.cc | 9 +- src/nccl.h.in | 41 +- src/plugin/net.cc | 372 +++++++++-------- src/plugin/plugin_open.cc | 65 +-- src/plugin/profiler.cc | 91 ++--- src/plugin/profiler/profiler_v1.cc | 40 +- src/plugin/profiler/profiler_v2.cc | 32 +- src/plugin/profiler/profiler_v3.cc | 93 ++++- src/plugin/profiler/profiler_v4.cc | 21 + src/proxy.cc | 17 +- src/ras/collectives.cc | 14 +- src/ras/rasnet.cc | 30 +- src/register/coll_reg.cc | 43 +- src/register/register.cc | 140 ++++++- src/symmetric.cc | 296 ++++++++++++++ src/transport.cc | 11 +- src/transport/coll_net.cc | 16 +- src/transport/net.cc | 52 +-- src/transport/net_ib.cc | 300 +++++++++----- src/transport/net_socket.cc | 97 +++-- src/transport/nvls.cc | 260 +++++++++--- src/transport/p2p.cc | 93 ++++- src/transport/profiler.cc | 13 +- src/transport/shm.cc | 2 +- 99 files changed, 7216 insertions(+), 2022 deletions(-) create mode 100644 ext-profiler/example/nccl/profiler_v4.h create mode 100644 src/allocator.cc create mode 100644 src/device/symmetric/all_gather.cuh create mode 100644 src/device/symmetric/all_reduce.cuh create mode 100755 src/device/symmetric/generate.py create mode 100644 src/device/symmetric/kernel.cuh create mode 100644 src/device/symmetric/primitives.cuh create mode 100644 src/device/symmetric/reduce_scatter.cuh create mode 100644 src/include/allocator.h create mode 100644 src/include/mlx5/mlx5dvcore.h create mode 100644 src/include/mlx5/mlx5dvsymbols.h create mode 100644 src/include/mlx5/mlx5dvwrap.h create mode 100644 src/include/plugin/profiler/profiler_v4.h create mode 100644 src/include/register_inline.h create mode 100644 src/include/symmetric.h create mode 100644 src/misc/mlx5dvsymbols.cc create mode 100644 src/misc/mlx5dvwrap.cc create mode 100644 src/plugin/profiler/profiler_v4.cc create mode 100644 src/symmetric.cc diff --git a/ext-net/example/nccl/common.h b/ext-net/example/nccl/common.h index 912925225..5aec2f7bb 100644 --- a/ext-net/example/nccl/common.h +++ b/ext-net/example/nccl/common.h @@ -7,9 +7,15 @@ #ifndef COMMON_H_ #define COMMON_H_ +#include + typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); +enum { ncclProfilerNetEventStart = 0, ncclProfilerNetEventStop, ncclProfilerNetEventUpdate, ncclProfilerNetEventUpdateAndStop }; + +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); + #endif diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 85ea79ef7..4cc66915b 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -8,9 +8,9 @@ #include #include 
-#include "common.h" #include "err.h" #include "net_device.h" +#include "common.h" #define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) //1TB @@ -23,8 +23,6 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 -typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* phandle, int64_t pluginId, void* extData); - #include "net_v10.h" #include "net_v9.h" #include "net_v8.h" diff --git a/ext-profiler/README.md b/ext-profiler/README.md index 2a4018c07..27bd4e25c 100644 --- a/ext-profiler/README.md +++ b/ext-profiler/README.md @@ -49,9 +49,9 @@ of newer ones. The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v3) +# API (v4) -Below is the main `ncclProfiler_v3` struct. Each function is explained in later sections. +Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections. ``` typedef struct { @@ -60,9 +60,15 @@ typedef struct { // init - initialize the profiler plugin // Input // - context : opaque profiler context object for separating profiler behavior across comms + // - commName : user assigned communicator name + // - commHash : communicator id + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function // Output // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask); + ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset // Input @@ -70,7 +76,7 @@ typedef struct { // - eDescr : pointer to ncclProfilerEventDescr_t object // Output // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); // stopEvent - stop/finalize an event inside and event set // Input @@ -82,13 +88,13 @@ typedef struct { // - eHandle : handle to event object created through startEvent // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs); // finalize - finalize the profiler plugin // Input // - context: opaque profiler context object ncclResult_t (*finalize)(void* context); -} ncclProfiler_v3_t; +} ncclProfiler_v4_t; ``` ## Error codes @@ -147,8 +153,6 @@ typedef struct { int rank; // rank that generated the event union { struct { // collective events metadata - const char* name; // string containing name of the communicator - uint64_t commHash; // unique hash/id for the communicator uint64_t seqNumber; // sequence number of this collective operation in the communicator const char* func; // string containing name of the collective void const* sendBuff; // address of send buffer @@ -156,20 +160,19 @@ typedef struct { size_t count; // data 
count int root; // root rank const char* datatype; // string containing the name of the datatype - uint8_t nMaxChannels; // max number of channels for this collective + uint8_t nChannels; // number of channels for this collective uint8_t nWarps; // number of GPU warps for this collective const char* algo; // string containing name of the algorithm for this collective const char* proto; // string containing name of the protocol for this collective } coll; struct { // point-to-point events metadata - const char* name; - uint64_t commHash; const char* func; void* buff; const char* datatype; size_t count; int peer; // peer rank for this point-to-point + uint8_t nChannels; // number of channels for this p2p } p2p; struct { // proxyOp events metadata @@ -178,7 +181,7 @@ typedef struct { int peer; // peer rank int nSteps; // number of network transfers/steps required by the `ncclProxyOp` int chunkSize; // chunk size for this `ncclProxyOp` - int isSend; // set to 1 for sends and 0 for recvs + int isSend; // type of network operation } proxyOp; struct { // proxyStep events metadata @@ -187,6 +190,7 @@ typedef struct { struct { uint8_t channelId; // id of the channel used by the kernel + uint64_t ptimer; // kernel supplied timestamp } kernelCh; struct { @@ -194,7 +198,7 @@ typedef struct { void* data; // pointer to network plugin defined event } netPlugin; }; -} ncclProfilerEventDescr_v3_t; +} ncclProfilerEventDescr_v4_t; ``` NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, @@ -212,45 +216,57 @@ handle after `eventStop` is undefined behavior. Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, cannot be updated through calls to `recordEventState`. -`ncclProfileProxyOp`, `ncclProfileProxyStep` and `ncclProfileProxyCtrl` can be updated through -calls to `recordEventState`. +`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and +`ncclProfileProxyCtrl` can be updated through calls to `recordEventState`. -The state of proxy generated events can be updated, along with event attributes, using -`recordEventState`. These events can go through several states during their lifecycle. -The list of supported states for the proxy-defined events is reported below. +The state of these events can be updated, along with event attributes, using `recordEventState`. +These events can go through several states during their lifecycle. + +The list of supported states for the updatable events is reported below. 
``` typedef enum { // ncclProfileProxyOp event states - ncclProfilerProxyOpSendPosted, // state marks the posting of send buffer to GPU for given network transfer/step - ncclProfilerProxyOpSendRemFifoWait, // state marks the waiting of CTS credits from peer rank - ncclProfilerProxyOpSendTransmitted, // state marks the sending of network transfer/step to peer rank - ncclProfilerProxyOpSendDone, // state marks the ending of network transfer/step - ncclProfilerProxyOpRecvPosted, // state marks the posting of recv to network for given network transfer/step - ncclProfilerProxyOpRecvReceived, // state marks the recving of network transfer/step from peer rank - ncclProfilerProxyOpRecvTransmitted, // state marks the ending of the network transfer/step - ncclProfilerProxyOpRecvDone, // state marks the consuming of data from GPU + ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 + ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 + ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 + ncclProfilerProxyOpSendDone = 3, // deprecated in v4 + ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 + ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 + ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 + ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 + ncclProfilerProxyOpInProgress_v4 = 19,// state marks transition of proxy op to progress // ncclProfileProxyStep event states - ncclProfilerProxyStepSendGPUWait, // state marks the waiting of send data from GPU for given network transfer/step - ncclProfilerProxyStepSendWait, // state marks the waiting of send data from network for given network transfer/step - ncclProfilerProxyStepRecvWait, // state marks the waiting of recv data from network for given network transfer/step - ncclProfilerProxyStepRecvFlushWait, // state marks the waiting of recv data flush to GPU for given network transfer/step - ncclProfilerProxyStepRecvGPUWait, // state marks the waiting of recv data consumption from GPU for given network transfer/step + ncclProfilerProxyStepSendGPUWait = 8, // state marks the waiting of send data from GPU for given network transfer/step + ncclProfilerProxyStepSendPeerWait_v4 = 20,// state marks the waiting of recv clear to send credits for given network transfer/step + ncclProfilerProxyStepSendWait = 9, // state marks the waiting of send data from network for given network transfer/step + ncclProfilerProxyStepRecvWait = 10,// state marks the waiting of recv data from network for given network transfer/step + ncclProfilerProxyStepRecvFlushWait = 11,// state marks the waiting of recv data flush to GPU for given network transfer/step + ncclProfilerProxyStepRecvGPUWait = 12,// state marks the waiting of recv data consumption from GPU for given network transfer/step // ncclProfileProxyCtrl event states - ncclProfilerProxyCtrlIdle, // state marks proxy progress thread idle - ncclProfilerProxyCtrlActive, // state marks proxy progress thread active - ncclProfilerProxyCtrlSleep, // state marks proxy progress thread sleeping - ncclProfilerProxyCtrlWakeup, // state marks proxy progress thread waking up - ncclProfilerProxyCtrlAppend, // state marks append of new network work item begin - ncclProfilerProxyCtrlAppendEnd, // state marks append of new network work item end -} ncclProfilerEventState_v3_t; + ncclProfilerProxyCtrlIdle = 13,// state marks proxy progress thread idle + ncclProfilerProxyCtrlActive = 14,// state marks proxy progress thread active + ncclProfilerProxyCtrlSleep = 15,// state marks proxy progress thread 
sleeping + ncclProfilerProxyCtrlWakeup = 16,// state marks proxy progress thread waking up + ncclProfilerProxyCtrlAppend = 17,// state marks append of new network work item begin + ncclProfilerProxyCtrlAppendEnd = 18,// state marks append of new network work item end + + // ncclProfileNetPlugin event states + ncclProfilerNetPluginUpdate = 21,// state marks update of network defined event + + // ncclProfileKernelCh event states + ncclProfilerKernelChStop = 22,// state marks stop of kernelCh event and timestamp update +} ncclProfilerEventState_v4_t; ``` `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing network requests for the GPU kernel. ProxyOp events are generated for every active channel and -provide a summary of the activity of the proxy progress thread for that channel. +provide a summary of the activity of the proxy progress thread for that channel. Most of the +states for this event were duplicated with `ncclProfileProxyStep` events. Therefore, starting +with version 4 of the profiler interface these states have been deprecated. The same level of +information can still be obtained through the `ncclProfileProxyStep` events. `ncclProfileProxyStep` events are generated by the proxy progress thread while it is processing network requests for the GPU kernel. ProxyStep events describe individual network transfer in @@ -348,15 +364,22 @@ reason the profiler defines the `ncclProfilerEventStateArgs_t` struct, reported ``` typedef union { - struct { // attributes to update for ncclProfileProxyOp events - size_t transSize; // data transferred thus far - int steps; // network transfer/steps processed thus far - } proxyOp; + struct { // attributes for update for ncclProfileProxyStep events + size_t transSize; // transfer size field for this proxy step + } proxyStep; - struct { // attributes to update for ncclProfileProxyCtrl + struct { // attributes to update for ncclProfileProxyCtrl events int appendedProxyOps; // number of appended proxy ops thus far } proxyCtrl; -} ncclProfilerEventStateArgs_v3_t; + + struct { // attributes to update for ncclProfileNetPlugin events + void* data; // network plugin opaque update data field + } netPlugin; + + struct { // attribute to update for ncclProfileKernelCh events + uint64_t pTimer; // timestamp provided by the NCCL kernel + } kernelCh; +} ncclProfilerEventStateArgs_v4_t; ``` The example profiler in `ext-profiler/example` contains details on how to capture and use the events above. @@ -396,12 +419,12 @@ ProxyCtrl event ## Profiling of collective and p2p operations The NCCL code is instrumented with profiler callbacks at different levels to capture start/stop of groups, -collective and point-to-point operations, as well as proxy progress activity. Due to the asynchronous nature +collective and point-to-point operations, as well as proxy, kernel and network activity. Due to the asynchronous nature of NCCL operations, events associated to collective and point-to-point operations are not easy to delimit precisely. For example, without both proxy and/or kernel activity it is impossible for the profiler to figure out when a collective operation completes. Therefore, `stopEvent` for collectives simply indicates to -the profiler that the collective has been enqueued. The profiler can leverage proxy event information, if -these are enabled, to estimate when the collective ends. In this case, the profiler can look at the `stopEvent` +the profiler that the collective has been enqueued. 
The profiler can leverage proxy and/or kernel event information, if +these are enabled, to estimate when the collective ends. For example, the profiler can look at the `stopEvent` call of the last `ncclProfileProxyOp` event to mark the completion of the associated collective event. This can be achieved by reference counting the collective event and letting calls to `startEvent` and `stopEvent` increment and decrement the reference counter, respectively. @@ -425,8 +448,14 @@ enqueue can be time stamped by the profiler (at start and stop) to reconstruct t collective. However, this time only represents the launch time of the collective and not the actual execution time. To reconstruct the execution time more accurately proxy and kernel events are provided. +With version 3 of the profiler interface network activity is no longer required to do intra-node profiling. Kernel events instrumentation leverages counters exposed by the kernel to the host and the proxy progress thread. Thus, the proxy progress thread infrastructure is shared between the network and the profiler. If the proxy is serving network requests the kernel profiling probing can be delayed, causing loss of accuracy. Similarly, if the CPU is under heavy load and the scheduling of the proxy progress thread is -delayed, a similar loss of accuracy can be encountered. Keep this in mind when using kernel events. +delayed, a similar loss of accuracy can be encountered. + +To mitigate this effect, with version 4 of the profiler NCCL uses a per-channel ring buffer of 64 elements. +Every counter is complemented by two timestamps (ptimers) supplied by the NCCL kernel (one for start and one +for stop of the operation in the kernel). NCCL propagates these timestamps to the profiler plugin that it can +convert them to CPU time domain. diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h index 0638f2df1..4c1b8f53a 100644 --- a/ext-profiler/example/event.h +++ b/ext-profiler/example/event.h @@ -15,24 +15,6 @@ #define MAX_CHANNELS 32 #define MAX_STEPS 16 #define MAX_OPS 16 // Up to 64K ranks for PAT - -#define PROXY_OP_SEND_STATE_OFFSET (ncclProfilerProxyOpSendPosted) -#define PROXY_OP_RECV_STATE_OFFSET (ncclProfilerProxyOpRecvPosted) -#define PROXY_STEP_SEND_STATE_OFFSET (ncclProfilerProxyStepSendGPUWait) -#define PROXY_STEP_RECV_STATE_OFFSET (ncclProfilerProxyStepRecvWait) - -#define NUM_PROXY_OP_SEND_STATES (ncclProfilerProxyOpSendDone - ncclProfilerProxyOpSendPosted + 1) -#define NUM_PROXY_OP_RECV_STATES (ncclProfilerProxyOpRecvDone - ncclProfilerProxyOpRecvPosted + 1) -#define NUM_PROXY_STEP_SEND_STATES (ncclProfilerProxyStepSendWait - ncclProfilerProxyStepSendGPUWait + 1) -#define NUM_PROXY_STEP_RECV_STATES (ncclProfilerProxyStepRecvGPUWait - ncclProfilerProxyStepRecvWait + 1) - -#define PROXY_OP_SEND_STATE_IDX(state) (state - PROXY_OP_SEND_STATE_OFFSET) -#define PROXY_OP_RECV_STATE_IDX(state) (state - PROXY_OP_RECV_STATE_OFFSET) -#define PROXY_STEP_SEND_STATE_IDX(state) (state - PROXY_STEP_SEND_STATE_OFFSET) -#define PROXY_STEP_RECV_STATE_IDX(state) (state - PROXY_STEP_RECV_STATE_OFFSET) - -#define MAX_PROXY_OP_STATES ((NUM_PROXY_OP_SEND_STATES > NUM_PROXY_OP_RECV_STATES ) ? NUM_PROXY_OP_SEND_STATES : NUM_PROXY_OP_RECV_STATES) -#define MAX_PROXY_STEP_STATES ((NUM_PROXY_STEP_SEND_STATES > NUM_PROXY_STEP_RECV_STATES) ? 
NUM_PROXY_STEP_SEND_STATES : NUM_PROXY_STEP_RECV_STATES) #define MAX_EVENTS_PER_REQ (8) struct proxyOp; @@ -68,13 +50,24 @@ struct kernelCh { struct taskEventBase* parent; double startTs; double stopTs; + uint64_t startGpuClk; + uint64_t stopGpuClk; }; +#define PROXY_STEP_SEND_GPU_WAIT 0 +#define PROXY_STEP_SEND_PEER_WAIT 1 +#define PROXY_STEP_SEND_WAIT 2 +#define PROXY_STEP_RECV_WAIT 0 +#define PROXY_STEP_RECV_FLUSH_WAIT 1 +#define PROXY_STEP_RECV_GPU_WAIT 2 +#define PROXY_STEP_MAX_STATES 3 + struct proxyStep { uint8_t type; // type of event: network transfer + int state; int step; // network transfer id in given channel int isSend; // send/recv channel operation - double timestamp[MAX_PROXY_STEP_STATES]; + double timestamp[PROXY_STEP_MAX_STATES]; double startTs; double stopTs; struct proxyOp* parent; @@ -92,11 +85,8 @@ struct proxyOp { int chunkSize; // chunk size for this proxy operation int isSend; // send/recv channel operation size_t transSize; // transfer data size for this proxy operation - struct { - int steps; // completed steps for this proxy operation state - double timestamp; - } states[MAX_PROXY_OP_STATES]; double startTs; + double progrTs; // In progress state transition double stopTs; int stepCount; // last processed network operation for this proxy operation struct proxyStep step[MAX_STEPS]; // array of network transfer events @@ -119,8 +109,6 @@ struct proxyCtrl { struct taskEventBase { uint8_t type; // event type: collective/p2p int rank; // rank of the operation in NCCL communicator - const char* name; // FIXME: unused - uint64_t commHash; // communicator identifier const char* func; // ncclFunc* int refCount; // number of references for this operation struct group* parent; // parent event group @@ -137,12 +125,11 @@ struct collective { size_t count; int root; const char* datatype; - uint8_t nMaxChannels; + uint8_t nChannels; const char* algo; const char* proto; int nWarps; - struct proxyOp send[MAX_CHANNELS][MAX_OPS];// array of send proxy operation events - struct proxyOp recv[MAX_CHANNELS][MAX_OPS];// array of recv proxy operation events + struct proxyOp op[MAX_CHANNELS][2*MAX_OPS]; int nProxyOps[MAX_CHANNELS]; struct kernelCh kernel[MAX_CHANNELS]; }; @@ -154,6 +141,7 @@ struct p2p { size_t count; const char* datatype; int peer; + uint8_t nChannels; struct proxyOp op[MAX_CHANNELS]; struct kernelCh kernel[MAX_CHANNELS]; }; @@ -172,6 +160,11 @@ struct group { // arrays for different event objects struct context { + const char* commName; + uint64_t commHash; + int nranks; + int rank; + int groupPoolSize; int groupPoolBase; int groupPoolIndex; diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h index d02202d51..c911426d9 100644 --- a/ext-profiler/example/nccl/profiler.h +++ b/ext-profiler/example/nccl/profiler.h @@ -25,42 +25,52 @@ enum { }; typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, + ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 + ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 + ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 + ncclProfilerProxyOpSendDone = 3, // deprecated in v4 + ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 + ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 + ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 
+ ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 + ncclProfilerProxyOpInProgress_v4 = 19, /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, + ncclProfilerProxyStepSendGPUWait = 8, + ncclProfilerProxyStepSendPeerWait_v4 = 20, + ncclProfilerProxyStepSendWait = 9, + ncclProfilerProxyStepRecvWait = 10, + ncclProfilerProxyStepRecvFlushWait = 11, + ncclProfilerProxyStepRecvGPUWait = 12, /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, + ncclProfilerProxyCtrlIdle = 13, + ncclProfilerProxyCtrlActive = 14, + ncclProfilerProxyCtrlSleep = 15, + ncclProfilerProxyCtrlWakeup = 16, + ncclProfilerProxyCtrlAppend = 17, + ncclProfilerProxyCtrlAppendEnd = 18, + + /* Network defined events states */ + ncclProfilerNetPluginUpdate = 21, + + /* Kernel event states */ + ncclProfilerKernelChStop = 22, } ncclProfilerEventState_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; +#include "profiler_v4.h" #include "profiler_v3.h" #include "profiler_v2.h" #include "profiler_v1.h" #include "profiler_net.h" -typedef ncclProfiler_v3_t ncclProfiler_t; -typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v4_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; #endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_v3.h b/ext-profiler/example/nccl/profiler_v3.h index c1f1b919f..377118532 100644 --- a/ext-profiler/example/nccl/profiler_v3.h +++ b/ext-profiler/example/nccl/profiler_v3.h @@ -111,9 +111,4 @@ typedef struct { ncclResult_t (*finalize)(void* context); } ncclProfiler_v3_t; -typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventState_v3_t ncclProfilerEventState_t; -typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; -typedef ncclProfiler_v3_t ncclProfiler_t; - #endif diff --git a/ext-profiler/example/nccl/profiler_v4.h b/ext-profiler/example/nccl/profiler_v4.h new file mode 100644 index 000000000..489f264c4 --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v4.h @@ -0,0 +1,123 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V4_H_ +#define PROFILER_V4_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... 
+ void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v4_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v4_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commName : user assigned communicator name + // - commHash : communicator id + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communciator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v4_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index 08408dba7..e3f707a0a 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -38,6 +38,9 @@ static int detachPoolIndex; static int detachPoolDone; static struct proxyOp* detachPool; +ncclDebugLogger_t logFn; +#define INFO(FLAGS, ...) 
logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) + static double freq = -1; __hidden void calibrate() { struct timeval tv; @@ -60,7 +63,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static pid_t pid; static int* eActivationMaskPtr; -__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) { +__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { pthread_mutex_lock(&lock); if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) { // first thread initializes event mask, environment and detach pool @@ -106,6 +109,13 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) // pre-allocate memory for event object pools in dedicated profiler context struct context* ctx = (struct context *)calloc(1, sizeof(*ctx)); + ctx->commName = commName; + ctx->commHash = commHash; + ctx->nranks = nranks; + ctx->rank = rank; + logFn = logfn; + INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank); + ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool)); if (ctx->groupPool == NULL) goto fail; @@ -142,17 +152,16 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask) __hidden ncclResult_t exampleProfilerFinalize(void* context) { FILE* fh = NULL; char filename[PATH_MAX] = { 0 }; - char hostname[64] = { 0 }; - gethostname(hostname, 64); + struct context* ctx = (struct context *)context; const char* dump = getenv("NCCL_PROFILE_DUMP_FILE"); if (dump) { - sprintf(filename, "%s-%s-%ld.txt", dump, hostname, syscall(SYS_gettid)); + sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank); fh = fopen(filename, "w"); fprintf(fh, "[\n"); } + INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank); // print last N groups/collectives/p2ps - struct context* ctx = (struct context *)context; int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? 
ctx->groupPoolIndex - groupPoolSize : 0; int end = ctx->groupPoolIndex; for (int i = start; i < end; i++) { @@ -243,8 +252,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->base.type = ncclProfileColl; event->base.rank = eDescr->rank; - event->base.name = eDescr->coll.name; - event->base.commHash = eDescr->coll.commHash; event->base.func = eDescr->coll.func; event->base.startTs = gettime() - startTime; event->base.parent = parent; @@ -254,7 +261,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->count = eDescr->coll.count; event->root = eDescr->coll.root; event->datatype = eDescr->coll.datatype; - event->nMaxChannels = eDescr->coll.nMaxChannels; + event->nChannels = eDescr->coll.nChannels; event->nWarps = eDescr->coll.nWarps; event->algo = eDescr->coll.algo; event->proto = eDescr->coll.proto; @@ -281,8 +288,6 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->base.type = ncclProfileP2p; event->base.rank = eDescr->rank; - event->base.name = eDescr->p2p.name; - event->base.commHash = eDescr->p2p.commHash; event->base.func = eDescr->p2p.func; event->base.next = parent->eventHead; event->base.startTs = gettime() - startTime; @@ -291,6 +296,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->count = eDescr->p2p.count; event->datatype = eDescr->p2p.datatype; event->peer = eDescr->p2p.peer; + event->nChannels = eDescr->p2p.nChannels; *eHandle = event; // increment the group ref counter so the event will staty open taskEventQueueEnqueue(parent, (struct taskEventBase *)event); @@ -331,6 +337,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->isSend = eDescr->proxyOp.isSend; event->startTs = gettime() - startTime; event->parent = NULL; + event->stepCount = 0; *eHandle = event; debugEvent(event, "PxnProxyOpStart"); return ncclSuccess; @@ -339,9 +346,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n if (eventBase->type == ncclProfileColl) { struct collective* parent = (struct collective *)eDescr->parentObj; int channelId = eDescr->proxyOp.channelId; - struct proxyOp* event = (eDescr->proxyOp.isSend) ? 
- &parent->send[channelId][parent->nProxyOps[channelId]++] : - &parent->recv[channelId][parent->nProxyOps[channelId]++]; + struct proxyOp* event = &parent->op[channelId][parent->nProxyOps[channelId]++]; event->type = ncclProfileProxyOp; event->channelId = channelId; @@ -353,6 +358,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->isSend = eDescr->proxyOp.isSend; event->parent = eventBase; event->startTs = gettime() - startTime; + event->stepCount = 0; *eHandle = event; __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); @@ -370,6 +376,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->isSend = eDescr->proxyOp.isSend; event->parent = eventBase; event->startTs = gettime() - startTime; + event->stepCount = 0; *eHandle = event; __atomic_fetch_add(&parent->base.refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "ProxyOpStart"); @@ -382,9 +389,10 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n int s = parent->stepCount++ % MAX_STEPS; struct proxyStep* event = &parent->step[s]; event->type = ncclProfileProxyStep; + event->state = 0; event->step = eDescr->proxyStep.step; - event->isSend = parent->isSend; event->parent = parent; + event->isSend = parent->isSend; event->startTs = gettime() - startTime; event->nNetEvents = 0; *eHandle = event; @@ -397,6 +405,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; event->type = ncclProfileKernelCh; event->channelId = eDescr->kernelCh.channelId; + event->startGpuClk = eDescr->kernelCh.pTimer; event->parent = eventBase; event->startTs = gettime() - startTime; *eHandle = event; @@ -407,6 +416,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n struct kernelCh* event = &parent->kernel[eDescr->kernelCh.channelId]; event->type = ncclProfileKernelCh; event->channelId = eDescr->kernelCh.channelId; + event->startGpuClk = eDescr->kernelCh.pTimer; event->parent = eventBase; event->startTs = gettime() - startTime; *eHandle = event; @@ -563,29 +573,57 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile // the event handle might be null if we run out of events if (eHandle == NULL) return ncclSuccess; - debugEvent(eHandle, "RecordEventState"); uint8_t type = *(uint8_t *)eHandle; if (type == ncclProfileProxyOp) { struct proxyOp* event = (struct proxyOp *)eHandle; - int steps = event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].steps; - if (eState == ncclProfilerProxyOpSendRemFifoWait && eStateArgs->proxyOp.steps == steps) return ncclSuccess; - event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].steps = eStateArgs->proxyOp.steps; - event->states[event->isSend ? PROXY_OP_SEND_STATE_IDX(eState) : PROXY_OP_RECV_STATE_IDX(eState)].timestamp = gettime() - startTime; - event->transSize = eStateArgs->proxyOp.transSize; + if (eState == ncclProfilerProxyOpInProgress_v4) { + event->progrTs = gettime() - startTime; + } } else if (type == ncclProfileProxyStep) { struct proxyStep* event = (struct proxyStep *)eHandle; - event->timestamp[event->isSend ? 
PROXY_STEP_SEND_STATE_IDX(eState) : PROXY_STEP_RECV_STATE_IDX(eState)] = gettime() - startTime; + struct proxyOp* parent = event->parent; + switch (eState) { + case ncclProfilerProxyStepSendGPUWait: + event->timestamp[PROXY_STEP_SEND_GPU_WAIT] = gettime() - startTime; + break; + case ncclProfilerProxyStepSendPeerWait_v4: + // do not update step event if in SendPeerWait + if (event->state == ncclProfilerProxyStepSendPeerWait_v4) break; + event->timestamp[PROXY_STEP_SEND_PEER_WAIT] = gettime() - startTime; + event->state = ncclProfilerProxyStepSendPeerWait_v4; + break; + case ncclProfilerProxyStepSendWait: + event->timestamp[PROXY_STEP_SEND_WAIT] = gettime() - startTime; + parent->transSize += eStateArgs->proxyStep.transSize; + break; + case ncclProfilerProxyStepRecvWait: + event->timestamp[PROXY_STEP_RECV_WAIT] = gettime() - startTime; + break; + case ncclProfilerProxyStepRecvFlushWait: + event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT] = gettime() - startTime; + parent->transSize += eStateArgs->proxyStep.transSize; + break; + case ncclProfilerProxyStepRecvGPUWait: + event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime; + break; + } } else if (type == ncclProfileProxyCtrl) { struct proxyCtrl* event = (struct proxyCtrl *)eHandle; if (eState == ncclProfilerProxyCtrlAppendEnd) { event->appended = eStateArgs->proxyCtrl.appendedProxyOps; } event->state = eState; + } else if (type == ncclProfileKernelCh) { + struct kernelCh* event = (struct kernelCh *)eHandle; + if (eState == ncclProfilerKernelChStop) { + event->stopGpuClk = eStateArgs->kernelCh.pTimer; + } } + debugEvent(eHandle, "RecordEventState"); return ncclSuccess; } -ncclProfiler_t ncclProfiler_v3 = { +ncclProfiler_t ncclProfiler_v4 = { "Example-profiler", exampleProfilerInit, exampleProfilerStartEvent, diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.c index 43f719045..a56106e10 100644 --- a/ext-profiler/example/print_event.c +++ b/ext-profiler/example/print_event.c @@ -27,8 +27,8 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) { static __thread int collId; __hidden void printCollEventHeader(FILE* fh, struct collective* event) { - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nMaxChannels\": %d}},\n", - event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nMaxChannels); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n", + event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels); } __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { @@ -38,8 +38,8 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { static __thread int p2pId; __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) { - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, 
\"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\"}},\n", - event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.commHash, event->base.rank, event->peer, event->count, event->datatype); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n", + event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels); } __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { @@ -50,47 +50,43 @@ __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { static __thread int proxyOpId; __hidden void printProxyOpEventHeader(FILE* fh, struct proxyOp* event) { if (event->isSend) { - int posted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendPosted); - int remFifoWait = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendRemFifoWait); - int transmitted = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendTransmitted); - int done = PROXY_OP_SEND_STATE_IDX(ncclProfilerProxyOpSendDone); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"REM_FIFO_WAIT\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": {\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n", - "Send", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[remFifoWait].steps, event->states[remFifoWait].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n", + "ScheduleSend", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "ScheduleSend", proxyOpId, getpid(), 1, event->progrTs); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n", + "ProgressSend", proxyOpId, getpid(), 1, event->progrTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize); } else { - int posted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvPosted); - int received = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvReceived); - int transmitted = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvTransmitted); - int done = PROXY_OP_RECV_STATE_IDX(ncclProfilerProxyOpRecvDone); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu, \"POSTED\": {\"step\": %d, \"ts\": %f}, \"RECEIVED\": {\"step\": %d, \"ts\": %f}, \"TRANSMITTED\": 
{\"step\": %d, \"ts\": %f}, \"DONE\": {\"step\": %d, \"ts\": %f}}},\n", - "Recv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize, event->states[posted].steps, event->states[posted].timestamp, event->states[received].steps, event->states[received].timestamp, event->states[transmitted].steps, event->states[transmitted].timestamp, event->states[done].steps, event->states[done].timestamp); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n", + "ScheduleRecv", proxyOpId, getpid(), 1, event->startTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "ScheduleRecv", proxyOpId, getpid(), 1, event->progrTs); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"Peer\": %d, \"Steps\": %d, \"ChunkSize\": %d, \"transSize\": %lu}},\n", + "ProgressRecv", proxyOpId, getpid(), 1, event->progrTs, event->channelId, event->peer, event->nSteps, event->chunkSize, event->transSize); } } __hidden void printProxyOpEventTrailer(FILE* fh, struct proxyOp* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - event->isSend ? "Send" : "Recv", proxyOpId++, getpid(), 1, event->stopTs); + event->isSend ? "ProgressSend" : "ProgressRecv", proxyOpId++, getpid(), 1, event->stopTs); } static __thread int proxyStepId; __hidden void printProxyStepEventHeader(FILE* fh, struct proxyStep* event) { if (event->isSend) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "SendBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); + "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_GPU_WAIT], event->step); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "SendBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)]); + "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_PEER_WAIT]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendGPUWait)], event->step); + "SendPeerWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_PEER_WAIT], event->step); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "SendGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)]); + "SendPeerWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_WAIT]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "SendWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_SEND_STATE_IDX(ncclProfilerProxyStepSendWait)], event->step); + "SendWait", proxyStepId, getpid(), 
1, event->timestamp[PROXY_STEP_SEND_WAIT], event->step); } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "RecvBufferWait", proxyStepId, getpid(), 1, event->startTs, event->step); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "RecvBufferWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)]); - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvWait)], event->step); + "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_WAIT], event->step); } } @@ -100,13 +96,13 @@ __hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) { "SendWait", proxyStepId++, getpid(), 1, event->stopTs); } else { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)]); + "RecvWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvFlushWait)], event->step); + "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_FLUSH_WAIT], event->step); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", - "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)]); + "RecvFlushWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_GPU_WAIT]); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Step\": %d}},\n", - "RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_STATE_IDX(ncclProfilerProxyStepRecvGPUWait)], event->step); + "RecvGpuWait", proxyStepId, getpid(), 1, event->timestamp[PROXY_STEP_RECV_GPU_WAIT], event->step); fprintf(fh, "{\"name\": \"%s\", \"cat\": \"NET\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "RecvGpuWait", proxyStepId++, getpid(), 1, event->stopTs); } @@ -115,8 +111,8 @@ __hidden void printProxyStepEventTrailer(FILE* fh, struct proxyStep* event) { static __thread int kernelId; __hidden void printKernelChEventHeader(FILE* fh, struct kernelCh* event) { if (event->type != ncclProfileKernelCh) return; - fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d}},\n", - "KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId); + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GPU\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"Channel\": %d, \"StartGpuClk\": %lu, \"StopGpuClk\": %lu}},\n", + "KernelCh", kernelId, getpid(), 1, event->startTs, event->channelId, event->startGpuClk, event->stopGpuClk); } __hidden void printKernelChEventTrailer(FILE* fh, struct kernelCh* event) { @@ 
-134,6 +130,8 @@ __hidden void printProxyCtrlEvent(FILE* fh, struct proxyCtrl* event) { str = "Sleep"; } else if (event->state == ncclProfilerProxyCtrlAppend || event->state == ncclProfilerProxyCtrlAppendEnd) { str = "Append"; + } else { + return; } if (event->state == ncclProfilerProxyCtrlAppendEnd) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"PROXY\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"appended\": %d}},\n", @@ -188,9 +186,8 @@ void debugEvent(void* eHandle, const char* tag) { fprintf(fh, "Collective event %p tag = %s {\n", event, tag); fprintf(fh, " refCount = %d\n", __atomic_load_n(&event->base.refCount, __ATOMIC_RELAXED)); fprintf(fh, " parent = %p\n", event->base.parent); - for (int j = 0; j < MAX_OPS; j++) { - for (int i = 0; i < MAX_CHANNELS; i++) if (event->send[i][j].type == ncclProfileProxyOp) fprintf(fh, " send[%d] = %p\n", i, &event->send[i]); - for (int i = 0; i < MAX_CHANNELS; i++) if (event->recv[i][j].type == ncclProfileProxyOp) fprintf(fh, " recv[%d] = %p\n", i, &event->recv[i]); + for (int j = 0; j < 2*MAX_OPS; j++) { + for (int i = 0; i < MAX_CHANNELS; i++) if (event->op[i][j].type == ncclProfileProxyOp) fprintf(fh, " op[%d] = %p\n", i, &event->op[i]); } fprintf(fh, " startTs = %f\n", event->base.startTs); fprintf(fh, " stopTs = %f\n", event->base.stopTs); @@ -207,17 +204,18 @@ void debugEvent(void* eHandle, const char* tag) { } else if (type == ncclProfileProxyOp) { struct proxyOp* event = (struct proxyOp *)eHandle; fprintf(fh, "ProxyOp event %p tag = %s {\n", event, tag); - fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv"); + fprintf(fh, " type = %s\n", event->isSend < 0 ? "Unknown" : event->isSend ? "Send" : "Recv"); fprintf(fh, " channel = %d\n", event->channelId); fprintf(fh, " parent = %p\n", event->parent); fprintf(fh, " rank = %d\n", event->rank); fprintf(fh, " startTs = %f\n", event->startTs); + fprintf(fh, " progrTs = %f\n", event->progrTs); fprintf(fh, " stopTs = %f\n", event->stopTs); fprintf(fh, "}\n"); } else if (type == ncclProfileProxyStep) { struct proxyStep* event = (struct proxyStep *)eHandle; fprintf(fh, "ProxyStep event %p tag = %s {\n", event, tag); - fprintf(fh, " type = %s\n", event->isSend ? "Send" : "Recv"); + fprintf(fh, " type = %s\n", event->isSend < 0 ? "Unknown" : event->isSend ? 
"Send" : "Recv"); fprintf(fh, " parent = %p\n", event->parent); fprintf(fh, " startTs = %f\n", event->startTs); fprintf(fh, " stopTs = %f\n", event->stopTs); @@ -260,8 +258,7 @@ void printEvent(FILE* fh, void* handle) { for (int i = 0; i < MAX_CHANNELS; i++) { printKernelChEventHeader(fh, &c->kernel[i]); for (int j = 0; j < c->nProxyOps[i]; j++) { - printEvent(fh, &c->send[i][j]); - printEvent(fh, &c->recv[i][j]); + printEvent(fh, &c->op[i][j]); } printKernelChEventTrailer(fh, &c->kernel[i]); } diff --git a/ext-profiler/example/print_event.h b/ext-profiler/example/print_event.h index 8e2db4c2d..e32560dca 100644 --- a/ext-profiler/example/print_event.h +++ b/ext-profiler/example/print_event.h @@ -7,6 +7,9 @@ #ifndef PRINT_EVENT_H_ #define PRINT_EVENT_H_ +#include "nccl/common.h" +extern ncclDebugLogger_t logFn; + void debugEvent(void* eHandle, const char* tag); void printEvent(FILE* fh, void* handle); diff --git a/makefiles/common.mk b/makefiles/common.mk index 545203a10..8a35a8fab 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -17,6 +17,8 @@ PROFAPI ?= 1 NVTX ?= 1 RDMA_CORE ?= 0 NET_PROFILER ?= 0 +MLX5DV ?= 0 +MAX_EXT_NET_PLUGINS ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc @@ -49,8 +51,10 @@ CUDA11_PTX = -gencode=arch=compute_80,code=compute_80 CUDA12_PTX = -gencode=arch=compute_90,code=compute_90 CUDA13_PTX = -gencode=arch=compute_120,code=compute_120 - -ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 12; echo $$?),0) +ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) +# Prior to SM75 is deprecated from CUDA13.0 onwards + NVCC_GENCODE ?= $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) +else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0) # Include Blackwell support if we're using CUDA12.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) @@ -66,14 +70,21 @@ else endif $(info NVCC_GENCODE is ${NVCC_GENCODE}) +# CUDA 13.0 requires c++17 +ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) + CXXSTD ?= -std=c++17 +else + CXXSTD ?= -std=c++11 +endif + CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ - -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \ - -I $(CUDA_INC) \ + -Wall -Wno-unused-function -Wno-sign-compare $(CXXSTD) -Wvla \ + -I $(CUDA_INC) -I $(CUDA_INC)/cccl \ $(CXXFLAGS) # Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors) # 512 : 120, 640 : 96, 768 : 80, 1024 : 60 # We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions. 
-NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all +NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt @@ -136,9 +147,17 @@ CXXFLAGS += -DPROFAPI endif ifneq ($(RDMA_CORE), 0) -CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 +CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1 -libverbs +endif + +ifneq ($(MLX5DV), 0) +CXXFLAGS += -DNCCL_BUILD_MLX5DV=1 -lmlx5 endif ifneq ($(NET_PROFILER), 0) CXXFLAGS += -DNCCL_ENABLE_NET_PROFILING=1 endif + +ifneq ($(MAX_EXT_NET_PLUGINS), 0) +CXXFLAGS += -DNCCL_NET_MAX_PLUGINS=$(MAX_EXT_NET_PLUGINS) +endif diff --git a/makefiles/version.mk b/makefiles/version.mk index 5c0b0de9a..f41e7a783 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 26 -NCCL_PATCH := 6 +NCCL_MINOR := 27 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index 65da6300b..eab662ef9 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,7 @@ include ../makefiles/version.mk INCEXPORTS := nccl.h LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc \ + init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ diff --git a/src/allocator.cc b/src/allocator.cc new file mode 100644 index 000000000..c58181948 --- /dev/null +++ b/src/allocator.cc @@ -0,0 +1,196 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "transport.h" +#include "group.h" + +NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); +ncclResult_t ncclMemAlloc(void **ptr, size_t size) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + ncclResult_t ret = ncclSuccess; + +#if CUDART_VERSION >= 12010 + size_t memGran = 0; + CUdevice currentDev; + CUmemAllocationProp memprop = {}; + CUmemAccessDesc accessDesc = {}; + CUmemGenericAllocationHandle handle = (CUmemGenericAllocationHandle)-1; + int cudaDev; + int flag; + int dcnt; + + if (ptr == NULL || size == 0) goto fallback; + + if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; + + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + + if (ncclCuMemEnable()) { + size_t handleSize = size; + int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + // Query device to see if FABRIC handle support is available + flag = 0; + (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); + if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC; + memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; + memprop.location.id = currentDev; + // Query device to see if RDMA support is available + flag = 0; + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev)); + if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; + CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + CUDACHECK(cudaGetDeviceCount(&dcnt)); + ALIGN_SIZE(handleSize, memGran); + + if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { + /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ + CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0)); + if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { + requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; + memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); + } else if (err != CUDA_SUCCESS) { + // Catch and report any error from above + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); + } + } else { + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); + } + /* Reserve a virtual address range */ + CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0)); + /* Map the virtual address range to the physical allocation */ + CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0)); + /* Now allow RW access to the newly mapped memory */ + for (int i = 0; i < dcnt; ++i) { + int p2p = 0; + if (i == cudaDev || ((cudaDeviceCanAccessPeer(&p2p, i, cudaDev) == cudaSuccess) && p2p)) { + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = i; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1)); + } + if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); + } + goto exit; + } + +fallback: +#endif + // Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. 
That's deliberate though: + // we want CUDA to return an error to the caller. + // coverity[var_deref_model] + CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail); + +exit: + return ret; +fail: + goto exit; +} + +NCCL_API(ncclResult_t, ncclMemFree, void *ptr); +ncclResult_t ncclMemFree(void *ptr) { + NVTX3_FUNC_RANGE_IN(nccl_domain); + ncclResult_t ret = ncclSuccess; + int saveDevice; + + CUDACHECK(cudaGetDevice(&saveDevice)); +#if CUDART_VERSION >= 12010 + CUdevice ptrDev = 0; + + if (ptr == NULL) goto fallback; + if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; + + CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail); + CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail); + goto exit; + } + +fallback: +#endif + CUDACHECKGOTO(cudaFree(ptr), ret, fail); + +exit: + CUDACHECK(cudaSetDevice(saveDevice)); + return ret; +fail: + goto exit; +} + +// This is a collective function and should be called by all ranks in the communicator +ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) { + ncclResult_t ret = ncclSuccess; + void* regSymAddr = NULL; + size_t allocSize = size; + size_t granularity; + CUdevice cuDev; + CUmemAllocationProp memprop = {}; + CUmemGenericAllocationHandle memHandle; + int bit = 0, cnt = 0; + + // aligment must be power of 2 as an input + while (bit < sizeof(size_t) * 8) { + if (alignment & (1L << bit)) cnt++; + if (cnt == 2) { + WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment); + goto fail; + } + bit++; + } + // temporarily align the alignment to NCCL_REC_PAGE_SIZE + ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE); + + CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail); + memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + memprop.requestedHandleTypes = ncclCuMemHandleType; + memprop.location.id = cuDev; + CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); + ALIGN_SIZE(allocSize, granularity); + + CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail); + ALIGN_SIZE(comm->symAllocHead, alignment); + NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, ®SymAddr), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); + comm->symAllocHead += allocSize; + *symPtr = regSymAddr; + +exit: + return ret; +fail: + *symPtr = NULL; + goto exit; +} + +ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) { + CUmemGenericAllocationHandle handle; + size_t size = 0; + ncclResult_t ret = ncclSuccess; + int saveDev = comm->cudaDev; + CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail); + if (ncclCuMemEnable()) { + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + CUCHECKGOTO(cuMemRetainAllocationHandle(&handle, symPtr), ret, fail); + CUCHECKGOTO(cuMemRelease(handle), ret, fail); + CUCHECKGOTO(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)symPtr), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, size, symPtr), ret, fail); + NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, size, symPtr), ret, fail); + CUCHECKGOTO(cuMemRelease(handle), ret, fail); + } +exit: + 
CUDACHECK(cudaSetDevice(saveDev)); + return ret; +fail: + goto exit; +} diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 9e24faadf..f05337249 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -94,6 +94,7 @@ ncclResult_t bootstrapNetInit() { pthread_mutex_lock(&bootstrapNetLock); if (bootstrapNetInitDone == 0) { const char* env = ncclGetEnv("NCCL_COMM_ID"); + int nIfs = 0; if (env) { union ncclSocketAddress remoteAddr; if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) { @@ -101,13 +102,15 @@ ncclResult_t bootstrapNetInit() { pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidArgument; } - if (ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) { + NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, + &nIfs)); + if (nIfs <= 0) { WARN("NET/Socket : No usable listening interface found"); pthread_mutex_unlock(&bootstrapNetLock); return ncclSystemError; } } else { - int nIfs = ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1); + NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs)); if (nIfs <= 0) { WARN("Bootstrap : no socket interface found"); pthread_mutex_unlock(&bootstrapNetLock); @@ -828,7 +831,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo NCCLCHECKGOTO(ncclCalloc(&state->peerP2pAddresses, nranks), ret, fail); memcpy(state->peerP2pAddresses + rank, &peerSocketAddress, sizeof(union ncclSocketAddress)); - if (parent->config.splitShare) { + if (parent->shareResources) { /* map local rank to top parent local rank. */ for (int i = 0; i < nranks; ++i) { comm->topParentRanks[i] = parent->topParentRanks[parentRanks[i]]; diff --git a/src/channel.cc b/src/channel.cc index bc48986d8..c2b88414b 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -147,7 +147,7 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks) { int nPeers = nRanks + collnetNRanks + nvlsNRanks; /* channel peers are only valid when async init thread completes commAlloc() and - * the channel is intialized with initChannel(); if either is not done, this channel + * the channel is initialized with initChannel(); if either is not done, this channel * should never be free. 
*/ if (channel->id == -1 || channel->peers == NULL) return ncclSuccess; diff --git a/src/debug.cc b/src/debug.cc index e2cc4f810..f034bc7e0 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -16,6 +16,8 @@ #include #include "param.h" +#define NCCL_DEBUG_RESET_TRIGGERED (-2) + int ncclDebugLevel = -1; static uint32_t ncclDebugTimestampLevels = 0; // bitmaps of levels that have timestamps turned on static char ncclDebugTimestampFormat[256]; // with space for subseconds @@ -26,7 +28,7 @@ static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form -static uint64_t ncclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask is INIT and ENV +static uint64_t ncclDebugMask = 0; FILE *ncclDebugFile = stdout; static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; static std::chrono::steady_clock::time_point ncclEpoch; @@ -34,11 +36,16 @@ static bool ncclWarnSetDebugInfo = false; static __thread int tid = -1; +// This function must be called with ncclDebugLock locked! static void ncclDebugInit() { - pthread_mutex_lock(&ncclDebugLock); - if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; } const char* nccl_debug = ncclGetEnv("NCCL_DEBUG"); int tempNcclDebugLevel = -1; + uint64_t tempNcclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask + if (ncclDebugLevel == NCCL_DEBUG_RESET_TRIGGERED && ncclDebugFile != stdout) { + // Finish the reset initiated via ncclResetDebugInit(). + fclose(ncclDebugFile); + ncclDebugFile = stdout; + } if (nccl_debug == NULL) { tempNcclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { @@ -61,7 +68,7 @@ static void ncclDebugInit() { if (ncclDebugSubsysEnv != NULL) { int invert = 0; if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } - ncclDebugMask = invert ? ~0ULL : 0ULL; + tempNcclDebugMask = invert ? ~0ULL : 0ULL; char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv); char *subsys = strtok(ncclDebugSubsys, ","); while (subsys != NULL) { @@ -102,7 +109,7 @@ static void ncclDebugInit() { mask = NCCL_ALL; } if (mask) { - if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; + if (invert) tempNcclDebugMask &= ~mask; else tempNcclDebugMask |= mask; } subsys = strtok(NULL, ","); } @@ -246,15 +253,15 @@ static void ncclDebugInit() { if (debugFn[0] != '\0') { FILE *file = fopen(debugFn, "w"); if (file != nullptr) { - setbuf(file, nullptr); // disable buffering + setlinebuf(file); // disable block buffering ncclDebugFile = file; } } } ncclEpoch = std::chrono::steady_clock::now(); + ncclDebugMask = tempNcclDebugMask; __atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE); - pthread_mutex_unlock(&ncclDebugLock); } /* Common logging function used by the INFO, WARN and TRACE macros @@ -262,19 +269,38 @@ static void ncclDebugInit() { * they can share the debugging mechanisms and output files */ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { - if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit(); + bool locked = false; // Keeps track of the ncclDebugLock state. 
+ int gotLevel = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); + if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } // Save the last error (WARN) as a human readable string if (level == NCCL_LOG_WARN) { pthread_mutex_lock(&ncclDebugLock); + locked = true; va_list vargs; va_start(vargs, fmt); (void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs); va_end(vargs); + } + + if (gotLevel >= 0 && (gotLevel < level || (flags & ncclDebugMask) == 0)) { + if (locked) + pthread_mutex_unlock(&ncclDebugLock); + return; + } + + if (!locked) { + pthread_mutex_lock(&ncclDebugLock); + locked = true; + } + // From this point on ncclDebugLock is always locked so we don't need to check "locked" anymore. + if (ncclDebugLevel < 0) + ncclDebugInit(); + if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) { pthread_mutex_unlock(&ncclDebugLock); + return; } - if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return; if (tid == -1) { tid = syscall(SYS_gettid); @@ -335,7 +361,7 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file // Add level specific formatting. if (level == NCCL_LOG_WARN) { len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] %s:%d NCCL WARN ", cudaDev, filefunc, line); - if (ncclWarnSetDebugInfo) ncclDebugLevel = NCCL_LOG_INFO; + if (ncclWarnSetDebugInfo) __atomic_store_n(&ncclDebugLevel, NCCL_LOG_INFO, __ATOMIC_RELEASE); } else if (level == NCCL_LOG_INFO) { len += snprintf(buffer+len, sizeof(buffer)-len, "[%d] NCCL INFO ", cudaDev); } else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) { @@ -360,19 +386,17 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file // necessary since we write bytes instead of the string. buffer[len++] = '\n'; fwrite(buffer, 1, len, ncclDebugFile); + pthread_mutex_unlock(&ncclDebugLock); } NCCL_API(void, ncclResetDebugInit); void ncclResetDebugInit() { // Cleans up from a previous ncclDebugInit() and reruns. // Use this after changing NCCL_DEBUG and related parameters in the environment. - __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); - if (ncclDebugFile != stdout) { - fclose(ncclDebugFile); - ncclDebugFile = stdout; - } - ncclDebugLevel = -1; - ncclDebugInit(); + pthread_mutex_lock(&ncclDebugLock); + // Let ncclDebugInit() know to complete the reset. + __atomic_store_n(&ncclDebugLevel, NCCL_DEBUG_RESET_TRIGGERED, __ATOMIC_RELEASE); + pthread_mutex_unlock(&ncclDebugLock); } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); diff --git a/src/device/Makefile b/src/device/Makefile index 3562563fc..df58489a0 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -23,6 +23,9 @@ INCFLAGS = -I. -I.. 
-I$(BUILDDIR)/include -I../include NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" CXXFLAGS += $(INCFLAGS) +NVCUFLAGS_SYM := -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all +NVCUFLAGS_SYM += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" + SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1 @@ -30,7 +33,22 @@ COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1 define COMPILE @$(SAY) "Compiling" $2;\ mkdir -p $(dir $1);\ - $(call COMPILE$(suffix $2),$1,$2) + $(call COMPILE$(or $3,$(suffix $2)),$1,$2) +endef + +ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12080))"),1) + NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a \ + -gencode=arch=compute_120a,code=sm_120a +else ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12070))"),1) + NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a +else + NVCC_GENCODE_LDMC_FP8 = +endif + +define COMPILE_SYM +@$(SAY) "Compiling" $2;\ + mkdir -p $(dir $1);\ + $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1 endef DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1 @@ -48,8 +66,6 @@ endef all: $(MANIFEST) -ifeq (1,1) -# Case if the directory is generated on-demand: $(OBJDIR)/gensrc: generate.py @mkdir -p $@ (which python3 >/dev/null || \ @@ -57,22 +73,26 @@ $(OBJDIR)/gensrc: generate.py printf "\n$${bar}\nERROR: Building NCCL requires a Python 3 installation invokable as 'python3'.\n$${bar}\n\n" 1>&2; \ exit 1)) \ && ./generate.py $@ "$(ONLY_FUNCS)" -else -# Case if the directory is pre-generated and checked in the repo as ./gen: -$(OBJDIR)/gensrc: - @mkdir -p $(OBJDIR); ln -srfn ./gen $@ -endif + +$(OBJDIR)/gensrc/symmetric: $(OBJDIR)/gensrc symmetric/generate.py + @mkdir -p $@ + ./symmetric/generate.py $@ # The trailing ";" is necessary to make this an "empty recipe": # https://www.gnu.org/software/make/manual/html_node/Empty-Recipes.html $(OBJDIR)/gensrc/rules.mk: $(OBJDIR)/gensrc ; +$(OBJDIR)/gensrc/symmetric/rules.mk: $(OBJDIR)/gensrc/symmetric ; + -include $(OBJDIR)/gensrc/rules.mk # "gensrc/rules.mk" populates $(LIB_OBJS_GEN) +-include $(OBJDIR)/gensrc/symmetric/rules.mk +# "gensrc/symmetric/rules.mk" populates $(LIB_OBJS_SYM_GEN) + SRCS = common.cu onerank.cu -LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN) +LIB_OBJS = $(patsubst %, $(OBJDIR)/%.o, $(SRCS)) $(LIB_OBJS_GEN) $(LIB_OBJS_SYM_GEN) $(OBJDIR)/%.o: % $(OBJDIR)/%.d $(call COMPILE,$@,$<) @@ -80,12 +100,18 @@ $(OBJDIR)/%.o: % $(OBJDIR)/%.d $(OBJDIR)/genobj/%.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/%.d $(call COMPILE,$@,$(OBJDIR)/gensrc/$*) +$(OBJDIR)/genobj/symmetric/%.o: $(OBJDIR)/gensrc/symmetric $(OBJDIR)/genobj/symmetric/%.d + $(call COMPILE,$@,$(OBJDIR)/gensrc/symmetric/$*) + $(OBJDIR)/%.d: % $(call DEPENDS,$@,$<) $(OBJDIR)/genobj/%.d: $(OBJDIR)/gensrc/% $(call DEPENDS,$@,$<) +$(OBJDIR)/genobj/symmetric/%.d: $(OBJDIR)/gensrc/symmetric/% + $(call DEPENDS,$@,$<) + $(DEVGLUE_OBJ): $(LIB_OBJS) $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ @@ -94,6 +120,7 @@ $(MANIFEST): $(LIB_OBJS) $(DEVGLUE_OBJ) -include $(wildcard $(OBJDIR)/*.d) -include $(wildcard $(OBJDIR)/genobj/*.d) +-include $(wildcard $(OBJDIR)/genobj/symmetric/*.d) .PHONY: clean clean: diff --git a/src/device/all_gather.h b/src/device/all_gather.h index 854ebbf3a..db967861e 100644 --- a/src/device/all_gather.h +++ b/src/device/all_gather.h @@ 
-173,73 +173,221 @@ struct RunWorkColl struct RunWorkColl { + template + struct Scatterer { + struct ncclDevWorkColl* work; + ssize_t chunkSize; + ssize_t railGridOffset; + + template + __device__ __forceinline__ void operator()( + int tid, int tn, int slice, int maxSliceSize, + int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag + ) { + static_assert(SlicePerChunk==1, "require: SlicePerChunk==1"); + static_assert(MaxDsts<=1 || MaxSrcs<=1, "require: MaxDsts<=1 || MaxSrcs<=1"); + + struct ncclNvls* nvls = &ncclShmem.channel.nvls; + int nNodes = ncclShmem.comm.nNodes; + int nRails = nvls->nHeads; + int part = ncclShmem.channelId - work->channelLo; + char* inbuf = (char*)work->sendbuff; + char* outbuf = (char*)work->recvbuff; + ssize_t countPerRank = work->collnet.count; + bool inPlace = (inbuf == outbuf + ncclShmem.comm.rank * countPerRank); + ssize_t railAllBeg = min(railGridOffset + part * chunkSize, nNodes * countPerRank); + ssize_t railAllEnd = min(railAllBeg + chunkSize, nNodes * countPerRank); + int railAllSize = railAllEnd - railAllBeg; + int rail = 0; + int src = 0; + + if (BcastSendNotRecv) { + rail = nvls->headRank; + } else { + if (work->regUsed) return; + rail = 0; + } + if (tid < nDsts) dstSizes[tid] = railAllSize; + do { + int node = railAllBeg / countPerRank; + int railAllOffset = 0; + while (railAllOffset < railAllSize) { + ssize_t railOneBeg = node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; + ssize_t railOneOffset = (railAllBeg + railAllOffset) - railOneBeg; + int delta = min(railAllEnd, railOneEnd) - (railAllBeg + railAllOffset); + int rank = ncclShmem.comm.collNetDenseToUserRank[node * nRails + rail]; + ssize_t userOneBeg = rank * countPerRank + railOneOffset; + int outIsDst = (inPlace && rank == ncclShmem.comm.rank) || BcastSendNotRecv || work->regUsed ? 0 : 1; + if (nSrcs != 0 && outIsDst + nDsts != 0) { + reduceCopy + (tid, tn, 0, nullptr, false, + /*nSrcs=*/1, [=]__device__(int s/*==0*/) -> void* { + return (char*)srcPtrs[src] + railAllOffset; + }, + /*nDsts=*/outIsDst + nDsts, [=]__device__(int d) -> void* { + return d < outIsDst ? outbuf + userOneBeg + : work->regUsed ? (char*)dstPtrs[d - outIsDst] + userOneBeg + : (char*)dstPtrs[d - outIsDst] + railAllOffset; + }, delta); + } + railAllOffset += delta; + node += 1; + } + rail += 1; + src += 1; + } while (!BcastSendNotRecv && src < nRails); + } + }; + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - const ssize_t rank = ncclShmem.comm.rank; - size_t count, gridOffset, channelCount; - size_t chunkCount; - ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); - size_t offset; int nelem; - const int nThreadsBcast = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : 4 * WARP_SIZE; - const int nThreadsGather = work->regUsed ? WARP_SIZE : NCCL_MAX_NTHREADS - nThreadsBcast; - const int tidEndGather = nThreadsGather; - const int tidEndBcast = tidEndGather + nThreadsBcast; + const int nThreadsNetSend = work->oneNode ? 0 : (work->netRegUsed ? WARP_SIZE : 6 * WARP_SIZE); + const int nThreadsGather = work->regUsed ? 
roundUp(nvls->nHeads << 2, WARP_SIZE) : 8 * WARP_SIZE; + const int nThreadsBcast = NCCL_MAX_NTHREADS - nThreadsNetSend - nThreadsGather; - if (!work->regUsed) { - if (tid < tidEndGather) { - // Gather - using Proto = ProtoSimple<1, 1, COLL_UNROLL>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, - work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); - prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0); + const int tidEndGather = nThreadsGather; + const int tidEndNetSend = tidEndGather + nThreadsNetSend; + const int tidEndBcast = tidEndNetSend + nThreadsBcast; + + if (work->oneNode) { + const ssize_t rank = ncclShmem.comm.rank; + size_t count, gridOffset, channelCount, offset, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); + if (!work->regUsed) { + if (tid < tidEndGather) { + // Gather + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsGather, nvls->up, NULL, NULL, work->recvbuff, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + prims.gather(offset, nvls->nHeads * count, nelem, count, -1, 0); + } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + } else if (tid < tidEndBcast) { + // Bcast through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + prims.send(offset, nelem); + } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - } else if (tid < tidEndBcast) { - // Bcast through NVLS - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndGather, nThreadsBcast, NULL, &nvls->down, work->sendbuff, NULL, - work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); - prims.send(offset, nelem); + } else { + if (tid < tidEndGather) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + + /* used as sync */ + prims.scatter(0, 0, 0, 0, -1, 0); + + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + prims.gather(0, 0, 0, 0, -1, 0); + } + } else if (tid < tidEndBcast) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL, + work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work); + /* used as sync */ + prims.recv(0, 0); + + for (size_t 
elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + ssize_t inpOffset = gridOffset + elemOffset; + ssize_t outOffset = inpOffset + rank * count; + nelem = min(chunkCount, channelCount - elemOffset); + prims.directSend(inpOffset, outOffset, nelem); + } } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 } } else { - /* direct allgather */ + // NVLS + IB SHARP + int nNodes = ncclShmem.comm.nNodes; + int part = ncclShmem.channelId - work->channelLo; + ssize_t countPerRank = work->collnet.count; + const int nChannels = work->channelHi - work->channelLo + 1; + ssize_t chunkCount = work->collnet.chunkCount; if (tid < tidEndGather) { using Proto = ProtoSimple<1, 1, COLL_UNROLL>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsGather, nvls->up, nvls->up, NULL, NULL, - work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); - - /* used as sync */ - prims.scatter(0, 0, 0, 0, -1, 0); - - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - prims.gather(0, 0, 0, 0, -1, 0); + Primitives, /*Direct=*/1, Proto, 0> + prims(tid, nThreadsGather, nvls->up, nullptr, nullptr, work->recvbuff, + /*redOpArg=*/0, 1 * Proto::MaxGroupWidth, 1, 1, work); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkSize = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); } - } else if (tid < tidEndBcast) { - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; - Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndGather, nThreadsBcast, &nvls->down, &nvls->down, work->sendbuff, NULL, - work->redOpArg, 1 * Proto::MaxGroupWidth, 0, 0, work); - /* used as sync */ - prims.recv(0, 0); - - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - ssize_t inpOffset = gridOffset + elemOffset; - ssize_t outOffset = inpOffset + rank * count; - nelem = min(chunkCount, channelCount - elemOffset); - prims.directSend(inpOffset, outOffset, nelem); + } else { + if (work->netRegUsed) { + using ProtoSend = ProtoSimple<1, 1, COLL_UNROLL>; + using ProtoBcast = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + int maxSteps = (int)divUp(nNodes * countPerRank, nChannels * chunkCount); + int curSteps = -1; + int postThread = tid - tidEndGather == 0 ? 1 : 0; + // for UB, we need to control the send speed to avoid net congestion. + // first unroll 2 steps, then unroll the rest steps when the data is received. 
+ if (postThread) { + curSteps = min(2, maxSteps); + Primitives, /*Direct=*/1, ProtoSend, 0>::sendPeerNotify(nvls->out, 1, curSteps); + } + Primitives, /*Direct=*/1, ProtoBcast, 0> + prims(tid - tidEndGather, nThreadsNetSend + nThreadsBcast, &nvls->out, &nvls->down, nullptr, nullptr, + /*redOpArg=*/0, 2 * ProtoBcast::MaxGroupWidth, 0, 0, work); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkSize = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); + if (postThread && curSteps < maxSteps) { + curSteps++; + Primitives, /*Direct=*/1, ProtoSend, 0>::sendPeerNotify(nvls->out, 1, 1); + } + } + } else { + if (tid < tidEndNetSend) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndGather, nThreadsNetSend, nullptr, &nvls->out, work->sendbuff, nullptr, + /*redOpArg=*/0, 0 * Proto::MaxGroupWidth, 1, 1); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + ssize_t railAllBeg = railGridOffset + part * chunkCount; + ssize_t railAllEnd = min(railAllBeg + chunkCount, nNodes * countPerRank); + ssize_t railOneBeg = ncclShmem.comm.node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; + ssize_t beg = max(railAllBeg, railOneBeg); + ssize_t end = min(railAllEnd, railOneEnd); + prims.send(beg - railOneBeg, max(ssize_t(0), end - beg)); + } + } else { + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndNetSend, nThreadsBcast, &nvls->out, &nvls->down, nullptr, nullptr, + /*redOpArg=*/0, 2 * Proto::MaxGroupWidth, 0, 0); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkSize = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); + } + } } } } @@ -254,7 +402,7 @@ struct RunWorkColl + template __device__ __forceinline__ void operator()( int tid, int tn, int slice, int maxSliceSize, int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag diff --git a/src/device/all_reduce.h b/src/device/all_reduce.h index 81da55401..f6b6e9c0e 100644 --- a/src/device/all_reduce.h +++ b/src/device/all_reduce.h @@ -106,7 +106,7 @@ namespace { for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { offset = gridOffset + elemOffset; nelem = min(chunkCount, channelCount - elemOffset); - prims.directSend(offset, nelem); + prims.directSend(offset, offset, nelem); } } else { diff --git a/src/device/common.h b/src/device/common.h index 855db730f..a2884b50c 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -52,7 +52,6 @@ struct ncclShmemData { uint16_t funcId; int nWorks; int workSize; - uint32_t workConsumed; uint64_t workCounter; bool profilerEnabled; struct ncclShmemGroup groups[NCCL_MAX_GROUPS]; @@ -182,7 +181,6 @@ __device__ __forceinline__ void loadWorkBatchToShmem( } if (tid == 0) { ncclShmem.workSize = workSize; - ncclShmem.workConsumed = batch.offsetBase + (64-__clzll(batch.offsetBitset))*workSize; } // We deliberately replicate these div and mod calculations into the case // blocks above so that they get constant divisor optimizations by the compiler. 
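For illustration only (not part of this patch): the constant-divisor comment above relies on the compiler strength-reducing a division by a compile-time constant into a multiply/shift sequence, which it cannot do when the divisor is only known at runtime. A hypothetical sketch:

    // Hypothetical illustration of the constant-divisor optimization.
    __device__ __forceinline__ int divByConst(int x) { return x / 12; }        // becomes mul+shift
    __device__ __forceinline__ int divByRuntime(int x, int d) { return x / d; } // real divide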
@@ -242,6 +240,12 @@ __device__ __forceinline__ void loadWorkBatchToShmem( } } +__device__ __forceinline__ unsigned long long int globaltimer() { + unsigned long long int timer; + asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(timer)); + return timer; +} + template struct RunWorkColl { __device__ void run(int tid, int tn, struct ncclDevWorkColl* work) { @@ -296,40 +300,30 @@ struct RunWorkBatch { #define STOP 1 #define FINI 2 -__device__ __forceinline__ bool profilerEnabled(void) { - // Check if any of the workItems in the batch is profiled. If so, there is an equivalent - // profiler ProxyOp waiting for the counter update in the host thread. If this check was - // done only for the first workItem the profiler counter for other workItems in the batch - // could never be updated, leaving the host thread spinning forever for the counter update - // and causing a hang. - bool enabled = false; - for (int i = 0; i < ncclShmem.nWorks && !enabled; i++) { - if (ncclShmem.workType == ncclDevWorkTypeP2p) - enabled = ((struct ncclDevWorkP2p*)ncclShmem.workStorage)[i].profilerEnabled; - else - enabled = ((struct ncclDevWorkColl*)ncclShmem.workStorage)[i].profilerEnabled; - } - return enabled; +__device__ __forceinline__ bool profilerEnabled(int workItemIdx) { + return (ncclShmem.workType == ncclDevWorkTypeP2p) ? + ((struct ncclDevWorkP2p*)ncclShmem.workStorage)[workItemIdx].profilerEnabled : + ((struct ncclDevWorkColl*)ncclShmem.workStorage)[workItemIdx].profilerEnabled; } __device__ __forceinline__ void profiler(int action) { - if (action == START) { - if (threadIdx.x == 0) { - // increment workCounter regardless of the profiler being active or not + if (threadIdx.x == 0) { + int idx = 0; + uint64_t wc = ncclShmem.channel.workCounter + 1; + if (action == START) { + for (; wc <= ncclShmem.channel.workCounter + ncclShmem.nWorks; wc++) { + if (!profilerEnabled(idx++)) continue; + ncclShmem.comm.workStarted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp = globaltimer(); + ncclShmem.comm.workStarted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc; + } + } else { + for (; wc <= ncclShmem.channel.workCounter + ncclShmem.nWorks; wc++) { + if (!profilerEnabled(idx++)) continue; + ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp = globaltimer(); + ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc; + } ncclShmem.channel.workCounter += ncclShmem.nWorks; - if(!profilerEnabled()) return; - ncclShmem.comm.workStarted[ncclShmem.channelId] = ncclShmem.channel.workCounter; - } - } else if (action == STOP) { - if (threadIdx.x == 0 && profilerEnabled()) { - ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; - } - } else { // FINI - if (threadIdx.x == 0) { - // store the workCounter back to vidmem regardless of the profiler being active or not - ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; - if (!profilerEnabled()) return; - ncclShmem.comm.workCompleted[ncclShmem.channelId] = ncclShmem.channel.workCounter; + if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; } } } @@ -388,11 +382,6 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a } __syncthreads(); // publish ncclShmem - if (tid == 0 && ncclShmem.args.workStorageType == 
ncclDevWorkStorageTypeFifo) { - // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads() - ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; - } - while (ncclShmem.aborted == 0) { profiler(START); if (0 <= SpecializedFnId && ncclShmem.funcId == (unsigned)SpecializedFnId) { @@ -407,11 +396,6 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a profiler(STOP); loadWorkBatchToShmem(tid, tn, args, batchIx); __syncthreads(); - - if (tid == 0 && ncclShmem.args.workStorageType == ncclDevWorkStorageTypeFifo) { - // ncclShmem.workConsumed written by loadWorkBatchToShmem before __syncthreads() - ncclShmem.comm.workConsumed[ncclShmem.channelId] = ncclShmem.workConsumed; - } } profiler(FINI); } diff --git a/src/device/generate.py b/src/device/generate.py index b69a2d7cc..f9c3a0e79 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -327,7 +327,7 @@ def partition_by_name(fns): out = f.write impl_names = sorted(name_to_funcs.keys()) names = impl_names + ["host_table.cc", "device_table.cu"] - out("LIB_OBJS_GEN = $(patsubst %, $(OBJDIR)/genobj/%.o, {names})\n" + out("LIB_OBJS_GEN = $(patsubst %,$(OBJDIR)/genobj/%.o,{names})\n" .format(names=" ".join(names))) out("\n") diff --git a/src/device/op128.h b/src/device/op128.h index b2e519d8c..e7da4812c 100644 --- a/src/device/op128.h +++ b/src/device/op128.h @@ -99,37 +99,60 @@ template<> union BytePack<0> {}; template<> union BytePack<1> { - uint8_t u8, native; + uint8_t u8[1], native; }; template<> union BytePack<2> { BytePack<1> half[2]; + BytePack<1> b1[2]; uint8_t u8[2]; - uint16_t u16, native; + uint16_t u16[1], native; }; template<> union BytePack<4> { BytePack<2> half[2]; + BytePack<1> b1[4]; + BytePack<2> b2[2]; uint8_t u8[4]; uint16_t u16[2]; - uint32_t u32, native; + uint32_t u32[1], native; }; template<> union BytePack<8> { BytePack<4> half[2]; + BytePack<1> b1[8]; + BytePack<2> b2[4]; + BytePack<4> b4[2]; uint8_t u8[8]; uint16_t u16[4]; uint32_t u32[2]; - uint64_t u64, native; + uint64_t u64[1], native; }; template<> union alignas(16) BytePack<16> { BytePack<8> half[2]; + BytePack<1> b1[16]; + BytePack<2> b2[8]; + BytePack<4> b4[4]; + BytePack<8> b8[2]; uint8_t u8[16]; uint16_t u16[8]; uint32_t u32[4]; uint64_t u64[2]; - ulong2 ul2, native; + ulong2 ul2[1], native; +}; +template +union BytePack { + BytePack half[2]; + BytePack<1> b1[Size]; + BytePack<2> b2[Size/2]; + BytePack<4> b4[Size/4]; + BytePack<8> b8[Size/8]; + BytePack<16> b16[Size/16]; + uint8_t u8[Size]; + uint16_t u16[Size/2]; + uint32_t u32[Size/4]; + uint64_t u64[Size/8]; }; template @@ -357,19 +380,19 @@ __device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0 } template<> __device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) { - asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory"); + asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.native) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) { - asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory"); + asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.native) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) { - asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory"); + asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.native) 
: "memory"); } template<> __device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) { - asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory"); + asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.native) : "memory"); } template<> __device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) { @@ -384,6 +407,56 @@ __device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack +__device__ __forceinline__ Pack loadPack(T* ptr, int ix, int end) { + constexpr int Size = sizeof(Pack); + ptr += ix; + int n = end - ix; + if (alignof(T) == Size && sizeof(T) == Size) { + return *(Pack*)ptr; + } else if ((Size+3)/4 + 1 < Size/sizeof(T)) { + union { Pack ans; uint32_t part[Size/4]; }; + int misalign = reinterpret_cast(ptr) % 4; + uint32_t* down = reinterpret_cast(reinterpret_cast(ptr) & -uintptr_t(4)); + int i; + #pragma unroll + for (i=0; i < Size/4; i++) { + if (i*4/sizeof(T) < 1 || i*4/sizeof(T) < n) part[i] = down[i]; + } + uint32_t extra; + if (misalign) extra = down[i]; + #pragma unroll + for (i=0; i < Size/4; i++) { + part[i] = __funnelshift_r(part[i], part[i+1], 8*misalign); + } + if (misalign) part[i] = __funnelshift_r(part[i], extra, 8*misalign); + return ans; + } else { + union { Pack ans; BytePack part[Size/sizeof(T)]; }; + #pragma unroll + for (int i=0; i < Size/sizeof(T); i++) { + if (i < 1 || i < n) part[i] = ((BytePack*)ptr)[i]; + } + return ans; + } +} + +// Store pack starting at index in array. Ignore elements past end (length of array). +template +__device__ __forceinline__ void storePack(T* ptr, int ix, int end, Pack val) { + constexpr int Size = sizeof(Pack); + union { Pack tmp; BytePack part[Size/sizeof(T)]; }; + tmp = val; + ptr += ix; + int n = end - ix; + #pragma unroll + for (int i=0; i < Size/sizeof(T); i++) { + if (i < 1 || i < n) ((BytePack*)ptr)[i] = part[i]; + } +} + + // Warp-uniform memory copy from shared address (not generic) to global memory. // The number of bytes copied is `min(MaxBytes, nBytesAhead)`, a negative value // is interpeted as zero. EltSize is the guaranteed alignment of the addresses and sizes. @@ -426,10 +499,10 @@ __device__ __forceinline__ void copyGlobalShared_WarpUnrolled( b4[3] = ld_shared<4>(srcAddr + 3*4); if (srcMisalign != 0) { BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4); - b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8); - b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8); - b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8); - b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8); + b4[0].native = __funnelshift_r(b4[0].native, b4[1].native, srcMisalign*8); + b4[1].native = __funnelshift_r(b4[1].native, b4[2].native, srcMisalign*8); + b4[2].native = __funnelshift_r(b4[2].native, b4[3].native, srcMisalign*8); + b4[3].native = __funnelshift_r(b4[3].native, b4_4.native, srcMisalign*8); } if (Multimem) multimem_st_global<16>(dstAddr, b16); else st_global<16>(dstAddr, b16); diff --git a/src/device/prims_simple.h b/src/device/prims_simple.h index cf3ba9b55..2ad965bf7 100644 --- a/src/device/prims_simple.h +++ b/src/device/prims_simple.h @@ -125,7 +125,7 @@ class Primitives< void **ptrs = isSendNotRecv ? 
(ncclShmem.groups[group].dsts + Dst) : (ncclShmem.groups[group].srcs + Src); - if (flags & NetRegMode) { + if ((flags & NetRegMode) && ((!isSendNotRecv && DirectRecv) || (isSendNotRecv && DirectSend))) { if (P2p) { ptrs[index] = NULL; } else { @@ -337,7 +337,7 @@ class Primitives< } template - __device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag, uint32_t recvDirectFlag) { + __device__ __forceinline__ void process(Fn &&fn, uint32_t sendDirectFlag = 0, uint32_t recvDirectFlag = 0) { #pragma unroll 1 for (int slice=0; slice < SlicePerChunk; slice++) { if (tid < nworkers) { @@ -361,7 +361,7 @@ class Primitives< } else if (flags & DirectRead) { // empty send ptrs[index] = nullptr; } else { - ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } else { if (flags & DirectRead) { @@ -372,11 +372,11 @@ class Primitives< else ptrs[index] = nullptr; } else { - ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } } else { - ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize; + ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*connStepSize; } } subBarrier(); @@ -391,7 +391,7 @@ class Primitives< } else { nsend = fan.nsend(); } - fn.template operator() < SlicePerChunk, 0, Recv*MaxRecv, 0, Send*MaxSend > + fn.template operator() (tid, nworkers, slice, stepSize * StepPerSlice, nrecv, ncclShmem.groups[group].srcs, nsend, ncclShmem.groups[group].dsts, ncclShmem.groups[group].dstSizes, sendDirectFlag, recvDirectFlag); @@ -896,6 +896,12 @@ class Primitives< __device__ __forceinline__ void directRecvDirectSend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) { genericOp<1, 1, 1, 1, -1, -1>(inpIx, outIx, eltN, postOp); } + __device__ __forceinline__ void recvDirectSend(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<0, 1, 1, 1, -1, -1>(-1, outIx, eltN, postOp); + } + __device__ __forceinline__ void directRecvSend(intptr_t outIx, int eltN, bool postOp=false) { + genericOp<1, 0, 1, 1, -1, -1>(outIx, outIx, eltN, postOp); + } __device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) { genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp); } diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index c2378e3df..0d054bb2d 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -38,18 +38,18 @@ struct IsFloatingPoint: std::true_type {}; // 3. Have constructor taking `uint64_t opArg`. 
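For illustration only (not part of this patch): combining requirement 3 above with the FuncSum/FuncProd definitions just below, a reduction functor has roughly this shape (FuncExample is hypothetical):

    #include <cstdint>
    // Hypothetical functor following the same pattern as FuncSum below:
    // an EltType alias plus a constructor accepting the 64-bit op argument.
    template<typename T>
    struct FuncExample {
      using EltType = T;
      __device__ __forceinline__ FuncExample(uint64_t opArg = 0) {}
    };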
template -struct FuncCopy { using EltType = T; __device__ FuncCopy(uint64_t opArg=0) {}; }; +struct FuncCopy { using EltType = T; __device__ __forceinline__ FuncCopy(uint64_t opArg=0) {}; }; template -struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; }; +struct FuncSum { using EltType = T; __device__ __forceinline__ FuncSum(uint64_t opArg=0) {}; }; template -struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; }; +struct FuncProd { using EltType = T; __device__ __forceinline__ FuncProd(uint64_t opArg=0) {}; }; template struct FuncMinMax { using EltType = T; BytePack xormask; // only used by integers bool isMinNotMax; // only used by floats - __device__ FuncMinMax(uint64_t opArg=0) { + __device__ __forceinline__ FuncMinMax(uint64_t opArg=0) { xormask.native = opArg; isMinNotMax = (opArg&1)==0; } @@ -64,13 +64,13 @@ template struct FuncSumPostDiv; template struct RedOpArg { // default case: no argument static constexpr bool ArgUsed = false; - __device__ static uint64_t loadArg(void *ptr) { return 0; } + __device__ __forceinline__ static uint64_t loadArg(void *ptr) { return 0; } }; template struct RedOpArg> { static constexpr bool ArgUsed = true; - __device__ static uint64_t loadArg(void *ptr) { + __device__ __forceinline__ static uint64_t loadArg(void *ptr) { union { uint64_t u64; T val; }; u64 = 0; val = *(T*)ptr; @@ -84,6 +84,11 @@ struct RedOpArg> { // of elements. These classes are intended to be specialized for specific // combinations of reduction function and pack size. +template +struct Apply_Cast/*{ + static BytePack cast(BytePack a); +}*/; + template struct Apply_Reduce /*{ static BytePack reduce( @@ -111,16 +116,60 @@ struct Apply_LoadMultimem/*{ static BytePack load(Fn fn, uintptr_t addr); }*/; + +// Helpers for dealing with BytePack<0>'s +template +struct Apply_Cast_MaybeEmpty: Apply_Cast {}; +template +struct Apply_Cast_MaybeEmpty { + __device__ constexpr static BytePack<0> cast(BytePack<0> a) { return {}; } +}; + +template +struct Apply_Reduce_MaybeEmpty: Apply_Reduce {}; +template +struct Apply_Reduce_MaybeEmpty { + __device__ constexpr static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { return {}; } +}; + +template +struct Apply_PreOp_MaybeEmpty: Apply_PreOp {}; +template +struct Apply_PreOp_MaybeEmpty { + static constexpr bool IsIdentity = true; + __device__ constexpr static BytePack<0> preOp(Fn fn, BytePack<0> a) { return {}; } +}; + +template +struct Apply_PostOp_MaybeEmpty: Apply_PostOp {}; +template +struct Apply_PostOp_MaybeEmpty { + static constexpr bool IsIdentity = true; + __device__ constexpr static BytePack<0> postOp(Fn fn, BytePack<0> a) { return {}; } +}; + +template +struct Apply_LoadMultimem_MaybeEmpty: Apply_LoadMultimem {}; +template +struct Apply_LoadMultimem_MaybeEmpty { + __device__ constexpr static BytePack<0> load(Fn fn, uintptr_t addr) { return {}; } +}; + //////////////////////////////////////////////////////////////////////////////// // Public API for calling the trait classes. These take the data elements as a // pack of any type, which could be a BytePack or any integral type (uint64_t, // uint32_t, etc.), and will return a new pack where each element has been // transformed appropriately. 
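For illustration only (not part of this patch): a host-side analogue, in plain C++ with hypothetical names, of what applyReduce with a FuncSum over an 8-byte pack of two floats amounts to:

    #include <cstdint>
    #include <cstring>
    // Treat an 8-byte pack as two floats and reduce elementwise -- the same
    // effect applyReduce(FuncSum<float>(), a, b) has on a BytePack<8> on device.
    static uint64_t sumPackOfTwoFloats(uint64_t a, uint64_t b) {
      float fa[2], fb[2];
      std::memcpy(fa, &a, sizeof(fa));
      std::memcpy(fb, &b, sizeof(fb));
      fa[0] += fb[0];
      fa[1] += fb[1];
      uint64_t out;
      std::memcpy(&out, fa, sizeof(out));
      return out;
    }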
+template +__device__ __forceinline__ BytePack::Size*sizeof(B)/sizeof(A)> applyCast(PackA a) { + return Apply_Cast_MaybeEmpty::Size/sizeof(A)>::cast(toPack(a)); +} + template __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) { return fromPack( - Apply_Reduce::Size/sizeof(typename Fn::EltType)> + Apply_Reduce_MaybeEmpty::Size/sizeof(typename Fn::EltType)> ::reduce(fn, toPack(a), toPack(b)) ); } @@ -128,7 +177,7 @@ __device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) { template __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) { return fromPack( - Apply_PreOp::Size/sizeof(typename Fn::EltType)> + Apply_PreOp_MaybeEmpty::Size/sizeof(typename Fn::EltType)> ::preOp(fn, toPack(a)) ); } @@ -136,23 +185,107 @@ __device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) { template __device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) { return fromPack( - Apply_PostOp::Size/sizeof(typename Fn::EltType)> + Apply_PostOp_MaybeEmpty::Size/sizeof(typename Fn::EltType)> ::postOp(fn, toPack(a)) ); } template __device__ __forceinline__ BytePack applyLoadMultimem(Fn fn, uintptr_t addr) { - return Apply_LoadMultimem::load(fn, addr); + return Apply_LoadMultimem_MaybeEmpty::load(fn, addr); } +//////////////////////////////////////////////////////////////////////////////// +// Apply_Cast + +template +struct Apply_Cast { + __device__ __forceinline__ static BytePack cast(BytePack a) { + BytePack b; + b.half[0] = Apply_Cast::cast(a.half[0]); + b.half[1] = Apply_Cast::cast(a.half[1]); + return b; + } +}; + +template +struct Apply_Cast { + __device__ __forceinline__ static BytePack cast(BytePack a) { + return toPack(B(fromPack(a))); + } +}; + +template<> +struct Apply_Cast<__half, float, /*EltPerPack=*/1> { + __device__ __forceinline__ static BytePack cast(BytePack a) { + return toPack(__half2float(fromPack<__half>(a))); + } +}; +template<> +struct Apply_Cast { + __device__ __forceinline__ static BytePack cast(BytePack a) { + return toPack(__float2half_rn(fromPack(a))); + } +}; + +template<> +struct Apply_Cast<__half, float, /*EltPerPack=*/2> { + __device__ __forceinline__ static BytePack<4*2> cast(BytePack<2*2> a) { + return toPack(__half22float2(fromPack<__half2>(a))); + } +}; +template<> +struct Apply_Cast { + __device__ __forceinline__ static BytePack<2*2> cast(BytePack<4*2> a) { + return toPack(__float22half2_rn(fromPack(a))); + } +}; + +#if defined(__CUDA_BF16_TYPES_EXIST__) && (CUDART_RUNTIME >= 12000 || __CUDA_ARCH__ >= 800) +template<> +struct Apply_Cast<__nv_bfloat16, float, /*EltPerPack=*/2> { + __device__ __forceinline__ static BytePack<4*2> cast(BytePack<2*2> a) { + return toPack(__bfloat1622float2(fromPack<__nv_bfloat162>(a))); + } +}; +template<> +struct Apply_Cast { + __device__ __forceinline__ static BytePack<2*2> cast(BytePack<4*2> a) { + return toPack(__float22bfloat162_rn(fromPack(a))); + } +}; +#endif + +#define EASY_CAST(A, B, EltPerPack, VecA, VecB) \ + template<> \ + struct Apply_Cast { \ + __device__ __forceinline__ static BytePack cast(BytePack a) { \ + return toPack(VecB(fromPack(a))); \ + } \ + }; \ + template<> \ + struct Apply_Cast { \ + __device__ __forceinline__ static BytePack cast(BytePack b) { \ + return toPack(VecA(fromPack(b))); \ + } \ + }; + +#if defined(__CUDA_FP8_TYPES_EXIST__) +EASY_CAST(__nv_fp8_e5m2, float, 2, __nv_fp8x2_e5m2, float2) +EASY_CAST(__nv_fp8_e5m2, float, 4, __nv_fp8x4_e5m2, float4) + +EASY_CAST(__nv_fp8_e4m3, float, 2, __nv_fp8x2_e4m3, float2) +EASY_CAST(__nv_fp8_e4m3, float, 4, __nv_fp8x4_e4m3, float4) +#endif +#undef 
EASY_CAST + //////////////////////////////////////////////////////////////////////////////// // Apply_Reduce // Nonsensical base case template struct Apply_Reduce { - __device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { + __device__ __forceinline__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) { return {}; } }; @@ -164,7 +297,7 @@ struct Apply_Reduce { template struct Apply_Reduce { template - __device__ static BytePack reduce(Fn fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(Fn fn, BytePack a, BytePack b) { a.half[0] = Apply_Reduce::reduce(fn, a.half[0], b.half[0]); a.half[1] = Apply_Reduce::reduce(fn, a.half[1], b.half[1]); return a; @@ -174,25 +307,25 @@ struct Apply_Reduce { // Base case definitions (EltPerPack == 1) template struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncCopy fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncCopy fn, BytePack a, BytePack b) { return a; } }; template struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncSum fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncSum fn, BytePack a, BytePack b) { return toPack(fromPack(a) + fromPack(b)); } }; template struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncProd fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncProd fn, BytePack a, BytePack b) { return toPack(fromPack(a) * fromPack(b)); } }; template struct Apply_Reduce, /*EltPerPack=*/1> { - __device__ static BytePack reduce(FuncMinMax fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncMinMax fn, BytePack a, BytePack b) { return (a.native ^ fn.xormask.native) < (b.native ^ fn.xormask.native) ? 
a : b; } }; @@ -200,7 +333,7 @@ struct Apply_Reduce, /*EltPerPack=*/1> { // Optimizations for specfic types and element count combinations: template<> struct Apply_Reduce, /*EltPerPack=*/4> { - __device__ static BytePack<4> reduce(FuncSum fn, BytePack<4> a, BytePack<4> b) { + __device__ __forceinline__ static BytePack<4> reduce(FuncSum fn, BytePack<4> a, BytePack<4> b) { constexpr uint32_t even = 0x00ff00ffu; uint32_t x = (a.native & even) + (b.native & even); uint32_t y = (a.native & ~even) + (b.native & ~even); @@ -236,7 +369,7 @@ struct Apply_Reduce, /*EltPerPack=*/4> { template<> struct Apply_Reduce, /*EltPerPack=*/4> { - __device__ static BytePack<4> reduce(FuncProd fn, BytePack<4> apack, BytePack<4> bpack) { + __device__ __forceinline__ static BytePack<4> reduce(FuncProd fn, BytePack<4> apack, BytePack<4> bpack) { uint32_t a = apack.native; uint32_t b = bpack.native; uint32_t ab0 = (a*b) & 0xffu; @@ -332,7 +465,7 @@ template struct Apply_PreOp { static constexpr bool IsIdentity = Apply_PreOp::IsIdentity; template - __device__ static BytePack preOp(Fn fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(Fn fn, BytePack a) { #if __cpp_if_constexpr if constexpr(!IsIdentity) { #else @@ -352,7 +485,7 @@ template struct Apply_PreOp { static constexpr bool IsIdentity = true; template - __device__ static BytePack preOp(Fn fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(Fn fn, BytePack a) { return a; } }; @@ -360,7 +493,7 @@ struct Apply_PreOp { template struct Apply_PreOp { static constexpr bool IsIdentity = true; - __device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) { + __device__ __forceinline__ static BytePack<0> preOp(Fn fn, BytePack<0> a) { return {}; } }; @@ -373,7 +506,7 @@ template struct Apply_PostOp { static constexpr bool IsIdentity = Apply_PostOp::IsIdentity; template - __device__ static BytePack postOp(Fn fn, BytePack a) { + __device__ __forceinline__ static BytePack postOp(Fn fn, BytePack a) { #if __cpp_if_constexpr if constexpr(!IsIdentity) { #else @@ -393,7 +526,7 @@ template struct Apply_PostOp { static constexpr bool IsIdentity = true; template - __device__ static BytePack postOp(Fn fn, BytePack a) { + __device__ __forceinline__ static BytePack postOp(Fn fn, BytePack a) { return a; } }; @@ -401,7 +534,7 @@ struct Apply_PostOp { template struct Apply_PostOp { static constexpr bool IsIdentity = true; - __device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) { + __device__ __forceinline__ static BytePack<0> postOp(Fn fn, BytePack<0> a) { return {}; } }; @@ -413,7 +546,7 @@ struct Apply_PostOp { template struct RedOpArg> { static constexpr bool ArgUsed = true; - __device__ static uint64_t loadArg(void *ptr) { + __device__ __forceinline__ static uint64_t loadArg(void *ptr) { union { uint64_t u64; T val; }; u64 = 0; val = *(T*)ptr; @@ -426,7 +559,7 @@ template struct FuncPreMulSum { using EltType = T; T scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; T val; }; u64 = opArg; scalar = val; @@ -441,7 +574,7 @@ struct FuncPreMulSum { using EltType = half; #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 __half2 scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __half val; }; u64 = opArg; scalar.x = val; @@ -449,7 +582,7 @@ struct FuncPreMulSum { } #else float scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ 
FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __half val; }; u64 = opArg; scalar = (float)val; @@ -466,7 +599,7 @@ struct FuncPreMulSum { using EltType = __nv_bfloat16; #if __CUDA_ARCH__ >= 800 __nv_bfloat162 scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __nv_bfloat16 val; }; u64 = opArg; scalar.x = val; @@ -474,7 +607,7 @@ struct FuncPreMulSum { } #else float scalar; - __device__ FuncPreMulSum(uint64_t opArg=0) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg=0) { union { uint64_t u64; __nv_bfloat16 val; }; u64 = opArg; scalar = __bfloat162float(val); @@ -489,7 +622,7 @@ struct FuncPreMulSum { struct FuncPreMulSum<__nv_fp8_e4m3> { using EltType = __nv_fp8_e4m3; __half2 scalar2; - __device__ FuncPreMulSum(uint64_t opArg) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg) { union { uint64_t u64; __nv_fp8_storage_t val; }; u64 = opArg; scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E4M3)); @@ -501,7 +634,7 @@ struct FuncPreMulSum { struct FuncPreMulSum<__nv_fp8_e5m2> { using EltType = __nv_fp8_e5m2; __half2 scalar2; - __device__ FuncPreMulSum(uint64_t opArg) { + __device__ __forceinline__ FuncPreMulSum(uint64_t opArg) { union { uint64_t u64; __nv_fp8_storage_t val; }; u64 = opArg; scalar2.x = __half(__nv_cvt_fp8_to_halfraw(val, __NV_E5M2)); @@ -513,7 +646,7 @@ struct FuncPreMulSum { template struct Apply_Reduce, EltPerPack> { - __device__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncPreMulSum fn, BytePack a, BytePack b) { // FuncPreMulSum reduce dispatches to FuncSum. return Apply_Reduce, EltPerPack>::reduce(FuncSum(), a, b); } @@ -523,7 +656,7 @@ struct Apply_Reduce, EltPerPack> { template struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { return toPack(fromPack(a) * fn.scalar); } }; @@ -534,7 +667,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { #if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610 return toPack(__hmul(fromPack(a), fn.scalar.x)); #else @@ -546,7 +679,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { + __device__ __forceinline__ static BytePack preOp(FuncPreMulSum fn, BytePack a) { return toPack(__hmul2(fromPack(a), fn.scalar)); } }; @@ -559,7 +692,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_bfloat16> fn, BytePack a ) { #if __CUDA_ARCH__ >= 800 @@ -573,7 +706,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_bfloat16> fn, BytePack a ) { return toPack<__nv_bfloat162>(__hmul2(fromPack<__nv_bfloat162>(a), fn.scalar)); @@ 
-590,7 +723,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack a ) { return toPack<__nv_fp8_e4m3>(__nv_fp8_e4m3(__hmul(__half(fromPack<__nv_fp8_e4m3>(a)), fn.scalar2.x))); @@ -599,7 +732,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_fp8_e4m3> fn, BytePack a ) { return toPack<__nv_fp8x2_e4m3>(__nv_fp8x2_e4m3(__hmul2(__half2(fromPack<__nv_fp8x2_e4m3>(a)), fn.scalar2))); @@ -609,7 +742,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack a ) { return toPack<__nv_fp8_e5m2>(__nv_fp8_e5m2(__hmul(__half(fromPack<__nv_fp8_e5m2>(a)), fn.scalar2.x))); @@ -618,7 +751,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template<> struct Apply_PreOp, /*EltPerPack=*/2> { static constexpr bool IsIdentity = false; - __device__ static BytePack preOp( + __device__ __forceinline__ static BytePack preOp( FuncPreMulSum<__nv_fp8_e5m2> fn, BytePack a ) { return toPack<__nv_fp8x2_e5m2>(__nv_fp8x2_e5m2(__hmul2(__half2(fromPack<__nv_fp8x2_e5m2>(a)), fn.scalar2))); @@ -633,7 +766,7 @@ struct Apply_PreOp, /*EltPerPack=*/1> { template struct RedOpArg> { static constexpr bool ArgUsed = true; - __device__ static uint64_t loadArg(void *ptr) { + __device__ __forceinline__ static uint64_t loadArg(void *ptr) { return *(uint64_t*)ptr; } }; @@ -646,12 +779,12 @@ struct FuncSumPostDiv { uint32_t divisor:31, isSigned:1; UintType recip; - __device__ FuncSumPostDiv(uint64_t opArg=0) { + __device__ __forceinline__ FuncSumPostDiv(uint64_t opArg=0) { isSigned = opArg & 1; divisor = opArg >> 1; recip = UintType(-1)/divisor; } - __device__ T divide(T x) { + __device__ __forceinline__ T divide(T x) { // x is negative iff we are in signed mode and the top bit is set bool xneg = isSigned && (x & ~(T(-1)>>1)); // Compute abs(x): @@ -673,7 +806,7 @@ struct FuncSumPostDiv { template struct Apply_Reduce, EltPerPack>: Apply_Reduce, EltPerPack> { - __device__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { + __device__ __forceinline__ static BytePack reduce(FuncSumPostDiv fn, BytePack a, BytePack b) { // FuncSumPostDiv reduce dispatches to FuncSum. 
return Apply_Reduce, EltPerPack>::reduce(FuncSum(), a, b); } @@ -682,7 +815,7 @@ struct Apply_Reduce, EltPerPack>: template struct Apply_PostOp, /*EltPerPack=*/1> { static constexpr bool IsIdentity = false; - __device__ static BytePack postOp(FuncSumPostDiv fn, BytePack a) { + __device__ __forceinline__ static BytePack postOp(FuncSumPostDiv fn, BytePack a) { return toPack(fn.divide(fromPack(a))); } }; @@ -690,120 +823,145 @@ struct Apply_PostOp, /*EltPerPack=*/1> { //////////////////////////////////////////////////////////////////////////////// // Apply_LoadMultimem -#define SIZEOF_BytePack_field_u16 2 -#define PTX_REG_BytePack_field_u16 "h" - -#define SIZEOF_BytePack_field_u32 4 -#define PTX_REG_BytePack_field_u32 "r" - -#define SIZEOF_BytePack_field_u64 8 -#define PTX_REG_BytePack_field_u64 "l" +#define RegCode_for_size_1 "r" +#define RegCode_for_size_2 "h" +#define RegCode_for_size_4 "r" +#define RegCode_for_size_8 "l" + +#define RegSize_for_size_1 4 +#define RegSize_for_size_2 2 +#define RegSize_for_size_4 4 +#define RegSize_for_size_8 8 + +#define PtxAcc_for_u32 +#define PtxAcc_for_s32 +#define PtxAcc_for_s64 +#define PtxAcc_for_u64 +#define PtxAcc_for_f32 +#define PtxAcc_for_f64 +#if CUDART_VERSION >= 12020 + #define PtxAcc_for_f16 ".acc::f32" + #define PtxAcc_for_bf16 ".acc::f32" + #define PtxAcc_for_f16x2 ".acc::f32" + #define PtxAcc_for_bf16x2 ".acc::f32" +#else + #define PtxAcc_for_f16 + #define PtxAcc_for_bf16 + #define PtxAcc_for_f16x2 + #define PtxAcc_for_bf16x2 +#endif +#define PtxAcc_for_e4m3 ".acc::f16" +#define PtxAcc_for_e5m2 ".acc::f16" +#define PtxAcc_for_e4m3x4 ".acc::f16" +#define PtxAcc_for_e5m2x4 ".acc::f16" -#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_sum(T, ptx_ty, PackSize) \ template<> \ - struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ - static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ - __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ - BytePack ans; \ - asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ + struct Apply_LoadMultimem, PackSize> { \ + __device__ __forceinline__ static BytePack load(FuncSum fn, uintptr_t addr) { \ + BytePack reg; \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.add" PtxAcc_for_##ptx_ty "." #ptx_ty " %0, [%1];" \ + : "=" RegCode_for_size_##PackSize(reg.native) \ : "l"(addr) : "memory"); \ + BytePack ans; \ + ans.native = reg.native; \ return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_minmax(T, ptx_ty, PackSize) \ template<> \ - struct Apply_LoadMultimem, SIZEOF_BytePack_field_##pack_field> { \ - static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \ - __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ - BytePack ans; \ + struct Apply_LoadMultimem, PackSize> { \ + __device__ __forceinline__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ + BytePack reg; \ if (fn.isMinNotMax) { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ + : "=" RegCode_for_size_##PackSize(reg.native) \ : "l"(addr) : "memory"); \ } else { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.max." 
#ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \ + : "=" RegCode_for_size_##PackSize(reg.native) \ : "l"(addr) : "memory"); \ } \ + BytePack ans; \ + ans.native = reg.native; \ return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, VecEltSize) \ template<> \ - struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ - static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ - __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ - BytePack ans; \ - asm volatile("multimem.ld_reduce.relaxed.sys.global.add.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ + struct Apply_LoadMultimem, 4*(VecEltSize)> { \ + static constexpr int PackSize = 4*(VecEltSize); \ + __device__ __forceinline__ static BytePack load(FuncSum fn, uintptr_t addr) { \ + union { BytePack ans; BytePack elts[4]; }; \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.add" PtxAcc_for_##ptx_ty ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ + : "=" RegCode_for_size_##VecEltSize(elts[0].native), \ + "=" RegCode_for_size_##VecEltSize(elts[1].native), \ + "=" RegCode_for_size_##VecEltSize(elts[2].native), \ + "=" RegCode_for_size_##VecEltSize(elts[3].native) \ : "l"(addr) : "memory"); \ return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, VecEltSize) \ template<> \ - struct Apply_LoadMultimem, 4*(SIZEOF_BytePack_field_##pack_field)> { \ - static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \ - __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ - BytePack ans; \ + struct Apply_LoadMultimem, 4*(VecEltSize)> { \ + static constexpr int PackSize = 4*(VecEltSize); \ + __device__ __forceinline__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ + union { BytePack ans; BytePack elts[4]; }; \ if (fn.isMinNotMax) { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.min.v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ + : "=" RegCode_for_size_##VecEltSize(elts[0].native), \ + "=" RegCode_for_size_##VecEltSize(elts[1].native), \ + "=" RegCode_for_size_##VecEltSize(elts[2].native), \ + "=" RegCode_for_size_##VecEltSize(elts[3].native) \ : "l"(addr) : "memory"); \ } else { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.max.v4." 
#ptx_ty " {%0,%1,%2,%3}, [%4];" \ - : "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \ - "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \ + : "=" RegCode_for_size_##VecEltSize(elts[0].native), \ + "=" RegCode_for_size_##VecEltSize(elts[1].native), \ + "=" RegCode_for_size_##VecEltSize(elts[2].native), \ + "=" RegCode_for_size_##VecEltSize(elts[3].native) \ : "l"(addr) : "memory"); \ } \ return ans; \ } \ }; -#define DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(T, ptx_ty, pack_field) \ - DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(T, ptx_ty, VecEltSize) \ + DEFINE_Apply_LoadMultimem_sum_v4(T, ptx_ty, VecEltSize) \ template<> \ struct Apply_LoadMultimem, sizeof(T)> { \ - __device__ static BytePack load(FuncSum fn, uintptr_t addr) { \ - BytePack<2*sizeof(T)> tmp; \ - asm volatile("multimem.ld_reduce.relaxed.sys.global.add." #ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ - : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \ - return tmp.half[(addr/sizeof(T))%2]; \ + __device__ __forceinline__ static BytePack load(FuncSum fn, uintptr_t addr) { \ + union { BytePack tmp; BytePack elts[(VecEltSize)/sizeof(T)]; }; \ + asm volatile("multimem.ld_reduce.relaxed.sys.global.add" PtxAcc_for_##ptx_ty "." #ptx_ty " %0, [%1];" \ + : "=" RegCode_for_size_##VecEltSize(tmp.native) \ + : "l"(addr & -uintptr_t(VecEltSize)) : "memory"); \ + return elts[(addr/sizeof(T))%((VecEltSize)/sizeof(T))]; \ } \ }; -#define DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(T, ptx_ty, pack_field) \ - DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, pack_field) \ +#define DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(T, ptx_ty, VecEltSize) \ + DEFINE_Apply_LoadMultimem_minmax_v4(T, ptx_ty, VecEltSize) \ template<> \ struct Apply_LoadMultimem, sizeof(T)> { \ - __device__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ - BytePack<2*sizeof(T)> tmp; \ + __device__ __forceinline__ static BytePack load(FuncMinMax fn, uintptr_t addr) { \ + union { BytePack tmp; BytePack elts[(VecEltSize)/sizeof(T)]; }; \ if (fn.isMinNotMax) { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.min." #ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ - : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \ + : "=" RegCode_for_size_##VecEltSize(tmp.native) \ + : "l"(addr & -uintptr_t(VecEltSize)) : "memory"); \ } else { \ asm volatile("multimem.ld_reduce.relaxed.sys.global.max." 
#ptx_ty " %0, [%1];" \ - : "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \ - : "l"(addr & -uintptr_t(2*sizeof(T))) : "memory"); \ + : "=" RegCode_for_size_##VecEltSize(tmp.native) \ + : "l"(addr & -uintptr_t(VecEltSize)) : "memory"); \ } \ - return tmp.half[(addr/sizeof(T))%2]; \ + return elts[(addr/sizeof(T))%((VecEltSize)/sizeof(T))]; \ } \ }; template struct Apply_LoadMultimem { - __device__ static BytePack load(Fn fn, uintptr_t addr) { + __device__ __forceinline__ static BytePack load(Fn fn, uintptr_t addr) { __trap(); return {}; } @@ -826,29 +984,36 @@ struct Apply_LoadMultimem { /*multimem.ld_reduce not supported:*/ 0; }; - DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, u32) - DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, u32) + DEFINE_Apply_LoadMultimem_sum(uint32_t, u32, 4) + DEFINE_Apply_LoadMultimem_minmax(uint32_t, u32, 4) - DEFINE_Apply_LoadMultimem_sum(int32_t, s32, u32) - DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, u32) + DEFINE_Apply_LoadMultimem_sum(int32_t, s32, 4) + DEFINE_Apply_LoadMultimem_minmax(int32_t, s32, 4) - DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, u64) - DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, u64) + DEFINE_Apply_LoadMultimem_sum(uint64_t, u64, 8) + DEFINE_Apply_LoadMultimem_minmax(uint64_t, u64, 8) - DEFINE_Apply_LoadMultimem_sum(int64_t, u64, u64) - DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, u64) + DEFINE_Apply_LoadMultimem_sum(int64_t, u64, 8) + DEFINE_Apply_LoadMultimem_minmax(int64_t, s64, 8) - DEFINE_Apply_LoadMultimem_sum(float, f32, u32) - DEFINE_Apply_LoadMultimem_sum_v4(float, f32, u32) + DEFINE_Apply_LoadMultimem_sum(float, f32, 4) + DEFINE_Apply_LoadMultimem_sum_v4(float, f32, 4) - DEFINE_Apply_LoadMultimem_sum(double, f64, u64) + DEFINE_Apply_LoadMultimem_sum(double, f64, 8) - DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(half, f16x2, u32) - DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(half, f16x2, u32) + DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(half, f16x2, 4) + DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(half, f16x2, 4) #if defined(__CUDA_BF16_TYPES_EXIST__) - DEFINE_Apply_LoadMultimem_sum_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32) - DEFINE_Apply_LoadMultimem_minmax_v4x2_and_subhalf(__nv_bfloat16, bf16x2, u32) + DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_bfloat16, bf16x2, 4) + DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_bfloat16, bf16x2, 4) + #endif + + #if NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 + DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) + DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) + DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4) + DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4) #endif #else template @@ -860,11 +1025,29 @@ struct Apply_LoadMultimem { #undef DEFINE_Apply_LoadMultimem #undef DEFINE_Apply_LoadMultimem_v4 #undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf -#undef SIZEOF_BytePack_field_u64 -#undef PTX_REG_BytePack_field_u64 -#undef SIZEOF_BytePack_field_u32 -#undef PTX_REG_BytePack_field_u32 -#undef SIZEOF_BytePack_field_u16 -#undef PTX_REG_BytePack_field_u16 + +#undef RegCode_for_size_2 +#undef RegCode_for_size_4 +#undef RegCode_for_size_8 + +#undef RegSize_for_size_1 +#undef RegSize_for_size_2 +#undef RegSize_for_size_4 +#undef RegSize_for_size_8 + +#undef PtxAcc_for_u32 +#undef PtxAcc_for_s32 +#undef PtxAcc_for_s64 +#undef PtxAcc_for_u64 +#undef PtxAcc_for_f32 
+#undef PtxAcc_for_f64 +#undef PtxAcc_for_f16 +#undef PtxAcc_for_bf16 +#undef PtxAcc_for_f16x2 +#undef PtxAcc_for_bf16x2 +#undef PtxAcc_for_e4m3 +#undef PtxAcc_for_e5m2 +#undef PtxAcc_for_e4m3x4 +#undef PtxAcc_for_e5m2x4 #endif // REDUCE_KERNEL_H_ diff --git a/src/device/reduce_scatter.h b/src/device/reduce_scatter.h index 5d8de2819..63b981b09 100644 --- a/src/device/reduce_scatter.h +++ b/src/device/reduce_scatter.h @@ -142,82 +142,206 @@ struct RunWorkColl struct RunWorkColl { + template + struct Scatterer { + struct ncclDevWorkColl* work; + int chunkCount; + ssize_t railGridOffset; + + template + __device__ __forceinline__ void operator()( + int tid, int tn, int slice, int maxSliceSize, + int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag + ) { + static_assert(SlicePerChunk == 1, "require: SlicePerChunk==1"); + static_assert(MaxDsts <= 1 || MaxSrcs <= 1, "require: MaxDsts<=1 || MaxSrcs<=1"); + + struct ncclNvls* nvls = &ncclShmem.channel.nvls; + int nNodes = ncclShmem.comm.nNodes; + int nRails = nvls->nHeads; + int part = ncclShmem.channelId - work->channelLo; + void* inbuf = (void*)work->sendbuff; + ssize_t countPerRank = work->collnet.count; + + ssize_t railAllBeg = min(railGridOffset + part * chunkCount, nNodes * countPerRank); + ssize_t railAllEnd = min(railAllBeg + chunkCount, nNodes * countPerRank); + int railAllSize = railAllEnd - railAllBeg; + int rail = nvls->headRank; + int dst = 0; + if (ReduceSendNotRecv) { + if (work->regUsed) return; + rail = 0; + nSrcs = 1; + } else { + rail = nvls->headRank; + } + if (tid < nDsts) dstSizes[tid] = railAllSize; + do { + int node = railAllBeg / countPerRank; + int railAllOffset = 0; + while (railAllOffset < railAllSize) { + ssize_t railOneBeg = node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; + ssize_t railOneOffset = (railAllBeg + railAllOffset) - railOneBeg; + int delta = min(railAllEnd, railOneEnd) - (railAllBeg + railAllOffset); + int rank = ncclShmem.comm.collNetDenseToUserRank[node * nRails + rail]; + ssize_t userOneBeg = rank * countPerRank + railOneOffset; + if (nDsts != 0) { + reduceCopy + (tid, tn, work->redOpArg, &work->redOpArg, false, + /*nSrcs=*/nSrcs, [=]__device__(int s) { + return work->regUsed ? (T*)srcPtrs[s] + userOneBeg : + !ReduceSendNotRecv ? (T*)srcPtrs[s] + railAllOffset: + (T*)inbuf + userOneBeg; + }, + /*nDsts=*/1, [=]__device__(int d/*==0*/) { + return (T*)dstPtrs[dst] + railAllOffset; + }, delta); + } + railAllOffset += delta; + node += 1; + } + dst += 1; + rail += 1; + } while (ReduceSendNotRecv && dst < nRails); + } + }; + __device__ __forceinline__ void run(int tid, int/*nthreads*/, struct ncclDevWorkColl* work) { struct ncclNvls* nvls = &ncclShmem.channel.nvls; - size_t count; - size_t gridOffset; - size_t channelCount; - size_t chunkCount; - ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); - const int rank = ncclShmem.comm.rank; - const int nranks = ncclShmem.comm.nRanks; - size_t offset; int nelem; /* if we are direct NVLS, we only need to allocate 1 warp to scatter for sync; * if not, based on #ranks, we allocate 7 or 5 warps to reduce to saturate bandwidth * and the rest are allocated to scatter. */ - const int nThreadsReduce = work->regUsed ? (NCCL_MAX_NTHREADS - WARP_SIZE) : (nranks <= 6 ? 7 * WARP_SIZE : 5 * WARP_SIZE); - const int nThreadsScatter = work->regUsed ? 
WARP_SIZE : (NCCL_MAX_NTHREADS - nThreadsReduce); - const int tidEndScatter = nThreadsScatter; + const int nThreadsNetRecv = work->oneNode ? 0 : (work->netRegUsed ? WARP_SIZE : 6 * WARP_SIZE); + const int nThreadsScatter = work->regUsed ? roundUp(nvls->nHeads << 2, WARP_SIZE) : 8 * WARP_SIZE; + const int nThreadsReduce = NCCL_MAX_NTHREADS - nThreadsNetRecv - nThreadsScatter; + const int tidEndNetRecv = nThreadsNetRecv; + const int tidEndScatter = tidEndNetRecv + nThreadsScatter; const int tidEndReduce = tidEndScatter + nThreadsReduce; - if (!work->regUsed) { - if (tid < tidEndScatter) { - // Scatter - using Proto = ProtoSimple<1, 1, COLL_UNROLL>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, - work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); - prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0); + if (work->oneNode) { + const int rank = ncclShmem.comm.rank; + size_t offset; + size_t count, gridOffset, channelCount, chunkCount; + ncclCollCbdPart(work, ncclShmem.channelId, NCCL_PROTO_SIMPLE, sizeof(T), &count, &gridOffset, &channelCount, &chunkCount); + if (!work->regUsed) { + if (tid < tidEndScatter) { + // Scatter + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsScatter, NULL, nvls->up, work->sendbuff, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + prims.scatter(offset, nvls->nHeads * count, nelem, count, -1, 0); + } + // coverity[overrun-call] => Coverity think prims.index can be greater than 1 + } else if (tid < tidEndReduce) { + // Reduce through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + offset = gridOffset + elemOffset; + nelem = min(chunkCount, channelCount - elemOffset); + prims.recv(offset, nelem); + } } - // coverity[overrun-call] => Coverity think prims.index can be greater than 1 - } else if (tid < tidEndReduce) { - // Reduce through NVLS - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, work->recvbuff, - work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - offset = gridOffset + elemOffset; - nelem = min(chunkCount, channelCount - elemOffset); - prims.recv(offset, nelem); + } else { + if (tid < tidEndScatter) { + // Scatter + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL, + work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + prims.scatter(0, 0, 0, 0, -1, 0); + } + + /* gather used as sync */ + prims.gather(0, 0, 0, 0, -1, 0); + } else if (tid < tidEndReduce) { + // Reduce through NVLS + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, 
/*Direct=*/1, Proto, 0> + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff, + work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); + for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { + size_t outOffset = gridOffset + elemOffset; + size_t inpOffset = outOffset + rank * count; + nelem = min(chunkCount, channelCount - elemOffset); + // Coverity complains about a possible overrun inside the method invoked below, but that's actually + // a false positive. + // coverity[overrun-call:FALSE] + prims.directRecvCopy(inpOffset, outOffset, nelem); + } + + /* send for sync */ + prims.send(0, 0); } } } else { - if (tid < tidEndScatter) { - // Scatter + // multi-node + int nNodes = ncclShmem.comm.nNodes; + int part = ncclShmem.channelId - work->channelLo; + ssize_t countPerRank = work->collnet.count; + const int nChannels = work->channelHi - work->channelLo + 1; + ssize_t chunkCount = work->collnet.chunkCount; + if (tid < tidEndNetRecv) { using Proto = ProtoSimple<1, 1, COLL_UNROLL>; - Primitives, /*Direct=*/0, Proto, 0> - prims(tid, nThreadsScatter, nvls->up, nvls->up, NULL, NULL, - work->redOpArg, 0 * Proto::MaxGroupWidth, 1, 1); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - prims.scatter(0, 0, 0, 0, -1, 0); + if (work->netRegUsed) { + if (tid == 0) { + int steps = (int)divUp(nNodes * countPerRank, nChannels * chunkCount); + Primitives, /*Direct=*/0, Proto, 0>::recvPeerNotify(nvls->out, 0, steps); + } + __syncwarp(); + } else { + Primitives, /*Direct=*/0, Proto, 0> + prims(tid, nThreadsNetRecv, &nvls->out, nullptr, nullptr, work->recvbuff, + work->redOpArg, 0 * Proto::MaxGroupWidth, 0, 0); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + ssize_t railAllBeg = railGridOffset + part * chunkCount; + ssize_t railAllEnd = min(railAllBeg + chunkCount, nNodes * countPerRank); + ssize_t railOneBeg = ncclShmem.comm.node * countPerRank; + ssize_t railOneEnd = railOneBeg + countPerRank; + ssize_t beg = max(railAllBeg, railOneBeg); + ssize_t end = min(railAllEnd, railOneEnd); + prims.recv(beg - railOneBeg, max(ssize_t(0), end - beg), /*postOp=*/true); + } } - - /* gather used as sync */ - prims.gather(0, 0, 0, 0, -1, 0); - } else if (tid < tidEndReduce) { - // Reduce through NVLS - using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; - Primitives, /*Direct=*/1, Proto, 0> - prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->down, NULL, work->recvbuff, - work->redOpArg, 3 * Proto::MaxGroupWidth, 0, 0, work); - for (size_t elemOffset = 0; elemOffset < channelCount; elemOffset += chunkCount) { - size_t outOffset = gridOffset + elemOffset; - size_t inpOffset = outOffset + rank * count; - nelem = min(chunkCount, channelCount - elemOffset); - // Coverity complains about a possible overrun inside the method invoked below, but that's actually - // a false positive. 
- // coverity[overrun-call:FALSE] - prims.directRecvCopy(inpOffset, outOffset, nelem); + } else { + if (tid < tidEndScatter) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL>; + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndNetRecv, nThreadsScatter, nullptr, nvls->up, work->sendbuff, nullptr, + work->redOpArg, 1 * Proto::MaxGroupWidth, 1, 1, work); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkCount = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); + } + } else if (tid < tidEndReduce) { + using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>; + Primitives, /*Direct=*/1, Proto, 0> + prims(tid - tidEndScatter, nThreadsReduce, &nvls->down, &nvls->out, nullptr, nullptr, + work->redOpArg, 2 * Proto::MaxGroupWidth, 0, 1, work); + for (ssize_t railGridOffset = 0; railGridOffset < nNodes * countPerRank; railGridOffset += nChannels * chunkCount) { + Scatterer scat; + scat.work = work; + scat.chunkCount = chunkCount; + scat.railGridOffset = railGridOffset; + prims.template process(scat); + } } - - /* send for sync */ - prims.send(0, 0); } } } @@ -231,7 +355,7 @@ struct RunWorkColl + template __device__ __forceinline__ void operator()( int tid, int tn, int slice, int maxSliceSize, int nSrcs, void** srcPtrs, int nDsts, void** dstPtrs, int32_t* dstSizes, uint32_t sendDirectFlag, uint32_t recvDirectFlag diff --git a/src/device/symmetric/all_gather.cuh b/src/device/symmetric/all_gather.cuh new file mode 100644 index 000000000..8f81347ec --- /dev/null +++ b/src/device/symmetric/all_gather.cuh @@ -0,0 +1,367 @@ +#include "symmetric.h" +#include "symmetric/kernel.cuh" +#include "symmetric/primitives.cuh" + +template +static __device__ void bcastDeep( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + char* inputHere, char* outputRank0, bool inPlace, int nIters + ) { + using Pack = BytePack; + int wn = tn/WARP_SIZE; + int w = t/WARP_SIZE; + int lane = t%WARP_SIZE; + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack tmp[UnrollPacks]; + + nIters -= w; + if (0 < nIters) { + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = inpHere[u*WARP_SIZE]; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + if (0 < nIters) { + while (true) { + int dr = inPlace ? 1 : 0; + int r = rank + dr; + if (r == nRanks) r = 0; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int i = 0; + partial ? i < 1 : (dr + UnrollPeers <= nRanks); + partial ? i++ : (dr += UnrollPeers)) { + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && dr == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u]; + } + if (++r == nRanks) r = 0; + } + } + } + inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + nIters -= wn; + if (nIters <= 0) break; + + // Load data for next iteration. 
+ #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = inpHere[u*WARP_SIZE]; + } + } + } +} + +template +static __device__ void bcastEnds( + ncclSymPrims& prim, int tn, int t, + T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts + ) { + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + BytePack* inpHere = (BytePack*)inputHere; + BytePack* outRank0 = (BytePack*)outputRank0; + #pragma unroll 1 + for (size_t i = t; i < nPreElts+nSufElts; i += tn) { + size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i; + BytePack tmp = inpHere[elt]; + int dr = inPlace ? 1 : 0; + int r = rank + dr; + if (r == nRanks) r = 0; + #pragma unroll 1 + for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) { + #pragma unroll UnrollPeers + for (int u=0; u < UnrollPeers; u++) { + *add4G(outRank0+elt, r*stride4G) = tmp; + if (++r == nRanks) r = 0; + } + } + #pragma unroll UnrollPeers + for (int u=0; u < UnrollPeers; u++) { + if (dr+u == nRanks) break; + *add4G(outRank0+elt, r*stride4G) = tmp; + if (++r == nRanks) r = 0; + } + } +} + +template +static __device__ void bcast( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts + ) { + bool inPlace = (input == output); + // Mpve to rank=0 + output = prim.peerPtr(0, output); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + uint32_t nPreBytes = (128u - inputUptr)%128u; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t cursor = nPreBytes; + + constexpr int MinWarpPerBlock = 4; + + if ((inputUptr-outputUptr)%16 == 0) { + constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + bcastDeep( + prim, tn, t, waitNeeded, + (char*)input + cursor, (char*)output + cursor, inPlace, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) { + constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + bcastDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers>( + prim, tn, t, waitNeeded, + (char*)input + cursor, (char*)output + cursor, inPlace, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + constexpr int UnrollPeers = 8; + size_t nSufElts = (nBytes-cursor)/sizeof(T); + bcastEnds(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts); +} + +__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); + int const& rank = prim.rank; + + // Threads numbered over rank. 
+ int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int btn = prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + bcast(prim, btn, bt, /*waitNeeded=*/true, (char*)args->input, (char*)args->output + rank*args->nElts, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/true); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + + +template +static __device__ void bcastMultimem( + ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts + ) { + // Move output to multimem + output = prim.multimemPtr(output); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + uint32_t nPreBytes = (16-inputUptr)%16; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t nSufBytes; + + if ((inputUptr-outputUptr)%16 == 0) { + constexpr int BytePerPack = 16, UnrollPacks = 8; + constexpr int BytePerChunk = UnrollPacks*WARP_SIZE*BytePerPack; + uintptr_t cursor = nPreBytes; + uint32_t nChunks = (nBytes-cursor)/BytePerChunk; + uintptr_t cursorAfter = cursor + uintptr_t(nChunks)*BytePerChunk; + nSufBytes = nBytes - cursorAfter; + cursor += (t/WARP_SIZE)*UnrollPacks*WARP_SIZE*BytePerPack; + cursor += (t%WARP_SIZE)*BytePerPack; + int nIters = nChunks - t/WARP_SIZE; + #pragma unroll 1 + while (0 < nIters) { + BytePack tmp[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = *reinterpret_cast*>(inputUptr + cursor + u*WARP_SIZE*BytePerPack); + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + multimem_st_global(outputUptr + cursor + u*WARP_SIZE*BytePerPack, tmp[u]); + } + cursor += tn*UnrollPacks*BytePerPack; + nIters -= tn/WARP_SIZE; + } + } else { + nPreBytes = 0; + nSufBytes = nBytes; + } + + // Get the prefix+suffix element one at a time. + #pragma unroll 4 + for (uintptr_t i = t*sizeof(T); i < nPreBytes + nSufBytes; i += tn*sizeof(T)) { + uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes); + BytePack val = *reinterpret_cast*>(inputUptr + cursor); + multimem_st_global(outputUptr + cursor, val); + cursor += tn*sizeof(T); + } +} + +__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); + int const& rank = prim.rank; + + char* input = args->input; + char* output = args->output; + size_t bytes = args->nElts; + // Round robin memory to blocks. 
+ int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + bcastMultimem(prim, tn, t, input, output + rank*bytes, bytes); + + prim.barrierArrive(ncclCoopCta(), /*release=*/true); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + +template +static __device__ void allgather_LL_body( + ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts + ) { + using Pack = BytePack<8>; + constexpr int EltPerPack = 8/sizeof(EltType); + + ncclCoopCta cta; + int rank = prim.rank; + int nRanks = prim.nRanks; + constexpr int tn = ncclSymMaxThreads; + int t = threadIdx.x; + + #pragma unroll 1 + while (0 < nElts) { + int nIterPacks = min(nPacks, tn); + if (t < nIterPacks) { + Pack x = loadPack(input, t*EltPerPack, nElts); + prim.bcastLL(/*slot=*/nIterPacks*rank + t, x); + } + + int tn_div_nPacks = tn/nIterPacks; + int tn_mod_nPacks = tn%nIterPacks; + int peer = t/nIterPacks; + int pack = t%nIterPacks; + #if 1 + // NOTE: Unrolling speedup on eos nranks=8 size=64K: 5.7us vs 6.7us + constexpr int Unroll = 4; + #pragma unroll 1 + for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) { + Pack got[Unroll]; + prim.template recvLL(i, Unroll, tn, /*&*/got); + #pragma unroll + for (int u=0; u < Unroll; u++) { + storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]); + peer += tn_div_nPacks; + pack += tn_mod_nPacks; + if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } + } + } + + int i = (nRanks*nIterPacks & -(Unroll*tn)) + t; + int n = (nRanks*nIterPacks)/tn % Unroll; + if (i + n*tn < nRanks*nIterPacks) n += 1; + if (n != 0) { + Pack got[Unroll]; + prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got); + #pragma unroll + for (int u=0; u < Unroll; u++) { + if (u != 0 && u == n) break; + storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]); + peer += tn_div_nPacks; + pack += tn_mod_nPacks; + if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } + } + } + #else + // The non-unrolled but "obviously correct" implementation for reference. + #pragma unroll 1 + for (int i = t; i < nRanks*nIterPacks; i += tn) { + Pack got = prim.template recvLL(i); + storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got); + peer += tn_div_nPacks; + pack += tn_mod_nPacks; + if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } + } + #endif + + prim.endLL(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nElts -= tn*EltPerPack; + nPacks -= tn; + } +} + +static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem); + using Pack = BytePack<8>; + constexpr int BytePerPack = 8; + int nElts = args->nElts; + int nPacks = divUp(nElts, BytePerPack); + + uint32_t nPackPerBlock, nPackModBlock; + idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32); + int blockPackBegin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); + int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 
1 : 0); + int nBlockPacks = blockPackEnd - blockPackBegin; + int nBlockElts = nElts - blockPackBegin*BytePerPack; + nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack); + char* blockInput = args->input + blockPackBegin*BytePerPack; + char* blockOutput = args->output + blockPackBegin*BytePerPack; + + uint32_t lowBits = args->nElts; + lowBits |= (uint32_t)reinterpret_cast(args->input); + lowBits |= (uint32_t)reinterpret_cast(args->output); + if (__builtin_expect(lowBits%8 == 0, true)) { + // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us + allgather_LL_body(prim, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8); + } else { + allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts); + } +} + +__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) { + ncclSymRun_AllGather_LL_impl(args, /*multimem=*/false); +} + +__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) { + ncclSymRun_AllGather_LL_impl(args, /*multimem=*/true); +} diff --git a/src/device/symmetric/all_reduce.cuh b/src/device/symmetric/all_reduce.cuh new file mode 100644 index 000000000..6c5219784 --- /dev/null +++ b/src/device/symmetric/all_reduce.cuh @@ -0,0 +1,432 @@ +#include "symmetric.h" +#include "symmetric/kernel.cuh" +#include "symmetric/primitives.cuh" + +template +static __device__ __forceinline__ void allreduceDeep( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + Red red, char* inputRank0, char* outputRank0, int32_t nIters + ) { + using Pack = BytePack; + using Acc = typename Red::EltType; + using AccPack = BytePack; + + int wn = tn/WARP_SIZE; + int w = t/WARP_SIZE; + int lane = t%WARP_SIZE; + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack acc0[UnrollPacks]; + + nIters -= w; + if (0 < nIters) { + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + if (0 < nIters) { + while (true) { + AccPack acc1[UnrollPacks]; + int r = rank; + if (++r == nRanks) r = 0; + { Pack tmp1[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc1[u] = applyReduce(red, applyCast(acc0[u]), applyCast(tmp1[u])); + } + } + + if (++r == nRanks) r = 0; + + int dr = 2; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int i = 0; + partial ? i < 1 : (dr + UnrollPeers <= nRanks); + partial ? 
i++ : (dr += UnrollPeers)) { + if (partial && dr == nRanks) break; + + Pack tmp1[UnrollPeers][UnrollPacks]; + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && ur!=0 && dr+ur == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + } + if (++r == nRanks) r = 0; + } + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && ur!=0 && dr+ur == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + acc1[u] = applyReduce(red, acc1[u], applyCast(tmp1[ur][u])); + } + } + } + } + + #pragma unroll + for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast(acc1[u]); + + dr = 0; + r = rank; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int i = 0; + partial ? i < 1 : (dr + UnrollPeers <= nRanks); + partial ? i++ : (dr += UnrollPeers)) { + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && dr == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u]; + } + if (++r == nRanks) r = 0; + } + } + } + + inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + nIters -= wn; + if (nIters <= 0) break; + + // Load data for next iteration. + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + } + } + } +} + +template +static __device__ __forceinline__ void allreduceEnds( + ncclSymPrims& prim, int tn, int t, Red red, + T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts + ) { + using Acc = typename Red::EltType; + + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + BytePack* inpRank0 = (BytePack*)inputRank0; + BytePack* outRank0 = (BytePack*)outputRank0; + + #pragma unroll 1 + for (size_t i = t; i < nPreElts+nSufElts; i += tn) { + size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i; + BytePack acc0 = *add4G(inpRank0+elt, rank*stride4G); + BytePack acc1; + BytePack tmp[UnrollPeers]; + int dr = 1; + int r = rank+1; + if (nRanks == r) r = 0; + bool first = true; + + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int j = 0; + partial ? j < 1 : (dr + UnrollPeers <= nRanks); + partial ? j++ : (dr += UnrollPeers)) { + if (partial && dr == nRanks) break; + + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && u!=0 && dr+u == nRanks) break; + tmp[u] = *add4G(inpRank0+elt, r*stride4G); + r += 1; + if (r == nRanks) r = 0; + } + if (first) { + first = false; + acc1 = applyCast(acc0); + } + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && u!=0 && dr+u == nRanks) break; + acc1 = applyReduce(red, acc1, applyCast(tmp[u])); + } + } + } + + acc0 = applyCast(acc1); + dr = 0; + r = rank; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int j=0; + partial ? j < 1 : (dr + UnrollPeers <= nRanks); + partial ? 
j++ : (dr += UnrollPeers)) { + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && dr+u == nRanks) break; + *add4G(outRank0+elt, r*stride4G) = acc0; + r += 1; + if (r == nRanks) r = 0; + } + } + } + } +} + +template +static __device__ void allreduce( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + Red red, T* input, T* output, size_t nElts + ) { + int nRanks = prim.nRanks; + int nBlocks = prim.nBlocks; + // Mpve to rank=0 + input = prim.peerPtr(0, input); + output = prim.peerPtr(0, output); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + uint32_t nPreBytes = (16u - inputUptr)%16u; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t cursor = nPreBytes; + + constexpr int MinWarpPerBlock = 4; + + if ((inputUptr-outputUptr)%16 == 0) { + constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + allreduceDeep( + prim, tn, t, waitNeeded, red, + (char*)input + cursor, (char*)output + cursor, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) { + constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>( + prim, tn, t, waitNeeded, red, + (char*)input + cursor, (char*)output + cursor, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + constexpr int UnrollPeers = 8; + size_t nSufElts = (nBytes-cursor)/sizeof(T); + allreduceEnds(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); +} + + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); + int /*const&*/ rank = prim.rank; + int /*const&*/ nRanks = prim.nRanks; + Red::Type> red(args->redOpArg); + + // Threads numbered globally such that we round robin warps by rank then block. 
+ int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + rank, nRanks, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int gtn = nRanks*prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/true); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + + +template +static __device__ void allreduceMultimem( + ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts + ) { + // Mpve to multimem + input = prim.multimemPtr(input); + output = prim.multimemPtr(output); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + constexpr int BytePerPack = LoadMultimem_BigPackSize::BigPackSize; + uint32_t nPreBytes = (BytePerPack - inputUptr)%BytePerPack; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t nSufBytes; + + if (alignof(T) == BytePerPack || (inputUptr-outputUptr)%BytePerPack == 0) { + constexpr int UnrollPacks = 16*8/BytePerPack; + constexpr int BytePerChunk = UnrollPacks*WARP_SIZE*BytePerPack; + uintptr_t cursor = nPreBytes; + int nChunks = (nBytes-cursor)/BytePerChunk; + uintptr_t cursorAfter = cursor + uintptr_t(nChunks)*BytePerChunk; + nSufBytes = nBytes - cursorAfter; + cursor += (t/WARP_SIZE)*UnrollPacks*WARP_SIZE*BytePerPack; + cursor += (t%WARP_SIZE)*BytePerPack; + int nIters = nChunks - t/WARP_SIZE; + #pragma unroll 1 + while (0 < nIters) { + BytePack tmp[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = applyLoadMultimem(red, inputUptr + cursor + u*WARP_SIZE*BytePerPack); + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + multimem_st_global(outputUptr + cursor + u*WARP_SIZE*BytePerPack, tmp[u]); + } + cursor += tn*UnrollPacks*BytePerPack; + nIters -= tn/WARP_SIZE; + } + } else { + nPreBytes = 0; + nSufBytes = nBytes; + } + + // Get the prefix+suffix element one at a time. + #pragma unroll 4 + for (uintptr_t i = t*sizeof(T); i < nPreBytes + nSufBytes; i += tn*sizeof(T)) { + uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes); + BytePack val = applyLoadMultimem(red, inputUptr + cursor); + multimem_st_global(outputUptr + cursor, val); + cursor += tn*sizeof(T); + } +} + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); + Red::Type> red(args->redOpArg); + + // Threads numbered globally such that we round robin warps by rank then block. 
+ int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.rank, prim.nRanks, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int gtn = prim.nRanks*prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/true); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem); + int /*const&*/ rank = prim.rank; + using Acc = typename ncclSymAccumType::Type; + Red red(args->redOpArg); + + using Pack = BytePack<8>; + using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>; + constexpr int EltPerPack = 8/sizeof(T); + int nElts = args->nElts; + int nPacks = divUp(nElts, EltPerPack); + + bool packAligned = 8 <= alignof(T) || ( + args->nElts*sizeof(T) | + (uint32_t)reinterpret_cast(args->input) | + (uint32_t)reinterpret_cast(args->output) + )%8 == 0; + + uint32_t nPackPerBlock, nPackModBlock; + idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32); + int begin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); + int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0); + + nPacks = end - begin; + nElts -= begin*EltPerPack; + nElts = min(nElts, nPacks*EltPerPack); + T* input = (T*)args->input + begin*EltPerPack; + T* output = (T*)args->output + begin*EltPerPack; + + ncclCoopCta cta; + int t = threadIdx.x; + int tn = ncclSymMaxThreads; + + if (__builtin_expect(packAligned, true)) { + #pragma unroll 1 + while (0 < nPacks) { + if (t < nPacks) { + int nIterPacks = min(nPacks, tn); + Pack inp = loadPack((Pack*)input, t, nPacks); + prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp); + Pack out = prim.template recvReduceLL(t, nIterPacks, red); + storePack((Pack*)output, t, nPacks, out); + } + prim.endLL(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nPacks -= tn; + } + } else { + #pragma unroll 1 + while (0 < nElts) { + if (t*EltPerPack < nElts) { + int nIterPacks = min(nPacks, tn); + Pack inp = loadPack(input, t*EltPerPack, nElts); + prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp); + Pack out = prim.template recvReduceLL(t, nIterPacks, red); + storePack(output, t*EltPerPack, nElts, out); + } + prim.endLL(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nElts -= tn*EltPerPack; + nPacks -= tn; + } + } +} + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) { + ncclSymRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/false); +} +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) { + ncclSymRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/true); +} diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py new file mode 100755 index 000000000..f630ff072 --- /dev/null +++ b/src/device/symmetric/generate.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +import os +import sys + +################################################################################ +# The first command line argument is the path to the directory to generate and +# populate. 
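+# For example, the build is expected to run it roughly as
+#   ./generate.py $(OBJDIR)/gensrc/symmetric
+# (illustrative; the exact invocation lives in the Makefile, not in this script).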
+ +gensrc = sys.argv[1] + +if os.path.exists(gensrc): + for name in os.listdir(gensrc): + os.remove(os.path.join(gensrc, name)) + #os.truncate(os.path.join(gensrc, name), 0) +else: + os.mkdir(gensrc) + +def paste(sep, *args): + return sep.join(args) + +indents = 0 +def emitln(f, lines): + global indents + for ln in ((lines,) if isinstance(lines, str) else lines): + f.write(' '*indents + ln + '\n') + +def indent(s): + return '\n'.join(' '+l for l in s.splitlines()) + +class Rec(object): + def __init__(me, **kw): + me.__dict__.update(kw) + def __eq__(x, y): + if len(x) != len(y): return False + for k in x: + if k not in y: return False + if x[k] != y[k]: return False + return True + def __hash__(me): + h = 0 + for k in me.__dict__: + h += hash((k, me.__dict__[k])) + return h + +################################################################################ +# Edit this region for introducing new algos etc + +reductions = ["AllReduce","ReduceScatter"] +all_reds = ["sum"] +all_tys = ["f32","f16","bf16","f8e4m3","f8e5m2"] + +nvls_algos_by_coll = { + "AllReduce": ["AGxLLMC_R","RSxLDMC_AGxSTMC"], + "ReduceScatter": ["LDMC"] +} +ldmc_algos = ["RSxLDMC_AGxSTMC", "LDMC"] + +coll_to_lower = { + "AllGather": "all_gather", + "AllReduce": "all_reduce", + "ReduceScatter": "reduce_scatter" +} + +red_to_ncclDevRedOp = { + "sum": "ncclDevSum" +} +red_to_Func = { + "sum": "FuncSum" +} + +ty_to_ncclDataType = { + "f32": "ncclFloat32", + "f16": "ncclFloat16", + "bf16": "ncclBfloat16", + "f8e4m3": "ncclFloat8e4m3", + "f8e5m2": "ncclFloat8e5m2" +} +ty_to_cxxtype = { + "f32": "float", + "f16": "half", + "bf16": "__nv_bfloat16", + "f8e4m3": "__nv_fp8_e4m3", + "f8e5m2": "__nv_fp8_e5m2" +} + +def enumerate_kernels(): + for algo in ["LL","LLMC","ST","STMC"]: + yield Rec(coll="AllGather", algo=algo) + for red in all_reds: + for ty in all_tys: + for algo in ["AGxLL_R","AGxLLMC_R","RSxLD_AGxST","RSxLDMC_AGxSTMC"]: + yield Rec(coll="AllReduce", algo=algo, red=red, ty=ty) + for algo in ["LL","LD","LDMC"]: + yield Rec(coll="ReduceScatter", algo=algo, red=red, ty=ty) + +def required_cuda(k): + cudart, arch, specific_sms = 0, 0, None + is_nvls = k.algo in nvls_algos_by_coll.get(k.coll, []) + if is_nvls: + cudart = max(cudart, 12010) + arch = 900 + if k.coll in reductions: + if k.ty == "bf16": + cudart = max(cudart, 11000) + if k.ty.startswith("f8"): + cudart = max(cudart, 11080) + arch = 900 + if k.algo in ldmc_algos: + cudart = 12070 + arch = None + specific_sms = [100, 120] + return (cudart, arch, specific_sms) + +################################################################################ + +def kernel_fdep(k): + return coll_to_lower[k.coll] + '.cu' + +def kernel_fname(k): + if k.coll in reductions: + if k.algo in ldmc_algos and k.ty.startswith('f8'): + return paste('_', coll_to_lower[k.coll], k.red, k.ty, k.algo) + '.cu' + else: + return paste('_', coll_to_lower[k.coll], k.red, k.ty) + '.cu' + else: + return coll_to_lower[k.coll] + '.cu' + +def kernel_gencode(k): + if k.coll in reductions and k.algo in ldmc_algos and k.ty.startswith('f8'): + return "$(NVCC_GENCODE_LDMC_FP8)" + else: + return "$(NVCC_GENCODE)" + +def kernel_cname(k): + if k.coll in reductions: + return paste("_", "ncclSymDevKernel", k.coll, k.algo, k.red, k.ty) + else: + return paste("_", "ncclSymDevKernel", k.coll, k.algo) + +def kernel_conds(k): + cudart, arch, specific_sms = required_cuda(k) + if cudart == 0: return (None, None) + + cudart_cond = "CUDART_VERSION >= %d"%cudart + if not specific_sms: + arch_cond = "__CUDA_ARCH__ >= %d"%arch + else: 
+ arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_SPECIFIC==%d"%(10*sm) for sm in specific_sms]) + return cudart_cond, arch_cond + +def instantiate(k): + cudart_cond, arch_cond = kernel_conds(k) + if (cudart_cond, arch_cond) == (None, None): + form_red_ty = ( + "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " ncclSymRun_{id}<{red}, {ty}>(&args);\n" + "}}" + ) + form = ( + "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " ncclSymRun_{id}(&args);\n" + "}}" + ) + else: + form_red_ty = ( + "#if {cudart_cond}\n" + " __global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " #if {arch_cond}\n" + " ncclSymRun_{id}<{red}, {ty}>(&args);\n" + " #endif\n" + " }}\n" + "#endif" + ) + form = ( + "#if {cudart_cond}\n" + " __global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " #if {arch_cond}\n" + " ncclSymRun_{id}(&args);\n" + " #endif\n" + " }}\n" + "#endif" + ) + + id = k.coll+'_'+k.algo + cname = kernel_cname(k) + if k.coll in reductions: + inst = form_red_ty.format(cname=cname, id=id, red=red_to_Func[k.red], ty=ty_to_cxxtype[k.ty], cudart_cond=cudart_cond, arch_cond=arch_cond) + else: + inst = form.format(cname=cname, id=id, cudart_cond=cudart_cond, arch_cond=arch_cond) + return inst + +def prototype(k): + cudart_cond, arch_cond = kernel_conds(k) + if cudart_cond is None: + form = "__global__ void {cname}(ncclSymDevArgs const);" + else: + form = ( + "#if {cudart_cond}\n" + " __global__ void {cname}(ncclSymDevArgs const);\n" + "#else\n" + " constexpr void* {cname} = nullptr;\n" + "#endif" + ) + return form.format(cname=kernel_cname(k), cudart_cond=cudart_cond) + +################################################################################ + +def partition(vals, keyfn): + ans = {} + for x in vals: + k = keyfn(x) + if k not in ans: + ans[k] = [] + ans[k].append(x) + return ans + + +kernels_by_file = partition(enumerate_kernels(), lambda k: (kernel_fname(k), k.coll)) + +# Add dependency only files (e.g. 
allreduce.cu) +for coll in set(k.coll for k in enumerate_kernels()): + fname = coll_to_lower[coll]+'.cu' + if (fname, coll) not in kernels_by_file: + kernels_by_file[fname, coll] = [] + +# Generate each kernel instantiation file +for (fname, coll), ks in kernels_by_file.items(): + with open(os.path.join(gensrc, fname), "w") as f: + emitln(f, '#include "symmetric.h"') + emitln(f, '#include "symmetric/kernel.cuh"') + emitln(f, '#include "symmetric/{coll}.cuh"'.format(coll=coll_to_lower[coll])) + for k in ks: + emitln(f, instantiate(k)) + +# Generate /symmetric_host.cc +with open(os.path.join(gensrc, "symmetric_kernels.cc"), "w") as f: + emitln(f, '#include "symmetric.h"') + emitln(f, '#include "device.h"') + emitln(f, '') + + for k in enumerate_kernels(): + emitln(f, prototype(k)) + emitln(f, '') + + emitln(f, 'extern int const ncclSymKernelCount = %d;' % len(list(enumerate_kernels()))) + emitln(f, 'extern void* const ncclSymKernelList[] = {') + for k in enumerate_kernels(): + emitln(f, '(void*){cname},'.format(cname=kernel_cname(k))) + emitln(f, 'nullptr};') + emitln(f, '') + + emitln(f, 'void* ncclSymGetKernelPtr(ncclSymKernelId id, int red, ncclDataType_t ty) {') + indents += 1 + emitln(f, 'switch (id) {') + emitln(f, 'default: return nullptr;') + for (coll, algo), coll_algo_ks in partition(enumerate_kernels(), lambda k: (k.coll, k.algo)).items(): + emitln(f, 'case ncclSymKernelId_'+coll+'_'+algo+':') + indents += 1 + if len(coll_algo_ks) == 1: + emitln(f, 'return (void*)&'+kernel_cname(coll_algo_ks[0])+';') + else: + emitln(f, 'switch ((ncclDevRedOp_t)red) {') + emitln(f, 'default: return nullptr;') + for red, coll_algo_red_ks in partition(coll_algo_ks, lambda k: k.red).items(): + emitln(f, 'case '+red_to_ncclDevRedOp[red]+':') + indents += 1 + emitln(f, 'switch (ty) {') + emitln(f, 'default: return nullptr;') + for k in coll_algo_red_ks: + emitln(f, 'case '+ty_to_ncclDataType[k.ty]+': return (void*)'+kernel_cname(k)+';') + emitln(f, '}') + indents -= 1 + emitln(f, '}') + indents -=1 + emitln(f, '}') + indents -= 1 + emitln(f, '}') + +# Generate /rules.mk +with open(os.path.join(gensrc, "rules.mk"), "w") as f: + inst_names = sorted(set(kernel_fname(k) for k in enumerate_kernels())) + names = inst_names + ["symmetric_kernels.cc"] + f.write("LIB_OBJS_SYM_GEN = $(patsubst %,$(OBJDIR)/genobj/symmetric/%.o,{names})\n" + .format(names=" ".join(names))) + f.write("\n") + + inst_names = sorted(set((k.coll, kernel_fname(k), kernel_gencode(k)) for k in enumerate_kernels())) + for coll, name, gencode in inst_names: + f.write( + "$(OBJDIR)/genobj/symmetric/{name}.o: $(OBJDIR)/gensrc/symmetric $(OBJDIR)/genobj/symmetric/{coll}.cu.d\n" + "\t" "$(call COMPILE_SYM,$@,$(OBJDIR)/gensrc/symmetric/{name},{gencode})\n" + "\n" + .format(name=name, coll=coll_to_lower[coll], gencode=gencode) + ) diff --git a/src/device/symmetric/kernel.cuh b/src/device/symmetric/kernel.cuh new file mode 100644 index 000000000..f631d51d9 --- /dev/null +++ b/src/device/symmetric/kernel.cuh @@ -0,0 +1,27 @@ +#ifndef NCCL_DEVICE_SYMMETRIC_KERNEL_H_ +#define NCCL_DEVICE_SYMMETRIC_KERNEL_H_ + +#include "symmetric.h" + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(struct ncclSymDevArgs const* args); +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(struct ncclSymDevArgs const* args); + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(struct ncclSymDevArgs const* args); +template 
typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymDevArgs const* args); + +__device__ __forceinline__ void ncclSymRun_AllGather_LL(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymRun_AllGather_ST(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymRun_AllGather_STMC(struct ncclSymDevArgs const* args); + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(struct ncclSymDevArgs const* args); +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(struct ncclSymDevArgs const* args); +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(struct ncclSymDevArgs const* args); +#endif diff --git a/src/device/symmetric/primitives.cuh b/src/device/symmetric/primitives.cuh new file mode 100644 index 000000000..167024400 --- /dev/null +++ b/src/device/symmetric/primitives.cuh @@ -0,0 +1,420 @@ +#ifndef NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_ +#define NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_ + +#include "symmetric.h" +#include "bitops.h" +#include "collectives.h" +#include "op128.h" +#include "reduce_kernel.h" + +#if __CUDA_ARCH__ >= 700 +// __grid_constant__ appears to break cuda-gdb +#define NCCL_GRID_CONSTANT __grid_constant__ +#else +#define NCCL_GRID_CONSTANT +#endif + +// flattenIx(pos0, dim0, pos1, dim1, pos2, dim2, ...) +// Given a position vector `pos` in a rectangular index space with lengths in the `dim` +// vector, flatten that down to a linear index. The fastest moving dimension is given first. +__device__ __forceinline__ int flattenIx() { return 0; } + +template +static __device__ Int0 flattenIx(Int0 pos, Int1 size, Ints ...more) { + return pos + size*flattenIx(more...); +} + +// Precomputed integer reciprocoals for denominator values 1..64 inclusive. +// Pass these to idivFast64() for fast division on the GPU. 
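+// (idivRcp32_upto64() below keeps only the high 32 bits, presumably for the 32-bit
+// helpers idivmodFast32()/imodFast32() that the kernels in this directory rely on.)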
+static __device__ uint64_t idivRcp64_upto64(int x) { + static constexpr uint64_t table[65] = { + idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03), + idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07), + idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b), + idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f), + idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13), + idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17), + idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b), + idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f), + idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23), + idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27), + idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b), + idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f), + idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33), + idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37), + idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b), + idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f), + idivRcp64(0x40) + }; + return table[x]; +} + +static __device__ uint32_t idivRcp32_upto64(int x) { + return idivRcp64_upto64(x)>>32; +} + +namespace { +struct ncclCoopCta { + __device__ void sync() { __syncthreads(); } + __device__ int self() { return threadIdx.x; } + __device__ int count() { return blockDim.x; } +}; +struct ncclCoopWarps { + int log2_nWarps; + __device__ void sync() { + asm volatile("barrier.sync %0, %1;" :: "r"(1 + (threadIdx.x>>(5+log2_nWarps))), "r"(32<= 12030 && __CUDA_ARCH__ >= 900 + cudaGridDependencySynchronize(); + #endif + + if ((flags & ncclSymPrims_UseBarrier) && threadIdx.x < nRanks) { + barEpoch = (flags & ncclSymPrims_UseMultimem) ? base->barEpochMc[block] : base->barEpochUc[block]; + } + if (flags & ncclSymPrims_UseLL) llEpoch = base->llEpoch[block] + 2; + } + __device__ ~ncclSymPrims() { + if (threadIdx.x == 0) { + if (flags & ncclSymPrims_UseBarrier) { + ((flags & ncclSymPrims_UseMultimem) ? 
base->barEpochMc : base->barEpochUc)[block] = barEpoch; + } + if (flags & ncclSymPrims_UseLL) base->llEpoch[block] = llEpoch - 2; + } + } + + template + __device__ T* peerPtr(int peer, T* selfPtr) { + return add4G(selfPtr, (peer-rank)*stride4G); + } + + template + __device__ T* multimemPtr(T* selfPtr) { + return reinterpret_cast(reinterpret_cast(selfPtr) + offsetMc); + } + + __device__ void barrierArrive(ncclCoopCta cta, bool release) { + cta.sync(); + #if __CUDA_ARCH__ < 700 + if (release) { + if (cta.self() == 0) __threadfence_system(); + cta.sync(); + } + #endif + if (flags & ncclSymPrims_UseMultimem) { + #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 + if (cta.self() == 0) { + uint32_t* inbox = &multimemPtr(base)->barInboxMc[block]; + if (release) { + asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox)); + } else { + asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox)); + } + } + #endif + } else { + int r = cta.self(); + if (r != rank && r < nRanks) { + uint32_t* inbox = &peerPtr(r, base)->barInboxPerPeer[block*nRanks + rank]; + #if __CUDA_ARCH__ >= 700 + if (release) { + asm volatile("st.release.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); + } else { + asm volatile("st.relaxed.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); + } + #else + asm volatile("st.volatile.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); + #endif + } + } + } + + __device__ void barrierWait(ncclCoopCta cta, bool acquire) { + if (flags & ncclSymPrims_UseMultimem) { + #if __CUDA_ARCH__ >= 900 + if (cta.self() == 0) { + uint32_t* inbox = &base->barInboxMc[block]; + while (true) { + uint32_t got; + if (acquire) { + asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + } else { + asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + } + if (got-(barEpoch+nRanks) <= uint32_t(-1)>>1) break; + } + barEpoch += nRanks; + } + #endif + } else { + int r = cta.self(); + if (r != rank && r < nRanks) { + uint32_t* inbox = &base->barInboxPerPeer[block*nRanks + r]; + while (true) { + uint32_t got; + #if __CUDA_ARCH__ >= 700 + if (acquire) { + asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + } else { + asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + } + #else + asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); + #endif + if (got-(barEpoch+1) <= uint32_t(-1)>>1) break; + } + } + #if __CUDA_ARCH__ < 700 + if (acquire) { + cta.sync(); + if (cta.self() == 0) __threadfence(); + } + #endif + barEpoch += 1; + } + cta.sync(); + } + + __device__ void endLL(ncclCoopCta cta) { + if (__builtin_expect(llEpoch >= -2u, false)) { + cta.sync(); + uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch); + int epochSize = ncclSymLLEpochSize(nRanks); + #pragma unroll 4 + for (int i=cta.self(); i*16 < epochSize; i += cta.count()) { + buf[i] = uint4{0, 0, 0, 0}; + } + } + cta.sync(); + llEpoch += (llEpoch == -1u) ? 
3 : 1; + } + + template + __device__ void sendLL(int peer, int slot, T val) { + union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; + tmp = val; + uint4* buf = ncclSymDevBase_getLLBuf(peerPtr(peer, base), nRanks, block, llEpoch) + slot; + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + } + } + + template + __device__ void bcastLL(int slot, T val) { + if (flags & ncclSymPrims_UseMultimem) { + union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; + tmp = val; + uint4* bufmc = ncclSymDevBase_getLLBuf(multimemPtr(base), nRanks, block, llEpoch) + slot; + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + } + } else { + union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; + tmp = val; + uint4* buf0 = ncclSymDevBase_getLLBuf(peerPtr(0, base), nRanks, block, llEpoch) + slot; + int dr = 0; + int r = rank; + #pragma unroll 1 + for (; dr+8 <= nRanks; dr += 8) { + #pragma unroll + for (int ur=0; ur < 8; ur++) { + uint4* buf = add4G(buf0, r*stride4G); + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + } + r += 1; + if (r == nRanks) r = 0; + } + } + #pragma unroll + for (int ur=0; ur < 8; ur++, dr++) { + if (dr == nRanks) break; + uint4* buf = add4G(buf0, r*stride4G); + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + } + r += 1; + if (r == nRanks) r = 0; + } + } + } + + template + __device__ void recvLL(int slot0, int nSlots, int stride, T(&elts)[nSlotsMax]) { + uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0; + uint4 tmp[nSlotsMax][divUp(sizeof(T),8)]; + //int spins=0; + while (true) { + #pragma unroll + for (int u=0; u < nSlotsMax; u++) { + if (u < nSlotsMin || u < nSlots) { + #pragma unroll + for (int v=0; v < divUp(sizeof(T),8); v++) { + asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w) : "l"(buf + u*stride + v*ncclSymLLMaxSlots(sizeof(T)))); + } + } + } + bool okAll = true; + #pragma unroll + for (int u=0; u < nSlotsMax; u++) { + #pragma unroll + for (int v=0; v < divUp(sizeof(T),8); v++) { + if (u < nSlotsMin || u < nSlots) { + bool ok = tmp[u][v].y == llEpoch && + tmp[u][v].w == llEpoch; + okAll &= ok; + } + } + } + if (__builtin_expect(okAll, true)) break; + //if (spins++ == 10<<20) spins=0; + } + #pragma unroll + for (int u=0; u < nSlotsMax; u++) { + if (nSlotsMin <= u && u == nSlots) break; + union { T val; uint32_t u32[divUp(sizeof(T),8)][2]; }; + #pragma unroll + for (int v=0; v < divUp(sizeof(T),8); v++) { + u32[v][0] = tmp[u][v].x; + u32[v][1] = tmp[u][v].z; + } + elts[u] = val; + } + } + + template + __device__ Pack recvReduceLL(int slot, int stride, Red red) { + using Acc = typename Red::EltType; + using AccPack = BytePack; + AccPack acc; + bool first = true; + int r = 0; + #pragma unroll 1 + for (; r+Unroll <= nRanks; r += Unroll) { + Pack got[Unroll]; + this->template recvLL(slot + r*stride, Unroll, stride, got); + AccPack acc0 
= applyCast(got[0]); + acc = first ? acc0 : applyReduce(red, acc, acc0); + first = false; + #pragma unroll + for (int i=1; i < Unroll; i++) acc = applyReduce(red, acc, applyCast(got[i])); + } + if (r < nRanks) { + Pack got[Unroll]; + this->template recvLL(slot + r*stride, nRanks-r, stride, got); + AccPack acc0 = applyCast(got[0]); + acc = first ? acc0 : applyReduce(red, acc, acc0); + #pragma unroll + for (int i=1; i < Unroll-1; i++) { + if (r+i < nRanks) acc = applyReduce(red, acc, applyCast(got[i])); + } + } + return applyCast(acc); + } + + template + __device__ T recvLL(int slot) { + T one[1]; + this->template recvLL<1, 1, T>(slot, 1, 0, one); + return one[0]; + } + + template + __device__ void coopRecvLL(Coop coop, int slot0, int nSlots, T* dst) { + int me = coop.self(); + if (me < nSlots) { + uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0 + me; + uint4 got[divUp(sizeof(T), 8)]; + //int spins=0; + #pragma unroll 1 + while (true) { + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(got[u].x), "=r"(got[u].y), "=r"(got[u].z), "=r"(got[u].w) : "l"(buf + u*ncclSymLLMaxSlots(sizeof(T)))); + } + bool ok = true; + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + ok &= got[u].y == llEpoch; + ok &= got[u].w == llEpoch; + } + if (__builtin_expect(ok, true)) break; + //if (++spins == 10<<20) { spins=0; printf("r=%d LL spin @ ix=%d got=%d want=%d\n", rank, slot0+me, got[0].y, llEpoch); } + } + union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; }; + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + u32[u][0] = got[u].x; + u32[u][1] = got[u].z; + } + dst[slot0 + me] = val; + } + } +}; +} + +template typename Red, typename T, bool nvls> +struct ncclSymAccumType { using Type = T; }; + +// Only Red's whose opArg is invariant w.r.t. the datatype can have a different +// accumulator type. At the moment this excludes integer min/max, sumpostdiv, +// and premulsum. 
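+// FuncSum takes no per-datatype scalar argument, so the low-precision sums below can
+// safely accumulate in float: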
+template<> struct ncclSymAccumType { using Type = float; }; +#if defined(__CUDA_BF16_TYPES_EXIST__) +template<> struct ncclSymAccumType { using Type = float; }; +#endif +#if defined(__CUDA_FP8_TYPES_EXIST__) +template<> struct ncclSymAccumType { using Type = float; }; +template<> struct ncclSymAccumType { using Type = float; }; +#endif +#endif diff --git a/src/device/symmetric/reduce_scatter.cuh b/src/device/symmetric/reduce_scatter.cuh new file mode 100644 index 000000000..4fd96093e --- /dev/null +++ b/src/device/symmetric/reduce_scatter.cuh @@ -0,0 +1,387 @@ +#include "symmetric.h" +#include "symmetric/kernel.cuh" +#include "symmetric/primitives.cuh" + +template +static __device__ void reduceDeep( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + Red red, char* inputRank0, char* outputHere, int32_t nIters + ) { + using Pack = BytePack; + using Acc = typename Red::EltType; + using AccPack = BytePack; + + int wn = tn/WARP_SIZE; + int w = t/WARP_SIZE; + int lane = t%WARP_SIZE; + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack* outHere = (Pack*)outputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + Pack acc0[UnrollPacks]; + + nIters -= w; + if (0 < nIters) { + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + if (0 < nIters) { + while (true) { + AccPack acc1[UnrollPacks]; + int r = rank+1; + if (r == nRanks) r = 0; + { Pack tmp1[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc1[u] = applyReduce(red, applyCast(acc0[u]), applyCast(tmp1[u])); + } + } + + r += 1; + if (r == nRanks) r = 0; + + int dr = 2; + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int i = 0; + partial ? i < 1 : (dr + UnrollPeers <= nRanks); + partial ? i++ : (dr += UnrollPeers)) { + if (partial && dr == nRanks) break; + + Pack tmp1[UnrollPeers][UnrollPacks]; + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && ur!=0 && dr+ur == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + } + r += 1; + if (r == nRanks) r = 0; + } + #pragma unroll + for (int ur=0; ur < UnrollPeers-partial; ur++) { + if (partial && ur!=0 && dr+ur == nRanks) break; + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) { + acc1[u] = applyReduce(red, acc1[u], applyCast(tmp1[ur][u])); + } + } + } + } + + #pragma unroll + for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast(acc1[u]); + + #pragma unroll UnrollPacks + for (int u=0; u < UnrollPacks; u++) outHere[u*WARP_SIZE] = acc0[u]; + + inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outHere += intptr_t(wn)*UnrollPacks*WARP_SIZE; + nIters -= wn; + if (nIters <= 0) break; + + // Load data for next iteration. 
+ #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + } + } + } +} + +template +static __device__ void reduceEnds( + ncclSymPrims& prim, int tn, int t, Red red, + T* inputRank0, T* outputHere, size_t nElts, uint32_t nPreElts, size_t nSufElts + ) { + using Acc = typename Red::EltType; + + int const& rank = prim.rank; + int const& nRanks = prim.nRanks; + uint32_t const& stride4G = prim.stride4G; + BytePack* inpRank0 = (BytePack*)inputRank0; + BytePack* outHere = (BytePack*)outputHere; + #pragma unroll 1 + for (size_t i = t; i < nPreElts+nSufElts; i += tn) { + size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i; + BytePack acc0 = *add4G(inpRank0+elt, rank*stride4G); + BytePack acc1; + BytePack tmp[UnrollPeers]; + int dr = 1; + int r = rank+1; + if (nRanks == r) r = 0; + bool first = true; + + #pragma unroll 2 + for (int partial=0; partial <= 1; partial++) { + #pragma unroll 1 + for (int j = 0; + partial ? j < 1 : (dr + UnrollPeers <= nRanks); + partial ? j++ : (dr += UnrollPeers)) { + if (partial && dr == nRanks) break; + + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && u!=0 && dr+u == nRanks) break; + tmp[u] = *add4G(inpRank0+elt, r*stride4G); + r += 1; + if (r == nRanks) r = 0; + } + if (first) { + first = false; + acc1 = applyCast(acc0); + } + #pragma unroll + for (int u=0; u < UnrollPeers-partial; u++) { + if (partial && u!=0 && dr+u == nRanks) break; + acc1 = applyReduce(red, acc1, applyCast(tmp[u])); + } + } + } + + acc0 = applyCast(acc1); + outHere[elt] = acc0; + } +} + +template +static __device__ void reduce( + ncclSymPrims& prim, int tn, int t, bool waitNeeded, + Red red, T* input, T* output, size_t nElts + ) { + int nRanks = prim.nRanks; + int nBlocks = prim.nBlocks; + // Mpve input to rank=0 + input = prim.peerPtr(0, input); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + uint32_t alignment = uint32_t(inputUptr - outputUptr); + size_t nBytes = nElts*sizeof(T); + + uint32_t nPreBytes = (16u - inputUptr)%16u; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t cursor = nPreBytes; + + constexpr int MinWarpPerBlock = 4; + + if (alignment%16 == 0) { + constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + reduceDeep( + prim, tn, t, waitNeeded, red, + (char*)input + cursor, (char*)output + cursor, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (sizeof(T) == 4 || (sizeof(T) < 4 && alignment%4 == 0)) { + constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; + constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; + uint32_t chunks = (nBytes-cursor)/BytePerChunk; + chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + if (chunks != 0) { + uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; + reduceDeep<(sizeof(T) <= BytePerPack ? 
BytePerPack : 0), UnrollPacks, UnrollPeers, T>( + prim, tn, t, waitNeeded, red, + (char*)input + cursor, (char*)output + cursor, + chunks*MinWarpPerBlock + ); + cursor = cursorAfter; + waitNeeded = false; + } + } + + if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + constexpr int UnrollPeers = 8; + size_t nSufElts = (nBytes-cursor)/sizeof(T); + reduceEnds(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); +} + + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); + Red::Type> red(args->redOpArg); + + // Round robin warps over blocks. + int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + reduce(prim, tn, t, /*waitNeeded=*/true, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + + +template +static __device__ void reduceMultimem( + ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts + ) { + // Mpve input to multimem + input = prim.multimemPtr(input); + + uintptr_t inputUptr = reinterpret_cast(input); + uintptr_t outputUptr = reinterpret_cast(output); + size_t nBytes = nElts*sizeof(T); + + constexpr int BytePerPack = LoadMultimem_BigPackSize::BigPackSize; + uint32_t nPreBytes = (BytePerPack - inputUptr)%BytePerPack; + nPreBytes = min((size_t)nPreBytes, nBytes); + uintptr_t nSufBytes; + + if (sizeof(T) == BytePerPack || (inputUptr-outputUptr)%BytePerPack == 0) { + constexpr int UnrollPacks = 8*(16/BytePerPack); + constexpr int BytePerChunk = UnrollPacks*WARP_SIZE*BytePerPack; + uintptr_t cursor = nPreBytes; + uint32_t nChunks = (nBytes-cursor)/BytePerChunk; + uintptr_t cursorAfter = cursor + uintptr_t(nChunks)*BytePerChunk; + nSufBytes = nBytes - cursorAfter; + cursor += (t/WARP_SIZE)*UnrollPacks*WARP_SIZE*BytePerPack; + cursor += (t%WARP_SIZE)*BytePerPack; + int nIters = nChunks - t/WARP_SIZE; + #pragma unroll 1 + while (0 < nIters) { + BytePack tmp[UnrollPacks]; + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + tmp[u] = applyLoadMultimem(red, inputUptr + cursor + u*WARP_SIZE*BytePerPack); + } + #pragma unroll + for (int u=0; u < UnrollPacks; u++) { + *reinterpret_cast*>(outputUptr + cursor + u*WARP_SIZE*BytePerPack) = tmp[u]; + } + cursor += tn*UnrollPacks*BytePerPack; + nIters -= tn/WARP_SIZE; + } + } else { + nPreBytes = 0; + nSufBytes = nBytes; + } + + // Get the prefix+suffix element one at a time. + #pragma unroll 4 + for (uintptr_t i = t*sizeof(T); i < nPreBytes + nSufBytes; i += tn*sizeof(T)) { + uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes); + BytePack val = applyLoadMultimem(red, inputUptr + cursor); + *reinterpret_cast*>(outputUptr + cursor) = val; + cursor += tn*sizeof(T); + } +} + +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); + Red::Type> red(args->redOpArg); + + // Round robin warps over blocks. 
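+  // t = lane + WARP_SIZE*(block + nBlocks*warp); there is no rank dimension because each
+  // rank only produces its own output chunk and the multimem load performs the
+  // cross-rank reduction.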
+ int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + prim.block, prim.nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = prim.nBlocks*blockDim.x; + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + + reduceMultimem(prim, tn, t, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts); + + prim.barrierArrive(ncclCoopCta(), /*release=*/false); + prim.barrierWait(ncclCoopCta(), /*acquire=*/false); +} + +// T is user type, EltType is the most aligned type +template +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body( + ncclSymPrims &prim, Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) { + using Pack = BytePack<8>; + constexpr int EltPerPack = 8/sizeof(EltType); + + int nRanks = prim.nRanks; + int rank = prim.rank; + int t = threadIdx.x; + int tn = ncclSymMaxThreads; + ncclCoopCta cta; + + #pragma unroll 1 + while (0 < nElts) { + int nIterPacks = min(nPacks, tn); + int tn_div_nPacks = tn/nIterPacks; + int tn_mod_nPacks = tn%nIterPacks; + int peer = t/nIterPacks; + int pack = t%nIterPacks; + + #pragma unroll 1 + for (int i = t; i < nRanks*nIterPacks; i += tn) { + Pack got = loadPack(input + peer*nStrideElts, pack*EltPerPack, nElts); + prim.sendLL(peer, rank*nIterPacks + pack, got); + peer += tn_div_nPacks; + pack += tn_mod_nPacks; + if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } + } + + if (t < nIterPacks) { + Pack got = prim.template recvReduceLL(t, nIterPacks, red); + storePack(output, t*EltPerPack, nElts, got); + } + prim.endLL(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nElts -= tn*EltPerPack; + nPacks -= tn; + } +} +template typename Red, typename T> +__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(ncclSymDevArgs const* args) { + ncclSymPrims prim(args->comm, ncclSymPrims_UseLL); + Red::Type> red(args->redOpArg); + + using Pack = BytePack<8>; + constexpr int EltPerPack = 8/sizeof(T); + int nAllElts = args->nElts; + int nAllPacks = divUp(nAllElts, EltPerPack); + uint32_t nPackPerBlock, nPackModBlock; + idivmodFast32(&nPackPerBlock, &nPackModBlock, nAllPacks, prim.nBlocks, prim.nBlocks_rcp32); + int blockPackBegin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); + int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 
1 : 0); + int nPacks = blockPackEnd - blockPackBegin; + int nElts = nAllElts - blockPackBegin*EltPerPack; + nElts = min(nElts, nPacks*EltPerPack); + T* input = (T*)args->input + blockPackBegin*EltPerPack; + T* output = (T*)args->output + blockPackBegin*EltPerPack; + + uint32_t lowBits = args->nElts*sizeof(T); + lowBits |= (uint32_t)reinterpret_cast(args->input); + lowBits |= (uint32_t)reinterpret_cast(args->output); + if (__builtin_expect(lowBits%8 == 0, true)) { + ncclSymRun_ReduceScatter_LL_body(prim, red, (Pack*)input, (Pack*)output, nPacks, nPacks, nAllElts/EltPerPack); + } else { + ncclSymRun_ReduceScatter_LL_body(prim, red, input, output, nElts, nPacks, nAllElts); + } +} diff --git a/src/enqueue.cc b/src/enqueue.cc index 4e8a211fc..f5b43724c 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -13,6 +13,7 @@ #include "cudawrap.h" #include "profiler.h" #include "transport.h" +#include "register_inline.h" #include // std::memcpy #include // PRIx64 @@ -28,34 +29,41 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma int carveout = ncclParamL1SharedMemoryCarveout(); int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); - for (int k=0; k < ncclDevKernelCount; k++) { - void* fn = ncclDevKernelList[k]; - cudaFuncAttributes attr = {0}; - if (fn == nullptr) continue; - - CUDACHECKGOTO(cudaFuncGetAttributes(&attr, fn), result, ignore0); - if (maxStackSize) { - if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; - ignore0:; - } - if (carveout) { - CUDACHECKGOTO(cudaFuncSetAttribute(fn, - cudaFuncAttributePreferredSharedMemoryCarveout, carveout), - result, ignore1); - ignore1:; - } - if (ncclMaxSharedMem != 0) { - int sharedMemSize = ncclMaxSharedMem; - if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) { - WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", - cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes); - return ncclSystemError; + for (int sym=0; sym <= 1; sym++) { + int kcount = sym==0 ? ncclDevKernelCount : ncclSymKernelCount; + void* const* kptrs = sym==0 ? 
ncclDevKernelList : ncclSymKernelList; + for (int k=0; k < kcount; k++) { + void* fn = kptrs[k]; + cudaFuncAttributes attr = {0}; + if (fn == nullptr) continue; + + cudaError_t errcode = cudaFuncGetAttributes(&attr, fn); + if (errcode == cudaErrorNoKernelImageForDevice) continue; + CUDACHECKGOTO(errcode, result, ignore0); + + if (maxStackSize) { + if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; + ignore0:; } - CUDACHECKGOTO(cudaFuncSetAttribute(fn, - cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), - result, next_kernel); + if (carveout) { + CUDACHECKGOTO(cudaFuncSetAttribute(fn, + cudaFuncAttributePreferredSharedMemoryCarveout, carveout), + result, ignore1); + ignore1:; + } + if (ncclMaxSharedMem != 0) { + int sharedMemSize = ncclMaxSharedMem; + if (sharedMemSize > (maxSharedMem-attr.sharedSizeBytes)) { + WARN("cudaArch %d ncclMaxSharedMem %d exceeds device/fn maxSharedMem %zu", + cudaArch, sharedMemSize, maxSharedMem-attr.sharedSizeBytes); + return ncclSystemError; + } + CUDACHECKGOTO(cudaFuncSetAttribute(fn, + cudaFuncAttributeMaxDynamicSharedMemorySize, sharedMemSize), + result, next_kernel); + } + next_kernel:; } - next_kernel:; } return result; } @@ -258,8 +266,8 @@ static bool testBudget( ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { struct ncclKernelPlanner* planner = &comm->planner; + if (planner->isSymColl) return ncclSuccess; struct ncclTaskColl *task; - task = ncclIntruQueueHead(&planner->collTaskQueue); while (task != nullptr) { // Build a ncclDevWorkColl[Reg?] struct for each task. @@ -331,6 +339,38 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; int fnOpTyCount = 0; + if (comm->nNodes == 1 && planner->nTasksColl == 1 && planner->nTasksP2p == 0) { + void* sendSymPtr; + void* recvSymPtr; + struct ncclReg* sendReg; + struct ncclReg* recvReg; + size_t size = task->count*ncclTypeSize(task->datatype); + NCCLCHECK(ncclRegFindSymmetric(comm, task->sendbuff, size, &sendSymPtr, &sendReg)); + NCCLCHECK(ncclRegFindSymmetric(comm, task->recvbuff, size, &recvSymPtr, &recvReg)); + bool implemented = ncclSymImplemented(task->func, task->opDev.op, task->datatype); + + if (sendReg && recvReg && (sendReg->winFlags & recvReg->winFlags & NCCL_WIN_COLL_SYMMETRIC) && implemented) { + enum ncclSymKernelId kernel; + int nChannels, nWarps; + float estTimeUs = 1.e18; + NCCLCHECK(ncclSymPickKernel(comm, task->func, task->opDev.op, task->datatype, task->count, &estTimeUs, &kernel, &nChannels, &nWarps)); + + // We should only use symmetric kernel if it beats the asymmetric kernel. But the + // perf model accuracy from asymmetric kernels is too inaccurate and reports too high + // of a bandwidth. For now just always use symmetric if available. + if (kernel != ncclSymKernelId_Count) { + task->sendbuff = sendSymPtr; + task->recvbuff = recvSymPtr; + task->devFuncId = (int)kernel; + task->nMaxChannels = nChannels; + task->nWarps = nWarps; + ncclIntruQueueEnqueue(&planner->collTaskQueue, task); + planner->isSymColl = true; + return ncclSuccess; + } + } + } + // Walk the size sorted tasks, binning them by (fn,op,ty). while (task != nullptr) { struct ncclTaskColl* next = task->next; @@ -603,6 +643,10 @@ static ncclResult_t scheduleCollTasksToPlan( (countHi != 0 ? countHi : countLo) -= cells*elementsPerCell - task->count; nChannels = (countLo!=0 ? 1 : 0) + nMidChannels + (cellsHi!=0 ? 
1 : 0); + + // Update number of channels propagated to the profiler + task->nChannels = (uint8_t)nChannels; + // Ensure room for worst case of one new batch per channel if (!testBudget(budget, plan->nWorkBatches + nChannels, plan->workBytes + workNode->size)) { return ncclSuccess; @@ -860,6 +904,8 @@ static ncclResult_t addP2pToPlan( partSize = divUp(bytes[dir], nChannels[dir]); } } + // Update number of channels propagated to the profiler + if (p2pTasks[dir]) p2pTasks[dir]->nChannels = nChannels[dir]; } struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); @@ -1052,47 +1098,17 @@ static ncclResult_t scheduleP2pTasksToPlan( } // Spin until its safe to increase comm->workFifoProduced to desiredProduced. -static void waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduced) { - bool hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; - if (hasRoom) return; - while (true) { - // We have to poll for notifications from device. - uint32_t* consumedLive = comm->workFifoConsumed; - uint32_t consumed[MAXCHANNELS]; - for (int c=0; c < MAXCHANNELS; c++) { - consumed[c] = __atomic_load_n(&consumedLive[c], __ATOMIC_RELAXED); - } - // Compiler-only fence to prevent fusion of loops to encourage dense loads. - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - uint32_t produced = comm->workFifoProduced; - uint32_t consumedLeast = produced; - for (int c=0; c < MAXCHANNELS; c++) { - // consumedLeast is min over all non-quiesced channels - if (consumed[c] != comm->channels[c].workFifoProduced) { - if ((produced - consumedLeast) < (produced - consumed[c])) { - consumedLeast = consumed[c]; - } - } - } - - // Compiler only fence to prevent fusion of loops to encourage dense stores. - __atomic_signal_fence(__ATOMIC_SEQ_CST); - - for (int c=0; c < MAXCHANNELS; c++) { - // Advance counter on quiesced channels so they don't lag behind - // too far where they could get lost in 32-bit wraparound. 
- if (consumed[c] == comm->channels[c].workFifoProduced) { - comm->channels[c].workFifoProduced = consumedLeast; - __atomic_store_n(&consumedLive[c], consumedLeast, __ATOMIC_RELAXED); - } +static ncclResult_t waitWorkFifoAvailable(struct ncclComm* comm, uint32_t desiredProduced) { + bool hasRoom = (desiredProduced - comm->workFifoConsumed) <= comm->workFifoBytes; + if (!hasRoom) { + while (true) { + NCCLCHECK(ncclCommPollEventCallbacks(comm, /*waitSome=*/true)); + hasRoom = (desiredProduced - comm->workFifoConsumed) <= comm->workFifoBytes; + if (hasRoom) break; + sched_yield(); } - comm->workFifoConsumedLeast = consumedLeast; - - hasRoom = (desiredProduced - comm->workFifoConsumedLeast) <= comm->workFifoBytes; - if (hasRoom) break; - sched_yield(); } + return ncclSuccess; } namespace { @@ -1106,11 +1122,14 @@ namespace { struct uploadWork_cleanup_t* me = (struct uploadWork_cleanup_t*)cb; free(me->hostBuf); CUDACHECK(cudaEventDestroy(me->base.event)); + free(me); return ncclSuccess; } } static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { + if (plan->isSymColl) return ncclSuccess; + size_t workBytes = plan->workBytes; size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); void* fifoBufHost; @@ -1127,7 +1146,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla fifoBufHost = comm->workFifoBuf; fifoCursor = comm->workFifoProduced; fifoMask = comm->workFifoBytes-1; - waitWorkFifoAvailable(comm, fifoCursor + workBytes); + NCCLCHECK(waitWorkFifoAvailable(comm, fifoCursor + workBytes)); plan->kernelArgs->workBuf = comm->workFifoBufDev; break; case ncclDevWorkStorageTypePersistent: @@ -1208,7 +1227,7 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla ncclIntruQueueEnqueue(&comm->eventCallbackQueue, (struct ncclCommEventCallback *)cleanup); NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false), result, fail); - NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), result, fail); + NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm, /*waitSome=*/false), result, fail); finish_scope: if (mode != cudaStreamCaptureModeRelaxed) (void)cudaThreadExchangeStreamCaptureMode(&mode); @@ -1226,6 +1245,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* uint64_t collOpCount = comm->sharedRes->collOpCount; uint64_t p2pOpBump[MAXCHANNELS] = {/*0...*/}; // Advance comm's collOpCount by number of colls in this plan. + int hasp2p = 0; comm->sharedRes->collOpCount += plan->collOpCount; comm->collOpCount += plan->collOpCount; @@ -1244,6 +1264,7 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* // remember last value to compute max. p2pOpBump[op->channelId] = (oldId>>1) + 1; // +1 to ensure next plan doesn't collide op->opCount = (comm->sharedRes->p2pOpCount[op->channelId]<<1) + oldId; + hasp2p = 1; } else { // coll op->opCount = (collOpCount<<1) + oldId; } @@ -1253,9 +1274,11 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* op = op->enqNext; } - for (int c=0; c < MAXCHANNELS; c++) { - // Advance channel's p2pOpCount by number of p2p's in this plan channel. - comm->sharedRes->p2pOpCount[c] += p2pOpBump[c]; + if (hasp2p) { + for (int c=0; c < MAXCHANNELS; c++) { + // Advance channel's p2pOpCount by number of p2p's in this plan channel. 
+ comm->sharedRes->p2pOpCount[c] += p2pOpBump[c]; + } } return ncclSuccess; } @@ -1263,8 +1286,10 @@ static ncclResult_t uploadProxyOps(struct ncclComm* comm, struct ncclKernelPlan* static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelPlan* plan) { NCCLCHECK(ncclProfilerStartGroupEvent(plan)); NCCLCHECK(ncclProfilerStartTaskEvents(plan)); - NCCLCHECK(uploadProxyOps(comm, plan)); - NCCLCHECK(ncclProxyStart(comm)); + if (ncclIntruQueueHead(&plan->proxyOpQueue)) { + NCCLCHECK(uploadProxyOps(comm, plan)); + NCCLCHECK(ncclProxyStart(comm)); + } NCCLCHECK(ncclProfilerStopTaskEvents(plan)); NCCLCHECK(ncclProfilerStopGroupEvent(plan)); if (!plan->persistent) { @@ -1281,7 +1306,6 @@ static void CUDART_CB hostStreamPlanCallback(void *plan_) { if (result != ncclSuccess) { WARN("hostStreamPlanCallback() failed : %s", ncclGetErrorString(result)); } - if (!plan->persistent) ncclAtomicRefCountDecrement(&plan->comm->sharedRes->noncapturedRefs); return; } @@ -1357,9 +1381,8 @@ namespace { static ncclResult_t getImplicitOrder(enum ncclImplicitOrder *mode, bool capturing, int driver=-1) { if (ncclParamLaunchOrderImplicit()) { - // Due to an unresolved bug in CUDA ncclImplicitOrderLaunch is not supported in graphs - if (capturing) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } if (driver < 0) { NCCLCHECK(ncclCudaDriverVersion(&driver)); } + if (capturing && driver < 12090) { *mode = ncclImplicitOrderSerial; return ncclSuccess; } *mode = 12030 <= std::min(CUDART_VERSION, driver) ? ncclImplicitOrderLaunch : ncclImplicitOrderSerial; return ncclSuccess; } @@ -1386,26 +1409,51 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { plan->workStorageType = persistent ? ncclDevWorkStorageTypePersistent : ncclDevWorkStorageTypeFifo; - struct ncclKernelPlanBudget budget; - budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); - // Non-persistent kernels fill up at most half of our fifo per kernel. - budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; - - // Drain coll tasks first. This is essential since we partition tasks based - // on the work budget and p2p work isn't collective. If we were to drain p2p - // first, the place where we cut the kernel could vary by rank which would - // cause the "shortest channel first" channel picker to have divergent results. - if (planner->nTasksColl != 0) { - NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); - } - // And only drain p2p tasks once colls are depleted. 
- if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { - NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); - } - finishPlan(comm, plan); - if (plan->workBytes != 0) { + if (planner->isSymColl) { + plan->workStorageType = ncclDevWorkStorageTypeArgs; + + struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); + plan->isSymColl = true; + plan->kernelFn = ncclSymGetKernelPtr((ncclSymKernelId)task->devFuncId, task->opDev.op, task->datatype); + plan->threadPerBlock = task->nWarps*WARP_SIZE; + plan->channelMask = uint64_t(-1) >> (64-task->nMaxChannels); + + plan->kernelArgsSize = sizeof(struct ncclSymDevArgs); + plan->kernelSymArgs = ncclMemoryStackAlloc(&comm->memScoped); + plan->kernelSymArgs->comm = comm->symDevComm; + plan->kernelSymArgs->rootRank = task->root; + plan->kernelSymArgs->redOpArg = task->opDev.scalarArg; + plan->kernelSymArgs->nElts = task->count; + plan->kernelSymArgs->input = (char*)task->sendbuff; + plan->kernelSymArgs->output = (char*)task->recvbuff; + + planner->nTasksColl -= 1; ncclIntruQueueEnqueue(&planner->planQueue, plan); + INFO(NCCL_TUNING, "%s [Symmetric]: %ld Bytes -> Kernel %s nchannels %d nthreads %d", + ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclSymKernelIdToString(task->devFuncId), task->nMaxChannels, plan->threadPerBlock); nPlans += 1; + } else { + struct ncclKernelPlanBudget budget; + budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); + // Non-persistent kernels fill up at most half of our fifo per kernel. + budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; + + // Drain coll tasks first. This is essential since we partition tasks based + // on the work budget and p2p work isn't collective. If we were to drain p2p + // first, the place where we cut the kernel could vary by rank which would + // cause the "shortest channel first" channel picker to have divergent results. + if (planner->nTasksColl != 0) { + NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); + } + // And only drain p2p tasks once colls are depleted. + if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { + NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); + } + finishPlan(comm, plan); + if (plan->workBytes != 0) { + ncclIntruQueueEnqueue(&planner->planQueue, plan); + nPlans += 1; + } } } while (planner->nTasksColl + planner->nTasksP2p != 0); @@ -1428,6 +1476,7 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { bool capturing = ncclCudaGraphValid(planner->capturingGraph); enum ncclImplicitOrder implicitOrder; + cudaError_t status = cudaSuccess; NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing), result, failure); if (implicitOrder != ncclImplicitOrderNone) { @@ -1439,7 +1488,8 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { NCCLCHECKGOTO(ncclStreamWaitStream(launchStream, launchOrder, comm->sharedRes->scratchEvent), result, failure); } - if (persistent || comm->sharedRes->persistentRefs != 0 || ncclCudaLaunchBlocking || __atomic_load_n(&comm->sharedRes->noncapturedRefs, __ATOMIC_ACQUIRE)) { + if (!persistent && comm->sharedRes->persistentRefs) status = cudaEventQuery(comm->sharedRes->hostStream.serialEvent); + if (persistent || ncclCudaLaunchBlocking || status == cudaErrorNotReady) { // We have to launch host tasks to push proxy args. We are careful to only // do this if necessary since host tasks impose a high performance cost in CUDA. 
bool acquired = false; @@ -1450,7 +1500,6 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { acquired = true; NCCLCHECKGOTO(ncclStrongStreamAcquire(planner->capturingGraph, &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), result, failure); } - if (!persistent) ncclAtomicRefCountIncrement(&comm->sharedRes->noncapturedRefs); plan->isHostCbEnq = true; CUDACHECKGOTO(cudaLaunchHostFunc(hostStream, hostStreamPlanCallback, plan), result, failure); } @@ -1485,6 +1534,8 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif +NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0); + ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { ncclResult_t ret = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; @@ -1512,7 +1563,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan unsigned int clusterSize = (compCap >= 90) ? comm->config.cgaClusterSize : 0; CUlaunchConfig launchConfig = {0}; - CUlaunchAttribute launchAttrs[4] = {}; + CUlaunchAttribute launchAttrs[6] = {}; int attrs = 0; /* Cooperative Group Array (CGA) * On sm90 and later we have an extra level of hierarchy where we @@ -1549,6 +1600,18 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchAttrs[attrs].value.launchCompletionEvent.flags = 0; attrs++; } + if (comm->planner.isSymColl && compCap >= 90 && driverVersion >= 12030) { + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; + launchAttrs[attrs].value.programmaticStreamSerializationAllowed = 1; + attrs++; + } + #endif + #if CUDART_VERSION >= 13000 + if (compCap >= 90 && driverVersion >= 13000) { + launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING; + launchAttrs[attrs].value.nvlinkUtilCentricScheduling = ncclParamNvlinkUtilCentricSchedEnable(); + attrs++; + } #endif launchConfig.gridDimX = grid.x; launchConfig.gridDimY = grid.y; @@ -1560,7 +1623,6 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan launchConfig.attrs = launchAttrs; launchConfig.numAttrs = attrs; launchConfig.hStream = launchStream; - CUCHECKGOTO(cuLaunchKernelEx(&launchConfig, fn, nullptr, extra), ret, do_return); #endif } else { @@ -1573,21 +1635,30 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan } ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan) { - if (!(plan->persistent || ncclCudaLaunchBlocking || plan->isHostCbEnq)) { - // We are not using the host stream for proxy ops and reclaimation submission. + if (!plan->isHostCbEnq) { + // we are not using the host stream for proxy ops and reclaimation submission, call + // hostStreamPlanTask directly NCCLCHECK(hostStreamPlanTask(comm, plan)); - } else { - // We are using the host stream for proxy ops and reclaimation submission. - // Only plans with proxy ops have a callback pushed by ncclLaunchPrepare. - // Since non-persistent plans also require reclaimation, we have to do it - // here. 
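The ncclLaunchKernel hunk above grows the launch-attribute array and conditionally appends programmatic stream serialization (symmetric collectives on sm90+, CUDA >= 12.3) and, on CUDA 13, the NVLink-utilization-centric scheduling attribute gated by NCCL_NVLINK_UTIL_CENTRIC_SCHED_ENABLE. A condensed sketch of feeding such attributes to cuLaunchKernelEx follows; it only uses attributes available in CUDA 12.x, and launchWithAttrs is an illustrative name, not part of the patch.

#include <cuda.h>

static CUresult launchWithAttrs(CUfunction fn, CUstream stream, void** kernelArgs,
                                unsigned gridX, unsigned blockX, unsigned clusterSize) {
  CUlaunchAttribute attrs[2];
  int nAttrs = 0;
  if (clusterSize) {
    // Cooperative Group Array size (sm90+).
    attrs[nAttrs].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
    attrs[nAttrs].value.clusterDim.x = clusterSize;
    attrs[nAttrs].value.clusterDim.y = 1;
    attrs[nAttrs].value.clusterDim.z = 1;
    nAttrs++;
  }
  // Allow dependent work to start once the kernel signals readiness.
  attrs[nAttrs].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION;
  attrs[nAttrs].value.programmaticStreamSerializationAllowed = 1;
  nAttrs++;
  CUlaunchConfig cfg = {};
  cfg.gridDimX = gridX;  cfg.gridDimY = 1; cfg.gridDimZ = 1;
  cfg.blockDimX = blockX; cfg.blockDimY = 1; cfg.blockDimZ = 1;
  cfg.attrs = attrs;
  cfg.numAttrs = nAttrs;
  cfg.hStream = stream;
  return cuLaunchKernelEx(&cfg, fn, kernelArgs, /*extra=*/nullptr);
}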
- if (!plan->persistent && !plan->hasProxyOps) { - ncclIntruQueueMpscEnqueue(&comm->callbackQueue, &plan->reclaimer); - } } return ncclSuccess; } +namespace { + struct KernelFinishCallback { + struct ncclCommEventCallback base; + uint32_t workFifoConsumed; + }; + ncclResult_t KernelFinishCallback_fn( + struct ncclComm* comm, struct ncclCommEventCallback* cb + ) { + struct KernelFinishCallback* me = (struct KernelFinishCallback*)cb; + comm->workFifoConsumed = me->workFifoConsumed; + CUDACHECK(cudaEventDestroy(me->base.event)); + free(me); + return ncclSuccess; + } +} + ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { struct ncclKernelPlanner* planner = &comm->planner; if (!ncclIntruQueueEmpty(&planner->planQueue)) { @@ -1597,7 +1668,21 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { cudaStream_t launchStream = planner->streams->stream; // First user stream gets launch cudaStream_t deviceStream, launchOrder; - CUDACHECK(cudaEventRecord(comm->sharedRes->scratchEvent, launchStream)); + cudaEvent_t finishedEvent = comm->sharedRes->scratchEvent; + CUDACHECK(cudaEventRecord(finishedEvent, launchStream)); + + if (comm->workFifoProduced - comm->workFifoProducedLastRecorded > comm->workFifoBytes/8) { + comm->workFifoProducedLastRecorded = comm->workFifoProduced; + struct KernelFinishCallback* cb; + NCCLCHECK(ncclCalloc(&cb, 1)); + cb->base.event = finishedEvent; + cb->base.fn = KernelFinishCallback_fn; + cb->workFifoConsumed = comm->workFifoProduced; + ncclIntruQueueEnqueue(&comm->eventCallbackQueue, &cb->base); + // We just stole scratchEvent so must create a new one. + CUDACHECK(cudaEventCreateWithFlags(&comm->sharedRes->scratchEvent, cudaEventDisableTiming)); + } + // deviceStream waits on userStream[0] NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream)); @@ -1606,13 +1691,13 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { // on launchStream as a fast-forward. When building CUDA graphs fast forwards should // be handled specially so as not to create graphs with a blowup in the number of edges. // So we could do this: - // CUDACHECK(cudaStreamWaitEvent(deviceStream, comm->sharedRes->scratchEvent, 0)); + // CUDACHECK(cudaStreamWaitEvent(deviceStream, finishedEvent, 0)); // But instead we do: - NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, comm->sharedRes->scratchEvent)); + NCCLCHECK(ncclStreamAdvanceToEvent(planner->capturingGraph, deviceStream, finishedEvent)); // Each userStream[i] waits on userStream[0] for (struct ncclCudaStreamList* l=planner->streams->next; l != nullptr; l = l->next) { - CUDACHECK(cudaStreamWaitEvent(l->stream, comm->sharedRes->scratchEvent, 0)); + CUDACHECK(cudaStreamWaitEvent(l->stream, finishedEvent, 0)); } bool capturing = ncclCudaGraphValid(planner->capturingGraph); enum ncclImplicitOrder implicitOrder; @@ -1623,7 +1708,7 @@ ncclResult_t ncclLaunchFinish(struct ncclComm* comm) { // Incorporate launch event into per-device (context) launch order. NCCLCHECK(ncclStrongStreamAcquiredWorkStream(planner->capturingGraph, &comm->context->launchOrder, concurrent, &launchOrder)); // If we don't have launch events (requires CUDA 12.3) then just use completion event (serialize execution). - CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? comm->sharedRes->launchEvent : comm->sharedRes->scratchEvent)); + CUDACHECK(cudaStreamWaitEvent(launchOrder, implicitOrder == ncclImplicitOrderLaunch ? 
comm->sharedRes->launchEvent : finishedEvent)); // Release launchOrder as acquired in ncclLaunchPrepare() NCCLCHECK(ncclStrongStreamRelease(planner->capturingGraph, &comm->context->launchOrder, concurrent)); } @@ -1645,7 +1730,7 @@ static inline ncclResult_t getCollNetSupport( if (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv) { netOp = ncclSum; } - *collNetSupport = comm->collNetSupport; + *collNetSupport = comm->config.collnetEnable; switch (info->func) { case ncclFuncAllReduce: case ncclFuncReduce: @@ -1683,10 +1768,8 @@ static ncclResult_t updateCollCostTable( if ((a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue; // CollNetDirect is only supported for up to 8 local GPUs if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue; - if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && nvlsSupport != 1 && info->func != ncclFuncAllGather) continue; + if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && (!nvlsSupport || (info->func != ncclFuncAllReduce && comm->localRanks > NCCL_MAX_NVLS_ARITY))) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; - /* now we only support single-node NVLS allgather and reducescatter */ - if (a == NCCL_ALGO_NVLS && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) && (comm->nNodes > 1 || comm->nRanks > NCCL_MAX_NVLS_ARITY)) continue; /* Tree reduceScatter doesn't support scaling yet */ if (a == NCCL_ALGO_PAT && info->func == ncclFuncReduceScatter && (info->opDev.op == ncclDevPreMulSum || info->opDev.op == ncclDevSumPostDiv)) continue; @@ -1801,7 +1884,14 @@ static ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* info, int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo/* = NULL*/ ) { - size_t nBytes = ncclTypeSize(info->datatype)*ncclFuncMaxSendRecvCount(info->func, comm->nRanks, info->count); + size_t elementSize = ncclTypeSize(info->datatype); + size_t nBytes = elementSize * ncclFuncMaxSendRecvCount(info->func, comm->nRanks, info->count); + struct ncclReg* regSendBuf = NULL; + struct ncclReg* regRecvBuf = NULL; + int regBuff; + bool isSendValid, isRecvValid; + size_t sendbuffSize = elementSize * ncclFuncSendCount(info->func, comm->nRanks, info->count); + size_t recvbuffSize = elementSize * ncclFuncRecvCount(info->func, comm->nRanks, info->count); info->algorithm = NCCL_ALGO_UNDEF; info->protocol = NCCL_PROTO_UNDEF; int nMaxChannels = 0; @@ -1809,20 +1899,42 @@ static ncclResult_t getAlgoInfo( initCollCostTable((float **)collCostTable); NCCLCHECK(updateCollCostTable(comm, info, nBytes, collNetSupport, nvlsSupport, numPipeOps, (float **)collCostTable)); if (comm->tuner != NULL) { - size_t elementSize = ncclTypeSize(info->datatype); - size_t sendbuffSize = elementSize*ncclFuncSendCount(info->func, comm->nRanks, info->count); - size_t recvbuffSize = elementSize*ncclFuncRecvCount(info->func, comm->nRanks, info->count); - struct ncclReg* regSendBuf; - struct ncclReg* regRecvBuf; NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, ®SendBuf)); NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, ®RecvBuf)); - int regBuff = ((regSendBuf && regRecvBuf) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister())); + NCCLCHECK(ncclRegLocalIsValid(regSendBuf, &isSendValid)); + NCCLCHECK(ncclRegLocalIsValid(regRecvBuf, &isRecvValid)); + regBuff = (regSendBuf && regRecvBuf && isSendValid && isRecvValid) || 
(ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister()); NCCLCHECK(comm->tuner->getCollInfo( comm->tunerContext, info->func, nBytes, numPipeOps, (float **)collCostTable, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, regBuff, &nMaxChannels)); + NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo)); + } else { + NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo)); + // NCCL_CTA_POLICY_EFFICIENCY requires user (non-symmetric) buffer registration (currently unsupported with MNNVL) + if (comm->config.CTAPolicy == NCCL_CTA_POLICY_EFFICIENCY && ncclGetEnv("NCCL_ALGO") == NULL && ncclGetEnv("NCCL_PROTO") == NULL && !comm->MNNVL) { + // make algorithm selection based on buffer registration + // there can be other specialized policies for algorithms and protocols pickup in the future + NCCLCHECK(ncclRegFind(comm, info->sendbuff, sendbuffSize, ®SendBuf)); + NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, ®RecvBuf)); + NCCLCHECK(ncclRegLocalIsValid(regSendBuf, &isSendValid)); + NCCLCHECK(ncclRegLocalIsValid(regRecvBuf, &isRecvValid)); + regBuff = (regSendBuf && regRecvBuf && isSendValid && isRecvValid) || (ncclCudaGraphValid(comm->planner.capturingGraph) && ncclParamGraphRegister()); + if (regBuff && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter)) { + if ((comm->nNodes > 1 && collNetSupport && nvlsSupport) || (comm->nNodes == 1 && nvlsSupport)) { + int recChannels; + NCCLCHECK(ncclNvlsRegResourcesQuery(comm, info, &recChannels)); + if (recChannels <= info->nMaxChannels) { + info->algorithm = NCCL_ALGO_NVLS; + info->protocol = NCCL_PROTO_SIMPLE; + info->nMaxChannels = recChannels; + info->nWarps = comm->maxThreads[info->algorithm][info->protocol] / WARP_SIZE; + } + } + } + } } - NCCLCHECK(topoGetAlgoInfo(comm, info, nBytes, (float **)collCostTable, simInfo)); + info->nMaxChannels = nMaxChannels == 0 ? info->nMaxChannels : nMaxChannels; return ncclSuccess; } @@ -1892,16 +2004,20 @@ static ncclResult_t calcCollChunking( while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth * 8 && chunkSize > 65536) chunkSize /= 2; while (nBytes / (nChannels * chunkSize) < comm->channels[0].collnetChain.depth && chunkSize > 32768) chunkSize /= 2; } else if (info->algorithm == NCCL_ALGO_NVLS) { - int maxChunkSize = comm->nvlsChunkSize; - if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; - if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; - // Use uint64_t so that concurrentOps*chunkSize*X does not overflow. - // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits. 
- // coverity[overflow_before_widen] - uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; - if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; - if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; - if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; + if ((info->regBufType & NCCL_NVLS_REG_BUFFER) && (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter)) { + chunkSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; + } else { + int maxChunkSize = comm->nvlsChunkSize; + if (comm->nNodes > 1 && comm->bandwidths[ncclFuncAllReduce][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] < 150) maxChunkSize = 32768; + if (chunkSize > maxChunkSize) chunkSize = maxChunkSize; + // Use uint64_t so that concurrentOps*chunkSize*X does not overflow. + // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits. + // coverity[overflow_before_widen] + uint64_t concurrentOps = nChannels * comm->channels[0].nvls.nHeads; + if ((nBytes < (64 * (concurrentOps * chunkSize))) && (chunkSize > 65536)) chunkSize = 65536; + if ((nBytes < (8 * (concurrentOps * chunkSize))) && (chunkSize > 32768)) chunkSize = 32768; + if ((nBytes < (2 * (concurrentOps * chunkSize))) && (chunkSize > 16384)) chunkSize = 16384; + } } else if (info->algorithm == NCCL_ALGO_NVLS_TREE) { // Use uint64_t so that concurrentOps*chunkSize*X does not overflow. // However, nChannels * comm->channels[0].nvls.nHeads should easily fit in 32 bits. @@ -2045,7 +2161,7 @@ static ncclResult_t calcCollChunking( proxyOp->reg = 0; } - if (pattern == ncclPatternCollnetDirect) { + if (pattern == ncclPatternCollnetDirect || pattern == ncclPatternNvls) { proxyOp->specifics.collnetDirect.nNodes = comm->nNodes; proxyOp->specifics.collnetDirect.node = comm->node; if (info->func == ncclFuncAllGather || info->func == ncclFuncReduceScatter) { @@ -2168,7 +2284,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { bool isSendNotRecv = info->coll == ncclFuncSend; // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. - ncclGroupCommJoin(info->comm); + ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskP2p, &comm->memPermanent); p2p->func = info->coll; p2p->buff = (void*)info->recvbuff; @@ -2235,7 +2351,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { return ncclSuccess; } else { // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. - ncclGroupCommJoin(info->comm); + ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); t->func = info->coll; t->sendbuff = info->sendbuff; diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 76b508c2d..152739b0c 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -258,7 +258,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead channel->nvls.out = -1; // NVLS+SHARP not yet implemented. 
channel->nvls.headRank = headRank; channel->nvls.treeUp = channel->nvls.treeDown[0] = channel->nvls.treeDown[1] = channel->nvls.treeDown[2] = -1; - if (comm->collNetSupport && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; + if (comm->config.collnetEnable && channel->nvls.headRank != -1) channel->nvls.out = comm->nRanks; } if (comm->nNodes == 1) return ncclSuccess; @@ -330,7 +330,7 @@ int ncclMinNchannels() { if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels(); if (minNchannels > MAXCHANNELS) { - WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS); + INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS); minNchannels = MAXCHANNELS; } if (minNchannels < 0) minNchannels = 0; @@ -346,7 +346,7 @@ int ncclMaxNchannels() { maxNchannels = std::min(maxNchannels, ncclDevMaxChannelsForArgsBytes(ncclParamWorkArgsBytes())); if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS; if (maxNchannels < 1) { - WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels); + INFO(NCCL_GRAPH|NCCL_ENV, "User asked for a maximum of %d channels, setting it to 1", maxNchannels); maxNchannels = 1; } return maxNchannels; @@ -379,7 +379,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa int nNodes = comm->nNodes; int nChannels = comm->nChannels; int minHeadNum = INT_MAX; - int shared = parent && parent->nvlsSupport && parent->config.splitShare; + int shared = parent && parent->nvlsSupport && parent->shareResources; NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail); @@ -452,7 +452,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2); // Setup CollNet - if (comm->collNetSupport == 1) { + if (comm->config.collnetEnable) { struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN]; // Add more channels to saturate intra-node bandwidth, except the 1 PPN case if (collNetChainGraph->bwIntra > collNetChainGraph->bwInter && comm->nRanks > comm->nNodes) { diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 998371247..bc5cc755e 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -214,7 +214,7 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE const char* str = ncclGetEnv(disableEnv); if (str) { int disable = strtol(str, NULL, 0); - if (disable == 1) l = 0; + if (disable == 1) l = PATH_LOC; if (l >= 0) INFO(NCCL_ALL, "%s set by environment to %d", disableEnv, disable); } } @@ -247,7 +247,18 @@ ncclResult_t ncclGetLevel(int* level, const char* disableEnv, const char* levelE NCCL_PARAM(IgnoreDisabledP2p, "IGNORE_DISABLED_P2P", 0); -int ncclTopoUserP2pLevel = -1; +static int ncclTopoUserP2pLevel = -1; // Initially "uninitialized". When initialized but unset, changes to -2. + +// Gets the user-provided value of NCCL_P2P_LEVEL/NCCL_P2P_DISABLE. If the user did not provide any, the value +// of the "level" argument is left unchanged. 
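The helper documented by the comment above caches the user override in a file-local static, using -1 for "not read yet" and -2 for "read, but unset", so the environment is parsed at most once and the caller's default is preserved when nothing was set. A standalone sketch of the same pattern (parsing deliberately simplified; it ignores NCCL_P2P_DISABLE and the path-name keywords the real code accepts):

#include <stdlib.h>

static int userP2pLevel = -1;  // -1: not read yet, -2: read but unset

static void readUserP2pLevel(int* level) {
  if (userP2pLevel == -1) {
    const char* env = getenv("NCCL_P2P_LEVEL");
    userP2pLevel = env ? atoi(env) : -2;
  }
  if (userP2pLevel != -2) *level = userP2pLevel;  // otherwise leave the caller's default untouched
}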
+ncclResult_t ncclGetUserP2pLevel(int* level) { + if (ncclTopoUserP2pLevel == -1) + NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); + if (ncclTopoUserP2pLevel != -2) + *level = ncclTopoUserP2pLevel; + return ncclSuccess; +} + ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank) { int mnnvl = 0; @@ -275,9 +286,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst // Get GPUs from topology int g1, g2; - NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1)); + NCCLCHECK(ncclTopoRankToIndex(system, rank1, &g1, /*showWarn=*/true)); struct ncclTopoNode* gpu1 = system->nodes[GPU].nodes+g1; - if (ncclTopoRankToIndex(system, rank2, &g2) == ncclInternalError) { + if (ncclTopoRankToIndex(system, rank2, &g2, /*showWarn=*/false) == ncclInternalError) { // GPU not found, we can't use p2p. return ncclSuccess; } @@ -302,15 +313,8 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst if ((arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD) && system->nodes[GPU].count <= 2) p2pLevel = PATH_SYS; // User override - if (ncclTopoUserP2pLevel == -1) - NCCLCHECK(ncclGetLevel(&ncclTopoUserP2pLevel, "NCCL_P2P_DISABLE", "NCCL_P2P_LEVEL")); - if (ncclTopoUserP2pLevel != -2) { - p2pLevel = ncclTopoUserP2pLevel; - goto compare; - } + NCCLCHECK(ncclGetUserP2pLevel(&p2pLevel)); - -compare: // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; @@ -378,7 +382,8 @@ NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2); int ncclTopoUserGdrLevel = -1; const char* ncclTopoGdrModeStr[ncclTopoGdrModeNum] = { "Disabled", "Default", "PCI" }; -NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 0); +// On C2C platforms use GDRDMA on NICs which are connected to the CPUs +NCCL_PARAM(NetGdrC2c, "NET_GDR_C2C", 1); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t netId, int read, enum ncclTopoGdrMode* gdrMode) { *gdrMode = ncclTopoGdrModeDisable; @@ -387,7 +392,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); struct ncclTopoNode* net = system->nodes[NET].nodes+n; - NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g, /*showWarn=*/true)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Check that both the NIC and GPUs support it @@ -423,29 +428,29 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int rank, int64_t n // In case of PXN, use the intermediate GPU distance instead int proxyRank; NCCLCHECK(ncclTopoGetIntermediateRank(system, gpu->gpu.rank, netId, &proxyRank)); - NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, proxyRank, &g, /*showWarn=*/true)); gpu = system->nodes[GPU].nodes+g; distance = gpu->paths[NET][n].type; } - int c; - NCCLCHECK(ncclGetLocalCpu(system, g, &c)); - if (ncclParamNetGdrC2c() && distance == PATH_PHB && gpu->paths[CPU][c].type == PATH_C2C) { - // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs - INFO(NCCL_NET, "GPU %d / HCA %lx connected to CPU %d via C2C link", rank, netId, c); + // On C2C platforms we can still use GDRDMA on NICs connected to the CPUs + if (ncclParamNetGdrC2c() && distance == PATH_P2C) { + INFO(NCCL_GRAPH | NCCL_NET, "GPU %d / HCA %lx connected via C2C link", rank, netId); distance = PATH_C2C; } if 
(distance > netGdrLevel) { - INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); + INFO(NCCL_GRAPH|NCCL_NET,"GPU Direct RDMA Disabled for GPU %d / HCA %lx (distance %d > %d)", rank, netId, distance, netGdrLevel); return ncclSuccess; } // Force PCIe mapping if path goes through PCI on a C2C system + int c; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); if (gpu->paths[CPU][c].type == PATH_C2C && distance != PATH_C2C) *gdrMode = ncclTopoGdrModePci; else *gdrMode = ncclTopoGdrModeDefault; - INFO(NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]); + INFO(NCCL_GRAPH|NCCL_NET,"GPU Direct RDMA Enabled for GPU %d / HCA %lx (distance %d <= %d), read %d mode %s", rank, netId, distance, netGdrLevel, read, ncclTopoGdrModeStr[*gdrMode]); return ncclSuccess; } @@ -480,7 +485,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclComm* comm, int64_t netId, int netDev, if (props.forceFlush == 1 || ncclParamNetForceFlush()) return ncclSuccess; int g; struct ncclTopoSystem* system = comm->topo; - NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g, /*showWarn=*/true)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; // Flush is required on Ampere and earlier if (gpu->gpu.cudaCompCap >= 90) *flush = 0; @@ -506,8 +511,8 @@ ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int rank1, int rank *net = 1; // First check the current GPU-to-GPU speed. int g1, g2; - if (ncclTopoRankToIndex(system, rank1, &g1) != ncclSuccess || - ncclTopoRankToIndex(system, rank2, &g2) != ncclSuccess) { + if (ncclTopoRankToIndex(system, rank1, &g1, /*showWarn=*/false) != ncclSuccess || + ncclTopoRankToIndex(system, rank2, &g2, /*showWarn=*/false) != ncclSuccess) { return ncclSuccess; } @@ -533,7 +538,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank // Get GPU and NET int n, g; NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &n)); - NCCLCHECK(ncclTopoRankToIndex(system, rank, &g)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &g, /*showWarn=*/true)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* path = gpu->paths[NET]+n; if (path->type == PATH_PXN) { @@ -601,6 +606,8 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, return ncclSuccess; } +NCCL_PARAM(PxnC2c, "PXN_C2C", 0); + ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) { // Precompute paths between GPUs/NICs. @@ -659,6 +666,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } } } + // update the GPU -> NIC path in the case of C2C + PHB + for (int n = 0; n < system->nodes[NET].count; n++) { + struct ncclTopoNode* netNode = system->nodes[NET].nodes + n; + for (int g = 0; g < system->nodes[GPU].count; g++) { + struct ncclTopoNode* gpuNode = system->nodes[GPU].nodes + g; + int c; + NCCLCHECK(ncclGetLocalCpu(system, g, &c)); + if (c == -1) continue; + if (gpuNode->paths[NET][n].type == PATH_PHB && gpuNode->paths[CPU][c].type == PATH_C2C) { + gpuNode->paths[NET][n].type = PATH_P2C; + netNode->paths[GPU][g].type = PATH_P2C; + } + } + } // Update paths for NICs (no GPU Direct, PXN, ...) for (int n=0; nnodes[NET].count; n++) { @@ -674,15 +695,20 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // PXN = PCI + NVLink. 
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex; // Only use PXN for NIC n if remote GPU p ... - if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI - peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink - NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && // Is on the same node as us - (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC - gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU - // We can use that GPU as relay to communicate with that NIC. - // Only enabling it in the GPU->NIC direction for now to favor - // receiving locally and sending remotely (consistent with net.cc) - NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n)); + if (/* (1) is either connected to the NIC with PXB*/ + (peerNode->paths[NET][n].type <= PATH_PXB || + /* or with P2C and PxN over C2C is enabled */ + (ncclParamPxnC2c() && peerNode->paths[NET][n].type == PATH_P2C)) && + /* and (2) is connected to us through NVLink */ + peerNode->paths[GPU][g].type <= PATH_NVL && + /* and (3) is on the same node as us */ + NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && + /* and (4) has either higher bw to that NIC or avoid going through the CPU*/ + (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXB)) + // We can use that GPU as relay to communicate with that NIC. + // Only enabling it in the GPU->NIC direction for now to favor + // receiving locally and sending remotely (consistent with net.cc) + NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n)); } } if (gpu->paths[NET][n].type < PATH_PHB) { @@ -761,7 +787,7 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp int peer; struct ncclTopoSystem* system = comm->topo; struct ncclTopoLinkList* path = NULL; - if (ncclTopoRankToIndex(system, peerRank, &peer) == ncclSuccess) { + if (ncclTopoRankToIndex(system, peerRank, &peer, /*showWarn=*/false) == ncclSuccess) { // Same rank if (g == peer) { *nChannels = -1; diff --git a/src/graph/search.cc b/src/graph/search.cc index 15a01243f..9d8ad3ff8 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -137,6 +137,7 @@ static ncclResult_t ncclTopoFollowPath(struct ncclTopoSystem* system, struct ncc float bw = intra ? graph->bwIntra : graph->bwInter; int type = intra ? 
graph->typeIntra : graph->typeInter; + if (path->type >= PATH_DIS) return ncclSuccess; if (mult == 1 && (path->type > type)) return ncclSuccess; if (mult == 1 && (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE || graph->pattern == NCCL_TOPO_PATTERN_TREE || @@ -328,8 +329,7 @@ ncclResult_t ncclTopoReplayGetGpu(struct ncclTopoSystem* system, struct ncclTopo *g = i; return ncclSuccess; } - if (*g == -1) return ncclInternalError; - return ncclSuccess; + return ncclInternalError; } ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time); @@ -658,24 +658,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } // Then try the most local GPUs - float maxBw = 0; - int minHops = 0xfffffff; - struct ncclTopoLinkList* paths = net->paths[GPU]; - for (int g=0; gnodes[GPU].count; g++) { - if (paths[g].bw > maxBw) { - maxBw = paths[g].bw; - minHops = paths[g].count; - } else if (paths[g].bw == maxBw && paths[g].count > 0 && paths[g].count < minHops) { - minHops = paths[g].count; - } - } - if (maxBw >= bw) { - for (int i=0; inodes[GPU].count; i++) { - int g = (graph->nChannels+i)%system->nodes[GPU].count; - if (paths[g].bw == maxBw && paths[g].count == minHops) { - NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, g)); - } - } + int localGpus[NCCL_TOPO_MAX_NODES], localGpuCount, pathType; + NCCLCHECK(ncclTopoGetLocal(system, NET, n, GPU, localGpus, &localGpuCount, &pathType)); + // if no GPUs are connected, skip this net + if (pathType == PATH_DIS) continue; + for (int g = 0; g < localGpuCount; ++g) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpus[g])); } } } @@ -762,6 +750,7 @@ struct kvDict kvDictLinkType[] = { { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "PXN", PATH_PXN }, + { "P2C", PATH_P2C }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -920,8 +909,8 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTRA_SM90 (sizeof(sm90SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) -float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0 }; -float sm100SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 }; +float sm100SpeedArrayInter[] = { 47.9, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) @@ -1060,13 +1049,13 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph int maxIntra = system->nodes[NET].count > 0 ? 
tmpGraph.typeInter : maxTypeIntra; if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; - goto search; + if (tmpGraph.typeIntra < PATH_DIS) goto search; } tmpGraph.typeIntra = minTypeIntra; if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; - goto search; + if (tmpGraph.typeInter < PATH_DIS) goto search; } tmpGraph.typeInter = minTypeInter; @@ -1124,7 +1113,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } if (graph->nChannels == 0 && graph->collNet == 0 && graph->pattern != NCCL_TOPO_PATTERN_NVLS) { - WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern); + INFO(NCCL_GRAPH, "Could not find a path for pattern %d, falling back to simple order", graph->pattern); for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank; graph->inter[0] = graph->inter[1] = 0; graph->bwIntra = graph->bwInter = 0.1; @@ -1248,7 +1237,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG } if (pxnLevel == 1) { int g, n; - NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g)); + NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g, /*showWarn=*/true)); NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n)); struct ncclTopoNode* gpu = comm->topo->nodes[GPU].nodes+g; if (gpu->paths[NET][n].type <= PATH_PXN) { @@ -1260,7 +1249,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG // Check which local GPU corresponds to that NIC and see if we can use PXN. int n, g1, g2; NCCLCHECK(ncclTopoIdToIndex(comm->topo, NET, netId, &n)); - NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1)); + NCCLCHECK(ncclTopoRankToIndex(comm->topo, rank, &g1, /*showWarn=*/true)); NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netId, &g2)); if (g2 != -1) { struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 9499f396d..9fe81bbcd 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -9,12 +9,10 @@ #include "topo.h" #include "comm.h" #include "nvmlwrap.h" -#include "net.h" #include "coll_net.h" #include "transport.h" #include #include -#include "xml.h" #include "cpuset.h" #include "bootstrap.h" @@ -22,8 +20,8 @@ #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; -const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "PHB", "SYS", "NET", "DIS" }; +const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "P2C", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -251,7 +249,7 @@ ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) { pciSwitch->pci.device |= 0xffff; free(subSwIds); // Restart, as system->nodes[PCI].nodes has changed. 
- s = 0; + s = -1; // Will be incremented to 0 in the next loop iteration continue; fail: free(subSwIds); @@ -404,7 +402,9 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s return ncclSuccess; } -struct kvDict kvDictPciClass[] = { { "0x060400", PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; +#define PCI_BRIDGE_DEVICE_CLASS "0x060400" + +struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, @@ -699,6 +699,7 @@ static ncclResult_t xmlInitAttrInt(struct ncclXmlNode* node, const char* attrNam if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + node->attrs[index].key[MAX_STR_LEN] = '\0'; snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); } return ncclSuccess; @@ -709,6 +710,7 @@ static ncclResult_t xmlInitAttrUint64(struct ncclXmlNode* node, const char* attr if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + node->attrs[index].key[MAX_STR_LEN] = '\0'; snprintf(node->attrs[index].value, MAX_STR_LEN, "0x%lx", value); } return ncclSuccess; @@ -719,6 +721,7 @@ static ncclResult_t xmlInitAttrFloat(struct ncclXmlNode* node, const char* attrN if (index == -1) { index = node->nAttrs++; strncpy(node->attrs[index].key, attrName, MAX_STR_LEN); + node->attrs[index].key[MAX_STR_LEN] = '\0'; snprintf(node->attrs[index].value, MAX_STR_LEN, "%f", value); } return ncclSuccess; @@ -799,6 +802,17 @@ typedef struct xmlNodeStack { } xmlNodeStack; +ncclResult_t ncclFindFirstPciParent(ncclXmlNode** parent) { + ncclXmlNode* newParent = *parent; + while (strcmp(newParent->name, "pci") != 0) { + newParent = newParent->parent; + if (newParent == nullptr) return ncclSuccess; + if (strcmp(newParent->name, "system") == 0) return ncclSuccess; + } + *parent = newParent; + return ncclSuccess; +} + // 1. Find the common parent xmlNode between the given set of nodes ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXmlNode** parent) { // Track a stack of parents per-net node being merged @@ -897,6 +911,7 @@ ncclResult_t ncclTopoGetPath(ncclXmlNode** nodes, int nNodes, int* path, ncclXml } out: + ncclFindFirstPciParent(&common); *parent = common; free(parents); return ncclSuccess; @@ -960,13 +975,19 @@ ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** par return ncclSuccess; } -ncclResult_t ncclTopoMakeVnic(ncclComm_t comm, struct ncclXml* xml, ncclNetVDeviceProps_t* vProps, -struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, ncclNetVDeviceProps_t* vProps, +struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { WARN("TOPO/NET : Tried to merge too many NICs. 
%d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC); return ncclInternalError; } + // Don't make vNics of size 1 + if (vProps->ndevs == 1) { + TRACE(NCCL_GRAPH, "TOPO/NET : Skipping vNic of size 1"); + return ncclSuccess; + } + // Trigger the merge, then get the new device's properties int vDevIndex = 0; ncclResult_t ret = makeVDevice(&vDevIndex, vProps); @@ -976,11 +997,18 @@ struct ncclXmlNode** physNetNodes, struct ncclXmlNode** netNode, ncclResult_t (* return ret; } + // Mark original NICs as keep="0" in the topology + for (int i = 0; i < vProps->ndevs; i++) { + int dev = vProps->devs[i]; + struct ncclXmlNode* netNode = physNetNodes[dev]; + NCCLCHECK(xmlSetAttrInt(netNode, "keep", 0)); + } + INFO(NCCL_GRAPH, "TOPO/NET : Made vNic %d", vDevIndex); return ncclSuccess; } -ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { ncclResult_t ret = ncclSuccess; INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); char* ncStr; @@ -1018,8 +1046,7 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char goto fail; } - struct ncclXmlNode* netNode; - ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice); if (ret == ncclSuccess) { // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) for (int i = 0; i < vProps.ndevs; i++) { @@ -1041,7 +1068,7 @@ ncclResult_t ncclTopoForceMerge(ncclComm_t comm, struct ncclXml* xml, const char goto exit; } -ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { // Compute the path type between each device int* paths = NULL; ncclResult_t res = ncclSuccess; @@ -1085,8 +1112,7 @@ ncclResult_t ncclTopoAutoMerge(ncclComm_t comm, struct ncclXml* xml, int mergeLe return ncclInternalError; } - struct ncclXmlNode* netNode; - ncclResult_t ret = ncclTopoMakeVnic(comm, xml, &vProps, physNetNodes, &netNode, makeVDevice); + ncclResult_t ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice); // Merging failed. 
// Mark all as unplaced and increase their distance to disconnected (PATH_DIS) @@ -1118,6 +1144,7 @@ struct kvDict nicPathKvList[] = { { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, { "PXN", PATH_PXN }, + { "P2C", PATH_P2C }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -1139,14 +1166,19 @@ ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProper if (path == PATH_LOC) { *parent = NULL; } else if (parent && strcmp((*parent)->name, "pci") == 0) { - // If the common parent is PCI, we must reparent the new NIC under a made up busId - NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); + // Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist + const char* c; + NCCLCHECK(xmlGetAttrStr(*parent, "class", &c)); + if (strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) { + // If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid + NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); + } } TRACE(NCCL_GRAPH, "Selected parent %s with path %d", (*parent)->name, path); return ncclSuccess; } -ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) { +ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) { int* placedDevs = NULL; struct ncclXmlNode** physNetNodes = NULL; if (physicalDevs == 0) return ncclSuccess; @@ -1170,15 +1202,15 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ { // Avoids warnings related to jumping to "out" const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); - const char* forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); + char* forceMerge = (char*) ncclGetEnv("NCCL_NET_FORCE_MERGE"); NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); memset(placedDevs, 0, sizeof(int)*physicalDevs); if (forceMerge) { - NCCLCHECKGOTO(ncclTopoForceMerge(comm, xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + NCCLCHECKGOTO(ncclTopoForceMerge(xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); } } - NCCLCHECKGOTO(ncclTopoAutoMerge(comm, xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + NCCLCHECKGOTO(ncclTopoAutoMerge(xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); out: free(physNetNodes); @@ -1187,7 +1219,7 @@ ncclResult_t ncclTopoMakeVNics(ncclComm_t comm, struct ncclXml* xml, ncclResult_ return res; } -static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int keep, int virtualNics) { +static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int virtualNics, bool dmaBufSupport) { for (int n = startIndex; n < endIndex; n++) { ncclNetProperties_t props; NCCLCHECK(getProperties(n, &props)); @@ -1206,15 +1238,17 @@ static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int star const char* colAttr; NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); - // If 
coll == 0 but the netNode is tagged as coll, don't update the keep value - if (colAttr == NULL || coll != 0 || strcmp(colAttr,"1") != 0) NCCLCHECK(xmlSetAttrInt(netNode, "keep", keep)); + NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); + int dev; + xmlGetAttrIntDefault(netNode, "dev", &dev, -1); + if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET : Changing %s dev index from %d to %d", netName, dev, n); NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency)); NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); - bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (comm->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); // Only set coll if it's not 0 @@ -1230,30 +1264,22 @@ static ncclResult_t ncclTopoPopulateNics(ncclComm_t comm, ncclXml* xml, int star return ncclSuccess; } -struct ncclTopoNetState { - int nVirtualNics; - int nPhysicalNics; - const char* name; -}; - // Calls to network plugin APIs should be protected. This function should be called inside a per-process lock. -static ncclResult_t ncclTopoProcessNet(ncclComm_t comm, ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName) { +ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport) { int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL); if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics)); // Enumerate physical devices - NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 1, 0)); + NCCLCHECK(ncclTopoPopulateNics(xml, 0, state->nPhysicalNics, getProperties, netName, coll, false, dmaBufSupport)); if (!usePhysicalDevices) { if (state->nVirtualNics == -1) { - NCCLCHECK(ncclTopoMakeVNics(comm, xml, makeVDevice, getProperties, state->nPhysicalNics)); + NCCLCHECK(ncclTopoMakeVNics(xml, makeVDevice, getProperties, state->nPhysicalNics)); int nDevs; NCCLCHECK(devices(&nDevs)); state->nVirtualNics = nDevs - state->nPhysicalNics; } - // Remove keep=1 for physical collnets if (state->nVirtualNics > 0) { - NCCLCHECK(ncclTopoPopulateNics(comm, xml, 0, state->nPhysicalNics, getProperties, netName, coll, 0, 0)); // Populate new devices - NCCLCHECK(ncclTopoPopulateNics(comm, xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, 1, 1)); + NCCLCHECK(ncclTopoPopulateNics(xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, true, dmaBufSupport)); } } @@ -1301,6 +1327,15 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Try default XML topology location 
NCCLCHECKGOTO(ncclTopoGetXmlFromFile("/var/run/nvidia-topologyd/virtualTopology.xml", xml, 0), ret, fail); } + // Fixup the cpu's host_hashes. + struct ncclXmlNode* node; + // Update every cpu node's host_hash attribute since those are not + // intended to be preserved from the XML files that have been read. + NCCLCHECKGOTO(xmlFindTag(xml, "cpu", &node), ret, fail); + while (node != nullptr) { + NCCLCHECKGOTO(xmlSetAttrLong(node, "host_hash", getHostHash()), ret, fail); + NCCLCHECKGOTO(xmlFindNextTag(xml, "cpu", node, &node), ret, fail); + } if (xml->maxIndex == 0) { // Create top tag struct ncclXmlNode* top; @@ -1313,7 +1348,6 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Detect only the GPU managed by this process. We'll get any others through XML fusion. char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; NCCLCHECKGOTO(int64ToBusId(comm->peerInfo[comm->rank].busId, busId), ret, fail); - struct ncclXmlNode* node; NCCLCHECKGOTO(ncclTopoFillGpu(xml, busId, &node), ret, fail); if (node) { NCCLCHECKGOTO(xmlSetAttrInt(node, "keep", 1), ret, fail); @@ -1330,12 +1364,12 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy state = NULL; if (collNetSupport(comm)) { NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 1, dumpXmlFile, state, - comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(xml, 1, dumpXmlFile, state, + comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name, comm->dmaBufSupport), ret, fail); } NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(comm, xml, 0, dumpXmlFile, state, - comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name), ret, fail); + NCCLCHECKGOTO(ncclTopoProcessNet(xml, 0, dumpXmlFile, state, + comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail); pthread_mutex_unlock(&netLock); netLockHeld = 0; @@ -1399,7 +1433,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy goto exit; } -static ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, +ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType) { int minType = PATH_DIS; float maxBw = 0; @@ -1452,7 +1486,7 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { int gpu; - NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); int localNets[NCCL_TOPO_MAX_NODES]; int localNetCount; @@ -1517,7 +1551,7 @@ NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity) { struct ncclTopoNode* cpu = NULL, *gpu = NULL; int gpuIndex, cpuIndex; - NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex)); + NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpuIndex, /*showWarn=*/true)); NCCLCHECK(ncclGetLocalCpu(system, 
gpuIndex, &cpuIndex)); gpu = system->nodes[GPU].nodes+gpuIndex; cpu = system->nodes[CPU].nodes+cpuIndex; @@ -1529,8 +1563,8 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #ifdef ENABLE_TRACE { char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); - TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); + TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, + ncclCpusetToRangeStr(&mask, affinityStr, sizeof(affinityStr))); } #endif @@ -1540,8 +1574,8 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu #ifdef ENABLE_TRACE { char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&cpuMask, affinityStr)); - TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, affinityStr); + TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, + ncclCpusetToRangeStr(&cpuMask, affinityStr, sizeof(affinityStr))); } #endif @@ -1558,8 +1592,8 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu // If there is a non empty set, use it to set affinity if (CPU_COUNT(&finalMask)) { char affinityStr[sizeof(cpu_set_t)*2]; - NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); - INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, affinityStr); + INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, + ncclCpusetToRangeStr(&finalMask, affinityStr, sizeof(affinityStr))); } return ncclSuccess; } diff --git a/src/graph/topo.h b/src/graph/topo.h index 921a7f5d6..07ef5e105 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -9,6 +9,8 @@ #include "graph.h" #include "core.h" +#include "xml.h" +#include "net.h" #define LOC_BW 5000.0 #define SM60_NVLINK_BW 18.0 @@ -50,9 +52,10 @@ extern const char* topoNodeTypeStr[]; #define LINK_PCI 4 // Skipping 5 for PATH_PXB // Skipping 6 for PATH_PXN -// Skipping 7 for PATH_PHB -#define LINK_SYS 8 -#define LINK_NET 9 +// Skipping 7 for PATH_P2C +// Skipping 8 for PATH_PHB +#define LINK_SYS 9 +#define LINK_NET 10 extern const char* topoLinkTypeStr[]; // Local (myself) @@ -76,20 +79,23 @@ extern const char* topoLinkTypeStr[]; // Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. 
#define PATH_PXN 6 +// Connection between a GPU and a NIC using the C2C connection to the CPU and the PCIe connection to the NIC +#define PATH_P2C 7 + // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) -#define PATH_PHB 7 +#define PATH_PHB 8 // Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) -#define PATH_SYS 8 +#define PATH_SYS 9 // Connection through the network -#define PATH_NET 9 +#define PATH_NET 10 // New type of path which should precede PATH_PIX #define PATH_PORT PATH_NVL // Disconnected -#define PATH_DIS 10 +#define PATH_DIS 11 extern const char* topoPathTypeStr[]; struct ncclTopoNode; @@ -181,6 +187,13 @@ ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink); +struct ncclTopoNetState { + int nVirtualNics; + int nPhysicalNics; + const char* name; +}; +ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport); + #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, uint64_t localHostHash); @@ -200,7 +213,7 @@ static ncclResult_t ncclTopoIdToIndex(struct ncclTopoSystem* system, int type, i return ncclInternalError; } -static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index) { +static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, int* index, bool showWarn) { *index = -1; for (int i=0; inodes[GPU].count; i++) { if (system->nodes[GPU].nodes[i].gpu.rank == rank) { @@ -208,6 +221,7 @@ static ncclResult_t ncclTopoRankToIndex(struct ncclTopoSystem* system, int rank, return ncclSuccess; } } + if (showWarn) WARN("ncclTopoRankToIndex could not find rank %d", rank); return ncclInternalError; } diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 68085b893..64dc5cf22 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -16,13 +16,13 @@ static int getNthreads(const char* name, int env, int min, int max, int def) { int nt = env; if (nt > 0) { if (nt % WARP_SIZE != 0) { - WARN("Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE); + INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (must be a multiple of %d)", name, nt, WARP_SIZE); nt = max; } else if (nt > max) { - WARN("Invalid %s %d (maximum %d).", name, nt, max); + INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (maximum %d).", name, nt, max); nt = max; } else if (nt < min) { - WARN("Invalid %s %d (minimum %d).", name, nt, min); + INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (minimum %d).", name, nt, min); nt = min; } } else { @@ -51,11 +51,14 @@ static int getNthreads(const char* name, int env, int min, int max, int def) { // NCCL_PROTO="^LL128;allreduce:LL128" // Enable everything but LL128, but only LL128 for allreduce. 
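Because the topo.h hunk above renumbers every constant after PATH_PXN to make room for PATH_P2C, the resulting order is easy to lose track of. Purely as a consolidated view (the tree itself keeps plain #defines; the comments restate the definitions from this patch and from the existing header):

enum ncclPathTypeAfterPatch {
  PATH_LOC = 0,   // local (myself)
  PATH_NVL = 1,   // direct NVLink
  PATH_NVB = 2,   // NVLink through an intermediate GPU
  PATH_C2C = 3,   // C2C link to the CPU
  PATH_PIX = 4,   // at most a single PCIe bridge
  PATH_PXB = 5,   // multiple PCIe bridges, no host bridge
  PATH_PXN = 6,   // PCI + NVLink relay through an intermediate GPU
  PATH_P2C = 7,   // new: C2C to the CPU, then PCIe to the NIC
  PATH_PHB = 8,   // PCIe plus a host bridge (typically the CPU)
  PATH_SYS = 9,   // PCIe plus the SMP interconnect between NUMA nodes
  PATH_NET = 10,  // through the network
  PATH_DIS = 11   // disconnected
};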
ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes, const char* elems[], int nelems, int* list) { + ncclResult_t ret = ncclSuccess; char* fullStr = strdup(str); char* tmpFullStr; char* fullToken = strtok_r(fullStr, ";", &tmpFullStr); + char* subToken = nullptr; + char* tokStr = nullptr; while (fullToken) { - char* subToken = strdup(fullToken); + subToken = strdup(fullToken); char* tmpSubStr; char* prefix = strtok_r(subToken, ":", &tmpSubStr); char* elemList = strtok_r(NULL, ":", &tmpSubStr); @@ -65,7 +68,8 @@ ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes // because then all the prefixes before the prefix-less entry would be // overwritten. WARN("All entries except the first must have a prefix: \"%s\"", str); - return ncclInvalidUsage; + ret = ncclInvalidUsage; + goto fail; } elemList = prefix; prefix = NULL; @@ -84,7 +88,7 @@ ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes foundPrefix = true; for (int e=0; ebwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = @@ -248,7 +264,14 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom && a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue; int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; - if (a == NCCL_ALGO_NVLS) bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter); + if (a == NCCL_ALGO_NVLS) { + if (coll == ncclFuncAllReduce) { + bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter); + } else { + // allgather and reducescatter + bw = std::min(graphs[a]->bwIntra * (ppn - 1.0f) / ppn, graphs[a]->bwInter * 0.9f); + } + } if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2); float busBw = graphs[a]->nChannels * bw; @@ -264,19 +287,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0; // Not used if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) { if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) { - busBw = ppn * bw; - // AllGather/ReduceScatter requires 1:1 GPU:NIC - int nicPerNode = comm->collNetHeadsNum; - if (coll == ncclFuncAllGather && comm->nNodes > 1) { - if (!comm->ncclCollNet || !comm->ncclCollNet->iallgather || ppn > nicPerNode) busBw = 0; - } - if (coll == ncclFuncReduceScatter && comm->nNodes > 1) { - if (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter || ppn > nicPerNode) busBw = 0; - } - // Measured corrective ratio needed at 1 ppn and 8ppn. Here we hackishly - // interpolate the two. 
- float w = (ppn-1)/(8-1); - busBw *= w*0.85 + (1-w)*0.95; + busBw = ppn * std::min(graphs[a]->bwIntra, graphs[a]->bwInter * 0.9f); } else { // Collnet+Direct requires all GPUs to have a local NIC to work at full speed float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio @@ -285,6 +296,26 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (minCompCap >= 90) busBw *= .85; } } + // disable collnet for allgather/reducescatter if #localranks > #heads + // AllGather/ReduceScatter requires 1:1 GPU:NIC + if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_COLLNET_DIRECT) && p == NCCL_PROTO_SIMPLE && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) && comm->nNodes > 1) { + int nHeads = 0; + if (coll == ncclFuncAllGather && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->iallgather)) busBw = 0.0f; + if (coll == ncclFuncReduceScatter && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter)) busBw = 0.0f; + if (comm->config.collnetEnable) + nHeads = comm->collNetHeadsNum; + else + busBw = 0.0f; + if (busBw > 0.0f) { + for (int r = 0; r < comm->nRanks; r++) { + int node = comm->rankToNode[r]; + if (comm->nodeRanks[node].localRanks > nHeads) { + busBw = 0.0f; + break; + } + } + } + } // Convert bus BW to algorithm BW if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) { @@ -411,7 +442,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Disable NVLS Tree on a single node if (comm->nNodes == 1 && a == NCCL_ALGO_NVLS_TREE) disable = 1; // Disable Collnet+Direct, Collnet+Chain or Collnet+NVLS if collnet is not supported. - if (comm->collNetSupport == 0 && + if (comm->config.collnetEnable == 0 && (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN || (a == NCCL_ALGO_NVLS && comm->nNodes > 1))) disable = 1; @@ -426,17 +457,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (pEnable == 2 && p == NCCL_PROTO_LL128) { // Enable LL128 by default only on Volta/Ampere/Hopper/Blackwell+NVLink. Other cases are not tested and may cause silent data corruption. pEnable = 1; - pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= PATH_PXN)); + pEnable &= (graphs[a]->typeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= (ncclParamLl128C2c() ? 
PATH_P2C : PATH_PXN))); pEnable &= (graphs[a]->typeIntra <= PATH_NVB); pEnable &= (minCompCap == maxCompCap); - switch (minCompCap) { - case 70: pEnable &= 1; break; - case 80: pEnable &= 1; break; - case 90: pEnable &= !(CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2); break; - case 100: pEnable &= 1; break; - case 120: pEnable &= 1; break; - default: pEnable &= 0; break; - } + pEnable &= !(minCompCap < 70 || (minCompCap == 90 && CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2)); } if (pEnable == 0) comm->bandwidths[c][a][p] = 0; if (algoEnable[c*NCCL_NUM_ALGORITHMS+a] == 0) comm->bandwidths[c][a][p] = 0; @@ -483,7 +507,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } } } - + // Set per-thread amount of work before we increase nThreads and nChannels for (int a=0; athreadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD; diff --git a/src/graph/xml.cc b/src/graph/xml.cc index a41289389..96b0c9a7c 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -39,7 +39,13 @@ ncclResult_t xmlGetValue(FILE* file, char* value, char* last) { #if INT_OK int o = 0; do { - value[o++] = c; + value[o] = c; + if (o == MAX_STR_LEN-1) { + value[o] = '\0'; + WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN); + return ncclInternalError; + } + o++; NCCLCHECK(xmlGetChar(file, &c)); } while (c >= '0' && c <= '9'); value[o] = '\0'; @@ -51,10 +57,17 @@ ncclResult_t xmlGetValue(FILE* file, char* value, char* last) { #endif } int o = 0; + char quote = c; // Remember which quote type we started with do { NCCLCHECK(xmlGetChar(file, &c)); - value[o++] = c; - } while (c != '"'); + value[o] = c; + if (o == MAX_STR_LEN-1) { + value[o] = '\0'; + WARN("Error : value %s too long (max %d)", value, MAX_STR_LEN); + return ncclInternalError; + } + o++; + } while (c != quote); value[o-1] = '\0'; NCCLCHECK(xmlGetChar(file, last)); return ncclSuccess; @@ -267,7 +280,7 @@ ncclResult_t ncclTopoDumpXmlRec(int indent, FILE* file, struct ncclXmlNode* node ncclResult_t ncclTopoDumpXmlToFile(const char* xmlTopoFile, struct ncclXml* xml) { FILE* file = fopen(xmlTopoFile, "w"); if (file == NULL) { - WARN("Unable to open %s, not dumping topology.", xmlTopoFile); + INFO(NCCL_GRAPH|NCCL_ENV, "Unable to open %s, not dumping topology.", xmlTopoFile); return ncclSuccess; } NCCLCHECK(ncclTopoDumpXmlRec(0, file, xml->nodes)); @@ -375,7 +388,7 @@ ncclResult_t ncclTopoGetXmlFromFile(const char* xmlTopoFile, struct ncclXml* xml FILE* file = fopen(xmlTopoFile, "r"); if (file == NULL) { if (warn) { - WARN("Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno)); + INFO(NCCL_GRAPH|NCCL_ENV, "Could not open XML topology file %s : %s", xmlTopoFile, strerror(errno)); } return ncclSuccess; } @@ -759,7 +772,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : (sm < 90) ? 12 : 18; if (maxNvLinks > 0 && nvmlDev == NULL) { - WARN("No NVML device handle. Skipping nvlink detection."); + INFO(NCCL_GRAPH, "No NVML device handle. Skipping nvlink detection."); maxNvLinks = 0; } @@ -961,8 +974,16 @@ ncclResult_t ncclTopoTrimXmlRec(struct ncclXmlNode* node, int* keep) { NCCLCHECK(ncclTopoTrimXmlRec(subs[s], &k)); *keep += k; } - if (*keep == 0 && // Trim PCI switches or CPU with no used GPU/NIC under them. 
- (strcmp(node->name, "pci") == 0 || strcmp(node->name, "cpu") == 0)) { + // Remove node if it has no children and no keep attribute + if (*keep == 0 && // Trim PCI switches, CPUs with no used GPU/NIC under them, or pruned NICs + (strcmp(node->name, "pci") == 0 || strcmp(node->name, "cpu") == 0 || strcmp(node->name, "nic") == 0 || strcmp(node->name, "net") == 0)) { +#ifdef ENABLE_TRACE + const char* name; + const char* busid; + NCCLCHECK(xmlGetAttr(node, "name", &name)); + NCCLCHECK(xmlGetAttr(node, "busid", &busid)); + TRACE(NCCL_GRAPH, "Removing node %s %s %s\n", node->name, name, busid); +#endif NCCLCHECK(xmlRemoveNode(node)); } } diff --git a/src/graph/xml.h b/src/graph/xml.h index f06c0e68b..ad9f0faff 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -117,6 +117,13 @@ static ncclResult_t xmlGetAttrIntDefault(struct ncclXmlNode* node, const char* a return ncclSuccess; } +static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const char* attrName, uint64_t* value) { + const char* str; + NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); + *value = strtoull(str, NULL, 0); + return ncclSuccess; +} + static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); @@ -124,7 +131,6 @@ static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrNam return ncclSuccess; } - static ncclResult_t xmlGetAttrFloat(struct ncclXmlNode* node, const char* attrName, float* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); @@ -254,7 +260,6 @@ static ncclResult_t xmlSetAttrInt(struct ncclXmlNode* node, const char* attrName node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%d", value); - node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } @@ -267,7 +272,6 @@ static ncclResult_t xmlSetAttrFloat(struct ncclXmlNode* node, const char* attrNa node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%g", value); - node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } @@ -280,7 +284,6 @@ static ncclResult_t xmlSetAttrLong(struct ncclXmlNode* node, const char* attrNam node->attrs[index].key[MAX_STR_LEN] = '\0'; } snprintf(node->attrs[index].value, MAX_STR_LEN, "%#lx", value); - node->attrs[index].value[MAX_STR_LEN] = '\0'; return ncclSuccess; } diff --git a/src/group.cc b/src/group.cc index c48c0de88..08ac54e9e 100644 --- a/src/group.cc +++ b/src/group.cc @@ -12,16 +12,14 @@ #include #include "bootstrap.h" +#define GROUP_MAX_RECLAIM_STEPS 10 + __thread int ncclGroupDepth = 0; // depth of ncclGroupStart nesting __thread ncclResult_t ncclGroupError = ncclSuccess; -__thread struct ncclComm* ncclGroupCommHead = nullptr; +__thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum] = {nullptr}; __thread struct ncclComm* ncclGroupCommPreconnectHead = nullptr; __thread struct ncclIntruQueue ncclAsyncJobs; -__thread struct ncclGroupJob *ncclGroupJobMainPtr = NULL; -__thread struct ncclGroupJob ncclGroupJobMain; __thread int ncclGroupBlocking = -1; /* default mode */ -__thread bool ncclGroupJobAbortFlag = false; - void* ncclAsyncJobMain(void* arg); ncclResult_t ncclAsyncLaunch( @@ -191,6 +189,66 @@ ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { goto exit; } +struct ncclGroupSymmetricJob { + struct ncclAsyncJob base; + struct ncclComm* comm; +}; + +NCCL_PARAM(WinStride, "WIN_STRIDE", -1); + +ncclResult_t ncclCommGroupRegisterSymmetric(struct 
ncclAsyncJob* job_) { + struct ncclGroupSymmetricJob* job = (struct ncclGroupSymmetricJob*)job_; + struct ncclComm* comm = job->comm; + ncclResult_t ret = ncclSuccess; + + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + if (comm->baseStride == 0) { + cudaStream_t hostStream; + // first time to allocate symmetric VA space. + // calling into this function means symmetric is supported. + struct ncclSymDevBase* symBase = NULL; + size_t size = ncclSymDevBase::size(comm->localRanks); + if (ncclParamWinStride() != -1) { + comm->baseStride = ncclParamWinStride(); + } else { + size_t maxStride = 0; + for (int r = 0; r < comm->nRanks; ++r) + if (comm->peerInfo[r].totalGlobalMem > maxStride) maxStride = comm->peerInfo[r].totalGlobalMem; + comm->baseStride = maxStride; + } + INFO(NCCL_INIT, "rank %d base stride %zuGB total VM %zuGB", comm->rank, comm->baseStride >> 30, (comm->baseStride * comm->localRanks) >> 30); + NCCLCHECKGOTO(ncclIpcSymmetricInit(comm), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricInit(comm), ret, fail); + comm->symAllocHead = 0; + + // Allocate symmetric memory for NCCL internal usage + NCCLCHECKGOTO(ncclCommSymmetricAllocInternal(comm, size, alignof(struct ncclSymDevBase), (void**)&symBase), ret, fail); + assert((void*)symBase == (void*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride)); + NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); + CUDACHECKGOTO(cudaMemsetAsync(symBase, 0, size, hostStream), ret, fail); + CUDACHECKGOTO(cudaStreamSynchronize(hostStream), ret, fail); + NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail); + + comm->symDevComm.base = (struct ncclSymDevBase*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride); + comm->symDevComm.baseMc = (struct ncclSymDevBase*)comm->baseMCSymPtr; + comm->symDevComm.nRanks = comm->localRanks; + comm->symDevComm.nRanks_rcp32 = idivRcp32(comm->localRanks); + comm->symDevComm.rank = comm->localRank; + comm->symDevComm.stride4G = comm->baseStride >> 32; + } + + while (!ncclIntruQueueEmpty(&comm->symRegTaskQueue)) { + struct ncclSymRegTask* task = ncclIntruQueueDequeue(&comm->symRegTaskQueue); + NCCLCHECKGOTO(ncclCommSymmetricRegisterInternal(comm, task->buff, task->baseSize, task->alignment, task->memHandle, task->regHandle), ret, fail); + free(task); + } + +exit: + return ret; +fail: + goto exit; +} + static ncclResult_t doLaunches(struct ncclComm* head) { ncclResult_t result = ncclSuccess; struct ncclComm* cliqueHead = head; @@ -207,7 +265,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) { CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchPrepare(comm), result, failure); if (useBarrier) ncclCommIntraBarrierIn(comm, 1); - comm = comm->groupNext; + comm = comm->groupNext[ncclGroupTaskTypeCollective]; } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); cliqueNextHead = comm; @@ -224,7 +282,7 @@ static ncclResult_t doLaunches(struct ncclComm* head) { bool moreRounds = false; comm = cliqueHead; do { // Iterate clique members. - struct ncclComm* next = comm->groupNext; + struct ncclComm* next = comm->groupNext[ncclGroupTaskTypeCollective]; if (useBarrier) { // Barrier reduction result tells us if this was the final round. 
moreRounds = 0 != ncclCommIntraBarrierOut(comm); @@ -259,64 +317,60 @@ static ncclResult_t doLaunches(struct ncclComm* head) { return result; } -static inline void groupResetJobState(struct ncclGroupJob* job) { - if (job) { - if (job->groupBlockingPtr) *job->groupBlockingPtr = -1; - if (job->abortFlagPtr) *job->abortFlagPtr = false; - if (job->groupErrorPtr) *job->groupErrorPtr = ncclSuccess; - if (job->groupCommHeadPtr) *job->groupCommHeadPtr = NULL; - if (job->groupCommPreconnectHeadPtr) *job->groupCommPreconnectHeadPtr = NULL; - memset(job, 0, sizeof(struct ncclGroupJob)); - } +static inline void groupLocalResetJobState() { + ncclGroupError = ncclSuccess; + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) ncclGroupCommHead[type] = NULL; + ncclGroupCommPreconnectHead = NULL; + ncclGroupBlocking = -1; + ncclIntruQueueConstruct(&ncclAsyncJobs); return; } -static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclComm** groupCommPreconnectHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t* groupErrorPtr, int* groupBlockingPtr, volatile bool* groupJobAbortFlagPtr, ncclResult_t error) { - struct ncclComm* comm = *groupCommHeadPtr; - - /* reset all thread local variables */ - *groupCommHeadPtr = NULL; - *groupCommPreconnectHeadPtr = NULL; - *groupErrorPtr = ncclSuccess; - *groupBlockingPtr = -1; - *groupJobAbortFlagPtr = false; - - while (comm != nullptr) { - struct ncclComm* next = comm->groupNext; - (void) ncclGroupCommLeave(comm); // overwrites comm->groupNext - // We don't know if preconnect succeeded or happened at all, so clear - // the flags that let `taskAppend()` skip over checking if preconnect - // is needed. - comm->preconnectNext = reinterpret_cast(0x1); - for (int i = 0; i < comm->nRanks; i++) { - comm->connectSend[i] = 0UL; - comm->connectRecv[i] = 0UL; - } - // Reclaim abandoned kernel plan memory. Note ncclWork structs were already - // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. - while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) { - struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue); - // Persistent plans will be reclaimed via the callbackQueue when the - // graph drops its UserObject reference. - if (!plan->persistent) { - while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) { - struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue); - ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); +static void groupCleanup(struct ncclComm** groupCommHeadPtr, struct ncclIntruQueue* asyncJobsPtr, ncclResult_t error) { + struct ncclComm* comm; + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) { + comm = groupCommHeadPtr[type]; + // reset groupCommHeadPtr[type] + groupCommHeadPtr[type] = nullptr; + while (comm != nullptr) { + struct ncclComm* next = comm->groupNext[type]; + (void)ncclGroupCommLeave(comm, type); // overwrites comm->groupNext + // We don't know if preconnect succeeded or happened at all, so clear + // the flags that let `taskAppend()` skip over checking if preconnect + // is needed. + if (type == ncclGroupTaskTypeCollective) { + comm->preconnectNext = reinterpret_cast(0x1); + for (int i = 0; i < comm->nRanks; i++) { + comm->connectSend[i] = 0UL; + comm->connectRecv[i] = 0UL; + } + // Reclaim abandoned kernel plan memory. Note ncclWork structs were already + // reclaimed by a `ncclMemoryStackPop(&comm->memScoped)` during `ncclGroupCommLeave()`. 
+ while (!ncclIntruQueueEmpty(&comm->planner.planQueue)) { + struct ncclKernelPlan* plan = ncclIntruQueueDequeue(&comm->planner.planQueue); + // Persistent plans will be reclaimed via the callbackQueue when the + // graph drops its UserObject reference. + if (!plan->persistent) { + while (!ncclIntruQueueEmpty(&plan->proxyOpQueue)) { + struct ncclProxyOp* pxop = ncclIntruQueueDequeue(&plan->proxyOpQueue); + ncclMemoryPoolFree(&comm->memPool_ncclProxyOp, pxop); + } + ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); + } + } + + { // Reset comm->planner to empty. + ncclKernelPlanner::Peer* tmp = comm->planner.peers; + memset(&comm->planner, 0, sizeof(comm->planner)); + comm->planner.peers = tmp; + if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks * sizeof(comm->planner.peers[0])); } - ncclMemoryPoolFree(&comm->memPool_ncclKernelPlan, plan); } - } - { // Reset comm->planner to empty. - ncclKernelPlanner::Peer* tmp = comm->planner.peers; - memset(&comm->planner, 0, sizeof(comm->planner)); - comm->planner.peers = tmp; - if (comm->planner.peers != NULL) memset(comm->planner.peers, 0, comm->nRanks*sizeof(comm->planner.peers[0])); + if (!comm->config.blocking) + (void)ncclCommSetAsyncError(comm, error); + comm = next; } - - if (!comm->config.blocking) - (void) ncclCommSetAsyncError(comm, error); - comm = next; } /* reset everything */ @@ -393,11 +447,10 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueuegroupCommHeadPtr; - struct ncclComm *groupCommPreconnectHeadMain = *gjob->groupCommPreconnectHeadPtr; - struct ncclIntruQueue *asyncJobsMain = gjob->asyncJobsPtr; - - bool *groupAbortFlag = gjob->abortFlagPtr; + struct ncclComm **groupCommHeadMain = gjob->groupCommHead; + struct ncclComm *groupCommPreconnectHeadMain = gjob->groupCommPreconnectHead; + struct ncclIntruQueue *asyncJobsMain = &gjob->asyncJobs; + bool *groupAbortFlag = &gjob->abortFlag; if (!simInfo && groupCommPreconnectHeadMain != nullptr) { struct ncclComm* comm = groupCommPreconnectHeadMain; @@ -421,9 +474,41 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf NCCLCHECKGOTO(asyncJobLaunch(asyncJobsMain, groupAbortFlag), ret, fail); + // only loop through sym alloc and register tasks + for (int type = ncclGroupTaskTypeSymRegister; type <= ncclGroupTaskTypeSymRegister; ++type) { + if (groupCommHeadMain[type]) { + struct ncclComm* cliqueHead = groupCommHeadMain[type]; + struct ncclComm* comm = NULL; + struct ncclIntruQueue asyncSymJobs; + ncclIntruQueueConstruct(&asyncSymJobs); + do { + comm = cliqueHead; + do { + struct ncclGroupSymmetricJob* job; + NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); + job->base.func = ncclCommGroupRegisterSymmetric; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + ncclIntruQueueEnqueue(&asyncSymJobs, (struct ncclAsyncJob*)job); + comm = comm->groupNext[type]; + } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); + NCCLCHECKGOTO(asyncJobLaunch(&asyncSymJobs, groupAbortFlag), ret, fail); + while (!ncclIntruQueueEmpty(&asyncSymJobs)) { + struct ncclAsyncJob* job = ncclIntruQueueDequeue(&asyncSymJobs); + if (job->destructor) job->destructor((void*)job); + } + cliqueHead = comm; + } while (cliqueHead != nullptr); + } + } + /* Connect channels at runtime if cumem is supported */ - if (groupCommHeadMain != nullptr) { - struct ncclComm* cliqueHead = 
groupCommHeadMain; + if (groupCommHeadMain[ncclGroupTaskTypeCollective] != nullptr) { + struct ncclComm* cliqueHead = groupCommHeadMain[ncclGroupTaskTypeCollective]; struct ncclComm* comm = NULL; struct ncclIntruQueue asyncCollJobs; ncclIntruQueueConstruct(&asyncCollJobs); @@ -454,7 +539,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); } - comm = comm->groupNext; + comm = comm->groupNext[ncclGroupTaskTypeCollective]; } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); // connect NCCLCHECKGOTO(asyncJobLaunch(&asyncCollJobs, groupAbortFlag), ret, fail); @@ -466,42 +551,49 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf } while (cliqueHead != nullptr); // done with all buffer allocation, start registration and enqueue - comm = groupCommHeadMain; + comm = groupCommHeadMain[ncclGroupTaskTypeCollective]; do { CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); NCCLCHECKGOTO(ncclTasksRegAndEnqueue(comm), ret, fail); - comm = comm->groupNext; + comm = comm->groupNext[ncclGroupTaskTypeCollective]; } while (comm); } - if ((!simInfo) && (groupCommHeadMain != nullptr)) { - NCCLCHECKGOTO(doLaunches(groupCommHeadMain), ret, fail); + if ((!simInfo) && (groupCommHeadMain[ncclGroupTaskTypeCollective] != nullptr)) { + NCCLCHECKGOTO(doLaunches(groupCommHeadMain[ncclGroupTaskTypeCollective]), ret, fail); } while (!ncclIntruQueueEmpty(asyncJobsMain)) { struct ncclAsyncJob* job = ncclIntruQueueDequeue(asyncJobsMain); - if (!job->destroyFlag && job->comm && !job->comm->config.blocking) + if (!job->destroyFlag && job->comm && !job->comm->config.blocking && groupCommHeadMain[ncclGroupTaskTypeCollective] == nullptr) (void) ncclCommSetAsyncError(job->comm, ret); if (job->destructor) job->destructor((void*)job); } - while (groupCommHeadMain != nullptr) { - struct ncclComm* comm = groupCommHeadMain; - struct ncclComm* next = comm->groupNext; - // Poll for callbacks sent to us from other threads. Typically these free - // resources from to our memory pools and UB - NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail); - (void) ncclGroupCommLeave(comm); - if (!comm->config.blocking) { - (void) ncclCommSetAsyncError(comm, ret); + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) { + while (groupCommHeadMain[type] != nullptr) { + struct ncclComm* comm = groupCommHeadMain[type]; + struct ncclComm* next = comm->groupNext[type]; + // Poll for callbacks sent to us from other threads. 
Typically these free + // resources from to our memory pools and UB + if (comm->reclaimSteps == GROUP_MAX_RECLAIM_STEPS) { + NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), ret, fail); + comm->reclaimSteps = 0; + } else { + comm->reclaimSteps++; + } + (void)ncclGroupCommLeave(comm, type); + if (!comm->config.blocking) { + (void)ncclCommSetAsyncError(comm, ret); + } + groupCommHeadMain[type] = next; } - groupCommHeadMain = next; } exit: return ret; fail: - groupCleanup(gjob->groupCommHeadPtr, gjob->groupCommPreconnectHeadPtr, gjob->asyncJobsPtr, gjob->groupErrorPtr, gjob->groupBlockingPtr, gjob->abortFlagPtr, ret); + groupCleanup(gjob->groupCommHead, &gjob->asyncJobs, ret); goto exit; } @@ -514,6 +606,8 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { ncclSimInfo_t internalSimInfo = NCCL_SIM_INFO_INITIALIZER; ncclSimInfo_t* internalSimInfoPtr = NULL; size_t realSize = 0; + bool hasCommHead = false; + ncclGroupJob* groupJob = NULL; internalSimInfo.magic = 0; @@ -539,72 +633,108 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { internalSimInfoPtr = &internalSimInfo; } - if (ncclGroupCommHead != nullptr || !ncclIntruQueueEmpty(&ncclAsyncJobs) || ncclGroupCommPreconnectHead != nullptr) { - ncclGroupJobMain.groupCommHeadPtr = &ncclGroupCommHead; - ncclGroupJobMain.groupCommPreconnectHeadPtr = &ncclGroupCommPreconnectHead; - ncclGroupJobMain.groupErrorPtr = &ncclGroupError; - ncclGroupJobMain.asyncJobsPtr = &ncclAsyncJobs; - ncclGroupJobMain.abortFlagPtr = &ncclGroupJobAbortFlag; - ncclGroupJobMain.groupBlockingPtr = &ncclGroupBlocking; - ncclGroupJobMain.initialized = true; - ncclGroupJobMainPtr = &ncclGroupJobMain; + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) { + if (ncclGroupCommHead[type]) { + hasCommHead = true; + break; + } + } + + NCCLCHECKGOTO(ncclCalloc(&groupJob, 1), ret, fail); + ncclIntruQueueConstruct(&groupJob->asyncJobs); + groupJob->groupRefCount = 0; + groupJob->nonBlockingInit = false; + memcpy(groupJob->groupCommHead, ncclGroupCommHead, sizeof(ncclGroupCommHead)); + groupJob->groupCommPreconnectHead = ncclGroupCommPreconnectHead; + groupJob->groupError = ncclSuccess; + groupJob->abortFlag = false; + groupJob->joined = false; + ncclIntruQueueTransfer(&groupJob->asyncJobs, &ncclAsyncJobs); + + if (hasCommHead || !ncclIntruQueueEmpty(&groupJob->asyncJobs) || ncclGroupCommPreconnectHead != nullptr) { /* make sure ncclGroupBlocking has been set. */ assert(ncclGroupBlocking == 0 || ncclGroupBlocking == 1); if (ncclGroupBlocking == 0) { /* nonblocking group */ - if (!ncclIntruQueueEmpty(&ncclAsyncJobs)) { - ncclAsyncJob* job = ncclIntruQueueHead(&ncclAsyncJobs); + if (!ncclIntruQueueEmpty(&groupJob->asyncJobs)) { + ncclAsyncJob* job = ncclIntruQueueHead(&groupJob->asyncJobs); do { NCCLCHECKGOTO(ncclCommSetAsyncError(job->comm, ncclInProgress), ret, fail); - job->comm->groupJob = ncclGroupJobMainPtr; + if (job->comm->groupJob == NULL) { + job->comm->groupJob = groupJob; + groupJob->groupRefCount++; + } job = job->next; } while (job); } - if (ncclGroupCommHead) { - ncclComm_t comm = ncclGroupCommHead; - do { - NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail); - /* link group job to communicators. 
*/ - comm->groupJob = ncclGroupJobMainPtr; - comm = comm->groupNext; - } while (comm); + for (int type = 0; type < ncclGroupTaskTypeNum; ++type) { + if (ncclGroupCommHead[type]) { + ncclComm_t comm = ncclGroupCommHead[type]; + do { + NCCLCHECKGOTO(ncclCommSetAsyncError(comm, ncclInProgress), ret, fail); + /* link group job to communicators. */ + if (comm->groupJob == NULL) { + comm->groupJob = groupJob; + groupJob->groupRefCount++; + } + comm = comm->groupNext[type]; + } while (comm); + } } - ncclGroupJobMainPtr->base.func = groupLaunchNonBlocking; - PTHREADCHECKGOTO(pthread_create(&ncclGroupJobMainPtr->base.thread, NULL, ncclAsyncJobMain, (void*)&ncclGroupJobMainPtr->base), "pthread_create", ret, fail); + groupJob->base.func = groupLaunchNonBlocking; + PTHREADCHECKGOTO(pthread_create(&groupJob->base.thread, NULL, ncclAsyncJobMain, (void*)&groupJob->base), "pthread_create", ret, fail); + groupJob->nonBlockingInit = true; ret = ncclInProgress; } else { /* blocking group */ int savedDev; CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, fail); - NCCLCHECKGOTO(groupLaunch(&ncclGroupJobMainPtr->base, internalSimInfoPtr), ret, fail); + NCCLCHECKGOTO(groupLaunch(&groupJob->base, internalSimInfoPtr), ret, fail); CUDACHECKGOTO(cudaSetDevice(savedDev), ret, fail); if (simInfo) memcpy((void*)simInfo, (void*)internalSimInfoPtr, realSize); - groupResetJobState(ncclGroupJobMainPtr); + free(groupJob); } } + /* Reset the job state for the next group call. */ + groupLocalResetJobState(); exit: return ret; fail: - groupCleanup(&ncclGroupCommHead, &ncclGroupCommPreconnectHead, &ncclAsyncJobs, &ncclGroupError, &ncclGroupBlocking, &ncclGroupJobAbortFlag, ret); + if (groupJob) { + groupCleanup(groupJob->groupCommHead, &groupJob->asyncJobs, ret); + free(groupJob); + } else { + groupCleanup(ncclGroupCommHead, &ncclAsyncJobs, ret); + } + groupLocalResetJobState(); goto exit; } ncclResult_t ncclGroupJobComplete(struct ncclGroupJob* groupJob) { ncclResult_t ret = ncclSuccess; - if (groupJob && groupJob->initialized) { - ret = ncclAsyncJobComplete(&groupJob->base); - groupResetJobState(groupJob); + if (groupJob && groupJob->nonBlockingInit) { + if (!__atomic_exchange_n(&groupJob->joined, true, __ATOMIC_ACQ_REL)) { + ret = ncclAsyncJobComplete(&groupJob->base); + } + if (ncclAtomicRefCountDecrement(&groupJob->groupRefCount) == 0) { + free(groupJob); + } } return ret; } ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob) { - if (groupJob && groupJob->initialized) { - __atomic_store_n(groupJob->abortFlagPtr, true, __ATOMIC_RELEASE); - NCCLCHECK(ncclGroupJobComplete(groupJob)); + if (groupJob && groupJob->nonBlockingInit) { + if (!__atomic_exchange_n(&groupJob->joined, true, __ATOMIC_ACQ_REL)) { + __atomic_store_n(&groupJob->abortFlag, true, __ATOMIC_RELAXED); + ncclAsyncJobComplete(&groupJob->base); + } + if (ncclAtomicRefCountDecrement(&groupJob->groupRefCount) == 0) { + free(groupJob); + } } return ncclSuccess; } diff --git a/src/include/allocator.h b/src/include/allocator.h new file mode 100644 index 000000000..189c3d4e2 --- /dev/null +++ b/src/include/allocator.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALLOCATOR_H_ +#define NCCL_ALLOCATOR_H_ + +ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr); +ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr); + +#endif diff --git a/src/include/bitops.h b/src/include/bitops.h index dcf0e2e09..71053ed49 100644 --- a/src/include/bitops.h +++ b/src/include/bitops.h @@ -19,6 +19,28 @@ #endif #endif +template +constexpr static __host__ __device__ Int minval(Int a) { return a; } +template +constexpr static __host__ __device__ Int minval(Int a, Int b, More ...more) { + #if __CUDA_ARCH__ + return minval(min(a, b), more...); + #else + return minval(a < b ? a : b, more...); + #endif +} + +template +constexpr static __host__ __device__ Int maxval(Int a) { return a; } +template +constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) { + #if __CUDA_ARCH__ + return maxval(max(a, b), more...); + #else + return maxval(a > b ? a : b, more...); + #endif +} + #define DIVUP(x, y) \ (((x)+(y)-1)/(y)) @@ -32,32 +54,150 @@ size = ((size + (align) - 1) / (align)) * (align); template -__host__ __device__ constexpr Z divUp(X x, Y y) { +static __host__ __device__ constexpr Z divUp(X x, Y y) { return (x+y-1)/y; } template -__host__ __device__ constexpr Z roundUp(X x, Y y) { +static __host__ __device__ constexpr Z roundUp(X x, Y y) { return (x+y-1) - (x+y-1)%y; } template -__host__ __device__ constexpr Z roundDown(X x, Y y) { +static __host__ __device__ constexpr Z roundDown(X x, Y y) { return x - x%y; } // assumes second argument is a power of 2 template -__host__ __device__ constexpr Z alignUp(X x, int a) { +static __host__ __device__ constexpr Z alignUp(X x, int a) { return (x + a-1) & Z(-a); } // assumes second argument is a power of 2 template -__host__ __device__ constexpr Z alignDown(X x, int a) { +static __host__ __device__ constexpr Z alignDown(X x, int a) { return x & Z(-a); } template -inline __host__ __device__ int countOneBits(Int x) { +constexpr __host__ __device__ bool isPow2(Int x) { + return (x & (x-1)) == 0; +} + +template +static __host__ __device__ T add4G(T base, int delta4G) { + union { T tmp; uint32_t u32[2]; }; + tmp = base; + u32[1] += delta4G; + return tmp; +} + +template +static __host__ __device__ T incWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) { + union { T tmp; uint32_t u32[2]; }; + tmp = ptr; + u32[1] += delta4G; + if (u32[1] >= hi4G) u32[1] -= hi4G-lo4G; + return tmp; +} + +template +static __host__ __device__ T decWrap4G(T ptr, uint32_t delta4G, uint32_t lo4G, uint32_t hi4G) { + union { T tmp; uint32_t u32[2]; }; + tmp = ptr; + u32[1] -= delta4G; + if (u32[1] < lo4G) u32[1] += hi4G-lo4G; + return tmp; +} + +// Produce the reciprocal of x for use in idivByRcp +constexpr __host__ __device__ uint32_t idivRcp32(uint32_t x) { + return uint32_t(uint64_t(0x100000000)/x); +} +constexpr __host__ __device__ uint64_t idivRcp64(uint64_t x) { + return uint64_t(-1)/x + isPow2(x); +} + +static __host__ __device__ uint32_t mul32hi(uint32_t a, uint32_t b) { +#if __CUDA_ARCH__ + return __umulhi(a, b); +#else + return uint64_t(a)*b >> 32; +#endif +} +static __host__ __device__ uint64_t mul64hi(uint64_t a, uint64_t b) { +#if __CUDA_ARCH__ + return __umul64hi(a, b); +#else + return (uint64_t)(((unsigned __int128)a)*b >> 64); +#endif +} + +// Produce the reciprocal of x*y given their respective 
reciprocals. This incurs +// no integer division on device. +static __host__ __device__ uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) { + if (xrcp == 0) return yrcp; + if (yrcp == 0) return xrcp; + uint32_t rcp = mul32hi(xrcp, yrcp); + uint32_t rem = -x*y*rcp; + if (x*y <= rem) rcp += 1; + return rcp; +} +static __host__ __device__ uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) { + if (xrcp == 0) return yrcp; + if (yrcp == 0) return xrcp; + uint64_t rcp = mul64hi(xrcp, yrcp); + uint64_t rem = -x*y*rcp; + if (x*y <= rem) rcp += 1; + return rcp; +} + +// Fast integer division where divisor has precomputed reciprocal. +// idivFast(x, y, idivRcp(y)) == x/y +static __host__ __device__ void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) { + uint32_t q = x, r = 0; + if (yrcp != 0) { + q = mul32hi(x, yrcp); + r = x - y*q; + if (r >= y) { q += 1; r -= y; } + } + *quo = q; + *rem = r; +} +static __host__ __device__ void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) { + uint64_t q = x, r = 0; + if (yrcp != 0) { + q = mul64hi(x, yrcp); + r = x - y*q; + if (r >= y) { q += 1; r -= y; } + } + *quo = q; + *rem = r; +} + +static __host__ __device__ uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) { + uint32_t q, r; + idivmodFast32(&q, &r, x, y, yrcp); + return q; +} +static __host__ __device__ uint32_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) { + uint64_t q, r; + idivmodFast64(&q, &r, x, y, yrcp); + return q; +} + +static __host__ __device__ uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) { + uint32_t q, r; + idivmodFast32(&q, &r, x, y, yrcp); + return r; +} +static __host__ __device__ uint32_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) { + uint64_t q, r; + idivmodFast64(&q, &r, x, y, yrcp); + return r; +} + +template +static __host__ __device__ int countOneBits(Int x) { #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(unsigned int)) { return __popc((unsigned int)x); @@ -83,7 +223,7 @@ inline __host__ __device__ int countOneBits(Int x) { // Returns index of first one bit or returns -1 if mask is zero. template -inline __host__ __device__ int firstOneBit(Int mask) { +static __host__ __device__ int firstOneBit(Int mask) { int i; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { @@ -108,14 +248,14 @@ inline __host__ __device__ int firstOneBit(Int mask) { } template -inline __host__ __device__ int popFirstOneBit(Int* mask) { +static __host__ __device__ int popFirstOneBit(Int* mask) { Int tmp = *mask; *mask &= *mask-1; return firstOneBit(tmp); } template -inline __host__ __device__ int log2Down(Int x) { +static __host__ __device__ int log2Down(Int x) { int w, n; #if __CUDA_ARCH__ if (sizeof(Int) <= sizeof(int)) { @@ -147,7 +287,7 @@ inline __host__ __device__ int log2Down(Int x) { } template -inline __host__ __device__ int log2Up(Int x) { +static __host__ __device__ int log2Up(Int x) { int w, n; if (x != 0) x -= 1; #if __CUDA_ARCH__ @@ -180,19 +320,19 @@ inline __host__ __device__ int log2Up(Int x) { } template -inline __host__ __device__ Int pow2Up(Int x) { +static __host__ __device__ Int pow2Up(Int x) { return Int(1)< -inline __host__ __device__ Int pow2Down(Int x) { +static __host__ __device__ Int pow2Down(Int x) { // True, log2Down can return -1, but we don't normally pass 0 as an argument... 
// coverity[negative_shift] return Int(1)< -inline __host__ UInt reverseSubBits(UInt x) { +static __host__ UInt reverseSubBits(UInt x) { if (nSubBits >= 16 && 8*sizeof(UInt) == nSubBits) { switch (8*sizeof(UInt)) { case 16: x = __builtin_bswap16(x); break; @@ -225,7 +365,7 @@ template<> struct ncclToUnsigned { using type = unsigned lon // Reverse the bottom nBits bits of x. The top bits will be overwritten with 0's. template -inline __host__ __device__ Int reverseBits(Int x, int nBits) { +static __host__ __device__ Int reverseBits(Int x, int nBits) { using UInt = typename ncclToUnsigned::type; union { UInt ux; Int sx; }; sx = x; @@ -249,7 +389,7 @@ inline __host__ __device__ Int reverseBits(Int x, int nBits) { // has nearly the full range of uint32_t except it only keeps the top 3 bits // beneath the leading 1 bit and thus has a max value of 0xf0000000. -inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { +static __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { int log2x; #if __CUDA_ARCH__ log2x = 31-__clz(x|1); @@ -261,7 +401,7 @@ inline __host__ __device__ uint32_t u32fpEncode(uint32_t x, int bitsPerPow2) { return exponent<>bitsPerPow2; uint32_t mantissa = (x & ((1u< -inline __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) { +static __host__ __device__ void eatHash(uint64_t acc[2], const T* bytes) { eatHash(acc, (const void*)bytes, sizeof(T)); } -inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { +static __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { uint64_t h = acc[0]; h ^= h >> 31; h *= 0xbac3bd562846de6b; @@ -316,13 +456,13 @@ inline __host__ __device__ uint64_t digestHash(uint64_t const acc[2]) { return h; } -inline __host__ __device__ uint64_t getHash(const void* bytes, size_t size) { +static __host__ __device__ uint64_t getHash(const void* bytes, size_t size) { uint64_t acc[2] = {1, 1}; eatHash(acc, bytes, size); return digestHash(acc); } template -inline __host__ __device__ uint64_t getHash(const T* bytes) { +static __host__ __device__ uint64_t getHash(const T* bytes) { return getHash((const void*)bytes, sizeof(T)); } diff --git a/src/include/comm.h b/src/include/comm.h index 409518713..1378e0765 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -17,6 +17,7 @@ #include "register.h" #include "graph.h" #include "profiler.h" +#include "allocator.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -131,7 +132,6 @@ struct ncclSharedResources { int* tpRankToLocalRank; // Internal streams struct ncclStrongStream deviceStream, hostStream; - int noncapturedRefs; // number of non-captured hostStreamPlanCallback on the stream int persistentRefs; cudaEvent_t launchEvent, scratchEvent; @@ -218,6 +218,7 @@ struct ncclTaskColl { // Profiler plugin int eActivationMask; void* eventHandle; + uint8_t nChannels; }; struct ncclTaskP2p { struct ncclTaskP2p* next; @@ -231,6 +232,7 @@ struct ncclTaskP2p { // Profiler plugin int eActivationMask; void* eventHandle; + uint8_t nChannels; }; struct ncclKernelPlan { @@ -243,10 +245,14 @@ struct ncclKernelPlan { bool persistent; // aka captured in a graph bool isHostCbEnq; + bool isSymColl; enum ncclDevWorkStorageType workStorageType; bool kernelSpecialized; - void *kernelFn; - struct ncclDevKernelArgs* kernelArgs; + void* kernelFn; + union { + struct ncclDevKernelArgs* kernelArgs; + struct ncclSymDevArgs* kernelSymArgs; + }; size_t kernelArgsSize; uint64_t channelMask; // bitset of which channels are present bool hasProxyOps; // does any 
channel have a non-empty proxyOpQueue @@ -355,6 +361,7 @@ struct ncclKernelPlanner { struct Peer* peers/*[nRanks]*/; int nTasksColl, nTasksP2p; bool persistent; + bool isSymColl; // The list of user streams aggregated over all tasks present. struct ncclCudaStreamList* streams; @@ -404,6 +411,12 @@ struct ncclKernelPlanner { #define NCCL_MAGIC 0x0280028002800280 // Nickel atomic number is 28. +typedef enum ncclGroupTaskType { + ncclGroupTaskTypeCollective = 0, + ncclGroupTaskTypeSymRegister = 1, + ncclGroupTaskTypeNum = 2, +} ncclGroupTaskType_t; + struct ncclComm { uint64_t startMagic; struct ncclMemoryStack memPermanent, memScoped; @@ -420,9 +433,10 @@ struct ncclComm { struct ncclTopoSystem* topo; struct ncclProxyConnector* gproxyConn; struct ncclIntruQueue legacyRegCleanupQueue; + bool peerInfoValid; - int netPluginLoaded; ncclNet_t* ncclNet; + int netPluginIndex; int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; @@ -439,7 +453,6 @@ struct ncclComm { uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. - const char* commName; uint64_t commHash; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator @@ -515,6 +528,7 @@ struct ncclComm { // Device side of the communicator (for cudaFree's) struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm + struct ncclSymDevComm symDevComm; uint32_t workArgsBytes; // max size of kernel args uint32_t workFifoBytes; // size of workFifoBuf, power of 2 @@ -522,12 +536,10 @@ struct ncclComm { void* workFifoBufDev; void* workFifoBufGdrHandle; - // Monotonic number of bytes (mod 1<<32) consumed per channel. In cudaHost memory. - uint32_t* workFifoConsumed/*[MAXCHANNELS]*/; - // Last observed value of: min(workFifoConsumed[c] for c < MAXCHANNELS) - uint32_t workFifoConsumedLeast; // Monotonic number of bytes (mod 1<<32) sent to fifo. uint32_t workFifoProduced; + uint32_t workFifoProducedLastRecorded; + uint32_t workFifoConsumed; // Intra-process sync struct ncclComm* intraComm0; // leader of intra-process comms (self possible) @@ -543,10 +555,8 @@ struct ncclComm { struct ncclProxyState* proxyState; int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ // Whether this communicator uses collNet - int collNetSupport; bool isOneRPN; uint8_t collNetSupportMatrix[4/*sum,prod,max,min*/][ncclNumTypes]; - bool intraNodeP2pSupport; int* collNetHeads; int collNetHeadsNum; int* collNetDenseToUserRank; @@ -568,7 +578,7 @@ struct ncclComm { // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when // this comm is not yet in a group. - struct ncclComm* groupNext; + struct ncclComm* groupNext[ncclGroupTaskTypeNum]; // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. 
struct ncclComm* preconnectNext; int localPersistentRefs; // number of persistent plan-lists capturing this comm @@ -588,6 +598,7 @@ struct ncclComm { ncclUserRedOp *userRedOps; // Queue of things for the main thread to do + int reclaimSteps; struct ncclIntruQueueMpsc callbackQueue; ncclConfig_t config; @@ -600,6 +611,9 @@ struct ncclComm { // group job to support multi-thread FT struct ncclGroupJob *groupJob; + // Flag indicating if this communicator shares resources with parent or children + bool shareResources; + // Tuning plugin int tunerPluginLoaded; ncclTuner_t* tuner; @@ -613,9 +627,18 @@ struct ncclComm { // buffer registration cache struct ncclRegCache regCache; int isAllNvlink; + bool isAllDirectP2p; + int symmetricSupport; bool useNetPXN; bool useGdr; int splitCount; + // symmetric buffer + uint8_t* baseUCSymPtr; + uint8_t* baseMCSymPtr; + size_t baseStride; + size_t symAllocHead; + CUmemGenericAllocationHandle symMCHandle; + struct ncclIntruQueue symRegTaskQueue; uint64_t endMagic; }; @@ -647,15 +670,21 @@ inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) return ncclSuccess; } -inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm) { +inline ncclResult_t ncclCommPollEventCallbacks(struct ncclComm *comm, bool waitSome) { ncclResult_t result = ncclSuccess; cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); while (true) { struct ncclCommEventCallback* cb = ncclIntruQueueHead(&comm->eventCallbackQueue); if (cb == nullptr) break; - cudaError_t ok = cudaEventSynchronize(cb->event); - if (ok == cudaErrorNotReady) break; + cudaError_t ok; + if (waitSome) { + ok = cudaEventSynchronize(cb->event); + waitSome = false; + } else { + ok = cudaEventQuery(cb->event); + if (ok == cudaErrorNotReady) break; + } ncclIntruQueueDequeue(&comm->eventCallbackQueue); if (ok == cudaSuccess) { NCCLCHECKGOTO(cb->fn(comm, cb), result, finish); diff --git a/src/include/cpuset.h b/src/include/cpuset.h index ec55cbc54..99e3edf4d 100644 --- a/src/include/cpuset.h +++ b/src/include/cpuset.h @@ -58,4 +58,29 @@ static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) { return ncclSuccess; } +static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) { + int c = 0; + int start = -1; + // Iterate through all possible CPU bits plus one extra position + for (int cpu = 0; cpu <= CPU_SETSIZE; cpu++) { + int isSet = (cpu == CPU_SETSIZE) ? 0 : CPU_ISSET(cpu, mask); + // Start of a new range + if (isSet && start == -1) { + start = cpu; + } + // End of a range, add comma between ranges + if (!isSet && start != -1) { + if (cpu-1 == start) { + c += snprintf(str+c, len-c, "%s%d", c ? "," : "", start); + } else { + c += snprintf(str+c, len-c, "%s%d-%d", c ? 
"," : "", start, cpu-1); + } + if (c >= len-1) break; + start = -1; + } + } + if (c == 0) str[0] = '\0'; + return str; +} + #endif diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index bf6132657..2edc60f21 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -36,6 +36,10 @@ extern CUmemAllocationHandleType ncclCuMemHandleType; } \ } while(false) +#define CUCALL(cmd) do { \ + pfn_##cmd; \ +} while(false) + #define CUCHECKGOTO(cmd, res, label) do { \ CUresult err = pfn_##cmd; \ if( err != CUDA_SUCCESS ) { \ @@ -66,49 +70,49 @@ extern CUmemAllocationHandleType ncclCuMemHandleType; } \ } while(0) -#define DECLARE_CUDA_PFN_EXTERN(symbol) extern PFN_##symbol pfn_##symbol +#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN_EXTERN(cuDeviceGet); -DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorString); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorName); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange); -DECLARE_CUDA_PFN_EXTERN(cuCtxCreate); -DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy); -DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent); -DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent); -DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice); -DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute); -DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 11040); +DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000); +DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000); +DECLARE_CUDA_PFN_EXTERN(cuLaunchKernel, 4000); #if CUDART_VERSION >= 11080 -DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx); +DECLARE_CUDA_PFN_EXTERN(cuLaunchKernelEx, 11060); #endif // cuMem API support -DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve); -DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree); -DECLARE_CUDA_PFN_EXTERN(cuMemCreate); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity); -DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle); -DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle); -DECLARE_CUDA_PFN_EXTERN(cuMemMap); -DECLARE_CUDA_PFN_EXTERN(cuMemRelease); -DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle); -DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess); -DECLARE_CUDA_PFN_EXTERN(cuMemUnmap); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle); +DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000); +DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationPropertiesFromHandle, 10020); #if CUDA_VERSION >= 11070 -DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange); // DMA-BUF support 
+DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ -DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice); -DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem); -DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr); -DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate); -DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity); -DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind); +DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010); #endif #endif diff --git a/src/include/device.h b/src/include/device.h index f6ca51b75..2c5ce1029 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "nccl_common.h" #include "bitops.h" +#include "symmetric.h" #include #include #include @@ -29,6 +30,30 @@ extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; #define NCCL_CUDA_ARCH 0 #endif +#ifdef __CUDA_ARCH_SPECIFIC__ + #define NCCL_CUDA_ARCH_SPECIFIC __CUDA_ARCH_SPECIFIC__ +#elif defined(__CUDA_ARCH_HAS_FEATURE__) + #if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL) + #define NCCL_CUDA_ARCH_SPECIFIC 900 + #elif __CUDA_ARCH_HAS_FEATURE__(SM100_ALL) + #define NCCL_CUDA_ARCH_SPECIFIC 1000 + #elif __CUDA_ARCH_HAS_FEATURE__(SM101_ALL) + #define NCCL_CUDA_ARCH_SPECIFIC 1010 + #elif __CUDA_ARCH_HAS_FEATURE__(SM120_ALL) + #define NCCL_CUDA_ARCH_SPECIFIC 1200 + #else + #define NCCL_CUDA_ARCH_SPECIFIC 0 + #endif +#else + #define NCCL_CUDA_ARCH_SPECIFIC 0 +#endif + +#ifdef __CUDA_ARCH_FAMILY_SPECIFIC__ + #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC __CUDA_ARCH_FAMILY_SPECIFIC__ +#else + #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0 +#endif + #include "net_device.h" enum ncclDevRedOp_t { @@ -380,6 +405,14 @@ struct alignas(16) ncclDevChannel { uint64_t workCounter; }; +#define MAX_PROFILER_EVENTS_PER_CHANNEL 64 +struct ncclDevProfiler { + struct { + uint64_t counter; + uint64_t timestamp; + } data[MAX_PROFILER_EVENTS_PER_CHANNEL]; +}; + struct ncclDevComm { int rank; int nRanks; @@ -389,9 +422,6 @@ struct ncclDevComm { int p2pChunkSize; int isAllNvlink; - // Work fifo return credits - uint32_t* workConsumed/*[MAXCHANNELS]*/; - int* collNetDenseToUserRank; // Flag to ask NCCL kernels to abort @@ -402,8 +432,8 @@ struct ncclDevComm { int* rankToLocalRank; // Profiler counters - uint64_t* workStarted/*[MAXCHANNELS]*/; - uint64_t* workCompleted/*[MAXCHANNELS]*/; + struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/; + struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/; }; struct alignas(16) ncclDevCommAndChannels { @@ -476,7 +506,7 @@ __host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int __host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { // Our collective unroll should move to the same bytes&insns model as NVLS. - return cudaArch >= 800 ? (cudaArch == 1200 ? 6 : 8) : 4; + return cudaArch >= 800 ? (cudaArch / 100 == 12 ? 6 : 8) : 4; } __host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } @@ -507,7 +537,6 @@ extern int const ncclDevKernelCount; extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; // Table of most specialized kernel function to run given func index. 
-extern int const ncclDevFuncIdCount; extern int const ncclDevFuncRowToId[]; extern void* const ncclDevKernelForFunc[/*funcIndex*/]; extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; @@ -535,11 +564,7 @@ inline bool ncclNvlsSupported(int devRedOp, int type) { // `ncclDevFuncIndex()` needs to be in sync with "all_functions()" in "src/device/generate.py" inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) { - #if defined(__CUDA_BF16_TYPES_EXIST__) constexpr int NumTypes = ncclNumTypes; - #else - constexpr int NumTypes = ncclNumTypes + 1; - #endif int row; do { row = 0; // ncclDevFuncIndex_P2p @@ -564,7 +589,7 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) } row += nAlgos*NCCL_NUM_PROTOCOLS; - nAlgos = 6; + nAlgos = 6; // TREE RING COLLNET_DIRECT COLLNET_CHAIN NVLS NVLS_TREE if (coll == ncclFuncAllReduce) { row += ((devRedOp*NumTypes + type)*nAlgos + algo)*NCCL_NUM_PROTOCOLS + proto; break; diff --git a/src/include/graph.h b/src/include/graph.h index a06556e37..7475e5a7b 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -50,6 +50,8 @@ int ncclPxnDisable(struct ncclComm* comm); ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu); +ncclResult_t ncclGetUserP2pLevel(int* level); + // Find CPU affinity ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); @@ -74,7 +76,9 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex); ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count); -#define NCCL_TOPO_MAX_NODES 256 +// Allows for up to 32 NICs per node on GB200-NVL72 +#define NCCL_TOPO_MAX_NODES 576 +ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType); // Init search. 
Needs to be done before calling ncclTopoCompute ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); diff --git a/src/include/group.h b/src/include/group.h index c06d1ef1b..033a187da 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -9,9 +9,11 @@ #include "nccl.h" #include "comm.h" +#include "allocator.h" +#include "register.h" ncclResult_t ncclGroupErrCheck(ncclResult_t ret); -void ncclGroupCommJoin(struct ncclComm* comm); +void ncclGroupCommJoin(struct ncclComm* comm, int type); void ncclGroupCommPreconnect(struct ncclComm* comm); ncclResult_t ncclGroupCommLeave(struct ncclComm* comm); ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob); @@ -52,13 +54,14 @@ ncclResult_t ncclAsyncLaunch( struct ncclGroupJob { struct ncclAsyncJob base; - struct ncclComm **groupCommHeadPtr; - struct ncclComm **groupCommPreconnectHeadPtr; - ncclResult_t *groupErrorPtr; - bool *abortFlagPtr; - int *groupBlockingPtr; - struct ncclIntruQueue *asyncJobsPtr; - bool initialized; + int groupRefCount; + bool nonBlockingInit; + bool joined; + struct ncclComm *groupCommHead[ncclGroupTaskTypeNum]; + struct ncclComm *groupCommPreconnectHead; + ncclResult_t groupError; + bool abortFlag; + struct ncclIntruQueue asyncJobs; }; ncclResult_t ncclGroupStartInternal(); @@ -69,27 +72,9 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting extern __thread ncclResult_t ncclGroupError; -extern __thread struct ncclComm* ncclGroupCommHead; +extern __thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum]; extern __thread struct ncclComm* ncclGroupCommPreconnectHead; extern __thread int ncclGroupBlocking; -extern __thread struct ncclGroupJob *ncclGroupJobMainPtr; -extern __thread struct ncclGroupJob ncclGroupJobMain; - -static inline void groupResetJobState() { - ncclGroupBlocking = -1; - ncclGroupJobMainPtr = NULL; - memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); - return; -} - -static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { - ncclResult_t ret = ncclSuccess; - if (job) { - ret = ncclAsyncJobComplete(&job->base); - groupResetJobState(); - } - return ret; -} inline ncclResult_t ncclGroupStartInternal() { ncclGroupDepth++; @@ -104,31 +89,32 @@ inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { } // Add comm to this thread's group -inline void ncclGroupCommJoin(struct ncclComm* comm) { - if (comm->groupNext == reinterpret_cast(0x1)) { +inline void ncclGroupCommJoin(struct ncclComm* comm, int type) { + if (comm->groupNext[type] == reinterpret_cast(0x1)) { // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves // the users program order yet insures siblings occur consecutively. This // is required by doLaunches() in "group.cc". - struct ncclComm** pp = &ncclGroupCommHead; + struct ncclComm** pp = &ncclGroupCommHead[type]; while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) - pp = &(*pp)->groupNext; + pp = &(*pp)->groupNext[type]; // didn't find its clique, we need to insert it with ascending order based on commHash if (*pp == nullptr) { - pp = &ncclGroupCommHead; - while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext; + pp = &ncclGroupCommHead[type]; + while (*pp != nullptr && (*pp)->commHash < comm->commHash) pp = &(*pp)->groupNext[type]; } - comm->groupNext = *pp; + comm->groupNext[type] = *pp; *pp = comm; // Comms gets a new memory stack scope upon joining. 
Each task batched for // this comm is allocated there. ncclMemoryStackPush(&comm->memScoped); - // Initialize planner - ncclKernelPlanner::Peer* tmp = comm->planner.peers; - memset(&comm->planner, 0, sizeof(comm->planner)); - comm->planner.peers = tmp; + if (type == ncclGroupTaskTypeCollective) { + // Initialize planner + ncclKernelPlanner::Peer* tmp = comm->planner.peers; + memset(&comm->planner, 0, sizeof(comm->planner)); + comm->planner.peers = tmp; + } } - ncclGroupBlocking = comm->config.blocking; } @@ -141,8 +127,8 @@ inline void ncclGroupCommPreconnect(struct ncclComm* comm) { } // Comm has left group -inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { - comm->groupNext = reinterpret_cast(0x1); +inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm, int type) { + comm->groupNext[type] = reinterpret_cast(0x1); ncclMemoryStackPop(&comm->memScoped); return ncclSuccess; } diff --git a/src/include/mlx5/mlx5dvcore.h b/src/include/mlx5/mlx5dvcore.h new file mode 100644 index 000000000..9ec40c039 --- /dev/null +++ b/src/include/mlx5/mlx5dvcore.h @@ -0,0 +1,18 @@ +#ifndef NCCL_MLX5DV_CORE_H_ +#define NCCL_MLX5DV_CORE_H_ + +/* Basic MLX5 direct verbs structs. Needed to dynamically load MLX5 direct verbs functions without + * explicit including of MLX5 direct verbs header. + */ + +#include +#include +#include +#include +#include "ibvwrap.h" + +enum mlx5dv_reg_dmabuf_access { + MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT = (1<<0), +}; + +#endif // NCCL_MLX5DV_CORE_H_ diff --git a/src/include/mlx5/mlx5dvsymbols.h b/src/include/mlx5/mlx5dvsymbols.h new file mode 100644 index 000000000..fb08368e7 --- /dev/null +++ b/src/include/mlx5/mlx5dvsymbols.h @@ -0,0 +1,23 @@ +#ifndef NCCL_MLX5DV_SYMBOLS_H_ +#define NCCL_MLX5DV_SYMBOLS_H_ + +#ifdef NCCL_BUILD_MLX5DV +#include +#else +#include "mlx5/mlx5dvcore.h" +#endif + +#include "nccl.h" + +/* MLX5 Direct Verbs Function Pointers*/ +struct ncclMlx5dvSymbols { + bool (*mlx5dv_internal_is_supported)(struct ibv_device *device); + int (*mlx5dv_internal_get_data_direct_sysfs_path)(struct ibv_context *context, char *buf, size_t buf_len); + /* DMA-BUF support */ + struct ibv_mr * (*mlx5dv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access); + }; + +/* Constructs MLX5 direct verbs symbols per rdma-core linking or dynamic loading mode */ +ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols); + +#endif // NCCL_MLX5DV_SYMBOLS_H_ diff --git a/src/include/mlx5/mlx5dvwrap.h b/src/include/mlx5/mlx5dvwrap.h new file mode 100644 index 000000000..4f858f3c6 --- /dev/null +++ b/src/include/mlx5/mlx5dvwrap.h @@ -0,0 +1,41 @@ +/************************************************************************* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_MLX5DVWRAP_H_ +#define NCCL_MLX5DVWRAP_H_ + +#include +#include +#ifdef NCCL_BUILD_MLX5DV +#include +#else +#include "mlx5/mlx5dvcore.h" +#endif + +#include "core.h" +#include "ibvwrap.h" +#include +#include + +typedef enum mlx5dv_return_enum +{ + MLX5DV_SUCCESS = 0, //!< The operation was successful +} mlx5dv_return_t; + +ncclResult_t wrap_mlx5dv_symbols(void); +/* NCCL wrappers of MLX5 direct verbs functions */ +bool wrap_mlx5dv_is_supported(struct ibv_device *device); +ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len); +/* DMA-BUF support */ +ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access); +struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access); + +#endif // NCCL_MLX5DVWRAP_H_ diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index fcf2251fe..0f387c15e 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -7,6 +7,8 @@ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ +#include + typedef enum { NCCL_LOG_NONE = 0, NCCL_LOG_VERSION = 1, @@ -38,6 +40,16 @@ typedef enum { typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); +// NCCL core profiler callback for network defined events instrumentation +enum { + ncclProfilerNetEventStart = 0, + ncclProfilerNetEventStop, + ncclProfilerNetEventUpdate, + ncclProfilerNetEventUpdateAndStop, +}; + +typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); + #define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now typedef enum { ncclFuncBroadcast = 0, @@ -51,7 +63,7 @@ typedef enum { ncclNumFuncs = 8 } ncclFunc_t; -#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/PAT #define NCCL_ALGO_UNDEF -1 #define NCCL_ALGO_TREE 0 #define NCCL_ALGO_RING 1 diff --git a/src/include/net.h b/src/include/net.h index afc2d160e..552e9bcb4 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -14,8 +14,6 @@ typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; -ncclResult_t ncclNetPluginLoad(struct ncclComm* comm); -ncclResult_t ncclNetPluginUnload(struct ncclComm* comm); ncclResult_t ncclNetInit(struct ncclComm* comm); ncclResult_t ncclNetFinalize(struct ncclComm* comm); diff --git a/src/include/nvtx.h b/src/include/nvtx.h index 2c18b36b9..de50dfe2e 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -31,10 +31,11 @@ #define NVTX_SID_CommInitRankScalable 12 // same schema as NVTX_SID_CommInitRank #define NVTX_SID_CommSplit 13 #define NVTX_SID_CommFinalize 14 +#define NVTX_SID_CommShrink 15 // When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below! // Define static schema ID for the reduction operation. 
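/* Editor's note: a hedged sketch of how a caller might drive the wrap_mlx5dv_* entry
 * points declared above to obtain a data-direct DMA-BUF memory registration. Only the
 * wrap_mlx5dv_* functions and MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT come from this patch;
 * the function name regDmaBufDataDirect, its parameters, and the particular IBV_ACCESS_*
 * flags are assumptions, and error unwinding is elided. */
static ncclResult_t regDmaBufDataDirect(struct ibv_device* dev, struct ibv_pd* pd,
                                        size_t size, int dmabufFd, struct ibv_mr** mrOut) {
  *mrOut = NULL;
  NCCLCHECK(wrap_mlx5dv_symbols());                        // resolve mlx5dv symbols (direct link or dlopen)
  if (!wrap_mlx5dv_is_supported(dev)) return ncclSuccess;  // leave *mrOut NULL when mlx5dv is unavailable
  int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
  NCCLCHECK(wrap_mlx5dv_reg_dmabuf_mr(mrOut, pd, /*offset=*/0, size, /*iova=*/0, dmabufFd,
                                      access, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT));
  return ncclSuccess;
}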
-#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 15 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 16 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; diff --git a/src/include/nvtx_payload_schemas.h b/src/include/nvtx_payload_schemas.h index 228a19275..89a41d4b5 100644 --- a/src/include/nvtx_payload_schemas.h +++ b/src/include/nvtx_payload_schemas.h @@ -67,6 +67,16 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static cons ) ) +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommShrink, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, newcomm, TYPE_UINT64, nccl_nvtxCommStr), + (int, nranks, TYPE_INT, nccl_nvtxNranksStr), + (int, myrank, TYPE_INT, nccl_nvtxRankStr), + (int, cudaDev, TYPE_INT, nccl_nvtxCudaDevStr), + (int, num_exclude, TYPE_INT, "num_exclude") + ) +) + NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommFinalize, static constexpr, NCCL_NVTX_PAYLOAD_ENTRIES( (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr) diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h index d57aad5a9..18d1486d7 100644 --- a/src/include/plugin/nccl_net.h +++ b/src/include/plugin/nccl_net.h @@ -28,10 +28,9 @@ #define NCCL_NET_MAX_REQUESTS 32 // Max number of ncclNet objects which can live in the same process -#define NCCL_NET_MAX_PLUGINS 3 - -// NCCL core profiler callback for network defined events instrumentation -typedef ncclResult_t (*ncclProfilerCallback_t)(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData); +#ifndef NCCL_NET_MAX_PLUGINS +#define NCCL_NET_MAX_PLUGINS 16 +#endif #include "net/net_v10.h" #include "net/net_v9.h" diff --git a/src/include/plugin/nccl_profiler.h b/src/include/plugin/nccl_profiler.h index 34cf9a927..710aac4d5 100644 --- a/src/include/plugin/nccl_profiler.h +++ b/src/include/plugin/nccl_profiler.h @@ -19,43 +19,53 @@ enum { }; typedef enum { - ncclProfilerProxyOpSendPosted, - ncclProfilerProxyOpSendRemFifoWait, - ncclProfilerProxyOpSendTransmitted, - ncclProfilerProxyOpSendDone, - ncclProfilerProxyOpRecvPosted, - ncclProfilerProxyOpRecvReceived, - ncclProfilerProxyOpRecvTransmitted, - ncclProfilerProxyOpRecvDone, + ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 + ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 + ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 + ncclProfilerProxyOpSendDone = 3, // deprecated in v4 + ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 + ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 + ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 + ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 + ncclProfilerProxyOpInProgress_v4 = 19, /* Legacy proxy profiler states */ - ncclProfilerProxyStepSendGPUWait, - ncclProfilerProxyStepSendWait, - ncclProfilerProxyStepRecvWait, - ncclProfilerProxyStepRecvFlushWait, - ncclProfilerProxyStepRecvGPUWait, + ncclProfilerProxyStepSendGPUWait = 8, + ncclProfilerProxyStepSendPeerWait_v4 = 20, + ncclProfilerProxyStepSendWait = 9, + ncclProfilerProxyStepRecvWait = 10, + ncclProfilerProxyStepRecvFlushWait = 11, + ncclProfilerProxyStepRecvGPUWait = 12, /* Legacy proxy control states */ - ncclProfilerProxyCtrlIdle, - ncclProfilerProxyCtrlActive, - ncclProfilerProxyCtrlSleep, - ncclProfilerProxyCtrlWakeup, - ncclProfilerProxyCtrlAppend, - ncclProfilerProxyCtrlAppendEnd, + ncclProfilerProxyCtrlIdle = 13, + ncclProfilerProxyCtrlActive = 14, + ncclProfilerProxyCtrlSleep = 15, + 
ncclProfilerProxyCtrlWakeup = 16, + ncclProfilerProxyCtrlAppend = 17, + ncclProfilerProxyCtrlAppendEnd = 18, + + /* Network defined event states */ + ncclProfilerNetPluginUpdate = 21, + + /* Kernel event states */ + ncclProfilerKernelChStop = 22, } ncclProfilerEventState_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; #include +#include "profiler/profiler_v4.h" #include "profiler/profiler_v3.h" #include "profiler/profiler_v2.h" #include "profiler/profiler_v1.h" -typedef ncclProfiler_v3_t ncclProfiler_t; -typedef ncclProfilerEventDescr_v3_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventStateArgs_v3_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v4_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; #define NCCL_PROFILER_NET_VER_BITS (16) #define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) diff --git a/src/include/plugin/profiler/profiler_v4.h b/src/include/plugin/profiler/profiler_v4.h new file mode 100644 index 000000000..157d8ddd5 --- /dev/null +++ b/src/include/plugin/profiler/profiler_v4.h @@ -0,0 +1,123 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V4_H_ +#define PROFILER_V4_H_ + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v4_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v4_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commName : user assigned communicator name + // - commHash : communicator id + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active 
events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v4_t; + +#endif diff --git a/src/include/profiler.h b/src/include/profiler.h index bae0501bb..2fb6a7d38 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -21,8 +21,8 @@ struct ncclProxyConnector; struct ncclProfilerProxy { bool initialized; - uint64_t* workStarted/*[MAXCHANNELS]*/; - uint64_t* workCompleted/*[MAXCHANNELS]*/; + struct ncclDevProfiler* workStarted/*[MAXCHANNELS]*/; + struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/; uint64_t workCounter[MAXCHANNELS]; // host work counter struct ncclProxyConnector sendProxyConn[MAXCHANNELS]; struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; @@ -43,8 +43,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan); ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan); // Proxy Op Start/Stop Event Wrappers -ncclResult_t ncclProfilerStartSendProxyOpEvent(int sub, struct ncclProxyArgs* args); -ncclResult_t ncclProfilerStartRecvProxyOpEvent(int sub, struct ncclProxyArgs* args); +ncclResult_t ncclProfilerStartProxyOpEvent(int sub, struct ncclProxyArgs* args); ncclResult_t ncclProfilerStopProxyOpEvent(int sub, struct ncclProxyArgs* args); // Proxy Step Start/Stop Event Wrappers @@ -57,11 +56,11 @@ ncclResult_t ncclProfilerStartProxyCtrlEvent(void* profilerContext, void** eHand ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle); // Kernel Channel Start/Stop Event Wrappers -ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s); -ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s); +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start); +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop); // Record Event Wrappers -ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState); +ncclResult_t ncclProfilerRecordProxyOpEventState(int sub, struct ncclProxyArgs* args, ncclProfilerEventState_t eState); ncclResult_t ncclProfilerRecordProxyStepEventState(int sub, struct ncclProxyArgs* args, int stepId, ncclProfilerEventState_t eState); 
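/* Editor's note: a minimal sketch of a v4 profiler plugin wired against the
 * profiler_v4.h interface introduced above. It accepts every callback and does
 * nothing, which is enough to show the contract: init() publishes an activation
 * mask, startEvent() hands back an opaque handle per event descriptor, and
 * recordEventState()/stopEvent() later operate on that handle. It assumes the
 * NCCL profiler plugin headers are on the include path; the exported symbol
 * name ncclProfiler_v4 and the noop* helpers are assumptions for illustration. */
#include <stddef.h>

static ncclResult_t noopInit(void** context, int* eActivationMask, const char* commName,
                             uint64_t commHash, int nNodes, int nranks, int rank,
                             ncclDebugLogger_t logfn) {
  *context = NULL;       // no per-communicator state kept in this sketch
  *eActivationMask = 0;  // a real plugin would set bits here (e.g. for ncclProfileColl events)
  return ncclSuccess;
}
static ncclResult_t noopStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr) {
  *eHandle = NULL;       // no handle needed since every event is ignored
  return ncclSuccess;
}
static ncclResult_t noopStopEvent(void* eHandle) { return ncclSuccess; }
static ncclResult_t noopRecordEventState(void* eHandle, ncclProfilerEventState_v4_t eState,
                                         ncclProfilerEventStateArgs_v4_t* eStateArgs) {
  return ncclSuccess;
}
static ncclResult_t noopFinalize(void* context) { return ncclSuccess; }

ncclProfiler_v4_t ncclProfiler_v4 = {
  "NoopProfiler_v4",  // name
  noopInit, noopStartEvent, noopStopEvent, noopRecordEventState, noopFinalize
};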
ncclResult_t ncclProfilerRecordProxyCtrlEventState(void*eHandle, int appended, ncclProfilerEventState_t eState); diff --git a/src/include/proxy.h b/src/include/proxy.h index f90c80275..772aa206c 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -105,6 +105,13 @@ struct ncclProxyOp { struct ncclProxyOp *enqNext; }; +struct ncclProxySubArgs; + +struct ncclProxyEventHandle { + void* stepEventHandle; + struct ncclProxySubArgs* subArgPtr; +}; + struct ncclProxySubArgs { struct ncclProxyConnection* connection; int reg; @@ -137,13 +144,12 @@ struct ncclProxySubArgs { // Profiler plugin int eActivationMask; int rank; - uint64_t profilerSteps; pid_t pid; void* profilerContext; void* taskEventHandle; void* opEventHandle; void* kernelEventHandle; - void* stepEventHandles[NCCL_STEPS]; + struct ncclProxyEventHandle pHandles[NCCL_STEPS]; size_t transSize; uint64_t workCounter; @@ -226,6 +232,8 @@ struct ncclProxyPeer { }; struct ncclSharedNetComms { + int activeConnect[MAXCHANNELS]; + int activeAccept[MAXCHANNELS]; void* sendComm[MAXCHANNELS]; void* recvComm[MAXCHANNELS]; int sendRefCount[MAXCHANNELS]; diff --git a/src/include/register.h b/src/include/register.h index 143f41bc9..231cbfc34 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -29,18 +29,24 @@ struct ncclRegNetHandles { struct ncclRegNetHandles* next; }; +struct ncclSymRegTask { + struct ncclSymRegTask *next; + void* buff; + size_t baseSize; + CUmemGenericAllocationHandle memHandle; + struct ncclReg* regHandle; + size_t alignment; +}; + struct ncclReg { // common attributes - size_t pages; + uintptr_t begAddr, endAddr; // page aligned int localRefs; int graphRefs; - uintptr_t addr; uint32_t state; // net reg struct ncclRegNetHandles* netHandleHead; // nvls reg - uintptr_t baseAddr; - size_t baseSize; CUdeviceptr regAddr; size_t regUCSize, regMCSize; int dev; @@ -52,6 +58,10 @@ struct ncclReg { // general ipc reg struct ncclPeerRegIpcAddr regIpcAddrs; struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS]; + // symmetric reg + void* baseSymPtr; + size_t symSize; + int winFlags; }; struct ncclRegCache { @@ -60,10 +70,14 @@ struct ncclRegCache { uintptr_t pageSize; }; +struct ncclWindow { + struct ncclReg* handle; +}; + ncclResult_t ncclRegCleanup(struct ncclComm* comm); -ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg); ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle); ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid); +ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle); #endif diff --git a/src/include/register_inline.h b/src/include/register_inline.h new file mode 100644 index 000000000..fb7641b13 --- /dev/null +++ b/src/include/register_inline.h @@ -0,0 +1,33 @@ +#ifndef NCCL_REGISTER_INLINE_H_ +#define NCCL_REGISTER_INLINE_H_ + +#include "comm.h" +#include "register.h" + +static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** outReg) { + struct ncclRegCache* cache = &comm->regCache; + *outReg = NULL; + for (int slot=0; /*true*/; slot++) { + if (slot == cache->population) return ncclSuccess; + struct ncclReg *reg = cache->slots[slot]; + if ((uintptr_t)data < reg->begAddr) return ncclSuccess; + if ((uintptr_t)data + size <= 
reg->endAddr) { + *outReg = reg; + return ncclSuccess; + } + } +} + +static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct ncclReg** outReg) { + struct ncclReg* regRecord = NULL; + *symPtr = NULL; + *outReg = NULL; + NCCLCHECK(ncclRegFind(comm, data, size, ®Record)); + if (regRecord && regRecord->baseSymPtr) { + *symPtr = (void*)((uintptr_t)regRecord->baseSymPtr + (uintptr_t)data - (uintptr_t)regRecord->begAddr); + *outReg = regRecord; + } + return ncclSuccess; +} + +#endif diff --git a/src/include/socket.h b/src/include/socket.h index ffa148091..adeae9b2a 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -69,8 +69,10 @@ struct ncclSocket { const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); -int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); -int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); +ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr, + union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found); +ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs, + int* nIfs); // Initialize a socket ncclResult_t ncclSocketInit(struct ncclSocket* sock, const union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0, int customRetry = 0); diff --git a/src/include/symmetric.h b/src/include/symmetric.h new file mode 100644 index 000000000..7a189bcca --- /dev/null +++ b/src/include/symmetric.h @@ -0,0 +1,90 @@ +#ifndef NCCL_DEVICE_SYMMETRIC_H_ +#define NCCL_DEVICE_SYMMETRIC_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "bitops.h" + +constexpr int ncclSymMaxBlocks = 64; +constexpr int ncclSymMaxThreads = 512; +constexpr int ncclSymLLMaxEltSize = 64; + +constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) { + return ncclSymMaxThreads*ncclSymLLMaxEltSize/eltSize; +} + +constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) { + return /*LL Overhead*/2 * maxval(ncclSymMaxThreads*nRanks*8, ncclSymLLMaxSlots(ncclSymLLMaxEltSize)*ncclSymLLMaxEltSize); +} + +struct alignas(16) ncclSymDevBase { + uint32_t llEpoch[ncclSymMaxBlocks]; + uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks]; + uint32_t barInboxMc[ncclSymMaxBlocks]; + uint32_t barInboxPerPeer[]; + + static constexpr size_t size(int nRanks) { + return sizeof(ncclSymDevBase) + + alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) + + ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks); + } +}; + +static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) { + // Get pointer to buffer trailing the header struct. 
+ char* ans = (char*)(base + 1); + // Skip over barInboxPerPeer[] + ans += alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16); + // Skip to our block + int epochSize = ncclSymLLEpochSize(nRanks); + ans += block * /*epochs=*/2 * epochSize; + ans += (epoch & 1)*epochSize; + return (uint4*)ans; +} + +struct ncclSymDevComm { + ncclSymDevBase* base; + ncclSymDevBase* baseMc; + uint32_t stride4G; + int nRanks, rank; + uint32_t nRanks_rcp32; // idivRcp32(nRanks) +}; + +struct alignas(16) ncclSymDevArgs { + struct ncclSymDevComm comm; + int rootRank; + uint64_t redOpArg; // must be collectively uniform + size_t nElts; + char* input; + char* output; +}; + +enum ncclSymKernelId { + ncclSymKernelId_AllReduce_AGxLL_R, + ncclSymKernelId_AllReduce_AGxLLMC_R, + ncclSymKernelId_AllReduce_RSxLD_AGxST, + ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC, + + ncclSymKernelId_AllGather_LL, + ncclSymKernelId_AllGather_LLMC, + ncclSymKernelId_AllGather_ST, + ncclSymKernelId_AllGather_STMC, + + ncclSymKernelId_ReduceScatter_LL, + ncclSymKernelId_ReduceScatter_LD, + ncclSymKernelId_ReduceScatter_LDMC, + + ncclSymKernelId_Count +}; + +bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); + +ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps); + +// Generated by src/device/symmetric/generate.py +extern int const ncclSymKernelCount; +extern void* const ncclSymKernelList[]; +void* ncclSymGetKernelPtr(ncclSymKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); +const char* ncclSymKernelIdToString(int kernelId); + +#endif diff --git a/src/include/transport.h b/src/include/transport.h index c563fbbd6..a9971a74f 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -22,6 +22,7 @@ #include "proxy.h" #include "comm.h" +#include "bootstrap.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; @@ -46,6 +47,7 @@ struct ncclPeerInfo { int64_t busId; struct ncclComm* comm; int cudaCompCap; + size_t totalGlobalMem; // MNNVL support nvmlGpuFabricInfoV_t fabricInfo; int cuMemSupport; @@ -53,6 +55,8 @@ struct ncclPeerInfo { }; #define CONNECT_SIZE 256 +#define NCCL_MAX_PAGE_SIZE (512L * 1024L * 1024L) +#define NCCL_REC_PAGE_SIZE (2L * 1024L * 1024L) struct ncclConnect { char data[CONNECT_SIZE]; }; @@ -80,6 +84,7 @@ struct ncclNvlsSharedRes { char* ucBuff; // Unicast NVLS buffer address char* ucCredit; // Unicast NVLS credit address int nChannels; + int nHeads; struct ncclShmemCollBuff nvlsShmem; void *nvlsShmemHandle; }; @@ -119,7 +124,8 @@ struct ncclTransport { ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex); -ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode); +ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode); +ncclResult_t ncclTransportIsAllDirectP2p(struct ncclComm* comm, int* isAllDirectP2p); ncclResult_t ncclNvlsInit(struct ncclComm* comm); ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); @@ -154,5 +160,15 @@ ncclResult_t ncclRegisterP2pIpcBuffer(struct ncclComm* comm, void* userbuff, siz ncclResult_t ncclRegisterP2pNetBuffer(struct ncclComm* comm, void* userbuff, 
size_t size, struct ncclConnector* conn, int* regFlag, void** handle, struct ncclIntruQueue* cleanupQueue); ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); +ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels); + +ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm); +ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr); +ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr); +ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm); +ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm); +ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr); +ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr); +ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm); #endif diff --git a/src/include/utils.h b/src/include/utils.h index 383f678c8..bfed2722c 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -43,6 +43,12 @@ static long log2i(long n) { return log2Down(n); } +// Comparator function for qsort/bsearch to compare integers +static int compareInts(const void *a, const void *b) { + int ia = *(const int*)a, ib = *(const int*)b; + return (ia > ib) - (ia < ib); +} + inline uint64_t clockNano() { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); diff --git a/src/init.cc b/src/init.cc index 47d7fa3c6..83764a883 100644 --- a/src/init.cc +++ b/src/init.cc @@ -18,6 +18,7 @@ #include "argcheck.h" #include "tuner.h" #include "ras.h" +#include "profiler.h" #include "mnnvl.h" #include #include @@ -29,6 +30,7 @@ #include #include "param.h" #include "nvtx_payload_schemas.h" +#include "utils.h" #define STR2(v) #v #define STR(v) STR2(v) @@ -48,6 +50,10 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); NCCL_PARAM(CommBlocking, "COMM_BLOCKING", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(RuntimeConnect, "RUNTIME_CONNECT", 1); +NCCL_PARAM(WinEnable, "WIN_ENABLE", 1); +NCCL_PARAM(CollnetEnable, "COLLNET_ENABLE", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(CtaPolicy, "CTA_POLICY", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT); static ncclResult_t commReclaim(ncclComm_t comm); @@ -174,6 +180,10 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + if (comm->symmetricSupport && comm->symDevComm.base) { + NCCLCHECK(ncclCommSymmetricFreeInternal(comm, comm->baseUCSymPtr + comm->rank * comm->baseStride)); + } + NCCLCHECK(ncclRasCommFini(comm)); /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will @@ -253,15 +263,16 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclRegCleanup(comm)); + if (comm->symmetricSupport) { + NCCLCHECK(ncclNvlsSymmetricFinalize(comm)); + NCCLCHECK(ncclIpcSymmetricFinalize(comm)); + } INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, 
comm->busId, abort ? "Abort" : "Destroy"); commPoison(comm); // poison comm before free to avoid comm reuse. NCCLCHECK(ncclProfilerPluginFinalize(comm)); NCCLCHECK(ncclNetFinalize(comm)); - NCCLCHECK(ncclNetPluginUnload(comm)); - ncclCudaContextDrop(comm->context); - free(comm); return ncclSuccess; @@ -271,7 +282,7 @@ NCCL_PARAM(DisableGraphHelper, "GRAPH_HELPER_DISABLE", 0); // GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); #define NCCL_WORK_FIFO_BYTES_DEFAULT (1<<20) -NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", -1); +NCCL_PARAM(WorkFifoBytes, "WORK_FIFO_BYTES", NCCL_WORK_FIFO_BYTES_DEFAULT); NCCL_PARAM(WorkArgsBytes, "WORK_ARGS_BYTES", INT64_MAX); enum ncclLaunchMode ncclParamLaunchMode; @@ -331,12 +342,10 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in comm->rank = rank; comm->nRanks = ndev; - NCCLCHECK(ncclNetPluginLoad(comm)); NCCLCHECK(ncclNetInit(comm)); - NCCLCHECK(ncclProfilerPluginInit(comm)); INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name); - if (parent && parent->config.splitShare) { + if (parent && parent->shareResources) { if (parent->ncclNet != comm->ncclNet) { WARN("Split shares resources, but parent comm netName %s is different from child comm netName %s", parent->ncclNet->name, comm->ncclNet->name); return ncclInvalidUsage; @@ -361,13 +370,14 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; comm->dmaBufSupport = (dmaBufSupported(comm) == ncclSuccess) ? true : false; - comm->collNetSupport = 0; memset(comm->collNetSupportMatrix, 0, sizeof(comm->collNetSupportMatrix)); ncclMemoryPoolConstruct(&comm->memPool_ncclKernelPlan); ncclMemoryPoolConstruct(&comm->memPool_ncclProxyOp); - comm->groupNext = reinterpret_cast(0x1); + for (int i = 0; i < ncclGroupTaskTypeNum; i++) { + comm->groupNext[i] = reinterpret_cast(0x1); + } comm->preconnectNext = reinterpret_cast(0x1); static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); @@ -378,7 +388,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in // Mark channels as non initialized. for (int c=0; c < MAXCHANNELS; c++) comm->channels[c].id = -1; - if (parent == NULL || !parent->config.splitShare) { + if (parent == NULL || !parent->shareResources) { struct ncclSharedResources* sharedRes = NULL; NCCLCHECK(ncclCalloc(&sharedRes, 1)); /* most of attributes are assigned later in initTransportsRank(). 
*/ @@ -432,6 +442,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { bool ccEnable; cudaStream_t deviceStream; + memset(&tmpCommAndChans, '\0', sizeof(tmpCommAndChans)); NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->deviceStream, /*concurrent=*/false, &deviceStream), ret, fail); NCCLCHECKGOTO(ncclCudaCallocAsync(&devCommAndChans, 1, deviceStream), ret, fail); ncclCommPushCudaFree(comm, devCommAndChans); @@ -458,22 +469,12 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { if (ccEnable) { comm->workFifoBytes = 0; } else { - int64_t workFifoBytesParam = ncclParamWorkFifoBytes(); - if (workFifoBytesParam == -1) { - if (comm->MNNVL && (comm->compCap >= 100)) { - // WAR: Disable work fifo for Blackwell all2all hang issue on MNNVL - INFO(NCCL_INIT, "Disabling work fifo"); - comm->workFifoBytes = 0; - } else { - comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; - } - } else { - if (0 != (workFifoBytesParam & (workFifoBytesParam-1))) { - WARN("NCCL_WORK_FIFO_BYTES=%ld is being ignored because it is not a power of 2.", workFifoBytesParam); - comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; - } - comm->workFifoBytes = std::min(workFifoBytesParam, 1ul<<30); + comm->workFifoBytes = ncclParamWorkFifoBytes(); + if (0 != (comm->workFifoBytes & (comm->workFifoBytes-1))) { + WARN("NCCL_WORK_FIFO_BYTES=%d is being ignored because it is not a power of 2.", comm->workFifoBytes); + comm->workFifoBytes = NCCL_WORK_FIFO_BYTES_DEFAULT; } + comm->workFifoBytes = std::min(comm->workFifoBytes, 1u<<30); } if (comm->rank == 0) { @@ -492,11 +493,9 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workFifoBufDev = comm->workFifoBuf; } - NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->workFifoConsumed, MAXCHANNELS), ret, fail); - ncclCommPushCudaHostFree(comm, comm->workFifoConsumed); comm->workFifoProduced = 0; - comm->workFifoConsumedLeast = 0; - tmpCommAndChans.comm.workConsumed = comm->workFifoConsumed; + comm->workFifoProducedLastRecorded = 0; + comm->workFifoConsumed = 0; // Alloc profiler counters for the kernel NCCLCHECKGOTO(ncclCudaHostCalloc(&comm->profiler.workStarted, MAXCHANNELS), ret, fail); @@ -549,6 +548,7 @@ NCCL_PARAM(MNNVLUUID, "MNNVL_UUID", -1); NCCL_PARAM(MNNVLCliqueId, "MNNVL_CLIQUE_ID", -1); static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) { + cudaDeviceProp prop; info->rank = comm->rank; info->cudaDev = comm->cudaDev; info->nvmlDev = comm->nvmlDev; @@ -556,6 +556,8 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->hostHash=getHostHash()+commHash; info->pidHash=getPidHash()+commHash; info->cuMemSupport = ncclCuMemEnable(); + CUDACHECK(cudaGetDeviceProperties(&prop, comm->cudaDev)); + info->totalGlobalMem = ROUNDUP(prop.totalGlobalMem, (1L << 32)); // Get the device MAJOR:MINOR of /dev/shm so we can use that // information to decide whether we can use SHM for inter-process @@ -700,6 +702,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p struct ncclTopoRanks topoRanks; int cpuArch; int cpuVendor; + int localRanks; }; int nChannelsOrig; @@ -711,12 +714,14 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p struct ncclProxyConnector proxyConn; int* pxnPeers = NULL; int *topParentLocalRanks = NULL; + int p2pLevel = -1; timers[TIMER_INIT_ALLGATHER] = clockNano(); // AllGather1 - begin NCCLCHECKGOTO(ncclCalloc(&comm->peerInfo, nranks+1), ret, fail); // Extra rank to represent CollNet root 
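/* Editor's note: a small stand-alone sketch of the NCCL_WORK_FIFO_BYTES handling that
 * devCommSetup() now performs above: take the parameter verbatim, fall back to the
 * 1 MiB NCCL_WORK_FIFO_BYTES_DEFAULT when the value is not a power of two, and clamp
 * to 1 GiB. validateWorkFifoBytes() is a hypothetical helper for illustration only. */
#include <algorithm>
#include <cstdint>

static uint32_t validateWorkFifoBytes(uint64_t requested) {
  uint64_t bytes = requested;
  if ((bytes & (bytes - 1)) != 0) bytes = 1 << 20;         // not a power of two -> use the default
  return (uint32_t)std::min<uint64_t>(bytes, 1u << 30);    // never exceed 1 GiB
}
// validateWorkFifoBytes(4 << 20)    == 4 MiB (power of two, passes through)
// validateWorkFifoBytes(3 << 20)    == 1 MiB (rejected, default applied)
// validateWorkFifoBytes(1ull << 31) == 1 GiB (clamped)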
NCCLCHECKGOTO(fillInfo(comm, comm->peerInfo+rank, comm->commHash), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, comm->peerInfo, sizeof(struct ncclPeerInfo)), ret, fail); + __atomic_store_n(&comm->peerInfoValid, true, __ATOMIC_RELEASE); comm->cuMemSupport = 1; for (int i = 0; i < nranks; i++) { @@ -738,7 +743,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p timers[TIMER_INIT_ALLGATHER] = clockNano() - timers[TIMER_INIT_ALLGATHER]; // Check for MNNVL support - if ((nNodes > 1 && ncclParamMNNVLEnable() != 0) || ncclParamMNNVLEnable() == 1) { + NCCLCHECKGOTO(ncclGetUserP2pLevel(&p2pLevel), ret, fail); + if ((nNodes > 1 && ncclParamMNNVLEnable() != 0 && p2pLevel != 0) || ncclParamMNNVLEnable() == 1) { NCCLCHECKGOTO(ncclMnnvlCheck(comm), ret, fail); } @@ -829,14 +835,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } // Determine local CollNet support - if (collNetSupport(comm)) { - const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE"); - if (collNetEnable != NULL) { - INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); - if (strcmp(collNetEnable, "1") == 0) { - comm->collNetSupport = 1; - } - } + if (!collNetSupport(comm)) { + comm->config.collnetEnable = 0; } // Determine local Nvls support @@ -873,7 +873,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p collNetDirectGraph->collNet = 1; collNetDirectGraph->minChannels = 1; collNetDirectGraph->maxChannels = MAXCHANNELS; - if (comm->collNetSupport) { + if (comm->config.collnetEnable) { NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetChainGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, collNetChainGraph), ret, fail); NCCLCHECKGOTO(ncclTopoCompute(comm->topo, collNetDirectGraph), ret, fail); @@ -1014,7 +1014,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } comm->maxTreePattern = std::max(comm->maxTreePattern, allGather3Data[i].graphInfo[NCCL_ALGO_TREE].pattern); } - if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->collNetSupport = 0; + if (graphs[NCCL_ALGO_COLLNET_CHAIN]->nChannels == 0) comm->config.collnetEnable = 0; if (graphs[NCCL_ALGO_NVLS]->nChannels == 0) comm->nvlsSupport = comm->nvlsChannels = 0; comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels); @@ -1025,11 +1025,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } // Determine CollNet support after all-gather now that we know nNodes and each node localRanks - if (comm->collNetSupport == 1) { + if (comm->config.collnetEnable == 1) { int collNetNodeThreshold = ncclParamCollNetNodeThreshold(); if (comm->nNodes < collNetNodeThreshold) { INFO(NCCL_INIT, "Communicator has %d nodes which is less than CollNet node threshold %d, disabling CollNet", comm->nNodes, collNetNodeThreshold); - comm->collNetSupport = 0; + comm->config.collnetEnable = 0; } } NCCLCHECK(ncclTopoPathAllNVLink(comm->topo, &comm->isAllNvlink)); @@ -1075,9 +1075,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } comm->topParentLocalRanks = topParentLocalRanks; - NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->intraNodeP2pSupport, &comm->directMode), ret, fail); + // Profiler plugin context has to be initialized before proxy thread + NCCLCHECK(ncclProfilerPluginInit(comm)); + + NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, 
&comm->directMode), ret, fail); // Launch proxy service thread, after this, the proxy calls can be used. - if (parent && parent->config.splitShare) { + if (parent && parent->shareResources) { comm->proxyState = parent->sharedRes->proxyState; ncclAtomicRefCountIncrement(&parent->sharedRes->proxyState->refCount); } else { @@ -1147,10 +1150,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p for (int c=0; cnChannels; c++) { NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); } - // Setup NVLS + // Attempt to setup NVLS, may silently fail and disable NVLS NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); // Check if we can setup CollNet - if (comm->collNetSupport > 0) ncclCollNetSetup(comm, parent, graphs); + if (comm->config.collnetEnable) ncclCollNetSetup(comm, parent, graphs); } else { for (int c=0; cnChannels; c++) { NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, fail); @@ -1163,7 +1166,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Connect PAT only for communicators with 1 GPU per node if (comm->maxLocalRanks == 1) NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail); - // Setup NVLS + // Attempt to setup NVLS, may silently fail and disable NVLS NCCLCHECKGOTO(ncclNvlsSetup(comm, parent), ret, fail); NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); @@ -1171,7 +1174,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); // Check if we can setup CollNet - if (comm->collNetSupport > 0) { + if (comm->config.collnetEnable) { ncclCollNetSetup(comm, parent, graphs); NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) { @@ -1244,9 +1247,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } } + comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable(); + comm->baseStride = 0; + // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. NCCLCHECKGOTO(devCommSetup(comm), ret, fail); + timers[TIMER_INIT_CONNECT] = clockNano() - timers[TIMER_INIT_CONNECT]; /* Local intra-node barrier */ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); @@ -1260,7 +1267,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p /* If split resource is shared, we are not able to unlink the proxy ops pool here since the child comm can * attach the proxy ops pool of parent at any time; otherwise, unlink it here to make sure the pool will be * properly cleaned up. 
*/ - if (comm->sharedRes->owner == comm && !comm->config.splitShare && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm); + if (comm->sharedRes->owner == comm && !comm->shareResources && ret == ncclSuccess && !ncclCuMemEnable()) ncclProxyShmUnlink(comm); free(allTopoRanks); free(nodesTreePatterns); free(nodesFirstRank); @@ -1293,6 +1300,9 @@ struct ncclCommInitRankAsyncJob { struct ncclComm* parent; int color, key; int splitCount; + // For Shrink + int* excludeRanksList; + int excludeRanksCount; // name of the function calling char funcName[NCCL_COMMINIT_FUNCNAME_LEN]; }; @@ -1303,6 +1313,7 @@ struct ncclCommFinalizeAsyncJob { }; NCCL_PARAM(CommSplitShareResources, "COMM_SPLIT_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(CommShrinkShareResources, "COMM_SHRINK_SHARE_RESOURCES", NCCL_CONFIG_UNDEF_INT); typedef struct{ int key; @@ -1350,6 +1361,21 @@ static ncclResult_t commGetSplitInfo(struct ncclComm* comm, struct ncclComm* par goto exit; } +static ncclResult_t getParentRanks(int parentRanks, int parentRank, int* excludeRanksList, int excludeRanksCount, int* nRanksRet, int* myRankRet, int* parentRanksRet) { + int count = 0, j = 0; + for (int i = 0; i < parentRanks; i++) { + // we assume excludeRanksList is sorted + if (j < excludeRanksCount && excludeRanksList[j] == i) { + j++; + continue; + } + if (i == parentRank) *myRankRet = count; + parentRanksRet[count++] = i; + } + *nRanksRet = parentRanks - excludeRanksCount; + return ncclSuccess; +} + static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { struct ncclCommInitRankAsyncJob* job = (struct ncclCommInitRankAsyncJob*)job_; ncclComm_t comm = job->comm; @@ -1383,9 +1409,13 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { if (job->parent) { NCCLCHECKGOTO(ncclCalloc(&parentRanks, job->parent->nRanks), res, fail); - NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); - // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. - if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; + if (job->excludeRanksCount) { + NCCLCHECKGOTO(getParentRanks(job->parent->nRanks, job->parent->rank, job->excludeRanksList, job->excludeRanksCount, &job->nranks, &job->myrank, parentRanks), res, fail); + } else { + NCCLCHECKGOTO(commGetSplitInfo(comm, job->parent, job->color, job->key, &job->nranks, &job->myrank, parentRanks), res, fail); + // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. + if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; + } timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; @@ -1477,6 +1507,10 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { int minCTAsEnv; int maxCTAsEnv; int splitShareEnv; + int collnetEnableEnv; + int ctaPolicyEnv; + int shrinkShareEnv; + int nvlsCTAsEnv; /* override configuration from env variable. 
*/ blockingEnv = ncclParamCommBlocking(); @@ -1522,6 +1556,25 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { if (splitShareEnv != NCCL_CONFIG_UNDEF_INT) { comm->config.splitShare = splitShareEnv; } + shrinkShareEnv = ncclParamCommShrinkShareResources(); + if (shrinkShareEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.shrinkShare = shrinkShareEnv; + } + + collnetEnableEnv = ncclParamCollnetEnable(); + if (collnetEnableEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.collnetEnable = collnetEnableEnv; + } + + ctaPolicyEnv = ncclParamCtaPolicy(); + if (ctaPolicyEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.CTAPolicy = ctaPolicyEnv; + } + + nvlsCTAsEnv = ncclParamNvlsChannels(); + if (nvlsCTAsEnv != NCCL_CONFIG_UNDEF_INT) { + comm->config.nvlsCTAs = nvlsCTAsEnv; + } /* cap channels if needed */ if (comm->config.minCTAs > MAXCHANNELS) { @@ -1544,6 +1597,20 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { comm->config.splitShare = 0; } + if (comm->config.collnetEnable != 1 && comm->config.collnetEnable != 0) { + INFO(NCCL_ENV, "collnetEnable %d is not a valid value 0/1, set it to 0", comm->config.collnetEnable); + comm->config.collnetEnable = 0; + } + + if (comm->config.CTAPolicy < NCCL_CTA_POLICY_DEFAULT || comm->config.CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY) { + INFO(NCCL_ENV, "CTAPolicy %d is not a valid value, set it to %d", comm->config.CTAPolicy, NCCL_CTA_POLICY_DEFAULT); + comm->config.CTAPolicy = NCCL_CTA_POLICY_DEFAULT; + } + + if (comm->config.nvlsCTAs != NCCL_CONFIG_UNDEF_INT && comm->config.nvlsCTAs <= 0) { + INFO(NCCL_ENV, "nvlsCTAs %d is not a valid value, NCCL will decide the default value automatically", comm->config.nvlsCTAs); + comm->config.nvlsCTAs = NCCL_CONFIG_UNDEF_INT; + } return ret; } @@ -1584,6 +1651,17 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { internalConfigPtr->maxCTAs = defaultConfig.maxCTAs; internalConfigPtr->netName = defaultConfig.netName; } + + if (internalConfigPtr->version < NCCL_VERSION(2, 25, 0)) { + internalConfigPtr->trafficClass = defaultConfig.trafficClass; + } + + if (internalConfigPtr->version < NCCL_VERSION(2, 27, 0)) { + internalConfigPtr->collnetEnable = defaultConfig.collnetEnable; + internalConfigPtr->CTAPolicy = defaultConfig.CTAPolicy; + internalConfigPtr->shrinkShare = defaultConfig.shrinkShare; + internalConfigPtr->nvlsCTAs = defaultConfig.nvlsCTAs; + } } /* check input config attributes, -1 means user-undefined and we should use default value from NCCL. 
*/ @@ -1615,6 +1693,31 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { goto fail; } + if (internalConfigPtr->collnetEnable != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->collnetEnable < 0 || internalConfigPtr->collnetEnable > 1)) { + WARN("Invalid config collnetEnable attribute value %d", internalConfigPtr->collnetEnable); + ret = ncclInvalidArgument; + goto fail; + } + + if (internalConfigPtr->CTAPolicy != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->CTAPolicy < NCCL_CTA_POLICY_DEFAULT || + internalConfigPtr->CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY)) { + WARN("Invalid config policy attribute value %d", internalConfigPtr->CTAPolicy); + ret = ncclInvalidArgument; + goto fail; + } + + if (internalConfigPtr->shrinkShare != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->shrinkShare != 0 && internalConfigPtr->shrinkShare != 1) { + WARN("Invalid config shrinkShare attribute value %d", internalConfigPtr->shrinkShare); + ret = ncclInvalidArgument; + goto fail; + } + + if (internalConfigPtr->nvlsCTAs != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->nvlsCTAs <= 0) { + WARN("Invalid config nvlsCTAs attribute value %d", internalConfigPtr->nvlsCTAs); + ret = ncclInvalidArgument; + goto fail; + } + /* default config value can be tuned on different platform. */ NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d"); @@ -1623,6 +1726,11 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { NCCL_CONFIG_DEFAULT(internalConfigPtr, netName, NCCL_CONFIG_UNDEF_PTR, NULL, "Net name", "%s"); NCCL_CONFIG_DEFAULT(internalConfigPtr, splitShare, NCCL_CONFIG_UNDEF_INT, 0, "Split share", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, trafficClass, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "Traffic class", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, commName, NCCL_CONFIG_UNDEF_PTR, NULL, "Comm name", "%s"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, collnetEnable, NCCL_CONFIG_UNDEF_INT, 0, "Collnet enable", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, CTAPolicy, NCCL_CONFIG_UNDEF_INT, NCCL_CTA_POLICY_DEFAULT, "CTA policy flags", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, shrinkShare, NCCL_CONFIG_UNDEF_INT, 0, "shrinkShare", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, nvlsCTAs, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "nvlsCTAs", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; @@ -1632,7 +1740,11 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { comm->config.netName = internalConfigPtr->netName; comm->config.splitShare = internalConfigPtr->splitShare; comm->config.trafficClass = internalConfigPtr->trafficClass; - + comm->config.commName = internalConfigPtr->commName; + comm->config.collnetEnable = internalConfigPtr->collnetEnable; + comm->config.CTAPolicy = internalConfigPtr->CTAPolicy; + comm->config.shrinkShare = internalConfigPtr->shrinkShare; + comm->config.nvlsCTAs = internalConfigPtr->nvlsCTAs; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); exit: @@ -1909,7 +2021,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { WARN("commDestroySync: comm %p rank %d sync deviceStream error %d\n", comm, comm->rank, ret); } - NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm), ret, fail); + NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm, true), ret, fail); NCCLCHECKGOTO(ncclCommPollCallbacks(comm, false), ret, fail); // 
And keep polling until all graphs referencing us die. while (comm->localPersistentRefs != 0) { @@ -2052,7 +2164,6 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); - NCCLCHECK(ncclGroupStartInternal()); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); @@ -2067,13 +2178,22 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: - ncclGroupErrCheck(res); - NCCLCHECK(ncclGroupEndInternal()); return res; fail: goto exit; } +static ncclResult_t setCommAbortFlags(ncclComm_t comm, int value) { + // Set abort flags + if (comm->childAbortFlag != nullptr) { + __atomic_store_n(comm->childAbortFlag, value, __ATOMIC_RELEASE); + __atomic_store_n(comm->childAbortFlagDev, value, __ATOMIC_RELEASE); + } + __atomic_store_n(comm->abortFlag, value, __ATOMIC_RELEASE); + __atomic_store_n(comm->abortFlagDev, value, __ATOMIC_RELEASE); + return ncclSuccess; +} + NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm) { NVTX3_RANGE(NcclNvtxParamsCommAbort); @@ -2081,14 +2201,8 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) { return ncclSuccess; } - NCCLCHECK(ncclGroupStartInternal()); // Ask anything that might still be running on the device to quit - if (comm->childAbortFlag != nullptr) { - __atomic_store_n(comm->childAbortFlag, 1, __ATOMIC_RELEASE); - __atomic_store_n(comm->childAbortFlagDev, 1, __ATOMIC_RELEASE); - } - __atomic_store_n(comm->abortFlag, 1, __ATOMIC_RELEASE); - __atomic_store_n(comm->abortFlagDev, 1, __ATOMIC_RELEASE); + NCCLCHECK(setCommAbortFlags(comm,1)); comm->destroyFlag = 1; /* init thread must be joined before we destroy the comm, * and we should ignore the init error here. 
*/ @@ -2109,38 +2223,51 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: - ncclGroupErrCheck(res); - NCCLCHECK(ncclGroupEndInternal()); return ncclSuccess; fail: goto exit; } -NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); -ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config) { +static void childCommCleanupJob(void* job) { + struct ncclCommInitRankAsyncJob* initJob = (struct ncclCommInitRankAsyncJob*)job; + if (initJob->excludeRanksList) free(initJob->excludeRanksList); + free(job); +} + +// initializing a child communicator (for both split and shrink) +static ncclResult_t ncclCommInitChildComm(ncclComm_t comm, ncclComm_t* newcomm, bool isShrink, int flags, int color, int key, int* excludeRanksList, int excludeRanksCount, + ncclConfig_t* config, const char* caller) { struct ncclCommInitRankAsyncJob *job = NULL; struct ncclComm* childComm = NCCL_COMM_NULL; ncclResult_t res = ncclSuccess; - NVTX3_RANGE(NcclNvtxParamsCommSplit) - int oldDev; CUDACHECK(cudaGetDevice(&oldDev)); + NCCLCHECKGOTO(CommCheck(comm, caller, "comm"), res, exit); + NCCLCHECKGOTO(PtrCheck(newcomm, caller, "newcomm"), res, exit); + if (isShrink) { + NCCLCHECKGOTO(PtrCheck(excludeRanksList, caller, "excludeRanksList"), res, exit); + NCCLCHECKGOTO(excludeRanksCount > 0 ? ncclSuccess : ncclInvalidArgument, res, exit); + // excludeRanksList may not be sorted, need to sort it + qsort(excludeRanksList, excludeRanksCount, sizeof(int), compareInts); + // ranks in excludeRanksList should not call into this function + NCCLCHECKGOTO(bsearch(&comm->rank, excludeRanksList, excludeRanksCount, sizeof(int), compareInts) ? ncclInvalidArgument : ncclSuccess, res, exit); + } + NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, exit); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, exit); - NCCLCHECK(ncclGroupStartInternal()); - NCCLCHECKGOTO(CommCheck(comm, "CommSplit", "comm"), res, fail); - NCCLCHECKGOTO(PtrCheck(newcomm, "CommSplit", "newcomm"), res, fail); - NCCLCHECKGOTO(ncclCommEnsureReady(comm), res, fail); - - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, fail); /* *newcomm should be NCCL_COMM_NULL until comm split fully complete. */ *newcomm = NCCL_COMM_NULL; - if (color == NCCL_SPLIT_NOCOLOR) { + if (!isShrink && color == NCCL_SPLIT_NOCOLOR) { INFO(NCCL_INIT, "Rank %d has color with NCCL_SPLIT_NOCOLOR, not creating a new communicator", comm->rank); } else { NCCLCHECKGOTO(ncclCalloc(&childComm, 1), res, fail); childComm->startMagic = childComm->endMagic = NCCL_MAGIC; - if (comm->config.splitShare) { + + // Set the shareResource field, this is used throughout the init and must be reset every time. + // If we shrink, we only reuse resources if we shrink in the default mode + comm->shareResources = isShrink ? (!(flags & NCCL_SHRINK_ABORT) && comm->config.shrinkShare) : comm->config.splitShare; + if (comm->shareResources) { childComm->abortFlag = comm->abortFlag; childComm->abortFlagDev = comm->abortFlagDev; childComm->abortFlagRefCount = comm->abortFlagRefCount; @@ -2161,38 +2288,39 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc NCCLCHECKGOTO(parseCommConfig(childComm, config), res, fail); } - /* start with ncclInProgress and will be changed to ncclSuccess if init succeeds. 
*/ - childComm->initState = ncclInProgress; + /* start with ncclInternalError and will be changed to ncclSuccess if init succeeds. */ + childComm->initState = ncclInternalError; } NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail); job->comm = childComm; job->newcomm = newcomm; job->parent = comm; - job->splitCount = ++comm->splitCount; job->color = color; job->key = key; + if (excludeRanksList) { + // need to copy the list of ranks to exclude because the job is async + job->excludeRanksCount = excludeRanksCount; + NCCLCHECKGOTO(ncclCalloc(&job->excludeRanksList, excludeRanksCount), res, fail); + memcpy(job->excludeRanksList, excludeRanksList, excludeRanksCount * sizeof(int)); + } else { + // each split has to lead to a unique comm, so increment the splitCount + job->splitCount = ++comm->splitCount; + job->excludeRanksList = NULL; + } job->cudaDev = comm->cudaDev; - snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", __func__); - NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, NULL, free, comm), res, fail); + snprintf(job->funcName, NCCL_COMMINIT_FUNCNAME_LEN, "%s", caller); + NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, ncclCommInitRankFunc, /*undo=*/NULL, /*destructor=*/childCommCleanupJob, comm), res, fail); exit: (void)cudaSetDevice(oldDev); - (void)ncclGroupErrCheck(res); - NCCLCHECK(ncclGroupEndInternal()); - - if (res == ncclSuccess && *newcomm) { - NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, - NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key)); - } - return res; fail: if (childComm) { - if (!comm->config.splitShare) { - free(childComm->abortFlag); + if (!comm->shareResources) { + if (childComm->abortFlag) free(childComm->abortFlag); if (childComm->abortFlagDev) ncclCudaHostFree(childComm->abortFlagDev); - free(childComm->abortFlagRefCount); + if (childComm->abortFlagRefCount) free(childComm->abortFlagRefCount); } free(childComm); } @@ -2200,6 +2328,44 @@ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newc goto exit; } +NCCL_API(ncclResult_t, ncclCommShrink, ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags); +ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t *newcomm, ncclConfig_t* config, int shrinkFlags) { + NVTX3_RANGE(NcclNvtxParamsCommShrink) + ncclResult_t res = ncclSuccess; + NCCLCHECK(ncclGroupStartInternal()); + // Handle error mode by setting abort flags and waiting for kernels to complete and unset the flags to avoid bootstrap issues + if (shrinkFlags & NCCL_SHRINK_ABORT) { + NCCLCHECKGOTO(setCommAbortFlags(comm, 1), res, exit); + NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), res, exit); + NCCLCHECKGOTO(setCommAbortFlags(comm, 0), res, exit); + } + NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/true, shrinkFlags, /*color=*/0, /*key=*/comm->rank, excludeRanksList, excludeRanksCount, config, __func__), res, exit); + + if (*newcomm) NVTX3_RANGE_ADD_PAYLOAD(CommShrink, NcclNvtxParamsCommShrinkSchema, NVTX3_PAYLOAD(comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, excludeRanksCount)); + +exit: + (void)ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); + return res; +} + +NCCL_API(ncclResult_t, ncclCommSplit, ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t *config); +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, 
int key, ncclComm_t *newcomm, ncclConfig_t *config) { + NVTX3_RANGE(NcclNvtxParamsCommSplit) + + ncclResult_t res = ncclSuccess; + NCCLCHECK(ncclGroupStartInternal()); + NCCLCHECKGOTO(ncclCommInitChildComm(comm, newcomm, /*isShrink=*/false, /*shrink mode=*/NCCL_SHRINK_DEFAULT, color, key, NULL, 0, config, __func__), res, exit); + + if (*newcomm) + NVTX3_RANGE_ADD_PAYLOAD(CommSplit, NcclNvtxParamsCommSplitSchema, NVTX3_PAYLOAD((*newcomm)->commHash, comm->commHash, comm->nRanks, comm->rank, comm->cudaDev, color, key)); + +exit: + (void)ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); + return res; +} + NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); const char* ncclGetErrorString(ncclResult_t code) { switch (code) { @@ -2277,119 +2443,3 @@ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { *rank = comm->rank; return ncclSuccess; } - -NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); -ncclResult_t ncclMemAlloc(void **ptr, size_t size) { - NVTX3_FUNC_RANGE_IN(nccl_domain); - ncclResult_t ret = ncclSuccess; - -#if CUDART_VERSION >= 12010 - size_t memGran = 0; - CUdevice currentDev; - CUmemAllocationProp memprop = {}; - CUmemAccessDesc accessDesc = {}; - CUmemGenericAllocationHandle handle; - int cudaDev; - int flag; - int dcnt; - - if (ptr == NULL || size == 0) goto fallback; - - if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; - - CUDACHECK(cudaGetDevice(&cudaDev)); - CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - - if (ncclCuMemEnable()) { - size_t handleSize = size; - int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - // Query device to see if FABRIC handle support is available - flag = 0; - (void) CUPFN(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDev)); - if (flag) requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC; - memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; - memprop.location.id = currentDev; - // Query device to see if RDMA support is available - flag = 0; - CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, currentDev)); - if (flag) memprop.allocFlags.gpuDirectRDMACapable = 1; - CUCHECK(cuMemGetAllocationGranularity(&memGran, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); - CUDACHECK(cudaGetDeviceCount(&dcnt)); - ALIGN_SIZE(handleSize, memGran); - - if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC) { - /* First try cuMemCreate() with FABRIC handle support and then remove if it fails */ - CUresult err = CUPFN(cuMemCreate(&handle, handleSize, &memprop, 0)); - if (err == CUDA_ERROR_NOT_PERMITTED || err == CUDA_ERROR_NOT_SUPPORTED) { - requestedHandleTypes &= ~CU_MEM_HANDLE_TYPE_FABRIC; - memprop.requestedHandleTypes = (CUmemAllocationHandleType) requestedHandleTypes; - /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); - } - } else { - /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, handleSize, &memprop, 0)); - } - /* Reserve a virtual address range */ - CUCHECK(cuMemAddressReserve((CUdeviceptr*)ptr, handleSize, memGran, 0, 0)); - /* Map the virtual address range to the physical allocation */ - CUCHECK(cuMemMap((CUdeviceptr)*ptr, handleSize, 0, handle, 0)); - /* Now allow RW access to the newly mapped memory */ - for (int i = 0; i < dcnt; ++i) { - int p2p = 0; - if (i == cudaDev || 
((cudaDeviceCanAccessPeer(&p2p, cudaDev, i) == cudaSuccess) && p2p)) { - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = i; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, handleSize, &accessDesc, 1)); - } - if (0 == p2p && i != cudaDev) INFO(NCCL_ALLOC, "P2P not supported between GPU%d and GPU%d", cudaDev, i); - } - goto exit; - } - -fallback: -#endif - // Coverity is right to complain that we may pass a NULL ptr to cudaMalloc. That's deliberate though: - // we want CUDA to return an error to the caller. - // coverity[var_deref_model] - CUDACHECKGOTO(cudaMalloc(ptr, size), ret, fail); - -exit: - return ret; -fail: - goto exit; -} - -NCCL_API(ncclResult_t, ncclMemFree, void *ptr); -ncclResult_t ncclMemFree(void *ptr) { - NVTX3_FUNC_RANGE_IN(nccl_domain); - ncclResult_t ret = ncclSuccess; - int saveDevice; - - CUDACHECK(cudaGetDevice(&saveDevice)); -#if CUDART_VERSION >= 12010 - CUdevice ptrDev = 0; - - if (ptr == NULL) goto fallback; - if (ncclCudaLibraryInit() != ncclSuccess) goto fallback; - - CUCHECKGOTO(cuPointerGetAttribute((void*)&ptrDev, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr), ret, fail); - CUDACHECKGOTO(cudaSetDevice((int)ptrDev), ret, fail); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemFree(ptr), ret, fail); - goto exit; - } - -fallback: -#endif - CUDACHECKGOTO(cudaFree(ptr), ret, fail); - -exit: - CUDACHECK(cudaSetDevice(saveDevice)); - return ret; -fail: - goto exit; -} diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 64a84f556..5b66fea92 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -105,53 +105,53 @@ int ncclCuMemHostEnable() { #endif } -#define DECLARE_CUDA_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr +#define DECLARE_CUDA_PFN(symbol,version) PFN_##symbol##_v##version pfn_##symbol = nullptr #if CUDART_VERSION >= 11030 /* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN(cuDeviceGet); -DECLARE_CUDA_PFN(cuDeviceGetAttribute); -DECLARE_CUDA_PFN(cuGetErrorString); -DECLARE_CUDA_PFN(cuGetErrorName); +DECLARE_CUDA_PFN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN(cuGetErrorName, 6000); /* enqueue.cc */ -DECLARE_CUDA_PFN(cuMemGetAddressRange); -DECLARE_CUDA_PFN(cuLaunchKernel); +DECLARE_CUDA_PFN(cuMemGetAddressRange, 3020); +DECLARE_CUDA_PFN(cuLaunchKernel, 4000); #if CUDA_VERSION >= 11080 -DECLARE_CUDA_PFN(cuLaunchKernelEx); +DECLARE_CUDA_PFN(cuLaunchKernelEx, 11060); #endif /* proxy.cc */ -DECLARE_CUDA_PFN(cuCtxCreate); -DECLARE_CUDA_PFN(cuCtxDestroy); -DECLARE_CUDA_PFN(cuCtxGetCurrent); -DECLARE_CUDA_PFN(cuCtxSetCurrent); -DECLARE_CUDA_PFN(cuCtxGetDevice); +DECLARE_CUDA_PFN(cuCtxCreate, 11040); +DECLARE_CUDA_PFN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN(cuCtxGetCurrent, 4000); +DECLARE_CUDA_PFN(cuCtxSetCurrent, 4000); +DECLARE_CUDA_PFN(cuCtxGetDevice, 2000); /* cuMem API support */ -DECLARE_CUDA_PFN(cuMemAddressReserve); -DECLARE_CUDA_PFN(cuMemAddressFree); -DECLARE_CUDA_PFN(cuMemCreate); -DECLARE_CUDA_PFN(cuMemGetAllocationGranularity); -DECLARE_CUDA_PFN(cuMemExportToShareableHandle); -DECLARE_CUDA_PFN(cuMemImportFromShareableHandle); -DECLARE_CUDA_PFN(cuMemMap); -DECLARE_CUDA_PFN(cuMemRelease); -DECLARE_CUDA_PFN(cuMemRetainAllocationHandle); -DECLARE_CUDA_PFN(cuMemSetAccess); -DECLARE_CUDA_PFN(cuMemUnmap); -DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle); +DECLARE_CUDA_PFN(cuMemAddressReserve, 10020); 
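/* Side note (illustration, not patch content): with the added version argument, a declaration such as
 *   DECLARE_CUDA_PFN(cuMemAddressReserve, 10020);
 * expands to the versioned function-pointer declaration
 *   PFN_cuMemAddressReserve_v10020 pfn_cuMemAddressReserve = nullptr;
 * and the same number is what the CUDA 13 LOAD_SYM variant further below passes to
 * cudaGetDriverEntryPointByVersion(), so the entry point matching that driver API version is retrieved. */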
+DECLARE_CUDA_PFN(cuMemAddressFree, 10020); +DECLARE_CUDA_PFN(cuMemCreate, 10020); +DECLARE_CUDA_PFN(cuMemGetAllocationGranularity, 10020); +DECLARE_CUDA_PFN(cuMemExportToShareableHandle, 10020); +DECLARE_CUDA_PFN(cuMemImportFromShareableHandle, 10020); +DECLARE_CUDA_PFN(cuMemMap, 10020); +DECLARE_CUDA_PFN(cuMemRelease, 10020); +DECLARE_CUDA_PFN(cuMemRetainAllocationHandle, 11000); +DECLARE_CUDA_PFN(cuMemSetAccess, 10020); +DECLARE_CUDA_PFN(cuMemUnmap, 10020); +DECLARE_CUDA_PFN(cuMemGetAllocationPropertiesFromHandle, 10020); /* ncclMemAlloc/Free */ -DECLARE_CUDA_PFN(cuPointerGetAttribute); +DECLARE_CUDA_PFN(cuPointerGetAttribute, 4000); #if CUDA_VERSION >= 11070 /* transport/collNet.cc/net.cc*/ -DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange); // DMA-BUF support +DECLARE_CUDA_PFN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ -DECLARE_CUDA_PFN(cuMulticastAddDevice); -DECLARE_CUDA_PFN(cuMulticastBindMem); -DECLARE_CUDA_PFN(cuMulticastBindAddr); -DECLARE_CUDA_PFN(cuMulticastCreate); -DECLARE_CUDA_PFN(cuMulticastGetGranularity); -DECLARE_CUDA_PFN(cuMulticastUnbind); +DECLARE_CUDA_PFN(cuMulticastAddDevice, 12010); +DECLARE_CUDA_PFN(cuMulticastBindMem, 12010); +DECLARE_CUDA_PFN(cuMulticastBindAddr, 12010); +DECLARE_CUDA_PFN(cuMulticastCreate, 12010); +DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010); +DECLARE_CUDA_PFN(cuMulticastUnbind, 12010); #endif #endif @@ -162,8 +162,17 @@ bool ncclCudaLaunchBlocking = false; #if CUDART_VERSION >= 11030 -#if CUDART_VERSION >= 12000 -#define LOAD_SYM(symbol, ignore) do { \ +#if CUDART_VERSION >= 13000 +#define LOAD_SYM(symbol, version, ignore) do { \ + cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \ + res = cudaGetDriverEntryPointByVersion(#symbol, (void **) (&pfn_##symbol), version, cudaEnableDefault, &driverStatus); \ + if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \ + if (!ignore) { \ + WARN("Retrieve %s version %d failed with %d status %d", #symbol, version, res, driverStatus); \ + return ncclSystemError; } \ + } } while(0) +#elif CUDART_VERSION >= 12000 +#define LOAD_SYM(symbol, version, ignore) do { \ cudaDriverEntryPointQueryResult driverStatus = cudaDriverEntryPointSymbolNotFound; \ res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault, &driverStatus); \ if (res != cudaSuccess || driverStatus != cudaDriverEntryPointSuccess) { \ @@ -172,7 +181,7 @@ bool ncclCudaLaunchBlocking = false; return ncclSystemError; } \ } } while(0) #else -#define LOAD_SYM(symbol, ignore) do { \ +#define LOAD_SYM(symbol, version, ignore) do { \ res = cudaGetDriverEntryPoint(#symbol, (void **) (&pfn_##symbol), cudaEnableDefault); \ if (res != cudaSuccess) { \ if (!ignore) { \ @@ -188,46 +197,46 @@ static ncclResult_t cudaPfnFuncLoader(void) { cudaError_t res; - LOAD_SYM(cuGetErrorString, 0); - LOAD_SYM(cuGetErrorName, 0); - LOAD_SYM(cuDeviceGet, 0); - LOAD_SYM(cuDeviceGetAttribute, 0); - LOAD_SYM(cuMemGetAddressRange, 1); - LOAD_SYM(cuCtxCreate, 1); - LOAD_SYM(cuCtxDestroy, 1); - LOAD_SYM(cuCtxGetCurrent, 1); - LOAD_SYM(cuCtxSetCurrent, 1); - LOAD_SYM(cuCtxGetDevice, 1); - LOAD_SYM(cuLaunchKernel, 1); + LOAD_SYM(cuGetErrorString, 6000, 0); + LOAD_SYM(cuGetErrorName, 6000, 0); + LOAD_SYM(cuDeviceGet, 2000, 0); + LOAD_SYM(cuDeviceGetAttribute, 2000, 0); + LOAD_SYM(cuMemGetAddressRange, 3020, 1); + LOAD_SYM(cuCtxCreate, 11040, 1); + LOAD_SYM(cuCtxDestroy, 4000, 1); + LOAD_SYM(cuCtxGetCurrent, 4000, 1); + 
LOAD_SYM(cuCtxSetCurrent, 4000, 1); + LOAD_SYM(cuCtxGetDevice, 2000, 1); + LOAD_SYM(cuLaunchKernel, 4000, 1); #if CUDA_VERSION >= 11080 - LOAD_SYM(cuLaunchKernelEx, 1); + LOAD_SYM(cuLaunchKernelEx, 11060, 1); #endif /* cuMem API support */ - LOAD_SYM(cuMemAddressReserve, 1); - LOAD_SYM(cuMemAddressFree, 1); - LOAD_SYM(cuMemCreate, 1); - LOAD_SYM(cuMemGetAllocationGranularity, 1); - LOAD_SYM(cuMemExportToShareableHandle, 1); - LOAD_SYM(cuMemImportFromShareableHandle, 1); - LOAD_SYM(cuMemMap, 1); - LOAD_SYM(cuMemRelease, 1); - LOAD_SYM(cuMemRetainAllocationHandle, 1); - LOAD_SYM(cuMemSetAccess, 1); - LOAD_SYM(cuMemUnmap, 1); - LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 1); + LOAD_SYM(cuMemAddressReserve, 10020, 1); + LOAD_SYM(cuMemAddressFree, 10020, 1); + LOAD_SYM(cuMemCreate, 10020, 1); + LOAD_SYM(cuMemGetAllocationGranularity, 10020, 1); + LOAD_SYM(cuMemExportToShareableHandle, 10020, 1); + LOAD_SYM(cuMemImportFromShareableHandle, 10020, 1); + LOAD_SYM(cuMemMap, 10020, 1); + LOAD_SYM(cuMemRelease, 10020, 1); + LOAD_SYM(cuMemRetainAllocationHandle, 11000, 1); + LOAD_SYM(cuMemSetAccess, 10020, 1); + LOAD_SYM(cuMemUnmap, 10020, 1); + LOAD_SYM(cuMemGetAllocationPropertiesFromHandle, 10020, 1); /* ncclMemAlloc/Free */ - LOAD_SYM(cuPointerGetAttribute, 1); + LOAD_SYM(cuPointerGetAttribute, 4000, 1); #if CUDA_VERSION >= 11070 - LOAD_SYM(cuMemGetHandleForAddressRange, 1); // DMA-BUF support + LOAD_SYM(cuMemGetHandleForAddressRange, 11070, 1); // DMA-BUF support #endif #if CUDA_VERSION >= 12010 /* NVSwitch Multicast support */ - LOAD_SYM(cuMulticastAddDevice, 1); - LOAD_SYM(cuMulticastBindMem, 1); - LOAD_SYM(cuMulticastBindAddr, 1); - LOAD_SYM(cuMulticastCreate, 1); - LOAD_SYM(cuMulticastGetGranularity, 1); - LOAD_SYM(cuMulticastUnbind, 1); + LOAD_SYM(cuMulticastAddDevice, 12010, 1); + LOAD_SYM(cuMulticastBindMem, 12010, 1); + LOAD_SYM(cuMulticastBindAddr, 12010, 1); + LOAD_SYM(cuMulticastCreate, 12010, 1); + LOAD_SYM(cuMulticastGetGranularity, 12010, 1); + LOAD_SYM(cuMulticastUnbind, 12010, 1); #endif return ncclSuccess; } diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 698465ca4..23bf5e125 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -8,7 +8,11 @@ #include #include +#ifdef NCCL_BUILD_RDMA_CORE +#include +#else #include "ibvcore.h" +#endif #include "ibvsymbols.h" static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; diff --git a/src/misc/mlx5dvsymbols.cc b/src/misc/mlx5dvsymbols.cc new file mode 100644 index 000000000..5bb4109f3 --- /dev/null +++ b/src/misc/mlx5dvsymbols.cc @@ -0,0 +1,74 @@ +#include +#include + +#include "mlx5/mlx5dvsymbols.h" + +#ifdef NCCL_BUILD_MLX5DV +/* Mlx5dv linking mode. Symbols are pointers to linked MLX5 Direct Verbs */ + +#define ASSIGN_SYM(container, symbol, name) container->name= &symbol; + +ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) { + ASSIGN_SYM(mlx5dvSymbols, mlx5dv_is_supported, mlx5dv_internal_is_supported); + ASSIGN_SYM(mlx5dvSymbols, mlx5dv_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path); + ASSIGN_SYM(mlx5dvSymbols, mlx5dv_reg_dmabuf_mr, mlx5dv_internal_reg_dmabuf_mr); + return ncclSuccess; +} + +#else +/* Mlx5dv dynamic loading mode. Symbols are loaded from shared objects. 
*/ + +#include +#include "core.h" + +// MLX5DV Library versioning +#define MLX5DV_VERSION "MLX5_1.8" + +ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) { + static void* mlx5dvhandle = NULL; + void* tmp; + void** cast; + + mlx5dvhandle=dlopen("libmlx5.so", RTLD_NOW); + if (!mlx5dvhandle) { + mlx5dvhandle=dlopen("libmlx5.so.1", RTLD_NOW); + if (!mlx5dvhandle) { + INFO(NCCL_INIT, "Failed to open libmlx5.so[.1]"); + goto teardown; + } + } + +#define LOAD_SYM(handle, symbol, funcptr) do { \ + cast = (void**)&funcptr; \ + tmp = dlvsym(handle, symbol, MLX5DV_VERSION); \ + if (tmp == NULL) { \ + WARN("dlvsym failed on %s - %s version %s", symbol, dlerror(), MLX5DV_VERSION); \ + goto teardown; \ + } \ + *cast = tmp; \ + } while (0) + +// Attempt to load a specific symbol version - fail silently +#define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ + cast = (void**)&funcptr; \ + *cast = dlvsym(handle, symbol, version); \ + } while (0) + + LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported); + // Cherry-pick the mlx5dv_get_data_direct_sysfs_path API from MLX5 1.25 + LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_get_data_direct_sysfs_path", mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path, "MLX5_1.25"); + // Cherry-pick the ibv_reg_dmabuf_mr API from MLX5 1.25 + LOAD_SYM_VERSION(mlx5dvhandle, "mlx5dv_reg_dmabuf_mr", mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr, "MLX5_1.25"); + + return ncclSuccess; + +teardown: + mlx5dvSymbols->mlx5dv_internal_is_supported = NULL; + mlx5dvSymbols->mlx5dv_internal_get_data_direct_sysfs_path = NULL; + mlx5dvSymbols->mlx5dv_internal_reg_dmabuf_mr = NULL; + + if (mlx5dvhandle != NULL) dlclose(mlx5dvhandle); + return ncclSystemError; +} + +#endif diff --git a/src/misc/mlx5dvwrap.cc b/src/misc/mlx5dvwrap.cc new file mode 100644 index 000000000..930ed5d2e --- /dev/null +++ b/src/misc/mlx5dvwrap.cc @@ -0,0 +1,75 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "mlx5/mlx5dvwrap.h" +#include +#include + +#ifdef NCCL_BUILD_MLX5DV +#include +#else +#include "mlx5/mlx5dvcore.h" +#endif +#include "mlx5/mlx5dvsymbols.h" + +static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static ncclResult_t initResult; +struct ncclMlx5dvSymbols mlx5dvSymbols; + +ncclResult_t wrap_mlx5dv_symbols(void) { + pthread_once(&initOnceControl, + [](){ initResult = buildMlx5dvSymbols(&mlx5dvSymbols); }); + return initResult; +} + +/* CHECK_NOT_NULL: helper macro to check for NULL symbol */ +#define CHECK_NOT_NULL(container, internal_name) \ + if (container.internal_name == NULL) { \ + WARN("lib wrapper not initialized."); \ + return ncclInternalError; \ + } + +#define MLX5DV_PTR_CHECK_ERRNO(container, internal_name, call, retval, error_retval, name) \ + CHECK_NOT_NULL(container, internal_name); \ + retval = container.call; \ + if (retval == error_retval) { \ + WARN("Call to " name " failed with error %s", strerror(errno)); \ + return ncclSystemError; \ + } \ + return ncclSuccess; + +#define MLX5DV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \ + CHECK_NOT_NULL(container, internal_name); \ + int ret = container.call; \ + if (ret != success_retval) { \ + INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + return ncclSystemError; \ + } \ + return ncclSuccess; + +bool wrap_mlx5dv_is_supported(struct ibv_device *device) { + if (mlx5dvSymbols.mlx5dv_internal_is_supported == NULL) { + return 0; + } + return mlx5dvSymbols.mlx5dv_internal_is_supported(device); +} + +ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len) { + MLX5DV_INT_CHECK_RET_ERRNO(mlx5dvSymbols, mlx5dv_internal_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path(context, buf, buf_len), 0, "mlx5dv_get_data_direct_sysfs_path"); +} + +/* DMA-BUF support */ +ncclResult_t wrap_mlx5dv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) { + MLX5DV_PTR_CHECK_ERRNO(mlx5dvSymbols, mlx5dv_internal_reg_dmabuf_mr, mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access), *ret, NULL, "mlx5dv_reg_dmabuf_mr"); +} + +struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access, int mlx5_access) { + if (mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr == NULL) { + errno = EOPNOTSUPP; // ncclIbDmaBufSupport() requires this errno being set + return NULL; + } + return mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access); +} \ No newline at end of file diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 731dbcee1..278fb5c51 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -68,7 +68,8 @@ static ncclResult_t socketProgress(int op, struct ncclSocket* sock, void* ptr, i return ncclSuccess; } else { char line[SOCKET_NAME_MAXLEN+1]; - WARN("socketProgress: Connection closed by remote peer %s", ncclSocketToString(&sock->addr, line, 0)); + WARN("socketProgress: Connection closed by remote peer %s", + ncclSocketToString(&sock->addr, line, /*numericHostForm*/0)); return ncclRemoteError; } } @@ -86,17 +87,22 @@ static ncclResult_t socketWait(int op, struct ncclSocket* sock, void* ptr, int s * Output: "IPv4/IPv6 address" */ const char 
*ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm /*= 1*/) { - if (buf == NULL || addr == NULL) return NULL; - const struct sockaddr *saddr = &addr->sa; - if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) { buf[0]='\0'; return buf; } + const struct sockaddr *saddr; char host[NI_MAXHOST], service[NI_MAXSERV]; + int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0); + if (buf == NULL || addr == NULL) goto fail; + saddr = &addr->sa; + if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) goto fail; /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned. * (When not set, this will still happen in case the node's name cannot be determined.) */ - int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0); - (void) getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag); + if (getnameinfo(saddr, sizeof(union ncclSocketAddress), host, NI_MAXHOST, service, NI_MAXSERV, flag)) goto fail; sprintf(buf, "%s<%s>", host, service); return buf; +fail: + if (buf) + buf[0] = '\0'; + return buf; } static uint16_t socketToPort(union ncclSocketAddress *addr) { @@ -120,7 +126,8 @@ static int envSocketFamily(void) { return family; } -static int findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, int maxIfNameSize, int maxIfs) { +static ncclResult_t findInterfaces(const char* prefixList, char* names, union ncclSocketAddress *addrs, int sock_family, + int maxIfNameSize, int maxIfs, int* found) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; #endif @@ -131,10 +138,10 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA if (searchExact) prefixList++; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); - int found = 0; + *found = 0; struct ifaddrs *interfaces, *interface; - getifaddrs(&interfaces); - for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) { + SYSCHECK(getifaddrs(&interfaces), "getifaddrs"); + for (interface = interfaces; interface && *found < maxIfs; interface = interface->ifa_next) { if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ @@ -162,23 +169,23 @@ static int findInterfaces(const char* prefixList, char* names, union ncclSocketA // Check that this interface has not already been saved // getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link bool duplicate = false; - for (int i = 0; i < found; i++) { + for (int i = 0; i < *found; i++) { if (strcmp(interface->ifa_name, names+i*maxIfNameSize) == 0) { duplicate = true; break; } } if (!duplicate) { // Store the interface name - strncpy(names+found*maxIfNameSize, interface->ifa_name, maxIfNameSize); + strncpy(names + (*found)*maxIfNameSize, interface->ifa_name, maxIfNameSize); // Store the IP address int salen = (family == AF_INET) ? 
sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); - memset(addrs+found, '\0', sizeof(*addrs)); - memcpy(addrs+found, interface->ifa_addr, salen); - found++; + memset(addrs + *found, '\0', sizeof(*addrs)); + memcpy(addrs + *found, interface->ifa_addr, salen); + (*found)++; } } freeifaddrs(interfaces); - return found; + return ncclSuccess; } static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote) { @@ -219,20 +226,21 @@ static bool matchSubnet(struct ifaddrs local_if, union ncclSocketAddress* remote same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id); return same; } else { - WARN("Net : Unsupported address family type"); + INFO(NCCL_NET, "Net : Unsupported address family type"); return false; } } -int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs) { +ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr, + union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int* found) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; -#endif char line_a[SOCKET_NAME_MAXLEN+1]; - int found = 0; +#endif + *found = 0; struct ifaddrs *interfaces, *interface; - getifaddrs(&interfaces); - for (interface = interfaces; interface && !found; interface = interface->ifa_next) { + SYSCHECK(getifaddrs(&interfaces), "getifaddrs"); + for (interface = interfaces; interface && !*found; interface = interface->ifa_next) { if (interface->ifa_addr == NULL) continue; /* We only support IPv4 & IPv6 */ @@ -247,21 +255,18 @@ int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAd // Store the local IP address int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); - memcpy(localAddrs+found, interface->ifa_addr, salen); + memcpy(localAddr, interface->ifa_addr, salen); // Store the interface name - strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); + strncpy(ifName, interface->ifa_name, ifNameMaxSize); - TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, ncclSocketToString(localAddrs+found, line), ncclSocketToString(remoteAddr, line_a)); - found++; - if (found == maxIfs) break; + TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", + interface->ifa_name, ncclSocketToString(localAddr, line), ncclSocketToString(remoteAddr, line_a)); + *found = 1; } - if (found == 0) { - WARN("Net : No interface found in the same subnet as remote address %s", ncclSocketToString(remoteAddr, line_a)); - } freeifaddrs(interfaces); - return found; + return ncclSuccess; } ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair) { @@ -344,40 +349,41 @@ ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char return ncclSuccess; } -int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs) { +ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs, + int* nIfs) { static int shownIfName = 0; - int nIfs = 0; // Allow user to force the INET socket family selection int sock_family = envSocketFamily(); // User specified interface const char* env = ncclGetEnv("NCCL_SOCKET_IFNAME"); + *nIfs = 0; if (env && strlen(env) > 1) { INFO(NCCL_ENV, "NCCL_SOCKET_IFNAME set by environment to %s", env); // Specified by user : find or fail if 
(shownIfName++ == 0) INFO(NCCL_NET, "NCCL_SOCKET_IFNAME set to %s", env); - nIfs = findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + NCCLCHECK(findInterfaces(env, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); } else { // Try to automatically pick the right one // Start with IB - nIfs = findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + NCCLCHECK(findInterfaces("ib", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); // else see if we can get some hint from COMM ID - if (nIfs == 0) { + if (*nIfs == 0) { const char* commId = ncclGetEnv("NCCL_COMM_ID"); if (commId && strlen(commId) > 1) { INFO(NCCL_ENV, "NCCL_COMM_ID set by environment to %s", commId); // Try to find interface that is in the same subnet as the IP in comm id union ncclSocketAddress idAddr; - ncclSocketGetAddrFromString(&idAddr, commId); - nIfs = ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, maxIfs); + NCCLCHECK(ncclSocketGetAddrFromString(&idAddr, commId)); + NCCLCHECK(ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, nIfs)); } } // Then look for anything else (but not docker or lo) - if (nIfs == 0) nIfs = findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + if (*nIfs == 0) NCCLCHECK(findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); // Finally look for docker, then lo. - if (nIfs == 0) nIfs = findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); - if (nIfs == 0) nIfs = findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs); + if (*nIfs == 0) NCCLCHECK(findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); + if (*nIfs == 0) NCCLCHECK(findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); } - return nIfs; + return ncclSuccess; } ncclResult_t ncclSocketListen(struct ncclSocket* sock) { @@ -439,17 +445,20 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { /* per accept's man page, for linux sockets, the following errors might be already pending errors * and should be considered as EAGAIN. 
To avoid infinite loop in case of errors, we use the retry count*/ if (++sock->errorRetries == ncclParamRetryCnt()) { - WARN("socketTryAccept: exceeded error retry count (%d), %s", sock->errorRetries, strerror(errno)); + WARN("socketTryAccept: exceeded error retry count after %d attempts, %s", sock->errorRetries, strerror(errno)); return ncclSystemError; } - INFO(NCCL_ALL, "Call to accept returned %s, retrying", strerror(errno)); - } else if (errno != EAGAIN && errno != EWOULDBLOCK) { + INFO(NCCL_NET|NCCL_INIT, "Call to accept returned %s, retrying", strerror(errno)); + } else if (errno != EINTR && errno != EAGAIN && errno != EWOULDBLOCK) { WARN("socketTryAccept: Accept failed: %s", strerror(errno)); return ncclSystemError; } return ncclSuccess; } +NCCL_PARAM(SocketMaxRecvBuff, "SOCKET_RCVBUF", -1); +NCCL_PARAM(SocketMaxSendBuff, "SOCKET_SNDBUF", -1); + static ncclResult_t socketSetFlags(struct ncclSocket* sock) { const int one = 1; /* Set socket as non-blocking if async or if we need to be able to abort */ @@ -458,34 +467,55 @@ static ncclResult_t socketSetFlags(struct ncclSocket* sock) { SYSCHECK(flags = fcntl(sock->fd, F_GETFL), "fcntl"); SYSCHECK(fcntl(sock->fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); } - SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt"); + SYSCHECK(setsockopt(sock->fd, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)), "setsockopt TCP NODELAY"); + // setsockopt should not fail even if the sizes are too large, do not change the default if unset by the user (=-1) + int rcvBuf = ncclParamSocketMaxRecvBuff(), sndBuf = ncclParamSocketMaxSendBuff(); + if (sndBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, (char*)&sndBuf, sizeof(int)), "setsockopt SO_SNDBUF"); + if (rcvBuf > 0) SYSCHECK(setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, (char*)&rcvBuf, sizeof(int)), "setsockopt SO_RCVBUF"); return ncclSuccess; } +static void socketResetAccept(struct ncclSocket* sock) { + char line[SOCKET_NAME_MAXLEN+1]; + INFO(NCCL_NET|NCCL_INIT, "socketFinalizeAccept: didn't receive a valid magic from %s", + ncclSocketToString(&sock->addr, line)); + // Ignore spurious connection and accept again + (void)close(sock->fd); + sock->fd = -1; + sock->state = ncclSocketStateAccepting; + sock->finalizeCounter = 0; +} + static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { uint64_t magic; enum ncclSocketType type; int received; + char line[SOCKET_NAME_MAXLEN+1]; // once accepted, linux sockets do NOT inherit file status flags such as O_NONBLOCK (BSD ones do) NCCLCHECK(socketSetFlags(sock)); if (sock->asyncFlag == 0 || sock->finalizeCounter < sizeof(magic)) { if (sock->asyncFlag == 0) { received = 0; - NCCLCHECK(socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received)); + if (socketWait(NCCL_SOCKET_RECV, sock, &magic, sizeof(magic), &received) != ncclSuccess) { + socketResetAccept(sock); + return ncclSuccess; + } } else { + int closed = 0; received = sock->finalizeCounter; - NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received)); + NCCLCHECK(socketProgress(NCCL_SOCKET_RECV, sock, sock->finalizeBuffer, sizeof(magic), &received, &closed)); sock->finalizeCounter = received; - if (received < sizeof(magic)) return ncclSuccess; + if (received < sizeof(magic)) { + if (closed) { + socketResetAccept(sock); + } + return ncclSuccess; + } memcpy(&magic, sock->finalizeBuffer, sizeof(magic)); } if (magic != sock->magic) { - WARN("socketFinalizeAccept: wrong magic %lx != %lx", magic, 
sock->magic); - close(sock->fd); - sock->fd = -1; - // Ignore spurious connection and accept again - sock->state = ncclSocketStateAccepting; + socketResetAccept(sock); return ncclSuccess; } } @@ -500,7 +530,7 @@ static ncclResult_t socketFinalizeAccept(struct ncclSocket* sock) { memcpy(&type, sock->finalizeBuffer, sizeof(type)); } if (type != sock->type) { - WARN("socketFinalizeAccept: wrong type %d != %d", type, sock->type); + WARN("socketFinalizeAccept from %s: wrong type %d != %d", ncclSocketToString(&sock->addr, line), type, sock->type); sock->state = ncclSocketStateError; close(sock->fd); sock->fd = -1; @@ -532,32 +562,38 @@ static ncclResult_t socketResetFd(struct ncclSocket* sock) { } goto exit; } + static ncclResult_t socketConnectCheck(struct ncclSocket* sock, int errCode, const char funcName[]) { + char line[SOCKET_NAME_MAXLEN+1]; if (errCode == 0) { sock->state = ncclSocketStateConnected; } else if (errCode == EINPROGRESS) { sock->state = ncclSocketStateConnectPolling; - } else if (errCode == ETIMEDOUT || errCode == EHOSTUNREACH || errCode == ECONNREFUSED) { + } else if (errCode == EINTR || errCode == EWOULDBLOCK || errCode == EAGAIN || errCode == ETIMEDOUT || + errCode == EHOSTUNREACH || errCode == ECONNREFUSED) { if (sock->customRetry == 0) { if (sock->errorRetries++ == ncclParamRetryCnt()) { sock->state = ncclSocketStateError; - WARN("%s: connect returned %s, exceeded error retry count (%d)", funcName, strerror(errCode), sock->errorRetries); + WARN("%s: connect to %s returned %s, exceeded error retry count after %d attempts", + funcName, ncclSocketToString(&sock->addr, line), strerror(errCode), sock->errorRetries); return ncclRemoteError; } unsigned int sleepTime = sock->errorRetries * ncclParamRetryTimeOut(); - INFO(NCCL_ALL, "%s: connect returned %s, retrying (%d/%ld) after sleep for %u msec", funcName, strerror(errCode), sock->errorRetries, ncclParamRetryCnt(), sleepTime); + INFO(NCCL_NET|NCCL_INIT, "%s: connect to %s returned %s, retrying (%d/%ld) after sleep for %u msec", + funcName, ncclSocketToString(&sock->addr, line), strerror(errCode), + sock->errorRetries, ncclParamRetryCnt(), sleepTime); msleep(sleepTime); } NCCLCHECK(socketResetFd(sock)); /* in case of failure in connect, socket state is unspecified */ sock->state = ncclSocketStateConnecting; } else { - char line[SOCKET_NAME_MAXLEN+1]; sock->state = ncclSocketStateError; - WARN("%s: Connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode)); + WARN("%s: connect to %s failed : %s", funcName, ncclSocketToString(&sock->addr, line), strerror(errCode)); return ncclSystemError; } return ncclSuccess; } + static ncclResult_t socketStartConnect(struct ncclSocket* sock) { /* blocking/non-blocking connect() is determined by asyncFlag. */ int ret = connect(sock->fd, &sock->addr.sa, sock->salen); @@ -568,6 +604,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) { struct pollfd pfd; int timeout = 1, ret; socklen_t rlen = sizeof(int); + char line[SOCKET_NAME_MAXLEN+1]; memset(&pfd, 0, sizeof(struct pollfd)); pfd.fd = sock->fd; @@ -577,10 +614,7 @@ static ncclResult_t socketPollConnect(struct ncclSocket* sock) { if (ret == 0 || (ret < 0 && errno == EINTR)) { return ncclSuccess; } else if (ret < 0) { - WARN("socketPollConnect poll() failed with error %s", strerror(errno)); - return ncclRemoteError; - } else if (ret != 1 || (pfd.revents & POLLOUT) == 0) { - WARN("socketPollConnect poll() returned %d%s", ret, (pfd.revents & POLLOUT) ? 
"" : ", no POLLOUT events"); + WARN("socketPollConnect to %s failed with error %s", ncclSocketToString(&sock->addr, line), strerror(errno)); return ncclSystemError; } @@ -899,7 +933,7 @@ ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how) { if (sock != NULL) { if (sock->fd >= 0) { - shutdown(sock->fd, how); + SYSCHECK(shutdown(sock->fd, how), "shutdown"); } sock->state = ncclSocketStateTerminating; } @@ -921,8 +955,8 @@ ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait) { * by refcount of fd, but close() is. close() won't close a fd and send FIN packet if * the fd is duplicated (e.g. fork()). So shutdown() guarantees the correct and graceful * connection close here. */ - shutdown(sock->fd, SHUT_RDWR); - close(sock->fd); + (void)shutdown(sock->fd, SHUT_RDWR); + (void)close(sock->fd); } sock->state = ncclSocketStateClosed; sock->fd = -1; diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 7d957d432..0adb4b137 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -9,6 +9,12 @@ #include "checks.h" #include "param.h" +#if CUDART_VERSION >= 13000 +#define cudaStreamGetCaptureInfo_v3 cudaStreamGetCaptureInfo +#define cudaGraphAddDependencies_v2 cudaGraphAddDependencies +#define cudaStreamUpdateCaptureDependencies_v2 cudaStreamUpdateCaptureDependencies +#endif + // Tracks the captured work a given graph captured identified by its graph id. struct ncclStrongStreamCapture { struct ncclStrongStreamCapture* next; @@ -89,7 +95,11 @@ ncclResult_t ncclCudaGetCapturingGraph( } else { #if CUDART_VERSION >= 11030 cudaStreamCaptureStatus status; + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaStreamGetCaptureInfo_v3(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr, nullptr)); + #else CUDACHECK(cudaStreamGetCaptureInfo_v2(stream, &status, &graph->graphId, &graph->graph, nullptr, nullptr)); + #endif if (status != cudaStreamCaptureStatusActive) { graph->origin = nullptr; graph->graph = nullptr; @@ -224,7 +234,11 @@ ncclResult_t ncclStrongStreamAcquire( CUDACHECK(cudaEventRecord(scratch, graph.origin)); CUDACHECK(cudaStreamWaitEvent(cap->captureStream, scratch, 0)); CUDACHECK(cudaEventDestroy(scratch)); + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(cap->captureStream, nullptr, nullptr, 0, cudaStreamSetCaptureDependencies)); + #else CUDACHECK(cudaStreamUpdateCaptureDependencies(cap->captureStream, nullptr, 0, cudaStreamSetCaptureDependencies)); + #endif if (mixing && firstCapture) { CUDACHECK(cudaEventRecord(ss->serialEvent, ss->liveStream)); @@ -284,7 +298,11 @@ ncclResult_t ncclStrongStreamRelease( // Make this record order after previous record on this stream. 
if (cap->lastRecord != nullptr) { + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &cap->lastRecord, &recordNode, nullptr, 1)); + #else CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1)); + #endif } cap->lastRecord = recordNode; @@ -292,7 +310,11 @@ ncclResult_t ncclStrongStreamRelease( cudaStreamCaptureStatus status; cudaGraphNode_t const* nodes; size_t count = 0; + #if CUDART_VERSION >= 13000 + cudaError_t res = cudaStreamGetCaptureInfo_v3(cap->captureStream, &status, nullptr, nullptr, &nodes, nullptr, &count); + #else cudaError_t res = cudaStreamGetCaptureInfo_v2(cap->captureStream, &status, nullptr, nullptr, &nodes, &count); + #endif #if CUDART_VERSION >= 12030 if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. @@ -308,7 +330,11 @@ ncclResult_t ncclStrongStreamRelease( else { CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); for (int i=0; i < (int)count; i++) { + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &nodes[i], &recordNode, nullptr, 1)); + #else CUDACHECK(cudaGraphAddDependencies(graph.graph, &nodes[i], &recordNode, 1)); + #endif } } @@ -339,7 +365,11 @@ ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cu cudaStreamCaptureStatus status; cudaGraphNode_t const* nodes; size_t count = 0; + #if CUDART_VERSION >= 13000 + cudaError_t res = cudaStreamGetCaptureInfo_v3(tmp, &status, nullptr, nullptr, &nodes, nullptr, &count); + #else cudaError_t res = cudaStreamGetCaptureInfo_v2(tmp, &status, nullptr, nullptr, &nodes, &count); + #endif #if CUDART_VERSION >= 12030 if (res == cudaErrorLossyQuery) { // CUDA is telling us the dependencies have edge annotations. @@ -352,7 +382,11 @@ ncclResult_t ncclStreamAdvanceToEvent(struct ncclCudaGraph g, cudaStream_t s, cu #endif else { CUDACHECK(res /* = cudaStreamGetCaptureInfo_v2(...)*/); + #if CUDART_VERSION >= 13000 + CUDACHECK(cudaStreamUpdateCaptureDependencies_v2(s, (cudaGraphNode_t*)nodes, nullptr, count, cudaStreamSetCaptureDependencies)); + #else CUDACHECK(cudaStreamUpdateCaptureDependencies(s, (cudaGraphNode_t*)nodes, count, cudaStreamSetCaptureDependencies)); + #endif } CUDACHECK(cudaStreamDestroy(tmp)); diff --git a/src/mnnvl.cc b/src/mnnvl.cc index 07e8b21d9..34a18b80a 100644 --- a/src/mnnvl.cc +++ b/src/mnnvl.cc @@ -58,7 +58,12 @@ ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { // Allocate FABRIC handle compatible memory ncclResult_t ret = ncclCuMemAlloc(&ptr, &handle, CU_MEM_HANDLE_TYPE_FABRIC, CUDA_IPC_MIN); - if (ret != ncclSuccess) return ncclSuccess; + if (ret != ncclSuccess) { + // Return an error if this is a MNNVL capable system but FABRIC handles are not supported + WARN("MNNVL (cliqueSize %d) is available but not working on this system. Check the IMEX channel configuration (/dev/nvidia-caps-imex-channels). Set NCCL_MNNVL_ENABLE=0 to ignore this issue.", + comm->clique.size); + return ncclSystemError; + } err = CUPFN(cuMemExportToShareableHandle(&cuDesc, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0)); if (err != CUDA_SUCCESS || (err = CUPFN(cuMemImportFromShareableHandle(&handle, &cuDesc, CU_MEM_HANDLE_TYPE_FABRIC))) != CUDA_SUCCESS) { @@ -66,7 +71,7 @@ ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { (void) pfn_cuGetErrorString(err, &errStr); NCCLCHECK(ncclCuMemFree(ptr)); // Return an error if this is a MNNVL capable system but it's not working - WARN("MNNVL (cliqueSize %d) is available but not supported on this system. 
Check the IMEX configuration.", + WARN("MNNVL (cliqueSize %d) is available but not working on this system. Check the IMEX configuration (nvidia-imex-ctl -N). Set NCCL_MNNVL_ENABLE=0 to ignore this issue.", comm->clique.size); return ncclSystemError; } diff --git a/src/nccl.h.in b/src/nccl.h.in index f3ab5344f..292a83914 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -31,6 +31,7 @@ extern "C" { #include /* Opaque handle to communicator */ typedef struct ncclComm* ncclComm_t; +typedef struct ncclWindow* ncclWindow_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 @@ -52,9 +53,21 @@ typedef enum { ncclSuccess = 0, #define NCCL_SPLIT_NOCOLOR -1 #define NCCL_UNDEF_FLOAT -1.0f +/* Window Registration flags */ +#define NCCL_WIN_DEFAULT 0x00 +#define NCCL_WIN_COLL_SYMMETRIC 0x01 + +/* NCCL performance policy */ +#define NCCL_CTA_POLICY_DEFAULT 0x00 +#define NCCL_CTA_POLICY_EFFICIENCY 0x01 + +/* ncclCommShrink flags*/ +#define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */ +#define NCCL_SHRINK_ABORT 0x01 /* First, terminate ongoing parent operations, and then shrink the parent communicator */ + /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ -typedef struct ncclConfig_v21700 { +typedef struct ncclConfig_v22700 { /* attributes that users should never touch. */ size_t size; unsigned int magic; @@ -67,6 +80,11 @@ typedef struct ncclConfig_v21700 { const char *netName; int splitShare; int trafficClass; + const char *commName; + int collnetEnable; + int CTAPolicy; + int shrinkShare; + int nvlsCTAs; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. @@ -82,6 +100,11 @@ typedef struct ncclConfig_v21700 { NCCL_CONFIG_UNDEF_PTR, /* netName */ \ NCCL_CONFIG_UNDEF_INT, /* splitShare */ \ NCCL_CONFIG_UNDEF_INT, /* trafficClass */ \ + NCCL_CONFIG_UNDEF_PTR, /* commName */ \ + NCCL_CONFIG_UNDEF_INT, /* collnetEnable */ \ + NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \ + NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \ + NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */ @@ -173,6 +196,14 @@ ncclResult_t pncclCommAbort(ncclComm_t comm); ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config); +/* Shrink existing communicator. + * Ranks in excludeRanksList will be removed form the existing communicator. + * Within the new communicator, ranks will be re-ordered to fill the gap of removed ones. + * If config is NULL, the new communicator will inherit the original communicator's configuration + * The flag enables NCCL to adapt to various states of the parent communicator, see NCCL_SHRINK flags.*/ +ncclResult_t ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags); +ncclResult_t pncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags); + /* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig. * Allows to use more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation. * The number of ncclUniqueIds and their order must be the same for every rank. 
@@ -216,6 +247,14 @@ ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, v ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle); ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle); +/* Register memory window */ +ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); +ncclResult_t pncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); + +/* Deregister symmetric memory */ +ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win); +ncclResult_t pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win); + /* Reduction operation selector */ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; typedef enum { ncclSum = 0, diff --git a/src/plugin/net.cc b/src/plugin/net.cc index 9257d7786..78944106a 100644 --- a/src/plugin/net.cc +++ b/src/plugin/net.cc @@ -8,6 +8,7 @@ #include "bootstrap.h" #include "checks.h" #include "plugin.h" +#include "nccl_net.h" #include #include @@ -15,137 +16,100 @@ //#include //#include -extern ncclNet_t* getNcclNet_v6(void* netPluginLib); -extern ncclNet_t* getNcclNet_v7(void* netPluginLib); -extern ncclNet_t* getNcclNet_v8(void* netPluginLib); -extern ncclNet_t* getNcclNet_v9(void* netPluginLib); -extern ncclNet_t* getNcclNet_v10(void* netPluginLib); - -extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib); -extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib); -extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib); -extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib); -extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib); - -static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket }; -static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 }; -ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr }; -enum ncclNetState { - ncclNetStateInit = 0, - ncclNetStateEnabled = 1, - ncclNetStateDisabled = 2 -}; -enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; -enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit }; +typedef ncclNet_t* getNcclNet_t(void* netPluginLib); +typedef ncclCollNet_t* getNcclCollNet_t(void* netPluginLib); + +extern getNcclNet_t getNcclNet_v6; +extern getNcclNet_t getNcclNet_v7; +extern getNcclNet_t getNcclNet_v8; +extern getNcclNet_t getNcclNet_v9; +extern getNcclNet_t getNcclNet_v10; +extern getNcclCollNet_t getNcclCollNet_v6; +extern getNcclCollNet_t getNcclCollNet_v7; +extern getNcclCollNet_t getNcclCollNet_v8; +extern getNcclCollNet_t getNcclCollNet_v9; +extern getNcclCollNet_t getNcclCollNet_v10; NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1); +#define NCCL_NET_VERSION_COUNT 5 +int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {10, 9, 8, 7, 6}; +getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = {getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6}; +getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6}; + +#define NCCL_NET_NUM_INTERNAL_PLUGINS 2 + +typedef enum ncclNetPluginState { + ncclNetPluginStateDisabled = -2, // Plugin library failed to initialize + ncclNetPluginStateLoadFailed = -1, // Plugin library failed to load + ncclNetPluginStateLoadReady = 0, // 
Plugin library is ready to be loaded + ncclNetPluginStateInitReady = 1, // Plugin library is loaded and ready to be initialized + ncclNetPluginStateEnabled = 2, // Plugin library is loaded and initialized +} ncclNetPluginState_t; + +#define MAX_STR_LEN 255 +typedef struct netPluginLib { + char name[MAX_STR_LEN]; // Name of the plugin library + void* dlHandle; // Handle to the plugin library + ncclNet_t* ncclNet; // Pointer to the ncclNet_t structure + int ncclNetVer; // Version of the nccl net plugin + ncclCollNet_t* ncclCollNet; // Pointer to the ncclCollNet_t structure + ncclNetPluginState_t ncclNetPluginState; // State of the nccl net plugin + ncclNetPluginState_t ncclCollNetPluginState; // State of the nccl coll net plugin + int ncclNetPluginRefCount; // Reference count for the nccl net plugin +} netPluginLib_t; + +int pluginCount = 0; +bool netPluginLibsInitialized = false; +netPluginLib_t netPluginLibs[NCCL_NET_MAX_PLUGINS] = { 0 }; static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; -static void* netPluginLib; - -static int netPluginRefCount; -static void initNetPluginRefCountOnce(void) { netPluginRefCount = ncclParamNetPluginRefCount();} +static pthread_once_t initPluginLibsOnceControl = PTHREAD_ONCE_INIT; -enum { - netPluginLoadFailed = -1, - netPluginLoadReady = 0, - netPluginLoadSuccess = 1, -}; - -static int netPluginStatus = netPluginLoadReady; +static ncclResult_t ncclNetPluginUnload(netPluginLib_t* pluginLib) { + if ((pluginLib->dlHandle) && ((pluginLib->ncclNetPluginRefCount) == 0)) { + INFO(NCCL_INIT|NCCL_NET, "Unloading plugin %s", pluginLib->name); + NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle)); + memset(pluginLib, 0, sizeof(netPluginLib_t)); + } + return ncclSuccess; +} -ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) { - static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT; - pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce); +static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) { + pluginLib->dlHandle = ncclOpenNetPluginLib(pluginLib->name); - pthread_mutex_lock(&netPluginLock); - if (netPluginLoadFailed == netPluginStatus) { - goto exit; - } - if (netPluginLoadSuccess == netPluginStatus) { - ++netPluginRefCount; - goto exit; + if (pluginLib->dlHandle == nullptr) goto fail; + // load ncclNet + for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) { + pluginLib->ncclNetVer = ncclNetVersion[i]; + pluginLib->ncclNet = getNcclNet[i](pluginLib->dlHandle); + if (pluginLib->ncclNet) break; } - netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN")); - if (netPluginLib == nullptr) { - goto fail; - } + // if we fail to find a net, exit + if (pluginLib->ncclNet == nullptr) goto fail; - ncclNets[0] = getNcclNet_v10(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 10; - if (ncclNets[0] == nullptr) { - // Try v9 plugin - ncclNets[0] = getNcclNet_v9(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 9; - } - if (ncclNets[0] == nullptr) { - // Try v8 plugin - ncclNets[0] = getNcclNet_v8(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 8; - } - if (ncclNets[0] == nullptr) { - // Try v7 plugin - ncclNets[0] = getNcclNet_v7(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 7; - } - if (ncclNets[0] == nullptr) { - // Try v6 plugin - ncclNets[0] = getNcclNet_v6(netPluginLib); - if (ncclNets[0]) ncclNetsVer[0] = 6; - } - if (ncclNets[0] == nullptr) { - goto fail; - } + pluginLib->ncclNetPluginState = ncclNetPluginStateInitReady; - // Check for CollNet - ncclCollNets[0] = getNcclCollNet_v10(netPluginLib); - if 
(ncclCollNets[0] == nullptr) { - ncclCollNets[0] = getNcclCollNet_v9(netPluginLib); - } - if (ncclCollNets[0] == nullptr) { - ncclCollNets[0] = getNcclCollNet_v8(netPluginLib); - } - if (ncclCollNets[0] == nullptr) { - ncclCollNets[0] = getNcclCollNet_v7(netPluginLib); - } - if (ncclCollNets[0] == nullptr) { - ncclCollNets[0] = getNcclCollNet_v6(netPluginLib); + // load ncclColNet + for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) { + pluginLib->ncclCollNet = getNcclCollNet[i](pluginLib->dlHandle); + if (pluginLib->ncclCollNet) break; } - ++netPluginRefCount; - netPluginStatus = netPluginLoadSuccess; - comm->netPluginLoaded = 1; + if (pluginLib->ncclCollNet == nullptr) + pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed; + else + pluginLib->ncclCollNetPluginState = ncclNetPluginStateInitReady; + INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external plugin %s", pluginLib->name); exit: - pthread_mutex_unlock(&netPluginLock); return ncclSuccess; fail: - if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib)); - netPluginStatus = netPluginLoadFailed; - goto exit; -} - -ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&netPluginLock); - if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) { - if (ncclNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name); - } - if (ncclCollNets[0]) { - INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name); - } - NCCLCHECK(ncclClosePluginLib(netPluginLib)); - netPluginLib = nullptr; - ncclNets[0] = nullptr; - ncclCollNets[0] = nullptr; - netPluginStatus = netPluginLoadReady; - comm->netPluginLoaded = 0; - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i) - ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit; + if (pluginLib->dlHandle) { + NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle)); } - pthread_mutex_unlock(&netPluginLock); - return ncclSuccess; + pluginLib->ncclNetPluginState = ncclNetPluginStateLoadFailed; + pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed; + goto exit; } ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) { @@ -172,72 +136,156 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in return ncclSuccess; } -static ncclResult_t netGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled; - else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled; - else ncclNetStates[i] = ncclNetStateEnabled; +static ncclResult_t ncclNetPluginInit(netPluginLib_t* pluginLib) { + int ndev; + if (pluginLib->ncclNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclNet) { + if (pluginLib->ncclNet->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) goto fail; + if (pluginLib->ncclNet->devices(&ndev) != ncclSuccess || ndev <= 0) goto fail; + } + pluginLib->ncclNetPluginState = ncclNetPluginStateEnabled; + INFO(NCCL_INIT|NCCL_NET, "Initialized NET plugin %s", pluginLib->ncclNet->name); + + if (pluginLib->ncclCollNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclCollNet) { + if (pluginLib->ncclCollNet->init(ncclDebugLog) != ncclSuccess) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; + else if (pluginLib->ncclCollNet->devices(&ndev) != ncclSuccess || ndev <= 0) 
pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; + else { + pluginLib->ncclCollNetPluginState = ncclNetPluginStateEnabled; + } } - *state = ncclNetStates[i]; - pthread_mutex_unlock(&netLock); +exit: return ncclSuccess; +fail: + pluginLib->ncclNetPluginState = ncclNetPluginStateDisabled; + pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; + goto exit; } -static ncclResult_t collNetGetState(int i, enum ncclNetState* state) { - pthread_mutex_lock(&netLock); - if (ncclCollNetStates[i] == ncclNetStateInit) { - int ndev; - if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled; - else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled; - else ncclCollNetStates[i] = ncclNetStateEnabled; +static ncclResult_t ncclNetPluginAssignToComm(struct ncclComm* comm, int pluginIndex, bool* isAssigned) { + const char* netName = comm->config.netName; + if (netName && strcasecmp(netName, netPluginLibs[pluginIndex].ncclNet->name) != 0) goto fail; + if (ncclSuccess != ncclNetCheckDeviceVersion(comm, netPluginLibs[pluginIndex].ncclNet, 0)) goto fail; + + if (netPluginLibs[pluginIndex].ncclNetPluginState >= ncclNetPluginStateEnabled) { + comm->ncclNet = netPluginLibs[pluginIndex].ncclNet; + comm->ncclNetVer = netPluginLibs[pluginIndex].ncclNetVer; + comm->netPluginIndex = pluginIndex; + netPluginLibs[pluginIndex].ncclNetPluginRefCount++; + *isAssigned = true; + INFO(NCCL_INIT|NCCL_NET, "Assigned NET plugin %s to comm", netPluginLibs[pluginIndex].ncclNet->name); + if (netPluginLibs[pluginIndex].ncclCollNetPluginState >= ncclNetPluginStateEnabled) { + comm->ncclCollNet = netPluginLibs[pluginIndex].ncclCollNet; + } } - *state = ncclCollNetStates[i]; - pthread_mutex_unlock(&netLock); +exit: return ncclSuccess; +fail: + *isAssigned = false; + netPluginLibs[pluginIndex].ncclNetPluginState = ncclNetPluginStateEnabled; + netPluginLibs[pluginIndex].ncclCollNetPluginState = ncclNetPluginStateEnabled; + goto exit; } -ncclResult_t ncclNetInit(struct ncclComm* comm) { - // Initialize main communication network - const char* netName; - bool ok = false; - - netName = comm->config.netName; - for (int i=0; i<3; i++) { - if (ncclNets[i] == nullptr) continue; - enum ncclNetState state; - NCCLCHECK(netGetState(i, &state)); - if (state != ncclNetStateEnabled) continue; - if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue; - if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) { - // Mismatched device plugin version - continue; +static ncclResult_t ncclNetPluginDisableOtherExternal(int pluginIndex) { + // Only if an external plugin is enabled, disable other external plugins + if (pluginIndex >= (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) return ncclSuccess; + char names[MAX_STR_LEN*(NCCL_NET_MAX_PLUGINS - NCCL_NET_NUM_INTERNAL_PLUGINS)] = { 0 }; + for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) { + if (i != pluginIndex) { + // Append all disabled plugin names to a string + snprintf(names+strlen(names), sizeof(names)-strlen(names), (strlen(names) == 0) ? 
"%s" : ", %s", netPluginLibs[i].name); + netPluginLibs[i].ncclNetPluginState = ncclNetPluginStateDisabled; } + } + if(strlen(names) > 0) { + INFO(NCCL_INIT|NCCL_NET, "Disabling external plugins: %s", names); + } + return ncclSuccess; +} - comm->ncclNet = ncclNets[i]; - comm->ncclNetVer = ncclNetsVer[i]; - ok = true; - - if (ncclCollNets[i]) { - NCCLCHECK(collNetGetState(i, &state)); - if (state == ncclNetStateEnabled) { - comm->ncclCollNet = ncclCollNets[i]; +static void initPluginLibsOnceFunc() { + char* netPluginName = nullptr; + const char* defaultNetPlugin = "libnccl-net.so"; + const char* envNetPlugin = nullptr; + char* envNetPluginList = nullptr; + char* savePtr = nullptr; + int pluginCounter = 0; + + memset(netPluginLibs, 0, NCCL_NET_MAX_PLUGINS * sizeof(netPluginLib_t)); + envNetPlugin = ncclGetEnv("NCCL_NET_PLUGIN"); + if (envNetPlugin) { + envNetPluginList = strdup(envNetPlugin); + // Iterate over list until the list is empty + netPluginName = strtok_r(envNetPluginList, ",", &savePtr); + while(netPluginName) { + // We have 2 internal plugins (ib and socket) + // So, we can have at most( NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS)) in the NCCL_NET_PLUGIN list + if (pluginCounter >= (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS))) { + INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains more than %d plugins, ignoring the rest", (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS + 1))); + break; + } + // need to leave space for the name + "\n" + if((strlen(netPluginName)+1) <= MAX_STR_LEN) { + netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady; + netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount(); + strcpy(netPluginLibs[pluginCounter].name, netPluginName); + pluginCounter++; + } else { + INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains a plugin name %s longer than %d characters, ignoring it.", netPluginName, MAX_STR_LEN); } + netPluginName = strtok_r(nullptr, ",", &savePtr); } - break; + if (envNetPluginList) free(envNetPluginList); + } else { + // Add default net plugin + netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady; + netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount(); + strcpy(netPluginLibs[pluginCounter++].name, defaultNetPlugin); } - if (!ok) { - WARN("Error: network %s not found.", netName ? 
netName : ""); - return ncclInvalidUsage; + // Add 2 internal ib and socket plugins + netPluginLibs[pluginCounter].ncclNet = &ncclNetIb; + netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady; + netPluginLibs[pluginCounter].ncclNet = &ncclNetSocket; + netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady; + pluginCount = pluginCounter; +} + +ncclResult_t ncclNetInit(struct ncclComm* comm) { + bool ncclNetPluginInitialized = false; + pthread_once(&initPluginLibsOnceControl, initPluginLibsOnceFunc); + pthread_mutex_lock(&netPluginLock); + for (int pluginIndex = 0; pluginIndex < pluginCount; pluginIndex++) { + if ((pluginIndex < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) && (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateLoadReady)) { + NCCLCHECK(ncclNetPluginLoad(&netPluginLibs[pluginIndex])); + } + if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateInitReady) { + NCCLCHECK(ncclNetPluginInit(&netPluginLibs[pluginIndex])); + } + if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateEnabled) { + bool isAssigned = false; + NCCLCHECK(ncclNetPluginAssignToComm(comm, pluginIndex, &isAssigned)); + if (isAssigned) { + // If one external plugin is assigned to a comm, then disable all other external plugins + ncclNetPluginDisableOtherExternal(pluginIndex); + ncclNetPluginInitialized = true; + break; + } + } } - return ncclSuccess; + pthread_mutex_unlock(&netPluginLock); + if (ncclNetPluginInitialized) return ncclSuccess; + WARN("Failed to initialize any NET plugin"); + return ncclInvalidUsage; } ncclResult_t ncclNetFinalize(struct ncclComm* comm) { - comm->ncclNet = nullptr; - comm->ncclCollNet = nullptr; + int pluginIndex = comm->netPluginIndex; + pthread_mutex_lock(&netPluginLock); + netPluginLibs[pluginIndex].ncclNetPluginRefCount--; + for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) { + NCCLCHECK(ncclNetPluginUnload(&netPluginLibs[i])); + } + pthread_mutex_unlock(&netPluginLock); return ncclSuccess; } diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc index a43df28d3..a9c1d0dc0 100644 --- a/src/plugin/plugin_open.cc +++ b/src/plugin/plugin_open.cc @@ -23,7 +23,7 @@ enum ncclPluginType { static void *libHandles[NUM_LIBS]; static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" }; static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" }; -static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" }; +static const char *pluginFallback[NUM_LIBS] = { "", "Using internal tuner plugin.", "" }; static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT }; static void* tryOpenLib(char* name, int* err, char* errStr) { @@ -49,10 +49,9 @@ static void* tryOpenLib(char* name, int* err, char* errStr) { return handle; } -static void appendNameToList(char* nameList, int *nameListLen, char* name) { - snprintf(nameList, *nameListLen, " %s", name); - nameList += strlen(name) + 1; - *nameListLen -= strlen(name) + 1; +static void appendNameToList(char* nameList, int *leftChars, char* name) { + snprintf(nameList + PATH_MAX - *leftChars, *leftChars, " %s", name); + *leftChars -= strlen(name) + 1; } static void* openPluginLib(enum ncclPluginType type, const char* libName) { @@ -62,28 +61,31 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { char eNoEntNameList[PATH_MAX] = { 0 }; if (libName && 
strlen(libName)) { - snprintf(libName_, MAX_STR_LEN, "%s", libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); + // match names that start with 'lib' and end with '.so' + if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) { + snprintf(libName_, MAX_STR_LEN, "%s", libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); - } - - snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); - } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } } } else { snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]); @@ -123,12 +125,17 @@ void* ncclGetNetPluginLib(void) { } ncclResult_t ncclClosePluginLib(void* handle) { + bool found = false; for (int l=0; linit(&comm->profilerContext, &ncclProfilerEventMask); + int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask, comm->config.commName, comm->commHash, comm->nNodes, comm->nRanks, comm->rank, ncclDebugLog); if (err) { WARN("Profiler init failed with error (%d). 
Continue without profiler.", err); ncclProfiler = NULL; @@ -239,8 +243,6 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.type = ncclProfileColl; eDescr.parentObj = plan->groupEventHandle; eDescr.rank = plan->comm->rank; - eDescr.coll.name = plan->comm->commName; - eDescr.coll.commHash = plan->comm->commHash; eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]; eDescr.coll.func = ncclFuncToString(ct->func); eDescr.coll.sendBuff = ct->sendbuff; @@ -248,7 +250,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.coll.count = ct->count; eDescr.coll.root = ct->root; eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); - eDescr.coll.nMaxChannels = ct->nMaxChannels; + eDescr.coll.nChannels = ct->nChannels; eDescr.coll.nWarps = ct->nWarps; eDescr.coll.algo = ncclAlgoToString(ct->algorithm); eDescr.coll.proto = ncclProtoToString(ct->protocol); @@ -264,7 +266,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { // gives the consistency. if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle && (ct->eActivationMask & ncclProfileKernelCh))) - plan->comm->seqNumber[ct->func]++; + __atomic_fetch_add(&plan->comm->seqNumber[ct->func], 1, __ATOMIC_RELAXED); ct = ct->next; } if (__builtin_expect(ncclProfiler != NULL, 0)) { @@ -277,13 +279,12 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { eDescr.type = ncclProfileP2p; eDescr.parentObj = plan->groupEventHandle; eDescr.rank = plan->comm->rank; - eDescr.p2p.name = plan->comm->commName; - eDescr.p2p.commHash = plan->comm->commHash; eDescr.p2p.func = ncclFuncToString(pt->func); eDescr.p2p.buff = pt->buff; eDescr.p2p.count = pt->count; eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); eDescr.p2p.peer = pt->root; + eDescr.p2p.nChannels = pt->nChannels; ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); } pt = pt->next; @@ -319,7 +320,7 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { // made of sliceSteps steps rather than one step. In the profiler we are still // interested in whole network transfers though, so we account for this when // computing the actual network step number. 
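// Illustration (arbitrary values): with args->sliceSteps == 4 and sub->nsteps == 16
// the proxy advances in 4-step slices, so the profiler reports
// nSteps = DIVUP(16, 4) = 4 network steps, and a proxy stepId of 10 maps to
// profiler step DIVUP(10, 4) = 3.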
-ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) { +ncclResult_t ncclProfilerStartProxyOpEvent(int s, struct ncclProxyArgs* args) { TIME_START_EVENT(proxyOpStart); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { @@ -333,29 +334,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args eDescr.proxyOp.peer = sub->peer; eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps); eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps; - eDescr.proxyOp.isSend = 1; - ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr); - } - } - TIME_STOP_EVENT(proxyOpStart); - return ncclSuccess; -} - -ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) { - TIME_START_EVENT(proxyOpStart); - struct ncclProxySubArgs* sub = &args->subs[s]; - if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileProxyOp; - eDescr.parentObj = sub->taskEventHandle; - eDescr.rank = sub->rank; - eDescr.proxyOp.pid = sub->pid; - eDescr.proxyOp.channelId = sub->channelId; - eDescr.proxyOp.peer = sub->peer; - eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps); - eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps; - eDescr.proxyOp.isSend = 0; + eDescr.proxyOp.isSend = args->progress == ncclTransports[TRANSPORT_NET]->send.proxyProgress ? 1 : 0; ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr); } } @@ -385,7 +364,8 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar eDescr.parentObj = sub->opEventHandle; eDescr.rank = sub->rank; eDescr.proxyStep.step = step_; - ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); + ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr); + sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; } } TIME_STOP_EVENT(proxyStepStart); @@ -403,7 +383,8 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar eDescr.parentObj = sub->opEventHandle; eDescr.rank = sub->rank; eDescr.proxyStep.step = step_; - ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr); + ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr); + sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; } } TIME_STOP_EVENT(proxyStepStart); @@ -415,9 +396,9 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int s, struct ncclProxyArgs* args, i struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0)) { int step_ = DIVUP(stepId, args->sliceSteps); - if (sub->stepEventHandles[step_%NCCL_STEPS]) { - ncclProfiler->stopEvent(sub->stepEventHandles[step_%NCCL_STEPS]); - sub->stepEventHandles[step_%NCCL_STEPS] = NULL; + if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) { + ncclProfiler->stopEvent(sub->pHandles[step_%NCCL_STEPS].stepEventHandle); + sub->pHandles[step_%NCCL_STEPS].stepEventHandle = NULL; } } TIME_STOP_EVENT(proxyStepStop); @@ -451,7 +432,7 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) { return ncclSuccess; } -ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) { +ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t 
start) { if (__builtin_expect(ncclProfiler != NULL, 0)) { struct ncclProxySubArgs* sub = &args->subs[s]; if (sub->eActivationMask & ncclProfileKernelCh) { @@ -459,29 +440,31 @@ ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) { eDescr.type = ncclProfileKernelCh; eDescr.parentObj = sub->taskEventHandle; eDescr.kernelCh.channelId = sub->channelId; + eDescr.kernelCh.pTimer = start; ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr); } } return ncclSuccess; } -ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) { +ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop) { if (__builtin_expect(ncclProfiler != NULL, 0)) { struct ncclProxySubArgs* sub = &args->subs[s]; if (sub->kernelEventHandle) { + ncclProfilerEventStateArgs_t a = { }; + a.kernelCh.pTimer = stop; + ncclProfiler->recordEventState(sub->kernelEventHandle, ncclProfilerKernelChStop, &a); ncclProfiler->stopEvent(sub->kernelEventHandle); } } return ncclSuccess; } -ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) { +ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, ncclProfilerEventState_t eState) { TIME_START_EVENT(proxyOpRecord); struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { ncclProfilerEventStateArgs_t a = { }; - a.proxyOp.steps = DIVUP(steps, args->sliceSteps); - a.proxyOp.transSize = transSize; ncclProfiler->recordEventState(sub->opEventHandle, eState, &a); } TIME_STOP_EVENT(proxyOpRecord); @@ -493,8 +476,10 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs* struct ncclProxySubArgs* sub = &args->subs[s]; if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) { int step_ = DIVUP(stepId, args->sliceSteps); - if (sub->stepEventHandles[step_%NCCL_STEPS]) { - ncclProfiler->recordEventState(sub->stepEventHandles[step_%NCCL_STEPS], eState, 0); + if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) { + ncclProfilerEventStateArgs_t a = { }; + a.proxyStep.transSize = sub->transSize; + ncclProfiler->recordEventState(sub->pHandles[step_%NCCL_STEPS].stepEventHandle, eState, &a); } } TIME_STOP_EVENT(proxyStepRecord); @@ -547,18 +532,28 @@ bool ncclProfilerPluginLoaded(void) { ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) { if (__builtin_expect(ncclProfiler != NULL, 0)) { - struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle; - if (type == 0) { // start + if (type == ncclProfilerNetEventStart) { // start + struct ncclProxyEventHandle* p = (struct ncclProxyEventHandle*)pHandle; + struct ncclProxySubArgs* sub = p->subArgPtr; if (sub->eActivationMask & ncclProfileNetPlugin) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileNetPlugin; - eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS]; + eDescr.parentObj = p->stepEventHandle; eDescr.rank = sub->rank; eDescr.netPlugin.id = pluginId; eDescr.netPlugin.data = extData; ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr); } - } else { // stop + } else if (type == ncclProfilerNetEventStop) { // stop + ncclProfiler->stopEvent(*eHandle); + } else if (type == ncclProfilerNetEventUpdate) { // update + ncclProfilerEventStateArgs_t args = { }; + args.netPlugin.data = extData; + ncclProfiler->recordEventState(*eHandle, 
ncclProfilerNetPluginUpdate, &args); + } else { // update and stop + ncclProfilerEventStateArgs_t args = { }; + args.netPlugin.data = extData; + ncclProfiler->recordEventState(*eHandle, ncclProfilerNetPluginUpdate, &args); ncclProfiler->stopEvent(*eHandle); } } diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc index 139742942..2126afc68 100644 --- a/src/plugin/profiler/profiler_v1.cc +++ b/src/plugin/profiler/profiler_v1.cc @@ -53,6 +53,7 @@ static uint8_t ncclStringToDatatype(const char* dt) { } static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + *eHandle = NULL; ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 }; eDescr_v1.type = eDescr->type; eDescr_v1.parentObj = eDescr->parentObj; @@ -60,8 +61,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP switch(eDescr->type) { case ncclProfileGroup: break; case ncclProfileColl: { - eDescr_v1.coll.name = eDescr->coll.name; - eDescr_v1.coll.commHash = eDescr->coll.commHash; + eDescr_v1.coll.name = nullptr; // removed in v4 + eDescr_v1.coll.commHash = 0; // removed in v4 eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; @@ -71,14 +72,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype); eDescr_v1.coll.op = 0; // removed in v2 eDescr_v1.coll.trafficBytes = 0; // removed in v3 - eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v1.coll.nMaxChannels = eDescr->coll.nChannels; eDescr_v1.coll.nWarps = eDescr->coll.nWarps; eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo); eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto); } break; case ncclProfileP2p: { - eDescr_v1.p2p.name = eDescr->p2p.name; - eDescr_v1.p2p.commHash = eDescr->p2p.commHash; + eDescr_v1.p2p.name = nullptr; // removed in v4 + eDescr_v1.p2p.commHash = 0; // removed in v4 eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); eDescr_v1.p2p.buff = eDescr->p2p.buff; eDescr_v1.p2p.count = eDescr->p2p.count; @@ -97,21 +98,34 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP eDescr_v1.proxyStep.step = eDescr->proxyStep.step; } break; case ncclProfileProxyCtrl: break; - case ncclProfileKernelCh: - case ncclProfileNetPlugin: { - *eHandle = NULL; - return ncclSuccess; - } - default:; + default: return ncclSuccess; } return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1); } static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { - return ncclProfiler_v1->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v1_t*)eStateArgs); + ncclProfilerEventStateArgs_v1_t args = { }; + switch (eState) { + case ncclProfilerProxyCtrlIdle: + case ncclProfilerProxyCtrlActive: + case ncclProfilerProxyCtrlSleep: + case ncclProfilerProxyCtrlWakeup: + case ncclProfilerProxyCtrlAppend: + case ncclProfilerProxyCtrlAppendEnd: + args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps; + break; + case ncclProfilerProxyStepSendGPUWait: + case ncclProfilerProxyStepSendWait: + case ncclProfilerProxyStepRecvWait: + case ncclProfilerProxyStepRecvFlushWait: + case ncclProfilerProxyStepRecvGPUWait: + break; + default: return ncclSuccess; + } + return ncclProfiler_v1->recordEventState(eHandle, eState, &args); } 
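// Compatibility shim: the core always drives profilers through the newest interface,
// and this file adapts it to a v1 plugin. Event types v1 does not know about
// (kernelCh, netPlugin) return ncclSuccess with a NULL event handle, fields the
// newer descriptor no longer carries (coll/p2p name and commHash) are filled with
// nullptr/0, and nChannels is forwarded as v1's nMaxChannels. The init wrapper
// below likewise drops the extra arguments (commName, commHash, nNodes, nranks,
// rank, logfn) that a v1 plugin cannot accept.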
-static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) { +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent; diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc index 52907d6e3..11e521e90 100644 --- a/src/plugin/profiler/profiler_v2.cc +++ b/src/plugin/profiler/profiler_v2.cc @@ -20,8 +20,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP switch(eDescr->type) { case ncclProfileGroup: break; case ncclProfileColl: { - eDescr_v2.coll.name = eDescr->coll.name; - eDescr_v2.coll.commHash = eDescr->coll.commHash; + eDescr_v2.coll.name = nullptr; // removed in v4 + eDescr_v2.coll.commHash = 0; // removed in v4 eDescr_v2.coll.seqNumber = eDescr->coll.seqNumber; eDescr_v2.coll.func = eDescr->coll.func; eDescr_v2.coll.sendBuff = eDescr->coll.sendBuff; @@ -30,14 +30,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP eDescr_v2.coll.root = eDescr->coll.root; eDescr_v2.coll.datatype = eDescr->coll.datatype; eDescr_v2.coll.trafficBytes = 0; // removed in v3 - eDescr_v2.coll.nMaxChannels = eDescr->coll.nMaxChannels; + eDescr_v2.coll.nMaxChannels = eDescr->coll.nChannels; eDescr_v2.coll.nWarps = eDescr->coll.nWarps; eDescr_v2.coll.algo = eDescr->coll.algo; eDescr_v2.coll.proto = eDescr->coll.proto; } break; case ncclProfileP2p: { - eDescr_v2.p2p.name = eDescr->p2p.name; - eDescr_v2.p2p.commHash = eDescr->p2p.commHash; + eDescr_v2.p2p.name = nullptr; // removed in v4 + eDescr_v2.p2p.commHash = 0; // removed in v4 eDescr_v2.p2p.func = eDescr->p2p.func; eDescr_v2.p2p.buff = eDescr->p2p.buff; eDescr_v2.p2p.count = eDescr->p2p.count; @@ -62,10 +62,28 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP } static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { - return ncclProfiler_v2->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v2_t *)eStateArgs); + ncclProfilerEventStateArgs_v2_t args = { }; + switch (eState) { + case ncclProfilerProxyCtrlIdle: + case ncclProfilerProxyCtrlActive: + case ncclProfilerProxyCtrlSleep: + case ncclProfilerProxyCtrlWakeup: + case ncclProfilerProxyCtrlAppend: + case ncclProfilerProxyCtrlAppendEnd: + args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps; + break; + case ncclProfilerProxyStepSendGPUWait: + case ncclProfilerProxyStepSendWait: + case ncclProfilerProxyStepRecvWait: + case ncclProfilerProxyStepRecvFlushWait: + case ncclProfilerProxyStepRecvGPUWait: + break; + default: return ncclSuccess; + } + return ncclProfiler_v2->recordEventState(eHandle, eState, &args); } -static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) { +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent; diff --git a/src/plugin/profiler/profiler_v3.cc b/src/plugin/profiler/profiler_v3.cc index 322bea57a..3dba3231a 100644 --- 
a/src/plugin/profiler/profiler_v3.cc +++ b/src/plugin/profiler/profiler_v3.cc @@ -6,14 +6,105 @@ #include "comm.h" #include "nccl_profiler.h" +#include "checks.h" +static ncclProfiler_t ncclProfiler; static ncclProfiler_v3_t* ncclProfiler_v3; +static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + *eHandle = nullptr; + ncclProfilerEventDescr_v3_t eDescr_v3 = { }; + eDescr_v3.type = eDescr->type; + eDescr_v3.parentObj = eDescr->parentObj; + eDescr_v3.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v3.coll.name = nullptr; // removed in v4 + eDescr_v3.coll.commHash = 0; // removed in v4 + eDescr_v3.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v3.coll.func = eDescr->coll.func; + eDescr_v3.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v3.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v3.coll.count = eDescr->coll.count; + eDescr_v3.coll.root = eDescr->coll.root; + eDescr_v3.coll.datatype = eDescr->coll.datatype; + eDescr_v3.coll.nMaxChannels = eDescr->coll.nChannels; + eDescr_v3.coll.nWarps = eDescr->coll.nWarps; + eDescr_v3.coll.algo = eDescr->coll.algo; + eDescr_v3.coll.proto = eDescr->coll.proto; + } break; + case ncclProfileP2p: { + eDescr_v3.p2p.name = nullptr; // removed in v4 + eDescr_v3.p2p.commHash = 0; // removed in v4 + eDescr_v3.p2p.func = eDescr->p2p.func; + eDescr_v3.p2p.buff = eDescr->p2p.buff; + eDescr_v3.p2p.count = eDescr->p2p.count; + eDescr_v3.p2p.datatype = eDescr->p2p.datatype; + eDescr_v3.p2p.peer = eDescr->p2p.peer; + } break; + case ncclProfileProxyOp: { + eDescr_v3.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v3.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v3.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v3.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v3.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v3.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v3.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + case ncclProfileKernelCh: { + eDescr_v3.kernelCh.channelId = eDescr->kernelCh.channelId; + } break; + case ncclProfileNetPlugin: { + eDescr_v3.netPlugin.id = eDescr->netPlugin.id; + eDescr_v3.netPlugin.data = eDescr->netPlugin.data; + } break; + default: return ncclSuccess; + } + return ncclProfiler_v3->startEvent(context, eHandle, &eDescr_v3); +} + +static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { + ncclProfilerEventStateArgs_v3_t args = { }; + switch (eState) { + case ncclProfilerProxyCtrlIdle: + case ncclProfilerProxyCtrlActive: + case ncclProfilerProxyCtrlSleep: + case ncclProfilerProxyCtrlWakeup: + case ncclProfilerProxyCtrlAppend: + case ncclProfilerProxyCtrlAppendEnd: + args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps; + break; + case ncclProfilerProxyStepSendGPUWait: + case ncclProfilerProxyStepSendWait: + case ncclProfilerProxyStepRecvWait: + case ncclProfilerProxyStepRecvFlushWait: + case ncclProfilerProxyStepRecvGPUWait: + break; + default: return ncclSuccess; + } + return ncclProfiler_v3->recordEventState(eHandle, eState, &args); +} + +static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { + NCCLCHECK(ncclProfiler_v3->init(context, eActivationMask)); + ncclProfiler.startEvent = 
ncclProfiler_startEvent; + ncclProfiler.stopEvent = ncclProfiler_v3->stopEvent; + ncclProfiler.recordEventState = ncclProfiler_recordEventState; + ncclProfiler.finalize = ncclProfiler_v3->finalize; + return ncclSuccess; +} + ncclProfiler_t* getNcclProfiler_v3(void* lib) { ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3"); if (ncclProfiler_v3) { + ncclProfiler.name = ncclProfiler_v3->name; + ncclProfiler.init = ncclProfiler_init; INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name); - return ncclProfiler_v3; + return &ncclProfiler; } INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3"); return NULL; diff --git a/src/plugin/profiler/profiler_v4.cc b/src/plugin/profiler/profiler_v4.cc new file mode 100644 index 000000000..11bed891a --- /dev/null +++ b/src/plugin/profiler/profiler_v4.cc @@ -0,0 +1,21 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" +#include "checks.h" + +static ncclProfiler_v4_t* ncclProfiler_v4; + +ncclProfiler_t* getNcclProfiler_v4(void* lib) { + ncclProfiler_v4 = (ncclProfiler_v4_t*)dlsym(lib, "ncclProfiler_v4"); + if (ncclProfiler_v4) { + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v4->name); + return ncclProfiler_v4; + } + INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v4"); + return NULL; +} diff --git a/src/proxy.cc b/src/proxy.cc index c27d23455..74ec70f0e 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -416,6 +416,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr args->state = ncclProxyOpReady; args->progress = op->connection->tcomm->proxyProgress; args->proxyAppendPtr = op->connection->proxyAppendPtr; + if (args->pattern != ncclPatternProfiler) ncclProfilerStartProxyOpEvent(subIndex, args); return ncclSuccess; } @@ -634,10 +635,10 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool const int rank = comm->rank, nranks = comm->nRanks; int *nstepsSend = NULL, *nstepsRecv = NULL; PatRSAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); + struct ncclPatStep ps = {0}; NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_up); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_up); - struct ncclPatStep ps; do { algo.getNextOp(&ps); if (ps.flags & PatSkipped) continue; @@ -668,10 +669,10 @@ ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* op, bool const int rank = comm->rank, nranks = comm->nRanks; int *nstepsSend = NULL, *nstepsRecv = NULL; PatAGAlgorithm algo(op->chunkSize, NCCL_STEPS, 16, 0, size, size, op->chunkSize, rank, nranks); + struct ncclPatStep ps = {0}; NCCLCHECKGOTO(ncclCalloc(&nstepsSend, log2Up(nranks)), result, exit_pat_down); NCCLCHECKGOTO(ncclCalloc(&nstepsRecv, log2Up(nranks)), result, exit_pat_down); - struct ncclPatStep ps; do { algo.getNextOp(&ps); if (ps.flags & PatSkipped) continue; @@ -933,11 +934,13 @@ void* ncclProxyProgress(void *proxyState_) { INFO(NCCL_ALL,"%s:%d -> %d [Progress Thread]", __FILE__, __LINE__, ret); break; } - void* eHandle; - ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); - if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, 
ncclProfilerProxyCtrlIdle); - if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive); - ncclProfilerStopProxyCtrlEvent(eHandle); + if ((lastIdle == 0 && idle == 1) || (lastIdle == 1 && idle == 0)) { + void* eHandle; + ncclProfilerStartProxyCtrlEvent(proxyState->profilerContext, &eHandle); + if (lastIdle == 0 && idle == 1) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlIdle); + if (lastIdle == 1 && idle == 0) ncclProfilerRecordProxyCtrlEventState(eHandle, 0, ncclProfilerProxyCtrlActive); + ncclProfilerStopProxyCtrlEvent(eHandle); + } if (idle || !state->active || (++proxyOpAppendCounter == ncclParamProgressAppendOpFreq())) { int added = 0; proxyOpAppendCounter = 0; diff --git a/src/ras/collectives.cc b/src/ras/collectives.cc index 72833604f..4f8b6efc4 100644 --- a/src/ras/collectives.cc +++ b/src/ras/collectives.cc @@ -606,6 +606,10 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL for (int commIdx = 0; commIdx < nNcclComms; commIdx++) { if (ncclComms[commIdx] == nullptr) // nullptr's are always at the end after sorting. break; + if (!__atomic_load_n(&ncclComms[commIdx]->peerInfoValid, __ATOMIC_ACQUIRE)) { + // Critical data is not yet initialized -- ignore the communicator. + continue; + } // A process may manage multiple GPUs and thus have multiple communicators with the same commHash. // Comparing just the commHash is OK though within communicators that are part of the same process. if (commIdx == 0 || ncclComms[commIdx]->commHash != ncclComms[commIdx-1]->commHash) { @@ -651,6 +655,8 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL // collCommIdx counts rasCollComms::comm (comm); commIdx indexes ncclComms. for (int collCommIdx = 0, commIdx = 0; collCommIdx < nComms; collCommIdx++) { struct ncclComm* ncclComm = ncclComms[commIdx]; + if (!__atomic_load_n(&ncclComm->peerInfoValid, __ATOMIC_ACQUIRE)) + continue; comm->commId.commHash = ncclComm->commHash; comm->commId.hostHash = ncclComm->peerInfo->hostHash; @@ -663,15 +669,15 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL commIdx++) { ncclComm = ncclComms[commIdx]; struct rasCollComms::comm::rank* rank = comm->ranks+comm->nRanks; - ncclResult_t asyncError; rank->commRank = ncclComm->rank; // rasNetSendCollReq initializes coll->peers[0] to our rasNetListeningSocket.addr, so peerIdx is initially // always 0. It will increase after we send this response back to the peer we got the request from. 
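// Snapshot this rank's view of the communicator: sequence numbers, init state,
// async errors (read with acquire loads from the comm and, when a proxy thread
// exists, from its proxyState), and the finalize/destroy/abort flags.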
rank->peerIdx = 0; memcpy(rank->collOpCounts, ncclComm->seqNumber, sizeof(rank->collOpCounts)); rank->status.initState = ncclComm->initState; - if (ncclCommGetAsyncError(ncclComm, &asyncError) == ncclSuccess) - rank->status.asyncError = asyncError; + rank->status.asyncError = __atomic_load_n(&ncclComm->asyncResult, __ATOMIC_ACQUIRE); + if (rank->status.asyncError == ncclSuccess && ncclComm->proxyState) + rank->status.asyncError = __atomic_load_n(&ncclComm->proxyState->asyncResult, __ATOMIC_ACQUIRE); rank->status.finalizeCalled = (ncclComm->finalizeCalled != 0); rank->status.destroyFlag = (ncclComm->destroyFlag != 0); rank->status.abortFlag = (__atomic_load_n(ncclComm->abortFlag, __ATOMIC_ACQUIRE) != 0); @@ -680,7 +686,7 @@ static ncclResult_t rasCollCommsInit(struct rasCollRequest** pReq, size_t* pReqL comm->nRanks++; } // for (commIdx) - if (firstNewSkipMissingIdx != -1 && + if (__atomic_load_n(&ncclComm->peerInfoValid, __ATOMIC_ACQUIRE) && firstNewSkipMissingIdx != -1 && memcmp(req->comms.skipMissingRanksComms+firstNewSkipMissingIdx, &comm->commId, sizeof(comm->commId)) == 0) { // Fill in the missingRanks array that follows the comm->ranks. struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks+comm->nRanks); diff --git a/src/ras/rasnet.cc b/src/ras/rasnet.cc index 43aa042a7..1194e61b5 100644 --- a/src/ras/rasnet.cc +++ b/src/ras/rasnet.cc @@ -365,15 +365,16 @@ ncclResult_t rasNetAcceptNewSocket() { NCCLCHECKGOTO(ncclSocketAccept(&sock->sock, &rasNetListeningSocket), ret, fail); NCCLCHECKGOTO(ncclSocketReady(&sock->sock, &ready), ret, fail); - if (sock->sock.fd != -1) { - NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); - rasPfds[sock->pfd].fd = sock->sock.fd; - rasPfds[sock->pfd].events = POLLIN; // Initially we'll just wait for a handshake from the other side. This also - // helps the code tell the sides apart. - sock->status = RAS_SOCK_CONNECTING; - - INFO(NCCL_RAS, "RAS new incoming socket connection from %s", ncclSocketToString(&sock->sock.addr, rasLine)); - } + if (sock->sock.fd == -1) + goto fail; // We'll return ncclSuccess, but we need to clean up the incomplete socket first. + + NCCLCHECKGOTO(rasGetNewPollEntry(&sock->pfd), ret, fail); + rasPfds[sock->pfd].fd = sock->sock.fd; + rasPfds[sock->pfd].events = POLLIN; // Initially we'll just wait for a handshake from the other side. This also + // helps the code tell the sides apart. + sock->status = RAS_SOCK_CONNECTING; + + INFO(NCCL_RAS, "RAS new incoming socket connection from %s", ncclSocketToString(&sock->sock.addr, rasLine)); exit: return ret; @@ -480,7 +481,10 @@ void rasSocksHandleTimeouts(int64_t now, int64_t* nextWakeup) { // Once we get an EOF when receiving data, we finalize the termination. // For not fully established sockets, we can terminate immediately as there's no useful data to extract. void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRetryOffset, bool retry) { - assert(sock->status != RAS_SOCK_CLOSED); + if (sock->status == RAS_SOCK_CLOSED) { + INFO(NCCL_RAS, "RAS socket in closed state passed for termination -- internal error?"); + // The code below can actually handle such a case gracefully. + } if (sock->conn) { struct rasConnection* conn = sock->conn; // If the sock of the connection points back to us, it means that we are the current socket of this @@ -542,8 +546,10 @@ void rasSocketTerminate(struct rasSocket* sock, bool finalize, uint64_t startRet } else { // Either the caller requested finalization or we cannot receive on it. 
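// A socket that is not fully established may never have been assigned a poll
// entry (sock->pfd is still -1 in that case), so only clear its rasPfds slot
// when one exists.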
(void)ncclSocketClose(&sock->sock); - rasPfds[sock->pfd].fd = -1; - rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0; + if (sock->pfd != -1) { + rasPfds[sock->pfd].fd = -1; + rasPfds[sock->pfd].events = rasPfds[sock->pfd].revents = 0; + } free(sock->recvMsg); freeSockEntry(sock); } diff --git a/src/register/coll_reg.cc b/src/register/coll_reg.cc index 2ab7e9448..d9d9fb436 100644 --- a/src/register/coll_reg.cc +++ b/src/register/coll_reg.cc @@ -1,6 +1,7 @@ #include "register.h" #include "transport.h" #include "enqueue.h" +#include "register_inline.h" static ncclResult_t registerCheckP2PConnection(struct ncclComm* comm, struct ncclConnector* conn, struct ncclTopoGraph* graph, int peer, bool* needReg) { if (conn->connected) { @@ -61,32 +62,34 @@ ncclResult_t ncclRegisterCollNvlsBuffers( if (nvlsReged && comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) { if (comm->planner.persistent && ncclParamGraphRegister()) { - ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); - if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + if (info->func == ncclFuncAllGather) { + ncclCollnetGraphRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + } else if (info->func == ncclFuncReduceScatter) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + } else if (info->func == ncclFuncAllReduce) { + ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle, cleanupQueue, &info->nCleanupQueueElts); + if (collnetReged) ncclCollnetGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle, cleanupQueue, &info->nCleanupQueueElts); + } } if (collnetReged == 0 && ncclParamLocalRegister()) { - ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle); - if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + if (info->func == ncclFuncAllGather) { + ncclCollnetLocalRegisterBuffer(comm, info->sendbuff, sendbuffSize, collNetSend, &collnetReged, &sendHandle); + } else if (info->func == ncclFuncReduceScatter) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + } else if (info->func == ncclFuncAllReduce) { + ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetRecv, &collnetReged, &recvHandle); + if (collnetReged) ncclCollnetLocalRegisterBuffer(comm, info->recvbuff, recvbuffSize, collNetSend, &collnetReged, &sendHandle); + } } } if (nvlsReged) { *regNeedConnect = 0; /* tweak NVLS channels usage; for registered NVLS buffer to saturate bandwidth. */ - if (comm->nNodes == 1) { - if (info->func == ncclFuncReduceScatter) { - // RS: Further tweaks for Blackwell with NVLS registered buffers - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 6 : 5)); - } - else { - // AR/AG: Further tweaks for Blackwell with NVLS registered buffers - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 
8 : 4)); - } - } else { - // Further tweaks for Blackwell with NVLS registered buffers - info->nMaxChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, (comm->compCap >= 100) ? 7 : 6)); - } + int recChannels; + NCCLCHECK(ncclNvlsRegResourcesQuery(comm, info, &recChannels)); + info->nMaxChannels = recChannels; info->regBufType |= NCCL_NVLS_REG_BUFFER; } @@ -188,7 +191,7 @@ ncclResult_t ncclRegisterCollBuffers( struct ncclChannel* channel = comm->channels; int ipcRegFlag = 0, netSendRegFlag = 0, netRecvRegFlag = 0; void *sendHandle, *recvHandle; - if (info->func != ncclFuncReduceScatter && comm->intraNodeP2pSupport) { + if (info->func != ncclFuncReduceScatter && comm->isAllDirectP2p) { for (int r = 0; r < NCCL_MAX_DIRECT_ARITY; ++r) { for (int down = 0; down < 2; ++down) { int peer = down ? channel->collnetDirect.down[r] : channel->collnetDirect.up[r]; @@ -308,7 +311,7 @@ ncclResult_t ncclRegisterCollBuffers( } } } - if (nPeers > 0 && comm->intraNodeP2pSupport) { + if (nPeers > 0 && comm->isAllDirectP2p) { if (comm->planner.persistent && ncclParamGraphRegister()) { ncclIpcGraphRegisterBuffer(comm, info->recvbuff, recvbuffSize, peerRanks, nPeers, NCCL_IPC_COLLECTIVE, ®BufFlag, &info->recvbuffOffset, &info->recvbuffRmtAddrs, cleanupQueue, &info->nCleanupQueueElts); } @@ -365,7 +368,7 @@ ncclResult_t ncclRegisterCollBuffers( void *sendHandle, *recvHandle; NCCLCHECK(ncclRegFind(comm, info->recvbuff, recvbuffSize, &recvRegRecord)); if (recvRegRecord == NULL && !(comm->planner.persistent && ncclParamGraphRegister())) goto exit; - if (comm->intraNodeP2pSupport) { + if (comm->isAllDirectP2p) { for (int c = 0; c < comm->nChannels; ++c) { struct ncclChannel* channel = comm->channels + c; struct ncclTree* tree = NULL; diff --git a/src/register/register.cc b/src/register/register.cc index 930367a97..59928f57e 100644 --- a/src/register/register.cc +++ b/src/register/register.cc @@ -10,24 +10,21 @@ #include "net.h" #include "register.h" #include "transport.h" +#include "group.h" -ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, size_t size, struct ncclReg** reg) { - struct ncclRegCache* cache = &comm->regCache; - uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)data & -pageSize; - size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; +NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); - *reg = NULL; - for (int slot=0; /*true*/; slot++) { - if (slot == cache->population || addr < cache->slots[slot]->addr) return ncclSuccess; - if ((addr >= cache->slots[slot]->addr) && - ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { - *reg = cache->slots[slot]; - return ncclSuccess; +static ncclResult_t regFindHandleFromSymAddr(struct ncclComm* comm, void* baseSymPtr, struct ncclReg** handle) { + struct ncclRegCache* cache = &comm->regCache; + *handle = NULL; + for (int slot = 0; slot < cache->population; slot++) { + if (baseSymPtr == cache->slots[slot]->baseSymPtr) { + *handle = cache->slots[slot]; + break; } } + return ncclSuccess; } -NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) { if (reg && isValid) { @@ -43,14 +40,14 @@ ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool i NCCLCHECK(CommCheck(comm, "ncclCommRegister", "comm")); struct ncclRegCache* cache = &comm->regCache; uintptr_t pageSize = cache->pageSize; - uintptr_t addr = (uintptr_t)data & -pageSize; - size_t pages = ((uintptr_t)data + size - addr + 
pageSize-1)/pageSize; + uintptr_t begAddr = (uintptr_t)data & -pageSize; + uintptr_t endAddr = ((uintptr_t)data + size + pageSize-1) & -pageSize; if (comm->checkPointers) NCCLCHECK(CudaPtrCheck(data, comm, "buff", "ncclCommRegister")); INFO(NCCL_REG, "register comm %p buffer %p size %zi", comm, data, size); for (int slot=0; /*true*/; slot++) { - if ((slot == cache->population) || (addr < cache->slots[slot]->addr)) { + if ((slot == cache->population) || (begAddr < cache->slots[slot]->begAddr)) { if (cache->population == cache->capacity) { // must grow cache cache->capacity = cache->capacity < 32 ? 32 : 2*cache->capacity; NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity)); @@ -58,15 +55,15 @@ ncclResult_t ncclRegister(struct ncclComm* comm, void* data, size_t size, bool i memmove(cache->slots+slot+1, cache->slots+slot, (cache->population-slot)*sizeof(struct ncclReg*)); NCCLCHECK(ncclCalloc(cache->slots+slot, 1)); struct ncclReg* regSlot = cache->slots[slot]; - regSlot->addr = addr; - regSlot->pages = pages; + regSlot->begAddr = begAddr; + regSlot->endAddr = endAddr; if (isGraph) regSlot->graphRefs = 1; else regSlot->localRefs = 1; cache->population += 1; *handle = regSlot; goto exit; - } else if ((addr >= cache->slots[slot]->addr) && - ((addr-cache->slots[slot]->addr)/pageSize+pages) <= cache->slots[slot]->pages) { + } else if ((cache->slots[slot]->begAddr <= begAddr) && + (cache->slots[slot]->endAddr >= endAddr)) { if (isGraph) cache->slots[slot]->graphRefs++; else cache->slots[slot]->localRefs++; *handle = cache->slots[slot]; @@ -120,7 +117,7 @@ ncclResult_t ncclRegCleanup(struct ncclComm* comm) { struct ncclRegCache* cache = &comm->regCache; for (int i = 0; i < cache->population; i++) { struct ncclReg* reg = cache->slots[i]; - INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->addr, reg->pages); + INFO(NCCL_INIT, "Cleanup buffer %p pages %lx", (void*)reg->begAddr, (reg->endAddr-reg->begAddr)/cache->pageSize); NCCLCHECK(regCleanup(comm, reg)); free(reg); } @@ -177,3 +174,104 @@ ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *hand NCCLCHECK(commDeregister(comm, true, handle)); return ncclSuccess; } + +ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle) { + ncclResult_t ret = ncclSuccess; + void* regSymAddr = NULL; + ALIGN_SIZE(comm->symAllocHead, alignment); + NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, baseSize, memHandle, ®SymAddr), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, baseSize, regSymAddr), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); + comm->symAllocHead += baseSize; + regHandle->baseSymPtr = regSymAddr; + regHandle->symSize = baseSize; +exit: + return ret; +fail: + regHandle->baseSymPtr = NULL; + regHandle->symSize = 0; + goto exit; +} + +NCCL_API(ncclResult_t, ncclCommWindowRegister, ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); +ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags) { + ncclResult_t ret = ncclSuccess; + CUmemGenericAllocationHandle memHandle; + size_t baseSize; + void* baseAddr = NULL; + struct ncclReg* regHandle = NULL; + int saveDev; + + *win = NULL; + + CUDACHECK(cudaGetDevice(&saveDev)); + 
NCCLCHECK(ncclGroupStartInternal()); + if (!ncclParamLocalRegister() || !ncclCuMemEnable()) { + goto exit; + } + + NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); + + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + if (comm && buff && size && win) { + size_t alignment = 0; + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)buff), ret, fail); + // size and alignment check + if (!((uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0 && baseSize % NCCL_REC_PAGE_SIZE == 0 && (uintptr_t)buff + size <= (uintptr_t)baseAddr + baseSize)) { + WARN("buffer %p (baseAddr %p align %d) size %zu (baseSize %ld align %d) does not satisfy symmetric registration requirements", buff, baseAddr, (uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0, size, baseSize, baseSize % NCCL_REC_PAGE_SIZE == 0); + goto fail; + } + NCCLCHECKGOTO(ncclRegister(comm, baseAddr, baseSize, false, (void**)®Handle), ret, fail); + NCCLCHECKGOTO(ncclCalloc(win, 1), ret, fail); + (*win)->handle = regHandle; + regHandle->winFlags = winFlags; + if (regHandle->baseSymPtr == NULL && comm->symmetricSupport) { + struct ncclSymRegTask* task; + CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, baseAddr), ret, fail); + CUCHECKGOTO(cuMemRelease(memHandle), ret, fail); + alignment = baseSize >= NCCL_REC_PAGE_SIZE * 72L ? NCCL_MAX_PAGE_SIZE : NCCL_REC_PAGE_SIZE; + NCCLCHECKGOTO(ncclCalloc(&task, 1), ret, fail); + task->buff = buff; + task->baseSize = baseSize; + task->memHandle = memHandle; + task->regHandle = regHandle; + task->alignment = alignment; + ncclIntruQueueEnqueue(&comm->symRegTaskQueue, task); + ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); + } + } + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ret = ncclGroupEndInternal()); + cudaSetDevice(saveDev); + return ret; +fail: + free(*win); + *win = NULL; + goto exit; +} + +NCCL_API(ncclResult_t, ncclCommWindowDeregister, ncclComm_t comm, ncclWindow_t win); +ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win) { + ncclResult_t ret = ncclSuccess; + int saveDev; + struct ncclReg* regHandle; + CUDACHECK(cudaGetDevice(&saveDev)); + if (win == NULL) goto exit; + regHandle = win->handle; + if (regHandle && ncclParamLocalRegister() && ncclCuMemEnable()) { + if (regHandle->baseSymPtr) { + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, regHandle->symSize, regHandle->baseSymPtr), ret, fail); + NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, regHandle->symSize, regHandle->baseSymPtr), ret, fail); + } + NCCLCHECKGOTO(commDeregister(comm, false, regHandle), ret, fail); + } + free(win); +exit: + CUDACHECK(cudaSetDevice(saveDev)); + return ret; +fail: + goto exit; +} diff --git a/src/symmetric.cc b/src/symmetric.cc new file mode 100644 index 000000000..f5b1e6c22 --- /dev/null +++ b/src/symmetric.cc @@ -0,0 +1,296 @@ +#include "symmetric.h" +#include "comm.h" +#include "device.h" +#include + +constexpr char const* kernelName[] = { + // Must align with enum ncclSymKernelId definition in src/include/symmetric.h + "AllReduce_AGxLL_R", + "AllReduce_AGxLLMC_R", + "AllReduce_RSxLD_AGxST", + "AllReduce_RSxLDMC_AGxSTMC", + "AllGather_LL", + "AllGather_LLMC", + "AllGather_ST", + "AllGather_STMC", + "ReduceScatter_LL", + "ReduceScatter_LD", + "ReduceScatter_LDMC" +}; + +constexpr uint32_t kernelMask_STMC = 1<nRanks; + int nMaxBlocks = ncclSymMaxBlocks; + int nMaxBlocksNvls = divUp((comm->cudaArch < 1000 ? 
16 : 32), nRanks); + size_t busBytes; // max(bytes sent, bytes received) + double busMultiplier = 1; + + switch (k) { + default: + busBytes = size_t(1)<<50; + break; + + case ncclSymKernelId_AllReduce_AGxLL_R: + busBytes = nRanks*nBytes*LL_BusFactor; + break; + case ncclSymKernelId_AllReduce_AGxLLMC_R: + busBytes = nRanks*nBytes*LL_BusFactor; + busMultiplier = 1.1; // To beat non-MC LL + break; + case ncclSymKernelId_AllReduce_RSxLD_AGxST: + busBytes = 2*nBytes*(nRanks-1)/nRanks; + break; + case ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC: + busBytes = nBytes/nRanks + nBytes; + busMultiplier = nRanks; + nMaxBlocks = nMaxBlocksNvls; + break; + + case ncclSymKernelId_AllGather_LL: + busBytes = nRanks*nBytes*LL_BusFactor; + break; + case ncclSymKernelId_AllGather_LLMC: + busBytes = nRanks*nBytes*LL_BusFactor; + busMultiplier = 1.1; // To beat non-MC LL + break; + case ncclSymKernelId_AllGather_ST: + busBytes = (nRanks-1)*nBytes; + break; + case ncclSymKernelId_AllGather_STMC: + busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC. + busMultiplier = 0.55*nRanks; + nMaxBlocks = nMaxBlocksNvls; + break; + + case ncclSymKernelId_ReduceScatter_LL: + busBytes = nRanks*nBytes*LL_BusFactor; + break; + case ncclSymKernelId_ReduceScatter_LD: + busBytes = (nRanks-1)*nBytes; + break; + case ncclSymKernelId_ReduceScatter_LDMC: + busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC. + busMultiplier = 0.55*nRanks; + nMaxBlocks = nMaxBlocksNvls; + break; + } + + nMaxBlocks = std::min(nMaxBlocks, comm->config.maxCTAs); + int nMinBlocks = comm->config.minCTAs; + + int nUserCTAs = std::min(ncclSymMaxBlocks, ncclParamSymCTAs()); + if (nUserCTAs > 0) nMinBlocks = nMaxBlocks = nUserCTAs; + + bool isLL = kernelMask_LL>>k & 1; + bool isAG = kernelMask_AG>>k & 1; + bool isAR = kernelMask_AR>>k & 1; + constexpr double GBps = (1<<30)/1.e6; + double baseLat, smBw, peakBw; + if (comm->cudaArch < 1000) { + baseLat = isLL ? 4.5 : 7.8; + smBw = isAR ? 65*GBps : 44*GBps; + peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 480*GBps : 320*GBps; + } else { + baseLat = isLL ? (isAG ? 8.5 : 11) : (isAR ? 19.5 : 13.0); + smBw = 55*GBps; + peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 1000*GBps : 600*GBps; + } + *nBlocks = nMaxBlocks; + *timeUs = model(busBytes, baseLat, nMaxBlocks, smBw, busMultiplier, peakBw); + // Use least number of blocks that puts us within a tolerance of peak performance. 
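+  // model() is defined earlier in this file and is not visible in this hunk; a
+  // plausible form, consistent with its arguments, would be
+  //   time(us) = baseLat + busBytes / min(nBlocks * smBw * busMultiplier, peakBw)
+  // so the search below picks the smallest block count whose estimate stays within
+  // 2.5% of the estimate obtained with nMaxBlocks.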
+ for (int bn = nMinBlocks; bn < nMaxBlocks; bn++) { + double time = model(busBytes, baseLat, bn, smBw, busMultiplier, peakBw); + if (time <= 1.025*(*timeUs)) { + *nBlocks = bn; + *timeUs = time; + break; + } + } +} + +bool ncclSymImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) { + bool isFloat; + switch (ty) { + case ncclFloat64: + case ncclFloat32: + case ncclFloat16: + case ncclBfloat16: + case ncclFloat8e4m3: + case ncclFloat8e5m2: + isFloat = true; + break; + default: + isFloat = false; + break; + } + + switch (coll) { + case ncclFuncAllGather: + return true; + case ncclFuncAllReduce: + case ncclFuncReduceScatter: + return red == ncclDevSum && isFloat && ty != ncclFloat64; + default: + return false; + } +} + +ncclResult_t ncclSymPickKernel( + struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, + float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps + ) { + uint32_t kmask = kernelMask_coll(coll); + kmask &= kernelMask_user(); + + bool hasSTMC = comm->nvlsSupport; + bool hasLDMC = false; + if (comm->nvlsSupport) { + switch (ty) { + case ncclInt32: + case ncclUint32: + case ncclInt64: + case ncclUint64: + case ncclFloat16: + case ncclBfloat16: + hasLDMC = red == ncclDevSum || red == ncclDevMinMax; + break; + case ncclFloat8e4m3: + case ncclFloat8e5m2: + hasLDMC = red == ncclDevSum || red == ncclDevMinMax; + hasLDMC &= comm->compCap >= 100; + break; + case ncclFloat: + case ncclDouble: + hasLDMC = red == ncclDevSum; + break; + default: break; + } + } + if (!hasSTMC) kmask &= ~kernelMask_STMC; + if (!hasLDMC) kmask &= ~kernelMask_LDMC; + + size_t nBytes = nElts*ncclTypeSize(ty); + size_t nBusBytes = (coll == ncclFuncAllReduce ? 1 : comm->nRanks)*nBytes; + // LL kernels use 32-bit ints to track element counts and indices. 
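+  // Example: an 8-rank AllGather of 300 MiB per rank gives nBusBytes = 8 * 300 MiB
+  // = 2.34 GiB >= 2 GiB, so the LL kernels (32-bit element tracking) are masked out
+  // below; once nBusBytes reaches 32x that limit (64 GiB) the whole symmetric kernel
+  // mask is cleared and no symmetric kernel is considered.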
+ if (nBusBytes >= (size_t(2)<<30)) kmask &= ~kernelMask_LL; + // Any kernel might use 32-bit int to track unrolled loop chunks (which are going + // to be at least 32 bytes per chunk) + if (nBusBytes >= 32*(size_t(2)<<30)) kmask = 0; + + ncclSymKernelId bestKernel = ncclSymKernelId_Count; + float bestTime = 1.e30f; + int bestBlocks = 999; + + constexpr float smPenalty = .025f; // 2.5% percent increase in time per SM + uint32_t kmaskRemain = kmask; + while (kmaskRemain != 0) { + ncclSymKernelId k = (ncclSymKernelId)popFirstOneBit(&kmaskRemain); + float kTime; + int kBlocks; + queryModel(comm, k, nBytes, &kTime, &kBlocks); + if (kTime*(1.0f + smPenalty*kBlocks) < bestTime*(1.0f + smPenalty*bestBlocks)) { + bestKernel = k; + bestTime = kTime; + bestBlocks = kBlocks; + } + } + + *kernelId = bestKernel; + *estTimeUs = kmask==0 || kernelMask_user() == (1<= ncclSymKernelId_Count) { + return "Unknown"; + } + return kernelName[kernelId]; +} diff --git a/src/transport.cc b/src/transport.cc index f98b77a43..d98b98b1b 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -71,7 +71,7 @@ NCCL_PARAM(ConnectRoundMaxPeers, "CONNECT_ROUND_MAX_PEERS", 128); NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0); #include -ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2pSupport, bool* directMode) { +ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode) { bool supportFlag = true; bool directFlag = false; if (comm->localRanks == 1) { @@ -84,8 +84,9 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p struct ncclPeerInfo* ipeerInfo = &comm->peerInfo[ipeer]; struct ncclPeerInfo* jpeerInfo = &comm->peerInfo[jpeer]; int canConnect = 0; - NCCLCHECK(ncclTransports[0]->canConnect(&canConnect, comm, NULL, ipeerInfo, jpeerInfo)); - if (!canConnect && supportFlag == true) { + int intermediateRank = -1; + NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, ipeerInfo->rank, jpeerInfo->rank, &canConnect, NULL, &intermediateRank)); + if (!canConnect || intermediateRank != -1) { supportFlag = false; } if (ipeerInfo->hostHash == jpeerInfo->hostHash && ipeerInfo->pidHash == jpeerInfo->pidHash) directFlag = true; @@ -93,9 +94,9 @@ ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* intraNodeP2p } } } - *intraNodeP2pSupport = supportFlag; + *isAllDirectP2p = supportFlag; *directMode = directFlag; - if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type intraNodeP2pSupport %d directMode %d", supportFlag, directFlag); + if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type isAllDirectP2p %d directMode %d", supportFlag, directFlag); return ncclSuccess; } diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 84e1f84a0..386865e21 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -13,6 +13,7 @@ #include "assert.h" #include "bootstrap.h" #include "channel.h" +#include "register_inline.h" int64_t ncclParamGdrCopySyncEnable(); int64_t ncclParamGdrCopyFlushEnable(); @@ -1188,7 +1189,7 @@ static ncclResult_t collnetRegisterBuffer(struct ncclComm* comm, const void* use goto exit; } else { /* start register collnet buffer */ - struct collnetRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize }; + struct collnetRegInfo info = { regRecord->begAddr, regRecord->endAddr - regRecord->begAddr }; void* handle = NULL; struct ncclConnInfo* conn = (type == collNetRecv) ? 
&comm->channels[0].peers[comm->nRanks]->recv[type].conn : &comm->channels[0].peers[comm->nRanks]->send[type].conn; @@ -1389,7 +1390,7 @@ ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; char line[1024]; - if (comm->collNetSupport == 0) goto exit; + if (comm->config.collnetEnable == 0) goto exit; // Connect Collnet + chain for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; @@ -1421,7 +1422,7 @@ ncclResult_t ncclCollNetChainBufferSetup(ncclComm_t comm) { ncclResult_t ncclCollNetDirectBufferSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; - if (comm->collNetSupport == 0) goto exit; + if (comm->config.collnetEnable == 0) goto exit; // Connect intra-node CollNet + Direct for (int c = 0; c < comm->nChannels; c++) { @@ -1498,8 +1499,8 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop comm->collNetHeads = headsUnique; comm->collNetHeadsNum = nHeadsUnique; - if (parent && parent->collNetSupport && parent->nNodes == comm->nNodes) { - if (!parent->config.splitShare) { + if (parent && parent->config.collnetEnable && parent->nNodes == comm->nNodes) { + if (!parent->shareResources) { collNetSetupFail = 1; goto fail; } @@ -1547,9 +1548,6 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop NCCLCHECKGOTO(collNetInitRailRankMap(comm), ret, fail); } else { - /* TODO: CX-6 and CX-7 both do not support multiple sharp resources per process, if child comm cannot - * share the sharp resource from parent, we cannot use sharp in this case. This restriction might be - * lifted by sharp plugin/IB hardware in the future. */ collNetSetupFail = 1; if (comm->rank == 0) { WARN("Child comms (nRanks %d) fails to share parent comms (nRanks %d) sharp resources", comm->nRanks, parent->nRanks); @@ -1629,7 +1627,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop return ret; fail: ncclTransportCollNetFree(comm); - comm->collNetSupport = 0; + comm->config.collnetEnable = 0; goto exit; } diff --git a/src/transport/net.cc b/src/transport/net.cc index 61b15ce20..c0cd20d6e 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -16,6 +16,7 @@ #include "transport.h" #include "shm.h" #include +#include "register_inline.h" static_assert(sizeof(ncclNetHandle_t) <= CONNECT_SIZE, "NET Connect info is too large"); @@ -629,8 +630,6 @@ static ncclResult_t sendProxySetup(struct ncclProxyConnection* connection, struc resources->netDeviceVersion = props.netDeviceVersion; resources->netDeviceType = props.netDeviceType; - resources->netDeviceVersion = props.netDeviceVersion; - resources->netDeviceType = props.netDeviceType; /* point-to-point size limits*/ resources->maxP2pBytes = props.maxP2pBytes; if((resources->maxP2pBytes <= 0) || (resources->maxP2pBytes > NCCL_MAX_NET_SIZE_BYTES)) { @@ -732,7 +731,14 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteRank; - if (comms->sendComm[resources->channelId] == NULL) ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); + // let only one localrank connect to a tpRemoteRank to avoid duplicate connections + if (comms->activeConnect[resources->channelId] == 0) + 
comms->activeConnect[resources->channelId] = (resources->tpLocalRank + 1); + if (comms->sendComm[resources->channelId] == NULL + && comms->activeConnect[resources->channelId] == (resources->tpLocalRank + 1)) { + ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, + comms->sendComm + resources->channelId, &resources->netDeviceHandle); + } resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { @@ -886,7 +892,15 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str NCCLCHECK(ncclCalloc(progressState->netComms + resources->netDev, proxyState->tpnRanks)); } struct ncclSharedNetComms* comms = progressState->netComms[resources->netDev] + resources->tpRemoteProxyRank; - if (comms->recvComm[resources->channelId] == NULL) ret = proxyState->ncclNet->accept(resources->netListenComm, comms->recvComm+resources->channelId, &resources->netDeviceHandle); + // reuse handle to for netdev/remote rank to avoid duplicate connections + if (comms->activeAccept[resources->channelId] == 0) + comms->activeAccept[resources->channelId] = (resources->tpLocalRank + 1); + //try connecting while comm is null + if (comms->recvComm[resources->channelId] == NULL + && comms->activeAccept[resources->channelId] == (resources->tpLocalRank + 1)) { + ret = proxyState->ncclNet->accept(resources->netListenComm, + comms->recvComm+resources->channelId, &resources->netDeviceHandle); + } resources->netRecvComm = comms->recvComm[resources->channelId]; if (comms->recvComm[resources->channelId]) comms->recvRefCount[resources->channelId]++; } else { @@ -1101,7 +1115,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct // Set step base for next op resources->step = sub->base + sub->nsteps; sub->posted = sub->transmitted = sub->done = 0; - ncclProfilerStartSendProxyOpEvent(s, args); + ncclProfilerRecordProxyOpEventState(s, args, ncclProfilerProxyOpInProgress_v4); if (!sub->reg) sub->sendMhandle = resources->mhandles[args->protocol]; } @@ -1140,7 +1154,6 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct } else { sub->posted += args->sliceSteps; } - ncclProfilerRecordProxyOpEventState(s, args, sub->posted, sub->transSize, ncclProfilerProxyOpSendPosted); ncclProfilerRecordProxyStepEventState(s, args, postedStepId, ncclProfilerProxyStepSendGPUWait); args->idle = 0; continue; @@ -1188,18 +1201,17 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct } } if (ready) { - ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted+args->sliceSteps, sub->transSize, ncclProfilerProxyOpSendRemFifoWait); + ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendPeerWait_v4); // Data is ready, try to send. // Coverity complains about the size here as pointing to an out-of-scope temporary. Which is nonsense, // since size is a plain integer. 
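+          // Each in-flight slice now carries its own profiler handle: the slot below is
+          // indexed by DIVUP(stepId, sliceSteps) % NCCL_STEPS, i.e. one entry per
+          // outstanding step in the NCCL_STEPS ring, so the network plugin can attribute
+          // its events (e.g. per-QP completions) to the step that posted the isend.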
// coverity[use_invalid:FALSE] - NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, sub, sub->requests+buffSlot)); + void* phandle = &sub->pHandles[DIVUP(transmittedStepId, args->sliceSteps)%NCCL_STEPS]; + NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, phandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); - sub->transSize += size; + sub->transSize = size; sub->transmitted += args->sliceSteps; - sub->profilerSteps++; - ncclProfilerRecordProxyOpEventState(s, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpSendTransmitted); ncclProfilerRecordProxyStepEventState(s, args, transmittedStepId, ncclProfilerProxyStepSendWait); args->idle = 0; continue; @@ -1220,7 +1232,6 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] request %p done", sub->done, buffSlot, sub->nsteps, sub->requests[buffSlot]); sub->done += args->sliceSteps; ncclProfilerStopProxyStepEvent(s, args, doneStepId); - ncclProfilerRecordProxyOpEventState(s, args, sub->done, sub->transSize, ncclProfilerProxyOpSendDone); if (resources->shared == 0) { volatile uint64_t* sendHead = resources->gdcSync ? resources->gdcSync : &resources->sendMem->head; @@ -1282,7 +1293,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct sub->posted = sub->received = sub->transmitted = sub->done = 0; sub->regBufferReady = 0; for (int i=0; ireg) sub->recvMhandle = resources->mhandles[args->protocol]; } @@ -1343,7 +1354,7 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct if (sub->nbytes < sizes[subCount]) sizes[subCount] = sub->nbytes; tags[subCount] = resources->tpRemoteRank; mhandles[subCount] = sub->recvMhandle; - phandles[subCount] = sub; + phandles[subCount] = &sub->pHandles[DIVUP(postedStepId, args->sliceSteps)%NCCL_STEPS]; subCount++; } } @@ -1362,8 +1373,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int postedStepId = sub->posted; TRACE(NCCL_NET, "recvProxy [%ld/%ld/%d] Irecv posted, buff %p, size %ld, myRank %d, channelId %d, mhandle %p", sub->posted, (sub->base + sub->posted) % NCCL_STEPS, sub->nsteps, ptrs[i], sizes[i], proxyState->tpRank, sub->channelId, mhandles[i]); sub->posted += args->sliceSteps; - sub->profilerSteps++; - ncclProfilerRecordProxyOpEventState(s+i, args, sub->posted, sub->transSize, ncclProfilerProxyOpRecvPosted); ncclProfilerRecordProxyStepEventState(s+i, args, postedStepId, ncclProfilerProxyStepRecvWait); } args->idle = 0; @@ -1393,9 +1402,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct recvNetResources* resources = (struct recvNetResources*)(sub->connection->transportResources); volatile struct ncclConnFifo* connFifo = (volatile struct ncclConnFifo*)resources->recvMem->connFifo; connFifo[buffSlot].size = -1; - sub->transSize += sizes[i]; + sub->transSize = sizes[i]; sub->received += args->sliceSteps; - ncclProfilerRecordProxyOpEventState(s+i, args, sub->received, sub->transSize, ncclProfilerProxyOpRecvReceived); ncclProfilerRecordProxyStepEventState(s+i, args, receivedStepId, 
ncclProfilerProxyStepRecvFlushWait); if (step < sub->nsteps) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); @@ -1459,7 +1467,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct int transmittedStepId = sub->transmitted; sub->transmitted += args->sliceSteps; - ncclProfilerRecordProxyOpEventState(s+i, args, sub->transmitted, sub->transSize, ncclProfilerProxyOpRecvTransmitted); ncclProfilerRecordProxyStepEventState(s+i, args, transmittedStepId, ncclProfilerProxyStepRecvGPUWait); if (step < sub->nsteps) { __sync_synchronize(); @@ -1479,7 +1486,6 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct ncclProxySubArgs* subGroup = args->subs+s; for (int i=0; igroupSize; i++) { struct ncclProxySubArgs* sub = subGroup + i; - int doneStepId = sub->done; if (sub->done == sub->nsteps) continue; if (sub->transmitted > sub->done) { struct recvNetResources* resources = (struct recvNetResources*) (sub->connection->transportResources); @@ -1494,9 +1500,9 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct NCCLCHECK(proxyState->ncclNet->irecvConsumed(resources->netRecvComm, subGroup->recvRequestsSubCount, subGroup->recvRequestsCache[sub->done%NCCL_STEPS])); subGroup->recvRequestsCache[sub->done%NCCL_STEPS] = NULL; } + int doneStepId = sub->done; sub->done += args->sliceSteps; ncclProfilerStopProxyStepEvent(s+i, args, doneStepId); - ncclProfilerRecordProxyOpEventState(s+i, args, sub->done, sub->transSize, ncclProfilerProxyOpRecvDone); args->idle = 0; if (sub->done == sub->nsteps) { args->done++; @@ -1547,9 +1553,9 @@ static ncclResult_t netRegisterBuffer(ncclComm* comm, const void* userbuff, size if (found) { *outRegBufFlag = 1; outHandle[p] = netHandle->handle; - INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, netHandle->handle); + INFO(NCCL_REG, "rank %d - NET reuse buffer %p size %ld (baseAddr %p size %ld) handle %p", comm->rank, userbuff, buffSize, (void*)regRecord->begAddr, regRecord->endAddr - regRecord->begAddr, netHandle->handle); } else { - struct netRegInfo info = { regRecord->addr, regRecord->pages * comm->regCache.pageSize }; + struct netRegInfo info = { regRecord->begAddr, regRecord->endAddr - regRecord->begAddr }; void* handle = NULL; if (peerConn->conn.flags & NCCL_DIRECT_NIC) { diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index c049531f8..19a505e1c 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -25,8 +25,10 @@ #include "timer.h" #include "ibvwrap.h" +#include "mlx5/mlx5dvwrap.h" -#define MAXNAMESIZE 64 +#define MAXSUFFIXSIZE 16 +#define MAXNAMESIZE (64 + MAXSUFFIXSIZE) static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress ncclIbIfAddr; @@ -55,6 +57,17 @@ struct ncclIbStats { int fatalErrorCount; }; +enum ncclIbProvider { + IB_PROVIDER_NONE = 0, + IB_PROVIDER_MLX5 = 1, + IB_PROVIDER_MAX = 2, +}; + +const char* ibProviderName[] = { + "None", + "Mlx5", +}; + static int ncclNIbDevs = -1; struct alignas(64) ncclIbDev { pthread_mutex_t lock; @@ -77,6 +90,12 @@ struct alignas(64) ncclIbDev { struct ibv_port_attr portAttr; struct ncclIbStats stats; int dmaBufSupported; + enum ncclIbProvider ibProvider; + union { + struct { + int dataDirect; + } mlx5; + } capsProvider; }; #define MAX_IB_DEVS 32 @@ -106,6 +125,7 @@ NCCL_PARAM(IbAdaptiveRouting, 
"IB_ADAPTIVE_ROUTING", -2); NCCL_PARAM(IbFifoTc, "IB_FIFO_TC", -1); NCCL_PARAM(IbAsyncEvents,"IB_RETURN_ASYNC_EVENTS",1); NCCL_PARAM(IbEceEnable,"IB_ECE_ENABLE",1); +NCCL_PARAM(IbDataDirect,"IB_DATA_DIRECT",1); static ncclResult_t ncclIbStatsInit(struct ncclIbStats* stat) { __atomic_store_n(&stat->fatalErrorCount, 0, __ATOMIC_RELAXED); @@ -451,6 +471,10 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) if (p == NULL) { WARN("Could not find real path of %s (%s)", devName, devicePath); } else { + // Merge multi-port NICs into the same PCI device + p[strlen(p)-1] = '0'; + // Also merge virtual functions (VF) into the same device + if (ncclParamIbMergeVfs()) p[strlen(p)-3] = p[strlen(p)-4] = '0'; // Keep the real port aside (the ibv port is always 1 on recent cards) *realPort = 0; for (int d=0; dndevs > 1) { - WARN("NET/IB : Trying to merge multiple devices together when NCCL_IB_MERGE_NICS=0. Please enable it or disable device merging in NCCL."); + INFO(NCCL_NET, "NET/IB : Skipping makeVDevice, NCCL_IB_MERGE_NICS=0"); return ncclInvalidUsage; } @@ -565,14 +609,17 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr if (ncclParamIbDisable()) return ncclInternalError; static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } + if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. Advance features like CX-8 Direct-NIC will be disabled."); } if (ncclNIbDevs == -1) { pthread_mutex_lock(&ncclIbLock); wrap_ibv_fork_init(); if (ncclNIbDevs == -1) { + int nIpIfs = 0; ncclNIbDevs = 0; ncclNMergedIbDevs = 0; - if (ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { + NCCLCHECK(ncclFindInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1, &nIpIfs)); + if (nIpIfs != 1) { WARN("NET/IB : No IP interface found."); ret = ncclInternalError; goto fail; @@ -600,6 +647,17 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr WARN("NET/IB : Unable to open device %s", devices[d]->name); continue; } + enum ncclIbProvider ibProvider = IB_PROVIDER_NONE; + char dataDirectDevicePath[PATH_MAX]; + int dataDirectSupported = 0; + if (wrap_mlx5dv_is_supported(devices[d])) { + ibProvider = IB_PROVIDER_MLX5; + snprintf(dataDirectDevicePath, PATH_MAX, "/sys"); + if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) { + INFO(NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name); + if(ncclParamIbDataDirect()) dataDirectSupported = 1; + } + } int nPorts = 0; struct ibv_device_attr devAttr; memset(&devAttr, 0, sizeof(devAttr)); @@ -609,58 +667,69 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr continue; } for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) { - struct ibv_port_attr portAttr; - if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) { - WARN("NET/IB : Unable to query port_num %d", port_num); - continue; - } - if (portAttr.state != IBV_PORT_ACTIVE) continue; - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND - && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; + for (int dataDirect = 0; dataDirect < 1 + dataDirectSupported; ++dataDirect) { + struct ibv_port_attr portAttr; + if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) { + WARN("NET/IB : Unable to query port_num %d", port_num); + 
continue; + } + if (portAttr.state != IBV_PORT_ACTIVE) continue; + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND + && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; - // check against user specified HCAs/ports - if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) { - continue; + // check against user specified HCAs/ports + if (! (matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) { + continue; + } + pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); + ncclIbDevs[ncclNIbDevs].device = d; + ncclIbDevs[ncclNIbDevs].ibProvider = ibProvider; + ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; + ncclIbDevs[ncclNIbDevs].portAttr = portAttr; + ncclIbDevs[ncclNIbDevs].portNum = port_num; + ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); + ncclIbDevs[ncclNIbDevs].context = context; + ncclIbDevs[ncclNIbDevs].pdRefs = 0; + ncclIbDevs[ncclNIbDevs].pd = NULL; + if (!dataDirect) { + strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); + NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); + } + else { + snprintf(ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE, "%s_dma", devices[d]->name); + NCCLCHECK(ncclCalloc(&ncclIbDevs[ncclNIbDevs].pciPath, PATH_MAX)); + strncpy(ncclIbDevs[ncclNIbDevs].pciPath, dataDirectDevicePath, PATH_MAX); + ncclIbDevs[ncclNIbDevs].capsProvider.mlx5.dataDirect = 1; + } + ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; + ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; + ncclIbDevs[ncclNIbDevs].mrCache.population = 0; + ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; + NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats)); + + // Enable ADAPTIVE_ROUTING by default on IB networks + // But allow it to be overloaded by an env parameter + ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 
1 : 0; + if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); + + INFO(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s provider=%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, + NCCL_IB_LLSTR(portAttr.link_layer), ibProviderName[ncclIbDevs[ncclNIbDevs].ibProvider], ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + + PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); + ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); + PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d + + // Add this plain physical device to the list of virtual devices + int vDev; + ncclNetVDeviceProps_t vProps = {0}; + vProps.ndevs = 1; + vProps.devs[0] = ncclNIbDevs; + NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps)); + + ncclNIbDevs++; + nPorts++; } - pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); - ncclIbDevs[ncclNIbDevs].device = d; - ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; - ncclIbDevs[ncclNIbDevs].portAttr = portAttr; - ncclIbDevs[ncclNIbDevs].portNum = port_num; - ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; - ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); - ncclIbDevs[ncclNIbDevs].context = context; - ncclIbDevs[ncclNIbDevs].pdRefs = 0; - ncclIbDevs[ncclNIbDevs].pd = NULL; - strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); - NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); - ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; - ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; - ncclIbDevs[ncclNIbDevs].mrCache.population = 0; - ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; - NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats)); - - // Enable ADAPTIVE_ROUTING by default on IB networks - // But allow it to be overloaded by an env parameter - ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 
1 : 0; - if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); - - TRACE(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - NCCL_IB_LLSTR(portAttr.link_layer), ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); - - PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); - ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); - PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d - - // Add this plain physical device to the list of virtual devices - int vDev; - ncclNetVDeviceProps_t vProps = {0}; - vProps.ndevs = 1; - vProps.devs[0] = ncclNIbDevs; - NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps)); - - ncclNIbDevs++; - nPorts++; } if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; } } @@ -779,6 +848,9 @@ ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) { props->ptrSupport |= NCCL_PTR_DMABUF; // GDR support via DMA-BUF } props->forceFlush = 0; + if (ibDev->capsProvider.mlx5.dataDirect) { + props->forceFlush = 1; + } props->latency = 0; // Not set props->port = ibDev->portNum + ibDev->realPort; props->maxComms = ibDev->maxQp; @@ -893,6 +965,7 @@ struct ncclProfilerInfo { int qpIndex[MAX_QPS_PER_REQ]; int nEventHandles; ncclProfilerNetIbDescr_v1_t data; + void* pHandle; }; struct ncclIbRequest { @@ -1312,23 +1385,27 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan devInfo->gid.global.interface_id = commDev->base.gidInfo.localGid.global.interface_id; // info logging - if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB - for (int q = 0; q < comm->base.nqps; q++) { - // Print just the QPs for this dev - if (comm->base.qps[q].devIndex == i) + for (int q = 0; q < comm->base.nqps; q++) { + // Print just the QPs for this dev + if (comm->base.qps[q].devIndex == i) { + if (devInfo->link_layer == IBV_LINK_LAYER_INFINIBAND) { // IB INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d LID %d subnet-prefix %lu FLID %d fifoRkey=0x%x fifoLkey=0x%x", - comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", - dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, - devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey); - } - } else { // RoCE - for (int q = 0; q < comm->base.nqps; q++) { - // Print just the QPs for this dev - if (comm->base.qps[q].devIndex == i) - INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x} GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", - comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, - commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask, (int64_t)commDev->base.gidInfo.localGidIndex, - devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); + comm->base.vProps.ndevs > 2 ? 
"NCCL MergedDev" : "NCCL Dev", + dev, commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, devInfo->lid, + devInfo->gid.global.subnet_prefix, ncclIbExtractFlid(&devInfo->gid), devInfo->fifoRkey, commDev->fifoMr->lkey); + } else { // RoCE + INFO(NCCL_NET,"NET/IB: %s %d IbDev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX) fifoRkey=0x%x fifoLkey=0x%x", + comm->base.vProps.ndevs > 2 ? "NCCL MergedDev" : "NCCL Dev", dev, + commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, devInfo->mtu, + (int64_t)commDev->base.gidInfo.localGidIndex, + devInfo->gid.global.subnet_prefix, devInfo->gid.global.interface_id, devInfo->fifoRkey, commDev->fifoMr->lkey); + } + // Log ECE info + if (meta.qpInfo[q].ece_supported) { + INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d query_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}", + commDev->base.ibDevN, ibDev->portNum, meta.qpInfo[q].qpn, + meta.qpInfo[q].ece_supported, meta.qpInfo[q].ece.vendor_id, meta.qpInfo[q].ece.options, meta.qpInfo[q].ece.comp_mask); + } } } if (link_layer == IBV_LINK_LAYER_UNSPECIFIED) link_layer = devInfo->link_layer; @@ -1406,8 +1483,14 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan ncclIbSendCommDev* commDev = comm->devs + devIndex; struct ibv_qp* qp = comm->base.qps[q].qp; - if (remQpInfo->ece_supported) + if (remQpInfo->ece_supported) { + struct ncclIbQp* nqp = comm->base.qps + q; + int ibDevN = comm->devs[nqp->devIndex].base.ibDevN; + struct ncclIbDev* ibDev = ncclIbDevs + ibDevN; + INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}", + ibDevN, ibDev->portNum, qp->qp_num, remMeta.qpInfo[q].ece_supported, remMeta.qpInfo[q].ece.vendor_id, remMeta.qpInfo[q].ece.options, remMeta.qpInfo[q].ece.comp_mask); NCCLCHECKGOTO(wrap_ibv_set_ece(qp, &remQpInfo->ece, &remQpInfo->ece_supported), ret, fail); + } ncclIbDev* ibDev = ncclIbDevs + commDev->base.ibDevN; remDevInfo->mtu = std::min(remDevInfo->mtu, ibDev->portAttr.active_mtu); @@ -1415,16 +1498,6 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan NCCLCHECKGOTO(ncclIbRtsQp(qp), ret, fail); } - if (link_layer == IBV_LINK_LAYER_ETHERNET ) { // RoCE - for (int q = 0; q < comm->base.nqps; q++) { - struct ncclIbQp* qp = comm->base.qps + q; - int ibDevN = comm->devs[qp->devIndex].base.ibDevN; - struct ncclIbDev* ibDev = ncclIbDevs + ibDevN; - INFO(NCCL_NET,"NET/IB: IbDev %d Port %d qpn %d set_ece={supported=%d, vendor_id=0x%x, options=0x%x, comp_mask=0x%x}", - ibDevN, ibDev->portNum, remMeta.qpInfo[q].qpn, remMeta.qpInfo[q].ece_supported, remMeta.qpInfo[q].ece.vendor_id, remMeta.qpInfo[q].ece.options, remMeta.qpInfo[q].ece.comp_mask); - } - } - comm->base.nDataQps = std::max(comm->base.vProps.ndevs, comm->base.nRemDevs); comm->base.ready = 1; @@ -1750,9 +1823,8 @@ ncclResult_t ncclIbGetRequest(struct ncclIbNetCommBase* base, struct ncclIbReque if (r->type == NCCL_NET_IB_REQ_UNUSED) { r->base = base; r->sock = NULL; - r->devBases[0] = NULL; - r->devBases[1] = NULL; - r->events[0] = r->events[1] = 0; + memset(r->devBases, 0, sizeof(r->devBases)); + memset(r->events, 0, sizeof(r->events)); *req = r; return ncclSuccess; } @@ -1789,7 +1861,11 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING; if (fd != -1) { /* DMA-BUF support */ - NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, 
returning); + if (!ncclIbDevs[base->ibDevN].capsProvider.mlx5.dataDirect) { + NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, returning); + } else { + NCCLCHECKGOTO(wrap_mlx5dv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT), res, returning); + } } else { if (ncclIbRelaxedOrderingEnabled) { // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support @@ -1897,7 +1973,7 @@ ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { NCCL_PARAM(IbSplitDataOnQps, "IB_SPLIT_DATA_ON_QPS", 0); -ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandle) { +ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot) { struct ncclIbRequest** reqs = comm->fifoReqs[slot]; volatile struct ncclIbSendFifo* slots = comm->fifo[slot]; int nreqs = slots[0].nreqs; @@ -1989,19 +2065,21 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandl struct ibv_send_wr* bad_wr; #ifdef NCCL_ENABLE_NET_PROFILING // QP profiling loop - for (int r=0; rpInfo[0].nEventHandles; - reqs[r]->pInfo[0].qpIndex[nEventHandles%MAX_QPS_PER_REQ] = qpIndex; + assert(nEventHandles < MAX_QPS_PER_REQ); + reqs[r]->pInfo[0].qpIndex[nEventHandles] = qpIndex; // Store info for profiler - int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; reqs[r]->pInfo[0].data.type = ncclProfileQp; reqs[r]->pInfo[0].data.qp.device = devIndex; reqs[r]->pInfo[0].data.qp.wr_id = comm->wrs[r].wr_id; reqs[r]->pInfo[0].data.qp.opcode = comm->wrs[r].opcode; reqs[r]->pInfo[0].data.qp.qpNum = qp->qp->qp_num; reqs[r]->pInfo[0].data.qp.length = comm->sges[r].length; - NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles%MAX_QPS_PER_REQ], 0, pHandle, pluginId, &reqs[r]->pInfo[0].data)); + void* pHandle = reqs[r]->pInfo[0].pHandle; + NCCLCHECK(ncclProfilerFunction(&reqs[r]->pInfo[0].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, pHandle, pluginId, &reqs[r]->pInfo[0].data)); reqs[r]->pInfo[0].nEventHandles++; } #endif @@ -2023,8 +2101,11 @@ ncclResult_t ncclIbMultiSend(struct ncclIbSendComm* comm, int slot, void* pHandl ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; - if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); return ncclInternalError; } - if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } + if (comm->base.ready == 0) { + WARN("NET/IB: ncclIbIsend() called when comm->base.ready == 0"); + *request = NULL; + return ncclInternalError; + } NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__)); struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) mhandle; @@ -2065,6 +2146,9 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* req->send.size = size; req->send.data = data; req->send.offset = 0; +#ifdef NCCL_ENABLE_NET_PROFILING + req->pInfo[0].pHandle = phandle; +#endif // Populate events int nEvents = ncclParamIbSplitDataOnQps() ? 
comm->base.nqps : comm->base.nDataQps; @@ -2094,7 +2178,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, size_t size, int tag, void* } TIME_START(0); - NCCLCHECK(ncclIbMultiSend(comm, slot, phandle)); + NCCLCHECK(ncclIbMultiSend(comm, slot)); // Clear slots[0]->nreqs, as well as other fields to help debugging and sanity checks memset((void*)slots, 0, sizeof(struct ncclIbSendFifo)); @@ -2187,8 +2271,11 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, int n, void** data, siz ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; - if (comm->base.ready == 0) { WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); return ncclInternalError; } - if (comm->base.ready == 0) { *request = NULL; return ncclSuccess; } + if (comm->base.ready == 0) { + WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0"); + *request = NULL; + return ncclInternalError; + } if (n > NCCL_NET_IB_MAX_RECVS) return ncclInternalError; NCCLCHECK(ncclIbStatsCheckFatalCount(&comm->base.stats,__func__)); @@ -2222,14 +2309,17 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int* ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base); #ifdef NCCL_ENABLE_NET_PROFILING // Start a QP event for every request in the multirecv and every qp - for (int r = 0; r < n && phandles; r++) { + for (int r = 0; r < n; r++) { + int nEventHandles = req->pInfo[r].nEventHandles; + assert(nEventHandles < MAX_QPS_PER_REQ); + req->pInfo[r].qpIndex[nEventHandles] = comm->base.qpIndex; // Store info for profiler - int pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; + int64_t pluginId = NCCL_PROFILER_NET_TYPE_IB | NCCL_PROFILER_NET_IB_VER; req->pInfo[r].data.type = ncclProfileQp; req->pInfo[r].data.qp.device = qp->devIndex; req->pInfo[r].data.qp.wr_id = wr.wr_id; req->pInfo[r].data.qp.qpNum = qp->qp->qp_num; - NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[i], 0, phandles[r], pluginId, &req->pInfo[r].data)); + NCCLCHECK(ncclProfilerFunction(&req->pInfo[r].qpEventHandles[nEventHandles], ncclProfilerNetEventStart, phandles[r], pluginId, &req->pInfo[r].data)); req->pInfo[r].nEventHandles++; } #endif @@ -2311,7 +2401,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { sizes[i] = r->recv.sizes[i]; #ifdef NCCL_ENABLE_NET_PROFILING for (int j = 0; j < r->pInfo[i].nEventHandles; j++) { - NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], 1, NULL, 0, NULL)); + NCCLCHECK(ncclProfilerFunction(&r->pInfo[i].qpEventHandles[j], ncclProfilerNetEventStop, NULL, 0, NULL)); } #endif } @@ -2320,7 +2410,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { sizes[0] = r->send.size; #ifdef NCCL_ENABLE_NET_PROFILING for (int j = 0; j < r->pInfo[0].nEventHandles; j++) { - NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], 1, NULL, 0, NULL)); + NCCLCHECK(ncclProfilerFunction(&r->pInfo[0].qpEventHandles[j], ncclProfilerNetEventStop, NULL, 0, NULL)); } #endif } @@ -2368,20 +2458,21 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { #ifdef ENABLE_TRACE char line[SOCKET_NAME_MAXLEN+1]; - TRACE(NCCL_NET, "Got completion from peer %s with status=%d opcode=%d len=%u wr_id=%lu r=%p type=%d events={%d,%d}, i=%d", - ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], i); + TRACE(NCCL_NET, 
"Got completion from peer %s with status=%d opcode=%d len=%u wr_id=%lu r=%p type=%d events={%d,%d,%d,%d}, i=%d", + ncclSocketToString(&addr, line), wc->status, wc->opcode,wc->byte_len, wc->wr_id, req, req->type, req->events[0], req->events[1], req->events[2], req->events[3], i); #endif if (req && req->type == NCCL_NET_IB_REQ_SEND) { for (int j = 0; j < req->nreqs; j++) { struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff); if ((sendReq->events[i] <= 0)) { - WARN("NET/IB: sendReq(%p)->events={%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], i, j); + WARN("NET/IB: sendReq(%p)->events={%d,%d,%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], sendReq->events[2], sendReq->events[3], i, j); return ncclInternalError; } sendReq->events[i]--; #ifdef NCCL_ENABLE_NET_PROFILING // Stop Qp event for sendReq - NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[getReqQpIndex(sendReq, j, wc->qp_num)], 1, NULL, 0, NULL)); + int qpIndex = getReqQpIndex(sendReq, j, wc->qp_num); + NCCLCHECK(ncclProfilerFunction(&sendReq->pInfo[j].qpEventHandles[qpIndex], ncclProfilerNetEventStop, NULL, 0, NULL)); #endif } } else { @@ -2398,7 +2489,8 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) { #ifdef NCCL_ENABLE_NET_PROFILING // Stop Qp event for workFifo for (int j = 0; j < req->nreqs; j++) { - NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[getReqQpIndex(req, j, wc->qp_num)], 1, NULL, 0, NULL)); + int qpIndex = getReqQpIndex(req, j, wc->qp_num); + NCCLCHECK(ncclProfilerFunction(&req->pInfo[j].qpEventHandles[qpIndex], ncclProfilerNetEventStop, NULL, 0, NULL)); } #endif } diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 8034d95fe..985810c47 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -45,7 +45,7 @@ ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallba if (ncclNetIfs == -1) { char names[MAX_IF_NAME_SIZE*MAX_IFS]; union ncclSocketAddress addrs[MAX_IFS]; - ncclNetIfs = ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS); + NCCLCHECK(ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS, &ncclNetIfs)); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); pthread_mutex_unlock(&ncclNetSocketLock); @@ -124,8 +124,9 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { #define MAX_SOCKETS 64 #define MAX_THREADS 16 #define MAX_REQUESTS NCCL_NET_MAX_REQUESTS -#define MIN_CHUNKSIZE (64*1024) +NCCL_PARAM(SocketInlineSize, "SOCKET_INLINE", /*128 B=*/1 << 7); +NCCL_PARAM(SocketMinTaskSize, "SOCKET_MIN_TASKSIZE", /*64 kiB=*/1 << 16); NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2); NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2); @@ -171,6 +172,7 @@ struct ncclNetSocketRequest { int op; void* data; int size; + void* inlineData; struct ncclSocket* ctrlSock; int offset; int used; @@ -211,6 +213,7 @@ struct ncclNetSocketComm { int nSocks; int nThreads; int nextSock; + void* inlineData; struct ncclNetSocketRequest requests[MAX_REQUESTS]; pthread_t helperThread[MAX_THREADS]; struct ncclNetSocketThreadResources threadResources[MAX_THREADS]; @@ -241,13 +244,13 @@ void* persistentSocketThread(void *args_) { data.sock.fd = r->sock->fd; data.sock.op = r->op; data.sock.length = r->size; - ncclProfilerFunction(&eHandle[i+j], 0, resource->pInfo->pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + ncclProfilerFunction(&eHandle[i+j], ncclProfilerNetEventStart, 
resource->pInfo->pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); } #endif r->result = ncclSocketProgress(r->op, r->sock, r->data, r->size, &r->offset); if (r->result != ncclSuccess) { #ifdef NCCL_ENABLE_NET_PROFILING - ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + ncclProfilerFunction(&eHandle[i+j], ncclProfilerNetEventStop, NULL, 0, NULL); eHandle[i+j] = NULL; #endif WARN("NET/Socket : socket progress error"); @@ -257,7 +260,7 @@ void* persistentSocketThread(void *args_) { if (r->offset < r->size) repeat = 1; #ifdef NCCL_ENABLE_NET_PROFILING if (repeat == 0) { - ncclProfilerFunction(&eHandle[i+j], 1, NULL, 0, NULL); + ncclProfilerFunction(&eHandle[i+j], ncclProfilerNetEventStop, NULL, 0, NULL); eHandle[i+j] = NULL; } #endif @@ -360,6 +363,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) goto exit; } +#define SOCKET_CTRL_SIZE (sizeof(int)) ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; @@ -401,6 +405,7 @@ ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* op NCCLCHECK(ncclSocketProgress(NCCL_SOCKET_SEND, sock, &i, sizeof(uint8_t), &done)); if (done == 0) return ncclSuccess; } + NCCLCHECK(ncclCalloc(&comm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize()))); *sendComm = comm; return ncclSuccess; } @@ -449,6 +454,7 @@ ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDevic memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket)); free(sock); } + NCCLCHECK(ncclCalloc(&rComm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize()))); *recvComm = rComm; /* reset lComm state */ @@ -470,6 +476,7 @@ ncclResult_t ncclNetSocketGetRequest(struct ncclNetSocketComm* comm, int op, voi r->used = 1; r->comm = comm; r->nSubs = 0; + r->inlineData = (uint8_t*)comm->inlineData + i * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize()); *req = r; return ncclSuccess; } @@ -520,6 +527,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclPro return ncclInternalError; } +// if the dataSize is smaller than the inline size, return the inline size; if not, return 0 to avoid the extra copy. +static int ncclNetSocketInlineSize(int dataSize) { return (dataSize <= ncclParamSocketInlineSize()) ? 
dataSize : 0; } + ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { *done = 0; struct ncclNetSocketRequest *r = (struct ncclNetSocketRequest*)request; @@ -527,37 +537,55 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { WARN("NET/Socket : test called with NULL request"); return ncclInternalError; } - if (r->used == 1) { /* try to send/recv size */ - int data = r->size; + if (r->used == 1) { /* try to send/recv size (+ inline data if any) */ + int msgSize; + uint8_t* msg = (uint8_t*)r->inlineData; + if (r->op == NCCL_SOCKET_SEND) { + // sender side has the right data size, copy size info + inline data to the buffer + int inlineSize = ncclNetSocketInlineSize(r->size); + msgSize = inlineSize + SOCKET_CTRL_SIZE; + memcpy(msg, &r->size, SOCKET_CTRL_SIZE); + if (inlineSize > 0) memcpy(msg + SOCKET_CTRL_SIZE, r->data, inlineSize); + } else { + // receiver side doesn't have the right data size, wait for the sender to send it + int sizeOffset = 0, senderSize = 0; + while (sizeOffset < SOCKET_CTRL_SIZE) { + NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, msg, SOCKET_CTRL_SIZE, &sizeOffset)); + if (sizeOffset == 0) return ncclSuccess; /* not ready yet*/ + } + memcpy(&senderSize, msg, SOCKET_CTRL_SIZE); + if (senderSize > r->size) { + char line[SOCKET_NAME_MAXLEN + 1]; + union ncclSocketAddress addr; + NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr)); + WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in a healthy state, " + "there may be a mismatch in collective sizes or environment settings (e.g. NCCL_PROTO, NCCL_ALGO) between ranks", + ncclSocketToString(&addr, line), senderSize, r->size); + return ncclInvalidUsage; + } + // copy to the data buffer if we have received some inline data already + int receivedInline = sizeOffset - SOCKET_CTRL_SIZE; + if (receivedInline > 0) memcpy(r->data, msg + SOCKET_CTRL_SIZE, receivedInline); + // from the actual size, extract the remaining inline size to be received and redirect the msg buffer to the user data + r->size = senderSize; + msgSize = ncclNetSocketInlineSize(r->size) - receivedInline; + msg = (uint8_t*)r->data + receivedInline; + } int offset = 0; - NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, &data, sizeof(int), &offset)); - - if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ - - // Not sure we could ever receive less than 4 bytes, but just in case ... - if (offset < sizeof(int)) NCCLCHECK(ncclSocketWait(r->op, r->ctrlSock, &data, sizeof(int), &offset)); - - // Check size is less or equal to the size provided by the user - if (r->op == NCCL_SOCKET_RECV && data > r->size) { - char line[SOCKET_NAME_MAXLEN+1]; - union ncclSocketAddress addr; - NCCLCHECK(ncclSocketGetAddr(r->ctrlSock, &addr)); - WARN("NET/Socket : peer %s message truncated : receiving %d bytes instead of %d. If you believe your socket network is in healthy state, \ - there may be a mismatch in collective sizes or environment settings (e.g. 
NCCL_PROTO, NCCL_ALGO) between ranks", - ncclSocketToString(&addr, line), data, r->size); - return ncclInvalidUsage; + while (offset < msgSize) { + NCCLCHECK(ncclSocketProgress(r->op, r->ctrlSock, msg, msgSize, &offset)); + if (offset == 0) return ncclSuccess; /* not ready yet*/ } - r->size = data; - r->offset = 0; - r->used = 2; // done exchanging size - // divide into subtasks - int chunkOffset = 0, i = 0; + // done exchanging sizes, r->size now contains the actual size + r->used = 2; + r->offset = ncclNetSocketInlineSize(r->size); + int chunkOffset = r->offset, i = 0; if (r->comm->nSocks > 0) { - // each request can be divided up to nSocks tasks - int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); + // each request can be divided up to nSocks tasks, we use the size left to transfer + int taskSize = std::max((int)ncclParamSocketMinTaskSize(), DIVUP(r->size - r->offset, r->comm->nSocks)); while (chunkOffset < r->size) { - int chunkSize = std::min(taskSize, r->size-chunkOffset); - NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); + int chunkSize = std::min(taskSize, r->size - chunkOffset); + NCCLCHECK(ncclNetSocketGetTask(r->comm, &r->pInfo, r->op, (char*)(r->data) + chunkOffset, chunkSize, r->tasks + i++)); chunkOffset += chunkSize; } } @@ -588,7 +616,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { data.sock.fd = r->ctrlSock->fd; data.sock.op = r->op; data.sock.length = r->size; - ncclProfilerFunction(&r->pInfo.eHandle, 0, r->pInfo.pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); + ncclProfilerFunction(&r->pInfo.eHandle, ncclProfilerNetEventStart, r->pInfo.pHandle, NCCL_PROFILER_NET_TYPE_SOCK | 1, &data); } #endif if (r->offset < r->size) { @@ -599,7 +627,7 @@ ncclResult_t ncclNetSocketTest(void* request, int* done, int* size) { *done = 1; r->used = 0; #ifdef NCCL_ENABLE_NET_PROFILING - ncclProfilerFunction(&r->pInfo.eHandle, 1, NULL, 0, NULL); + ncclProfilerFunction(&r->pInfo.eHandle, ncclProfilerNetEventStop, NULL, 0, NULL); r->pInfo.eHandle = NULL; #endif } @@ -673,6 +701,7 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) { NCCLCHECK(ncclSocketReady(&comm->socks[i], &ready)); if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i])); } + if(comm->inlineData) free(comm->inlineData); free(comm); } return ncclSuccess; diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index d99f7cb3e..da8d263f1 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -13,6 +13,7 @@ #include "enqueue.h" #include "register.h" #include "transport.h" +#include "register_inline.h" #if CUDART_VERSION >= 12010 @@ -109,7 +110,9 @@ ncclResult_t nvlsGroupUnbind(struct ncclComm *comm, size_t size, CUmemGenericAll } ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t ucsize, size_t mcsize) { - CUCHECK(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, ucsize)); + // unbind can trigger RM error if buffer is freed already by users + // however, it is safe to ignore the error, and unbind will succeed anyway + CUCALL(cuMulticastUnbind(*mcHandler, dev, 0/*mcOffset*/, ucsize)); CUCHECK(cuMemUnmap(ptr, mcsize)); CUCHECK(cuMemAddressFree(ptr, mcsize)); CUCHECK(cuMemRelease(*mcHandler)); @@ -143,9 +146,9 @@ ncclResult_t nvlsGroupUnmapMem(struct ncclComm *comm, size_t ucsize, void* ucptr #define NVLS_MEM_ALIGN_SIZE (1 << 21) #define NVLS_NCHANNELS_SM90 16 #define NVLS_NCHANNELS_SM100 32 +#define NVLS_NCHANNELS_SM100_NVL 
24 NCCL_PARAM(NvlsEnable, "NVLS_ENABLE", 2); -NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", -2); NCCL_PARAM(NvlsChunkSize, "NVLS_CHUNKSIZE", 128*1024); ncclResult_t ncclNvlsInit(struct ncclComm* comm) { @@ -171,12 +174,31 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { comm->nvlsSupport = 1; } - INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d", comm->nvlsSupport ? "" : "not ", dev); if (comm->nvlsSupport) { - int channels = (comm->compCap >= 100) ? NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM90; - if (ncclParamNvlsChannels() >= 0) channels = ncclParamNvlsChannels(); + int channels; + if (comm->compCap >= 100) { + // Use a reduced number of channels for single node/MNNVL domain on Blackwell. + // comm->nNodes is not yet initialized at this point so we need to use other data. + bool multiNode; + if (comm->MNNVL) { + multiNode = (comm->clique.size < comm->nRanks); + } else { + int i; + for (i = 1; i < comm->nRanks; i++) { + if (comm->peerInfo[i].hostHash != comm->peerInfo[0].hostHash) + break; + } + multiNode = (i < comm->nRanks); + } + channels = (multiNode ? NVLS_NCHANNELS_SM100 : NVLS_NCHANNELS_SM100_NVL); + } else { + channels = NVLS_NCHANNELS_SM90; + } + if (comm->config.nvlsCTAs != NCCL_CONFIG_UNDEF_INT) channels = comm->config.nvlsCTAs; comm->nvlsChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, channels)); } + INFO(NCCL_INIT, "NVLS multicast support is %savailable on dev %d (NVLS_NCHANNELS %d)", + comm->nvlsSupport ? "" : "not ", dev, comm->nvlsChannels); return ncclSuccess; } @@ -242,16 +264,33 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)ucptr, ucsize, ucgran, 0U, 0), ret, fail); // Alloc local physical mem for this NVLS group - CUCHECKGOTO(cuMemCreate(ucHandle, ucsize, &ucprop, 0), ret, fail); - CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, ucsize, 0, *ucHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, ucsize, desc, 1), ret, fail); - CUDACHECKGOTO(cudaMemset(*ucptr, 0, ucsize), ret, fail); + CUCHECKGOTO(cuMemCreate(ucHandle, ucsize, &ucprop, 0), ret, fail1); + CUCHECKGOTO(cuMemMap((CUdeviceptr)*ucptr, ucsize, 0, *ucHandle, 0), ret, fail2); + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)*ucptr, ucsize, desc, 1), ret, fail3); + CUDACHECKGOTO(cudaMemset(*ucptr, 0, ucsize), ret, fail3); // intra-node barrier to mitigate the possible hang in cuMulticastBindMem during abort - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail3); // Bind physical memory to the Multicast group // NB: It will block until all ranks have been added to the Group - CUCHECKGOTO(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, ucsize, 0/*flags*/), ret, fail); + // This is where we normally see issues if the system NVLS/Multicast support is broken + { + CUresult err = CUPFN(cuMulticastBindMem(*mcHandle, 0/*mcOffset*/, *ucHandle, 0/*memOffset*/, ucsize, 0/*flags*/)); + if (err != CUDA_SUCCESS) { + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + if (ncclParamNvlsEnable() == 1) { + // Fail the job as NVLS support is not available + WARN("Failed to bind NVLink SHARP (NVLS) Multicast memory of size %ld : CUDA error %d '%s'.\nThis is usually caused by a system or configuration error in 
the Fabric Manager or NVSwitches.\nDo not force-enable NVLS (NCCL_NVLS_ENABLE=1) if you wish to avoid this error in the future.", ucsize, err, errStr ); + ret = ncclUnhandledCudaError; + } else { + // Continue without NVLS support (returns ncclSuccess) + INFO(NCCL_INIT|NCCL_NVLS, "Failed to bind NVLink SHARP (NVLS) Multicast memory of size %ld : CUDA error %d '%s'. Proceeding without NVLS support.", ucsize, err, errStr); + } + comm->nvlsSupport = comm->nvlsChannels = 0; + goto fail3; + } + } // Map mc virtual address CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)mcptr, mcsize, mcgran, 0U, 0), ret, fail); @@ -263,6 +302,12 @@ static ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc exit: return ret; +fail3: + CUCHECK(cuMemUnmap((CUdeviceptr)*ucptr, ucsize)); +fail2: + CUCHECK(cuMemRelease(*ucHandle)); +fail1: + CUCHECK(cuMemAddressFree((CUdeviceptr)*ucptr, ucsize)); fail: if (allocMcHandle && *mcptr == NULL && *ucptr == NULL) CUCHECK(cuMemRelease(*mcHandle)); goto exit; @@ -291,8 +336,8 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { nvlsPerRankSize = nChannels * 2 * buffSize; nvlsTotalSize = nvlsPerRankSize * nHeads; - INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", - comm, headRank, nHeads, buffSize, nvlsPerRankSize, nvlsTotalSize); + INFO(NCCL_INIT | NCCL_NVLS, "NVLS comm %p headRank %d nHeads %d nvlsRanks %d buffSize %zu nvlsPerRankSize %zu nvlsTotalSize %zu", + comm, headRank, nHeads, comm->localRanks, buffSize, nvlsPerRankSize, nvlsTotalSize); NCCLCHECKGOTO(nvlsAllocateMem(comm, &resources->accessDesc, nvlsTotalSize, &resources->ucBuffHandle, &resources->mcBuffHandle, (void**)&resources->ucBuff, (void**)&resources->mcBuff, &resources->buffUCSize, &resources->buffMCSize), res, fail); @@ -338,32 +383,10 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { size_t typeSize; char shmPath[sizeof("/dev/shm/nccl-XXXXXX")]; uintptr_t *nvlsShmem = NULL; - bool nvlsShare = parent && parent->nvlsSupport && parent->config.splitShare; - int nHeads = comm->channels[0].nvls.nHeads; + bool nvlsShare = parent && parent->nvlsSupport && parent->shareResources && parent->localRanks == comm->localRanks; if (comm->nvlsSupport == 0 || comm->nvlsChannels == 0) return ncclSuccess; - if (nvlsShare && parent->channels[0].nvls.nHeads == nHeads) { - for (int ch = 0; ch < nHeads; ++ch) { - bool find = false; - for (int h = 0; h < parent->channels[0].nvls.nHeads; ++h) { - if (comm->nvlsHeads[ch] == parent->nvlsHeads[h]) { - // find the head - find = true; - break; - } - } - if (find == false) { - nvlsShare = false; - goto setup; - } - } - nvlsShare = true; - } else { - nvlsShare = false; - } - -setup: comm->nvlsChunkSize = ncclParamNvlsChunkSize(); if (nvlsShare) { /* reuse NVLS resources */ @@ -387,9 +410,10 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { comm->nvlsResources->inited = false; comm->nvlsResources->refCount = 1; comm->nvlsResources->nChannels = comm->nvlsChannels; + comm->nvlsResources->nHeads = nHeads; resources = comm->nvlsResources; - if (parent && parent->nvlsSupport && parent->config.splitShare) { + if (parent && parent->nvlsSupport && parent->shareResources) { /* ranks on other nodes might share the NVLS resources, we need to cap nvlsChannels * to make sure nvlsChannels match for each rank. 
*/ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); @@ -529,9 +553,9 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t if (userBuff) { NCCLCHECKGOTO(ncclRegFind(comm, (void*)userBuff, buffSize, ®Record), ret, fail); if (regRecord) { - CUDACHECKGOTO(cudaPointerGetAttributes(&attr, (void*)regRecord->addr), ret, fail); + CUDACHECKGOTO(cudaPointerGetAttributes(&attr, (void*)regRecord->begAddr), ret, fail); if (attr.type == cudaMemoryTypeDevice) { - size_t regSize = regRecord->pages * comm->regCache.pageSize; + size_t regSize = regRecord->endAddr - regRecord->begAddr; memset(&mcprop, 0, sizeof(CUmulticastObjectProp)); mcprop.numDevices = comm->localRanks; mcprop.handleTypes = ncclCuMemHandleType; @@ -546,8 +570,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t ucprop.requestedHandleTypes = ncclCuMemHandleType; CUCHECKGOTO(cuMemGetAllocationGranularity(&ucgran, &ucprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)®Record->baseAddr, ®Record->baseSize, (CUdeviceptr)regRecord->addr), ret, fail); - if (regRecord->addr % ucgran == 0) { + if (regRecord->begAddr % ucgran == 0) { if (regSize % ucgran != 0) { regRecord->regUCSize = ALIGN_SIZE(regSize, ucgran); } else { @@ -555,7 +578,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t } regRecord->state |= NVLS_REG_POSSIBLE; memcpy(®Data[comm->localRank].reg, regRecord, sizeof(struct ncclReg)); - regData[comm->localRank].offset = userBuff - regRecord->addr; + regData[comm->localRank].offset = userBuff - regRecord->begAddr; } } @@ -595,7 +618,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t // Coverity complains that regRecord could be NULL. That won't in practice be the case because we've already checked // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out. // coverity[var_deref_op] - CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->addr, ucsize, 0), ret, fail); + CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->begAddr, ucsize, 0), ret, fail); // Create a VA for the NVLS CUCHECKGOTO(cuMemAddressReserve(®Ptr, mcsize, mcgran, 0U, 0), ret, fail); @@ -610,7 +633,7 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t regRecord->mcHandle = mcHandle; regRecord->state |= NVLS_REG_COMPLETE; /* get all buffer addresses */ - regRecord->caddrs[comm->localRank] = regRecord->addr; + regRecord->caddrs[comm->localRank] = regRecord->begAddr; NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->caddrs + comm->localRank, regRecord->caddrs, sizeof(uintptr_t)), ret, fail); /* Although registration is done, we still need to check whether the offsets are same among ranks. 
*/ @@ -642,23 +665,23 @@ static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbu if (sendRegRecord) { memcpy(®Data[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg)); - regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->addr; + regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->begAddr; } if (recvRegRecord) { memcpy(®Data[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg)); - regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->addr; + regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->begAddr; } NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail); /* first check whether all local ranks find their registered buffer */ for (int i = 0; i < comm->localRanks; ++i) { - if ((regData[i * 2].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2].reg.caddrs[i] != regData[i * 2].reg.addr) { + if ((regData[i * 2].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2].reg.caddrs[i] != regData[i * 2].reg.begAddr) { sendNeedReg = true; } - if ((regData[i * 2 + 1].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2 + 1].reg.caddrs[i] != regData[i * 2 + 1].reg.addr) { + if ((regData[i * 2 + 1].reg.state & NVLS_REG_COMPLETE) == 0 || regData[comm->localRank * 2 + 1].reg.caddrs[i] != regData[i * 2 + 1].reg.begAddr) { recvNeedReg = true; } @@ -787,7 +810,7 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( NCCLCHECK(ncclCommGraphRegister(comm, baseRecv, baseRecvSize, (void**)&recvRegRecord)); } - NCCLCHECK(nvlsRegisterBuffer(comm, baseSend, baseRecv, baseSendSize, baseRecvSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); + NCCLCHECK(nvlsRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); if (*outRegBufUsed) { if (sendRegRecord) { @@ -815,6 +838,124 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( return ncclSuccess; } +ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nvlsSupport) { + CUmulticastObjectProp mcprop = {}; + CUmemGenericAllocationHandle mcHandle; + char shareableHandle[NVLS_HANDLE_SIZE]; + CUmemAccessDesc accessDesc = {}; + + mcprop.numDevices = comm->localRanks; + mcprop.handleTypes = ncclCuMemHandleType; + mcprop.flags = 0; + mcprop.size = comm->baseStride; + + if (comm->localRank == 0) { + NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); + } + + CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->cudaDev), ret, fail); + CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)&comm->baseMCSymPtr, comm->baseStride, NCCL_MAX_PAGE_SIZE, 0, 0), ret, fail); + CUCHECKGOTO(cuMemMap((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride, 0, mcHandle, 0), ret, fail); + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + 
accessDesc.location.id = comm->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride, &accessDesc, 1), ret, fail); + comm->symMCHandle = mcHandle; + } +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nvlsSupport && comm->baseMCSymPtr) { + CUCHECKGOTO(cuMemUnmap((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride), ret, fail); + CUCHECKGOTO(cuMemAddressFree((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride), ret, fail); + CUCHECKGOTO(cuMemRelease(comm->symMCHandle), ret, fail); + } +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr) { + ncclResult_t ret = ncclSuccess; + assert((uintptr_t)ucaddr % NCCL_REC_PAGE_SIZE == 0 && ucsize % NCCL_REC_PAGE_SIZE == 0); + if (comm && comm->nvlsSupport && ucaddr && ucsize > 0) { + CUCHECKGOTO(cuMulticastBindAddr(comm->symMCHandle, offset, (CUdeviceptr)ucaddr, ucsize, 0), ret, fail); + INFO(NCCL_ALLOC, "NVLS symmetric alloc mc buffer ptr %p offset %ld UC addr %p UC size %ld symAllocHead %ld", comm->baseMCSymPtr + offset, offset, ucaddr, ucsize, comm->symAllocHead); + } + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr) { + ncclResult_t ret = ncclSuccess; + if (comm && comm->nvlsSupport && ucaddr && ucsize > 0) { + size_t offset = (size_t)ucaddr - ((size_t)comm->baseUCSymPtr + comm->localRank * comm->baseStride); + CUCHECKGOTO(cuMulticastUnbind(comm->symMCHandle, comm->cudaDev, offset, ucsize), ret, fail); + } +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels) { + int factor; + ncclResult_t ret = ncclSuccess; + if (comm->nNodes == 1) { + if (info->func == ncclFuncReduceScatter) { + factor = (comm->compCap >= 100 ? 6 : 5) * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else if (info->func == ncclFuncAllGather) { + factor = 4 * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else if (info->func == ncclFuncAllReduce) { + if (comm->compCap >= 100) { + factor = 8 * 8; + } else { + factor = 4 * 8; + } + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else { + goto fail; + } + } else { + // Further tweaks for Blackwell with NVLS registered buffers + if (info->func == ncclFuncReduceScatter) { + factor = (comm->bandwidths[ncclFuncReduceScatter][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] > 400 ? 7 : 6) * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else if (info->func == ncclFuncAllGather) { + factor = 6 * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else if (info->func == ncclFuncAllReduce) { + factor = (comm->compCap >= 100 ? 
7 : 6) * 8; + *recChannels = std::max(comm->config.minCTAs, std::min(comm->config.maxCTAs, DIVUP(factor, comm->nvlsResources->nHeads))); + } else { + goto fail; + } + } + +exit: + return ret; +fail: + ret = ncclInvalidArgument; + goto exit; +} + #else /* @@ -860,4 +1001,25 @@ ncclResult_t ncclNvlsDeregBuffer(struct ncclComm* comm, CUmemGenericAllocationHa return ncclSuccess; } +ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm) { + return ncclSuccess; +} + +ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr) { + return ncclSuccess; +} + +ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr) { + return ncclSuccess; +} + +ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm) { + return ncclSuccess; +} + +ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels) { + *recChannels = 0; + return ncclSuccess; +} + #endif /* CUDA_VERSION >= 12010 */ diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index aed84c588..d263dda3a 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -12,6 +12,7 @@ #include "transport.h" #include #include "shm.h" +#include "register_inline.h" enum p2pType { P2P_DIRECT, P2P_INTERMEDIATE, P2P_IPC, P2P_CUMEM }; @@ -826,7 +827,7 @@ ncclResult_t ret = ncclSuccess; // We already have IPC info for peerLocalRank, no need to register it, we can reuse it *regBufFlag = 1; if (isLegacyIpc) *isLegacyIpc = regRecord->ipcInfos[peerLocalRank]->impInfo.legacyIpcCap; - INFO(NCCL_REG, "rank %d - IPC reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->addr, regRecord->pages * comm->regCache.pageSize, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr); + INFO(NCCL_REG, "rank %d - IPC reuse buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p", comm->rank, userbuff, buffSize, (void*)regRecord->begAddr, regRecord->endAddr - regRecord->begAddr, peerRank, regRecord->ipcInfos[peerLocalRank]->impInfo.rmtRegAddr); } else { // Register buffer with peerLocalRank struct ncclProxyConnector* proxyConn = NULL; @@ -885,11 +886,11 @@ ncclResult_t ret = ncclSuccess; void* rmtRegAddr = NULL; ipcInfo.size = baseSize; - ipcInfo.offset = regRecord->addr - (uintptr_t)baseAddr; + ipcInfo.offset = regRecord->begAddr - (uintptr_t)baseAddr; // Now ipcInfo contains all necessary registration info. Start to register buffer on proxy side // and get the remote register address back. 
if (proxyConn) { - INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank); + INFO(NCCL_REG, "rank %d - IPC registering buffer %p size %ld (baseAddr %p size %ld) to peer %d", comm->rank, userbuff, buffSize, (void*)regRecord->begAddr, ipcInfo.size, peerRank); NCCLCHECKGOTO(ncclProxyCallBlocking(comm, proxyConn, ncclProxyMsgRegister, &ipcInfo, sizeof(p2pIpcExpInfo), &rmtRegAddr, sizeof(void*)), ret, fail); } if (rmtRegAddr) { @@ -909,7 +910,7 @@ ncclResult_t ret = ncclSuccess; regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank] = (uintptr_t)rmtRegAddr; needUpdate = true; *regBufFlag = 1; - INFO(NCCL_REG, "rank %d - IPC registered buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->addr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->addr); + INFO(NCCL_REG, "rank %d - IPC register buffer %p size %ld (baseAddr %p size %ld) to peer %d regAddr %p offsetOut %ld", comm->rank, userbuff, buffSize, (void*)regRecord->begAddr, ipcInfo.size, peerRank, rmtRegAddr, (uintptr_t)userbuff - regRecord->begAddr); } } } @@ -935,7 +936,7 @@ ncclResult_t ret = ncclSuccess; // p2p always returns remote addr here since remote buffer addr is passed in ncclDevWorkP2p struct peerRmtAddrs = (uintptr_t*)regRecord->regIpcAddrs.hostPeerRmtAddrs[peerLocalRank]; } - *offsetOut = (uintptr_t)userbuff - regRecord->addr; + *offsetOut = (uintptr_t)userbuff - regRecord->begAddr; *peerRmtAddrsOut = peerRmtAddrs; } } @@ -1117,6 +1118,88 @@ static ncclResult_t p2pProxyDeregister(struct ncclProxyConnection* connection, s goto exit; } +ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm) { + CUCHECK(cuMemAddressReserve((CUdeviceptr*)&comm->baseUCSymPtr, comm->baseStride * comm->localRanks, NCCL_MAX_PAGE_SIZE, 0, 0)); + return ncclSuccess; +} + +ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm) { + if (comm->baseUCSymPtr) { + CUCHECK(cuMemAddressFree((CUdeviceptr)comm->baseUCSymPtr, comm->baseStride * comm->localRanks)); + } + return ncclSuccess; +} + +ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr) { + ncclResult_t ret = ncclSuccess; + CUmemGenericAllocationHandle impHandle; + int impFd = -1; + ncclCuDesc* desc = NULL; + CUmemAccessDesc accessDesc = {}; + + assert(offset % NCCL_REC_PAGE_SIZE == 0 && size % NCCL_REC_PAGE_SIZE == 0); + NCCLCHECKGOTO(ncclCalloc(&desc, comm->localRanks), ret, fail); + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + memcpy(&desc[comm->localRank].data, &memHandle, sizeof(CUmemGenericAllocationHandle)); + } else { + CUCHECKGOTO(cuMemExportToShareableHandle(&desc[comm->localRank].handle, memHandle, ncclCuMemHandleType, 0), ret, fail); + } + + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, desc, sizeof(ncclCuDesc)), ret, fail); + + // start mapping + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = comm->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + for (int r = 0; r < comm->localRanks; ++r) { + CUdeviceptr maddr; + if (r == comm->localRank) { + impHandle = memHandle; + } else { + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + impFd = -1; + NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, comm->localRankToRank[r], 
&desc[r].data, &impFd), ret, fail); + CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)(uintptr_t)impFd, ncclCuMemHandleType), ret, fail); + SYSCHECKGOTO(close(impFd), "close", ret, fail); + } else { + CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)&desc[r].handle, ncclCuMemHandleType), ret, fail); + } + } + maddr = (CUdeviceptr)(comm->baseUCSymPtr + (size_t)r * comm->baseStride + offset); + CUCHECKGOTO(cuMemMap(maddr, size, 0, impHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(maddr, size, &accessDesc, 1), ret, fail); + + if (r == comm->localRank) { + *symPtr = (void*)maddr; + } else { + CUCHECKGOTO(cuMemRelease(impHandle), ret, fail); + } + } + + INFO(NCCL_ALLOC, "IPC symmetric alloc buffer %p offset %ld size %ld symAllocHead %ld", *symPtr, offset, size, comm->symAllocHead); + +exit: + free(desc); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr) { + ncclResult_t ret = ncclSuccess; + if (comm && symPtr && size > 0) { + size_t offset = (size_t)symPtr - ((size_t)comm->baseUCSymPtr + comm->localRank * comm->baseStride); + for (int r = 0; r < comm->localRanks; ++r) { + CUdeviceptr peerAddr = (CUdeviceptr)(comm->baseUCSymPtr + r * comm->baseStride + offset); + CUCHECKGOTO(cuMemUnmap(peerAddr, size), ret, fail); + } + } +exit: + return ret; +fail: + goto exit; +} + struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, diff --git a/src/transport/profiler.cc b/src/transport/profiler.cc index 3e32843aa..6e7b33c16 100644 --- a/src/transport/profiler.cc +++ b/src/transport/profiler.cc @@ -6,6 +6,7 @@ #include "transport.h" #include "proxy.h" #include "profiler.h" +#include "device.h" static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { connection->proxyAppendPtr = &connection->proxyAppend; @@ -29,15 +30,15 @@ static ncclResult_t profilerProxyProgress(struct ncclProxyState* proxyState, str if (args->state == ncclProxyOpProgress) { for (int s = 0; s < args->nsubs; s++) { struct ncclProxySubArgs* sub = args->subs + s; - uint64_t* workStarted = (uint64_t *)sub->sendbuff; - uint64_t* workCompleted = (uint64_t *)sub->recvbuff; - if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId]) { - ncclProfilerStartKernelChEvent(args, s); + struct ncclDevProfiler* workStarted = (struct ncclDevProfiler *)sub->sendbuff; + struct ncclDevProfiler* workCompleted = (struct ncclDevProfiler *)sub->recvbuff; + if (sub->posted < sub->nsteps && sub->base <= workStarted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].counter) { + ncclProfilerStartKernelChEvent(args, s, workStarted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp); sub->posted = sub->nsteps; continue; // allow events on every channel to start } - if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId]) { - ncclProfilerStopKernelChEvent(args, s); + if (sub->transmitted < sub->nsteps && sub->base <= workCompleted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].counter) { + ncclProfilerStopKernelChEvent(args, s, workCompleted[sub->channelId].data[sub->base%MAX_PROFILER_EVENTS_PER_CHANNEL].timestamp); sub->transmitted = sub->nsteps; args->done++; } diff --git a/src/transport/shm.cc b/src/transport/shm.cc index aa3e6c41b..993570da2 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -10,7 +10,7 @@ #include 
"transport.h" #define SHM_PATH_MAX 128 -#define SHM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR +#define SHM_HANDLE_TYPE ncclCuMemHandleType struct shmBuffInfo { void *hptr; From 3ea7eedf3b9b94f1d9f99f4e55536dfcbd23c1ca Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Wed, 18 Jun 2025 10:34:47 -0700 Subject: [PATCH 12/21] NCCL 2.27.5-1 Improvements for GB200 systems * Optimize the network performance by alternating the direction of the rings and the NIC to GPU assignment across communicators to limit unnecessary sharing. * Fix the detection of C2C links in case GPU Direct RDMA is disabled between a GPU and a NIC. * Fix PXN support on MNNVL systems, where NCCL would try (and fail) to share regular host memory across multiple nodes. * Fix P2C (PXN over C2C), which is now preferred over regular PXN. This support is currently preliminary and is disabled by default; use NCCL_PXN_C2C=1 to enable. Further reduce the overheads of CUDA graph capturing, which increased in NCCL 2.26.2 for large graphs. Optimize the network performance on DGX B200 systems by adjusting the bandwidths provided to the graph search algorithm. Enable fp8 reductions in symmetric kernels on Blackwell with CUDA 12.8. Restore the plugin name handling logic to make it possible to specify a path to the plugin (Issue #1732). Restore the ability to change NCCL_COLLNET_ENABLE during execution (Issue #1741). Add an example tuner plugin with CSV-based overrides. Remove an x86 dependency from the example profiler. --- ext-net/example/Makefile | 21 +- ext-profiler/example/Makefile | 20 +- ext-profiler/example/plugin.c | 22 +- ext-tuner/basic/Makefile | 23 + ext-tuner/basic/nccl/common.h | 15 + ext-tuner/basic/nccl/err.h | 17 + ext-tuner/basic/nccl/tuner.h | 97 ++ ext-tuner/basic/plugin.c | 34 + ext-tuner/example/Makefile | 54 +- ext-tuner/example/README.md | 164 ++++ ext-tuner/example/nccl_tuner.conf | 45 + ext-tuner/example/plugin.c | 433 ++++++++- ext-tuner/example/scripts/README.md | 106 +++ ext-tuner/example/scripts/optimize_config.py | 430 +++++++++ .../scripts/sample_performance_data.csv | 24 + ext-tuner/example/test/Makefile | 30 + ext-tuner/example/test/README.md | 205 +++++ ext-tuner/example/test/test_plugin.c | 856 ++++++++++++++++++ makefiles/common.mk | 10 +- makefiles/version.mk | 2 +- src/device/Makefile | 5 +- src/device/reduce_kernel.h | 2 +- src/device/symmetric/generate.py | 4 +- src/graph/paths.cc | 30 +- src/graph/search.cc | 120 ++- src/graph/topo.cc | 15 +- src/graph/topo.h | 11 +- src/graph/tuning.cc | 11 +- src/init.cc | 14 +- src/misc/mlx5dvsymbols.cc | 3 + src/misc/strongstream.cc | 28 +- src/plugin/plugin_open.cc | 26 +- src/transport/net_ib.cc | 6 +- 33 files changed, 2740 insertions(+), 143 deletions(-) create mode 100644 ext-tuner/basic/Makefile create mode 100644 ext-tuner/basic/nccl/common.h create mode 100644 ext-tuner/basic/nccl/err.h create mode 100644 ext-tuner/basic/nccl/tuner.h create mode 100644 ext-tuner/basic/plugin.c create mode 100644 ext-tuner/example/README.md create mode 100644 ext-tuner/example/nccl_tuner.conf create mode 100644 ext-tuner/example/scripts/README.md create mode 100644 ext-tuner/example/scripts/optimize_config.py create mode 100644 ext-tuner/example/scripts/sample_performance_data.csv create mode 100644 ext-tuner/example/test/Makefile create mode 100644 ext-tuner/example/test/README.md create mode 100644 ext-tuner/example/test/test_plugin.c diff --git a/ext-net/example/Makefile b/ext-net/example/Makefile index e0a6aa619..9cc623e31 100644 --- 
a/ext-net/example/Makefile +++ b/ext-net/example/Makefile @@ -3,15 +3,20 @@ # # See LICENSE.txt for license information # -NCCL_HOME:=../../build/ -CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO:=libnccl-net.so +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -default: $(PLUGIN_SO) +SRC_FILES := $(wildcard *.c) -$(PLUGIN_SO): plugin.c - $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +build: ${BUILDDIR}/libnccl-net-example.so + +${BUILDDIR}/libnccl-net-example.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/libnccl-net-example.so diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile index f5cc9f1d8..777ff5bad 100644 --- a/ext-profiler/example/Makefile +++ b/ext-profiler/example/Makefile @@ -3,14 +3,20 @@ # # See LICENSE.txt for license information # -NCCL_HOME := ../../build -INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO := libnccl-profiler.so +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -default: $(PLUGIN_SO) +SRC_FILES := $(wildcard *.c) -$(PLUGIN_SO): plugin.c event.c print_event.c - $(CXX) $(INC) -g -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +build: ${BUILDDIR}/libnccl-profiler-example.so + +${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/libnccl-profiler-example.so diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.c index e3f707a0a..b89cd4627 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "event.h" #include "print_event.h" @@ -41,22 +41,10 @@ static struct proxyOp* detachPool; ncclDebugLogger_t logFn; #define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) -static double freq = -1; -__hidden void calibrate() { - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t timeCycles = __rdtsc(); - double time = - tv.tv_sec*1e6 - tv.tv_usec; - uint64_t total = 0ULL; - for (int i = 0; i < 10000; i++) total += __rdtsc(); - gettimeofday(&tv, NULL); - timeCycles = __rdtsc() - timeCycles; - time += tv.tv_sec*1e6 + tv.tv_usec; - freq = timeCycles / time; -} - __hidden double gettime(void) { - return __rdtsc() / freq; + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return (t.tv_sec*1e6 + (t.tv_nsec*1e-3)); } static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; @@ -98,8 +86,6 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, // process address space. pid = getpid(); - // calibrate and start timer - calibrate(); startTime = gettime(); } pthread_mutex_unlock(&lock); diff --git a/ext-tuner/basic/Makefile b/ext-tuner/basic/Makefile new file mode 100644 index 000000000..50edd23a7 --- /dev/null +++ b/ext-tuner/basic/Makefile @@ -0,0 +1,23 @@ +# +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# +.DEFAULT_GOAL: build +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . 
+NCCLDIR := $(BUILDDIR) + +SRC_FILES := $(wildcard *.c) +DST_DIR := $(BUILDDIR)/test/unit/plugins + +build: ${BUILDDIR}/libnccl-tuner-basic.so + +${BUILDDIR}/libnccl-tuner-basic.so: ${SRC_FILES} + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl -fPIC -shared -o $@ $^ + +clean: + rm -f ${BUILDDIR}/libnccl-tuner-basic.so diff --git a/ext-tuner/basic/nccl/common.h b/ext-tuner/basic/nccl/common.h new file mode 100644 index 000000000..912925225 --- /dev/null +++ b/ext-tuner/basic/nccl/common.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-tuner/basic/nccl/err.h b/ext-tuner/basic/nccl/err.h new file mode 100644 index 000000000..bb92e8354 --- /dev/null +++ b/ext-tuner/basic/nccl/err.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_ERR_H_ +#define NCCL_ERR_H_ + +/* Error type for plugins */ +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6 } ncclResult_t; + +#endif diff --git a/ext-tuner/basic/nccl/tuner.h b/ext-tuner/basic/nccl/tuner.h new file mode 100644 index 000000000..77b543d12 --- /dev/null +++ b/ext-tuner/basic/nccl/tuner.h @@ -0,0 +1,97 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include +#include + +#include "common.h" +#include "err.h" + +#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now +typedef enum { + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncSendRecv = 5, + ncclFuncSend = 6, + ncclFuncRecv = 7, + ncclNumFuncs = 8 +} ncclFunc_t; + +#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet* +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 + +#define NCCL_ALGO_PROTO_IGNORE -1.0 + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. 
+ // Inputs: + // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // Outputs: + // - context: tuner context object + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. + // Inputs: + // - context: tuner context object + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + // context: tuner context object + ncclResult_t (*destroy)(void* context); +} ncclTuner_v4_t; + +typedef ncclTuner_v4_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" + +#endif diff --git a/ext-tuner/basic/plugin.c b/ext-tuner/basic/plugin.c new file mode 100644 index 000000000..a17fd009e --- /dev/null +++ b/ext-tuner/basic/plugin.c @@ -0,0 +1,34 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "tuner.h" + +#define __hidden __attribute__ ((visibility("hidden"))) + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } + +__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels) { + // Update NCCL core generated cost table. 
Updated table will be evaluated by NCCL to pick the best algo/proto combo + float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; + if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { + table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; + } + *nChannels = 1; + return ncclSuccess; +} + +__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } + +#define PLUGIN_NAME "Basic" + +const ncclTuner_v4_t ncclTunerPlugin_v4 = { + .name = PLUGIN_NAME, + .init = pluginInit, + .getCollInfo = pluginGetCollInfo, + .destroy = pluginDestroy +}; diff --git a/ext-tuner/example/Makefile b/ext-tuner/example/Makefile index 9d9ace484..76c16b60f 100644 --- a/ext-tuner/example/Makefile +++ b/ext-tuner/example/Makefile @@ -3,15 +3,53 @@ # # See LICENSE.txt for license information # -NCCL_HOME:=../../build/ -CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl -PLUGIN_SO:=libnccl-tuner.so -default: $(PLUGIN_SO) +.DEFAULT_GOAL: build +PLUGIN_SO:=libnccl-tuner-example.so +include ../../makefiles/common.mk +SRCDIR ?= $(abspath ../..) +BUILDDIR ?= . +NCCLDIR := $(BUILDDIR) -$(PLUGIN_SO): plugin.c - $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ +SRC_FILES := $(wildcard *.c) +DST_DIR := $(BUILDDIR)/test/unit/plugins + +default: ${BUILDDIR}/$(PLUGIN_SO) + +build: ${BUILDDIR}/$(PLUGIN_SO) + +${BUILDDIR}/$(PLUGIN_SO): plugin.c + @printf "Compiling %-35s > %s\n" $< $@ + @mkdir -p ${BUILDDIR} + $(CC) -Inccl $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + +# Test targets - delegate to test directory +test: + $(MAKE) -C test test TEST_CASE=$(TEST_CASE) + +test-verbose: + $(MAKE) -C test test-verbose TEST_CASE=$(TEST_CASE) + +# Build tests +test-build: + $(MAKE) -C test all + +# Optimize configurations from performance data +optimize-config: + @if [ -z "$(CSV_FILE)" ]; then \ + echo "Usage: make optimize-config CSV_FILE=path/to/data.csv [OUTPUT=config.conf] [METRIC=latency_us]"; \ + echo "Example: make optimize-config CSV_FILE=scripts/sample_performance_data.csv"; \ + exit 1; \ + fi + python3 scripts/optimize_config.py $(CSV_FILE) \ + $(if $(OUTPUT),-o $(OUTPUT)) \ + $(if $(METRIC),-m $(METRIC)) \ + $(if $(SIZE_RANGES),--size-ranges $(SIZE_RANGES)) \ + $(if $(DRY_RUN),--dry-run) \ + $(if $(NO_HEADER),--no-header) clean: - rm -f $(PLUGIN_SO) + rm -f ${BUILDDIR}/$(PLUGIN_SO) + $(MAKE) -C test clean + +.PHONY: test test-verbose test-build optimize-config clean diff --git a/ext-tuner/example/README.md b/ext-tuner/example/README.md new file mode 100644 index 000000000..7f472ae7a --- /dev/null +++ b/ext-tuner/example/README.md @@ -0,0 +1,164 @@ +# NCCL Example Tuner Plugin + +This example plugin shows a practical example of a CSV file-based tuning approach, allowing selective overrides for tuning parameters based on all tuning inputs without recompiling. + +## Features + +- **File-based Configuration**: Read tuning parameters from a CSV configuration file +- **Size-based Tuning**: Specify different configurations based on message size ranges +- **Dimension-aware Tuning**: Match configurations based on number of nodes and ranks +- **Optional Channels Configuration**: Set specific channel counts or use -1 to keep NCCL's default +- **Environment Variable Support**: Specify config file location via `NCCL_TUNER_CONFIG_FILE` +- **Fallback Behavior**: Gracefully handles missing config files and invalid entries + +## Building + +```bash +make +``` + +This will create `libnccl-tuner-example.so` that can be loaded by NCCL. 
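As a quick check that the plugin builds and loads, a minimal run might look like the sketch below. It assumes the library was just built in the current directory and that `your_nccl_application` stands in for any NCCL program; with `NCCL_DEBUG=INFO` the log should mention the tuner plugin once it is picked up (see the Usage and Logging sections further down for the settings this relies on).

```bash
# Minimal sketch -- paths and the application name are placeholders.
# Make the freshly built plugin visible to the dynamic loader and
# enable INFO logging so the tuner load message shows up.
export LD_LIBRARY_PATH=$PWD:$LD_LIBRARY_PATH
export NCCL_DEBUG=INFO
mpirun -np 4 ./your_nccl_application 2>&1 | grep -i tuner
```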
+ +## Configuration File Format + +The configuration file uses CSV (Comma-Separated Values) format with one configuration per line: + +``` +collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +``` + +### Parameters + +- **collective_type**: The collective operation type + - `broadcast`, `reduce`, `allgather`, `reducescatter`, `allreduce` + +- **min_bytes/max_bytes**: The message size range (in bytes) for which this config applies + - Use `0` for minimum and `4294967295` for maximum (covers all sizes) + +- **algorithm**: The NCCL algorithm to use + - `tree`, `ring`, `collnet_direct`, `collnet_chain`, `nvls`, `nvls_tree`, `pat` + +- **protocol**: The NCCL protocol to use + - `ll`, `ll128`, `simple` + +- **channels**: Number of channels (SMs) to use + - Use a positive integer to specify exact channel count + - Use `-1` to keep NCCL's default channel selection + +- **nNodes**: Number of nodes to match + - Use a positive integer to match specific node count + - Use `-1` to match any number of nodes + +- **nRanks**: Number of ranks to match + - Use a positive integer to match specific rank count + - Use `-1` to match any number of ranks + +- **numPipeOps**: Number of pipeline operations to match (optional) + - Use a positive integer to match specific pipeline operation count + - Use `-1` to match any number of pipeline operations + - If omitted, configuration will match any numPipeOps value + +- **regBuff**: Whether user buffer can be registered (optional) + - Use `0` to match only non-registered buffers + - Use `1` to match only registered buffers + - Use `-1` to match either registered or non-registered buffers + - If omitted, configuration will match any regBuff value + +### Example Configuration + +```csv +# Single-node, small allreduce: use tree algorithm, registered buffers only +allreduce,0,65536,tree,simple,2,1,-1,-1,1 + +# 4-node, 32-rank setup: medium allreduce, single pipeline op, non-registered buffers +allreduce,65537,1048576,ring,simple,4,4,32,1,0 + +# Any topology: large allreduce with LL128, multiple pipeline ops, any buffer type +allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1 + +# Single-node broadcast: prefer tree, any pipeOps, registered buffers (backward compatible) +broadcast,0,32768,tree,simple,-1,1,-1 + +# Multi-node broadcast: optimized for non-registered buffers, single pipeline op +broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0 +``` + +Comments start with `#` and empty lines are ignored. The CSV format makes it easy to edit configurations in spreadsheet applications like Excel, Google Sheets, or LibreOffice Calc. + +### Backward Compatibility + +Configurations without the numPipeOps and/or regBuff parameters are fully supported: +- 8 fields: matches any numPipeOps and regBuff values +- 9 fields: matches any regBuff value +- 10 fields: full parameter specification + +This ensures existing configuration files continue to work without modification. + +## Usage + +### Method 1: Default Config File +Place your configuration in `nccl_tuner.conf` in the current working directory. + +### Method 2: Environment Variable +Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file path: + +```bash +export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf +export LD_LIBRARY_PATH=/path/to/plugin:$LD_LIBRARY_PATH +mpirun -np 4 your_nccl_application +``` + +## Editing Configuration Files + +### Generating Configuration Files from Raw Data + +A python script to generate valid CSV configs has been provided. 
[Using optimize_config.py](scripts/README.md). + +### Spreadsheet Tips: +- Use column headers: `collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff` +- Save as CSV format (not Excel format) for the plugin to read +- Use data validation to prevent typos in algorithm/protocol names + +## Logging + +The plugin uses NCCL's logging system. To see tuner-related messages: + +```bash +export NCCL_DEBUG=INFO +``` + +This will show when configurations are loaded and applied, including the topology information. + +For detailed debugging output during tuning decisions: + +```bash +export NCCL_DEBUG=TRACE +``` + +This will show verbose information about which configurations are being evaluated and matched. + +## Dimension Matching + +Configurations are only applied when the topology matches: + +- **Exact Match**: Configuration specifies `nNodes=4,nRanks=32`, only applied when communicator has exactly 4 nodes and 32 ranks +- **Wildcard Nodes**: Configuration specifies `nNodes=-1,nRanks=8`, applied to any topology with exactly 8 ranks +- **Wildcard Ranks**: Configuration specifies `nNodes=2,nRanks=-1`, applied to any 2-node topology regardless of ranks per node +- **Wildcard Both**: Configuration specifies `nNodes=-1,nRanks=-1`, applied to any topology + +This allows you to create specialized configurations for different cluster setups while maintaining flexibility. + +## Default Behavior + +If no configuration file is found or no matching configuration exists for a collective operation, the plugin falls back to preferring the ring algorithm with simple protocol. All configured algorithm/protocol combinations are given a low cost (0.0) to make them preferred by NCCL's selection logic. + +When channels is set to `-1`, NCCL's default channel selection logic is preserved, allowing the system to automatically determine the optimal number of channels based on hardware and message size. + +## Troubleshooting + +1. **Config file not found**: Check the file path and permissions +2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters +3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory +4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO` +5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards +6. 
**CSV parsing errors**: Ensure no spaces after commas, or quote fields containing spaces diff --git a/ext-tuner/example/nccl_tuner.conf b/ext-tuner/example/nccl_tuner.conf new file mode 100644 index 000000000..13eb2f081 --- /dev/null +++ b/ext-tuner/example/nccl_tuner.conf @@ -0,0 +1,45 @@ +# NCCL Tuner Configuration File (CSV Format) +# Format: collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +# +# Collective types: broadcast, reduce, allgather, reducescatter, allreduce +# Algorithms: tree, ring, collnet_direct, collnet_chain, nvls, nvls_tree, pat +# Protocols: ll, ll128, simple +# Channels: number of channels to use, or -1 to keep default +# nNodes: number of nodes to match, or -1 for any number of nodes +# nRanks: number of ranks to match, or -1 for any number of ranks +# numPipeOps: number of pipeline operations to match, or -1 for any number (optional) +# regBuff: whether user buffer can be registered (0=no, 1=yes, -1=any) (optional) +# +# Note: numPipeOps and regBuff parameters are optional - configurations without them will match any value +# +# Examples: + +# For single-node configurations with registered buffers +# Small allreduce operations on single node - use tree algorithm, registered buffers +allreduce,0,65536,tree,simple,2,1,-1,-1,1 + +# For multi-node configurations with 4 nodes, 32 total ranks, single pipeline op, non-registered buffers +# Medium allreduce operations - use ring algorithm +allreduce,65537,1048576,ring,simple,4,4,32,1,0 + +# For any topology - large allreduce operations with LL128 protocol, multiple pipeline ops, any buffer type +allreduce,1048577,4294967295,ring,ll128,-1,-1,-1,4,-1 + +# Broadcast operations - different configs for different topologies, pipeline complexity, and buffer types +# Single node broadcast - prefer tree, any pipeOps, registered buffers only +broadcast,0,32768,tree,simple,-1,1,-1,-1,1 + +# Multi-node broadcast with single pipeline operation, non-registered buffers - use ring +broadcast,32769,4294967295,ring,simple,2,-1,-1,1,0 + +# AllGather operations - optimized for 2-node configurations, any pipeOps, any buffer type +allgather,0,4294967295,ring,simple,4,2,-1 + +# ReduceScatter operations +# Small messages on single node, single pipeline op, registered buffers +reducescatter,0,131072,tree,simple,2,1,-1,1,1 +# Large messages on any topology, multiple pipeline ops, non-registered buffers +reducescatter,131073,4294967295,ring,simple,-1,-1,-1,2,0 + +# Reduce operations - any topology, keep default channels, any pipeOps, any buffer type +reduce,0,4294967295,tree,simple,-1,-1,-1 diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index 7925dcfa1..1b8031ed1 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -5,24 +5,443 @@ ************************************************************************/ #include "tuner.h" +#include +#include +#include #define __hidden __attribute__ ((visibility("hidden"))) +#define MAX_LINE_LENGTH 256 -__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { return ncclSuccess; } +// CSV field indices for configuration parsing +// Format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff +#define CONFIG_FIELD_COLLTYPE 0 +#define CONFIG_FIELD_MINBYTES 1 +#define CONFIG_FIELD_MAXBYTES 2 +#define CONFIG_FIELD_ALGORITHM 3 +#define CONFIG_FIELD_PROTOCOL 4 +#define CONFIG_FIELD_CHANNELS 5 +#define CONFIG_FIELD_NNODES 6 +#define 
CONFIG_FIELD_NRANKS 7 +#define CONFIG_FIELD_PIPEOPS 8 // Optional field +#define CONFIG_FIELD_REGBUFF 9 // Optional field + +// Field count constants +#define CONFIG_FIELDS_REQUIRED 8 // Minimum required fields (up to nRanks) +#define CONFIG_FIELDS_WITH_PIPEOPS 9 // Fields including numPipeOps +#define CONFIG_FIELDS_WITH_REGBUFF 10 // Fields including both numPipeOps and regBuff +#define CONFIG_FIELDS_MAX 10 // Maximum number of fields supported + +typedef struct { + ncclFunc_t collType; + size_t minBytes; + size_t maxBytes; + int algorithm; + int protocol; + int nChannels; + int nNodes; + int nRanks; + int numPipeOps; + int regBuff; +} TuningConfig; + +typedef struct { + TuningConfig* configs; // Changed from static array to dynamic pointer + int numConfigs; + int maxConfigs; // Added to track allocated size + size_t nRanks; + size_t nNodes; + ncclDebugLogger_t logFunction; +} TunerContext; + +// Parse collective type from string +static ncclFunc_t parseCollType(const char* str) { + if (strcmp(str, "broadcast") == 0) return ncclFuncBroadcast; + if (strcmp(str, "reduce") == 0) return ncclFuncReduce; + if (strcmp(str, "allgather") == 0) return ncclFuncAllGather; + if (strcmp(str, "reducescatter") == 0) return ncclFuncReduceScatter; + if (strcmp(str, "allreduce") == 0) return ncclFuncAllReduce; + return ncclFuncAllReduce; // default +} + +// Convert collective type to string +static const char* collTypeToString(ncclFunc_t collType) { + switch (collType) { + case ncclFuncBroadcast: return "broadcast"; + case ncclFuncReduce: return "reduce"; + case ncclFuncAllGather: return "allgather"; + case ncclFuncReduceScatter: return "reducescatter"; + case ncclFuncAllReduce: return "allreduce"; + default: return "unknown"; + } +} + +// Parse algorithm from string +static int parseAlgorithm(const char* str) { + if (strcmp(str, "tree") == 0) return NCCL_ALGO_TREE; + if (strcmp(str, "ring") == 0) return NCCL_ALGO_RING; + if (strcmp(str, "collnet_direct") == 0) return NCCL_ALGO_COLLNET_DIRECT; + if (strcmp(str, "collnet_chain") == 0) return NCCL_ALGO_COLLNET_CHAIN; + if (strcmp(str, "nvls") == 0) return NCCL_ALGO_NVLS; + if (strcmp(str, "nvls_tree") == 0) return NCCL_ALGO_NVLS_TREE; + if (strcmp(str, "pat") == 0) return NCCL_ALGO_PAT; + return NCCL_ALGO_RING; // default +} + +// Convert algorithm to string +static const char* algorithmToString(int algorithm) { + switch (algorithm) { + case NCCL_ALGO_TREE: return "tree"; + case NCCL_ALGO_RING: return "ring"; + case NCCL_ALGO_COLLNET_DIRECT: return "collnet_direct"; + case NCCL_ALGO_COLLNET_CHAIN: return "collnet_chain"; + case NCCL_ALGO_NVLS: return "nvls"; + case NCCL_ALGO_NVLS_TREE: return "nvls_tree"; + case NCCL_ALGO_PAT: return "pat"; + default: return "unknown"; + } +} + +// Parse protocol from string +static int parseProtocol(const char* str) { + if (strcmp(str, "ll") == 0) return NCCL_PROTO_LL; + if (strcmp(str, "ll128") == 0) return NCCL_PROTO_LL128; + if (strcmp(str, "simple") == 0) return NCCL_PROTO_SIMPLE; + return NCCL_PROTO_SIMPLE; // default +} + +// Convert protocol to string +static const char* protocolToString(int protocol) { + switch (protocol) { + case NCCL_PROTO_LL: return "ll"; + case NCCL_PROTO_LL128: return "ll128"; + case NCCL_PROTO_SIMPLE: return "simple"; + default: return "unknown"; + } +} + +// Helper function to count valid configuration lines in file +static int countConfigLines(const char* filename) { + FILE* file = fopen(filename, "r"); + if (!file) { + return 0; + } + + char line[MAX_LINE_LENGTH]; + int count = 0; + + while 
(fgets(line, sizeof(line), file)) { + // Skip comments and empty lines + if (line[0] == '#' || line[0] == '\n') continue; + + // Remove trailing newline + line[strcspn(line, "\n")] = 0; + + // Check if line has content + if (strlen(line) > 0) { + count++; + } + } + + fclose(file); + return count; +} + +// Load configuration from file +static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) { + FILE* file = fopen(filename, "r"); + if (!file) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config file %s not found, using defaults", filename); + } + return ncclSuccess; // Not finding config file is not an error + } + + // First pass: count valid configuration lines + int configCount = countConfigLines(filename); + if (configCount == 0) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: No valid configurations found in %s", filename); + } + fclose(file); + return ncclSuccess; + } + + // Allocate memory for configurations based on actual count + ctx->configs = (TuningConfig*)malloc(configCount * sizeof(TuningConfig)); + if (!ctx->configs) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Failed to allocate memory for %d configurations", configCount); + } + fclose(file); + return ncclSystemError; + } + + ctx->maxConfigs = configCount; + ctx->numConfigs = 0; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Allocated memory for %d configurations", configCount); + } + + // Reset file pointer to beginning + fseek(file, 0, SEEK_SET); + + char line[MAX_LINE_LENGTH]; + + while (fgets(line, sizeof(line), file) && ctx->numConfigs < ctx->maxConfigs) { + // Skip comments and empty lines + if (line[0] == '#' || line[0] == '\n') continue; + + // Remove trailing newline + line[strcspn(line, "\n")] = 0; + + // Parse CSV format: colltype,minbytes,maxbytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff + char* token; + char* tokens[CONFIG_FIELDS_MAX]; + int tokenCount = 0; + + // Make a copy of the line for tokenizing + char lineCopy[MAX_LINE_LENGTH]; + strncpy(lineCopy, line, sizeof(lineCopy)); + lineCopy[sizeof(lineCopy) - 1] = '\0'; + + // Tokenize by comma + token = strtok(lineCopy, ","); + while (token != NULL && tokenCount < CONFIG_FIELDS_MAX) { + // Trim whitespace + while (*token == ' ' || *token == '\t') token++; + char* end = token + strlen(token) - 1; + while (end > token && (*end == ' ' || *end == '\t')) { + *end = '\0'; + end--; + } + tokens[tokenCount++] = token; + token = strtok(NULL, ","); + } + + // Validate field count: support required fields (8), with pipeOps (9), or with regBuff (10) + if (tokenCount >= CONFIG_FIELDS_REQUIRED && tokenCount <= CONFIG_FIELDS_MAX) { + TuningConfig* config = &ctx->configs[ctx->numConfigs]; + config->collType = parseCollType(tokens[CONFIG_FIELD_COLLTYPE]); + config->minBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MINBYTES], NULL, 10); + config->maxBytes = (size_t)strtoull(tokens[CONFIG_FIELD_MAXBYTES], NULL, 10); + config->algorithm = parseAlgorithm(tokens[CONFIG_FIELD_ALGORITHM]); + config->protocol = parseProtocol(tokens[CONFIG_FIELD_PROTOCOL]); + config->nChannels = atoi(tokens[CONFIG_FIELD_CHANNELS]); + config->nNodes = atoi(tokens[CONFIG_FIELD_NNODES]); + config->nRanks = atoi(tokens[CONFIG_FIELD_NRANKS]); + + // numPipeOps is optional (9th field, index 8) 
+ if (tokenCount >= CONFIG_FIELDS_WITH_PIPEOPS) { + config->numPipeOps = atoi(tokens[CONFIG_FIELD_PIPEOPS]); + } else { + config->numPipeOps = -1; // -1 means match any numPipeOps + } + + // regBuff is optional (10th field, index 9) + if (tokenCount >= CONFIG_FIELDS_WITH_REGBUFF) { + config->regBuff = atoi(tokens[CONFIG_FIELD_REGBUFF]); + } else { + config->regBuff = -1; // -1 means match any regBuff value + } + + ctx->numConfigs++; + + if (ctx->logFunction) { + if (config->numPipeOps == -1 && config->regBuff == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=any", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks); + } else if (config->regBuff == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=any", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->numPipeOps); + } else if (config->numPipeOps == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=any regBuff=%d", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->regBuff); + } else { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded config: %s [%zu-%zu] %s/%s channels=%d nodes=%d ranks=%d pipeOps=%d regBuff=%d", + tokens[CONFIG_FIELD_COLLTYPE], config->minBytes, config->maxBytes, + tokens[CONFIG_FIELD_ALGORITHM], tokens[CONFIG_FIELD_PROTOCOL], + config->nChannels, config->nNodes, config->nRanks, config->numPipeOps, config->regBuff); + } + } + } + } + + fclose(file); + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Loaded %d tuning configurations from %s", ctx->numConfigs, filename); + } + return ncclSuccess; +} + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { + TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext)); + if (!ctx) return ncclSystemError; + + ctx->configs = NULL; // Initialize to NULL + ctx->numConfigs = 0; + ctx->maxConfigs = 0; // Initialize to 0 + ctx->nRanks = nRanks; + ctx->nNodes = nNodes; + ctx->logFunction = logFunction; + + if (logFunction) { + logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks); + } + + // Try to load config file from environment variable or default location + const char* configFile = getenv("NCCL_TUNER_CONFIG_FILE"); + if (!configFile) { + configFile = "nccl_tuner.conf"; // default config file name + } + + ncclResult_t result = loadConfig(ctx, configFile); + if (result != ncclSuccess) { + if (ctx->configs) { + free(ctx->configs); // Clean up allocated memory on error + } + free(ctx); + return result; + } + + *context = ctx; + return ncclSuccess; +} __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, 
size_t nBytes, int numPipeOps, float** collCostTable, int numAlgo, int numProto, int regBuff, int* nChannels) { - // Update NCCL core generated cost table. Updated table will be evaluated by NCCL to pick the best algo/proto combo - float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; - if (table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) { - table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; - } + TunerContext* ctx = (TunerContext*)context; + if (!ctx) return ncclInternalError; + + // Default channels *nChannels = 1; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: pluginGetCollInfo called - collType=%s, nBytes=%zu, numPipeOps=%d, regBuff=%d, numConfigs=%d", + collTypeToString(collType), nBytes, numPipeOps, regBuff, ctx->numConfigs); + } + + // Look for matching configuration + for (int i = 0; i < ctx->numConfigs; i++) { + TuningConfig* config = &ctx->configs[i]; + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Checking config %d - collType=%s, minBytes=%zu, maxBytes=%zu, algo=%s, proto=%s, nNodes=%d, nRanks=%d, numPipeOps=%d, regBuff=%d", + i, collTypeToString(config->collType), config->minBytes, config->maxBytes, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nNodes, config->nRanks, config->numPipeOps, config->regBuff); + } + + // Check if this config matches the current collective, size range, topology, pipeline ops, and regBuff + if (config->collType == collType && + nBytes >= config->minBytes && + nBytes <= config->maxBytes && + (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes) && + (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks) && + (config->numPipeOps == -1 || config->numPipeOps == numPipeOps) && + (config->regBuff == -1 || config->regBuff == regBuff)) { + + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config matches. 
Applying algo=%s, proto=%s, channels=%d", + algorithmToString(config->algorithm), protocolToString(config->protocol), config->nChannels); + } + + // Check bounds + if (config->algorithm < numAlgo && config->protocol < numProto) { + if (collCostTable[config->algorithm][config->protocol] != NCCL_ALGO_PROTO_IGNORE) { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_TRACE, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Setting cost table[%s][%s] (%p) = 0.0 (was %.1f)", + algorithmToString(config->algorithm), protocolToString(config->protocol), + &collCostTable[config->algorithm][config->protocol], collCostTable[config->algorithm][config->protocol]); + } + collCostTable[config->algorithm][config->protocol] = 0.0; // Set low cost to prefer this configuration + + // Only override channels if not set to -1 (keep default) + if (config->nChannels != -1) { + *nChannels = config->nChannels; + } + + if (ctx->logFunction) { + if (config->nChannels == -1) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=default (nodes=%d, ranks=%d)", + collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nNodes, config->nRanks); + } else { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Applied config for collType=%s, bytes=%zu, pipeOps=%d, regBuff=%d: algo=%s, proto=%s, channels=%d (nodes=%d, ranks=%d)", + collTypeToString(config->collType), nBytes, numPipeOps, regBuff, algorithmToString(config->algorithm), protocolToString(config->protocol), + config->nChannels, config->nNodes, config->nRanks); + } + } + return ncclSuccess; + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Algorithm/protocol combination [%s][%s] is marked as IGNORE", + algorithmToString(config->algorithm), protocolToString(config->protocol)); + } + } + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Algorithm/protocol out of bounds - algo=%s (max %d), proto=%s (max %d)", + algorithmToString(config->algorithm), numAlgo, protocolToString(config->protocol), numProto); + } + } + } else { + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: Config does not match - collType match=%d, size match=%d, nodes match=%d, ranks match=%d, pipeOps match=%d, regBuff match=%d", + config->collType == collType, + (nBytes >= config->minBytes && nBytes <= config->maxBytes), + (config->nNodes == -1 || config->nNodes == (int)ctx->nNodes), + (config->nRanks == -1 || config->nRanks == (int)ctx->nRanks), + (config->numPipeOps == -1 || config->numPipeOps == numPipeOps), + (config->regBuff == -1 || config->regBuff == regBuff)); + } + } + } + + // If no specific config found, apply default behavior + if (ctx->logFunction) { + ctx->logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, + "TUNER/ExamplePlugin: No matching config found"); + } + return ncclSuccess; } -__hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } +__hidden ncclResult_t pluginDestroy(void* context) { + if (context) { + TunerContext* ctx = (TunerContext*)context; + if (ctx->configs) { + free(ctx->configs); // Free dynamically allocated configs array + } + free(context); + } + return 
ncclSuccess; +} #define PLUGIN_NAME "Example" diff --git a/ext-tuner/example/scripts/README.md b/ext-tuner/example/scripts/README.md new file mode 100644 index 000000000..d31de4354 --- /dev/null +++ b/ext-tuner/example/scripts/README.md @@ -0,0 +1,106 @@ +# NCCL Tuner Configuration Scripts + +This directory contains scripts for optimizing NCCL tuner configurations based on performance data. + +## optimize_config.py + +A Python script that reads performance data from CSV files and generates optimal NCCL tuner configurations. + +### Usage + +```bash +python scripts/optimize_config.py [options] +``` + +### Options + +- `-o, --output FILE`: Output NCCL tuner config file (default: `nccl_tuner.conf`) +- `-m, --metric METRIC`: Optimization metric (`cost_metric`, `bandwidth_gbps`, `latency_us`) +- `--no-header`: Don't add header comments to output file +- `--dry-run`: Print configurations without writing to file + +### CSV Input Format + +The input CSV file should have the following columns: + +```csv +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us +``` + +**Required columns:** +- `collective`: NCCL collective type (`allreduce`, `broadcast`, `reduce`, etc.) +- `size_bytes`: Message size in bytes +- `algorithm`: NCCL algorithm (`tree`, `ring`, `nvls`, etc.) +- `protocol`: NCCL protocol (`simple`, `ll`, `ll128`) +- `channels`: Number of channels (or `-1` for default) +- `nodes`: Number of nodes (or `-1` for any) +- `ranks`: Number of ranks (or `-1` for any) +- `pipeOps`: Number of pipeline operations (or `-1` for any) +- `regBuff`: Registered buffer flag (`0`, `1`, or `-1` for any) + +**Optional metrics (must have at least one present):** +- `bandwidth_gbps`: Bandwidth in GB/s (higher is better) +- `latency_us`: Latency in microseconds (lower is better) + +### Examples + +**Basic usage with cost optimization:** +```bash +python scripts/optimize_config.py sample_performance_data.csv +``` + +**Optimize for bandwidth and write to custom file:** +```bash +python scripts/optimize_config.py -m bandwidth_gbps -o my_tuner.conf performance_data.csv +``` + +**Preview configurations without writing:** +```bash +python scripts/optimize_config.py --dry-run performance_data.csv +``` + +### How It Works + +1. **Data Loading**: Reads CSV performance data and validates format +2. **Grouping**: Groups data by collective type, topology (nodes/ranks), and other parameters +3. **Size Ranges**: Automatically bins data into size ranges for optimization +4. **Optimization**: Finds the best performing configuration for each group/size combination +5. **Output**: Generates NCCL tuner config format and appends to specified file + +### Default Size Ranges + +The script uses these default size ranges (in bytes): +- Small: 0 - 1,024 +- Medium: 1,025 - 65,536 +- Large: 65,537 - 1,048,576 +- XLarge: 1,048,577 - 16,777,216 +- XXLarge: 16,777,217 - 4,294,967,295 + +### Sample Data + +See `sample_performance_data.csv` for an example of the expected input format. + +### Integration with NCCL + +The generated configuration file can be used directly with the NCCL tuner plugin: + +```bash +export NCCL_TUNER_CONFIG_FILE=/path/to/optimized_config.conf +export NCCL_TUNER_PLUGIN=/path/to/libnccl-tuner.so +mpirun -np 8 your_nccl_application +``` + +### Performance Data Collection + +To collect performance data for optimization, you can: + +1. **Use NCCL benchmarks** with different algorithm/protocol combinations +2. 
**Profile your applications** with various tuner settings +3. **Run systematic sweeps** across parameter combinations +4. **Use NCCL debug output** to collect timing information + +The key is to have comprehensive data covering: +- Different message sizes (small to large) +- Various topologies (single node, multi-node) +- All relevant algorithm/protocol combinations +- Different channel counts and pipeline configurations diff --git a/ext-tuner/example/scripts/optimize_config.py b/ext-tuner/example/scripts/optimize_config.py new file mode 100644 index 000000000..c5c9b7085 --- /dev/null +++ b/ext-tuner/example/scripts/optimize_config.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 +""" +NCCL Tuner Configuration Optimizer + +Reads a CSV file containing performance data across different tuning parameters +and generates optimal NCCL tuner configurations based on the best performing +combinations. + +By default, creates growing size ranges that interpolate between the actual data sizes +for each unique dimension (node count, rank count combination). This ensures that +different cluster configurations get their own optimized size boundaries, as +performance characteristics often vary significantly between topologies. + +Each dimension gets its own set of ranges starting from 0 and extending to the maximum +size for that dimension, with boundaries at midpoints between consecutive data sizes. + +CSV Input Format: +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,bandwidth_gbps,latency_us + +Output Format (NCCL Tuner Config): +collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff + +Usage Examples: + # Auto-create dimension-specific interpolated ranges (default) + python3 optimize_config.py data.csv + + # Use custom size ranges (applied to all topologies) + python3 optimize_config.py data.csv --size-ranges "0-1024,1025-65536,65537-1048576" + + # Use hardcoded default ranges (applied to all topologies) + python3 optimize_config.py data.csv --no-auto-ranges +""" + +import csv +import argparse +import sys +import os +from collections import defaultdict +from typing import Dict, List, Tuple, Any + +class PerformanceData: + def __init__(self, row: Dict[str, str]): + self.collective = row['collective'] + self.size_bytes = int(row['size_bytes']) + self.algorithm = row['algorithm'] + self.protocol = row['protocol'] + self.channels = int(row['channels']) if row['channels'] != '-1' else -1 + self.nodes = int(row['nodes']) if row['nodes'] != '-1' else -1 + self.ranks = int(row['ranks']) if row['ranks'] != '-1' else -1 + self.pipeOps = int(row['pipeOps']) if row['pipeOps'] != '-1' else -1 + self.regBuff = int(row['regBuff']) if row['regBuff'] != '-1' else -1 + + # Performance metrics + self.bandwidth_gbps = float(row.get('bandwidth_gbps', 0)) # Higher is better + self.latency_us = float(row.get('latency_us', 0)) # Lower is better + + def get_config_key(self) -> Tuple: + """Generate a key for grouping similar configurations""" + return (self.collective, self.nodes, self.ranks, self.pipeOps, self.regBuff) + + def get_size_range_key(self, topology_size_ranges: Dict[Tuple[int, int], List[Tuple[int, int]]]) -> Tuple[int, int]: + """Find which size range this data point belongs to for its dimension""" + topology_key = (self.nodes, self.ranks) + + # Get size ranges for this dimension, or fall back to default + if topology_key in topology_size_ranges: + size_ranges = topology_size_ranges[topology_key] + elif (-1, -1) in topology_size_ranges: + 
size_ranges = topology_size_ranges[(-1, -1)] + else: + # Fallback to first available dimension ranges + size_ranges = next(iter(topology_size_ranges.values())) + + for min_size, max_size in size_ranges: + if min_size <= self.size_bytes <= max_size: + return (min_size, max_size) + # If no range found, create a single-point range + return (self.size_bytes, self.size_bytes) + +class ConfigOptimizer: + def __init__(self, optimization_metric: str = 'latency_us'): + self.optimization_metric = optimization_metric + # Default size ranges - will be overridden by auto-detection + self.size_ranges = [ + (0, 1024), + (1025, 64*1024), + (64*1024+1, 1024*1024), + (1024*1024+1, 16*1024*1024), + (16*1024*1024+1, 4*1024*1024*1024-1) + ] + self.auto_size_ranges = True + + def set_size_ranges(self, ranges: List[Tuple[int, int]]): + """Set custom size ranges for optimization""" + self.size_ranges = ranges + self.auto_size_ranges = False + + def auto_determine_size_ranges(self, data: List[PerformanceData]) -> Dict[Tuple[int, int], List[Tuple[int, int]]]: + """Create growing size ranges for each unique (nodes, ranks) dimension""" + if not data: + return {(-1, -1): self.size_ranges} + + # Group data by dimension (nodes, ranks) + topology_data = defaultdict(list) + for item in data: + topology_key = (item.nodes, item.ranks) + topology_data[topology_key].append(item) + + topology_ranges = {} + + for topology_key, items in topology_data.items(): + nodes, ranks = topology_key + + # Extract unique sizes for this dimension and sort them + unique_sizes = sorted(set(item.size_bytes for item in items)) + + if len(unique_sizes) <= 1: + # Only one size, create a single range from 0 to that size + size = unique_sizes[0] if unique_sizes else 0 + ranges = [(0, size)] + else: + # Create growing ranges that interpolate between data points + ranges = [] + + for i, size in enumerate(unique_sizes): + if i == 0: + # First range: 0 to midpoint between first and second size + if len(unique_sizes) > 1: + next_size = unique_sizes[i + 1] + max_size = (size + next_size) // 2 + else: + max_size = size + min_size = 0 + elif i == len(unique_sizes) - 1: + # Last range: previous max + 1 to current size (and beyond) + min_size = ranges[-1][1] + 1 + max_size = size + else: + # Intermediate ranges: previous max + 1 to midpoint with next size + min_size = ranges[-1][1] + 1 + next_size = unique_sizes[i + 1] + max_size = (size + next_size) // 2 + + ranges.append((min_size, max_size)) + + topology_ranges[topology_key] = ranges + + print(f"Dimension {nodes} nodes, {ranks} ranks: {len(ranges)} size ranges from {len(unique_sizes)} unique sizes:") + for i, (min_size, max_size) in enumerate(ranges): + # Count data points that fall in this range for this dimension + count = sum(1 for item in items if min_size <= item.size_bytes <= max_size) + actual_sizes = sorted(set(item.size_bytes for item in items if min_size <= item.size_bytes <= max_size)) + if actual_sizes: + size_list = ', '.join(f"{s:,}" for s in actual_sizes[:3]) + if len(actual_sizes) > 3: + size_list += f", ... 
(+{len(actual_sizes)-3} more)" + print(f" Range {i+1}: {min_size:,} - {max_size:,} bytes ({count} data points, sizes: {size_list})") + + return topology_ranges + + def load_data(self, csv_file: str) -> List[PerformanceData]: + """Load performance data from CSV file""" + data = [] + try: + with open(csv_file, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + try: + data.append(PerformanceData(row)) + except (ValueError, KeyError) as e: + print(f"Warning: Skipping invalid row: {row} - {e}") + except FileNotFoundError: + print(f"Error: File {csv_file} not found") + sys.exit(1) + except Exception as e: + print(f"Error reading {csv_file}: {e}") + sys.exit(1) + + print(f"Loaded {len(data)} performance data points") + + # Auto-determine size ranges if enabled + if self.auto_size_ranges and data: + self.topology_size_ranges = self.auto_determine_size_ranges(data) + else: + # Use default ranges for all topologies + self.topology_size_ranges = {(-1, -1): self.size_ranges} + + return data + + def is_better(self, new_data: PerformanceData, current_best: PerformanceData) -> bool: + """Determine if new_data is better than current_best""" + if self.optimization_metric == 'bandwidth_gbps': + return new_data.bandwidth_gbps > current_best.bandwidth_gbps + elif self.optimization_metric == 'latency_us': + return new_data.latency_us < current_best.latency_us + else: + # Default to latency + return new_data.latency_us < current_best.latency_us + + def optimize_configurations(self, data: List[PerformanceData]) -> List[str]: + """Find optimal configurations and return as NCCL config strings""" + # Group data by configuration key and size range + grouped_data = defaultdict(lambda: defaultdict(list)) + + for item in data: + config_key = item.get_config_key() + size_range = item.get_size_range_key(self.topology_size_ranges) + grouped_data[config_key][size_range].append(item) + + # Store optimal configurations before combining ranges + optimal_configs = [] + + for config_key, size_ranges_dict in grouped_data.items(): + collective, nodes, ranks, pipeOps, regBuff = config_key + + for (min_size, max_size), items in size_ranges_dict.items(): + if not items: + continue + + # Find the best performing configuration for this size range + best_item = items[0] + for item in items[1:]: + if self.is_better(item, best_item): + best_item = item + + # Store the optimal configuration with its range + optimal_configs.append({ + 'collective': collective, + 'min_size': min_size, + 'max_size': max_size, + 'algorithm': best_item.algorithm, + 'protocol': best_item.protocol, + 'channels': best_item.channels, + 'nodes': best_item.nodes, + 'ranks': best_item.ranks, + 'pipeOps': best_item.pipeOps, + 'regBuff': best_item.regBuff, + 'metric_value': getattr(best_item, self.optimization_metric) + }) + + # Combine sequential ranges with identical tunings + combined_configs = self.combine_sequential_ranges(optimal_configs) + + # Generate config strings + configs = [] + for config in combined_configs: + config_str = f"{config['collective']},{config['min_size']},{config['max_size']},{config['algorithm']},{config['protocol']},{config['channels']},{config['nodes']},{config['ranks']},{config['pipeOps']},{config['regBuff']}" + configs.append(config_str) + + print(f"Optimal for {config['collective']} [{config['min_size']}-{config['max_size']}] nodes={config['nodes']} ranks={config['ranks']}: " + f"{config['algorithm']}/{config['protocol']} channels={config['channels']} " + f"({self.optimization_metric}={config['metric_value']:.3f})") + + 
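+        # Each generated entry is one CSV line in the same 10-field format that
+        # the example plugin's loadConfig() parses, so the output file can be
+        # pointed to directly via NCCL_TUNER_CONFIG_FILE.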
return configs + + def combine_sequential_ranges(self, configs: List[Dict]) -> List[Dict]: + """Combine sequential ranges that have identical tuning parameters""" + if not configs: + return configs + + # Group by collective and topology (nodes, ranks) + topology_groups = defaultdict(list) + for config in configs: + topology_key = (config['collective'], config['nodes'], config['ranks'], + config['pipeOps'], config['regBuff']) + topology_groups[topology_key].append(config) + + combined_configs = [] + + for topology_key, topology_configs in topology_groups.items(): + # Sort by min_size to ensure proper ordering + topology_configs.sort(key=lambda x: x['min_size']) + + # Group by tuning parameters (algorithm, protocol, channels) + tuning_groups = defaultdict(list) + for config in topology_configs: + tuning_key = (config['algorithm'], config['protocol'], config['channels']) + tuning_groups[tuning_key].append(config) + + # For each tuning group, combine sequential ranges + for tuning_key, tuning_configs in tuning_groups.items(): + if not tuning_configs: + continue + + # Sort by min_size + tuning_configs.sort(key=lambda x: x['min_size']) + + # Combine sequential ranges + current_config = tuning_configs[0].copy() + + for next_config in tuning_configs[1:]: + # Check if ranges are adjacent or overlapping + if current_config['max_size'] + 1 >= next_config['min_size']: + # Extend the current range + current_config['max_size'] = max(current_config['max_size'], next_config['max_size']) + # Update metric value to the better one + if self.optimization_metric == 'bandwidth_gbps': + if next_config['metric_value'] > current_config['metric_value']: + current_config['metric_value'] = next_config['metric_value'] + else: # latency_us or default + if next_config['metric_value'] < current_config['metric_value']: + current_config['metric_value'] = next_config['metric_value'] + else: + # Gap between ranges, save current and start new one + combined_configs.append(current_config) + current_config = next_config.copy() + + # Add the last configuration + combined_configs.append(current_config) + + # Sort final configs by collective, nodes, ranks, then min_size + combined_configs.sort(key=lambda x: (x['collective'], x['nodes'], x['ranks'], x['min_size'])) + + original_count = len(configs) + combined_count = len(combined_configs) + if combined_count < original_count: + print(f"Combined {original_count} ranges into {combined_count} ranges " + f"(reduced by {original_count - combined_count})") + + return combined_configs + + def append_to_config_file(self, configs: List[str], config_file: str, add_header: bool = True): + """Append optimized configurations to NCCL tuner config file""" + try: + # Create directory if it doesn't exist + config_dir = os.path.dirname(config_file) + if config_dir and not os.path.exists(config_dir): + os.makedirs(config_dir) + print(f"Created directory: {config_dir}") + + # Check if file exists and has content + file_exists = os.path.exists(config_file) + add_separator = False + + if file_exists: + with open(config_file, 'r') as f: + content = f.read().strip() + add_separator = len(content) > 0 + print(f"Appending to existing file: {config_file}") + else: + print(f"Creating new file: {config_file}") + + with open(config_file, 'a') as f: + if add_separator: + f.write("\n\n") + + if add_header: + f.write(f"# Optimized configurations generated by optimize_config.py\n") + f.write(f"# Optimization metric: {self.optimization_metric}\n") + f.write(f"# Format: 
collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff\n") + + for config in configs: + f.write(f"{config}\n") + + if file_exists: + print(f"Appended {len(configs)} optimized configurations to {config_file}") + else: + print(f"Created {config_file} with {len(configs)} optimized configurations") + + except PermissionError: + print(f"Error: Permission denied writing to {config_file}") + print("Try running with appropriate permissions or choose a different output location") + sys.exit(1) + except OSError as e: + print(f"Error: Cannot create/write to {config_file}: {e}") + print("Check that the path is valid and you have write permissions") + sys.exit(1) + except Exception as e: + print(f"Unexpected error writing to {config_file}: {e}") + sys.exit(1) + +def main(): + parser = argparse.ArgumentParser(description="Optimize NCCL tuner configurations from performance data") + parser.add_argument("csv_file", help="Input CSV file with performance data") + parser.add_argument("-o", "--output", default="nccl_tuner.conf", + help="Output NCCL tuner config file (default: nccl_tuner.conf)") + parser.add_argument("-m", "--metric", choices=['bandwidth_gbps', 'latency_us'], + default='latency_us', help="Optimization metric (default: latency_us)") + parser.add_argument("--no-header", action="https://github.com/store_true", + help="Don't add header comments to output file") + parser.add_argument("--dry-run", action="https://github.com/store_true", + help="Print configurations without writing to file") + parser.add_argument("--no-auto-ranges", action="https://github.com/store_true", + help="Disable automatic size range determination (use default ranges)") + parser.add_argument("--size-ranges", type=str, + help="Custom size ranges as comma-separated pairs: 'min1-max1,min2-max2,...'") + + args = parser.parse_args() + + optimizer = ConfigOptimizer(args.metric) + + # Handle size range configuration + if args.size_ranges: + # Parse custom size ranges + try: + ranges = [] + for range_str in args.size_ranges.split(','): + min_size, max_size = map(int, range_str.split('-')) + ranges.append((min_size, max_size)) + optimizer.set_size_ranges(ranges) + print(f"Using custom size ranges: {ranges}") + except ValueError: + print("Error: Invalid size ranges format. 
Use 'min1-max1,min2-max2,...'") + sys.exit(1) + elif args.no_auto_ranges: + # Disable auto-ranging + optimizer.auto_size_ranges = False + print("Using default hardcoded size ranges") + else: + # Auto-ranging is enabled by default - creates one bucket per unique size + optimizer.auto_size_ranges = True + print("Auto-ranging enabled: will create one bucket per unique size in data") + + # Load and optimize data + data = optimizer.load_data(args.csv_file) + if not data: + print("No valid data found in CSV file") + sys.exit(1) + + configs = optimizer.optimize_configurations(data) + + if args.dry_run: + print("\nGenerated configurations:") + for config in configs: + print(config) + else: + optimizer.append_to_config_file(configs, args.output, not args.no_header) + +if __name__ == "__main__": + main() diff --git a/ext-tuner/example/scripts/sample_performance_data.csv b/ext-tuner/example/scripts/sample_performance_data.csv new file mode 100644 index 000000000..7b96403c0 --- /dev/null +++ b/ext-tuner/example/scripts/sample_performance_data.csv @@ -0,0 +1,24 @@ +collective,size_bytes,algorithm,protocol,channels,nodes,ranks,pipeOps,regBuff,cost_metric,bandwidth_gbps,latency_us +allreduce,1024,tree,simple,2,1,8,-1,-1,0.15,45.2,12.5 +allreduce,1024,ring,simple,4,1,8,-1,-1,0.12,52.1,10.8 +allreduce,1024,tree,ll,2,1,8,-1,-1,0.18,41.3,15.2 +allreduce,1024,ring,ll,4,1,8,-1,-1,0.14,48.7,12.1 +allreduce,32768,tree,simple,2,1,8,-1,-1,0.25,156.8,25.3 +allreduce,32768,ring,simple,4,1,8,-1,-1,0.18,189.2,18.4 +allreduce,32768,ring,ll128,8,1,8,-1,-1,0.16,201.5,16.2 +allreduce,1048576,ring,simple,4,1,8,-1,-1,0.45,425.6,45.1 +allreduce,1048576,ring,ll128,8,1,8,-1,-1,0.38,482.3,38.7 +allreduce,1048576,nvls,simple,16,1,8,-1,-1,0.32,551.2,32.1 +broadcast,1024,tree,simple,2,1,8,-1,-1,0.08,89.4,8.2 +broadcast,1024,ring,simple,4,1,8,-1,-1,0.12,71.3,12.1 +broadcast,32768,tree,simple,2,1,8,-1,-1,0.18,234.7,18.5 +broadcast,32768,ring,ll128,4,1,8,-1,-1,0.15,267.8,15.2 +broadcast,1048576,ring,simple,4,1,8,-1,-1,0.35,612.4,35.1 +broadcast,1048576,ring,ll128,8,1,8,-1,-1,0.28,702.1,28.3 +allreduce,1024,tree,simple,2,2,16,-1,-1,0.22,38.1,22.4 +allreduce,1024,ring,simple,4,2,16,-1,-1,0.19,42.7,19.6 +allreduce,32768,ring,simple,4,2,16,-1,-1,0.28,145.2,28.1 +allreduce,32768,ring,ll128,8,2,16,-1,-1,0.24,167.8,24.3 +allreduce,1048576,ring,simple,4,2,16,-1,-1,0.58,387.5,58.2 +allreduce,1048576,ring,ll128,8,2,16,-1,-1,0.48,456.9,48.1 +allreduce,1048576,nvls,simple,16,2,16,-1,-1,0.42,512.6,42.3 diff --git a/ext-tuner/example/test/Makefile b/ext-tuner/example/test/Makefile new file mode 100644 index 000000000..d675cbe1e --- /dev/null +++ b/ext-tuner/example/test/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for NCCL Tuner Plugin Unit Tests +# + +CC := gcc +CFLAGS := -Wall -Wextra -g -std=c99 -fPIC +INC := -I. 
-I../nccl +TARGET := test_plugin +SOURCES := test_plugin.c + +# Default target +all: $(TARGET) + +# Build the test executable +$(TARGET): $(SOURCES) + $(CC) $(CFLAGS) $(INC) -o $(TARGET) $(SOURCES) + +# Run the tests +test: $(TARGET) + ./$(TARGET) $(TEST_CASE) + +# Run tests with verbose output +test-verbose: $(TARGET) + NCCL_DEBUG=INFO ./$(TARGET) $(TEST_CASE) + +# Clean build artifacts +clean: + rm -f $(TARGET) *.o *.gcov *.gcda *.gcno test_*.conf + +.PHONY: all test test-verbose clean diff --git a/ext-tuner/example/test/README.md b/ext-tuner/example/test/README.md new file mode 100644 index 000000000..8203c65a1 --- /dev/null +++ b/ext-tuner/example/test/README.md @@ -0,0 +1,205 @@ +# NCCL Tuner Plugin Unit Tests + +This directory contains comprehensive unit tests for the NCCL tuner plugin. The tests verify all major functionality including configuration parsing, matching logic, and cost table updates. + +## Test Structure + +``` +test/ +├── test_plugin.c # Main unit test file +├── Makefile # Build system for tests +└── README.md # This file +``` + +## Building and Running Tests + +### Quick Start + +```bash +# Build and run all tests +make test + +# Or step by step +make # Build test executable +./test_plugin # Run tests +``` + +### Advanced Testing + +```bash +# Run with memory leak detection (requires valgrind) +make test-memory + +# Run with verbose logging +make test-verbose + +# Generate code coverage report (requires gcov) +make coverage + +# Create sample test configuration files +make test-configs +``` + +## Test Coverage + +The unit tests cover the following functionality: + +### 1. **Plugin Initialization (`test_plugin_init`)** +- Tests successful plugin initialization +- Verifies context allocation +- Tests cleanup on destroy + +### 2. **Configuration Parsing (`test_config_parsing_valid`, `test_config_parsing_invalid`)** +- Valid CSV format parsing +- Comment and empty line handling +- Invalid format graceful handling +- Environment variable configuration + +### 3. **Collective Type Matching (`test_collective_matching`)** +- Correct matching of allreduce, broadcast, etc. +- Algorithm/protocol selection +- Channel configuration + +### 4. **Size Range Matching (`test_size_matching`)** +- Small, medium, large message size handling +- Proper range boundary checking +- Multiple size-based configurations + +### 5. **Topology Matching (`test_topology_matching`)** +- Single-node vs multi-node configurations +- Exact nNodes/nRanks matching +- Wildcard matching (-1 values) + +### 6. **Default Channels (`test_default_channels`)** +- Proper handling of -1 channel specification +- Preservation of NCCL default behavior + +### 7. **Registered Buffer Matching (`test_regbuff_matching`)** +- Configurations based on regBuff parameter +- Registered vs non-registered buffer handling +- Backward compatibility with configs missing regBuff + +### 8. **Pipeline Operations Matching (`test_pipeops_matching`)** +- Configurations based on numPipeOps parameter +- Single vs multiple pipeline operation handling +- Backward compatibility with configs missing numPipeOps + +### 9. 
**Fallback Behavior (`test_no_match_fallback`)** +- Default behavior when no config matches +- Ring/Simple algorithm fallback + +## Test Output + +Successful test run: +``` +Running NCCL Tuner Plugin Unit Tests +===================================== +PASS: test_plugin_init +PASS: test_config_parsing_valid +PASS: test_config_parsing_invalid +PASS: test_collective_matching +PASS: test_size_matching +PASS: test_topology_matching +PASS: test_default_channels +PASS: test_regbuff_matching +PASS: test_pipeops_matching +PASS: test_no_match_fallback + +===================================== +Test Results: 9/9 tests passed +All tests PASSED! +``` + +Failed test example: +``` +FAIL: test_collective_matching - Tree/Simple should have low cost +Test Results: 8/9 tests passed +Some tests FAILED! +``` + +## Mock NCCL Implementation + +The tests use the actual NCCL header files from the `../nccl/` directory: + +- `tuner.h` - Complete NCCL tuner interface and type definitions +- `common.h` - Common NCCL types and logging functions +- `err.h` - NCCL error codes + +This allows testing with the real NCCL interface definitions while still being able to run tests without the full NCCL library installation. + +## Integration with CI/CD + +```bash +# Install tests for CI/CD pipeline +make install-test + +# Run as part of automated testing +make test && echo "Tests passed" || echo "Tests failed" +``` + +## Memory Testing + +The tests can be run with valgrind for memory leak detection: + +```bash +make test-memory +``` + +This will detect: +- Memory leaks +- Invalid memory access +- Use of uninitialized memory + +## Code Coverage + +Generate code coverage reports to ensure comprehensive testing: + +```bash +make coverage +# Creates test_plugin.c.gcov with line-by-line coverage +``` + +## Adding New Tests + +To add a new test: + +1. Create a new test function in `test_plugin.c`: +```c +int test_new_feature() { + // Test setup + TEST_ASSERT(condition, "description"); + // Test cleanup + TEST_PASS(); +} +``` + +2. Add the test to the main function: +```c +total++; passed += test_new_feature(); +``` + +3. Rebuild and run: +```bash +make test +``` + +## Debugging Tests + +For debugging failed tests: + +```bash +# Compile with debug symbols +make CFLAGS="-g -O0 -DDEBUG" + +# Run with gdb +gdb ./test_plugin +``` + +## Cleaning Up + +```bash +# Remove all build artifacts and temporary files +make clean +``` + +This comprehensive test suite ensures the NCCL tuner plugin works correctly across all supported configurations and edge cases. 
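+## Example Test Configuration
+
+The configuration files created by the tests use the same CSV format the
+plugin parses at runtime. A small illustrative example (the values mirror the
+test fixtures and are not tuned recommendations):
+
+```csv
+# collective_type,min_bytes,max_bytes,algorithm,protocol,channels,nNodes,nRanks,numPipeOps,regBuff
+allreduce,0,65536,tree,simple,2,1,-1,-1,-1
+allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1
+broadcast,0,32768,ring,ll128,4,2,16,-1,-1
+```
+
+A -1 in nNodes, nRanks, numPipeOps, or regBuff matches any value at runtime; a
+-1 channel count keeps NCCL's default channel selection.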
diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c new file mode 100644 index 000000000..28897c449 --- /dev/null +++ b/ext-tuner/example/test/test_plugin.c @@ -0,0 +1,856 @@ +/************************************************************************* + * Unit tests for NCCL Tuner Plugin + ************************************************************************/ + +#define _GNU_SOURCE // Enable setenv/unsetenv and other GNU extensions + +#include +#include +#include +#include +#include +#include +#include + + +// Include NCCL tuner header (which includes common.h and err.h) +#include "tuner.h" + +// Include plugin source for testing +#include "../plugin.c" + +// Test framework macros +#define TEST_ASSERT(condition, message) \ + do { \ + if (!(condition)) { \ + printf("FAIL: %s - %s\n", __func__, message); \ + return 0; \ + } \ + } while(0) + +#define TEST_PASS() \ + do { \ + printf("PASS: %s\n", __func__); \ + return 1; \ + } while(0) + +// Global test state +static int test_log_count = 0; + +// Mock logger function +void mock_logger(ncclDebugLogLevel level, unsigned long flags, + const char* file, int line, const char* fmt, ...) { + (void)flags; // Suppress unused parameter warning + test_log_count++; + + // Check if we should print based on NCCL_DEBUG level + const char* debug_level = getenv("NCCL_DEBUG"); + int should_print = 0; + + if (debug_level) { + if (strcmp(debug_level, "TRACE") == 0) { + should_print = 1; // Print everything + } else if (strcmp(debug_level, "INFO") == 0 && level <= NCCL_LOG_INFO) { + should_print = 1; // Print INFO and below + } else if (strcmp(debug_level, "WARN") == 0 && level <= NCCL_LOG_WARN) { + should_print = 1; // Print WARN and below + } + } + + if (!should_print) return; + + // Convert log level to string + const char* level_str; + switch(level) { + case NCCL_LOG_NONE: level_str = "NONE"; break; + case NCCL_LOG_VERSION: level_str = "VERSION"; break; + case NCCL_LOG_WARN: level_str = "WARN"; break; + case NCCL_LOG_INFO: level_str = "INFO"; break; + case NCCL_LOG_ABORT: level_str = "ABORT"; break; + case NCCL_LOG_TRACE: level_str = "TRACE"; break; + default: level_str = "UNKNOWN"; break; + } + + // Print log header + printf("[TUNER:%s:%s:%d] ", level_str, file, line); + + // Print formatted message + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); + + printf("\n"); +} + +// Helper function to create test config file +void create_test_config(const char* filename, const char* content) { + FILE* f = fopen(filename, "w"); + if (f) { + fprintf(f, "%s", content); + fclose(f); + } +} + +// Test 1: Plugin initialization +int test_plugin_init() { + void* context = NULL; + + // Test successful initialization + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed"); + TEST_ASSERT(context != NULL, "Context should be allocated"); + + // Clean up + pluginDestroy(context); + TEST_PASS(); +} + +// Test 2: Configuration file parsing - valid CSV +int test_config_parsing_valid() { + const char* test_config = + "# Test configuration\n" + "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" + "broadcast,0,32768,ring,ll128,4,2,16,-1,-1\n" + "# Comment line\n" + "\n" // Empty line + "reduce,1024,2048,tree,simple,-1,-1,-1,-1,-1\n"; + + create_test_config("test_valid.conf", test_config); + + // Set environment variable to use our test config + setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1); + + void* context = NULL; + ncclResult_t result = 
pluginInit(16, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed"); + + // Clean up + pluginDestroy(context); + unlink("test_valid.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 3: Configuration file parsing - invalid CSV +int test_config_parsing_invalid() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,1 # Missing nRanks and other fields\n" + "invalid_collective,0,1024,ring,simple,1,1,1,-1,-1\n" + "broadcast,abc,def,ring,simple,1,1,1,-1,-1\n"; // Invalid numbers + + create_test_config("test_invalid.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1); + + void* context = NULL; + ncclResult_t result = pluginInit(8, 1, mock_logger, &context); + // Should still succeed but with no valid configs loaded + TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config"); + + // Clean up + pluginDestroy(context); + unlink("test_invalid.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 4: Collective type matching +int test_collective_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,8,1,-1,-1,-1\n" + "broadcast,0,32768,ring,ll128,4,-1,-1,-1,-1\n"; + + create_test_config("test_match.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + // Create mock cost table + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Default high cost + } + } + + int nChannels; + + // Test allreduce matching (should match first config) + ncclResult_t result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed"); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 8, "Should set 8 channels"); + + // Test broadcast matching (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Reset costs + } + } + + result = pluginGetCollInfo(context, ncclFuncBroadcast, 16384, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should succeed"); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 4, "Should set 4 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_match.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 5: Size range matching +int test_size_matching() { + const char* test_config = + "allreduce,0,1024,tree,simple,2,-1,-1,-1,-1\n" + 
"allreduce,1025,65536,ring,simple,4,-1,-1,-1,-1\n" + "allreduce,65537,4294967295,ring,ll128,8,-1,-1,-1,-1\n"; + + create_test_config("test_size.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + int nChannels = 1; + + pluginGetCollInfo(context, ncclFuncAllReduce, 512, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Small message - checking cost_table[TREE][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Small: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Small: Should set 2 channels"); + + // Test medium message (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Medium message - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Medium: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Medium: Should set 4 channels"); + + // Test large message (should match third config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 1048576, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Large message - checking cost_table[RING][LL128] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128], cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Large: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_size.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 6: Topology matching +int test_topology_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,1,-1,-1,-1\n" // Single node only + "allreduce,0,65536,ring,simple,4,4,32,-1,-1\n" // 4 nodes, 32 ranks exactly + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any topology + + create_test_config("test_topo.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_topo.conf", 1); + + // Test with single node setup + void* context1 = NULL; + pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + 
cost_table[i][j] = 1.0; + } + } + + int nChannels; + pluginGetCollInfo(context1, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config"); + TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels"); + + pluginDestroy(context1); + + // Test with 4 nodes, 32 ranks setup + void* context2 = NULL; + pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes + + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context2, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "4-node: Should match ring/simple config"); + TEST_ASSERT(nChannels == 4, "4-node: Should set 4 channels"); + + // Clean up + unlink("test_topo.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 7: Default channels behavior (-1) +int test_default_channels() { + const char* test_config = + "allreduce,0,65536,tree,simple,-1,-1,-1,-1,-1\n"; // Use default channels + + create_test_config("test_default.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels = 99; // Set to known value + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Should apply algorithm/protocol"); + TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1"); + + // Clean up + pluginDestroy(context); + unlink("test_default.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 8: regBuff matching +int test_regbuff_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,-1,-1,-1,1\n" // Registered buffers only + "allreduce,0,65536,ring,simple,4,-1,-1,-1,0\n" // Non-registered buffers only + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any buffer type (backward compatible) + + create_test_config("test_regbuff.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + } + + int nChannels; + + // Test registered buffer (should match first config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 1, &nChannels); // regBuff = 1 (registered) + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Registered buffer: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Registered buffer: Should set 2 channels"); + + // Test non-registered 
buffer (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); // regBuff = 0 (non-registered) + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Non-registered buffer: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Non-registered buffer: Should set 4 channels"); + + // Test backward compatibility - config without regBuff should match any regBuff value + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + // First try with regBuff=2 (unusual value, should match third config) + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 2, &nChannels); // regBuff = 2 (only third config should match) + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 0.0, "Any regBuff: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_regbuff.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 9: numPipeOps matching +int test_pipeops_matching() { + const char* test_config = + "allreduce,0,65536,tree,simple,2,-1,-1,1,-1\n" // Single pipeline op + "allreduce,0,65536,ring,simple,4,-1,-1,4,-1\n" // Multiple pipeline ops + "allreduce,0,65536,ring,ll128,8,-1,-1,-1,-1\n"; // Any pipeline ops (backward compatible) + + create_test_config("test_pipeops.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + } + + int nChannels; + + // Test single pipeline op (should match first config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single pipeOp: Tree/Simple should have low cost"); + TEST_ASSERT(nChannels == 2, "Single pipeOp: Should set 2 channels"); + + // Test multiple pipeline ops (should match second config) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 4, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 0.0, "Multiple pipeOps: Ring/Simple should have low cost"); + TEST_ASSERT(nChannels == 4, "Multiple pipeOps: Should set 4 channels"); + + // Test different number of pipeline ops (should match third config - backward compatible) + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 2, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_LL128] == 
0.0, "Any pipeOps: Ring/LL128 should have low cost"); + TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels"); + + // Clean up + pluginDestroy(context); + unlink("test_pipeops.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 10: No matching configuration (fallback behavior) +int test_no_match_fallback() { + const char* test_config = + "broadcast,0,1024,tree,simple,2,-1,-1,-1,-1\n"; // Only broadcast config + + create_test_config("test_fallback.conf", test_config); + setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1); + + void* context = NULL; + pluginInit(8, 1, mock_logger, &context); + + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels; + // Try allreduce (should not match, use fallback) + pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "DEBUG: Fallback test - checking cost_table[RING][SIMPLE] (%p) = %.1f (expecting 0.0)", + &cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE], cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]); + TEST_ASSERT(cost_table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] == 1.0, "Should use pass through unmodified"); + TEST_ASSERT(nChannels == 1, "Should use default channels"); + + // Clean up + pluginDestroy(context); + unlink("test_fallback.conf"); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + TEST_PASS(); +} + +// Test 11: Large configuration files (testing dynamic allocation) +int test_large_config() { + const char* large_config_file = "test_large.conf"; + + // Create a large configuration file with many entries + // This tests the dynamic allocation functionality + FILE* f = fopen(large_config_file, "w"); + TEST_ASSERT(f != NULL, "Should be able to create large config file"); + + // Write header comment + fprintf(f, "# Large configuration file for testing dynamic allocation\n"); + fprintf(f, "# This file contains many configurations to test memory allocation\n"); + + // Generate a large number of configurations (much more than the old MAX_CONFIGS=100) + const int num_configs = 500; // 5x the old static limit + const char* collectives[] = {"allreduce", "broadcast", "reduce", "allgather", "reducescatter"}; + const char* algorithms[] = {"tree", "ring", "collnet_direct", "nvls"}; + const char* protocols[] = {"simple", "ll", "ll128"}; + + for (int i = 0; i < num_configs; i++) { + // Vary the configurations to create realistic test data + const char* coll = collectives[i % 5]; + const char* algo = algorithms[i % 4]; + const char* proto = protocols[i % 3]; + + size_t min_bytes = (i * 1024) % 1048576; // Vary from 0 to 1MB + size_t max_bytes = min_bytes + 65536; // 64KB range + int channels = (i % 8) + 1; // 1-8 channels + int nodes = (i % 4) == 0 ? -1 : (i % 4); // Mix of -1 and 1-3 nodes + int ranks = (i % 8) == 0 ? -1 : (i % 32) + 1; // Mix of -1 and 1-32 ranks + int pipeOps = (i % 3) == 0 ? -1 : (i % 4) + 1; // Mix of -1 and 1-4 pipeOps + int regBuff = (i % 3) == 0 ? 
-1 : (i % 2); // Mix of -1, 0, 1 + + fprintf(f, "%s,%zu,%zu,%s,%s,%d,%d,%d,%d,%d\n", + coll, min_bytes, max_bytes, algo, proto, channels, nodes, ranks, pipeOps, regBuff); + } + + fclose(f); + + // Set environment to use our large config file + setenv("NCCL_TUNER_CONFIG_FILE", large_config_file, 1); + + // Initialize plugin with large config + void* context = NULL; + ncclResult_t result = pluginInit(16, 4, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed"); + TEST_ASSERT(context != NULL, "Context should be allocated"); + + // Verify that configurations were loaded + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == num_configs, "Should load all configurations from large file"); + TEST_ASSERT(ctx->maxConfigs == num_configs, "maxConfigs should match allocated size"); + TEST_ASSERT(ctx->configs != NULL, "Configs array should be dynamically allocated"); + + // Test that we can access configurations throughout the array + // (This would have failed with the old static MAX_CONFIGS=100 limit) + for (int i = 0; i < ctx->numConfigs; i++) { + TuningConfig* config = &ctx->configs[i]; + // Basic sanity checks on the loaded configurations + TEST_ASSERT(config->collType >= ncclFuncBroadcast && config->collType <= ncclFuncAllReduce, + "Collective type should be valid"); + TEST_ASSERT(config->maxBytes >= config->minBytes, "maxBytes should be >= minBytes"); + TEST_ASSERT(config->nChannels > 0, "nChannels should be positive"); + } + + // Test specific configuration access at various indices + // Index 0 (first config) + TuningConfig* first_config = &ctx->configs[0]; + TEST_ASSERT(first_config != NULL, "First config should be accessible"); + + // Index in middle + TuningConfig* mid_config = &ctx->configs[num_configs / 2]; + TEST_ASSERT(mid_config != NULL, "Middle config should be accessible"); + + // Index near end (this would have crashed with static array of 100) + TuningConfig* late_config = &ctx->configs[num_configs - 1]; + TEST_ASSERT(late_config != NULL, "Last config should be accessible"); + + // Test memory allocation size - verify we didn't over-allocate + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Successfully loaded %d configurations (dynamic allocation)", ctx->numConfigs); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Memory allocated for %d configurations (%zu bytes total)", + ctx->maxConfigs, ctx->maxConfigs * sizeof(TuningConfig)); + + // Test that the plugin can still find matching configurations from the large set + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; // Default high cost + } + } + + int nChannels; + // Try to find a matching configuration - should work with large config set + result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set"); + + // Clean up + pluginDestroy(context); + unlink(large_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test 12: Very large configuration stress test +int test_very_large_config_stress() { + const char* stress_config_file = "test_stress.conf"; + + // Create an even larger configuration file to stress test the 
implementation + FILE* f = fopen(stress_config_file, "w"); + TEST_ASSERT(f != NULL, "Should be able to create stress test config file"); + + fprintf(f, "# Stress test configuration with very large number of entries\n"); + + // Generate an extremely large number of configurations + const int stress_configs = 2000; // 20x the old static limit + + for (int i = 0; i < stress_configs; i++) { + // Create varied but valid configurations + fprintf(f, "allreduce,%d,%d,ring,simple,4,-1,-1,-1,-1\n", + i * 512, (i * 512) + 1024); + } + + fclose(f); + + setenv("NCCL_TUNER_CONFIG_FILE", stress_config_file, 1); + + // Test initialization with stress config + void* context = NULL; + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files"); + + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == stress_configs, "Should load all stress test configurations"); + TEST_ASSERT(ctx->configs != NULL, "Stress test configs should be allocated"); + + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Stress test - loaded %d configurations successfully", stress_configs); + mock_logger(NCCL_LOG_INFO, NCCL_ALL, __FILE__, __LINE__, + "Memory usage: %zu bytes for configuration array", + stress_configs * sizeof(TuningConfig)); + + // Verify we can access configurations throughout the entire range + for (int i = 0; i < stress_configs; i += 100) { // Sample every 100th config + TuningConfig* config = &ctx->configs[i]; + TEST_ASSERT(config->collType == ncclFuncAllReduce, "Config should have correct collective type"); + TEST_ASSERT(config->minBytes == (size_t)(i * 512), "Config should have correct minBytes"); + } + + // Clean up + pluginDestroy(context); + unlink(stress_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test 13: Edge case - empty config file +int test_empty_config() { + const char* empty_config_file = "test_empty.conf"; + + // Create empty config file (only comments) + create_test_config(empty_config_file, + "# Empty configuration file\n" + "# No actual configurations\n" + "\n" + "\n"); + + setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1); + + void* context = NULL; + ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files"); + + TunerContext* ctx = (TunerContext*)context; + TEST_ASSERT(ctx->numConfigs == 0, "Should have zero configurations"); + TEST_ASSERT(ctx->maxConfigs == 0, "Should have zero max configurations"); + TEST_ASSERT(ctx->configs == NULL, "Should not allocate memory for empty config"); + + // Test that plugin still works with no configurations (fallback behavior) + float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; + for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { + cost_table_ptr[i] = cost_table[i]; + for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { + cost_table[i][j] = 1.0; + } + } + + int nChannels; + result = pluginGetCollInfo(context, ncclFuncAllReduce, 32768, 1, + cost_table_ptr, NCCL_NUM_ALGORITHMS, NCCL_NUM_PROTOCOLS, + 0, &nChannels); + TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config"); + + // Clean up + pluginDestroy(context); + unlink(empty_config_file); + unsetenv("NCCL_TUNER_CONFIG_FILE"); + + TEST_PASS(); +} + +// Test runner function pointer type +typedef int (*TestFunction)(void); + +// Test registry +typedef struct { + const char* name; + TestFunction func; + 
const char* description; +} TestCase; + +// All available tests +TestCase test_cases[] = { + {"init", test_plugin_init, "Plugin initialization"}, + {"config-valid", test_config_parsing_valid, "Valid configuration parsing"}, + {"config-invalid", test_config_parsing_invalid, "Invalid configuration parsing"}, + {"collective", test_collective_matching, "Collective type matching"}, + {"size", test_size_matching, "Size range matching"}, + {"topology", test_topology_matching, "Topology matching"}, + {"channels", test_default_channels, "Default channels behavior"}, + {"regbuff", test_regbuff_matching, "Registered buffer matching"}, + {"pipeops", test_pipeops_matching, "Pipeline operations matching"}, + {"fallback", test_no_match_fallback, "Fallback behavior"}, + {"large-config", test_large_config, "Large configuration files (dynamic allocation)"}, + {"stress-config", test_very_large_config_stress, "Very large configuration stress test"}, + {"empty-config", test_empty_config, "Empty configuration file handling"}, + {NULL, NULL, NULL} // End marker +}; + +// Show help/usage information +void show_help(const char* program_name) { + printf("Usage: %s [test_name ...]\n\n", program_name); + printf("Available tests:\n"); + for (int i = 0; test_cases[i].name != NULL; i++) { + printf(" %-15s - %s\n", test_cases[i].name, test_cases[i].description); + } + printf("\nExamples:\n"); + printf(" %s # Run all tests\n", program_name); + printf(" %s init # Run only initialization test\n", program_name); + printf(" %s init collective # Run initialization and collective tests\n", program_name); + printf(" %s --help # Show this help\n", program_name); +} + +// Find test by name +TestFunction find_test(const char* name) { + for (int i = 0; test_cases[i].name != NULL; i++) { + if (strcmp(test_cases[i].name, name) == 0) { + return test_cases[i].func; + } + } + return NULL; +} + +// Main test runner +int main(int argc, char* argv[]) { + int passed = 0, total = 0; + + // Check for help + if (argc > 1 && (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-h") == 0)) { + show_help(argv[0]); + return 0; + } + + printf("Running NCCL Tuner Plugin Unit Tests\n"); + printf("=====================================\n"); + + if (argc == 1) { + // No arguments - run all tests + for (int i = 0; test_cases[i].name != NULL; i++) { + total++; + passed += test_cases[i].func(); + } + } else { + // Run specific tests + for (int arg = 1; arg < argc; arg++) { + TestFunction test_func = find_test(argv[arg]); + if (test_func) { + total++; + passed += test_func(); + } else { + printf("ERROR: Unknown test '%s'\n", argv[arg]); + printf("Use --help to see available tests\n"); + return 1; + } + } + } + + printf("\n=====================================\n"); + printf("Test Results: %d/%d tests passed\n", passed, total); + + if (passed == total) { + printf("All tests PASSED!\n"); + return 0; + } else { + printf("Some tests FAILED!\n"); + return 1; + } +} diff --git a/makefiles/common.mk b/makefiles/common.mk index 8a35a8fab..6ba9bbfce 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -40,10 +40,12 @@ ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 +CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90 -CUDA13_GENCODE = -gencode=arch=compute_100,code=sm_100 \ - -gencode=arch=compute_120,code=sm_120 +CUDA12_8_GENCODE = 
-gencode=arch=compute_100,code=sm_100 \ + -gencode=arch=compute_120,code=sm_120 +CUDA13_GENCODE = -gencode=arch=compute_110,code=sm_110 CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 @@ -53,10 +55,10 @@ CUDA13_PTX = -gencode=arch=compute_120,code=compute_120 ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) # Prior to SM75 is deprecated from CUDA13.0 onwards - NVCC_GENCODE ?= $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) + NVCC_GENCODE ?= $(CUDA10_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0) # Include Blackwell support if we're using CUDA12.8 or above - NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA13_GENCODE) $(CUDA13_PTX) + NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_8_GENCODE) $(CUDA13_PTX) else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0) # Include Hopper support if we're using CUDA11.8 or above NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX) diff --git a/makefiles/version.mk b/makefiles/version.mk index f41e7a783..013e972f3 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 27 -NCCL_PATCH := 3 +NCCL_PATCH := 5 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/device/Makefile b/src/device/Makefile index df58489a0..67ab176ca 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -36,9 +36,8 @@ define COMPILE $(call COMPILE$(or $3,$(suffix $2)),$1,$2) endef -ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12080))"),1) - NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a \ - -gencode=arch=compute_120a,code=sm_120a +ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12090))"),1) + NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100f,code=sm_100f else ifeq ($(shell echo "$$((1000*$(CUDA_MAJOR) + 10*$(CUDA_MINOR) >= 12070))"),1) NVCC_GENCODE_LDMC_FP8 = -gencode=arch=compute_100a,code=sm_100a else diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index 0d054bb2d..d36dfe5a7 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -1009,7 +1009,7 @@ struct Apply_LoadMultimem { DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_bfloat16, bf16x2, 4) #endif - #if NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 + #if NCCL_CUDA_ARCH_SPECIFIC == 1000 || NCCL_CUDA_ARCH_SPECIFIC == 1010 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1000 || NCCL_CUDA_ARCH_FAMILY_SPECIFIC == 1010 || NCCL_CUDA_ARCH_SPECIFIC == 1200 || NCCL_CUDA_ARCH_SPECIFIC == 1210 DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) DEFINE_Apply_LoadMultimem_minmax_v4_and_xparts(__nv_fp8_e4m3, e4m3x4, 4) DEFINE_Apply_LoadMultimem_sum_v4_and_xparts(__nv_fp8_e5m2, e5m2x4, 4) diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py index f630ff072..8fcb9a425 100755 --- a/src/device/symmetric/generate.py +++ b/src/device/symmetric/generate.py @@ -108,7 +108,7 @@ def required_cuda(k): if k.algo in ldmc_algos: cudart = 12070 arch = None - specific_sms = [100, 120] + specific_sms = ["100a", "101a", "100f", "101f", "120a", 
"121a"] return (cudart, arch, specific_sms) ################################################################################ @@ -145,7 +145,7 @@ def kernel_conds(k): if not specific_sms: arch_cond = "__CUDA_ARCH__ >= %d"%arch else: - arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_SPECIFIC==%d"%(10*sm) for sm in specific_sms]) + arch_cond = " || ".join(["0"] + ["NCCL_CUDA_ARCH_%sSPECIFIC==%d"%("FAMILY_" if sm[-1] == "f" else "", 10*int(sm.replace('a', '').replace('f', ''))) for sm in specific_sms]) return cudart_cond, arch_cond def instantiate(k): diff --git a/src/graph/paths.cc b/src/graph/paths.cc index bc5cc755e..4b44abd01 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -175,6 +175,13 @@ ncclResult_t ncclGetLocalCpu(struct ncclTopoSystem* system, int gpu, int* retCpu return ncclSuccess; } +static int mergePathType(int type0, int type1){ + int max = std::max(type0,type1); + int min = std::min(type0,type1); + if(max == PATH_PHB && min == PATH_C2C) return PATH_P2C; + else return max; +} + static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, int t1, int i1, int t2, int i2) { struct ncclTopoNode* cpuNode = system->nodes[tx].nodes+ix; struct ncclTopoNode* srcNode = system->nodes[t1].nodes+i1; @@ -187,7 +194,7 @@ static ncclResult_t addInterStep(struct ncclTopoSystem* system, int tx, int ix, // Update path characteristics srcNode->paths[t2][i2].count = l; - srcNode->paths[t2][i2].type = std::max(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); + srcNode->paths[t2][i2].type = mergePathType(srcNode->paths[tx][ix].type, cpuNode->paths[t2][i2].type); if (tx == GPU) srcNode->paths[t2][i2].type = PATH_PXN; srcNode->paths[t2][i2].bw = std::min(srcNode->paths[tx][ix].bw, cpuNode->paths[t2][i2].bw); return ncclSuccess; @@ -674,9 +681,9 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm int c; NCCLCHECK(ncclGetLocalCpu(system, g, &c)); if (c == -1) continue; - if (gpuNode->paths[NET][n].type == PATH_PHB && gpuNode->paths[CPU][c].type == PATH_C2C) { - gpuNode->paths[NET][n].type = PATH_P2C; - netNode->paths[GPU][g].type = PATH_P2C; + if (mergePathType(gpuNode->paths[CPU][c].type, netNode->paths[CPU][c].type) == PATH_P2C) { + gpuNode->paths[NET][n].type = std::min(PATH_P2C, gpuNode->paths[NET][n].type); + netNode->paths[GPU][g].type = std::min(PATH_P2C, netNode->paths[GPU][g].type); } } } @@ -695,16 +702,15 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm // PXN = PCI + NVLink. struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex; // Only use PXN for NIC n if remote GPU p ... - if (/* (1) is either connected to the NIC with PXB*/ - (peerNode->paths[NET][n].type <= PATH_PXB || - /* or with P2C and PxN over C2C is enabled */ - (ncclParamPxnC2c() && peerNode->paths[NET][n].type == PATH_P2C)) && + int pxnType = ncclParamPxnC2c() ? PATH_P2C : PATH_PXB; + if (/* (1) is connected to the NIC with PxN type*/ + peerNode->paths[NET][n].type <= pxnType && /* and (2) is connected to us through NVLink */ peerNode->paths[GPU][g].type <= PATH_NVL && /* and (3) is on the same node as us */ NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && /* and (4) has either higher bw to that NIC or avoid going through the CPU*/ - (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXB)) + (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > pxnType)) // We can use that GPU as relay to communicate with that NIC. 
// Only enabling it in the GPU->NIC direction for now to favor // receiving locally and sending remotely (consistent with net.cc) @@ -725,6 +731,12 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm } } } + + // Pre-compute NET local gpus to accelerate search + for (int n=0; nnodes[NET].count; n++) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &net->net.localGpu)); + } return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 9d8ad3ff8..67e600906 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -437,6 +437,65 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop return ncclSuccess; } +// Add the preferred NICs ordered by GPU first +static ncclResult_t ncclTopoPrefNetsGpuFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) { + const int nGpus = (gpu == -1) ? system->nodes[GPU].count : 1; + int gpuCount = nGpus; + int gpuIds[NCCL_TOPO_MAX_NODES] = {gpu}; + int firstNets[NCCL_TOPO_MAX_NODES]; + if (gpu == -1) + for (int g = 0; g < nGpus; g++) gpuIds[g] = g; + + for (int c = 0; c < MAXCHANNELS; c++) { + for (int g = 0; g < nGpus; g++) { + if (gpuIds[g] == -1) continue; + int localNet; + int64_t netId; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes + gpuIds[g]; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, &localNet)); + // store the first net found for each GPU in case of duplicates + if(c == 0) firstNets[g] = localNet; + // if the NET has already been returned for channel 0, that GPU is done + if (c > 0 && firstNets[g] == localNet) { + gpuIds[g] = -1; + gpuCount--; + continue; + } + // only add it to the list if it doesn't already exist + int found = 0; + while (found < (*netCount) && nets[found] != localNet) found++; + if (found == (*netCount)) nets[(*netCount)++] = localNet; + } + if (gpuCount == 0) break; + } + return ncclSuccess; +} + +// Add the preferred NICs ordered by channels first +static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* system, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCount) { + for (int g = 0; g < system->nodes[GPU].count; g++) { + if (gpu != -1 && gpu != g) continue; + int localNetCount = 0, localNets[MAXCHANNELS]; + struct ncclTopoNode* gpu = system->nodes[GPU].nodes + g; + for (int c = 0; c < MAXCHANNELS; c++) { + int64_t netId; + NCCLCHECK(ncclTopoGetLocalNet(system, gpu->gpu.rank, c, &netId, NULL)); + NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets + localNetCount)); + if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; + localNetCount++; + } + // Append NICs to list + for (int i = 0; i < localNetCount; i++) { + int n = localNets[i]; + int found = 0; + while (found < (*netCount) && nets[found] != n) found++; + if (found == (*netCount)) nets[(*netCount)++] = n; + } + } + return ncclSuccess; +} + // Build a sorted list of the NETs to try. // // "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu @@ -445,39 +504,25 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop // The list is built the following way: // 1. Select NETs starting with those close to GPU(s), based on paths[n].type. // 2. add other NETs satisfying typeInter but not already in the list. 
- +NCCL_PARAM(ScatterEnable, "MNNVL_SCATTER_NETS_ENABLE", 1); ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; int netCount = 0; - int localNetCount; - int localNets[MAXCHANNELS]; - // First add the preferred NICs - for (int g=0; gnodes[GPU].count; g++) { - if (gpu != -1 && gpu != g) continue; - localNetCount = 0; - struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; - for (int c = 0; cgpu.rank, c, &netId, NULL)); - NCCLCHECK(ncclTopoIdToIndex(system, NET, netId, localNets+localNetCount)); - if (localNetCount > 0 && localNets[localNetCount] == localNets[0]) break; - localNetCount++; - } - // Append NICs to list - for (int i=0; inHosts > 1 && ncclParamScatterEnable()) { + // For MNNVL systems, we sort the devices by GPU first, then by channel + NCCLCHECK(ncclTopoPrefNetsGpuFirst(system, gpu, nets, &netCount)); + } else { + // For other systems, we sort the devices by channel first, then by GPU + NCCLCHECK(ncclTopoPrefNetsChannelFirst(system, gpu, nets, &netCount)); } // Then add others satisfying typeInter for (int t=0; t <= typeInter; t++) { - for (int g=0; gnodes[GPU].count; g++) { + for (int g = 0; g < system->nodes[GPU].count; g++) { if (gpu != -1 && gpu != g) continue; - localNetCount = 0; + int localNetCount = 0, localNets[MAXCHANNELS]; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; struct ncclTopoLinkList* paths = gpu->paths[NET]; for (int n=0; nnodes[NET].count && npattern == NCCL_TOPO_PATTERN_NVLS || graph->pattern == NCCL_TOPO_PATTERN_COLLNET_DIRECT) { // NVLS search only tries to find NIC:GPU combinations to compute the heads. if (graph->nChannels < netCount) { - int gpu; - NCCLCHECK(ncclTopoGetLocalGpu(system, net->id, &gpu)); + int gpu = net->net.localGpu; if (gpu != -1) { int duplicate = 0; // check whether there is duplicate head when one GPU connects with multiple NICs @@ -643,13 +687,12 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } else { - if (graph->nChannels > 0) { + if (graph->nChannels > 0 && graph->sameChannels == 1) { // Try to replay the last channel int g; NCCLCHECK(ncclTopoReplayGetGpu(system, graph, -1, &g)); NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_REPLAY, time, NET, n, g)); - } - if (graph->nChannels == 0 || graph->sameChannels == 0) { + } else { if (graph->nChannels == 0 && system->nodes[NVS].count == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long int t = 1 << 10; @@ -658,11 +701,16 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } // Then try the most local GPUs + int localGpu = net->net.localGpu; + if (localGpu != -1) { + NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpu)); + } int localGpus[NCCL_TOPO_MAX_NODES], localGpuCount, pathType; NCCLCHECK(ncclTopoGetLocal(system, NET, n, GPU, localGpus, &localGpuCount, &pathType)); // if no GPUs are connected, skip this net if (pathType == PATH_DIS) continue; for (int g = 0; g < localGpuCount; ++g) { + if (localGpus[g] == localGpu) continue; // We already tried this one NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, NET, n, localGpus[g])); } } @@ -749,8 +797,8 @@ struct kvDict kvDictLinkType[] = { { "NVB", PATH_NVB }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, - { "PXN", 
PATH_PXN }, { "P2C", PATH_P2C }, + { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -798,8 +846,10 @@ ncclResult_t ncclTopoGetGraphFromXmlSub(struct ncclXmlNode *xmlGraph, struct ncc NCCLCHECK(xmlGetAttrInt(xmlGraph, "nchannels", &graph->nChannels)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedintra", &graph->bwIntra)); NCCLCHECK(xmlGetAttrFloat(xmlGraph, "speedinter", &graph->bwInter)); - if (xmlGetAttrFloat(xmlGraph, "latencyinter", &graph->latencyInter) != ncclSuccess) graph->latencyInter = 0.0; const char* str; + NCCLCHECK(xmlGetAttr(xmlGraph, "latencyinter", &str)); + if (!str) INFO(NCCL_GRAPH, "latencyinter not found in graph, using 0.0"); + graph->latencyInter = str ? strtof(str, NULL) : 0.0; NCCLCHECK(xmlGetAttr(xmlGraph, "typeintra", &str)); NCCLCHECK(kvConvertToInt(str, &graph->typeIntra, kvDictLinkType)); NCCLCHECK(xmlGetAttr(xmlGraph, "typeinter", &str)); @@ -910,7 +960,7 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 }; -float sm100SpeedArrayInter[] = { 47.9, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float sm100SpeedArrayInter[] = { 48.0, 45.1, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) @@ -1136,8 +1186,12 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr offset = strlen(line); } for (int i=0; iintra[ngpus*c+i]); + int g; + ncclTopoRankToIndex(system, graph->intra[ngpus * c + i], &g, true); + int64_t topoId = system->nodes[GPU].nodes[g].id; + sprintf(line + offset, " %s/%lx-%lx", topoNodeTypeStr[GPU], NCCL_TOPO_ID_SYSTEM_ID(topoId), NCCL_TOPO_ID_LOCAL_ID(topoId)); offset = strlen(line); + if (graph->id == 3) break; // NVLS graphs only use the first GPU } if (system->nodes[NET].count > 0) { sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1])); diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 9fe81bbcd..8fdf54ea4 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -21,7 +21,7 @@ const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; const char* topoLinkTypeStr[] = { "LOC", "NVL", "", "C2C", "PCI", "", "", "", "", "SYS", "NET" }; -const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "PXN", "P2C", "PHB", "SYS", "NET", "DIS" }; +const char* topoPathTypeStr[] = { "LOC", "NVL", "NVB", "C2C", "PIX", "PXB", "P2C", "PXN", "PHB", "SYS", "NET", "DIS" }; /******************************************************************/ /******************* Graph Creation Functions *********************/ @@ -677,7 +677,14 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem struct ncclXmlNode* node = topNode->subs[s]; if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); } - for (int systemId=0; systemIdnHosts; systemId++) if (system->hostHashes[systemId] == localHostHash) system->systemId = systemId; + + int systemId = 0; + while (systemId < system->nHosts && system->hostHashes[systemId] != localHostHash) systemId++; + system->systemId = systemId; + if(systemId == system->nHosts){ + 
WARN("localHostHash = 0x%lx not found in the list of system hostHashes",localHostHash); + return ncclInvalidArgument; + } NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL, 0)); NCCLCHECK(ncclTopoAddC2c(topNode, *topoSystem, NULL, 0)); @@ -1143,8 +1150,8 @@ struct kvDict nicPathKvList[] = { { "PORT", PATH_PORT }, { "PIX", PATH_PIX }, { "PXB", PATH_PXB }, - { "PXN", PATH_PXN }, { "P2C", PATH_P2C }, + { "PXN", PATH_PXN }, { "PHB", PATH_PHB }, { "SYS", PATH_SYS }, { NULL, 0 } @@ -1421,7 +1428,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy } // Only update our topo tracking structure if we aren't dumping (separate steps) - if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, comm->peerInfo[comm->rank].hostHash), ret, fail); + if (dumpXmlFile == NULL) NCCLCHECKGOTO(ncclTopoGetSystemFromXml(xml, system, getHostHash()), ret, fail); exit: if (!comm->MNNVL && localRanks) free(localRanks); diff --git a/src/graph/topo.h b/src/graph/topo.h index 07ef5e105..9b49c0222 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -18,7 +18,7 @@ #define SM80_NVLINK_BW 20.0 #define SM90_NVLINK_BW 20.6 #define SM86_NVLINK_BW 12.0 -#define SM100_NVLINK_BW 40.0 +#define SM100_NVLINK_BW 40.1 #define PCI_BW 12.0 // PCI Gen3 x16 #define AMD_BW 16.0 #define BDW_QPI_BW 6.0 @@ -76,11 +76,11 @@ extern const char* topoLinkTypeStr[]; // Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) #define PATH_PXB 5 -// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. -#define PATH_PXN 6 - // Connection between a GPU and a NIC using the C2C connection to the CPU and the PCIe connection to the NIC -#define PATH_P2C 7 +#define PATH_P2C 6 + +// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations. +#define PATH_PXN 7 // Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) #define PATH_PHB 8 @@ -143,6 +143,7 @@ struct ncclTopoNode { int gdrSupport; int collSupport; int maxChannels; + int localGpu; }net; struct { int arch; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 64dc5cf22..8e99f18c3 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -455,9 +455,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom for (int c=0; ctypeInter <= PATH_PXB || (minCompCap >= 90 && graphs[a]->typeInter <= (ncclParamLl128C2c() ? PATH_P2C : PATH_PXN))); + if (ncclParamLl128C2c() && minCompCap >= 90) { + // Enable LL128 by default only on Hopper/Blackwell for all connections up to P2C and PXN. + pEnable &= (graphs[a]->typeInter <= PATH_PXN); + } else { + // Enable LL128 only up to PXB. Don't enable LL128 over PxN because PxN can encapsulate PxB or P2C links. + pEnable &= (graphs[a]->typeInter <= PATH_PXB); + if (!ncclParamLl128C2c() && minCompCap >= 90) + INFO(NCCL_GRAPH, "Disabling LL128 over all PxN connections (PXB and C2C). 
This ensures that no C2C link will be used by LL128."); + } pEnable &= (graphs[a]->typeIntra <= PATH_NVB); pEnable &= (minCompCap == maxCompCap); pEnable &= !(minCompCap < 70 || (minCompCap == 90 && CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2)); diff --git a/src/init.cc b/src/init.cc index 83764a883..2a57c46c0 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1507,7 +1507,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { int minCTAsEnv; int maxCTAsEnv; int splitShareEnv; - int collnetEnableEnv; + const char* collnetEnableEnv; int ctaPolicyEnv; int shrinkShareEnv; int nvlsCTAsEnv; @@ -1561,9 +1561,15 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { comm->config.shrinkShare = shrinkShareEnv; } - collnetEnableEnv = ncclParamCollnetEnable(); - if (collnetEnableEnv != NCCL_CONFIG_UNDEF_INT) { - comm->config.collnetEnable = collnetEnableEnv; + // NCCL_COLLNET_ENABLE needs to be reloaded each time for comm init + // since users might change the env on the fly to enable/disable collnet + collnetEnableEnv = ncclGetEnv("NCCL_COLLNET_ENABLE"); + if (collnetEnableEnv != NULL) { + int collnetEnableInt = (int)strtol(collnetEnableEnv, NULL, 0); + if (collnetEnableInt != NCCL_CONFIG_UNDEF_INT) { + comm->config.collnetEnable = collnetEnableInt; + INFO(NCCL_ENV, "NCCL_COLLNET_ENABLE set by environment to %d.", collnetEnableInt); + } } ctaPolicyEnv = ncclParamCtaPolicy(); diff --git a/src/misc/mlx5dvsymbols.cc b/src/misc/mlx5dvsymbols.cc index 5bb4109f3..47cc4eb0d 100644 --- a/src/misc/mlx5dvsymbols.cc +++ b/src/misc/mlx5dvsymbols.cc @@ -52,6 +52,9 @@ ncclResult_t buildMlx5dvSymbols(struct ncclMlx5dvSymbols* mlx5dvSymbols) { #define LOAD_SYM_VERSION(handle, symbol, funcptr, version) do { \ cast = (void**)&funcptr; \ *cast = dlvsym(handle, symbol, version); \ + if (*cast == NULL) { \ + INFO(NCCL_NET, "dlvsym failed on %s - %s version %s", symbol, dlerror(), version); \ + } \ } while (0) LOAD_SYM(mlx5dvhandle, "mlx5dv_is_supported", mlx5dvSymbols->mlx5dv_internal_is_supported); diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 0adb4b137..1766f4167 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -21,7 +21,6 @@ struct ncclStrongStreamCapture { cudaGraph_t graph; unsigned long long graphId; cudaStream_t captureStream; - cudaGraphNode_t lastRecord; void* acquiredBy; }; @@ -216,7 +215,6 @@ ncclResult_t ncclStrongStreamAcquire( CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock); } cap->graphId = graph.graphId; - cap->lastRecord = nullptr; cap->acquiredBy = localThreadId(); // Push to capturing list. cap->next = ss->captureHead; @@ -296,16 +294,6 @@ ncclResult_t ncclStrongStreamRelease( cudaGraphNode_t recordNode; CUDACHECK(cudaGraphAddEventRecordNode(&recordNode, graph.graph, nullptr, 0, ss->serialEvent)); - // Make this record order after previous record on this stream. - if (cap->lastRecord != nullptr) { - #if CUDART_VERSION >= 13000 - CUDACHECK(cudaGraphAddDependencies_v2(graph.graph, &cap->lastRecord, &recordNode, nullptr, 1)); - #else - CUDACHECK(cudaGraphAddDependencies(graph.graph, &cap->lastRecord, &recordNode, 1)); - #endif - } - cap->lastRecord = recordNode; - // Get current nodes from work stream so we can add them as dependencies. cudaStreamCaptureStatus status; cudaGraphNode_t const* nodes; @@ -338,6 +326,22 @@ ncclResult_t ncclStrongStreamRelease( } } + // Make every future operation captured on cap->captureStream depend on 'recordNode'. 
+ #if CUDART_VERSION >= 13000 + CUDACHECK(cudaStreamUpdateCaptureDependencies_v2( + cap->captureStream, + &recordNode, /* dependencies */ + /*edges =*/ nullptr, /* no edge annotations */ + 1, /* count */ + cudaStreamSetCaptureDependencies)); + #else + CUDACHECK(cudaStreamUpdateCaptureDependencies( + cap->captureStream, + &recordNode, + 1, + cudaStreamSetCaptureDependencies)); + #endif + if (cap->acquiredBy != localThreadId() && ncclParamLaunchRaceFatal()) { WARN("%s", launchRaceFatalMsg); return ncclInvalidUsage; diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc index a9c1d0dc0..64c97be39 100644 --- a/src/plugin/plugin_open.cc +++ b/src/plugin/plugin_open.cc @@ -61,20 +61,20 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { char eNoEntNameList[PATH_MAX] = { 0 }; if (libName && strlen(libName)) { - // match names that start with 'lib' and end with '.so' - if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) { - snprintf(libName_, MAX_STR_LEN, "%s", libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); - } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); - } + snprintf(libName_, MAX_STR_LEN, "%s", libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); } else { + INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + } + + // libName can't be a relative or absolute path (start with '.' or contain any '/'). 
It can't be a library name either (start with 'lib' or end with '.so') + if (strchr(libName, '/') == nullptr && (strncmp(libName, "lib", strlen("lib")) || strlen(libName) < strlen(".so") || strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")))) { snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 19a505e1c..40897d93f 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -494,7 +494,9 @@ static int ibvSpeeds[] = { 14000, /* FDR */ 25000, /* EDR */ 50000, /* HDR */ - 100000 /* NDR */ }; + 100000, /* NDR */ + 200000 /* XDR */ +}; static int firstBitSet(int val, int max) { int i = 0; @@ -654,7 +656,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr ibProvider = IB_PROVIDER_MLX5; snprintf(dataDirectDevicePath, PATH_MAX, "/sys"); if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) { - INFO(NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name); + INFO(NCCL_INIT|NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name); if(ncclParamIbDataDirect()) dataDirectSupported = 1; } } From 7c12c627c62ef4e5a2485777a8d9dce58f3f562f Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Fri, 11 Jul 2025 07:32:13 -0700 Subject: [PATCH 13/21] NCCL 2.27.6-1 Improve support for DirectNIC (CX8) * Add support for XDR speed detection. * When DirectNIC is enabled, report only the RDMA interfaces. Extend the P2C (PXN over C2C) support to send/receive operations. Support compilation with GCC 14 (Issues #1743, #1751). Fix the unloading of network plugins that also provide tuner capability. Fix the change of the current device across the calls to ncclCommDestroy() and ncclCommAbort(). A note for users on MNNVL systems: please ensure an adequate stack size for NCCL threads. While the default Linux stack size limit of 8192 KB is known to be sufficient, we've seen crashes if the limit is changed to "unlimited", as it causes the glibc library to unexpectedly *decrease* the stack size of NCCL's background threads to just 2048 KB. Use "ulimit -s" in bash to print the current limit; if needed, reset it to 8192 KB using "ulimit -s 8192" (one also needs to ensure that the new setting is propagated to other nodes when launching a multi-node NCCL job). 
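As a purely illustrative aside (not part of this patch), the minimal C sketch below shows how a thread's stack size can be pinned explicitly with pthread_attr_setstacksize and then verified from inside the thread. The worker function, the file name, and the 8 MB value are assumptions chosen to mirror the 8192 KB guidance above; this is not NCCL code. glibc normally derives the default thread stack size from the RLIMIT_STACK soft limit and falls back to an architecture default when the limit is "unlimited", which is consistent with the 2048 KB behavior described above.

    /* Hypothetical stand-alone example; build with: gcc -pthread stack.c */
    #define _GNU_SOURCE
    #include <pthread.h>
    #include <stdio.h>

    /* Stands in for a library background thread; reports the stack size it actually received. */
    static void* worker(void* arg) {
      (void)arg;
      pthread_attr_t attr;
      size_t stack = 0;
      if (pthread_getattr_np(pthread_self(), &attr) == 0) {
        pthread_attr_getstacksize(&attr, &stack);
        pthread_attr_destroy(&attr);
      }
      printf("worker stack size: %zu KB\n", stack / 1024);
      return NULL;
    }

    int main(void) {
      pthread_t tid;
      pthread_attr_t attr;
      pthread_attr_init(&attr);
      /* Pin the stack explicitly to 8 MB instead of inheriting the rlimit-derived default. */
      pthread_attr_setstacksize(&attr, 8u * 1024 * 1024);
      pthread_create(&tid, &attr, worker, NULL);
      pthread_join(tid, NULL);
      pthread_attr_destroy(&attr);
      return 0;
    }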
--- makefiles/common.mk | 2 +- makefiles/version.mk | 2 +- src/graph/paths.cc | 4 +- src/graph/search.cc | 5 ++- src/graph/topo.h | 2 + src/include/ibvcore.h | 76 ++++++++++++++++++++++++++----------- src/include/plugin/plugin.h | 10 ++++- src/init.cc | 8 +++- src/misc/ibvwrap.cc | 10 ++++- src/misc/socket.cc | 3 +- src/plugin/net.cc | 5 ++- src/plugin/plugin_open.cc | 38 +++++++++---------- src/plugin/profiler.cc | 5 ++- src/plugin/tuner.cc | 5 ++- src/transport/net_ib.cc | 19 +++++++--- 15 files changed, 128 insertions(+), 66 deletions(-) diff --git a/makefiles/common.mk b/makefiles/common.mk index 6ba9bbfce..0f01671b6 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -76,7 +76,7 @@ $(info NVCC_GENCODE is ${NVCC_GENCODE}) ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0) CXXSTD ?= -std=c++17 else - CXXSTD ?= -std=c++11 + CXXSTD ?= -std=c++14 endif CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \ diff --git a/makefiles/version.mk b/makefiles/version.mk index 013e972f3..0f482d31a 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 27 -NCCL_PATCH := 5 +NCCL_PATCH := 6 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 4b44abd01..82c0d9972 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -709,8 +709,8 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm peerNode->paths[GPU][g].type <= PATH_NVL && /* and (3) is on the same node as us */ NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && - /* and (4) has either higher bw to that NIC or avoid going through the CPU*/ - (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > pxnType)) + /* and (4) has either higher bw to that NIC or avoid going through the CPU (path.type is > PATH_PXN)*/ + (peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || gpu->paths[NET][n].type > PATH_PXN)) // We can use that GPU as relay to communicate with that NIC. // Only enabling it in the GPU->NIC direction for now to favor // receiving locally and sending remotely (consistent with net.cc) diff --git a/src/graph/search.cc b/src/graph/search.cc index 67e600906..86199d78b 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -960,7 +960,7 @@ float sm90SpeedArrayInter[] = { 48.0, 45.0, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, #define NSPEEDSINTER_SM90 (sizeof(sm90SpeedArrayInter)/sizeof(float)) float sm100SpeedArrayIntra[] = { 90.0, 80.0, 70.0, 60.0, 50.0, 40.0, 30.0, 24.0, 20.0, 19.0, 18.0 }; -float sm100SpeedArrayInter[] = { 48.0, 45.1, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; +float sm100SpeedArrayInter[] = { 96.0, 48.0, 45.1, 42.0, 40.0, 30.0, 24.0, 22.0, 20.0, 17.5, 15.0, 12.0, 6.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA_SM100 (sizeof(sm100SpeedArrayIntra)/sizeof(float)) #define NSPEEDSINTER_SM100 (sizeof(sm100SpeedArrayInter)/sizeof(float)) @@ -1307,7 +1307,8 @@ ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoG NCCLCHECK(ncclTopoGetLocalGpu(comm->topo, netId, &g2)); if (g2 != -1) { struct ncclTopoNode* peerGpu = comm->topo->nodes[GPU].nodes+g2; - if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= PATH_PXB) { + int pxnType = ncclParamPxnC2c() ? 
PATH_P2C : PATH_PXB; + if (peerGpu->paths[GPU][g1].type <= PATH_NVL && peerGpu->paths[NET][n].type <= pxnType) { *proxyRank = peerGpu->gpu.rank; if (dev) *dev = netDev; if (id) *id = netId; diff --git a/src/graph/topo.h b/src/graph/topo.h index 9b49c0222..9ef10ff2d 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -98,6 +98,8 @@ extern const char* topoLinkTypeStr[]; #define PATH_DIS 11 extern const char* topoPathTypeStr[]; +extern int64_t ncclParamPxnC2c(); + struct ncclTopoNode; struct ncclTopoLink { int type; diff --git a/src/include/ibvcore.h b/src/include/ibvcore.h index 8d8ecf1ec..ae9051f28 100644 --- a/src/include/ibvcore.h +++ b/src/include/ibvcore.h @@ -9,6 +9,7 @@ #include #include #include +#include #if __GNUC__ >= 3 # define __attribute_const __attribute__((const)) @@ -39,7 +40,7 @@ union ibv_gid { #define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) /*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ -//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; +static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; enum ibv_node_type { IBV_NODE_UNKNOWN = -1, @@ -208,7 +209,9 @@ struct ibv_port_attr { uint8_t active_speed; uint8_t phys_state; uint8_t link_layer; - uint8_t reserved; + uint8_t flags; + uint16_t port_cap_flags2; + uint32_t active_speed_ex; }; enum ibv_event_type { @@ -993,37 +996,50 @@ enum verbs_context_mask { struct verbs_context { /* "grows up" - new fields go here */ - int (*_reserved_2) (void); - int (*destroy_flow) (struct ibv_flow *flow); - int (*_reserved_1) (void); - struct ibv_flow * (*create_flow) (struct ibv_qp *qp, - struct ibv_flow_attr *flow_attr); + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr, + size_t port_attr_len); + int (*_reserved[25]) (void); + struct verbs_ex_private *priv; + int (*query_device_ex)(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, + size_t attr_size); + int (*ibv_destroy_flow) (struct ibv_flow *flow); + void (*ABI_placeholder2) (void); /* DO NOT COPY THIS GARBAGE */ + struct ibv_flow * (*ibv_create_flow) (struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr); + void (*ABI_placeholder1) (void); /* DO NOT COPY THIS GARBAGE */ struct ibv_qp * (*open_qp)(struct ibv_context *context, struct ibv_qp_open_attr *attr); struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, struct ibv_qp_init_attr_ex *qp_init_attr_ex); int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); - struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, - struct ibv_srq_init_attr_ex *srq_init_attr_ex); - struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, - struct ibv_xrcd_init_attr *xrcd_init_attr); - int (*close_xrcd)(struct ibv_xrcd *xrcd); - uint64_t has_comp_mask; - size_t sz; /* Must be immediately before struct ibv_context */ - struct ibv_context context;/* Must be last field in the struct */ + struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex); + struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr); + int (*close_xrcd)(struct ibv_xrcd *xrcd); + uint64_t _ABI_placeholder3; + size_t sz; /* Must be immediately before struct ibv_context */ + struct ibv_context context; /* Must be last field in the struct */ }; -/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ 
-/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) +static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) { - return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? - NULL : container_of(ctx, struct verbs_context, context); + if (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED) + return NULL; + + /* open code container_of to not pollute the global namespace */ + return (struct verbs_context *)(((uintptr_t)ctx) - + offsetof(struct verbs_context, + context)); } #define verbs_get_ctx_op(ctx, op) ({ \ - struct verbs_context *_vctx = verbs_get_ctx(ctx); \ - (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ - !_vctx->op) ? NULL : _vctx; })*/ + struct verbs_context *__vctx = verbs_get_ctx(ctx); \ + (!__vctx || (__vctx->sz < sizeof(*__vctx) - offsetof(struct verbs_context, op)) || \ + !__vctx->op) ? NULL : __vctx; }) #define verbs_set_ctx_op(_vctx, op, ptr) ({ \ struct verbs_context *vctx = _vctx; \ @@ -1055,4 +1071,20 @@ struct ibv_ece { uint32_t comp_mask; }; +/** + * ibv_query_port_ex - Get (extended) port properties + */ +static inline int ibv_query_port_ex(struct ibv_context *context, + uint8_t port_num, + struct ibv_port_attr *port_attr) +{ + struct verbs_context *vctx = verbs_get_ctx_op(context, query_port); + + if (vctx) { + return vctx->query_port(context, port_num, port_attr, sizeof(*port_attr)); + } + + return -1; +} + #endif // NCCL_IBV_CORE_H_ diff --git a/src/include/plugin/plugin.h b/src/include/plugin/plugin.h index 7336c34d9..300e436a0 100644 --- a/src/include/plugin/plugin.h +++ b/src/include/plugin/plugin.h @@ -9,10 +9,16 @@ #include "nccl.h" +enum ncclPluginType { + ncclPluginTypeNet, + ncclPluginTypeTuner, + ncclPluginTypeProfiler, +}; + void* ncclOpenNetPluginLib(const char* name); void* ncclOpenTunerPluginLib(const char* name); void* ncclOpenProfilerPluginLib(const char* name); -void* ncclGetNetPluginLib(void); -ncclResult_t ncclClosePluginLib(void* handle); +void* ncclGetNetPluginLib(enum ncclPluginType type); +ncclResult_t ncclClosePluginLib(void* handle, enum ncclPluginType type); #endif diff --git a/src/init.cc b/src/init.cc index 2a57c46c0..af784c02d 100644 --- a/src/init.cc +++ b/src/init.cc @@ -2170,6 +2170,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev)); TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId); + NCCLCHECK(ncclGroupStartInternal()); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks == -1 || comm->cudaDev == -1 || comm->busId == -1) { WARN("comm %p has already been destroyed", comm); @@ -2184,6 +2185,8 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: + ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); return res; fail: goto exit; @@ -2207,6 +2210,7 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) { return ncclSuccess; } + NCCLCHECK(ncclGroupStartInternal()); // Ask anything that might still be running on the device to quit NCCLCHECK(setCommAbortFlags(comm,1)); comm->destroyFlag = 1; @@ -2229,7 +2233,9 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commReclaim, NULL, free, comm), res, fail); exit: - return ncclSuccess; + ncclGroupErrCheck(res); + NCCLCHECK(ncclGroupEndInternal()); + return res; fail: goto exit; } 
diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 23bf5e125..59f52e320 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -142,8 +142,14 @@ ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_devic IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_device, ibv_internal_query_device(context, device_attr), 0, "ibv_query_device"); } -ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port"); +ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { + // First try and query the extended port attributes (e.g. active_speed_ex) + if (ibv_query_port_ex(context, port_num, port_attr) != 0) { + // Fall back to the original attribute API call, but zero all members first + memset(port_attr, 0, sizeof(*port_attr)); + IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port"); + } + return ncclSuccess; } ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) { diff --git a/src/misc/socket.cc b/src/misc/socket.cc index 278fb5c51..d066d2829 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -441,7 +441,8 @@ static ncclResult_t socketTryAccept(struct ncclSocket* sock) { if (sock->fd != -1) { sock->state = ncclSocketStateAccepted; } else if (errno == ENETDOWN || errno == EPROTO || errno == ENOPROTOOPT || errno == EHOSTDOWN || - errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH) { + errno == ENONET || errno == EHOSTUNREACH || errno == EOPNOTSUPP || errno == ENETUNREACH || + errno == EINTR) { /* per accept's man page, for linux sockets, the following errors might be already pending errors * and should be considered as EAGAIN. 
To avoid infinite loop in case of errors, we use the retry count*/ if (++sock->errorRetries == ncclParamRetryCnt()) { diff --git a/src/plugin/net.cc b/src/plugin/net.cc index 78944106a..aa80c12ab 100644 --- a/src/plugin/net.cc +++ b/src/plugin/net.cc @@ -67,7 +67,7 @@ static pthread_once_t initPluginLibsOnceControl = PTHREAD_ONCE_INIT; static ncclResult_t ncclNetPluginUnload(netPluginLib_t* pluginLib) { if ((pluginLib->dlHandle) && ((pluginLib->ncclNetPluginRefCount) == 0)) { INFO(NCCL_INIT|NCCL_NET, "Unloading plugin %s", pluginLib->name); - NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle)); + NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle, ncclPluginTypeNet)); memset(pluginLib, 0, sizeof(netPluginLib_t)); } return ncclSuccess; @@ -105,8 +105,9 @@ static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) { return ncclSuccess; fail: if (pluginLib->dlHandle) { - NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle)); + NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle, ncclPluginTypeNet)); } + pluginLib->dlHandle = nullptr; pluginLib->ncclNetPluginState = ncclNetPluginStateLoadFailed; pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed; goto exit; diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc index 64c97be39..f80321c81 100644 --- a/src/plugin/plugin_open.cc +++ b/src/plugin/plugin_open.cc @@ -10,16 +10,12 @@ #include #include "debug.h" +#include "plugin.h" #define MAX_STR_LEN 255 -enum ncclPluginType { - ncclPluginTypeNet, - ncclPluginTypeTuner, - ncclPluginTypeProfiler, -}; - #define NUM_LIBS 3 +static char* libNames[NUM_LIBS]; static void *libHandles[NUM_LIBS]; static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" }; static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" }; @@ -65,6 +61,7 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + libNames[type] = strdup(libName_); return libHandles[type]; } if (openErr == ENOENT) { @@ -79,6 +76,7 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); + libNames[type] = strdup(libName_); return libHandles[type]; } if (openErr == ENOENT) { @@ -91,6 +89,7 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]); libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { + libNames[type] = strdup(libName_); return libHandles[type]; } if (openErr == ENOENT) { @@ -120,22 +119,21 @@ void* ncclOpenProfilerPluginLib(const char* name) { return openPluginLib(ncclPluginTypeProfiler, name); } -void* ncclGetNetPluginLib(void) { - return libHandles[ncclPluginTypeNet]; +void* ncclGetNetPluginLib(enum ncclPluginType type) { + if (libNames[ncclPluginTypeNet]) { + // increment the reference counter of the net library + libNames[type] = strdup(libNames[ncclPluginTypeNet]); + libHandles[type] = dlopen(libNames[ncclPluginTypeNet], RTLD_NOW | RTLD_LOCAL); + } + return libHandles[type]; } -ncclResult_t ncclClosePluginLib(void* handle) { - bool found = false; - for (int l=0; lname); - NCCLCHECK(ncclClosePluginLib(profilerPluginLib)); + 
NCCLCHECK(ncclClosePluginLib(profilerPluginLib, ncclPluginTypeProfiler)); profilerPluginLib = nullptr; ncclProfiler = nullptr; profilerPluginStatus = profilerPluginLoadReady; diff --git a/src/plugin/tuner.cc b/src/plugin/tuner.cc index 443bf78c4..24a59de2e 100644 --- a/src/plugin/tuner.cc +++ b/src/plugin/tuner.cc @@ -52,7 +52,7 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN")); if (nullptr == tunerPluginLib) { - tunerPluginLib = ncclGetNetPluginLib(); + tunerPluginLib = ncclGetNetPluginLib(ncclPluginTypeTuner); if (nullptr == tunerPluginLib) { goto fail; } @@ -78,6 +78,7 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; fail: + if (tunerPluginLib) NCCLCHECK(ncclClosePluginLib(tunerPluginLib, ncclPluginTypeTuner)); tunerPluginLib = nullptr; status = tunerPluginLoadFailed; goto exit; @@ -87,7 +88,7 @@ ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { pthread_mutex_lock(&tunerPluginLock); if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); - NCCLCHECK(ncclClosePluginLib(tunerPluginLib)); + NCCLCHECK(ncclClosePluginLib(tunerPluginLib, ncclPluginTypeTuner)); tunerPluginLib = nullptr; tunerSymbol = nullptr; comm->tuner = nullptr; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 40897d93f..709e7ad40 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -652,12 +652,15 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr enum ncclIbProvider ibProvider = IB_PROVIDER_NONE; char dataDirectDevicePath[PATH_MAX]; int dataDirectSupported = 0; + int skipNetDevForDataDirect = 0; if (wrap_mlx5dv_is_supported(devices[d])) { ibProvider = IB_PROVIDER_MLX5; snprintf(dataDirectDevicePath, PATH_MAX, "/sys"); if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) { - INFO(NCCL_INIT|NCCL_NET, "Data Direct DMA Interface is detected for device:%s", devices[d]->name); - if(ncclParamIbDataDirect()) dataDirectSupported = 1; + INFO(NCCL_INIT|NCCL_NET, "NET/IB: Data Direct DMA Interface is detected for device:%s", devices[d]->name); + // Now check whether Data Direct has been disabled by the user + if(ncclParamIbDataDirect() == 1) { dataDirectSupported = 1; skipNetDevForDataDirect = 1; } + if(ncclParamIbDataDirect() == 2) { dataDirectSupported = 1; skipNetDevForDataDirect = 0; } } } int nPorts = 0; @@ -669,7 +672,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr continue; } for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) { - for (int dataDirect = 0; dataDirect < 1 + dataDirectSupported; ++dataDirect) { + // dataDirect = 0 exposes the devices normally, dataDirect = 1 exposes the devices through direct NIC + for (int dataDirect = skipNetDevForDataDirect; dataDirect < 1 + dataDirectSupported; ++dataDirect) { struct ibv_port_attr portAttr; if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) { WARN("NET/IB : Unable to query port_num %d", port_num); @@ -690,15 +694,18 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr ncclIbDevs[ncclNIbDevs].portAttr = portAttr; ncclIbDevs[ncclNIbDevs].portNum = port_num; ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; - ncclIbDevs[ncclNIbDevs].speed = 
ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); + if (portAttr.active_speed_ex) + // A non-zero active_speed_ex indicates XDR rate (0x100) or higher + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed_ex) * ncclIbWidth(portAttr.active_width); + else + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); ncclIbDevs[ncclNIbDevs].context = context; ncclIbDevs[ncclNIbDevs].pdRefs = 0; ncclIbDevs[ncclNIbDevs].pd = NULL; if (!dataDirect) { strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); - } - else { + } else { snprintf(ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE, "%s_dma", devices[d]->name); NCCLCHECK(ncclCalloc(&ncclIbDevs[ncclNIbDevs].pciPath, PATH_MAX)); strncpy(ncclIbDevs[ncclNIbDevs].pciPath, dataDirectDevicePath, PATH_MAX); From bfedf2629eae7abbcb7b9bd4841723b21f725636 Mon Sep 17 00:00:00 2001 From: Stephen Sachs Date: Wed, 16 Jul 2025 17:56:12 +0200 Subject: [PATCH 14/21] Add issues templates and Github action to remove stale issues We add 3 different issue types issue/question/RFE and add some predefined questions to speed up the debugging process. We also add a custom action which will close all issues create mode than 6 months ago which have not been updated for more than a month. --- .github/ISSUE_TEMPLATE/ISSUE.yaml | 77 +++++++++++++++++++++++++ .github/ISSUE_TEMPLATE/QUESTION.yaml | 15 +++++ .github/ISSUE_TEMPLATE/RFE.yaml | 22 +++++++ .github/ISSUE_TEMPLATE/config.yml | 1 + .github/workflows/close-old-issues.js | 76 ++++++++++++++++++++++++ .github/workflows/close_old_issues.yaml | 31 ++++++++++ 6 files changed, 222 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/ISSUE.yaml create mode 100644 .github/ISSUE_TEMPLATE/QUESTION.yaml create mode 100644 .github/ISSUE_TEMPLATE/RFE.yaml create mode 100644 .github/ISSUE_TEMPLATE/config.yml create mode 100644 .github/workflows/close-old-issues.js create mode 100644 .github/workflows/close_old_issues.yaml diff --git a/.github/ISSUE_TEMPLATE/ISSUE.yaml b/.github/ISSUE_TEMPLATE/ISSUE.yaml new file mode 100644 index 000000000..f760b305b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/ISSUE.yaml @@ -0,0 +1,77 @@ +name: NCCL issue or bug +description: Report an issue or failure when running NCCL code +title: "[Issue]: " +labels: ["triage"] + +body: + - type: markdown + attributes: + value: | + Thanks for reaching out! Before reporting a new issue, please feel free to search for the behavior in the existing issues. If you found an issue which is already closed or you are unsure, open a new issue and reference the old one from it. + You can also check out the [troubleshooting section](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html) in our user guide. + + --- + + To ensure we can assist you quickly and accurately, we often need the following information: + - type: dropdown + id: type + attributes: + label: How is this issue impacting you? + description: What best describes your issue? + options: + - Lower performance than expected + - Application crash + - Data corruption + - Application hang + validations: + required: true + + - type: textarea + id: log + attributes: + label: Share Your Debug Logs + description: | + + The logs and topo-files are a great tool to pin down issues. You can create them by setting these environment variables before the run. 
+ * `NCCL_DEBUG=INFO` and `NCCL_DEBUG_FILE=ncclDebug.%h.%p` to produce one file per rank + * `NCCL_TOPO_DUMP_FILE=ncclSystem.txt` + + - type: textarea + id: repro + attributes: + label: Steps to Reproduce the Issue + description: | + * **Minimal Steps**: Please provide a simple way to recreate the issue (see [Minimal Bug Reports](https://matthewrocklin.com/minimal-bug-reports) for inspiration). + * **Environment Details**: Include software versions and relevant settings. + * **Intermittency**: Is this a sporadic issue? If so, how often does it occur? + * **Previous Success**: Did this work with an older NCCL version? + + The easier we can reproduce on our side the more likely we are to be able to solve it in a timely manner. + + - type: input + id: nccl_version + attributes: + label: NCCL Version + description: | + NCCL reports its version string in the debug logs. + You can also determine the version if you know which library was used by running `strings libnccl.so | grep 'NCCL version'`. + placeholder: "e.g. 2.27.1+cuda12.8" + validations: + required: true + + - type: textarea + id: platform + attributes: + label: Your platform details + description: | + * **GPU & Network**: Share your architecture and topology (e.g., from `nvidia-smi`, `nvidia-smi topo -m`, `ibstatus`). + * **Environment**: Bare-metal, containers, or cloud? + * **Scalability**: Does this issue occur with a specific number of ranks/nodes? + + - type: textarea + id: issue-description + attributes: + label: Error Message & Behavior + description: | + * **First Error**: What was the initial `NCCL WARN` message in your logs? + * **Expected vs. Actual**: Briefly describe the anticipated behavior versus what you're seeing. diff --git a/.github/ISSUE_TEMPLATE/QUESTION.yaml b/.github/ISSUE_TEMPLATE/QUESTION.yaml new file mode 100644 index 000000000..60e43489f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/QUESTION.yaml @@ -0,0 +1,15 @@ +name: NCCL question +description: Ask the NCCL team a question +title: "[Question]: " +labels: ["question"] + +body: + - type: markdown + attributes: + value: | + Thanks for reaching out! To solve your problem, feel free to check out the [user guide](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html), in particular the troubleshooting section, and also the [release notes](https://docs.nvidia.com/deeplearning/nccl/release-notes/index.html). + --- + - type: textarea + id: question + attributes: + label: Question diff --git a/.github/ISSUE_TEMPLATE/RFE.yaml b/.github/ISSUE_TEMPLATE/RFE.yaml new file mode 100644 index 000000000..7a305abfa --- /dev/null +++ b/.github/ISSUE_TEMPLATE/RFE.yaml @@ -0,0 +1,22 @@ +name: NCCL request for enhancement +description: Request for enhancement +title: "[RFE]: " +labels: ["enhancement"] +body: + - type: markdown + attributes: + value: | + + Thanks for your feedback! Before reporting a new RFE you could quickly check if this already exists in our [existing requests](https://github.com/NVIDIA/nccl/issues?q=sort%3Aupdated-desc%20is%3Aissue%20is%3Aopen%20label%3Aenhancement). + + --- + - type: textarea + id: rfe-description + attributes: + label: Please provide the below details to ensure we understand your needs + description: | + * What is the goal of this request? + * Who will benefit from this feature? + * Is this request for a specific GPU architecture or network infrastructure? + * How will this feature improve current workflows or processes? + * What is the priority level of this request? 
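As a side note for the "NCCL Version" field requested above, the version can also be queried programmatically. A minimal sketch (not part of this patch) using the public `ncclGetVersion` call:

```c
#include <stdio.h>
#include "nccl.h"

int main(void) {
  int version = 0;
  if (ncclGetVersion(&version) != ncclSuccess) return 1;
  // Since NCCL 2.9 the code is encoded as major*10000 + minor*100 + patch.
  printf("NCCL version code: %d (%d.%d.%d)\n", version,
         version / 10000, (version % 10000) / 100, version % 100);
  return 0;
}
```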
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 000000000..3ba13e0ce --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.github/workflows/close-old-issues.js b/.github/workflows/close-old-issues.js new file mode 100644 index 000000000..9605aa023 --- /dev/null +++ b/.github/workflows/close-old-issues.js @@ -0,0 +1,76 @@ +const { Octokit } = require("@octokit/rest"); + +const octokit = new Octokit({ auth: process.env.GITHUB_TOKEN }); + +const owner = process.env.REPO_OWNER; +const repo = process.env.REPO_NAME.split('/').pop(); // Handles owner/repo format + +const now = new Date(); +const sixMonthsAgo = new Date(now); +sixMonthsAgo.setMonth(now.getMonth() - 6); +const oneMonthAgo = new Date(now); +oneMonthAgo.setMonth(now.getMonth() - 1); + +async function closeOldIssues() { + let page = 1; + let closedCount = 0; + + // write a multiline comment into a variable: + let body = `### Issue Cleanup: Helping Us Focus on Current Challenges + +We're [reviewing](https://github.com/NVIDIA/nccl/discussions/1761) older issues to ensure we prioritize the most relevant and active ones. Since this issue hasn't seen updates in over 6 months, we'll be closing it for now. + +*This change helps us focus our efforts on addressing any current issues our users are facing.* If this issue still affects you, please don't hesitate to reopen it with a quick update (e.g., \"Still relevant on [version=X]\"). +Thanks for your understanding and for contributing to NCCL.`; + + while (true) { + const { data: issues } = await octokit.issues.listForRepo({ + owner, + repo, + state: "open", + per_page: 100, + page, + }); + + if (issues.length === 0) break; + + for (const issue of issues) { + // Ignore PRs + if (issue.pull_request) continue; + + const createdAt = new Date(issue.created_at); + const updatedAt = new Date(issue.updated_at); + + if (createdAt < sixMonthsAgo && updatedAt < oneMonthAgo) { + + // Add a comment before closing + await octokit.issues.createComment({ + owner, + repo, + issue_number: issue.number, + body: body, + }); + + await octokit.issues.update({ + owner, + repo, + issue_number: issue.number, + state: "closed", + state_reason: "not_planned", + }); + closedCount++; + console.log(`Closed issue #${issue.number}`); + + // Break out if we have closed 100 issues + if (closedCount >= 100) { + console.log("Closed 100 issues, stopping."); + return; + } + } + } + page++; + } + console.log(`Total closed: ${closedCount}`); +} + +closeOldIssues().catch(console.error); diff --git a/.github/workflows/close_old_issues.yaml b/.github/workflows/close_old_issues.yaml new file mode 100644 index 000000000..15d81cb54 --- /dev/null +++ b/.github/workflows/close_old_issues.yaml @@ -0,0 +1,31 @@ +name: Close Old Issues + +on: + schedule: + - cron: '30 2 * * *' # Runs daily at 02:30 UTC + workflow_dispatch: + +permissions: + issues: write + +jobs: + close-old-issues: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 20 + + - name: Install dependencies + run: npm install @octokit/rest@22.0.0 + + - name: Run close-old-issues script + run: node .github/workflows/close-old-issues.js + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO_OWNER: ${{ github.repository_owner }} + REPO_NAME: ${{ github.event.repository.name || github.repository }} From 0d1ece2b43ba1d85c76746ce63505f6db6b6b2f4 Mon 
Sep 17 00:00:00 2001 From: Stephen Sachs Date: Thu, 17 Jul 2025 21:50:05 +0200 Subject: [PATCH 15/21] Exclude ongoing issues from auto-closing logic - Added a check to skip issues labeled "ongoing" in the close-old-issues script - Adjusted the condition to compare both creation and update dates against six months ago --- .github/workflows/close-old-issues.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/close-old-issues.js b/.github/workflows/close-old-issues.js index 9605aa023..57e110339 100644 --- a/.github/workflows/close-old-issues.js +++ b/.github/workflows/close-old-issues.js @@ -38,10 +38,13 @@ Thanks for your understanding and for contributing to NCCL.`; // Ignore PRs if (issue.pull_request) continue; + // Ignore issues with label "ongoing" + if (issue.labels.some(label => label.name === "ongoing")) continue; + const createdAt = new Date(issue.created_at); const updatedAt = new Date(issue.updated_at); - if (createdAt < sixMonthsAgo && updatedAt < oneMonthAgo) { + if (createdAt < sixMonthsAgo && updatedAt < sixMonthsAgo) { // Add a comment before closing await octokit.issues.createComment({ From 593de54e52679b51428571c13271e2ea9f91b1b1 Mon Sep 17 00:00:00 2001 From: Kamil Iskra Date: Thu, 24 Jul 2025 10:39:53 -0700 Subject: [PATCH 16/21] NCCL 2.27.7-1 Prevent initialization failures in certain configurations when attempting to load fp8-specific symmetric multicast kernels on GPUs older than Blackwell. --- ext-tuner/README.md | 182 +++++++++++++++++++++++++++++++++ ext-tuner/basic/README.md | 197 ++++++++++++++++++++++++++++++++++++ ext-tuner/example/README.md | 3 +- makefiles/version.mk | 2 +- src/enqueue.cc | 5 +- 5 files changed, 382 insertions(+), 7 deletions(-) create mode 100644 ext-tuner/README.md create mode 100644 ext-tuner/basic/README.md diff --git a/ext-tuner/README.md b/ext-tuner/README.md new file mode 100644 index 000000000..67a743a12 --- /dev/null +++ b/ext-tuner/README.md @@ -0,0 +1,182 @@ +# NCCL Tuner Plugin Development + +This directory contains resources and examples for developing NCCL tuner plugins. Tuner plugins allow you to customize NCCL's algorithm and protocol selection behavior to optimize performance for specific workloads and hardware configurations. + +## Overview + +NCCL tuner plugins provide a way to influence NCCL's automatic algorithm and protocol selection by modifying the cost tables that NCCL uses to make decisions. This allows you to: + +- Override default algorithm/protocol combinations for specific collective operations +- Customize tuning based on message size, topology, and other parameters +- Implement sophisticated tuning strategies without recompiling NCCL +- Optimize performance for specific hardware configurations or workloads + +## Tuner Plugin Interface + +NCCL tuner plugins must implement the `ncclTuner_t` interface defined in `nccl_tuner.h` within `nccl/src/include/plugin`. These definitions have been forked to `tuner.h` in each example plugin, and it is expected that any plugin implementor forks the internal NCCL definitions as well. 
The current interface includes: + +```c +// Initialize the tuner plugin +ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + +// Get and modify collective operation cost information +ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels); + +// Clean up plugin resources +ncclResult_t (*destroy)(void* context); +``` + +## Development Guidelines + +### 1. Plugin Structure + +A typical tuner plugin should: +- Include the necessary forked NCCL headers (`tuner.h`) +- Implement all required interface functions +- Export the plugin structure with appropriate version +- Handle all input parameters gracefully + +### 2. Cost Table Modification + +The `getCollInfo` function receives a cost table that maps algorithm/protocol combinations to performance costs. Lower costs indicate preferred combinations. You can: + +- Set costs to `0.0` to make combinations highly preferred +- Set costs to `NCCL_ALGO_PROTO_IGNORE` to disable combinations +- Use relative costs to create preferences between options + +### 3. Channel Management + +The `nChannels` parameter allows you to: +- Set a specific number of channels to use +- Return the original value to preserve NCCL's default behavior +- Implement dynamic channel selection based on message size or topology + +### 4. Error Handling + +Always return appropriate `ncclResult_t` values: +- `ncclSuccess` for successful or ignored operations +- `ncclInternalError` for plugin-specific errors. Returning an error is only advisable on plugin initialization and destruction, as the penalty users can pay for the overhead of a failed plugin call can be immense. +- Other NCCL error codes as appropriate + +## Getting Started + +### Option 1: Start with the Example Plugin + +If you're new to tuner plugin development, start with the `example/` directory: + +```bash +cd example/ +make +``` + +This provides a CSV-based configuration system that you can customize or use as a template. + +## Building and Testing + +### Build Requirements + +- GCC or compatible C compiler +- NCCL headers (included in `nccl/` subdirectories) +- Make + +## Option 2: Use the Basic Plugin + +For more customized tuning needs, you might want to start with a clean baseline. In that case, base off the basic plugin in the `basic/` directory: + +```bash +cd basic/ +make +``` + +### Build Process + +Each plugin directory contains a Makefile: + +```bash +cd basic/ # or example/ +make +``` + +This generates a shared library (`.so` file) that can be loaded by NCCL. + +### Loading the Plugin + +Set the `LD_LIBRARY_PATH` to include your plugin directory: + +```bash +export LD_LIBRARY_PATH=/path/to/your/plugin:$LD_LIBRARY_PATH +``` + +Set `NCCL_TUNER_PLUGIN` to either the plugin name, or the absolute path to the plugin file. Any of the below can work: + +```bash +export NCCL_TUNER_PLUGIN=example +export NCCL_TUNER_PLUGIN=libnccl-tuner-example.so +export NCCL_TUNER_PLUGIN=/path/to/your/plugin/libnccl-tuner-example.so +``` + +NCCL will automatically discover and load the plugin based on the exported symbol names. + +## Advanced Topics + +### Plugin Versioning + +NCCL supports multiple plugin interface versions. 
Make sure your plugin exports the correct version: + +```c +const ncclTuner_v4_t ncclTunerPlugin_v4 = { + .name = "YourPluginName", + .init = yourInitFunction, + .getCollInfo = yourGetCollInfoFunction, + .destroy = yourDestroyFunction +}; +``` + +### Multi-GPU and Multi-Node Considerations + +Your plugin receives topology information (`nRanks`, `nNodes`) during initialization. Use this to: +- Implement topology-aware tuning strategies +- Handle single-node vs. multi-node optimizations differently +- Scale channel counts based on available hardware + +### Performance Optimization + +- Keep plugin logic lightweight to avoid impacting NCCL performance +- Cache expensive computations when possible +- Use the logging system for debugging but avoid excessive output in production + +## Debugging and Logging + +Use NCCL's debug logging system: + +```bash +export NCCL_DEBUG=INFO # General information +export NCCL_DEBUG_SUBSYS=TUNING +``` + +Within your plugin, use the provided `ncclDebugLogger_t` function for consistent logging. + +## Best Practices + +1. **Test thoroughly**: Verify your plugin works with various message sizes and topologies +2. **Handle edge cases**: Ensure your plugin behaves correctly with unusual input parameters +3. **Document your approach**: Clearly document your tuning strategy and configuration options +4. **Version your plugin**: Use meaningful version numbers and maintain backward compatibility +5. **Performance validation**: Measure the impact of your tuning decisions on real workloads + +## Contributing + +When developing new tuner plugins: +- Follow the existing code style and structure +- Include comprehensive documentation +- Add example configurations and test cases +- Consider contributing useful plugins back to the community + +## Resources + +- [NCCL Documentation](https://docs.nvidia.com/deeplearning/nccl/) +- Example plugin implementations in this directory + +For questions and support, refer to the NCCL community resources and documentation. \ No newline at end of file diff --git a/ext-tuner/basic/README.md b/ext-tuner/basic/README.md new file mode 100644 index 000000000..acc6d5545 --- /dev/null +++ b/ext-tuner/basic/README.md @@ -0,0 +1,197 @@ +# Basic NCCL Tuner Plugin + +This directory contains a minimal placeholder implementation of an NCCL tuner plugin. It serves as a starting point for developing custom tuner plugins by providing the essential function stubs and interface structure required by NCCL. 
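Tying together the topology and context suggestions from the parent README, a minimal, hypothetical sketch of that wiring could look as follows; only the function signatures come from `tuner.h`, while the struct and function names are illustrative:

```c
#include <stdlib.h>
#include "tuner.h"

// Hypothetical per-communicator state recorded at init time.
struct tunerCtx {
  size_t nRanks;
  size_t nNodes;
  ncclDebugLogger_t log;
};

static ncclResult_t exampleInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void** context) {
  struct tunerCtx* ctx = calloc(1, sizeof(*ctx));
  if (ctx == NULL) return ncclInternalError;
  ctx->nRanks = nRanks; ctx->nNodes = nNodes; ctx->log = logFunction;
  *context = ctx;
  return ncclSuccess;
}

static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
                                       int numPipeOps, float** collCostTable, int numAlgo, int numProto,
                                       int regBuff, int* nChannels) {
  struct tunerCtx* ctx = (struct tunerCtx*)context;
  float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
  // Example topology-aware rule: on a single node, prefer RING+SIMPLE and keep
  // NCCL's default channel count (nChannels left untouched).
  if (ctx->nNodes == 1 && NCCL_ALGO_RING < numAlgo && NCCL_PROTO_SIMPLE < numProto) {
    table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0;
  }
  return ncclSuccess;
}

static ncclResult_t exampleDestroy(void* context) { free(context); return ncclSuccess; }
```

The same context could later hold file-based configuration or cached decisions, as discussed in the sections below.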
+ +## Purpose + +This basic plugin is designed to: +- Provide a minimal working example of the NCCL tuner plugin interface +- Serve as a template for developing custom tuner plugins +- Demonstrate the required function signatures and structure +- Implement placeholder functionality that can be extended + + +## Implementation Details + +The plugin implements the following functions: + +### `pluginInit` +```c +ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) +``` +- **Purpose**: Initialize the plugin with communicator information +- **Current Implementation**: Simple placeholder that returns success +- **Parameters**: + - `nRanks`: Total number of ranks in the communicator + - `nNodes`: Total number of nodes in the communicator + - `logFunction`: NCCL debug logging function + - `context`: Plugin context pointer (output) + +### `pluginGetCollInfo` +```c +ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels) +``` +- **Purpose**: Modify cost tables for collective operations +- **Current Implementation**: + - Sets RING+SIMPLE algorithm to cost 0.0 (highest preference) + - Sets channel count to 1 +- **Parameters**: + - `context`: Plugin context from init + - `collType`: Type of collective operation + - `nBytes`: Message size in bytes + - `numPipeOps`: Number of pipeline operations + - `collCostTable`: Cost table to modify + - `numAlgo`: Number of algorithms + - `numProto`: Number of protocols + - `regBuff`: Whether buffer can be registered + - `nChannels`: Number of channels to use (output) + +### `pluginDestroy` +```c +ncclResult_t pluginDestroy(void* context) +``` +- **Purpose**: Clean up plugin resources +- **Current Implementation**: Simple placeholder that returns success + +## Cost Table Structure + +The plugin demonstrates how to modify NCCL's cost tables: + +```c +float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable; +``` + +The cost table is a 2D array where: +- First dimension: Algorithm index (e.g., `NCCL_ALGO_RING`) +- Second dimension: Protocol index (e.g., `NCCL_PROTO_SIMPLE`) +- Values: Cost for that algorithm/protocol combination + +### Cost Values +- **0.0**: Highest preference (lowest cost) +- **Positive values**: Relative costs (lower is better) +- **`NCCL_ALGO_PROTO_IGNORE`**: Disable this combination + +## Building + +```bash +make +``` + +This creates `libnccl-tuner-basic.so` which can be loaded by NCCL. + +## Usage + +### Loading the Plugin + +```bash +export LD_LIBRARY_PATH=/path/to/basic:$LD_LIBRARY_PATH +mpirun -np 4 your_nccl_application +``` + +```bash +export NCCL_TUNER_PLUGIN=basic +export NCCL_TUNER_PLUGIN=libnccl-tuner-basic.so +export NCCL_TUNER_PLUGIN=/path/to/your/plugin/libnccl-tuner-basic.so +``` + +### Verifying Plugin Loading + +Enable NCCL debug output to see if the plugin is loaded: + +```bash +export NCCL_DEBUG=INFO +``` + +You should see messages indicating the tuner plugin is being used. + +## Extending the Plugin + +This basic plugin provides a foundation that you can extend: + +### 1. 
Add Configuration Logic + +Modify `pluginGetCollInfo` to implement your tuning strategy: + +```c +__hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels) { + // Your custom tuning logic here + if (nBytes < 1024) { + // Small message optimization + table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = 0.0; + } else { + // Large message optimization + table[NCCL_ALGO_RING][NCCL_PROTO_LL128] = 0.0; + } + + // Dynamic channel selection + *nChannels = (nBytes > 1024*1024) ? 4 : 1; + + return ncclSuccess; +} +``` + +### 2. Add Context Management + +Use the context pointer to store plugin state: + +```c +struct pluginContext { + int initialized; + size_t nRanks; + size_t nNodes; + // Add your plugin-specific data here +}; +``` + +### 3. Add File-Based Configuration + +Read configuration from files, environment variables, or other sources. + +### 4. Add Topology Awareness + +Use the `nRanks` and `nNodes` parameters to implement topology-specific tuning. + +## File Structure + +``` +basic/ +├── README.md # This file +├── plugin.c # Plugin implementation +├── Makefile # Build configuration +└── nccl/ # NCCL header files + └── tuner.h # Tuner plugin interface definitions +``` + +## Next Steps + +1. **Understand the Interface**: Study the function signatures and parameters +2. **Implement Your Logic**: Add your tuning strategy to `pluginGetCollInfo` +3. **Test Thoroughly**: Verify your plugin works with different message sizes and topologies +4. **Add Error Handling**: Implement proper error checking and resource management +5. **Document Your Changes**: Update this README with your specific implementation details + +## Comparison with Example Plugin + +- **Basic Plugin**: Minimal implementation, good for learning and simple use cases +- **Example Plugin**: Full-featured CSV-based configuration system, good for production use + +Choose the basic plugin if you want to: +- Learn the tuner plugin interface +- Implement simple, hardcoded tuning strategies +- Build a custom plugin from scratch + +Choose the example plugin if you want: +- File-based configuration +- Complex tuning strategies +- Production-ready features + +## Resources + +- [Parent Directory README](../README.md) - General tuner plugin development guide +- [Example Plugin](../example/README.md) - Fully featured implementation + +This basic plugin provides the foundation you need to start developing custom NCCL tuner plugins. Extend it with your specific tuning logic and requirements. diff --git a/ext-tuner/example/README.md b/ext-tuner/example/README.md index 7f472ae7a..10a99b5f2 100644 --- a/ext-tuner/example/README.md +++ b/ext-tuner/example/README.md @@ -104,7 +104,6 @@ Set the `NCCL_TUNER_CONFIG_FILE` environment variable to specify the config file ```bash export NCCL_TUNER_CONFIG_FILE=/path/to/your/tuner.conf -export LD_LIBRARY_PATH=/path/to/plugin:$LD_LIBRARY_PATH mpirun -np 4 your_nccl_application ``` @@ -158,7 +157,7 @@ When channels is set to `-1`, NCCL's default channel selection logic is preserve 1. **Config file not found**: Check the file path and permissions 2. **Configurations not applied**: Verify the collective type, size ranges, algorithm/protocol names, and topology parameters -3. **Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory +3. 
**Plugin not loaded**: Ensure `LD_LIBRARY_PATH` includes the plugin directory and that `NCCL_TUNER_PLUGIN` either specifies the plugin name, or an absolute path to the plugin shared library. 4. **No effect on performance**: Check that NCCL is actually using the tuner plugin with `NCCL_DEBUG=INFO` 5. **Topology mismatch**: Verify that nNodes and nRanks match your actual setup, or use -1 for wildcards 6. **CSV parsing errors**: Ensure no spaces after commas, or quote fields containing spaces diff --git a/makefiles/version.mk b/makefiles/version.mk index 0f482d31a..3b182d61b 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 27 -NCCL_PATCH := 6 +NCCL_PATCH := 7 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/enqueue.cc b/src/enqueue.cc index f5b43724c..225a4cffc 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -38,12 +38,9 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma if (fn == nullptr) continue; cudaError_t errcode = cudaFuncGetAttributes(&attr, fn); - if (errcode == cudaErrorNoKernelImageForDevice) continue; - CUDACHECKGOTO(errcode, result, ignore0); - + if (errcode != cudaSuccess) continue; // Silently ignore failures if (maxStackSize) { if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; - ignore0:; } if (carveout) { CUDACHECKGOTO(cudaFuncSetAttribute(fn, From f1308997d0420148b1be1c24d63f19d902ae589b Mon Sep 17 00:00:00 2001 From: Mark Santesson Date: Tue, 2 Sep 2025 13:21:14 -0700 Subject: [PATCH 17/21] NCCL 2.28.3-1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device API (Experimental) * Introduces device-side APIs to integrate NCCL communication directly into application kernels. * Supports LSA (Load/Store Access) for CUDA P2P communication over NVLink and some PCIe platforms. * Supports Multimem for hardware multicast using NVLink SHARP. * Adds initial framework for GIN (GPU-Initiated Networking), currently under development. * Introduces device communicators created using ncclDevCommCreate. * Enables device-side communication operations with synchronization (ncclLsaBarrierSession) and memory accessors (ncclGetLsaPointer, ncclGetLsaMultimemPointer). * Experimental APIs - signatures and functionality may evolve in future releases. * No ABI compatibility is guaranteed — applications must be recompiled with each new NCCL release. Symmetric memory improvements * Support for aggregating symmetric operations using ncclGroupStart/End APIs. * Reimplement symmetric kernels using device API. New Host APIs * Introduce new host collective APIs: ncclAlltoAll, ncclScatter, ncclGather. CE (Copy Engine) Collectives * Reduce SM utilization for alltoall, scatter, gather, and allgather within a single (MN)NVL domain. * Free up SM capacity for the application to do computation at the same time. * To enable the feature for ncclAllGather, ncclAlltoAll, ncclGather, ncclScatter, register buffers into symmetric windows and use the NCCL_CTA_POLICY_ZERO flag in the communicator config_t. NCCL Inspector Plugin * Introduces an Inspector plugin for always-on performance monitoring. * Produces structured JSON output with metadata, execution time, bandwidth, and optional event traces for each NCCL operation. * Enables integration with analysis tools such as Performance Exporter to visualize NCCL performance bottlenecks. * Lightweight to enable via environment variables NCCL_PROFILER_PLUGIN and NCCL_INSPECTOR_ENABLE. 
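As an illustration of the CE collectives item above (a sketch only, not code from this release: the ncclConfig_t field name CTAPolicy and the NCCL_WIN_COLL_SYMMETRIC window flag are assumptions here), enabling the zero-CTA path for ncclAllGather could look roughly like:

```c
#include "nccl.h"

// Sketch: create a communicator requesting the zero-CTA policy and register the
// send/recv buffers into symmetric windows so the copy-engine path can be used.
ncclResult_t setupCeAllGather(ncclUniqueId id, int nranks, int rank,
                              void* sendbuf, void* recvbuf, size_t sendBytes,
                              ncclComm_t* comm, ncclWindow_t* sendWin, ncclWindow_t* recvWin) {
  ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
  config.CTAPolicy = NCCL_CTA_POLICY_ZERO;  // assumed field name for the flag named in these notes
  ncclResult_t res = ncclCommInitRankConfig(comm, nranks, id, rank, &config);
  if (res != ncclSuccess) return res;
  res = ncclCommWindowRegister(*comm, sendbuf, sendBytes, sendWin, NCCL_WIN_COLL_SYMMETRIC);
  if (res != ncclSuccess) return res;
  res = ncclCommWindowRegister(*comm, recvbuf, sendBytes * nranks, recvWin, NCCL_WIN_COLL_SYMMETRIC);
  return res;
}
```

With this setup, subsequent ncclAllGather calls on the registered buffers can be served by copy engines rather than SMs, per the description above.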
CMake support (Experiemental) * Adds a CMake build system as an alternative to existing Makefiles. * Known issues: pkg.build and Device API currently do not work with CMake. * The known issues will be addressed in a future release. Decreased max CTA count from 32 to 16 on Blackwell * SM overhead is decreased by 50% with this improvement. * This may cause some perf drop on Blackwell because of the reduced SM usage. * If the extra SM capacity is not desired, two options are available to restore to previous behavior: 1) Setting NCCL_MIN_CTAS=32 NCCL_MAX_CTAS=32 environment variables; 2) setting communicator config to over-write max CTA count to 32. * Based on community feedback, future versions may consider different trade-offs between performance and SM overhead. Plugins * Network * App-aware Network plugin. NCCL passes information about communication operations to be executed on the network end point. This allows for better tuning of network end points and their use in the plugins. * Improve handling of physical and virtual network devices and load/unload. * Network plugin version 11 - add explicit context and communication ID support for per communicator init/finalize. * Add Multi-Request Net API. Using this will help NCCL to anticipate multiple send/recv requests and optimize for it. See maxMultiRequestSize field in ncclNetProperties_v11_t. * Profiler * Add support for API events (group, collective, and p2p) and for tracking kernel launches in the profiler plugin. * Add Inspector Profiler Plugin (see section above). * Add a hook to Google’s CoMMA profiler on github. * Tuner * Expose NCCL tuning constants at tuner initialization via ncclTunerConstants_v5_t. * Add NVL Domain Information API. * Support multiple plugin types from a single shared object. New Parameterization and ncclConfig changes: * Add new option NCCL_MNNVL_CLIQUE_ID=-2 which will use rack serial number to partition the MNNVL clique. This will limit NVLink domains to GPUs within a single rack. * Add NCCL_NETDEVS_POLICY to control how NET devices are assigned to GPUs. The default (AUTO) is the policy used in previous versions. * Add NCCL_SINGLE_PROC_MEM_REG_ENABLE control variable to enable NVLS UB registration in the “one process, multiple ranks” case as opt in. * Move nChannelsPerNetPeer into ncclConfig. NCCL_NCHANNELS_PER_NET_PEER can override the value in ncclConfig. * Enable PxN over C2C by default * PxN over C2C will improve performance for Grace-Blackwell platforms by allowing NCCL to leverage the NIC attached to a peer GPU over NVLINK, C2C, and PCIe. * This behavior can be overridden by setting NCCL_PXN_C2C=0. Other Improvements: * Allow FP8 support for non-reductive operations on pre sm90 devices. (See https://github.com/pytorch/pytorch/pull/151594#discussion_r2135777776) * Fix NVLS+CollNet and temporarily disables COLLNET_CHAIN for >8 GPUs. * Only consider running interfaces for socket traffic. NCCL will not attempt to use interfaces that do not have the IFF_RUNNING bit. (https://github.com/NVIDIA/nccl/issues/1798) * Modernize mutex management. Convert to std::mutex and std::lock_guard. * Remove sm35 and sm50 GENCODE targets which have long been deprecated and were causing issues with the latest NCCL release builds. * Improved NVLS/NVLSTree tuning prediction to improve algorithm and protocol selection. * NVLSTree Tuning Fixes. Update tuning data for H100, GB200-NV72. * Respond better to RoCE link flaps. Instead of reporting an “unknown event” it will now report “GID table changed”. 
* Move libvirt bridge interface to the end of possible interfaces so that they are considered last. These interfaces are usually virtual bridges to relay traffic to containers running on the host and cannot be used for traffic to a remote node and are therefore unsuitable. --- ext-net/README.md | 25 +- ext-net/example/CMakeLists.txt | 19 + ext-net/example/nccl/net.h | 10 +- ext-net/example/nccl/net_device.h | 5 +- ext-net/example/nccl/net_v10.h | 3 +- ext-net/example/nccl/net_v11.h | 120 ++ ext-net/example/nccl/net_v9.h | 3 +- ext-net/example/plugin.c | 101 +- ext-profiler/README.md | 119 +- ext-profiler/example/CMakeLists.txt | 34 + ext-profiler/example/Makefile | 16 +- ext-profiler/example/README.md | 180 +- ext-profiler/example/event.c | 30 - ext-profiler/example/event.h | 155 +- ext-profiler/example/nccl/profiler.h | 33 +- ext-profiler/example/nccl/profiler_v5.h | 152 ++ ext-profiler/example/{plugin.c => plugin.cc} | 264 ++- ext-profiler/example/plugin.h | 5 +- .../example/{print_event.c => print_event.cc} | 99 +- ext-profiler/example/queue.h | 50 + ext-profiler/google-CoMMA/Makefile | 22 + ext-profiler/inspector/Makefile | 62 + ext-profiler/inspector/README.md | 216 +++ .../inspector/exporter/example/README.md | 151 ++ .../exporter/example/perf_summary_exporter.py | 548 ++++++ .../exporter/example/requirements.txt | 6 + ext-profiler/inspector/inspector.cc | 1530 +++++++++++++++++ ext-profiler/inspector/inspector.h | 198 +++ ext-profiler/inspector/inspector_plugin.cc | 493 ++++++ ext-profiler/inspector/json.cc | 496 ++++++ ext-profiler/inspector/json.h | 83 + ext-profiler/inspector/nccl/common.h | 73 + ext-profiler/inspector/nccl/profiler.h | 85 + ext-profiler/inspector/nccl/profiler_net.h | 19 + ext-profiler/inspector/nccl/profiler_v1.h | 112 ++ ext-profiler/inspector/nccl/profiler_v2.h | 108 ++ ext-profiler/inspector/nccl/profiler_v3.h | 116 ++ ext-profiler/inspector/nccl/profiler_v4.h | 127 ++ ext-profiler/inspector/nccl/profiler_v5.h | 151 ++ ext-profiler/inspector/nccl/types.h | 21 + ext-profiler/inspector/version.h | 12 + ext-tuner/README.md | 2 +- ext-tuner/example/.gitignore | 49 + ext-tuner/example/CMakeLists.txt | 26 + ext-tuner/example/nccl/tuner.h | 51 +- ext-tuner/example/plugin.c | 36 +- ext-tuner/example/test/test_plugin.c | 178 +- makefiles/common.mk | 7 +- makefiles/version.mk | 4 +- pkg/Makefile | 2 +- pkg/debian/libnccl-dev.install.in | 2 +- pkg/redhat/nccl.spec.in | 4 +- pkg/srctxz/Makefile | 2 +- pkg/srctxz/create_srctxz.sh.in | 28 +- src/CMakeLists.txt | 180 ++ src/Makefile | 20 +- src/allocator.cc | 396 ++++- src/bootstrap.cc | 25 +- src/ce_coll.cc | 615 +++++++ src/collectives.cc | 42 + src/debug.cc | 46 +- src/dev_runtime.cc | 995 +++++++++++ src/device/CMakeLists.txt | 60 + src/device/Makefile | 8 +- src/device/common.h | 14 +- src/device/generate.py | 18 +- src/device/symmetric/all_gather.cuh | 260 +-- src/device/symmetric/all_reduce.cuh | 353 ++-- src/device/symmetric/generate.py | 62 +- src/device/symmetric/kernel.cuh | 24 +- src/device/symmetric/primitives.cuh | 453 +---- src/device/symmetric/reduce_scatter.cuh | 265 +-- src/enqueue.cc | 590 ++++--- src/graph/CMakeLists.txt | 14 + src/graph/connect.cc | 40 +- src/graph/paths.cc | 15 +- src/graph/topo.cc | 375 ++-- src/graph/topo.h | 24 +- src/graph/tuning.cc | 175 +- src/graph/xml.cc | 44 +- src/graph/xml.h | 7 + src/group.cc | 213 ++- src/include/allocator.h | 52 +- src/include/bitops.h | 29 +- src/include/ce_coll.h | 76 + src/include/channel.h | 7 +- src/include/coll_net.h | 3 +- 
src/include/collectives.h | 8 +- src/include/comm.h | 55 +- src/include/core.h | 1 + src/include/cpuset.h | 95 +- src/include/cudawrap.h | 6 + src/include/debug.h | 26 +- src/include/dev_runtime.h | 92 + src/include/device.h | 24 +- src/include/graph.h | 2 + src/include/group.h | 1 + src/include/nccl_common.h | 26 +- src/include/nccl_device.h | 15 + src/include/nccl_device/README.md | 32 + src/include/nccl_device/comm.h | 10 + src/include/nccl_device/coop.h | 152 ++ src/include/nccl_device/core.h | 150 ++ src/include/nccl_device/impl/comm__funcs.h | 10 + src/include/nccl_device/impl/comm__types.h | 40 + src/include/nccl_device/impl/core__funcs.h | 210 +++ src/include/nccl_device/impl/core__types.h | 26 + src/include/nccl_device/impl/ll_a2a__funcs.h | 229 +++ src/include/nccl_device/impl/ll_a2a__types.h | 37 + .../nccl_device/impl/mem_barrier__funcs.h | 126 ++ .../nccl_device/impl/mem_barrier__types.h | 46 + src/include/nccl_device/impl/ptr__funcs.h | 157 ++ src/include/nccl_device/impl/ptr__types.h | 11 + src/include/nccl_device/ll_a2a.h | 53 + src/include/nccl_device/mem_barrier.h | 35 + src/include/nccl_device/ptr.h | 61 + src/include/nccl_device/utility.h | 352 ++++ src/include/net.h | 6 + src/include/net_device.h | 5 +- src/include/nvmlwrap.h | 21 + src/include/nvtx.h | 108 +- src/include/nvtx3/nvToolsExtCounters.h | 2 +- .../nvtx3/nvToolsExtSemanticsCounters.h | 2 +- src/include/nvtx3/nvToolsExtSemanticsScope.h | 2 +- .../nvtx3/nvtxDetail/nvtxExtHelperMacros.h | 2 +- src/include/nvtx3/nvtxDetail/nvtxExtImpl.h | 2 +- .../nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h | 2 +- .../nvtxDetail/nvtxExtPayloadHelperInternal.h | 2 +- .../nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h | 2 +- src/include/nvtx3/nvtxDetail/nvtxExtTypes.h | 2 +- src/include/nvtx_payload_schemas.h | 23 + src/include/plugin/nccl_net.h | 23 +- src/include/plugin/nccl_profiler.h | 32 +- src/include/plugin/nccl_tuner.h | 41 +- src/include/plugin/net/net_v10.h | 4 +- src/include/plugin/net/net_v11.h | 188 ++ src/include/plugin/net/net_v9.h | 4 +- src/include/plugin/plugin.h | 2 + src/include/plugin/profiler/profiler_v5.h | 151 ++ src/include/plugin/tuner/tuner_v5.h | 87 + src/include/profiler.h | 36 + src/include/proxy.h | 11 + src/include/register.h | 24 +- src/include/register_inline.h | 17 +- src/include/scheduler.h | 17 + src/include/shm.h | 6 + src/include/shmutils.h | 4 +- src/include/sym_kernels.h | 112 ++ src/include/symmetric.h | 90 - src/include/transport.h | 12 +- src/include/utils.h | 2 + src/init.cc | 201 ++- src/init_nvtx.cc | 13 + src/misc/CMakeLists.txt | 20 + src/misc/cudawrap.cc | 17 +- src/misc/gdrwrap.cc | 5 +- src/misc/ibvwrap.cc | 5 +- src/misc/mlx5dvwrap.cc | 30 +- src/misc/nvmlwrap.cc | 17 + src/misc/param.cc | 10 +- src/misc/shmutils.cc | 45 +- src/misc/socket.cc | 8 +- src/misc/strongstream.cc | 9 +- src/misc/utils.cc | 30 +- src/mnnvl.cc | 6 +- src/nccl.h.in | 59 +- src/nccl_device/CMakeLists.txt | 9 + src/nccl_device/core.cc | 57 + src/nccl_device/ll_a2a.cc | 26 + src/nccl_device/mem_barrier.cc | 21 + src/plugin/CMakeLists.txt | 18 + src/plugin/net.cc | 125 +- src/plugin/net/CMakeLists.txt | 12 + src/plugin/net/net_v10.cc | 187 +- src/plugin/net/net_v11.cc | 31 + src/plugin/net/net_v6.cc | 68 +- src/plugin/net/net_v7.cc | 67 +- src/plugin/net/net_v8.cc | 67 +- src/plugin/net/net_v9.cc | 117 +- src/plugin/plugin_open.cc | 66 +- src/plugin/profiler.cc | 297 +++- src/plugin/profiler/CMakeLists.txt | 11 + src/plugin/profiler/profiler_v1.cc | 16 +- src/plugin/profiler/profiler_v2.cc | 16 +- 
src/plugin/profiler/profiler_v3.cc | 16 +- src/plugin/profiler/profiler_v4.cc | 104 +- src/plugin/profiler/profiler_v5.cc | 21 + src/plugin/tuner.cc | 30 +- src/plugin/tuner/CMakeLists.txt | 10 + src/plugin/tuner/tuner_v2.cc | 14 +- src/plugin/tuner/tuner_v3.cc | 12 +- src/plugin/tuner/tuner_v4.cc | 22 +- src/plugin/tuner/tuner_v5.cc | 21 + src/proxy.cc | 68 +- src/ras/CMakeLists.txt | 11 + src/ras/ras.cc | 8 +- src/register/CMakeLists.txt | 9 + src/register/coll_reg.cc | 9 +- src/register/register.cc | 113 -- src/register/sendrecv_reg.cc | 6 + src/scheduler/CMakeLists.txt | 7 + src/scheduler/symmetric_sched.cc | 235 +++ src/{symmetric.cc => sym_kernels.cc} | 179 +- src/transport/CMakeLists.txt | 15 + src/transport/coll_net.cc | 62 +- src/transport/generic.cc | 6 + src/transport/net.cc | 149 +- src/transport/net_ib.cc | 281 +-- src/transport/net_socket.cc | 70 +- src/transport/nvls.cc | 136 +- src/transport/p2p.cc | 90 +- src/transport/profiler.cc | 2 +- 212 files changed, 15532 insertions(+), 2935 deletions(-) create mode 100644 ext-net/example/CMakeLists.txt create mode 100644 ext-net/example/nccl/net_v11.h create mode 100644 ext-profiler/example/CMakeLists.txt delete mode 100644 ext-profiler/example/event.c create mode 100644 ext-profiler/example/nccl/profiler_v5.h rename ext-profiler/example/{plugin.c => plugin.cc} (68%) rename ext-profiler/example/{print_event.c => print_event.cc} (76%) create mode 100644 ext-profiler/example/queue.h create mode 100644 ext-profiler/google-CoMMA/Makefile create mode 100644 ext-profiler/inspector/Makefile create mode 100644 ext-profiler/inspector/README.md create mode 100644 ext-profiler/inspector/exporter/example/README.md create mode 100644 ext-profiler/inspector/exporter/example/perf_summary_exporter.py create mode 100644 ext-profiler/inspector/exporter/example/requirements.txt create mode 100644 ext-profiler/inspector/inspector.cc create mode 100644 ext-profiler/inspector/inspector.h create mode 100644 ext-profiler/inspector/inspector_plugin.cc create mode 100644 ext-profiler/inspector/json.cc create mode 100644 ext-profiler/inspector/json.h create mode 100644 ext-profiler/inspector/nccl/common.h create mode 100644 ext-profiler/inspector/nccl/profiler.h create mode 100644 ext-profiler/inspector/nccl/profiler_net.h create mode 100644 ext-profiler/inspector/nccl/profiler_v1.h create mode 100644 ext-profiler/inspector/nccl/profiler_v2.h create mode 100644 ext-profiler/inspector/nccl/profiler_v3.h create mode 100644 ext-profiler/inspector/nccl/profiler_v4.h create mode 100644 ext-profiler/inspector/nccl/profiler_v5.h create mode 100644 ext-profiler/inspector/nccl/types.h create mode 100644 ext-profiler/inspector/version.h create mode 100644 ext-tuner/example/.gitignore create mode 100644 ext-tuner/example/CMakeLists.txt create mode 100644 src/CMakeLists.txt create mode 100644 src/ce_coll.cc create mode 100644 src/dev_runtime.cc create mode 100644 src/device/CMakeLists.txt create mode 100644 src/graph/CMakeLists.txt create mode 100644 src/include/ce_coll.h create mode 100644 src/include/dev_runtime.h create mode 100644 src/include/nccl_device.h create mode 100644 src/include/nccl_device/README.md create mode 100644 src/include/nccl_device/comm.h create mode 100644 src/include/nccl_device/coop.h create mode 100644 src/include/nccl_device/core.h create mode 100644 src/include/nccl_device/impl/comm__funcs.h create mode 100644 src/include/nccl_device/impl/comm__types.h create mode 100644 src/include/nccl_device/impl/core__funcs.h create mode 100644 
src/include/nccl_device/impl/core__types.h create mode 100644 src/include/nccl_device/impl/ll_a2a__funcs.h create mode 100644 src/include/nccl_device/impl/ll_a2a__types.h create mode 100644 src/include/nccl_device/impl/mem_barrier__funcs.h create mode 100644 src/include/nccl_device/impl/mem_barrier__types.h create mode 100644 src/include/nccl_device/impl/ptr__funcs.h create mode 100644 src/include/nccl_device/impl/ptr__types.h create mode 100644 src/include/nccl_device/ll_a2a.h create mode 100644 src/include/nccl_device/mem_barrier.h create mode 100644 src/include/nccl_device/ptr.h create mode 100644 src/include/nccl_device/utility.h create mode 100644 src/include/plugin/net/net_v11.h create mode 100644 src/include/plugin/profiler/profiler_v5.h create mode 100644 src/include/plugin/tuner/tuner_v5.h create mode 100644 src/include/scheduler.h create mode 100644 src/include/sym_kernels.h delete mode 100644 src/include/symmetric.h create mode 100644 src/misc/CMakeLists.txt create mode 100644 src/nccl_device/CMakeLists.txt create mode 100644 src/nccl_device/core.cc create mode 100644 src/nccl_device/ll_a2a.cc create mode 100644 src/nccl_device/mem_barrier.cc create mode 100644 src/plugin/CMakeLists.txt create mode 100644 src/plugin/net/CMakeLists.txt create mode 100644 src/plugin/net/net_v11.cc create mode 100644 src/plugin/profiler/CMakeLists.txt create mode 100644 src/plugin/profiler/profiler_v5.cc create mode 100644 src/plugin/tuner/CMakeLists.txt create mode 100644 src/plugin/tuner/tuner_v5.cc create mode 100644 src/ras/CMakeLists.txt create mode 100644 src/register/CMakeLists.txt create mode 100644 src/scheduler/CMakeLists.txt create mode 100644 src/scheduler/symmetric_sched.cc rename src/{symmetric.cc => sym_kernels.cc} (52%) create mode 100644 src/transport/CMakeLists.txt diff --git a/ext-net/README.md b/ext-net/README.md index 90fe89bf5..8bcaf3096 100644 --- a/ext-net/README.md +++ b/ext-net/README.md @@ -60,36 +60,36 @@ of newer ones. The `nccl/` directory is populated with `net_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v10) +# API (v11) -Below is the main `ncclNet_v10` struct. Each function is explained in later sections. +Below is the main `ncclNet_v11` struct. Each function is explained in later sections. ``` typedef struct { // Name of the network (mainly for logs) const char* name; // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); // Return the number of adapters. ncclResult_t (*devices)(int* ndev); // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v10_t* props); + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); // Create a receiving object and provide a handle to connect to it. The // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm); // Connect to a handle and return a sending comm object for that peer. // This call must not block for the connection to be established, and instead // should return successfully with sendComm == NULL with the expectation that // it will be called again until sendComm != NULL. 
// If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm); + ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm); // Finalize connection establishment after remote peer has called connect. // This call must not block for the connection to be established, and instead // should return successfully with recvComm == NULL with the expectation that // it will be called again until recvComm != NULL. // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v10_t** recvDevComm); + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); @@ -191,6 +191,12 @@ This will allow the plugin to discover network devices and make sure they are us `init` function does not return `ncclSuccess`, then NCCL will not use the plugin and fall back on internal ones. +Every call to `init` returns an opaque context that the plugin uses internally to allocate resources +and manage state. Such context is passed to other net plugin calls that create further resources, +such as `listen` and `connect`. Every context is uniquely associated to a communicator +using the commId. The network can also be initialized with a per communicator configuration using +the `config` argument. + To allow the plugin logs to integrate into the NCCL logs seemlessly, NCCL provides a logging function to `init`. This function is typically used to allow for `INFO` and `WARN` macros within the plugin code adding the following definitions: @@ -282,7 +288,7 @@ side. `listen` To create a connection, NCCL will start by calling `listen` on the receiver side. This function -takes a device number as input argument, and should return a local `listenComm` object, and a +takes the opaque plugin context returned by `init` and a device number as input argument, and should return a local `listenComm` object, and a `handle` to pass to the other side, so that the sender side can connect to the receiver. The `handle` is a buffer of size `NCCL_NET_HANDLE_MAXSIZE` and is provided by NCCL. @@ -304,7 +310,8 @@ the `listen` call previously. If the sender did not connect yet, `accept` should should return `ncclSuccess`, setting `recvComm` to `NULL`. NCCL will call `accept` again until it succeeds. -The `connect` API takes a `ncclNetCommConfig_t`, which contains a trafficClass field. +The `connect` API takes the opaque plugin context returned by `init`. The plugin context can reference +the `ncclNetCommConfig_t` passed to the `init` function and containing a trafficClass field. This field can be used by the network plugin to specify the QoS level of the connection. By default, `trafficClass` is set to -1 but can be configured by the application during communicator initialization to select a plugin-supported QoS level. 
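To make the context flow concrete, here is a rough sketch (not part of the example plugin) of a v11 `init` that stashes the per-communicator config and a `connect` that consults it; the `pluginContext` layout and function names are hypothetical:

```
#include <stdint.h>
#include <stdlib.h>
#include "net.h"   /* brings in ncclNet_v11_t, ncclNetCommConfig_v11_t, error codes */

/* Hypothetical per-communicator context allocated by init. */
struct pluginContext {
  uint64_t commId;        /* communicator this context belongs to */
  int trafficClass;       /* copied from the per-communicator config */
};

static ncclResult_t pluginInit_v11(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config,
                                   ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
  struct pluginContext* c = calloc(1, sizeof(*c));
  if (c == NULL) return ncclInternalError;
  c->commId = commId;
  /* NCCL_NET_TRAFFIC_CLASS_UNDEF (-1) means the application did not request a QoS level. */
  c->trafficClass = config ? config->trafficClass : NCCL_NET_TRAFFIC_CLASS_UNDEF;
  *ctx = c;
  return ncclSuccess;
}

static ncclResult_t pluginConnect_v11(void* ctx, int dev, void* handle,
                                      void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm) {
  struct pluginContext* c = (struct pluginContext*)ctx;
  /* A real plugin would map c->trafficClass to a transport-specific QoS setting here. */
  (void)c; (void)dev; (void)handle; (void)sendComm; (void)sendDevComm;
  return ncclInternalError; /* placeholder, like the example plugin stubs */
}
```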
diff --git a/ext-net/example/CMakeLists.txt b/ext-net/example/CMakeLists.txt new file mode 100644 index 000000000..d8af7fe36 --- /dev/null +++ b/ext-net/example/CMakeLists.txt @@ -0,0 +1,19 @@ +set(SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c +) + +# Create shared library +add_library(nccl-net-example SHARED ${SRC_FILES}) + +# Set include directories +target_include_directories(nccl-net-example PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/nccl +) + +# Set output name to match Makefile +set_target_properties(nccl-net-example PROPERTIES + OUTPUT_NAME "nccl-net-example" + PREFIX "lib" + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins +) diff --git a/ext-net/example/nccl/net.h b/ext-net/example/nccl/net.h index 4cc66915b..9b3e6e03c 100644 --- a/ext-net/example/nccl/net.h +++ b/ext-net/example/nccl/net.h @@ -22,7 +22,9 @@ // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 32 +#define NCCL_NET_MAX_DEVS_PER_NIC 4 +#include "net_v11.h" #include "net_v10.h" #include "net_v9.h" #include "net_v8.h" @@ -33,9 +35,9 @@ #include "net_v3.h" #include "net_v2.h" -typedef ncclNet_v10_t ncclNet_t; -typedef ncclNetProperties_v10_t ncclNetProperties_t; -typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; -typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; +typedef ncclNet_v11_t ncclNet_t; +typedef ncclNetProperties_v11_t ncclNetProperties_t; +typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t; #endif // end include guard diff --git a/ext-net/example/nccl/net_device.h b/ext-net/example/nccl/net_device.h index d693101a3..56bcea83f 100644 --- a/ext-net/example/nccl/net_device.h +++ b/ext-net/example/nccl/net_device.h @@ -12,7 +12,7 @@ // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. -#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 +#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; @@ -27,6 +27,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; -typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t; +typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t; #endif diff --git a/ext-net/example/nccl/net_v10.h b/ext-net/example/nccl/net_v10.h index 809e7c001..bb0c661bb 100644 --- a/ext-net/example/nccl/net_v10.h +++ b/ext-net/example/nccl/net_v10.h @@ -5,10 +5,9 @@ #ifndef NET_V10_H_ #define NET_V10_H_ -#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 typedef struct { int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; } ncclNetVDeviceProps_v10_t; diff --git a/ext-net/example/nccl/net_v11.h b/ext-net/example/nccl/net_v11.h new file mode 100644 index 000000000..1c8adc6c5 --- /dev/null +++ b/ext-net/example/nccl/net_v11.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ */ + +#ifndef NET_V11_H_ +#define NET_V11_H_ + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; +} ncclNetVDeviceProps_v11_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v11_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v11_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations + int maxMultiRequestSize; // Maximum number of requests supported in a single multi-request. +} ncclNetProperties_v11_t; + +typedef struct { + int32_t maxConcurrentPeers; + int32_t minConcurrentPeers; + int32_t maxFlowsPerPeer; + int32_t minFlowsPerPeer; +} ncclNetCommAttr_v11_t; + +typedef struct { + ncclNetCommAttr_v11_t sendCommAttr; + ncclNetCommAttr_v11_t recvCommAttr; + uint32_t op; + uint32_t algo; + uint32_t proto; +} ncclNetAttr_v11_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
+ ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props); + // Finalize the network. 
+ ncclResult_t (*finalize)(void* ctx); + + ncclResult_t (*setNetAttr)(void* ctx, ncclNetAttr_v11_t* netAttr); +} ncclNet_v11_t; + +#endif // end include guard diff --git a/ext-net/example/nccl/net_v9.h b/ext-net/example/nccl/net_v9.h index ca60ad651..9dea09cbd 100644 --- a/ext-net/example/nccl/net_v9.h +++ b/ext-net/example/nccl/net_v9.h @@ -5,10 +5,9 @@ #ifndef NET_V9_H_ #define NET_V9_H_ -#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 typedef struct { int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; } ncclNetVDeviceProps_v9_t; typedef struct { diff --git a/ext-net/example/plugin.c b/ext-net/example/plugin.c index 97a29875d..b0a9a4c59 100644 --- a/ext-net/example/plugin.c +++ b/ext-net/example/plugin.c @@ -11,7 +11,7 @@ int max_requests = NCCL_NET_MAX_REQUESTS; -__hidden ncclResult_t pluginInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } +__hidden ncclResult_t pluginInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } __hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } __hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } __hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } @@ -51,8 +51,8 @@ __hidden ncclResult_t pluginGetProperties(int dev, ncclNetProperties_t* props) { return ncclSuccess; } -__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } -__hidden ncclResult_t pluginConnect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginListen(void* ctx, int dev, void* handle, void** listenComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle_t** recvDevComm) { return ncclInternalError; } __hidden ncclResult_t pluginRegMr(void* collComm, void* data, size_t size, int type, void** mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginRegMrDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle) { return ncclInternalError; } @@ -67,10 +67,11 @@ __hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalE __hidden ncclResult_t pluginIrecvConsumed(void* recvComm, int n, void* request) { return ncclInternalError; } __hidden ncclResult_t pluginGetDeviceMr(void* comm, void* mhandle, void** dptr_mhandle) { return ncclInternalError; } __hidden ncclResult_t pluginMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclInternalError; } +__hidden ncclResult_t pluginFinalize(void* ctx) { return ncclSuccess; } #define PLUGIN_NAME "Plugin" -const ncclNet_v10_t ncclNetPlugin_v10 = { +const ncclNet_v11_t ncclNetPlugin_v11 = { .name = PLUGIN_NAME, .init = pluginInit, .devices = pluginDevices, @@ -91,18 +92,84 @@ const ncclNet_v10_t ncclNetPlugin_v10 = { .getDeviceMr = pluginGetDeviceMr, .irecvConsumed = pluginIrecvConsumed, .makeVDevice = pluginMakeVDevice, + .finalize = pluginFinalize, +}; + +__hidden ncclResult_t pluginInit_v10(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { return ncclSuccess; } +__hidden ncclResult_t 
pluginGetProperties_v10(int dev, ncclNetProperties_v10_t* props) { + // Below are default values, if unsure don't change. + + props->name = "Example"; + // Fill for proper topology detection, e.g. /sys/devices/pci0000:00/0000:00:10.0/0000:0b:00.0 + props->pciPath = NULL; + // Only used to detect NICs with multiple PCI attachments. + props->guid = 0; + // Add NCCL_PTR_CUDA if GPU Direct RDMA is supported and regMr can take CUDA pointers. + props->ptrSupport = NCCL_PTR_HOST; + // If your regMr has a fast registration cache, set to 1. If set to 0, user buffer registration may be disabled. + props->regIsGlobal = 0; + // Force flush after receive. Needed if the control path and data path use a different path to the GPU + props->forceFlush = 0; + // Speed in *Mbps*. 100000 means 100G + props->speed = 100000; + // Port number, used in conjunction with guid + props->port = 0; + // Custom latency (used to help tuning if latency is high). If set to 0, use default NCCL values. + props->latency = 0; + // Maximum number of comm objects we can create. + props->maxComms = 1024*1024; + // Maximum number of receive operations taken by irecv(). + props->maxRecvs = NCCL_PLUGIN_MAX_RECVS; + // Coupling with NCCL network device-side code. + props->netDeviceType = NCCL_NET_DEVICE_HOST; + props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; + // Used to tell NCCL core whether this is a virtual device fusing multiple physical devices. + props->vProps.ndevs = 1; + props->vProps.devs[0] = dev; + // Maximum transfer sizes the plugin can handle + props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + props->maxCollBytes = NCCL_MAX_NET_SIZE_BYTES; + return ncclSuccess; +} + +__hidden ncclResult_t pluginListen_v10(int d, void* handle, void** listenComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect_v10(int dev, ncclNetCommConfig_v10_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_v10_t** sendDevComm) { return ncclInternalError; } +__hidden ncclResult_t pluginMakeVDevice_v10(int* d, ncclNetVDeviceProps_v10_t* props) { return ncclInternalError; } + +const ncclNet_v10_t ncclNetPlugin_v10 = { + .name = PLUGIN_NAME, + .init = pluginInit_v10, + .devices = pluginDevices, + .getProperties = pluginGetProperties_v10, + .listen = pluginListen_v10, + .connect = pluginConnect_v10, + .accept = pluginAccept, + .regMr = pluginRegMr, + .regMrDmaBuf = pluginRegMrDmaBuf, + .deregMr = pluginDeregMr, + .isend = pluginIsend, + .irecv = pluginIrecv, + .iflush = pluginIflush, + .test = pluginTest, + .closeSend = pluginCloseSend, + .closeRecv = pluginCloseRecv, + .closeListen = pluginCloseListen, + .getDeviceMr = pluginGetDeviceMr, + .irecvConsumed = pluginIrecvConsumed, + .makeVDevice = pluginMakeVDevice_v10, }; + + __hidden ncclResult_t pluginInit_v9(ncclDebugLogger_t logFunction) { - return pluginInit(logFunction, NULL); + return pluginInit_v10(logFunction, NULL); } __hidden ncclResult_t pluginGetProperties_v9(int dev, ncclNetProperties_v9_t* props) { - return pluginGetProperties(dev, (ncclNetProperties_t*)props); + return pluginGetProperties_v10(dev, (ncclNetProperties_v10_t*)props); } __hidden ncclResult_t pluginConnect_v9(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm){ - return pluginConnect(dev, NULL, handle, sendComm, sendDevComm); + return pluginConnect_v10(dev, NULL, handle, sendComm, sendDevComm); } __hidden ncclResult_t pluginIsend_v9(void* sendComm, void* data, size_t size, int tag, void* mhandle, void** request) { @@ -120,7 +187,7 @@ const ncclNet_v9_t ncclNetPlugin_v9 = 
{ .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v9, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v9, .accept = pluginAccept, .regMr = pluginRegMr, @@ -172,7 +239,7 @@ const ncclNet_v8_t ncclNetPlugin_v8 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v8, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v9, .accept = pluginAccept, .regMr = pluginRegMr, @@ -216,7 +283,7 @@ const ncclNet_v7_t ncclNetPlugin_v7 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v7, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v9, .accept = pluginAccept, .regMr = pluginRegMr_v7, @@ -257,7 +324,7 @@ const ncclNet_v6_t ncclNetPlugin_v6 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v6, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v6, .accept = pluginAccept_v6, .regMr = pluginRegMr_v7, @@ -278,7 +345,7 @@ const ncclNet_v5_t ncclNetPlugin_v5 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v6, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v6, .accept = pluginAccept_v6, .regMr = pluginRegMr_v7, @@ -320,7 +387,7 @@ static ncclResult_t pluginConnect_v4(int dev, void* handle, void** sendComm) { ncclResult_t ret; do { ncclNetDeviceHandle_v7_t* handle = NULL; - ret = pluginConnect(dev, NULL, handle, sendComm, &handle); + ret = pluginConnect_v10(dev, NULL, handle, sendComm, &handle); } while (ret == ncclSuccess && *sendComm == NULL); return ret; } @@ -337,7 +404,7 @@ const ncclNet_v4_t ncclNetPlugin_v4 = { .init = pluginInit_v9, .devices = pluginDevices, .getProperties = pluginGetProperties_v4, - .listen = pluginListen, + .listen = pluginListen_v10, .connect = pluginConnect_v4, .accept = pluginAccept_v4, .regMr = pluginRegMr_v7, @@ -363,12 +430,12 @@ static ncclResult_t pluginFlush(void* recvComm, void* data, int size, void* mhan } static ncclResult_t pluginInit_v3(ncclDebugLogger_t logFunction) { max_requests = NCCL_NET_MAX_REQUESTS_V3; - return pluginInit(logFunction, NULL); + return pluginInit_v10(logFunction, NULL); } #include static ncclResult_t pluginListen_v3(int dev, void* handle, void** listenComm) { char pluginHandle[NCCL_NET_HANDLE_MAXSIZE]; - ncclResult_t ret = pluginListen(dev, &pluginHandle, listenComm); + ncclResult_t ret = pluginListen_v10(dev, &pluginHandle, listenComm); memcpy(handle, &pluginHandle, NCCL_NET_HANDLE_MAXSIZE_V4); return ret; } @@ -403,7 +470,7 @@ const ncclNet_v2_t ncclNetPlugin_v2 = { .devices = pluginDevices, .pciPath = pluginPciPath, .ptrSupport = pluginPtrSupport, - .listen = pluginListen, + .listen = pluginListen_v3, .connect = pluginConnect_v4, .accept = pluginAccept_v4, .regMr = pluginRegMr_v7, diff --git a/ext-profiler/README.md b/ext-profiler/README.md index 27bd4e25c..1d85213a6 100644 --- a/ext-profiler/README.md +++ b/ext-profiler/README.md @@ -49,9 +49,9 @@ of newer ones. The `nccl/` directory is populated with `profiler_vX.h` files extracting all relevant definitions from old API versions. It also provides error codes in `err.h`. -# API (v4) +# API (v5) -Below is the main `ncclProfiler_v4` struct. Each function is explained in later sections. +Below is the main `ncclProfiler_v5` struct. Each function is explained in later sections. 
``` typedef struct { @@ -60,15 +60,15 @@ typedef struct { // init - initialize the profiler plugin // Input // - context : opaque profiler context object for separating profiler behavior across comms + // - commId : communicator id // - commName : user assigned communicator name - // - commHash : communicator id // - nNodes : number of nodes in communicator // - nranks : number of ranks in communicator // - rank : rank identifier in communicator // - logfn : logger function // Output // - eActivationMask: bitmask of active events set by the plugin - ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset // Input @@ -76,7 +76,7 @@ typedef struct { // - eDescr : pointer to ncclProfilerEventDescr_t object // Output // - eHandle: return event handle for supplied event descriptor object - ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr); // stopEvent - stop/finalize an event inside and event set // Input @@ -88,13 +88,13 @@ typedef struct { // - eHandle : handle to event object created through startEvent // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition // - eState : event state transition - ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, ncclProfilerEventStateArgs_v4_t* eStateArgs); + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs); // finalize - finalize the profiler plugin // Input // - context: opaque profiler context object ncclResult_t (*finalize)(void* context); -} ncclProfiler_v4_t; +} ncclProfiler_v5_t; ``` ## Error codes @@ -148,10 +148,37 @@ is the `ncclProfilerEventDescr_t` struct. ``` typedef struct { - uint8_t type; // event type (e.g., ncclProfileGroup, ncclProfileColl, ...) - void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler - int rank; // rank that generated the event + uint64_t type; // event type descriptor: ncclProfileGroupApi, ncclProfileCollApi, ... + void* parentObj; // pointer to parent event used to expose the event hierarchy to the profiler + int rank; // rank that generated the event union { + struct { // GroupAPI event metadata + bool graphCaptured; // Set to true if the Group API event is emitted inside a CUDA graph capture + int groupDepth; // Determines the depth of a ncclGroup. A depth of 1 implies that the Group API call is implicit (internal to NCCL) + // and not called by the user. Any depth greater than 1 means that the user made the Group API call. 
+ } groupApi; + + struct { // Collective API call metadata + const char* func; // string containing name of the collective operation + size_t count; // data count + const char* datatype; // string containing the name of the datatype + int root; // root rank + void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in + bool graphCaptured; // Set to true if the Collective API event is emitted inside a CUDA graph capture + } collApi; + + struct { // Point-to-point API call metadata + const char* func; // string containing name of the p2p operation + size_t count; // data count + const char* datatype; // string containing the name of the datatype + void* stream; // Opaque handle that points to a CUDA stream object + bool graphCaptured; // Set to true if the Point-to-point API event is emitted inside a CUDA graph capture + } p2pApi; + + struct { // Kernel Launch event metadata + void* stream; // Opaque handle that points to the CUDA stream that the operation is enqueued in + } kernelLaunch; + struct { // collective events metadata uint64_t seqNumber; // sequence number of this collective operation in the communicator const char* func; // string containing name of the collective @@ -164,6 +191,7 @@ typedef struct { uint8_t nWarps; // number of GPU warps for this collective const char* algo; // string containing name of the algorithm for this collective const char* proto; // string containing name of the protocol for this collective + void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent } coll; struct { // point-to-point events metadata @@ -173,6 +201,7 @@ typedef struct { size_t count; int peer; // peer rank for this point-to-point uint8_t nChannels; // number of channels for this p2p + void* parentGroup; // for backward compatibility with v4 - this points to the legacy v4 group parent } p2p; struct { // proxyOp events metadata @@ -198,12 +227,12 @@ typedef struct { void* data; // pointer to network plugin defined event } netPlugin; }; -} ncclProfilerEventDescr_v4_t; +} ncclProfilerEventDescr_v5_t; ``` -NCCL defines the following events: `ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, -`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, `ncclProfileKernelCh` and -`ncclProfileNetPlugin`. +NCCL defines the following events: `ncclProfileGroupApi`, `ncclProfileCollApi`, `ncclProfileP2pApi`, `ncclProfileKernelLaunch`, +`ncclProfileGroup`, `ncclProfileColl`, `ncclProfileP2p`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileProxyCtrl`, +`ncclProfileKernelCh` and `ncclProfileNetPlugin`. #### stopEvent @@ -213,10 +242,10 @@ handle after `eventStop` is undefined behavior. #### recordEventState -Some events can only be started and stopped. For example, `ncclProfileGroup`, `ncclProfileColl`, -`ncclProfileP2p`, cannot be updated through calls to `recordEventState`. +Some events can only be started and stopped. For example, `ncclProfileP2pApi`, `ncclProfileCollApi`, `ncclProfileGroup`, +`ncclProfileColl`, `ncclProfileP2p` cannot be updated through calls to `recordEventState`. -`ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and +`ncclProfileGroupApi`, `ncclProfileProxyOp`, `ncclProfileProxyStep`, `ncclProfileNetPlugin`, `ncclProfileKernelCh`, and `ncclProfileProxyCtrl` can be updated through calls to `recordEventState`. The state of these events can be updated, along with event attributes, using `recordEventState`.
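To make the contract above concrete, below is a minimal, hypothetical sketch of how a plugin could implement `recordEventState`; it is not the example plugin shipped in `ext-profiler/example`. Only the v5 types (`ncclProfilerEventState_v5_t`, `ncclProfilerEventStateArgs_v5_t`, the `proxyStep.transSize` attribute) and the `ncclProfileProxyStep` event bit are taken from this document; the `struct myEvent` layout and the `nowUs` helper are assumptions made purely for illustration.

```
// Hypothetical sketch (not the shipped example plugin).
// Assumes startEvent returned a pointer to the plugin's own struct myEvent as the event handle.
#include <stddef.h>
#include <stdint.h>
#include <sys/time.h>
#include "profiler.h"   // ncclProfiler v5 types, as laid out in ext-profiler/example/nccl

struct myEvent {
  uint64_t type;         // one of the ncclProfile* event type bits
  int      lastState;    // last ncclProfilerEventState_v5_t value recorded
  double   lastStateTs;  // timestamp (us) of that state transition
  size_t   transSize;    // bytes reported so far, for proxy step events
};

static double nowUs(void) {
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec*1e6 + tv.tv_usec;
}

static ncclResult_t myRecordEventState(void* eHandle, ncclProfilerEventState_v5_t eState,
                                       ncclProfilerEventStateArgs_v5_t* eStateArgs) {
  struct myEvent* ev = (struct myEvent*)eHandle;
  // The handle may be NULL if the event was dropped at startEvent time; ignore the update.
  if (ev == NULL) return ncclSuccess;
  ev->lastState = (int)eState;
  ev->lastStateTs = nowUs();
  // For proxy step events the optional state args carry the transferred size.
  if (eStateArgs && ev->type == ncclProfileProxyStep) ev->transSize = eStateArgs->proxyStep.transSize;
  return ncclSuccess;
}
```

A real plugin would typically keep one timestamp per state (for example for each of the proxy step and proxy control transitions listed below) rather than only the last one, so that per-state durations can be reconstructed when the event is stopped.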
@@ -258,9 +287,21 @@ typedef enum { // ncclProfileKernelCh event states ncclProfilerKernelChStop = 22,// state marks stop of kernelCh event and timestamp update -} ncclProfilerEventState_v4_t; + + // Group API States + ncclProfilerGroupStartApiStop = 23,// state marks the end of a ncclGroupStart() API call + ncclProfilerEndGroupApiStart = 24 // state marks the start of a ncclGroupEnd() API call +} ncclProfilerEventState_v5_t; ``` +NCCL profile API events are generated when the API calls are made, right after NCCL checks +for graph capture information. They parent collective, point-to-point and kernel launch events +and persist across multiple operations in a group. + +`ncclProfileKernelLaunch` events are generated when the CUDA call to a kernel launch is made. In the +case of graph capture, the event start indicates that the kernel launch operation has been recorded, +not launched. + `ncclProfileProxyOp` events are generated by the proxy progress thread while it is processing network requests for the GPU kernel. ProxyOp events are generated for every active channel and provide a summary of the activity of the proxy progress thread for that channel. Most of the @@ -379,7 +420,7 @@ typedef union { struct { // attribute to update for ncclProfileKernelCh events uint64_t pTimer; // timestamp provided by the NCCL kernel } kernelCh; -} ncclProfilerEventStateArgs_v4_t; +} ncclProfilerEventStateArgs_v5_t; ``` The example profiler in `ext-profiler/example` contains details on how to capture and use the events above. @@ -389,27 +430,33 @@ The example profiler in `ext-profiler/example` contains details on how to captur NCCL core events (reported above) are organized into a hierarchy as reported below: ``` -Group event +Group API event | - +- Collective event + +- Collective API event | | - | +- ProxyOp event - | | | - | | +- ProxyStep event - | | | - | | +- NetPlugin event + | +- Collective event + | | + | +- ProxyOp event + | | | + | | +- ProxyStep event + | | | + | | +- NetPlugin event + | | + | +- KernelCh event + | + +- Point-to-point API event | | - | +- KernelCh event + | +- Point-to-point event + | | + | +- ProxyOp event + | | | + | | +- ProxyStep event + | | | + | | +- NetPlugin event + | | + | +- KernelCh event | - +- Point-to-point event - | - +- ProxyOp event - | | - | +- ProxyStep event - | | - | +- NetPlugin event - | - +- KernelCh event + +- Kernel Launch event ProxyCtrl event ``` diff --git a/ext-profiler/example/CMakeLists.txt b/ext-profiler/example/CMakeLists.txt new file mode 100644 index 000000000..fd2f04df6 --- /dev/null +++ b/ext-profiler/example/CMakeLists.txt @@ -0,0 +1,34 @@ +# Find all C source files in current directory +set(SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/plugin.cc + ${CMAKE_CURRENT_SOURCE_DIR}/print_event.cc +) + +# Create shared library +add_library(nccl-profiler-example SHARED ${SRC_FILES}) + +# Set include directories +target_include_directories(nccl-profiler-example PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/nccl + ${CUDAToolkit_INCLUDE_DIRS} +) + +# Set output name to match Makefile +set_target_properties(nccl-profiler-example PROPERTIES + OUTPUT_NAME "nccl-profiler-example" + PREFIX "lib" + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib +) + +add_custom_command(TARGET nccl-profiler-example POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/test/unit/plugins + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so ${CMAKE_BINARY_DIR}/test/unit/plugins +) + +# Add custom target 
for clean (equivalent to Makefile clean target) +add_custom_target(clean-profiler-lib + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/lib/libnccl-profiler-example.so + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_BINARY_DIR}/test/unit/plugins/libnccl-profiler-example.so + COMMENT "Cleaning libnccl-profiler-example.so" +) diff --git a/ext-profiler/example/Makefile b/ext-profiler/example/Makefile index 777ff5bad..f6383e1b6 100644 --- a/ext-profiler/example/Makefile +++ b/ext-profiler/example/Makefile @@ -5,18 +5,20 @@ # .DEFAULT_GOAL: build include ../../makefiles/common.mk -SRCDIR ?= $(abspath ../..) BUILDDIR ?= . NCCLDIR := $(BUILDDIR) -SRC_FILES := $(wildcard *.c) +SRC_FILES := $(wildcard *.cc) +DST_DIR := $(BUILDDIR) +OBJ_FILES := $(SRC_FILES:%.cc=${DST_DIR}/%.o) +DEP_FILES := $(OBJ_FILES:%.o=%.dep) -build: ${BUILDDIR}/libnccl-profiler-example.so +build: ${DST_DIR}/libnccl-profiler-example.so -${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES} +${DST_DIR}/libnccl-profiler-example.so: ${SRC_FILES} @printf "Compiling %-35s > %s\n" $< $@ - @mkdir -p ${BUILDDIR} - $(CC) -Inccl -fPIC -shared -o $@ $^ + @mkdir -p ${DST_DIR} + $(CXX) -Inccl -I${CUDA_INC} -fPIC -shared -o $@ $^ clean: - rm -f ${BUILDDIR}/libnccl-profiler-example.so + rm -f ${DST_DIR}/libnccl-profiler-example.so diff --git a/ext-profiler/example/README.md b/ext-profiler/example/README.md index d98e58f15..abc11a57e 100644 --- a/ext-profiler/example/README.md +++ b/ext-profiler/example/README.md @@ -13,8 +13,7 @@ change the size of the event window the profiler keeps track of. ## Building the profiler plugin -To use the example plugin, just type `make`. You will need a NCCL build's include directory present. -You can override `NCCL_HOME` to where the NCCL installation is on your system. +To build the example plugin shipped as part of NCCL, just type `make`. ## Using the profiler plugin @@ -27,13 +26,13 @@ You can override `NCCL_HOME` to where the NCCL installation is on your system. As an example, setting: - `NCCL_PROFILE_EVENT_MASK` to 1 (`ncclProfileGroup`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`) + `NCCL_PROFILE_EVENT_MASK` to 256 (`ncclProfileGroupApi`) | 2 (`ncclProfileColl`) | 8 (`ncclProfileProxyOp`) - enables the profiling of the group, the collective and the proxy op events. The same events can be + enables the profiling of the group API, the collective and the proxy op events. The same events can be expressed more concisely by setting `NCCL_PROFILE_EVENT_MASK` to 8 (`ncclProfileProxyOp`). Indeed, in NCCL all the events above (in the event hierarchy) the one requested are also captured. The advantage is that the profiler can easily correlate events that belong to the same NCCL operation and present - them accordingly. + them accordingly. Setting `NCCL_PROFILE_EVENT_MASK` to 4095 enables all events supported by the v5 profiler. 3. Set `NCCL_PROFILE_DUMP_FILE` to the name of the dump file for the collected traces. A file named ${NCCL_PROFILE_DUMP_FILE}-hostname-tid.txt is created. Profiler traces are saved using the chrome @@ -57,11 +56,14 @@ The group, collective and p2p pools contain objects for the corresponding events contains objects for `ProxyCtrl` events and the `ProxyDetach` pool contains objects for `ProxyOp` events generated by remote proxies. 
A list of pools and their size is reported below: -- `NCCL_PROFILE_GROUP_POOL_SIZE` (16) -- `NCCL_PROFILE_COLL_POOL_SIZE` (16) -- `NCCL_PROFILE_P2P_POOL_SIZE` (1024) +- `NCCL_PROFILE_GROUP_API_POOL_SIZE` (256) +- `NCCL_PROFILE_COLL_API_POOL_SIZE` (256) +- `NCCL_PROFILE_P2P_API_POOL_SIZE` (256) +- `NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE` (256) +- `NCCL_PROFILE_COLL_POOL_SIZE` (256) +- `NCCL_PROFILE_P2P_POOL_SIZE` (256) - `NCCL_PROFILE_PROXY_CTRL_POOL_SIZE` (16) -- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (128) +- `NCCL_PROFILE_PROXY_DETACH_POOL_SIZE` (256) Remote proxy operations are generated when PXN is in use. Refer to this article for more information about PXN and how it works: @@ -73,76 +75,58 @@ The example profiler generates traces using the json format. An example of trace ``` [ -{"name": "Group", "cat": "GROUP", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764234.611328, "args": {"groupId": 0}}, -{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 764237.294922, "args": {"SeqNum": 0, "CommHash": 673864846479792718, "Rank": 1, "Count": 32768, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "LL", "nMaxChannels": 2}}, -{"name": "Recv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768464.936523, "args": {"Channel": 0, "Peer": 0, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 772020.300781}, "RECEIVED": {"step": 14, "ts": 772196.049805}, "TRANSMITTED": {"step": 14, "ts": 772197.326172}, "DONE": {"step": 14, "ts": 772201.538086}}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768465.158203, "args": {"Step": 0}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805}, -{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768477.924805, "args": {"Step": 0}}, -{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768547.197266, "args": {"Step": 0}}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 4157654, "tid": 1, "ts": 768564.174805, "args": {"Step": 0}}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 768568.276367}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768503.604492, "args": {"Step": 1}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805}, -{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768504.549805, "args": {"Step": 1}}, -{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769994.490234, "args": {"Step": 1}}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 769995.012695, "args": {"Step": 1}}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 4157654, "tid": 1, "ts": 770006.914062}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 
768506.941406, "args": {"Step": 2}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547}, -{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 768507.435547, "args": {"Step": 2}}, -{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771452.536133, "args": {"Step": 2}}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 4157654, "tid": 1, "ts": 771453.060547, "args": {"Step": 2}}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 4157654, "tid": 1, "ts": 771468.458008}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768509.484375, "args": {"Step": 3}}, -{"name": "RecvBufferWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000}, -{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 768510.250000, "args": {"Step": 3}}, -{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.499023, "args": {"Step": 3}}, -{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 4157654, "tid": 1, "ts": 771904.991211, "args": {"Step": 3}}, -{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 4157654, "tid": 1, "ts": 771910.500000}, -{"name": "Send", "cat": "PROXY", "ph": "b", "id": 1, "pid": 4157654, "tid": 1, "ts": 768482.878906, "args": {"Channel": 0, "Peer": 2, "Steps": 14, "ChunkSize": 32768, "transSize": 229376, "POSTED": {"step": 14, "ts": 771995.675781}, "REM_FIFO_WAIT": {"step": 14, "ts": 772190.692383}, "TRANSMITTED": {"step": 14, "ts": 772191.516602}, "DONE": {"step": 14, "ts": 772208.473633}}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.019531, "args": {"Step": 0}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781}, -{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 768483.300781, "args": {"Step": 0}}, -{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234}, -{"name": "SendWait", "cat": "NET", "ph": "b", "id": 14, "pid": 4157654, "tid": 1, "ts": 769594.615234, "args": {"Step": 0}}, -{"name": "SendWait", "cat": "NET", "ph": "e", "id": 14, "pid": 4157654, "tid": 1, "ts": 769618.889648}, -{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.083008, "args": {"Step": 1}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086}, -{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 768505.163086, "args": {"Step": 1}}, -{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664}, -{"name": "SendWait", "cat": "NET", "ph": "b", "id": 15, "pid": 4157654, "tid": 1, "ts": 769610.555664, "args": {"Step": 1}}, -{"name": "SendWait", "cat": "NET", "ph": "e", "id": 15, 
"pid": 4157654, "tid": 1, "ts": 769622.517578}, -{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768507.937500, "args": {"Step": 2}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578}, -{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 768508.017578, "args": {"Step": 2}}, -{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883}, -{"name": "SendWait", "cat": "NET", "ph": "b", "id": 16, "pid": 4157654, "tid": 1, "ts": 770002.129883, "args": {"Step": 2}}, -{"name": "SendWait", "cat": "NET", "ph": "e", "id": 16, "pid": 4157654, "tid": 1, "ts": 770013.848633}, -{"name": "SendBufferWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.742188, "args": {"Step": 3}}, -{"name": "SendBufferWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266}, -{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 768510.822266, "args": {"Step": 3}}, -{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477}, -{"name": "SendWait", "cat": "NET", "ph": "b", "id": 17, "pid": 4157654, "tid": 1, "ts": 771461.563477, "args": {"Step": 3}}, -{"name": "SendWait", "cat": "NET", "ph": "e", "id": 17, "pid": 4157654, "tid": 1, "ts": 771469.171875}, +{"name": "Group API", "cat": "GROUP_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 3433.595001, "args": {"groupApiId": 0, "groupDepth":1}}, +{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"groupId": 0, "Stream": 0x5020000567d0}}, +{"name": "KernelLaunch", "cat": "KERNEL_LAUNCH", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 111991.558990}, +{"name": "AllReduce", "cat": "COLL_API", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 0.000000, "args": {"count": 262144, "datatype": ncclFloat32, "root": 0, "GraphCaptured":0, "Stream": 0x5020000567d0}}, +{"name": "AllReduce", "cat": "COLL", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 111994.477997, "args": {"SeqNum": 0, "CommHash": 1493613951195738943, "Rank": 0, "Count": 262144, "Datatype": "ncclFloat32", "Algorithm": "RING", "Protocol": "SIMPLE", "nChannels": 2}}, +{"name": "KernelCh", "cat": "GPU", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119711.888000, "args": {"Channel": 0, "StartGpuClk": 1756135989724672000, "StopGpuClk": 1756135989732831232}}, +{"name": "ScheduleRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119652.709991, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}}, +{"name": "ScheduleRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995}, +{"name": "ProgressRecv", "cat": "PROXY", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119686.300995, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}}, +{“name": "RecvWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119707.677979, "args": {"Step": 0}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119807.691986, "args": {"Step": 0}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 
119867.338989}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 0, "pid": 225798, "tid": 1, "ts": 119867.338989, "args": {"Step": 0}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120120.983002}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119733.647980, "args": {"Step": 1}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119844.401001, "args": {"Step": 1}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119890.567993, "args": {"Step": 1}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 120121.129974}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 119753.023987, "args": {"Step": 2}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120038.847992, "args": {"Step": 2}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 2, "pid": 225798, "tid": 1, "ts": 120085.685974, "args": {"Step": 2}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 2, "pid": 225798, "tid": 1, "ts": 120121.244995}, +{"name": "RecvWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 119772.510986, "args": {"Step": 3}}, +{"name": "RecvWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120062.944977, "args": {"Step": 3}}, +{"name": "RecvFlushWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "b", "id": 3, "pid": 225798, "tid": 1, "ts": 120101.089996, "args": {"Step": 3}}, +{"name": "RecvGpuWait", "cat": "NET", "ph": "e", "id": 3, "pid": 225798, "tid": 1, "ts": 120165.115997}, +{"name": "ProgressRecv", "cat": "PROXY", "ph": "e", "id": 0, "pid": 225798, "tid": 1, "ts": 120165.356995}, +{"name": "ScheduleSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119656.950989, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}}, +{"name": "ScheduleSend", "cat": "PROXY", "ph": "e", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979}, +{"name": "ProgressSend", "cat": "PROXY", "ph": "b", "id": 1, "pid": 225798, "tid": 1, "ts": 119709.078979, "args": {"Channel": 0, "Peer": 1, "Steps": 4, "ChunkSize": 4194304, "transSize": 524288}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119710.632996, "args": {"Step": 0}}, +{"name": "SendGpuWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993}, +{"name": "SendPeerWait", "cat": "NET", "ph": "b", "id": 4, "pid": 225798, "tid": 1, "ts": 119808.636993, "args": {"Step": 0}}, +{"name": "SendPeerWait", "cat": "NET", "ph": "e", "id": 4, "pid": 225798, "tid": 1, "ts": 119818.972992}, ... 
[ trace truncated for brevity ] -{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.317383}, -{"name": "Group", "cat": "GROUP", "ph": "e", "id": 0, "pid": 4157654, "tid": 1, "ts": 772209.418945}, +{"name": "AllReduce", "cat": "COLL", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170633.535980}, +{"name": "AllReduce", "cat": "COLL_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170582.923981}, +{"name": "Group API", "cat": "GROUP_API", "ph": "e", "id": 17, "pid": 225798, "tid": 1, "ts": 170637.582001}, {}] ``` Details about the fields used in the trace can be found at this link: https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview?tab=t.0#heading=h.yr4qxyxotyw -The trace above is obtained by running a `ncclAllReduce` operation on 8 GPUs, communicating with each other through +The trace above is obtained by running a `ncclAllReduce` operation on 2 GPUs, communicating with each other through the network interface. The `Group` event encloses all traces that are related to the single `ncclAllReduce` call. (Note that for single collective invocations, where there are no explicit group calls, NCCL creates a group with only one collective and this is what is presented in the traces above). @@ -161,38 +145,17 @@ The `AllReduce` entry presents information about the `ncclAllReduce` operation. - datatype : NCCL datatype - algorithm : algorithm used to process the ncclAllReduce - protocol : protocol used to process the ncclAllReduce -- nMaxChannels: max number of channels used to process the ncclAllReduce +- nChannels : Number of channels used to process the ncclAllReduce If the proxy events are not active (e.g., the `ncclAllReduce` is intranode) the end timestamp will match the time consumed by the CPU to launch the collective. For more details refer to `ext-profiler/README.md`, section `Profiling of collective and p2p operations`. -### Proxy Send -The `Send` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following -info in the args field: - -- Channel : id of the channel used by this proxy operation to send data to the peer -- Peer : peer rank -- Steps : number of network steps required to transfer transSize bytes to the peer -- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread -- transSize : bytes transferred across the channel by this proxy operation -- POSTED : struct containing the number of buffer posts to the GPU and the time stamp for the last post -- REM_FIFO_WAIT: struct containing the number of remote buffer waits and the time stamp for the last wait -- TRANSMITTED : struct containing the number of network sends and the time stamp of the last send -- DONE : struct containing the number of network sends completed and the time stamp of the last send completed - -In case of a network problem the POSTED, REM_FIFO_WAIT, TRANSMITTED and DONE might all have partially updated steps, -which could help identify at which point the network problem occurred. - The Proxy send trace gives a summary of the proxy progress thread activity for the channel. If more details are needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace entries below are also reported by the profiler. -#### Proxy SendBufferWait - -Presents, for every network step, the time the CPU proxy spends waiting for the channel staging buffer to become available. 
- -#### Proxy SendGPUWait +#### Proxy SendGpuWait Presents, for every network step, the time the CPU proxy spends waiting for the GPU to provide the data in the staging buffer. @@ -201,31 +164,6 @@ buffer. Presents, for every network step, the time the CPU proxy spends waiting for the `isend` to complete -### Proxy Recv - -The `Recv` entry presents information about the `ProxyOp` processing in the progress thread. It contains the following -info in the args field: - -- Channel : id of the channel used by this proxy operation to recv data from the peer -- Peer : peer rank -- Steps : number of network steps required to transfer transSize bytes from the peer -- ChunkSize : chunk size used by NCCL to pipeline data through the proxy thread -- transSize : bytes transferred across the channel by this proxy operation -- POSTED : struct containing the number of recvs posted and the time stamp for the last recv posted -- RECEIVED : struct containing the number of recvs completed and the time stamp for the last recv completed -- TRANSMITTED: struct containing the number of recvs flushed to the GPU memory and the time stamp for the last recv flushed -- DONE : struct containing the number of flush completed and the time stamp for the last flush completed - -The Proxy Recv trace gives a summary of the proxy progress thread activity for the channel. If more details are -needed, these can be obtained by enabling the proxy step event (`ncclProfileProxyStep`). In which case the trace -entries below are also reported by the profiler. - - -#### Proxy RecvBufferWait - -Presents, for every network step, the time the CPU proxy spends waiting for the staging buffer for the channel to -become available. - #### Proxy RecvWait Presents, for every network step, the time the CPU proxy spends waiting for a posted `irecv` to complete @@ -234,6 +172,6 @@ Presents, for every network step, the time the CPU proxy spends waiting for a po Presents, for every network step, the time the CPU proxy spends waiting for the recv data to be flushed to the GPU -#### Proxy RecvGPUWait +#### Proxy RecvGpuWait Presents, for every network step, the time the CPU proxy spends waiting for the GPU to consume the recv data diff --git a/ext-profiler/example/event.c b/ext-profiler/example/event.c deleted file mode 100644 index 717fe8688..000000000 --- a/ext-profiler/example/event.c +++ /dev/null @@ -1,30 +0,0 @@ -/************************************************************************* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include -#include "event.h" - -int taskEventQueueEmpty(struct group* g) { - return g->eventHead == NULL; -} - -void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event) { - event->next = NULL; - if (g->eventHead) g->eventTail->next = event; - else g->eventHead = event; - g->eventTail = event; -} - -struct taskEventBase* taskEventQueueHead(struct group* g) { - return g->eventHead; -} - -struct taskEventBase* taskEventQueueDequeue(struct group* g) { - struct taskEventBase* tmp = g->eventHead; - g->eventHead = g->eventHead->next; - if (g->eventHead == NULL) g->eventTail = NULL; - return tmp; -} diff --git a/ext-profiler/example/event.h b/ext-profiler/example/event.h index 4c1b8f53a..ae830cd25 100644 --- a/ext-profiler/example/event.h +++ b/ext-profiler/example/event.h @@ -10,10 +10,14 @@ #include #include #include +#include +#include "err.h" #include "profiler.h" +#include "queue.h" +#include #define MAX_CHANNELS 32 -#define MAX_STEPS 16 +#define MAX_STEPS 1024 #define MAX_OPS 16 // Up to 64K ranks for PAT #define MAX_EVENTS_PER_REQ (8) @@ -21,7 +25,7 @@ struct proxyOp; struct proxyStep; struct netPlugin { - uint8_t type; + uint64_t type; int pluginType; int pluginVer; uint8_t pluginEvent; @@ -63,7 +67,7 @@ struct kernelCh { #define PROXY_STEP_MAX_STATES 3 struct proxyStep { - uint8_t type; // type of event: network transfer + uint64_t type; // type of event: network transfer int state; int step; // network transfer id in given channel int isSend; // send/recv channel operation @@ -76,7 +80,7 @@ struct proxyStep { }; struct proxyOp { - uint8_t type; // type of event: proxy operation + uint64_t type; // type of event: proxy operation uint8_t channelId; // channel id for this proxy operation pid_t pid; int rank; @@ -97,7 +101,7 @@ struct group; struct context; struct proxyCtrl { - uint8_t type; + uint64_t type; struct context* ctx; // profiler context double startTs; double stopTs; @@ -107,12 +111,12 @@ struct proxyCtrl { // task level event base structure struct taskEventBase { - uint8_t type; // event type: collective/p2p + uint64_t type; // event type: collective/p2p int rank; // rank of the operation in NCCL communicator const char* func; // ncclFunc* int refCount; // number of references for this operation - struct group* parent; // parent event group - struct taskEventBase* next; // next top level event in group + void* parent; // parent API event + struct taskEventBase* next; // next top level event double startTs; double stopTs; }; @@ -147,7 +151,7 @@ struct p2p { }; struct group { - uint8_t type; + uint64_t type; struct context* ctx; // profiler context int groupId; int refCount; @@ -158,6 +162,70 @@ struct group { struct group* next; // next group event in queue }; +struct collApi { + uint64_t type; + struct groupApi* parent; + struct context* ctx; // profiler context + int collApiId; + int refCount; + cudaStream_t stream; + const char* func; + size_t count; + const char* datatype; + int root; + bool graphCaptured; + struct taskEventBase* eventHead; // queue head for task events + struct taskEventBase* eventTail; // queue tail for task events + double startTs; + double stopTs; + struct collApi* next; +}; + +struct p2pApi { + uint64_t type; + struct groupApi* parent; + struct context* ctx; // profiler context + int p2pApiId; + int refCount; + const char* func; + cudaStream_t stream; + size_t count; + const char* datatype; + bool graphCaptured; + 
struct taskEventBase* eventHead; // queue head for task events + struct taskEventBase* eventTail; // queue tail for task events + double startTs; + double stopTs; + struct p2pApi* next; +}; + +struct kernelLaunch { + uint64_t type; + struct groupApi* parent; + cudaStream_t stream; + int kernelLaunchId; + double startTs; + double stopTs; + struct kernelLaunch* next; +}; + +struct groupApi { + uint64_t type; + struct context* ctx; + int groupApiId; + int refCount; + bool graphCaptured; + int groupDepth; + struct profilerQueue p2pApiEvents; + struct profilerQueue collApiEvents; + struct profilerQueue kernelLaunchEvents; + double endOfncclGroupStartTs; + double startOfncclGroupEndTs; + double startTs; + double stopTs; + struct groupApi* next; +}; + // arrays for different event objects struct context { const char* commName; @@ -165,6 +233,26 @@ struct context { int nranks; int rank; + int groupApiPoolSize; + int groupApiPoolBase; + int groupApiPoolIndex; + struct groupApi* groupApiPool; + + int collApiPoolSize; + int collApiPoolBase; + int collApiPoolIndex; + struct collApi* collApiPool; + + int p2pApiPoolSize; + int p2pApiPoolBase; + int p2pApiPoolIndex; + struct p2pApi* p2pApiPool; + + int kernelLaunchPoolSize; + int kernelLaunchPoolBase; + int kernelLaunchPoolIndex; + struct kernelLaunch* kernelLaunchPool; + int groupPoolSize; int groupPoolBase; int groupPoolIndex; @@ -186,9 +274,50 @@ struct context { struct proxyCtrl* proxyCtrlPool; }; -int taskEventQueueEmpty(struct group* g); -void taskEventQueueEnqueue(struct group* g, struct taskEventBase* event); -struct taskEventBase* taskEventQueueHead(struct group* g); -struct taskEventBase* taskEventQueueDequeue(struct group* g); +template +inline int taskEventQueueEmpty(T *obj) { + return obj->eventHead == NULL; +} + +template +inline void taskEventQueueEnqueue(T* obj, struct taskEventBase* event) { + event->next = NULL; + if (obj->eventHead) obj->eventTail->next = event; + else obj->eventHead = event; + obj->eventTail = event; +} + +template +inline struct taskEventBase* taskEventQueueHead(T *obj) { + return obj->eventHead; +} + +template +inline struct taskEventBase* taskEventQueueDequeue(T* obj) { + struct taskEventBase* tmp = obj->eventHead; + obj->eventHead = obj->eventHead->next; + if (obj->eventHead == NULL) obj->eventTail = NULL; + return tmp; +} + +template +inline void resetTaskEvents(T *obj, struct context* ctx) { + while (!taskEventQueueEmpty(obj)) { + struct taskEventBase* base = taskEventQueueDequeue(obj); + if (base->type == ncclProfileColl) { + struct collective* c = (struct collective *)base; + // reset event proxyOps & proxySteps + memset(c->nProxyOps, 0, sizeof(int)*MAX_CHANNELS); + // release collective events in the group and return them to the collective pool + __atomic_fetch_add(&ctx->collPoolBase, 1, __ATOMIC_RELAXED); + } else if (base->type == ncclProfileP2p) { + struct p2p* p = (struct p2p *)base; + // reset event proxyOp and proxySteps + memset(&p->op, 0, sizeof(struct proxyOp)*MAX_CHANNELS); + // release p2p events in the group and return them to the p2p pool + __atomic_fetch_add(&ctx->p2pPoolBase, 1, __ATOMIC_RELAXED); + } + } +} #endif diff --git a/ext-profiler/example/nccl/profiler.h b/ext-profiler/example/nccl/profiler.h index c911426d9..715885f72 100644 --- a/ext-profiler/example/nccl/profiler.h +++ b/ext-profiler/example/nccl/profiler.h @@ -11,17 +11,20 @@ #include #include "common.h" -#include "err.h" enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective 
call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type - ncclProfileKernelCh = (1 << 6), // kernel channel event type - ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroupApi = (1 << 8), // Group API events + ncclProfileCollApi = (1 << 9), // Collective API events + ncclProfileP2pApi = (1 << 10), // Point-to-Point API events + ncclProfileKernelLaunch = (1 << 11), // Kernel launch events }; typedef enum { @@ -56,21 +59,27 @@ typedef enum { /* Kernel event states */ ncclProfilerKernelChStop = 22, + + /* Group API States */ + ncclProfilerEndGroupApiStart = 23, + ncclProfilerBeginGroupApiEnd = 24 } ncclProfilerEventState_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t; +#include "profiler_v5.h" #include "profiler_v4.h" #include "profiler_v3.h" #include "profiler_v2.h" #include "profiler_v1.h" #include "profiler_net.h" -typedef ncclProfiler_v4_t ncclProfiler_t; -typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v5_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t; #endif // end include guard diff --git a/ext-profiler/example/nccl/profiler_v5.h b/ext-profiler/example/nccl/profiler_v5.h new file mode 100644 index 000000000..8bbc85eeb --- /dev/null +++ b/ext-profiler/example/nccl/profiler_v5.h @@ -0,0 +1,152 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V5_H_ +#define PROFILER_V5_H_ +#include + +typedef struct { + uint64_t type; // event type descriptor: ncclProfileGroupApi, ... 
+ void* parentObj; // pointer to the profiler parent object + int rank; // originating rank + union { + struct { + int graphCaptured; + int groupDepth; + } groupApi; + + struct { + const char* func; + size_t count; + const char* datatype; + int root; + void* stream; + bool graphCaptured; + } collApi; + + struct { + const char* func; + size_t count; + const char* datatype; + void* stream; + bool graphCaptured; + } p2pApi; + + struct { + void* stream; + } kernelLaunch; + + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + void* parentGroup; // for backward compatibility with v4 + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + void* parentGroup; // for backward compatibility with v4 + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v5_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v5_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commId : communicator id + // - commName : user assigned communicator name + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context 
object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v5_t; + +#endif diff --git a/ext-profiler/example/plugin.c b/ext-profiler/example/plugin.cc similarity index 68% rename from ext-profiler/example/plugin.c rename to ext-profiler/example/plugin.cc index b89cd4627..f6d4956b3 100644 --- a/ext-profiler/example/plugin.c +++ b/ext-profiler/example/plugin.cc @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include @@ -22,12 +22,20 @@ static int initialized; // initialization counter for profiler static double startTime; // profiler start time static const int defaultEActivationMask = ncclProfileColl | ncclProfileP2p; -static const int defaultGroupPoolSize = 16; -static const int defaultCollPoolSize = 16; -static const int defaultP2pPoolSize = 1024; +static const int defaultGroupApiPoolSize = 256; +static const int defaultCollApiPoolSize = 256; +static const int defaultP2pApiPoolSize = 256; +static const int defaultKernelLaunchPoolSize = 256; +static const int defaultGroupPoolSize = 256; +static const int defaultCollPoolSize = 256; +static const int defaultP2pPoolSize = 256; static const int defaultProxyCtrlPoolSize = 16; -static const int defaultDetachPoolSize = 128; +static const int defaultDetachPoolSize = 256; +static int groupApiPoolSize; +static int collApiPoolSize; +static int p2pApiPoolSize; +static int kernelLaunchPoolSize; static int groupPoolSize; static int collPoolSize; static int p2pPoolSize; @@ -51,7 +59,7 @@ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; static pid_t pid; static int* eActivationMaskPtr; -__hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { +__hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { pthread_mutex_lock(&lock); if (__atomic_fetch_add(&initialized, 1, __ATOMIC_RELAXED) == 0) { // first thread initializes event mask, environment and detach pool @@ -59,6 +67,18 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, str = getenv("NCCL_PROFILE_EVENT_MASK"); __atomic_store_n(eActivationMask, str ? atoi(str) : 0, __ATOMIC_RELAXED); + str = getenv("NCCL_PROFILE_GROUP_API_POOL_SIZE"); + groupApiPoolSize = str ? atoi(str) : defaultGroupApiPoolSize; + + str = getenv("NCCL_PROFILE_COLL_API_POOL_SIZE"); + collApiPoolSize = str ? atoi(str) : defaultCollApiPoolSize; + + str = getenv("NCCL_PROFILE_P2P_API_POOL_SIZE"); + p2pApiPoolSize = str ? atoi(str) : defaultP2pApiPoolSize; + + str = getenv("NCCL_PROFILE_KERNEL_LAUNCH_POOL_SIZE"); + kernelLaunchPoolSize = str ? atoi(str) : defaultKernelLaunchPoolSize; + str = getenv("NCCL_PROFILE_GROUP_POOL_SIZE"); groupPoolSize = str ? atoi(str) : defaultGroupPoolSize; @@ -96,11 +116,23 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, // pre-allocate memory for event object pools in dedicated profiler context struct context* ctx = (struct context *)calloc(1, sizeof(*ctx)); ctx->commName = commName; - ctx->commHash = commHash; + ctx->commHash = commId; ctx->nranks = nranks; ctx->rank = rank; logFn = logfn; - INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? commName : "", commHash, nranks, rank); + INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", commName ? 
commName : "", commId, nranks, rank); + + ctx->groupApiPool = (struct groupApi *)calloc(groupApiPoolSize, sizeof(*ctx->groupApiPool)); + if (ctx->groupApiPool == NULL) goto fail; + + ctx->collApiPool = (struct collApi *)calloc(collApiPoolSize, sizeof(*ctx->collApiPool)); + if (ctx->collApiPool == NULL) goto fail; + + ctx->p2pApiPool = (struct p2pApi *)calloc(p2pApiPoolSize, sizeof(*ctx->p2pApiPool)); + if (ctx->p2pApiPool == NULL) goto fail; + + ctx->kernelLaunchPool = (struct kernelLaunch *)calloc(kernelLaunchPoolSize, sizeof(*ctx->kernelLaunchPool)); + if (ctx->kernelLaunchPool == NULL) goto fail; ctx->groupPool = (struct group *)calloc(groupPoolSize, sizeof(*ctx->groupPool)); if (ctx->groupPool == NULL) goto fail; @@ -130,16 +162,22 @@ __hidden ncclResult_t exampleProfilerInit(void** context, int* eActivationMask, if (ctx->p2pPool) free(ctx->p2pPool); if (ctx->collPool) free(ctx->collPool); if (ctx->groupPool) free(ctx->groupPool); + if (ctx->collApiPool) free(ctx->collApiPool); + if (ctx->p2pApiPool) free(ctx->p2pApiPool); + if (ctx->kernelLaunchPool) free(ctx->kernelLaunchPool); + if (ctx->groupApiPool) free(ctx->groupApiPool); free(ctx); if (detachPool) free(detachPool); return ncclSystemError; } +static const char* profilerDumpFile; + __hidden ncclResult_t exampleProfilerFinalize(void* context) { FILE* fh = NULL; char filename[PATH_MAX] = { 0 }; struct context* ctx = (struct context *)context; - const char* dump = getenv("NCCL_PROFILE_DUMP_FILE"); + const char* dump = profilerDumpFile ? profilerDumpFile : getenv("NCCL_PROFILE_DUMP_FILE"); if (dump) { sprintf(filename, "%s_%lu_%d.json", dump, ctx->commHash, ctx->rank); fh = fopen(filename, "w"); @@ -148,10 +186,12 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) { INFO(NCCL_INIT, "PROFILER/Plugin: finalize commName: %s commHash: %lu nranks: %d rank: %d", ctx->commName ? ctx->commName : "", ctx->commHash, ctx->nranks, ctx->rank); // print last N groups/collectives/p2ps - int start = (ctx->groupPoolIndex - groupPoolSize >= 0) ? ctx->groupPoolIndex - groupPoolSize : 0; - int end = ctx->groupPoolIndex; + // Note that since the v5 version of the profiler, group API events are now at the top of the hierarchy. + // Legacy Group events from v4 are still emitted for compatibility purposes when using the v4 profiler but excluded from this example. + int start = (ctx->groupApiPoolIndex - groupApiPoolSize >= 0) ? ctx->groupApiPoolIndex - groupApiPoolSize : 0; + int end = ctx->groupApiPoolIndex; for (int i = start; i < end; i++) { - printEvent(fh, &ctx->groupPool[i%groupPoolSize]); + printEvent(fh, &ctx->groupApiPool[i%groupApiPoolSize]); } start = (ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize >= 0) ? 
ctx->proxyCtrlPoolIndex - proxyCtrlPoolSize : 0; @@ -161,6 +201,10 @@ __hidden ncclResult_t exampleProfilerFinalize(void* context) { } free(ctx->groupPool); + free(ctx->collApiPool); + free(ctx->p2pApiPool); + free(ctx->kernelLaunchPool); + free(ctx->groupApiPool); free(ctx->collPool); free(ctx->p2pPool); free(ctx->proxyCtrlPool); @@ -187,7 +231,113 @@ __hidden void updateEvent(void* handle); __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) { *eHandle = NULL; struct context* ctx = (struct context *)context; - if (eDescr->type == ncclProfileGroup) { + if (eDescr->type == ncclProfileGroupApi) { + struct groupApi* event; + int groupApiId = __atomic_fetch_add(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED); + if ((groupApiId - __atomic_load_n(&ctx->groupApiPoolBase, __ATOMIC_RELAXED)) < groupApiPoolSize) { + // if there are available group API events grab one + event = &ctx->groupApiPool[groupApiId%groupApiPoolSize]; + // Make sure all child events of the picked group API event are cleared + while (!profilerQueueEmpty(&event->collApiEvents)) { + struct collApi *collApiEvent = profilerQueueDequeue(&event->collApiEvents); + resetTaskEvents(collApiEvent, ctx); + __atomic_fetch_add(&ctx->collApiPoolBase, 1, __ATOMIC_RELAXED); + } + while (!profilerQueueEmpty(&event->p2pApiEvents)) { + struct p2pApi *p2pApiEvent = profilerQueueDequeue(&event->p2pApiEvents); + resetTaskEvents(p2pApiEvent, ctx); + __atomic_fetch_add(&ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED); + } + while (!profilerQueueEmpty(&event->kernelLaunchEvents)) { + profilerQueueDequeue(&event->kernelLaunchEvents); + __atomic_fetch_add(&ctx->kernelLaunchPoolBase, 1, __ATOMIC_RELAXED); + } + } else { + // else drop this event + __atomic_fetch_sub(&ctx->groupApiPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + event->type = ncclProfileGroupApi; + event->ctx = ctx; + event->groupApiId = groupApiId; + event->graphCaptured = eDescr->groupApi.graphCaptured; + event->groupDepth = eDescr->groupApi.groupDepth; + event->startTs = gettime() - startTime; + *eHandle = event; + } else if (eDescr->type == ncclProfileCollApi) { + if (eDescr->parentObj == NULL) return ncclSuccess; + struct collApi* event; + int collApiId = __atomic_fetch_add(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED); + if ((collApiId - __atomic_load_n(&ctx->collApiPoolBase, __ATOMIC_RELAXED)) < collApiPoolSize) { + // if there are available Coll API events grab one + event = &ctx->collApiPool[collApiId%collApiPoolSize]; + resetTaskEvents(event, ctx); + } else { + // else drop this event + __atomic_fetch_sub(&ctx->collApiPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + event->type = ncclProfileCollApi; + event->collApiId = collApiId; + event->ctx = ctx; + event->func = eDescr->collApi.func; + event->stream = (cudaStream_t) eDescr->collApi.stream; + event->count = eDescr->collApi.count; + event->datatype = eDescr->collApi.datatype; + event->root = eDescr->collApi.root; + event->graphCaptured = eDescr->collApi.graphCaptured; + struct groupApi* parent = (struct groupApi *) eDescr->parentObj; + event->parent = parent; + profilerQueueEnqueue(&parent->collApiEvents, event); + __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); + *eHandle = event; + } else if (eDescr->type == ncclProfileP2pApi) { + if (eDescr->parentObj == NULL) return ncclSuccess; + struct p2pApi* event; + int p2pApiId = __atomic_fetch_add(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED); + if ((p2pApiId - __atomic_load_n(&ctx->p2pApiPoolBase, 
__ATOMIC_RELAXED)) < p2pApiPoolSize) { + // if there are available p2p API events grab one + event = &ctx->p2pApiPool[p2pApiId%p2pApiPoolSize]; + resetTaskEvents(event, ctx); + } else { + // else drop this event + __atomic_fetch_sub(&ctx->p2pApiPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + event->type = ncclProfileP2pApi; + event->p2pApiId = p2pApiId; + event->ctx = ctx; + event->func = eDescr->p2pApi.func; + event->stream = (cudaStream_t) eDescr->p2pApi.stream; + event->count = eDescr->p2pApi.count; + event->datatype = eDescr->p2pApi.datatype; + event->graphCaptured = eDescr->p2pApi.graphCaptured; + struct groupApi* parent = (struct groupApi *) eDescr->parentObj; + event->parent = parent; + profilerQueueEnqueue(&parent->p2pApiEvents, event); + __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); + *eHandle = event; + } else if (eDescr->type == ncclProfileKernelLaunch) { + if (eDescr->parentObj == NULL) return ncclSuccess; + struct kernelLaunch* event; + int kernelLaunchId = __atomic_fetch_add(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED); + if ((kernelLaunchId - __atomic_load_n(&ctx->kernelLaunchPoolBase, __ATOMIC_RELAXED)) < kernelLaunchPoolSize) { + // if there are available kernel API events grab one + event = &ctx->kernelLaunchPool[kernelLaunchId%kernelLaunchPoolSize]; + } else { + // else drop this event + __atomic_fetch_sub(&ctx->kernelLaunchPoolIndex, 1, __ATOMIC_RELAXED); + return ncclSuccess; + } + event->type = ncclProfileKernelLaunch; + event->stream = (cudaStream_t) eDescr->kernelLaunch.stream; + struct groupApi* parent = (struct groupApi *) eDescr->parentObj; + event->parent = parent; + profilerQueueEnqueue(&parent->kernelLaunchEvents, event); + __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); + *eHandle = event; + } else if (eDescr->type == ncclProfileGroup) { + if (eDescr->parentObj == NULL) return ncclSuccess; struct group* event; int groupId = __atomic_fetch_add(&ctx->groupPoolIndex, 1, __ATOMIC_RELAXED); if ((groupId - __atomic_load_n(&ctx->groupPoolBase, __ATOMIC_RELAXED)) < groupPoolSize) { @@ -222,7 +372,7 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n debugEvent(event, "GroupStart"); } else if (eDescr->type == ncclProfileColl) { // the parent might be null if we run out of events - struct group* parent = (struct group *)eDescr->parentObj; + struct collApi* parent = (struct collApi *)eDescr->parentObj; if (parent == NULL) return ncclSuccess; struct collective* event; @@ -253,12 +403,12 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n event->proto = eDescr->coll.proto; *eHandle = event; taskEventQueueEnqueue(parent, (struct taskEventBase *)event); - // increment the group ref counter so the event will staty open + // increment the group ref counter so the event will stay open __atomic_fetch_add(&parent->refCount, 1, __ATOMIC_RELAXED); debugEvent(event, "CollStart"); } else if (eDescr->type == ncclProfileP2p) { // the parent might be null if we run out of events - struct group* parent = (struct group *)eDescr->parentObj; + struct p2pApi* parent = (struct p2pApi*) eDescr->parentObj; if (parent == NULL) return ncclSuccess; struct p2p* event; @@ -458,8 +608,34 @@ __hidden ncclResult_t exampleProfilerStartEvent(void* context, void** eHandle, n } void updateEvent(void* handle) { - uint8_t type = *(uint8_t *)handle; - if (type == ncclProfileGroup) { + uint64_t type = *(uint64_t *)handle; + if (type == ncclProfileGroupApi) { + struct groupApi* event = (struct groupApi*) 
handle; + if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { + event->stopTs = gettime() - startTime; + __atomic_fetch_add(&event->ctx->groupApiPoolBase, 1, __ATOMIC_RELAXED); + } + } else if (type == ncclProfileCollApi) { + struct collApi* event = (struct collApi*) handle; + if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { + event->stopTs = gettime() - startTime; + __atomic_fetch_add(&event->ctx->collApiPoolBase, 1, __ATOMIC_RELAXED); + } + updateEvent(event->parent); + return; + } else if (type == ncclProfileP2pApi) { + struct p2pApi* event = (struct p2pApi*) handle; + if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { + event->stopTs = gettime() - startTime; + __atomic_fetch_add(&event->ctx->p2pApiPoolBase, 1, __ATOMIC_RELAXED); + } + updateEvent(event->parent); + event->stopTs = gettime() - startTime; + } else if (type == ncclProfileKernelLaunch) { + struct kernelLaunch* event = (struct kernelLaunch*) handle; + event->stopTs = gettime() - startTime; + updateEvent(event->parent); + } else if (type == ncclProfileGroup) { struct group* event = (struct group *)handle; if (__atomic_sub_fetch(&event->refCount, 1, __ATOMIC_RELAXED) == 0) { event->stopTs = gettime() - startTime; @@ -527,25 +703,35 @@ __hidden ncclResult_t exampleProfilerStopEvent(void* eHandle) { // the event handle might be null if we run out of events if (eHandle == NULL) return ncclSuccess; - uint8_t type = *(uint8_t *)eHandle; - if (type == ncclProfileGroup) { - // stopping the group event in NCCL core does not - // mean the group has completed. It means the group - // was submitted/enqueued so we need to keep the event open + uint64_t type = *(uint64_t *)eHandle; + // Stopping API events, Kernel Launch events, collective/p2p task events + // in NCCL core do not mean that they are complete. It means that the + // operation was enqueued so we need to keep the events open + if (type == ncclProfileGroupApi) { + struct groupApi* event = (struct groupApi*) eHandle; + event->stopTs = gettime() - startTime; + return ncclSuccess; + } else if (type == ncclProfileCollApi) { + struct collApi* event = (struct collApi*) eHandle; + event->stopTs = gettime() - startTime; + return ncclSuccess; + } else if (type == ncclProfileP2pApi) { + struct p2pApi* event = (struct p2pApi*) eHandle; + event->stopTs = gettime() - startTime; + return ncclSuccess; + } else if (type == ncclProfileKernelLaunch) { + struct kernelLaunch* event = (struct kernelLaunch*) eHandle; + event->stopTs = gettime() - startTime; + return ncclSuccess; + } else if (type == ncclProfileGroup) { struct group* event = (struct group *)eHandle; event->stopTs = gettime() - startTime; return ncclSuccess; } else if (type == ncclProfileColl) { - // stopping the collective event in NCCL core does not - // mean the collective has completed. It means the collective - // was submitted/enqueued so we need to keep the event open struct collective* event = (struct collective *)eHandle; event->base.stopTs = gettime() - startTime; return ncclSuccess; } else if (type == ncclProfileP2p) { - // stopping the p2p event in NCCL core does not - // mean the p2p has completed. 
It means the p2p - // was submitted/enqueued so we need to keep the event open struct p2p* event = (struct p2p *)eHandle; event->base.stopTs = gettime() - startTime; return ncclSuccess; @@ -559,8 +745,15 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile // the event handle might be null if we run out of events if (eHandle == NULL) return ncclSuccess; - uint8_t type = *(uint8_t *)eHandle; - if (type == ncclProfileProxyOp) { + uint64_t type = *(uint64_t *)eHandle; + if (type == ncclProfileGroupApi) { + struct groupApi* event = (struct groupApi*) eHandle; + if (eState == ncclProfilerEndGroupApiStart) { + event->endOfncclGroupStartTs = gettime() - startTime; + } else if (eState == ncclProfilerBeginGroupApiEnd) { + event->startOfncclGroupEndTs = gettime() - startTime; + } + } else if (type == ncclProfileProxyOp) { struct proxyOp* event = (struct proxyOp *)eHandle; if (eState == ncclProfilerProxyOpInProgress_v4) { event->progrTs = gettime() - startTime; @@ -592,6 +785,8 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile case ncclProfilerProxyStepRecvGPUWait: event->timestamp[PROXY_STEP_RECV_GPU_WAIT] = gettime() - startTime; break; + default: + break; } } else if (type == ncclProfileProxyCtrl) { struct proxyCtrl* event = (struct proxyCtrl *)eHandle; @@ -609,7 +804,7 @@ __hidden ncclResult_t exampleProfilerRecordEventState(void* eHandle, ncclProfile return ncclSuccess; } -ncclProfiler_t ncclProfiler_v4 = { +ncclProfiler_t ncclProfiler_v5 = { "Example-profiler", exampleProfilerInit, exampleProfilerStartEvent, @@ -618,14 +813,15 @@ ncclProfiler_t ncclProfiler_v4 = { exampleProfilerFinalize, }; -int exampleProfilerStart(int eActivationMask) { +__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name) { + profilerDumpFile = name; if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { __atomic_store_n(eActivationMaskPtr, eActivationMask, __ATOMIC_RELAXED); } return ncclSuccess; } -int exampleProfilerStop(void) { +__attribute__((visibility("default"))) int exampleProfilerStop(void) { if (__atomic_load_n(&initialized, __ATOMIC_RELAXED)) { __atomic_store_n(eActivationMaskPtr, 0, __ATOMIC_RELAXED); } diff --git a/ext-profiler/example/plugin.h b/ext-profiler/example/plugin.h index b4d07060a..9248ebf08 100644 --- a/ext-profiler/example/plugin.h +++ b/ext-profiler/example/plugin.h @@ -7,7 +7,8 @@ #ifndef PLUGIN_H_ #define PLUGIN_H_ -int exampleProfilerStart(int eActivationMask); -int exampleProfilerStop(void); +__attribute__((visibility("default"))) int exampleProfilerStart(int eActivationMask, const char* name); +__attribute__((visibility("default"))) int exampleProfilerStop(void); + #endif diff --git a/ext-profiler/example/print_event.c b/ext-profiler/example/print_event.cc similarity index 76% rename from ext-profiler/example/print_event.c rename to ext-profiler/example/print_event.cc index a56106e10..ca3c7cfae 100644 --- a/ext-profiler/example/print_event.c +++ b/ext-profiler/example/print_event.cc @@ -5,15 +5,59 @@ ************************************************************************/ #include +#include "err.h" #include "profiler.h" #include "event.h" #include "print_event.h" +#include #define __hidden __attribute__ ((visibility("hidden"))) // FIXME: chrome tracing asynchronous events (following used) allow event nesting for events that have same id and category // It appears that nesting more than three events causes issues. 
Therefore, every event is given an increasing id and a -// category that matches the type of event (GROUP, COLL, P2P, PROXY, NET) +// category that matches the type of event (GROUP API, COLL API, P2P API, GROUP, COLL, P2P, PROXY, NET) +static __thread int groupApiId; +__hidden void printGroupApiEventHeader(FILE* fh, struct groupApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupApiId\": %d, \"groupDepth\":%d}},\n", + "Group API", groupApiId, getpid(), 1, event->startTs, event->groupApiId, event->groupDepth); +} + +__hidden void printGroupApiEventTrailer(FILE* fh, struct groupApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + "Group API", groupApiId++, getpid(), 1, event->stopTs); +} + +static __thread int p2pApiId; +__hidden void printP2pApiEventHeader(FILE* fh, struct p2pApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": %s, \"GraphCaptured\":%d, \"Stream\": %p}},\n", + event->func, p2pApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->graphCaptured, event->stream); +} + +__hidden void printP2pApiEventTrailer(FILE* fh, struct p2pApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + event->func, p2pApiId++, getpid(), 1, event->stopTs); +} + +static __thread int collApiId; +__hidden void printCollApiEventHeader(FILE* fh, struct collApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"count\": %lu, \"datatype\": %s, \"root\": %d, \"GraphCaptured\":%d, \"Stream\": %p}},\n", + event->func, collApiId, getpid(), 1, event->startTs, event->count, event->datatype, event->root, event->graphCaptured, event->stream); +} + +__hidden void printCollApiEventTrailer(FILE* fh, struct collApi* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL_API\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", + event->func, collApiId++, getpid(), 1, event->stopTs); +} + +static __thread int kernelLaunchId; +__hidden void printKernelLaunchEventHeader(FILE* fh, struct kernelLaunch* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d, \"Stream\": %p}},\n", "KernelLaunch", kernelLaunchId, getpid(), 1, event->startTs, event->kernelLaunchId, event->stream); +} + +__hidden void printKernelLaunchEventTrailer(FILE* fh, struct kernelLaunch* event) { + fprintf(fh, "{\"name\": \"%s\", \"cat\": \"KERNEL_LAUNCH\", \"ph\": \"e\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f},\n", "KernelLaunch", kernelLaunchId++, getpid(), 1, event->stopTs); +} + static __thread int groupId; __hidden void printGroupEventHeader(FILE* fh, struct group* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"GROUP\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"groupId\": %d}},\n", @@ -28,7 +72,7 @@ __hidden void printGroupEventTrailer(FILE* fh, struct group* event) { static __thread int collId; __hidden void printCollEventHeader(FILE* fh, struct collective* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"COLL\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": 
%d, \"ts\": %f, \"args\": {\"SeqNum\": %lu, \"CommHash\": %lu, \"Rank\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"Algorithm\": \"%s\", \"Protocol\": \"%s\", \"nChannels\": %d}},\n", - event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, event->base.parent->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels); + event->base.func, collId, getpid(), 1, event->base.startTs, event->seqNumber, ((struct collApi*)event->base.parent)->ctx->commHash, event->base.rank, event->count, event->datatype, event->algo, event->proto, event->nChannels); } __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { @@ -39,7 +83,7 @@ __hidden void printCollEventTrailer(FILE* fh, struct collective* event) { static __thread int p2pId; __hidden void printP2pEventHeader(FILE* fh, struct p2p* event) { fprintf(fh, "{\"name\": \"%s\", \"cat\": \"P2P\", \"ph\": \"b\", \"id\": %d, \"pid\": %d, \"tid\": %d, \"ts\": %f, \"args\": {\"CommHash\": %lu, \"Rank\": %d, \"Peer\": %d, \"Count\": %lu, \"Datatype\": \"%s\", \"nChannels\": %d}},\n", - event->base.func, p2pId, getpid(), 1, event->base.startTs, event->base.parent->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels); + event->base.func, p2pId, getpid(), 1, event->base.startTs, ((struct p2pApi*)event->base.parent)->ctx->commHash, event->base.rank, event->peer, event->count, event->datatype, event->nChannels); } __hidden void printP2pEventTrailer(FILE* fh, struct p2p* event) { @@ -173,7 +217,7 @@ void debugEvent(void* eHandle, const char* tag) { char filename[64] = { 0 }; sprintf(filename, "EventDebug-%d", getpid()); FILE* fh = fopen(filename, "a+"); - uint8_t type = *(uint8_t *)eHandle; + uint64_t type = *(uint64_t *)eHandle; if (type == ncclProfileGroup) { struct group* event = (struct group *)eHandle; fprintf(fh, "Group event %p tag = %s {\n", event, tag); @@ -241,8 +285,51 @@ void debugEvent(void* eHandle, const char* tag) { void printEvent(FILE* fh, void* handle) { if (handle == NULL || fh == NULL) return; - uint8_t type = *(uint8_t *)handle; - if (type == ncclProfileGroup) { + uint64_t type = *(uint64_t *)handle; + if (type == ncclProfileGroupApi) { + struct groupApi* g = (struct groupApi*) handle; + printGroupApiEventHeader(fh, g); + struct kernelLaunch* kernelLaunchHead = profilerQueueHead(&g->kernelLaunchEvents); + while (kernelLaunchHead != NULL) { + printEvent(fh, kernelLaunchHead); + kernelLaunchHead = kernelLaunchHead->next; + } + struct collApi* collApiHead = profilerQueueHead(&g->collApiEvents); + while (collApiHead != NULL) { + printEvent(fh, collApiHead); + collApiHead = collApiHead->next; + } + struct p2pApi* p2pApiHead = profilerQueueHead(&g->p2pApiEvents); + while (p2pApiHead != NULL) { + printEvent(fh, p2pApiHead); + p2pApiHead = p2pApiHead->next; + } + printGroupApiEventTrailer(fh, g); + } else if (type == ncclProfileCollApi) { + struct collApi* collApiEvent = (struct collApi *) handle; + printCollApiEventHeader(fh, collApiEvent); + struct taskEventBase* base = taskEventQueueHead(collApiEvent); + while (base) { + struct taskEventBase* next = base->next; + printEvent(fh, base); + base = next; + } + printCollApiEventTrailer(fh, collApiEvent); + } else if (type == ncclProfileP2pApi) { + struct p2pApi* p2pApiEvent = (struct p2pApi *) handle; + printP2pApiEventHeader(fh, p2pApiEvent); + struct taskEventBase* base = taskEventQueueHead(p2pApiEvent); + while (base) { + struct taskEventBase* next = base->next; + 
printEvent(fh, base); + base = next; + } + printP2pApiEventTrailer(fh, p2pApiEvent); + } else if (type == ncclProfileKernelLaunch) { + struct kernelLaunch* kernelLaunchEvent = (struct kernelLaunch *) handle; + printKernelLaunchEventHeader(fh, kernelLaunchEvent); + printKernelLaunchEventTrailer(fh, kernelLaunchEvent); + } else if (type == ncclProfileGroup) { struct group* g = (struct group *)handle; printGroupEventHeader(fh, g); struct taskEventBase* base = taskEventQueueHead(g); diff --git a/ext-profiler/example/queue.h b/ext-profiler/example/queue.h new file mode 100644 index 000000000..dfb14f575 --- /dev/null +++ b/ext-profiler/example/queue.h @@ -0,0 +1,50 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef QUEUE_H +#define QUEUE_H + +template +struct profilerQueue { + T *head, *tail; +}; + +template + inline void profilerQueueConstruct(profilerQueue *me) { + me->head = nullptr; + me->tail = nullptr; +} + +template + inline bool profilerQueueEmpty(profilerQueue *me) { + return me->head == nullptr; +} + +template +inline T* profilerQueueHead(profilerQueue *me) { + return me->head; +} + +template + inline T* profilerQueueTail(profilerQueue *me) { + return me->tail; +} + +template + inline void profilerQueueEnqueue(profilerQueue *me, T *x) { + x->*next = nullptr; + (me->head ? me->tail->*next : me->head) = x; + me->tail = x; +} + +template + inline T* profilerQueueDequeue(profilerQueue *me) { + T *ans = me->head; + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + return ans; +} + +#endif diff --git a/ext-profiler/google-CoMMA/Makefile b/ext-profiler/google-CoMMA/Makefile new file mode 100644 index 000000000..2da516990 --- /dev/null +++ b/ext-profiler/google-CoMMA/Makefile @@ -0,0 +1,22 @@ +.PHONY: build-CoMMA + +all: build-CoMMA + +build-CoMMA: clone-CoMMA + cd CoMMA && cargo build + +clone-CoMMA: + @if [ ! -d CoMMA ] ; then \ + git clone https://github.com/google/CoMMA.git; \ + ln -s $(PWD)/.. CoMMA/third_party/nccl/ext-profiler; \ + fi + +clean: + @if [ -d CoMMA ] ; then \ + cd CoMMA && cargo clean; \ + fi + +delete: + @if [ -d CoMMA ] ; then \ + rm -rf CoMMA; \ + fi diff --git a/ext-profiler/inspector/Makefile b/ext-profiler/inspector/Makefile new file mode 100644 index 000000000..301c46b20 --- /dev/null +++ b/ext-profiler/inspector/Makefile @@ -0,0 +1,62 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Variables +NCCL_HOME := ../../build +INC := -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl +PLUGIN_SO := libnccl-profiler-inspector.so +VERSION_FILE := version.cc + +# Compiler and flags +CXX := g++ +CXXFLAGS := -g -O3 -fPIC -shared -march=native -DNDEBUG -Wall -Wextra + +ifeq ($(DEBUG), 1) +CXXFLAGS += -g2 -ggdb3 -rdynamic -funwind-tables -fno-omit-frame-pointer +endif + +ifeq ($(ASAN), 1) +CXXFLAGS += -fsanitize=address +LDFLAGS += -fsanitize=address -static-libasan +NVLDFLAGS += -Xcompiler -fsanitize=address,-static-libasan +endif + +ifeq ($(UBSAN), 1) +CXXFLAGS += -fsanitize=undefined +LDFLAGS += -fsanitize=undefined -static-libubsan +NVLDFLAGS += -Xcompiler -fsanitize=undefined,-static-libubsan +endif + +# Source files +SOURCES := inspector_plugin.cc inspector.cc json.cc + +# Default target +all: $(PLUGIN_SO) + +# Rule to build the plugin +$(PLUGIN_SO): $(VERSION_FILE) $(SOURCES) + @echo "Compiling to create $@ from $^" + $(CXX) $(INC) $(CXXFLAGS) -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + +# Rule to generate version.cc +$(VERSION_FILE): + @GIT_INFO=$$(./utils/extract_git_version.sh); \ + echo '#include "version.h"' > $(VERSION_FILE).tmp; \ + echo 'const char* get_git_version_info() { return "'$$GIT_INFO'"; }' >> $(VERSION_FILE).tmp; \ + if ! cmp $(VERSION_FILE).tmp $(VERSION_FILE); then \ + echo "updating ${VERSION_FILE} file -> $$GIT_INFO"; \ + mv $(VERSION_FILE).tmp $(VERSION_FILE); \ + else \ + echo "${VERSION_FILE} up to date -> $$GIT_INFO"; \ + rm $(VERSION_FILE).tmp; \ + fi + +# Clean target +clean: + rm -f $(VERSION_FILE) $(PLUGIN_SO) + +# Phony targets +.PHONY: all clean diff --git a/ext-profiler/inspector/README.md b/ext-profiler/inspector/README.md new file mode 100644 index 000000000..daf26f7dd --- /dev/null +++ b/ext-profiler/inspector/README.md @@ -0,0 +1,216 @@ +# NCCL Inspector Plugin + +The NCCL Inspector is a plugin for the NVIDIA Collective Communications Library (NCCL) that provides detailed, per-communicator, per-collective performance and metadata logging. It is designed to help users analyze and debug NCCL collective operations by generating structured JSON output for each operation. + +## Related Documentation + +- **[Performance Exporter](exporter/example/README.md)** - Tool for analyzing and visualizing NCCL performance data from inspector logs + +## Folder Location + +The Inspector plugin source is located in: + +``` +ext-profiler/inspector/ +``` + +## Building the Inspector Plugin + +To build the Inspector plugin, run: + +```bash +make +``` + +The build system will automatically detect CUDA and NCCL installations from your environment. If you need to specify custom paths, you can set `CUDA_HOME` and `NCCL_HOME` environment variables or pass them as make arguments. 
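For example, assuming an out-of-tree NCCL build and a standard CUDA toolkit install (both paths below are illustrative), the `NCCL_HOME` and `CUDA_HOME` variables consumed by the Makefile can be overridden directly on the make command line:

```bash
# Illustrative paths; substitute your own NCCL build tree and CUDA toolkit location
make NCCL_HOME=/opt/nccl/build CUDA_HOME=/usr/local/cuda
```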
+ +### Build Options + +The Makefile supports several build options: + +- **DEBUG=1**: Enable debug build with additional debugging information +- **ASAN=1**: Enable Address Sanitizer for memory error detection +- **UBSAN=1**: Enable Undefined Behavior Sanitizer + +Example debug build: +```bash +make DEBUG=1 +``` + +### Build Output + +The build process creates: +- `libnccl-profiler-inspector.so`: The main inspector plugin library +- `version.cc`: Auto-generated version information from git + +## Using NCCL Inspector + +### Key Differences from Normal NCCL Usage + +The main difference between running NCCL with the Inspector plugin versus running NCCL normally is the addition of environment variables that enable detailed performance logging: + +**Normal NCCL Run:** +```bash +# Standard NCCL execution +./your_nccl_application +``` + +**NCCL Inspector Run:** +```bash +# NCCL Inspector enabled execution +export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so +export NCCL_INSPECTOR_ENABLE=1 +export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500 +./your_nccl_application +``` + +### Required Environment Variables + +- `NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so` + Loads the Inspector plugin into NCCL. +- `NCCL_INSPECTOR_ENABLE=1` + Enables the Inspector plugin. +- `NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=` + Sets the interval (in microseconds) for the internal dump thread to write output. Example: `500`. +- `NCCL_INSPECTOR_DUMP_DIR=` (optional) + Sets the output directory for logs. If not set, defaults to `nccl-inspector-unknown-jobid` or `nccl-inspector-` if running under SLURM. +- `NCCL_INSPECTOR_DUMP_VERBOSE=<0|1>` (optional) + Enables verbose output including event trace information. Set to `1` to enable, `0` to disable (default). 
+ +### Example Usage + +**Single Node:** +```bash +export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so +export NCCL_INSPECTOR_ENABLE=1 +export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500 +./build/test/perf/all_reduce_perf -b 8 -e 16G -f 2 -g 8 +``` + +**Multi-Node (SLURM):** +```bash +# Add these environment variables to your SLURM script +export NCCL_PROFILER_PLUGIN=/path/to/nccl/ext-profiler/inspector/libnccl-profiler-inspector.so +export NCCL_INSPECTOR_ENABLE=1 +export NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS=500 +export NCCL_INSPECTOR_DUMP_DIR=/path/to/logs/${SLURM_JOB_ID}/ + +# Then run your normal NCCL application +srun your_nccl_application +``` + +## Example Scripts + +For detailed example scripts showing how to integrate NCCL Inspector with different workloads, see the **[test/examples/](test/examples/)** directory: + +- **Single Node Example**: Basic NCCL performance testing with inspector +- **Multi-Node SLURM Example**: Comprehensive multi-node testing with various collective operations +- **Training Workload Example**: Integration with distributed training workloads + +## Output Example + +Each output file contains JSON objects with the following structure: + +```json +{ + "header": { + "id": "0x7f8c496ae9f661", + "rank": 2, + "n_ranks": 8, + "nnodes": 1 + }, + "metadata": { + "inspector_output_format_version": "v4.0", + "git_rev": "", + "rec_mechanism": "profiler_plugin", + "dump_timestamp_us": 1748030377748202, + "hostname": "example-hostname", + "pid": 1639453 + }, + "coll_perf": { + "coll": "AllReduce", + "coll_sn": 1407, + "coll_msg_size_bytes": 17179869184, + "coll_exec_time_us": 61974, + "coll_algobw_gbs": 277.210914, + "coll_busbw_gbs": 485.119099 + } +} +``` + +## Output Example Verbose + +To enable verbose output with event trace information, set the `NCCL_INSPECTOR_DUMP_VERBOSE=1` environment variable: + +```bash +export NCCL_INSPECTOR_DUMP_VERBOSE=1 +``` + +This will include additional event trace information in the JSON output, showing the sequence of callbacks and timestamps for each individual event. + +```json +{ + "header": { + "id": "0xe62dedaa97644a", + "rank": 4, + "n_ranks": 8, + "nnodes": 1 + }, + "metadata": { + "inspector_output_format_version": "v4.0", + "git_rev": "9019a1912-dirty", + "rec_mechanism": "nccl_profiler_interface", + "dump_timestamp_us": 1752867229276385, + "hostname": "example-hostname", + "pid": 438776 + }, + "coll_perf": { + "coll": "ReduceScatter", + "coll_sn": 1231, + "coll_msg_size_bytes": 2147483648, + "coll_exec_time_us": 41057, + "coll_timing_source": "kernel_gpu", + "coll_algobw_gbs": 418.439467, + "coll_busbw_gbs": 366.134533, + "event_trace_sn": { + "coll_start_sn": 1, + "coll_stop_sn": 2, + "kernel_events": [ + { + "channel_id": 0, + "kernel_start_sn": 3, + "kernel_stop_sn": 48, + "kernel_record_sn": 47 + } + ] + }, + "event_trace_ts": { + "coll_start_ts": 1752867229235059, + "coll_stop_ts": 1752867229235064, + "kernel_events": [ + { + "channel_id": 0, + "kernel_start_ts": 1752867229235181, + "kernel_stop_ts": 1752867229275811, + "kernel_record_ts": 1752867229275811 + } + ] + } + } +} +``` + +Multiple such JSON objects are written, one per collective operation per communicator. + +## Output Directory + +- By default, output files are written to: + - `nccl-inspector-unknown-jobid` (if no SLURM job ID is present) + - `nccl-inspector-` (if running under SLURM) +- You can override this with the `NCCL_INSPECTOR_DUMP_DIR` environment variable. 
+ +## Additional Notes + +- The plugin is compatible with standard NCCL workflows and can be used in both single-node and multi-node (SLURM) environments. +- For more details, see the source code and comments in `ext-profiler/inspector/`. + diff --git a/ext-profiler/inspector/exporter/example/README.md b/ext-profiler/inspector/exporter/example/README.md new file mode 100644 index 000000000..26e4b2e57 --- /dev/null +++ b/ext-profiler/inspector/exporter/example/README.md @@ -0,0 +1,151 @@ +# NCCL Inspector Performance Summary Exporter + +This tool processes NCCL Inspector log files and generates comprehensive performance analysis reports including visualizations and statistical summaries. +One can build similar exporters to integrate with various observability systems like Elastic, Prometheus or other Custom Metric systems. + +## Features + +- **Performance Analysis**: Generates statistical summaries for collective operations +- **Communication Type Classification**: Automatically categorizes communication patterns +- **Visualizations**: Creates scatter plots, histograms, and box plots for performance metrics +- **Data Export**: Converts logs to Parquet format for efficient processing +- **Multi-format Log Support**: Processes `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files +- **Parallel Processing**: Utilizes multi-core processing for faster analysis + +## Requirements + +- Python 3.7+ +- Access to NCCL Inspector log files + +## Installation + +### Clone the Repository + +```bash +git clone https://github.com/NVIDIA/nccl.git +cd nccl/ext-profiler/inspector/exporter/example +``` + +Install the required dependencies using the provided `requirements.txt` file: + +```bash +pip install -r requirements.txt +``` + +## Usage + +The script processes NCCL Inspector log files from a specified directory. + +**Note:** To generate NCCL Inspector log files, you need to run your NCCL application with the inspector plugin enabled. The log files will be output to a directory specified by the `NCCL_INSPECTOR_DUMP_DIR` environment variable. For detailed setup instructions and environment variable configuration, see the [Inspector README](../../../README.md). + +### Basic Usage + +```bash +python perf_summary_exporter.py --input_dir /path/to/nccl/inspector/logs +``` + +This mode processes all log files in the specified directory and its subdirectories recursively. + +### Command Line Arguments + +- `--input_dir `: **Required**. Directory containing NCCL Inspector log files (searches recursively in subdirectories) +- `--output_dir `: **Optional**. Custom output directory name (default: `-analysis`) + +## Output + +The tool generates: + +1. **Parquet Files**: One per log file containing processed log data (stored in `parquet_files/` subdirectory) +2. **Summary Directory**: Contains comprehensive analysis results +3. **Visualizations**: Scatter plots, histograms, and box plots for each message size +4. **CSV Files**: Detailed summaries for each message size and collective type +5. **Log File**: Processing log with detailed information + +## Example Output Structure + +``` +/ +├── output.log +├── parquet_files/ +│ ├── .parquet +│ ├── .parquet +│ └── ... 
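As a rough sketch of that idea (not part of this tool), the standalone script below walks a directory of inspector `.log`/`.log.gz`/`.jsonl`/`.jsonl.gz` files and emits one Prometheus-style text line per collective. Only the `header` and `coll_perf` field names come from the inspector output format documented in this README; the metric name, label choices, and default paths are invented for illustration.

```python
#!/usr/bin/env python3
"""Minimal custom-exporter sketch: inspector JSON lines -> Prometheus-style text samples.

Only header.* and coll_perf.* field names come from the inspector output format;
the metric/label names below are illustrative.
"""
import glob
import gzip
import json
import sys


def open_log(path):
    # Inspector logs may be plain or gzip-compressed JSON lines.
    return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "r")


def export(input_dir):
    patterns = ("*.log", "*.log.gz", "*.jsonl", "*.jsonl.gz")
    for pattern in patterns:
        for path in glob.iglob(f"{input_dir}/**/{pattern}", recursive=True):
            with open_log(path) as f:
                for line in f:
                    try:
                        rec = json.loads(line)
                    except json.JSONDecodeError:
                        continue  # skip partially written lines
                    header = rec.get("header", {})
                    perf = rec.get("coll_perf", {})
                    if not header or not perf:
                        continue
                    # One gauge sample per collective operation per communicator.
                    print(
                        'nccl_coll_busbw_gbs{comm="%s",rank="%s",coll="%s",sn="%s",msg_bytes="%s"} %s'
                        % (
                            header.get("id"),
                            header.get("rank"),
                            perf.get("coll"),
                            perf.get("coll_sn"),
                            perf.get("coll_msg_size_bytes"),
                            perf.get("coll_busbw_gbs"),
                        )
                    )


if __name__ == "__main__":
    export(sys.argv[1] if len(sys.argv) > 1 else ".")
```

A real exporter would push these samples to the target observability system instead of printing them, but the parsing loop over the JSON records stays the same.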
+└── summary/ + ├── scatter_plot__.png + ├── combined_scatter_plot__.png + └── msg_size_/ + ├── histograms/ + │ └── histogram___.png + ├── boxplots/ + │ └── boxplot___.png + └── summary___.csv +``` + +## Supported Communicator Types + +- `single-rank` +- `nvlink-only` +- `hca-only` +- `mixed` + +## Supported Collective Types + +- `AllReduce` +- `AllGather` +- `ReduceScatter` +- `Broadcast` + +## Log File Formats + +### Supported Formats + +- `.log` - Plain text JSON lines +- `.log.gz` - Compressed JSON lines +- `.jsonl` - JSON lines format +- `.jsonl.gz` - Compressed JSON lines + +### Expected JSON Structure + +```json +{ + "header": { + "id": "0x9e7a479f95a66c", + "rank": 31, + "n_ranks": 32, + "nnodes": 4 + }, + "metadata": { + "inspector_output_format_version": "v4.0", + "git_rev": "75e61acda-dirty", + "rec_mechanism": "nccl_profiler_interface", + "dump_timestamp_us": 1749490229087081, + "hostname": "example-hostname", + "pid": 468528 + }, + "coll_perf": { + "coll": "ReduceScatter", + "coll_sn": 129, + "coll_msg_size_bytes": 65536, + "coll_exec_time_us": 110, + "coll_timing_source": "kernel_gpu", + "coll_algobw_gbs": 19.065018, + "coll_busbw_gbs": 18.469236 + } +} +``` + +## Troubleshooting + +### Common Issues + +1. **No log files found**: Ensure the log directory path is correct and contains valid log files +2. **Missing dependencies**: Ensure all requirements are installed in your virtual environment +3. **Mixed file formats**: The tool will exit if it detects mixed `.log`, `.log.gz`, `.jsonl`, and `.jsonl.gz` files in the same directory. This is typically indicative of corrupt input directories caused by multiple overlapping NCCL Inspector runs with different output format options. Clean the directory and re-run with consistent settings. + +### Log Files + +The tool creates detailed logs in the output directory. Check `output.log` for processing information and any error messages. + +## Support + +Please refer to the github issues page at https://github.com/NVIDIA/nccl/issues. Your question may already have been asked by another user. If not, feel free to create a new issue and refer to the "inspector plugin" in the title. 
diff --git a/ext-profiler/inspector/exporter/example/perf_summary_exporter.py b/ext-profiler/inspector/exporter/example/perf_summary_exporter.py new file mode 100644 index 000000000..5913152ce --- /dev/null +++ b/ext-profiler/inspector/exporter/example/perf_summary_exporter.py @@ -0,0 +1,548 @@ +from pathlib import Path +import argparse +import glob +import gzip +import sys +import pandas as pd +from concurrent.futures import ProcessPoolExecutor +import json +from tqdm.auto import tqdm +import duckdb +import math +import matplotlib.pyplot as plt +import matplotlib.dates +from matplotlib.gridspec import GridSpec +import os +import logging +import contextlib +from datetime import datetime +import numpy as np + +def setup_logging(output_dir): + log_file = output_dir / "output.log" + logging.basicConfig( + filename=log_file, + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + ) + + +@contextlib.contextmanager +def smart_open(filename, mode="r"): + if filename.endswith(".gz"): + opener = gzip.open + else: + opener = open + + with opener(filename, mode) as f: + yield f + + +def get_log_files_and_output_dir(): + parser = argparse.ArgumentParser(description="Process log files in a directory.") + parser.add_argument( + "--input_dir", + type=str, + help="The directory containing NCCL Inspector log files to process.", + ) + parser.add_argument( + "--output_dir", + type=str, + help="Custom output directory name (default: auto-generated from input directory)." + ) + args = parser.parse_args() + + if args.input_dir: + # Use the provided input directory + root_dir = Path(args.input_dir) + if not root_dir.exists(): + raise FileNotFoundError(f"Input directory not found: {root_dir}") + + logfiles = list(glob.iglob(str(Path(root_dir) / "**" / "*.log"), recursive=True)) + gzlogfiles = list( + glob.iglob(str(Path(root_dir) / "**" / "*.log.gz"), recursive=True) + ) + jsonlfiles = list( + glob.iglob(str(Path(root_dir) / "**" / "*.jsonl"), recursive=True) + ) + gzjsonlfiles = list( + glob.iglob(str(Path(root_dir) / "**" / "*.jsonl.gz"), recursive=True) + ) + if ( + sum((1 for x in [logfiles, gzlogfiles, jsonlfiles, gzjsonlfiles] if len(x) > 0)) + > 1 + ): + ### TODO: we could probably generate some logic to pick the "right" file to load, but for now, bail + logging.critical("Appear to have mixed .log/.log.gz/.jsonl/.jsonl.gz; bailing!") + sys.exit(1) + + files = logfiles + gzlogfiles + jsonlfiles + gzjsonlfiles + + if not files: + print("No inspector logs found") + sys.exit(1) + + # Generate output directory name from input directory + if args.output_dir: + output_dir_name = args.output_dir + else: + output_dir_name = f"{root_dir.name}-analysis" + + return files, output_dir_name + +def bytes_to_human_readable(size_bytes): + """ + Convert bytes to human-readable format using decimal (SI) units. 
+ + Uses powers of 1000 (decimal/SI standard): + - 1 KB = 1,000 bytes + - 1 MB = 1,000,000 bytes + - 1 GB = 1,000,000,000 bytes + + Not binary units (powers of 1024): + - Does NOT use KiB, MiB, GiB (1024-based) + + Args: + size_bytes: Number of bytes to convert + + Returns: + Human-readable string (e.g., "1.50MB", "2.34GB") + """ + if size_bytes == 0: + return "0B" + size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.log10(int(size_bytes)) / 3) + s = round(size_bytes * math.pow(10, -3 * i), 2) + return f"{s:.2f}{size_name[i]}" + +def timestamp_to_datetime(timestamp_us): + """Convert microsecond timestamp to datetime string""" + return datetime.fromtimestamp(timestamp_us / 1000000).strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + +def microseconds_to_human_readable(microseconds): + """Convert microseconds to human readable format""" + if microseconds < 1000: + return f"{microseconds:.1f}μs" + elif microseconds < 1000000: + return f"{microseconds/1000:.1f}ms" + else: + return f"{microseconds/1000000:.1f}s" + +def get_comm_type(row) -> str: + if row["n_ranks"] == 1: + return "single-rank" + elif row["nnodes"] == 1: + return "nvlink-only" + elif row["n_ranks"] == row["nnodes"]: + return "hca-only" + else: + return "mixed" + +def parse_file(filepath: Path, output_dir): + filename = Path(filepath).stem + parquet_file = output_dir / f"{filename}.parquet" + + # Check if parquet file exists and is newer than source file + if parquet_file.exists(): + source_mtime = Path(filepath).stat().st_mtime + parquet_mtime = parquet_file.stat().st_mtime + if parquet_mtime >= source_mtime: + logging.info(f"Parquet file {parquet_file} is up to date. Skipping...") + return + else: + logging.info(f"Source file {filepath} is newer than parquet. Regenerating...") + + # Check if file is empty or too small + file_size = Path(filepath).stat().st_size + if file_size == 0: + logging.warning(f"Skipping empty file: {filepath}") + return + + recs = [] + try: + with smart_open(filepath, "r") as infile: + for lineno, line in enumerate(infile): + try: + json_recs = json.loads(line) + except json.JSONDecodeError: + logging.error(f"Failed to parse line {filepath}:{lineno}") + continue + + # Validate that required fields exist + if not all(key in json_recs for key in ["header", "metadata", "coll_perf"]): + logging.error(f"Missing required fields in {filepath}:{lineno}") + continue + + header = json_recs["header"] + metadata = json_recs["metadata"] + comm_type = get_comm_type(header) + coll_perf = json_recs["coll_perf"] + recs.append( + dict( + **header, + comm_type=comm_type, + **coll_perf, + **metadata, + ) + ) + except Exception as e: + logging.error(f"Error reading file {filepath}: {e}") + return + + # Skip files with no valid records + if not recs: + logging.warning(f"No valid records found in file: {filepath}. 
Skipping...") + return + + df = pd.DataFrame(recs) + df.to_parquet(parquet_file) + logging.info(f"Created parquet file {parquet_file} with {len(recs)} records") + +def create_per_node_parquet_files(files, output_dir): + output_dir = Path(output_dir) / "parquet_files" + output_dir.mkdir(parents=True, exist_ok=True) + max_workers = min(64, len(files), os.cpu_count() or 1) + with ProcessPoolExecutor(max_workers=max_workers) as executor: + list( + tqdm( + executor.map(parse_file, files, [output_dir] * len(files)), + total=len(files), + desc="Processing files", + unit="file", + ) + ) + return output_dir + +def generate_scatter_plot(df, comm_type, coll_type, output_file): + plt.figure(figsize=(10, 6), dpi=100) + distinct_msg_sizes = df["coll_msg_size_bytes"].unique() + + for msg_size in distinct_msg_sizes: + df_msg_size = df[df["coll_msg_size_bytes"] == msg_size] + mean_busbw = df_msg_size["mean_coll_busbw_gbs"].mean() + plt.scatter( + df_msg_size["coll_sn"], + df_msg_size["mean_coll_busbw_gbs"], + label=f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {mean_busbw:.2f} GB/s)", + alpha=0.5, + ) + + plt.xlabel("Operation Sequence Number") + plt.ylabel("Mean Collective Bus BW (GB/s)") + plt.title(f"Comm Type: {comm_type}, Coll Type: {coll_type}") + plt.legend(title="Message Size", loc="upper right") + plt.tight_layout() + plt.savefig(output_file) + plt.close() + logging.info(f"Scatter plot saved to {output_file}") + +def generate_combined_scatter_plot(df, comm_type, coll_type, output_file, max_cols=3): + distinct_msg_sizes = df["coll_msg_size_bytes"].unique() + num_plots = len(distinct_msg_sizes) + + # Compute number of rows and columns + num_cols = min(max_cols, num_plots) # Limit max columns + num_rows = (num_plots + num_cols - 1) // num_cols # Calculate rows dynamically + + # Create figure with GridSpec + fig = plt.figure(figsize=(5 * num_cols, 5 * num_rows), dpi=100) + gs = GridSpec(num_rows, num_cols, figure=fig) + + for i, msg_size in enumerate(distinct_msg_sizes): + row, col = divmod(i, num_cols) # Determine row & column index + ax = fig.add_subplot(gs[row, col]) # Create subplot at position + + df_msg_size = df[df["coll_msg_size_bytes"] == msg_size] + mean_busbw = df_msg_size["mean_coll_busbw_gbs"].mean() + ax.scatter( + df_msg_size["coll_sn"], + df_msg_size["mean_coll_busbw_gbs"], + label=f"MsgSize: {bytes_to_human_readable(msg_size)} (Mean: {mean_busbw:.2f} GB/s)", + alpha=0.5, + ) + ax.set_xlabel("Op Seq No") + ax.set_ylabel("Mean Collective Bus BW (GB/s)") + ax.set_title(f"Message Size: {bytes_to_human_readable(msg_size)}({msg_size})") + ax.legend(loc="upper right") + + fig.suptitle(f"Comm Type: {comm_type}, Coll Type: {coll_type}", ha="center", y=0.98) + + plt.tight_layout() + plt.savefig(output_file) + plt.close() + logging.info(f"Combined scatter plot saved to {output_file}") + +def generate_histogram(df, comm_type, coll_type, output_file, message_size): + plt.figure(figsize=(10, 6), dpi=100) + data_range = df["mean_coll_busbw_gbs"].max() - df["mean_coll_busbw_gbs"].min() + num_bins = min(50, int(data_range) + 1) + plt.hist( + df["mean_coll_busbw_gbs"], + bins=num_bins, + alpha=0.7, + color="b", + edgecolor="black", + linewidth=1.2, + ) + plt.xlabel("Mean Collective Bus BW (GB/s)") + plt.ylabel("Frequency") + plt.title( + f"Comm Type: {comm_type}, Coll Type: {coll_type} Mean Collective Bus BW Histogram\nMsg Size: {message_size}" + ) + plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f"{y:.0f}")) + plt.gca().xaxis.set_major_formatter(plt.FuncFormatter(lambda x, 
_: f"{x:.2f} GB/s")) + plt.gca().xaxis.get_offset_text().set_visible(False) + plt.tight_layout() + plt.savefig(output_file) + plt.close() + logging.info(f"Histogram saved to {output_file}") + +def generate_boxplot(df, comm_type, coll_type, output_file, message_size): + plt.figure(figsize=(10, 6)) + boxprops = dict(linestyle="-", linewidth=2, color="blue") + flierprops = dict(marker="o", color="red", alpha=0.5) + medianprops = dict(linestyle="-", linewidth=2.5, color="orange") + whiskerprops = dict(linestyle="--", linewidth=2, color="green") + capprops = dict(linestyle="-", linewidth=2, color="black") + + plt.boxplot( + df["mean_coll_busbw_gbs"], + vert=False, + patch_artist=True, + boxprops=boxprops, + flierprops=flierprops, + medianprops=medianprops, + whiskerprops=whiskerprops, + capprops=capprops, + ) + + plt.xlabel("Mean Coll Bus BW (GB/s)") + plt.title( + f"Box Plot of Coll Bus BW (CommType: {comm_type} - Coll Type: {coll_type} - Msg Size: {message_size})" + ) + + # Adding labels for min, max, and median + stats = df["mean_coll_busbw_gbs"].describe(percentiles=[0.5]) + plt.annotate( + f"Min: {stats['min']:.2f}", + xy=(stats["min"], 1), + xytext=(stats["min"], 1.1), + arrowprops=dict(facecolor="black", shrink=0.05), + ) + plt.annotate( + f"Median: {stats['50%']:.2f}", + xy=(stats["50%"], 1), + xytext=(stats["50%"], 1.1), + arrowprops=dict(facecolor="black", shrink=0.05), + ) + plt.annotate( + f"Max: {stats['max']:.2f}", + xy=(stats["max"], 1), + xytext=(stats["max"], 1.1), + arrowprops=dict(facecolor="black", shrink=0.05), + ) + + plt.tight_layout() + plt.savefig(output_file) + plt.close() + logging.info(f"Box plot saved to {output_file}") + + +def summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name): + """Summarize parquet data per communication and collective type using DuckDB""" + logging.info(f"Summarizing data per comm/coll type for {output_dir_name}, {comm_type} and {coll_type}") + + # Check if there are any parquet files + parquet_dir = output_root / "parquet_files" + parquet_files = list(parquet_dir.glob("*.parquet")) + if not parquet_files: + logging.warning(f"No parquet files found for {comm_type} and {coll_type}") + return None + + # Clean up invalid/empty parquet files by moving them to a separate directory + invalid_dir = parquet_dir / "invalid" + invalid_dir.mkdir(exist_ok=True) + + invalid_count = 0 + for pf in parquet_files: + try: + # Check file size first + if pf.stat().st_size == 0: + logging.warning(f"Moving zero-byte parquet file {pf} to invalid directory") + pf.rename(invalid_dir / pf.name) + invalid_count += 1 + continue + + # Use pyarrow to check parquet metadata without reading data + import pyarrow.parquet as pq + parquet_file = pq.ParquetFile(pf) + if parquet_file.metadata.num_rows == 0: + logging.warning(f"Moving empty parquet file {pf} (0 rows) to invalid directory") + pf.rename(invalid_dir / pf.name) + invalid_count += 1 + except Exception as e: + logging.warning(f"Moving invalid parquet file {pf} to invalid directory: {e}") + pf.rename(invalid_dir / pf.name) + invalid_count += 1 + + # Check if any valid files remain + remaining_files = list(parquet_dir.glob("*.parquet")) + if not remaining_files: + logging.warning(f"No valid parquet files found for {comm_type} and {coll_type} (moved {invalid_count} invalid files)") + return None + + logging.info(f"Found {len(remaining_files)} valid parquet files (moved {invalid_count} invalid files)") + + try: + duckdb.execute( + f"CREATE OR REPLACE VIEW logs AS SELECT * FROM 
read_parquet('{parquet_dir}/*.parquet')" + ) + df = duckdb.execute(f""" + SELECT + id, + coll_sn, + coll_msg_size_bytes, + AVG(coll_busbw_gbs) as mean_coll_busbw_gbs, + COUNT(*) as log_count, + ARRAY_DISTINCT(LIST(n_ranks)) as n_ranks, + ARRAY_DISTINCT(LIST(nnodes)) as nnodes, + MIN(dump_timestamp_us) as coll_start_timestamp_us, + MAX(dump_timestamp_us) as coll_end_timestamp_us, + (MAX(dump_timestamp_us) - MIN(dump_timestamp_us)) as coll_duration_us + FROM logs + WHERE coll = '{coll_type}' and comm_type = '{comm_type}' + GROUP BY id, coll_sn, coll_msg_size_bytes + ORDER BY coll_sn + """).df() + except Exception as e: + logging.error(f"Error executing DuckDB query for {comm_type} and {coll_type}: {e}") + return None + + if df.empty: + logging.info(f"No data for {comm_type} and {coll_type}") + return None + + # Add human-readable formatting + df["human_readable_coll_msg_size_bytes"] = df["coll_msg_size_bytes"].apply( + bytes_to_human_readable + ) + + # Log example of time range data for first few rows + if len(df) > 0: + sample_row = df.iloc[0] + start_time = timestamp_to_datetime(sample_row['coll_start_timestamp_us']) + end_time = timestamp_to_datetime(sample_row['coll_end_timestamp_us']) + duration = microseconds_to_human_readable(sample_row['coll_duration_us']) + logging.info(f"Example time range - ID: {sample_row['id']}, Coll_SN: {sample_row['coll_sn']}, " + f"Start: {start_time}, End: {end_time}, Duration: {duration}") + + return df + + +def generate_visualizations(df, output_root, comm_type, coll_type): + """Generate all visualizations and save CSV files for the processed data""" + logging.info(f"Generating visualizations for {comm_type} and {coll_type}") + + summary_dir = output_root / "summary" + summary_dir.mkdir(parents=True, exist_ok=True) + + # Scatter Plot for all message sizes + output_file = summary_dir / f"scatter_plot_{comm_type}_{coll_type}.png" + generate_scatter_plot(df, comm_type, coll_type, output_file) + + # Combined Scatter Plot for all message sizes + output_file = summary_dir / f"combined_scatter_plot_{comm_type}_{coll_type}.png" + generate_combined_scatter_plot(df, comm_type, coll_type, output_file) + + distinct_msg_sizes = df["coll_msg_size_bytes"].unique() + for msg_size in distinct_msg_sizes: + hr_msg_size = bytes_to_human_readable(msg_size) + msg_size_dir = summary_dir / f"msg_size_{msg_size}_{hr_msg_size}" + msg_size_hist_dir = msg_size_dir / "histograms" + msg_size_boxplot_dir = msg_size_dir / "boxplots" + msg_size_dir.mkdir(parents=True, exist_ok=True) + msg_size_hist_dir.mkdir(parents=True, exist_ok=True) + msg_size_boxplot_dir.mkdir(parents=True, exist_ok=True) + + df_msg_size = df[df["coll_msg_size_bytes"] == msg_size] + + # Add human-readable time formatting + df_msg_size = df_msg_size.copy() + df_msg_size["coll_start_datetime"] = df_msg_size["coll_start_timestamp_us"].apply(timestamp_to_datetime) + df_msg_size["coll_end_datetime"] = df_msg_size["coll_end_timestamp_us"].apply(timestamp_to_datetime) + df_msg_size["coll_duration_human"] = df_msg_size["coll_duration_us"].apply(microseconds_to_human_readable) + + # Histogram + output_file = ( + msg_size_hist_dir / f"histogram_{comm_type}_{coll_type}_{msg_size}.png" + ) + generate_histogram( + df_msg_size, + comm_type, + coll_type, + output_file, + bytes_to_human_readable(msg_size), + ) + + # Box Plot + output_file = ( + msg_size_boxplot_dir / f"boxplot_{comm_type}_{coll_type}_{msg_size}.png" + ) + generate_boxplot( + df_msg_size, + comm_type, + coll_type, + output_file, + bytes_to_human_readable(msg_size), + 
) + + output_file = msg_size_dir / f"summary_{comm_type}_{coll_type}_{msg_size}.csv" + df_msg_size.to_csv(output_file, index=False) + logging.info( + f"Summary for {comm_type}, {coll_type}, and msg_size {msg_size} written to {output_file}" + ) + + +def generate_summary(output_root, comm_type, coll_type, output_dir_name): + """Generate summary by summarizing data per comm/coll type and creating visualizations""" + logging.info(f"Generating summary for {output_dir_name}, {comm_type} and {coll_type}") + + # Step 1: Summarize data per communication and collective type + df = summarize_data_per_comm_coll_type(output_root, comm_type, coll_type, output_dir_name) + + # Step 2: Generate visualizations if data exists + if df is not None: + generate_visualizations(df, output_root, comm_type, coll_type) + else: + logging.warning(f"No data found for {comm_type} and {coll_type} - skipping visualization generation") + + +def generate_summary_wrapper(args): + return generate_summary(*args) + + +if __name__ == "__main__": + files, output_dir_name = get_log_files_and_output_dir() + print(f"Number of log files found: {len(files)}") + print(f"Output directory: {output_dir_name}") + output_dir = Path(output_dir_name) + output_dir.mkdir(parents=True, exist_ok=True) + setup_logging(output_dir) + create_per_node_parquet_files(files, output_dir) + comm_types = ["single-rank", "nvlink-only", "hca-only", "mixed"] + coll_types = ["AllReduce", "AllGather", "ReduceScatter", "Broadcast"] + summary_args = [ + (output_dir, comm_type, coll_type, output_dir_name) + for comm_type in comm_types + for coll_type in coll_types + ] + max_workers = min(64, len(summary_args), os.cpu_count() or 1) + with ProcessPoolExecutor(max_workers=max_workers) as executor: + list( + tqdm( + executor.map(generate_summary_wrapper, summary_args), + total=len(summary_args), + desc="Generating summaries", + ) + ) + print("Done!") diff --git a/ext-profiler/inspector/exporter/example/requirements.txt b/ext-profiler/inspector/exporter/example/requirements.txt new file mode 100644 index 000000000..8a47aae51 --- /dev/null +++ b/ext-profiler/inspector/exporter/example/requirements.txt @@ -0,0 +1,6 @@ +pandas>=1.3.0 +tqdm>=4.60.0 +duckdb>=0.8.0 +matplotlib>=3.3.0 +pyarrow>=5.0.0 +numpy>=1.21.0 diff --git a/ext-profiler/inspector/inspector.cc b/ext-profiler/inspector/inspector.cc new file mode 100644 index 000000000..0cb9371d5 --- /dev/null +++ b/ext-profiler/inspector/inspector.cc @@ -0,0 +1,1530 @@ +#include "inspector.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#define JSON_CHK(expr) \ + do { \ + const jsonResult_t res = (expr); \ + if (res != jsonSuccess) { \ + INFO(NCCL_INSPECTOR, "jsonError: %s\n", jsonErrorString(res)); \ + return inspectorJsonError; \ + } \ + } while (0) + +#define INS_CHK(call) \ + do { \ + inspectorResult_t res = call; \ + if (inspectorSuccess != res) { \ + INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, res, \ + inspectorErrorString(res)); \ + return res; \ + } \ + } while (0); + +#define JSON_CHK_GOTO(expr, res, label) \ + do { \ + const jsonResult_t macro_res = (expr); \ + if (macro_res != jsonSuccess) { \ + INFO(NCCL_INSPECTOR, "jsonError: %s\n", jsonErrorString(macro_res)); \ + res = inspectorJsonError; \ + goto label; \ + } \ + } while (0) + +#define INS_CUDA_CHK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + INFO(NCCL_INSPECTOR, "Cuda failure '%s'", cudaGetErrorString(err)); \ + return 
inspectorCudaError; \ + } \ + } while (false) + + +// Global flag to control inspector use +static bool enableNcclInspector = false; +// Global flag to control starting internal dump thread +static bool enableNcclInspectorDumpThread = false; +// Global flag to control verbose dumping (event_trace) +static bool enableNcclInspectorDumpVerbose = false; +// Extra guard to prevent spurious messages for eager pollers that try to dump +// out results before we have initialized +static bool ncclInspectorInit = false; + +// Define the global logFn variable +ncclDebugLogger_t logFn = nullptr; + +/* + * Description: + * + * Returns the current time in microseconds since the epoch. + * + * Thread Safety: + * + * Thread-safe (uses gettimeofday). + * + * Input: + * + * None. + * + * Output: + * + * None. + * + * Return: + * uint64_t - current time in microseconds. + * + * Error Handling: + * This function uses gettimeofday() which rarely fails. In case of + * failure, the function returns 0. Callers should check for 0 return + * value if precise error handling is required. + * + */ +uint64_t inspectorGetTime() { + uint64_t ts = 0; + timeval tv; + + gettimeofday(&tv, 0); + ts = tv.tv_sec * 1000000 + tv.tv_usec; + return ts; +} + +/* + * Description: + * + * Converts a string to the corresponding ncclDataType_t enum value. + * + * Thread Safety: + * Thread-safe (read-only string input). + * + * Input: + * + * const char* str - string representation of the datatype. + * + * Output: + * + * None. + * + * Return: + * + * ncclDataType_t - corresponding enum value, or -1 if unknown. + * + */ +ncclDataType_t inspectorStringToDatatype(const char* str) { + if (strcmp(str, "ncclInt8") == 0) return ncclInt8; + if (strcmp(str, "ncclInt32") == 0) return ncclInt32; + if (strcmp(str, "ncclUint32") == 0) return ncclUint32; + if (strcmp(str, "ncclInt64") == 0) return ncclInt64; + if (strcmp(str, "ncclUint64") == 0) return ncclUint64; + if (strcmp(str, "ncclFloat16") == 0) return ncclFloat16; + if (strcmp(str, "ncclFloat32") == 0) return ncclFloat32; + if (strcmp(str, "ncclFloat64") == 0) return ncclFloat64; + if (strcmp(str, "ncclBfloat16") == 0) return ncclBfloat16; + if (strcmp(str, "ncclFloat8e4m3") == 0) return ncclFloat8e4m3; + if (strcmp(str, "ncclFloat8e5m2") == 0) return ncclFloat8e5m2; + return (ncclDataType_t)-1; // Or handle error as appropriate +} + +/* + * Description: + * + * Converts a string to the corresponding ncclFunc_t enum value. + * + * Thread Safety: + * Thread-safe (read-only string input). + * + * Input: + * const char* str - string representation of the function (must not be NULL). + * + * Output: + * None. + * + * Return: + * ncclFunc_t - corresponding enum value, or ncclNumFuncs if unknown. 
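+ *
+ * Example (illustrative):
+ *   ncclStringToFunc("AllReduce") -> ncclFuncAllReduce
+ *   ncclStringToFunc("bogus")     -> ncclNumFuncs (unknown string)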
+ * + * Preconditions: + * - str must not be NULL + */ +ncclFunc_t ncclStringToFunc(const char* str) { + if (strcmp(str, "AllGather") == 0) return ncclFuncAllGather; + if (strcmp(str, "AllReduce") == 0) return ncclFuncAllReduce; + if (strcmp(str, "Broadcast") == 0) return ncclFuncBroadcast; + if (strcmp(str, "Recv") == 0) return ncclFuncRecv; + if (strcmp(str, "Reduce") == 0) return ncclFuncReduce; + if (strcmp(str, "ReduceScatter") == 0) return ncclFuncReduceScatter; + if (strcmp(str, "SendRecv") == 0) return ncclFuncSendRecv; + if (strcmp(str, "Send") == 0) return ncclFuncSend; + return ncclNumFuncs; // Invalid / unknown +} + +const char* ncclFuncToString(ncclFunc_t fn) { + switch (fn) { + case ncclFuncAllGather: return "AllGather"; + case ncclFuncAllReduce: return "AllReduce"; + case ncclFuncBroadcast: return "Broadcast"; + case ncclFuncRecv: return "Recv"; + case ncclFuncReduce: return "Reduce"; + case ncclFuncReduceScatter: return "ReduceScatter"; + case ncclFuncSendRecv: return "SendRecv"; + case ncclFuncSend: return "Send"; + default: return "Invalid"; + } +} + +struct inspectorDumpThread; +static inspectorDumpThread* dumper = nullptr; + +#define UNUSED(x) (void)(x) + +inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_init(lockRef, nullptr)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_destroy(lockRef)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_rdlock(lockRef)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_wrlock(lockRef)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef) { + if (0 != pthread_rwlock_unlock(lockRef)) { + return inspectorLockError; + } else { + return inspectorSuccess; + } +} + +// TODO inspect these retvals +#define INSPECTOR_LOCK_RD_FLAG(lockRef, lockFlag, debug) \ + do { \ + if (!lockFlag) { \ + INS_CHK(inspectorLockRd(lockRef)); \ + } \ + lockFlag = true; \ + } while (0); + +#define INSPECTOR_LOCK_WR_FLAG(lockRef, lockFlag, debug) \ + do { \ + if (!lockFlag) { \ + INS_CHK(inspectorLockWr(lockRef)); \ + } \ + lockFlag = true; \ + } while (0); + +#define INSPECTOR_UNLOCK_RW_LOCK_FLAG(lockRef, lockFlag, debug) \ + do { \ + if (lockFlag) { \ + INS_CHK(inspectorUnlockRWLock(lockRef)); \ + } \ + lockFlag = false; \ + } while (0); + +struct inspectorCommInfoList { + struct inspectorCommInfo* comms; + uint32_t ncomms; + pthread_rwlock_t guard; +}; + +struct inspectorState { + struct inspectorCommInfoList liveComms; + struct inspectorCommInfoList deletedComms; +}; + + +static inspectorState g_state; + +static inspectorResult_t inspectorCommInfoListInit(struct inspectorCommInfoList* commList) { + if (commList->comms) { + return inspectorGlobalInitError; + } + commList->comms = nullptr; + commList->ncomms = 0; + INS_CHK(inspectorLockInit(&commList->guard)); + return inspectorSuccess; +} + +static inspectorResult_t inspectorGlobalStateInit() { + memset(&g_state, 0, sizeof(struct inspectorState)); + INS_CHK(inspectorCommInfoListInit(&g_state.liveComms)); + INS_CHK(inspectorCommInfoListInit(&g_state.deletedComms)); + return inspectorSuccess; +} + +/* + * Description: + * + * 
Converts inspectorTimingSource_t enum to a string representation. + * + * Thread Safety: + * Thread-safe (read-only operation). + * + * Input: + * inspectorTimingSource_t timingSource - timing source enum value. + * + * Output: + * None. + * + * Return: + * const char* - string representation of the timing source. + */ +static const char* inspectorTimingSourceToString(inspectorTimingSource_t timingSource) { + switch (timingSource) { + case inspectorTimingSourceKernelGpu: + return "kernel_gpu"; + case inspectorTimingSourceKernelCpu: + return "kernel_cpu"; + case inspectorTimingSourceCollectiveCpu: + return "collective_cpu"; + default: + return "unknown"; + } +} + +/* + * Description: + * + * Writes the header information for a communicator to the JSON output. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * struct inspectorCommInfo* commInfo - communicator info. + * + * Output: + * Header is written to JSON output. + * + * Return: + * inspectorResult_t - success or error code. + * + */ +static inspectorResult_t inspectorCommInfoHeader(jsonFileOutput* jfo, + struct inspectorCommInfo* commInfo) { + JSON_CHK(jsonStartObject(jfo)); + JSON_CHK(jsonKey(jfo, "id")); JSON_CHK(jsonStr(jfo, commInfo->commHashStr)); + JSON_CHK(jsonKey(jfo, "rank")); JSON_CHK(jsonInt(jfo, commInfo->rank)); + JSON_CHK(jsonKey(jfo, "n_ranks")); JSON_CHK(jsonInt(jfo, commInfo->nranks)); + JSON_CHK(jsonKey(jfo, "nnodes")); JSON_CHK(jsonUint64(jfo, commInfo->nnodes)); + JSON_CHK(jsonFinishObject(jfo)); + return inspectorSuccess; +} + +/* + * Description: + * + * Writes metadata header information to the JSON output. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * + * Output: + * Metadata header is written to JSON output. + * + * Return: + * inspectorResult_t - success or error code. + * + */ +static inspectorResult_t inspectorCommInfoMetaHeader(jsonFileOutput* jfo) { + JSON_CHK(jsonStartObject(jfo)); + { + JSON_CHK(jsonKey(jfo, "inspector_output_format_version")); JSON_CHK(jsonStr(jfo, "v4.0")); + JSON_CHK(jsonKey(jfo, "git_rev")); JSON_CHK(jsonStr(jfo, get_git_version_info())); + JSON_CHK(jsonKey(jfo, "rec_mechanism")); JSON_CHK(jsonStr(jfo, "nccl_profiler_interface")); + JSON_CHK(jsonKey(jfo, "dump_timestamp_us")); JSON_CHK(jsonUint64(jfo, inspectorGetTime())); + char hostname[256]; + gethostname(hostname, 255); + JSON_CHK(jsonKey(jfo, "hostname")); JSON_CHK(jsonStr(jfo, hostname)); + JSON_CHK(jsonKey(jfo, "pid")); JSON_CHK(jsonUint64(jfo, getpid())); + } + JSON_CHK(jsonFinishObject(jfo)); + return inspectorSuccess; +} + +/* + * Description: + * + * Writes verbose information (event_trace) for a completed + * collective operation to the JSON output. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * const struct inspectorCompletedCollInfo* collInfo - completed + * collective info. + * + * Output: + * Verbose collective info is written to JSON output. + * + * Return: + * inspectorResult_t - success or error code. 
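+ *
+ * Output sketch (sequence numbers and channel count are illustrative):
+ *   "event_trace_sn": {
+ *     "coll_start_sn": 1, "coll_stop_sn": 4,
+ *     "kernel_events": [{"channel_id": 0, "kernel_start_sn": 2,
+ *                        "kernel_stop_sn": 3, "kernel_record_sn": 5}]}
+ *   followed by a matching "event_trace_ts" object holding timestamps.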
+ * + */ +static inline inspectorResult_t inspectorCompletedCollVerbose(jsonFileOutput* jfo, + struct inspectorCompletedCollInfo* collInfo) { + // Add event trace information + JSON_CHK(jsonKey(jfo, "event_trace_sn")); + JSON_CHK(jsonStartObject(jfo)); + { + // Collective events + JSON_CHK(jsonKey(jfo, "coll_start_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.evntTrace[NCCL_INSP_EVT_TRK_COLL_START].sn)); + JSON_CHK(jsonKey(jfo, "coll_stop_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.evntTrace[NCCL_INSP_EVT_TRK_COLL_STOP].sn)); + + // Kernel events + JSON_CHK(jsonKey(jfo, "kernel_events")); + JSON_CHK(jsonStartList(jfo)); + for (uint32_t ch = 0; ch < collInfo->collEvtTrk.nChannels; ch++) { + JSON_CHK(jsonStartObject(jfo)); + JSON_CHK(jsonKey(jfo, "channel_id")); JSON_CHK(jsonInt(jfo, ch)); + JSON_CHK(jsonKey(jfo, "kernel_start_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_START].sn)); + JSON_CHK(jsonKey(jfo, "kernel_stop_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_STOP].sn)); + JSON_CHK(jsonKey(jfo, "kernel_record_sn")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_RECORD].sn)); + JSON_CHK(jsonFinishObject(jfo)); + } + JSON_CHK(jsonFinishList(jfo)); + } + JSON_CHK(jsonFinishObject(jfo)); + + JSON_CHK(jsonKey(jfo, "event_trace_ts")); + JSON_CHK(jsonStartObject(jfo)); + { + // Collective events + JSON_CHK(jsonKey(jfo, "coll_start_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.evntTrace[NCCL_INSP_EVT_TRK_COLL_START].ts)); + JSON_CHK(jsonKey(jfo, "coll_stop_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.evntTrace[NCCL_INSP_EVT_TRK_COLL_STOP].ts)); + + // Kernel events + JSON_CHK(jsonKey(jfo, "kernel_events")); + JSON_CHK(jsonStartList(jfo)); + for (uint32_t ch = 0; ch < collInfo->collEvtTrk.nChannels; ch++) { + JSON_CHK(jsonStartObject(jfo)); + JSON_CHK(jsonKey(jfo, "channel_id")); JSON_CHK(jsonInt(jfo, ch)); + JSON_CHK(jsonKey(jfo, "kernel_start_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_START].ts)); + JSON_CHK(jsonKey(jfo, "kernel_stop_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_STOP].ts)); + JSON_CHK(jsonKey(jfo, "kernel_record_ts")); JSON_CHK(jsonUint64(jfo, collInfo->collEvtTrk.kernelCh[ch].evntTrace[NCCL_INSP_EVT_TRK_KERNEL_RECORD].ts)); + JSON_CHK(jsonFinishObject(jfo)); + } + JSON_CHK(jsonFinishList(jfo)); + } + JSON_CHK(jsonFinishObject(jfo)); + + return inspectorSuccess; +} + +/* + * Description: + * + * Writes completed collective operation information to the JSON + * output. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * const struct inspectorCompletedCollInfo* collInfo - completed + * collective info. + * + * Output: + * Collective info is written to JSON output. + * + * Return: + * inspectorResult_t - success or error code. 
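+ *
+ * Output sketch (values are illustrative, assuming an 8-rank AllReduce):
+ *   {"coll": "AllReduce", "coll_sn": 42, "coll_msg_size_bytes": 1048576,
+ *    "coll_exec_time_us": 120, "coll_timing_source": "kernel_gpu",
+ *    "coll_algobw_gbs": 8.74, "coll_busbw_gbs": 15.29}
+ *   where busbw = algobw * 2*(nranks-1)/nranks for AllReduce.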
+ * + */ +static inline inspectorResult_t inspectorCompletedColl(jsonFileOutput* jfo, + struct inspectorCompletedCollInfo* collInfo) { + JSON_CHK(jsonStartObject(jfo)); + { + + JSON_CHK(jsonKey(jfo, "coll")); JSON_CHK(jsonStr(jfo, ncclFuncToString(collInfo->func))); + + JSON_CHK(jsonKey(jfo, "coll_sn")); JSON_CHK(jsonUint64(jfo, collInfo->sn)); + + JSON_CHK(jsonKey(jfo, "coll_msg_size_bytes")); JSON_CHK(jsonUint64(jfo, collInfo->msgSizeBytes)); + + JSON_CHK(jsonKey(jfo, "coll_exec_time_us")); JSON_CHK(jsonUint64(jfo, collInfo->execTimeUsecs)); + + JSON_CHK(jsonKey(jfo, "coll_timing_source")); JSON_CHK(jsonStr(jfo, inspectorTimingSourceToString(collInfo->timingSource))); + + JSON_CHK(jsonKey(jfo, "coll_algobw_gbs")); JSON_CHK(jsonDouble(jfo, collInfo->algoBwGbs)); + + JSON_CHK(jsonKey(jfo, "coll_busbw_gbs")); JSON_CHK(jsonDouble(jfo, collInfo->busBwGbs)); + + if (enableNcclInspectorDumpVerbose) { + INS_CHK(inspectorCompletedCollVerbose(jfo, collInfo)); + } + } + JSON_CHK(jsonFinishObject(jfo)); + + return inspectorSuccess; +} + + +/* + * Description: + * + * Dumps the state of a communicator to the JSON output if needed. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * jsonFileOutput* jfo - JSON output handle. + * inspectorCommInfo* commInfo - communicator info. + * bool* needs_writing - set to true if output was written. + * + * Output: + * State is dumped to JSON output if needed. + * + * Return: + * inspectorResult_t - success or error code. + * + */ +static inspectorResult_t inspectorCommInfoDump(jsonFileOutput* jfo, + inspectorCommInfo* commInfo, + bool* needs_writing) { + *needs_writing = false; + + if (commInfo == nullptr) + return inspectorSuccess; + + struct inspectorCompletedCollInfo collInfo; + memset(&collInfo, 0, sizeof(struct inspectorCompletedCollInfo)); + + inspectorLockWr(&commInfo->guard); + if (commInfo->dump) { + *needs_writing = true; + memcpy(&collInfo, + &commInfo->completedCollInfo, + sizeof(struct inspectorCompletedCollInfo)); + commInfo->dump = false; + } + inspectorUnlockRWLock(&commInfo->guard); + + if (*needs_writing) { + JSON_CHK(jsonLockOutput(jfo)); + JSON_CHK(jsonStartObject(jfo)); + { + JSON_CHK(jsonKey(jfo, "header")); + inspectorCommInfoHeader(jfo, commInfo); + + JSON_CHK(jsonKey(jfo, "metadata")); + inspectorCommInfoMetaHeader(jfo); + + JSON_CHK(jsonKey(jfo, "coll_perf")); + INS_CHK(inspectorCompletedColl(jfo, &collInfo)); + } + JSON_CHK(jsonFinishObject(jfo)); + JSON_CHK(jsonNewline(jfo)); + JSON_CHK(jsonUnlockOutput(jfo)); + } + return inspectorSuccess; +} + + +/* + * Description: + * + * Dumps the state of all communicators in a commList to the JSON + * output. + * + * Thread Safety: + * Thread-safe - assumes no locks are taken and acquires all necessary + * locks to iterate through all communicator objects and dump their state. + * + * Input: + * jsonFileOutput* jfo - JSON output handle (must not be NULL). + * struct inspectorCommInfoList* commList - list of communicators (must not be NULL). + * + * Output: + * State of all communicators is dumped to JSON output. + * + * Return: + * inspectorResult_t - success or error code. 
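+ *
+ * Each communicator with a pending sample produces one JSON line of the
+ * form (fields abridged):
+ *   {"header": {...}, "metadata": {...}, "coll_perf": {...}}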
+ * + */ +static inspectorResult_t inspectorCommInfoListDump(jsonFileOutput* jfo, + struct inspectorCommInfoList* commList) { + bool flush = false; + INS_CHK(inspectorLockRd(&commList->guard)); + inspectorResult_t res = inspectorSuccess; + if (commList->ncomms > 0) { + for (struct inspectorCommInfo* itr = commList->comms; + itr != nullptr; + itr = itr->next) { + bool needs_writing; + INS_CHK_GOTO(inspectorCommInfoDump(jfo, itr, &needs_writing), res, finalize); + if (needs_writing) { + flush = true; + } + } + if (flush) { + JSON_CHK_GOTO(jsonLockOutput(jfo), res, finalize); + JSON_CHK_GOTO(jsonFlushOutput(jfo), res, finalize); + JSON_CHK_GOTO(jsonUnlockOutput(jfo), res, finalize); + } + } +finalize: + INS_CHK(inspectorUnlockRWLock(&commList->guard)); + return res; +} + +/* + * Description: + * Finalizes and cleans up a commList, freeing all communicators. + * + * Thread Safety: + * Not thread-safe (should be called with proper locking). + * + * Input: + * struct commList* commList - list of communicators. + * + * Output: + * All communicators are freed. + * + * Return: + * inspectorResult_t - success or error code. + * + */ +static inspectorResult_t inspectorCommInfoListFinalize(struct inspectorCommInfoList* commList) { + struct inspectorCommInfo* nextComm = nullptr; + INS_CHK(inspectorLockWr(&commList->guard)); + while (commList->comms != nullptr && commList->ncomms != 0) { + INFO(NCCL_INSPECTOR, "NCCL Inspector: comm %lu still in tracker", + commList->comms->commHash); + nextComm = commList->comms->next; + INS_CHK(inspectorLockDestroy(&commList->comms->guard)); + free(commList->comms); + commList->comms = nextComm; + commList->ncomms--; + } + INS_CHK(inspectorUnlockRWLock(&commList->guard)); + return inspectorSuccess; +} + +/* + * Description: + * + * Ensures the given directory exists and is writable, creating it + * if necessary. + * + * Thread Safety: + * Not thread-safe (should be called during initialization). + * + * Input: + * char* workdir - directory path. + * + * Output: + * Directory is created if needed. + * + * Return: + * + * bool - true if directory exists and is writable, false otherwise. + * + */ +static bool ensureDir(char* workdir) { + struct stat st; + + // Check if directory exists + if (stat(workdir, &st) == 0) { + if (S_ISDIR(st.st_mode)) { + // Directory exists, check if it's writable + if (access(workdir, W_OK) == 0) { + return true; // Directory exists and is writable + } else { + INFO(NCCL_INSPECTOR, + "NCCL Inspectoer: dump directory %s exists, but is not " + "writable", + workdir); + return false; + } + } else { + INFO(NCCL_INSPECTOR, + "NCCL Inspector: dump location %s exists, but is not a " + "directory", + workdir); + return false; + } + } else { + // Directory doesn't exist, try to create it + const mode_t mode = 0777; + if (mkdir(workdir, mode) == 0) { + return true; // Directory created successfully + } else { + INFO(NCCL_INSPECTOR, + "NCCL Inspector: failed to create dump directory %s: %s", workdir, + strerror(errno)); + return false; + } + } +} + +/* + * Description: + * + * Generates the output dump directory path based on environment + * variables. + * + * Thread Safety: + * Not thread-safe (should be called during initialization). + * + * Input: + * char** workdir - pointer to output directory string. + * + * Output: + * workdir is set to the generated directory path. + * + * Return: + * None. 
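+ *
+ * Examples (illustrative values):
+ *   NCCL_INSPECTOR_DUMP_DIR=/tmp/inspector -> "/tmp/inspector"
+ *   SLURM_JOBID=12345, no dump dir set     -> "nccl-inspector-12345"
+ *   neither variable set                   -> "nccl-inspector-unknown-jobid"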
+ */ +static void genDumpDir(char** workdir) { + char* dumpdir = getenv("NCCL_INSPECTOR_DUMP_DIR"); + if (dumpdir != NULL) { + *workdir = strdup(dumpdir); + // TODO check errors here + return; + } + + char* jobid = getenv("SLURM_JOBID"); + bool badJobId = true; + if (jobid != NULL) { + errno = 0; + const int intid = strtol(jobid, NULL, 10); + if (errno == 0) { + char tmp[2048]; + snprintf(tmp, 2048, "nccl-inspector-%d", intid); + *workdir = strdup(tmp); + badJobId = false; + } + } + + if (badJobId) { + *workdir = strdup("nccl-inspector-unknown-jobid"); + } +} + +struct inspectorDumpThread { + bool run{false}; + jsonFileOutput* jfo; + char* outputRoot; + uint64_t sampleIntervalUsecs; + pthread_t pthread; + pthread_rwlock_t guard; + + inspectorDumpThread(const char* outputRoot, uint64_t sampleIntervalUsecs) + : jfo(nullptr), outputRoot(strdup(outputRoot)), sampleIntervalUsecs(sampleIntervalUsecs) { + if (inspectorLockInit(&guard) != inspectorSuccess) { + INFO(NCCL_INSPECTOR, "NCCL Inspector inspectorDumpThread: couldn't init lock"); + } + } + + ~inspectorDumpThread() { + if (jfo != nullptr) { + jsonFinalizeFileOutput(jfo); + jfo = nullptr; + } + if (outputRoot != nullptr) { + free(outputRoot); + outputRoot = nullptr; + } + if (inspectorLockDestroy(&guard) != inspectorSuccess) { + INFO(NCCL_INSPECTOR, "NCCL Inspector inspectorDumpThread: couldn't destroy lock"); + } + } + + void startThread() { + inspectorLockWr(&guard); + run = true; + inspectorUnlockRWLock(&guard); + if (pthread_create(&pthread, NULL, dumpMain, this) != 0) { + INFO(NCCL_INSPECTOR, + "NCCL Inspector inspectorDumpThread: couldn't create dump thread!"); + return; + } + INFO(NCCL_INSPECTOR, "NCCL Inspector inspectorDumpThread: created"); + } + + void stopThread() { + INFO(NCCL_ENV, "NCCL Inspector Stopping Dump thread"); + inspectorLockWr(&guard); + run = false; + inspectorUnlockRWLock(&guard); + struct timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = 1000000; // 1ms + nanosleep(&ts, NULL); + INFO(NCCL_INSPECTOR, "NCCL Inspector inspectorDumpThread: stopped"); + } + + inspectorResult_t inspectorStateDump(const char* output_root) { + if (!ncclInspectorInit) { + return inspectorUninitializedError; + } + if (!enableNcclInspector) { + INFO(NCCL_INSPECTOR, "NCCL Inspector is not enabled, will not do ncclAllCommTallyDump"); + return inspectorDisabledError; + } + + if (jfo == 0) { + char hostname[256]; + gethostname(hostname, 255); + char tmp[2048]; + snprintf(tmp, sizeof(tmp), "%s/%s-pid%d.log", output_root, hostname, getpid()); + jsonResult_t result = jsonInitFileOutput(&jfo, tmp); + if (jsonSuccess != result) { + INFO(NCCL_INSPECTOR, "Cannot open %s for writing: %s", tmp, jsonErrorString(result)); + return inspectorFileOpenError; + } + chmod(tmp, 0666); + } + + if (jfo != nullptr) { + inspectorCommInfoListDump(jfo, &g_state.liveComms); + inspectorCommInfoListDump(jfo, &g_state.deletedComms); + } + + if (g_state.deletedComms.ncomms > 0) { + inspectorCommInfoListFinalize(&g_state.deletedComms); + } + return inspectorSuccess; + } + + static void* dumpMain(void* arg) { + inspectorDumpThread* dumper = (inspectorDumpThread*)arg; + inspectorResult_t res = inspectorSuccess; + struct timespec ts; + ts.tv_sec = dumper->sampleIntervalUsecs / 1000000; + ts.tv_nsec = dumper->sampleIntervalUsecs % 1000000; + + while (dumper->run) { + inspectorLockWr(&dumper->guard); + if (!dumper->run) { + inspectorUnlockRWLock(&dumper->guard); + break; + } + res = dumper->inspectorStateDump(dumper->outputRoot); + if (res == inspectorFileOpenError || res == 
inspectorDisabledError) { + inspectorUnlockRWLock(&dumper->guard); + break; + } + inspectorUnlockRWLock(&dumper->guard); + + nanosleep(&ts, NULL); + } + + return 0; + } +}; + +/* + * Description: + * + * Shows the NCCL Inspector plugin version and configuration + * environment variables in a structured format similar to NCCL's + * showVersion function. + * + * Thread Safety: + * Thread-safe (read-only environment variable access). + * + * Input: + * None. + * + * Output: + * Logs version and environment variables to debug output. + * + * Return: + * None. + */ +static void showInspectorVersion() { + VERSION("NCCL Inspector Plugin - Version: %s", get_git_version_info()); +} + +/* + * Description: + * + * Shows all NCCL Inspector environment variables and their values + * in a structured format. + * + * Thread Safety: + * Thread-safe (read-only environment variable access). + * + * Input: + * None. + * + * Output: + * Logs environment variables to debug output. + * + * Return: + * None. + */ +static void showInspectorEnvVars() { + struct { + const char* name; + const char* value; + const char* defaultVal; + const char* description; + } envVars[] = { + {"NCCL_INSPECTOR_ENABLE", getenv("NCCL_INSPECTOR_ENABLE"), "0", "Enable/disable inspector plugin"}, + {"NCCL_INSPECTOR_DUMP_THREAD_ENABLE", getenv("NCCL_INSPECTOR_DUMP_THREAD_ENABLE"), "1", "Enable/disable dump thread"}, + {"NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS", getenv("NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS"), "0", "Dump thread interval in microseconds"}, + {"NCCL_INSPECTOR_DUMP_DIR", getenv("NCCL_INSPECTOR_DUMP_DIR"), "(auto-generated)", "Output directory for inspector logs"}, + {"NCCL_INSPECTOR_DUMP_VERBOSE", getenv("NCCL_INSPECTOR_DUMP_VERBOSE"), "0", "Enable/disable verbose dumping (event_trace)"} + }; + + const int numEnvVars = sizeof(envVars) / sizeof(envVars[0]); + + VERSION("NCCL Inspector Environment Variables:"); + for (int i = 0; i < numEnvVars; i++) { + VERSION(" %s = %s%s%s", + envVars[i].name, + envVars[i].value ? envVars[i].value : "(not set)", + envVars[i].value ? "" : ", default=", + envVars[i].value ? "" : envVars[i].defaultVal); + } +} + +/* + * Description: + * + * Initializes the global inspector state and starts the dump thread + * if enabled. + * + * Thread Safety: + * + * Not thread-safe (should be called during initialization). + * + * Input: + * None. + * + * Output: + * Global state is initialized and dump thread may be started. + * + * Return: + * inspectorResult_t - success or error code. + */ +inspectorResult_t inspectorGlobalInit(int rank) { + char* str = getenv("NCCL_INSPECTOR_ENABLE"); + int enable = str ? atoi(str) : 0; // default disable + enableNcclInspector = enable == 0 ? false : true; + ncclInspectorInit = true; + + // Show version and environment configuration (similar to NCCL's showVersion) + if (rank == 0) { + showInspectorVersion(); + showInspectorEnvVars(); + } + + if (enableNcclInspector == false) { + VERSION("NCCL Inspector Plugin DISABLED (NCCL_INSPECTOR_ENABLE=%s)", + str ? str : "0"); + return inspectorDisabledError; + } + + INS_CHK(inspectorGlobalStateInit()); + + str = getenv("NCCL_INSPECTOR_DUMP_THREAD_ENABLE"); + enable = str ? atoi(str) : 1; // default enable + enableNcclInspectorDumpThread = enable == 0 ? false : true; + + str = getenv("NCCL_INSPECTOR_DUMP_VERBOSE"); + enable = str ? atoi(str) : 0; // default disable + enableNcclInspectorDumpVerbose = enable == 0 ? 
false : true; + + if (enableNcclInspectorDumpThread) { + str = getenv("NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS"); + const uint64_t interval = str ? strtoull(str, 0, 0) : 0; + + if (interval == 0) { + INFO(NCCL_INSPECTOR, "NCCL Inspector: dump thread enabled but " + "NCCL_INSPECTOR_DUMP_THREAD_INTERVAL_MICROSECONDS is 0; not " + "starting internal dump " + "thread."); + return inspectorSuccess; + } + + char* dumpdir; + genDumpDir(&dumpdir); + + if (dumpdir != nullptr) { + if (!ensureDir(dumpdir)) { + free(dumpdir); + INFO(NCCL_INSPECTOR, "NCCL Inspector: failed to generate a dump dir; not " + "starting internal dump thread."); + return inspectorSuccess; + } + + dumper = new inspectorDumpThread(dumpdir, interval); + dumper->startThread(); + + INFO(NCCL_INSPECTOR, + "NCCL Inspector enabled with polling interval %lu us and " + "output directory %s", + interval, dumpdir); + free(dumpdir); + } else { + INFO(NCCL_INSPECTOR, "NCCL Inspector: failed to generate a dump " + "dir; not starting internal dump thread."); + } + } else { + INFO(NCCL_INSPECTOR, + "NCCL Inspector: NCCL_INSPECTOR_DUMP_THREAD_ENABLE set to 0; not " + "starting internal dump " + "thread."); + } + return inspectorSuccess; +} + +/* + * Description: + * + * Returns a string describing the given inspectorResult_t error + * code. + * + * Thread Safety: + * Thread-safe (read-only operation). + * + * Input: + * inspectorResult_t result - error code. + * + * Output: + * None. + * + * Return: + * const char* - error string. + */ +const char* inspectorErrorString(inspectorResult_t result) { + switch (result) { + case inspectorSuccess: + return "Success"; + case inspectorUninitializedError: + return "Inspector is not initialized"; + case inspectorMemoryError: + return "Inspector encountered issue allocating memory"; + case inspectorFileOpenError: + return "Inspector could not open file"; + case inspectorDisabledError: + return "Inspector is disabled"; + case inspectorLockError: + return "Inspector encountered error with lock"; + case inspectorPthreadError: + return "Inspector encountered error with pthreads"; + case inspectorJsonError: + return "Inspector encountered error while emitting JSON"; + case inspectorCudaError: + return "Inspector encountered CUDA error"; + case inspectorBadHash: + return "Inspector encountered bad communicator hash"; + case inspectorDeleteUnknownCommError: + return "Inspector was asked to delete a communicator that it is not " + "tracking"; + case inspectorAddDuplicateCommError: + return "Inspector was asked to add a communicator it was already " + "tracking"; + case inspectorNop: + return "Inspector NOP"; + case inspectorNullTally: + return "Inspector encountered a null OpTally"; + case inspectorGlobalInitError: + return "Inspector encountered a repeated global init"; + case inspectorReturn: + return "Inspector Unconditional Return"; + default: + return "Unknown error"; + } +} + +/* + * Description: + * Converts a communicator hash to a string. + * + * Thread Safety: + * Thread-safe (writes to provided buffer). + * + * Input: + * uint64_t commHash - communicator hash. + * char hashStr[NCCL_COMM_HASH_LENGTH] - output buffer. + * + * Output: + * hashStr is set to the string representation of commHash. + * + * Return: + * inspectorResult_t - success or error code. 
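+ *
+ * Example (illustrative value):
+ *   commHash 0x1a2b3c4d -> hashStr "0x1a2b3c4d"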
+ */ +inspectorResult_t inspectorCommGetHashStr(uint64_t commHash, + char hashStr[NCCL_COMM_HASH_LENGTH]) { + snprintf(hashStr, NCCL_COMM_HASH_LENGTH, "0x%lx", + commHash); + return inspectorSuccess; +} + +/* + * Description: + * Compares two communicator configurations for equality. + * + * Thread Safety: + * Thread-safe (read-only comparison). + * + * Input: + * uint64_t lCommHash - left communicator hash. + * uint64_t rCommHash - right communicator hash. + * int lRank - left rank. + * int rRank - right rank. + * + * Output: + * None. + * + * Return: + * bool - true if communicators are equal (same hash and rank), false otherwise. + */ +static bool comm_eq(uint64_t lCommHash, uint64_t rCommHash, + int lRank, int rRank) { + return lCommHash == rCommHash && lRank == rRank; +} + +/* + * Description: + * Initializes a communicator info structure with the provided parameters. + * + * Thread Safety: + * Not thread-safe - should be called during communicator initialization. + * + * Input: + * struct inspectorCommInfo* commInfo - communicator info structure to initialize (must not be NULL). + * const char* commName - communicator name (can be NULL). + * uint64_t commHash - communicator hash. + * int nnodes - number of nodes (must be > 0). + * int nranks - number of ranks (must be > 0). + * int rank - rank (must be >= 0 and < nranks). + * + * Output: + * commInfo is initialized with the provided parameters. + * + * Return: + * inspectorResult_t - success or error code. + * + * Preconditions: + * - commInfo must not be NULL + * - nnodes must be positive + * - nranks must be positive + * - rank must be non-negative and less than nranks + */ +static inspectorResult_t inspectorFillCommInfo(struct inspectorCommInfo* commInfo, + const char* commName, uint64_t commHash, + int nnodes, int nranks, int rank) { + commInfo->commName = commName; + commInfo->commHash = commHash; + inspectorCommGetHashStr(commHash, commInfo->commHashStr); + commInfo->rank = rank; + commInfo->nranks = nranks; + commInfo->nnodes = nnodes; + commInfo->dump = false; + INS_CHK(inspectorLockInit(&commInfo->guard)); + commInfo->next = nullptr; + return inspectorSuccess; +} + +/* + * Description: + * Adds a communicator to the global state. + * + * Thread Safety: + * Thread-safe (uses locks internally). + * + * Input: + * struct inspectorCommInfo **commInfo - pointer to output struct (must not be NULL). + * const char* commName - communicator name (can be NULL). + * uint64_t commHash - communicator hash. + * int nNodes - number of nodes (must be > 0). + * int nranks - number of ranks (must be > 0). + * int rank - rank (must be >= 0 and < nranks). + * + * Output: + * commInfo is set to the new communicator struct. + * + * Return: + * inspectorResult_t - success or error code. 
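+ *
+ * Usage sketch (argument values are illustrative):
+ *   struct inspectorCommInfo* commInfo = nullptr;
+ *   INS_CHK(inspectorAddComm(&commInfo, commName, commHash,
+ *                            nNodes, nranks, rank));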
+ * + * Preconditions: + * - commInfo must not be NULL + * - nNodes must be positive + * - nranks must be positive + * - rank must be non-negative and less than nranks + */ +inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo, + const char* commName, uint64_t commHash, + int nNodes, int nranks, int rank) { + struct inspectorCommInfoList* liveCommInfoList = &g_state.liveComms; + struct inspectorCommInfo* commInfoPtr = nullptr; + + inspectorResult_t res = inspectorSuccess; + bool locked = false; + INSPECTOR_LOCK_RD_FLAG(&liveCommInfoList->guard, locked, + "inspectorAddComm: commList::guard -rd"); + for (struct inspectorCommInfo* itr = liveCommInfoList->comms; + itr != nullptr; + itr = itr->next) { + if (comm_eq(commHash, itr->commHash, rank, itr->rank)) { + INFO(NCCL_INSPECTOR, "NCCL Inspector: comm 0x%lx already in tracker", + commHash); + res = inspectorAddDuplicateCommError; + goto finalize; + } + } + INSPECTOR_UNLOCK_RW_LOCK_FLAG(&liveCommInfoList->guard, locked, + "inspectorAddComm: commList::guard"); + commInfoPtr + = (struct inspectorCommInfo*)calloc(1, sizeof(struct inspectorCommInfo)); + if (0 == commInfoPtr) { + res = inspectorMemoryError; + goto finalize; + } + INS_CHK_GOTO(inspectorFillCommInfo(commInfoPtr, + commName, + commHash, + nNodes, + nranks, + rank), + res, fail); + + INSPECTOR_LOCK_WR_FLAG(&liveCommInfoList->guard, locked, + "inspectorAddComm: commList::guard -wr"); + ++liveCommInfoList->ncomms; + commInfoPtr->next = liveCommInfoList->comms; + liveCommInfoList->comms = commInfoPtr; + +finalize: + INSPECTOR_UNLOCK_RW_LOCK_FLAG(&liveCommInfoList->guard, locked, + "inspectorAddComm: commList::guard"); + *commInfo = commInfoPtr; + return res; +fail: + if (commInfoPtr) { + free(commInfoPtr); + commInfoPtr = nullptr; + } + goto finalize; +} + +/* + * Description: + * + * Removes a communicator from the global state and moves it to the + * deleted list. + * + * Thread Safety: + * Thread-safe (uses locks internally). + * + * Input: + * struct inspectorCommInfo *commInfo - communicator to remove. + * + * Output: + * Communicator is removed from live list and added to deleted list. + * + * Return: + * inspectorResult_t - success or error code. 
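+ *
+ * Usage sketch:
+ *   inspectorDelComm(commInfo);  // commInfo previously returned by inspectorAddComm()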
+ */ +inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo) { + struct inspectorCommInfoList* liveCommInfoList = &g_state.liveComms; + struct inspectorCommInfoList* deletedCommInfoList = &g_state.deletedComms; + struct inspectorCommInfo* commInfoPtr = nullptr; + bool locked = false; + + INFO(NCCL_INSPECTOR, "NCCL Inspector: DelComm removing 0x%lx", + commInfo->commHash); + + INSPECTOR_LOCK_WR_FLAG(&liveCommInfoList->guard, locked, + "inspectorDelComm: liveCommInfoList::guard -wr"); + struct inspectorCommInfo** prev_ptr = &liveCommInfoList->comms; + for (struct inspectorCommInfo* itr = liveCommInfoList->comms; + itr != nullptr; + itr = itr->next) { + if (comm_eq(commInfo->commHash, itr->commHash, commInfo->rank, itr->rank)) { + *prev_ptr = itr->next; + liveCommInfoList->ncomms--; + + commInfoPtr = itr; + break; + } + prev_ptr = &itr->next; + } + INSPECTOR_UNLOCK_RW_LOCK_FLAG(&liveCommInfoList->guard, locked, + "inspectorDelComm: liveCommInfoList::guard -unlock"); + + if (!commInfoPtr) { + INFO(NCCL_INSPECTOR, "NCCL Inspector: DelComm can't remove 0x%lx, not present", + commInfo->commHash); + return inspectorDeleteUnknownCommError; + } + + inspectorLockWr(&commInfoPtr->guard); + commInfoPtr->dump = false; + inspectorUnlockRWLock(&commInfoPtr->guard); + + INSPECTOR_LOCK_WR_FLAG(&deletedCommInfoList->guard, locked, + "inspectorDelComm: deletedCommInfoList::guard -wr"); + commInfoPtr->next = deletedCommInfoList->comms; + deletedCommInfoList->comms = commInfoPtr; + deletedCommInfoList->ncomms++; + INSPECTOR_UNLOCK_RW_LOCK_FLAG(&deletedCommInfoList->guard, locked, + "inspectorDelComm: deletedCommInfoList::guard -unlock"); + + return inspectorSuccess; +} + +/* + * Description: + * + * Computes the algorithmic and bus bandwidth (in GB/s) for a given + * NCCL collective operation, based on the communication info and + * completed collective details. The calculation uses the message + * size, execution time, and the type of collective operation to + * determine the effective bandwidths. The 'factor' variable adjusts + * the bus bandwidth calculation according to the communication + * pattern of each collective, as described in the NCCL performance + * documentation: + * https://github.com/NVIDIA/nccl-tests/blob/master/doc/PERFORMANCE.md + * + * Thread Safety: + * + * This function does not perform any locking and assumes the caller + * ensures thread safety if required. + * + * Input: + * + * commInfo - Pointer to inspectorCommInfo structure containing + * communicator details. + * + * completedColl- Pointer to inspectorCompletedCollInfo structure + * containing completed collective info. + * + * collType - The type of NCCL collective operation (ncclFunc_t). + * + * Output: + * Updates the algoBwGbs and busBwGbs fields of the completedColl + * structure. + * + * Return: + * N.A. 
(void function) + */ +void inspectorComputeCollBw(struct inspectorCommInfo *commInfo, + struct inspectorCompletedCollInfo *completedColl, + ncclFunc_t collType) { + double timeInSec = completedColl->execTimeUsecs / 1000000.0; + double factor = 0.0; + double trafficSize = 0.0; + switch (collType) { + case ncclFuncReduce: + case ncclFuncBroadcast: + trafficSize = (double)completedColl->msgSizeBytes; + factor = 1; + break; + case ncclFuncAllReduce: + trafficSize = (double)completedColl->msgSizeBytes; + factor = ((double)(2 * (commInfo->nranks - 1))) / ((double)commInfo->nranks); + break; + case ncclFuncReduceScatter: + trafficSize = (double)(completedColl->msgSizeBytes * commInfo->nranks); + factor = ((double)(commInfo->nranks - 1)) / ((double)commInfo->nranks); + break; + case ncclFuncAllGather: + trafficSize = (double)(completedColl->msgSizeBytes * commInfo->nranks); + factor = ((double)(commInfo->nranks - 1)) / ((double)commInfo->nranks); + break; + case ncclFuncSendRecv: + case ncclFuncSend: + case ncclFuncRecv: + trafficSize = (double)completedColl->msgSizeBytes; + factor = 1; + break; + default: + trafficSize = 0; + factor = 0.0; + } + completedColl->algoBwGbs = timeInSec != 0 ? (trafficSize / 1.0E9 / timeInSec) : 0; + completedColl->busBwGbs = completedColl->algoBwGbs * factor; +} + +/* + * Description: + * + * Helper function to calculate kernel execution time using GPU + * clock values. The GPU clock values are measured in nanoseconds + * from the globaltimer register. + * + * Thread Safety: + * Thread-safe (read-only operations on kernel info). + * + * Input: + * struct inspectorKernelChInfo *kernelCh - kernel channel info + * containing GPU clock values. + * + * Output: + * None. + * + * Return: + * uint64_t - execution time in microseconds, or 0 if invalid timing + * data. + */ +static uint64_t calculateKernelGpuExecTimeUsecs(struct inspectorKernelChInfo *kernelCh) { + if (kernelCh->startGpuClk != 0 && kernelCh->stopGpuClk != 0) { + if (kernelCh->stopGpuClk > kernelCh->startGpuClk) { + uint64_t execTimeNanosecs = kernelCh->stopGpuClk - kernelCh->startGpuClk; + return execTimeNanosecs / 1000; + } + } + return 0; +} + +/* + * Description: + * + * Calculates the maximum kernel execution time across all kernel + * channels in a collective operation, using GPU clock values when + * available and falling back to CPU timestamps when necessary. + * + * Thread Safety: + * Thread-safe (read-only operations on collective info). + * + * Input: + * struct inspectorCollInfo *collInfo - collective operation info + * containing kernel channels. + * inspectorTimingSource_t *timingSource - pointer to store the timing source used. + * + * Output: + * timingSource is set to indicate whether GPU, CPU, or collective timing was used. + * + * Return: + * + * uint64_t - maximum execution time in microseconds across all + * kernels, or collective execution time if no kernel + * timing is available. 
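+ *
+ * Example (illustrative values, two channels):
+ *   ch0: startGpuClk=1000 ns, stopGpuClk=51000 ns              -> 50 us (GPU clock)
+ *   ch1: GPU clocks unset, tsStartUsec=10, tsCompletedUsec=80  -> 70 us (CPU)
+ *   result: 70 us, timingSource = kernel_cpu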
+ * + */ +static uint64_t calculateMaxKernelExecTimeUsecs(struct inspectorCollInfo *collInfo, + inspectorTimingSource_t *timingSource) { + uint64_t maxKernelExecTimeUsecs = 0; + inspectorTimingSource_t bestTimingSource = inspectorTimingSourceCollectiveCpu; + + for (uint32_t i = 0; i < collInfo->nChannels; i++) { + struct inspectorKernelChInfo *kernelCh = &collInfo->kernelCh[i]; + uint64_t gpuExecTimeUsecs = calculateKernelGpuExecTimeUsecs(kernelCh); + if (gpuExecTimeUsecs > 0) { + if (gpuExecTimeUsecs > maxKernelExecTimeUsecs) { + maxKernelExecTimeUsecs = gpuExecTimeUsecs; + bestTimingSource = inspectorTimingSourceKernelGpu; + } + } else { + if (kernelCh->tsCompletedUsec > kernelCh->tsStartUsec) { + uint64_t cpuExecTimeUsecs = kernelCh->tsCompletedUsec - kernelCh->tsStartUsec; + if (cpuExecTimeUsecs > maxKernelExecTimeUsecs) { + maxKernelExecTimeUsecs = cpuExecTimeUsecs; + bestTimingSource = inspectorTimingSourceKernelCpu; + } + } + } + } + + if (maxKernelExecTimeUsecs > 0) { + *timingSource = bestTimingSource; + return maxKernelExecTimeUsecs; + } else { + *timingSource = inspectorTimingSourceCollectiveCpu; + return collInfo->tsCompletedUsec - collInfo->tsStartUsec; + } +} + +/* + * Description: + * + * Updates the performance information for a completed collective + * operation. + * + * Thread Safety: + * Thread-safe (uses locks internally). + * + * Input: + * struct inspectorCommInfo *commInfo - communicator info. + * struct inspectorCollInfo *collInfo - completed collective info. + * + * Output: + * commInfo is updated with completed collective info. + * + * Return: + * None. + * + */ +void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl, + struct inspectorCollInfo *collInfo) { + completedColl->func = ncclStringToFunc(collInfo->func); + completedColl->sn = collInfo->sn; + completedColl->msgSizeBytes = collInfo->msgSizeBytes; + completedColl->execTimeUsecs = + calculateMaxKernelExecTimeUsecs(collInfo, &completedColl->timingSource); + completedColl->collEvtTrk = collInfo->collEvtTrk; +} + +/* + * Description: + * + * Finalizes the global inspector state and stops the dump thread if + * running. + * + * Thread Safety: + * Not thread-safe (should be called during teardown). + * + * Input: + * None. + * + * Output: + * Global state is finalized and dump thread is stopped. + * + * Return: + * inspectorResult_t - success or error code. 
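+ *
+ * Usage sketch:
+ *   inspectorGlobalInit(rank);   // once, during plugin init
+ *   ...
+ *   inspectorGlobalFinalize();   // once, during plugin teardown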
+ * + */ +inspectorResult_t inspectorGlobalFinalize() { + if (dumper) { + dumper->stopThread(); + delete dumper; + dumper = nullptr; + } + return inspectorSuccess; +} diff --git a/ext-profiler/inspector/inspector.h b/ext-profiler/inspector/inspector.h new file mode 100644 index 000000000..98e050f97 --- /dev/null +++ b/ext-profiler/inspector/inspector.h @@ -0,0 +1,198 @@ +#pragma once + +#include + +#include "json.h" +#include "common.h" +#include "version.h" + +#define MAX_CHANNELS 64 + +#define INS_CHK_GOTO(call, res, label) \ + do { \ + res = call; \ + if (inspectorSuccess != res) { \ + INFO(NCCL_INSPECTOR, "%s:%d -> error %d: %s", __FILE__, __LINE__, res, \ + inspectorErrorString(res)); \ + goto label; \ + } \ + } while (0); + + +typedef enum { + ncclFuncBroadcast = 0, + ncclFuncReduce = 1, + ncclFuncAllGather = 2, + ncclFuncReduceScatter = 3, + ncclFuncAllReduce = 4, + ncclFuncSendRecv = 5, + ncclFuncSend = 6, + ncclFuncRecv = 7, + ncclNumFuncs = 8 +} ncclFunc_t; + +typedef enum { + inspectorSuccess = 0, + inspectorUninitializedError, + inspectorMemoryError, + inspectorFileOpenError, + inspectorDisabledError, + inspectorLockError, + inspectorPthreadError, + inspectorJsonError, + inspectorCudaError, + inspectorBadHash, + inspectorDeleteUnknownCommError, + inspectorAddDuplicateCommError, + inspectorNop, + inspectorNullTally, + inspectorGlobalInitError, + inspectorReturn, +} inspectorResult_t; + +typedef enum { + inspectorTimingSourceKernelGpu = 0, + inspectorTimingSourceKernelCpu = 1, + inspectorTimingSourceCollectiveCpu = 2, +} inspectorTimingSource_t; + +struct inspectorEventTraceInfo { + uint64_t ts; + uint64_t sn; +}; + +typedef enum { + NCCL_INSP_EVT_TRK_COLL_START = 0, + NCCL_INSP_EVT_TRK_COLL_STOP = 1, + NCCL_INSP_EVT_TRK_COLL_NEVT = 2, +} inspectorEventTrkColl_t; + +typedef enum { + NCCL_INSP_EVT_TRK_KERNEL_START = 0, + NCCL_INSP_EVT_TRK_KERNEL_STOP = 1, + NCCL_INSP_EVT_TRK_KERNEL_RECORD = 2, + NCCL_INSP_EVT_TRK_KERNEL_NEVT = 3, +} inspectorEventTrkKernel_t; + +struct inspectorEventTrkKernelInfo { + struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_KERNEL_NEVT]; +}; + +struct inspectorEventTrkCollInfo { + int sn; + uint32_t nChannels; + struct inspectorEventTraceInfo evntTrace[NCCL_INSP_EVT_TRK_COLL_NEVT]; + struct inspectorEventTrkKernelInfo kernelCh[MAX_CHANNELS]; +}; + +struct inspectorCompletedCollInfo { + ncclFunc_t func; + uint64_t sn; + size_t msgSizeBytes; + uint64_t execTimeUsecs; + inspectorTimingSource_t timingSource; + double algoBwGbs; + double busBwGbs; + // Event trace information + struct inspectorEventTrkCollInfo collEvtTrk; +}; + +enum { + NCCL_COMM_HASH_LENGTH = 17 +}; + +struct inspectorCommInfo { + struct inspectorCommInfo* next; + + const char* commName; + uint64_t commHash; + char commHashStr[NCCL_COMM_HASH_LENGTH]; + int rank; + int nranks; + int nnodes; + + bool dump; + struct inspectorCompletedCollInfo completedCollInfo; + pthread_rwlock_t guard; +}; + +struct inspectorKernelChInfo { + uint64_t type; + int refCount; /*unused*/ + struct inspectorCollInfo *collInfo; + uint8_t channelId; + uint64_t tsStartUsec; + uint64_t tsCompletedUsec; + uint64_t startGpuClk; + uint64_t stopGpuClk; +}; + +struct inspectorCollInfo { + uint64_t type; + int refCount; + struct inspectorCommInfo *commInfo; + const char* func; + uint64_t sn; + size_t msgSizeBytes; + uint64_t tsStartUsec; + uint64_t tsCompletedUsec; + uint32_t nChannels; + uint32_t nKernelChStarted; + uint32_t nKernelChCompleted; + pthread_rwlock_t guard; + struct inspectorKernelChInfo 
kernelCh[MAX_CHANNELS]; + struct inspectorEventTrkCollInfo collEvtTrk; +}; + + + +extern ncclDebugLogger_t logFn; +#define VERSION(...) logFn(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) +#define INFO(FLAGS, ...) logFn(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) +#define WARN(...) logFn(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) + +inline int ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + case ncclFloat8e4m3: + case ncclFloat8e5m2: + return 1; + case ncclFloat16: + case ncclBfloat16: + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; + } +} + +const char* inspectorErrorString(inspectorResult_t result); + +inspectorResult_t inspectorLockInit(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorLockDestroy(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorLockRd(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorLockWr(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorUnlockRWLock(pthread_rwlock_t* lockRef); +inspectorResult_t inspectorGlobalInit(int rank); +inspectorResult_t inspectorGlobalFinalize(); +uint64_t inspectorGetTime(); +inspectorResult_t inspectorAddComm(struct inspectorCommInfo **commInfo, + const char* commName, uint64_t commHash, + int nNodes, int nranks, int rank); +inspectorResult_t inspectorDelComm(struct inspectorCommInfo *commInfo); + +void inspectorUpdateCollPerf(struct inspectorCompletedCollInfo *completedColl, + struct inspectorCollInfo *collInfo); +ncclDataType_t inspectorStringToDatatype(const char* str); + +void inspectorComputeCollBw(struct inspectorCommInfo *commInfo, + struct inspectorCompletedCollInfo *completedColl, + ncclFunc_t collType); diff --git a/ext-profiler/inspector/inspector_plugin.cc b/ext-profiler/inspector/inspector_plugin.cc new file mode 100644 index 000000000..b1872157d --- /dev/null +++ b/ext-profiler/inspector/inspector_plugin.cc @@ -0,0 +1,493 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "profiler.h" +#include "inspector.h" + +#define __hidden __attribute__ ((visibility("hidden"))) + +static int gInitialized; + +static pthread_mutex_t gLock = PTHREAD_MUTEX_INITIALIZER; + + +/* + * Description: + * Records an event trace with timestamp and sequence number + * + * Thread Safety: + * Not thread-safe - must be called with proper locking. This function + * is designed to be called from within locked sections where the + * collective info structure is already protected. 
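+ *
+ * Usage sketch:
+ *   inspectorRecordEventTrace(collInfo->collEvtTrk.evntTrace,
+ *                             NCCL_INSP_EVT_TRK_COLL_START, collInfo);
+ *   stores the current time and the next per-collective sequence number.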
+ * + * Input: + * struct inspectorEventTraceInfo* evtTrace - event trace array + * int eventIndex - index in the event trace array (must be valid) + * struct inspectorCollInfo* collInfo - collective info structure (must not be NULL) + * + * Output: + * Event trace is updated with current timestamp and next sequence + * number from collective + * + * Return: + * uint64_t - the sequence number assigned to this event + * + * Preconditions: + * - collInfo must not be NULL + * - eventIndex must be within valid bounds for evtTrace array + * - Function must be called from within a locked section + */ +static uint64_t inspectorRecordEventTrace(struct inspectorEventTraceInfo* evtTrace, + int eventIndex, + struct inspectorCollInfo* collInfo) { + evtTrace[eventIndex].ts = inspectorGetTime(); + evtTrace[eventIndex].sn = ++collInfo->collEvtTrk.sn; // Increment coll sequence counter + + return evtTrace[eventIndex].sn; +} + +/* + * Description: + * + * Initializes the NCCL Inspector plugin and global state for a + * communicator. + * + * Thread Safety: + * Thread-safe (uses mutex for initialization). + * + * Input: + * void** context - pointer to plugin context. + * int* eActivationMask - pointer to activation mask output. + * const char* commName - communicator name. + * uint64_t commHash - communicator hash. + * int nNodes - number of nodes. + * int nranks - number of ranks. + * int rank - rank. + * ncclDebugLogger_t logfn - logger function pointer. + * + * Output: + * context is set to plugin context; eActivationMask is set. + * + * Return: + * ncclResult_t - success or error code. + * + */ +__hidden ncclResult_t inspectorPluginInit(void** context, uint64_t commHash, + int* eActivationMask, + const char* commName, + int nNodes, int nranks, int rank, + ncclDebugLogger_t logfn) { + inspectorResult_t res = inspectorSuccess; + *context = nullptr; + logFn = logfn; + + pthread_mutex_lock(&gLock); + if (++gInitialized == 1) { + res = inspectorGlobalInit(rank); + if (res != inspectorSuccess) { + WARN("Inspector Init Failed %s:%d -> error %d: %s",__FILE__, __LINE__, res, + inspectorErrorString(res)); + gInitialized = 0; + pthread_mutex_unlock(&gLock); + return ncclInternalError; + } + } + pthread_mutex_unlock(&gLock); + + INS_CHK_GOTO(inspectorAddComm((struct inspectorCommInfo **)context, + commName, commHash, + nNodes, nranks, rank), res, success); + *eActivationMask = ncclProfileColl | ncclProfileKernelCh; + INFO(NCCL_INIT, "PROFILER/Plugin: init commName: %s commHash: %lu nranks: %d rank: %d", + commName ? commName : "", commHash, nranks, rank); +success: + if (res != inspectorSuccess) { + return ncclInternalError; + } else { + return ncclSuccess; + } +} + +/* + * Description: + * + * Finalizes the NCCL Inspector plugin and global state for a + * communicator. + * + * Thread Safety: + * Thread-safe (uses mutex for finalization). + * + * Input: + * void* context - plugin context. + * + * Output: + * Plugin context is finalized and cleaned up. + * + * Return: + * ncclResult_t - success or error code. 
+ * + */ +__hidden ncclResult_t inspectorPluginFinalize(void* context) { + inspectorDelComm((struct inspectorCommInfo *)context); + pthread_mutex_lock(&gLock); + if (--gInitialized == 0) { + inspectorGlobalFinalize(); + } + pthread_mutex_unlock(&gLock); + return ncclSuccess; +} + +inspectorResult_t inspectorPluginCollInfoRef(struct inspectorCollInfo *collInfo) { + collInfo->refCount += 1; + return inspectorSuccess; +} + +inspectorResult_t inspectorPluginCollInfoRefSafe(struct inspectorCollInfo *collInfo) { + inspectorLockWr(&collInfo->guard); + inspectorPluginCollInfoRef(collInfo); + inspectorUnlockRWLock(&collInfo->guard); + return inspectorSuccess; +} + +inspectorResult_t inspectorPluginCollInfoDeRef(struct inspectorCollInfo *collInfo) { + collInfo->refCount -= 1; + if (collInfo->refCount == 0) { + inspectorLockDestroy(&collInfo->guard); + memset(collInfo, 0, sizeof(struct inspectorCollInfo)); + free(collInfo); + return inspectorReturn; + } + return inspectorSuccess; +} + +inspectorResult_t inspectorPluginCollInfoDeRefSafe(struct inspectorCollInfo *collInfo) { + inspectorLockWr(&collInfo->guard); + inspectorResult_t res = inspectorPluginCollInfoDeRef(collInfo); + inspectorUnlockRWLock(&collInfo->guard); + return res; +} + +/* + * Description: + * Initializes a new inspectorCollInfo structure for a collective + * event. + * + * Thread Safety: + * Not thread-safe (allocates and initializes a new collective info + * structure). + * + * Input: + * + * struct inspectorCollInfo **collInfo - pointer to output + * collective info struct. + * ncclProfilerEventDescr_t *eDescr - event descriptor. + * + * Output: + * collInfo is set to the new collective info struct. + * + * Return: + * None. + */ +static void inspectorPluginCollInfoInit(struct inspectorCollInfo **collInfo, + ncclProfilerEventDescr_t *eDescr, + struct inspectorCommInfo *commInfo) { + struct inspectorCollInfo *collInfoPtr + = (struct inspectorCollInfo*)calloc(1, sizeof(struct inspectorCollInfo)); + if (collInfoPtr == nullptr) { + WARN("Inspector: Failed to allocate memory for collective info structure"); + *collInfo = nullptr; + return; + } + collInfoPtr->type = ncclProfileColl; + collInfoPtr->refCount = 0; + inspectorPluginCollInfoRef(collInfoPtr); //self ref; no locks needed + collInfoPtr->func = eDescr->coll.func; + collInfoPtr->sn = eDescr->coll.seqNumber; + collInfoPtr->nChannels = eDescr->coll.nChannels; + if (collInfoPtr->nChannels > 0) { + inspectorPluginCollInfoRef(collInfoPtr); //extra ref for kernel completion + } + collInfoPtr->tsStartUsec = inspectorGetTime(); + collInfoPtr->msgSizeBytes = + ncclTypeSize(inspectorStringToDatatype(eDescr->coll.datatype)) * eDescr->coll.count; + + + collInfoPtr->commInfo = commInfo; + collInfoPtr->collEvtTrk.sn = 0; + collInfoPtr->collEvtTrk.nChannels = collInfoPtr->nChannels; + inspectorRecordEventTrace(collInfoPtr->collEvtTrk.evntTrace, + NCCL_INSP_EVT_TRK_COLL_START, collInfoPtr); + + inspectorLockInit(&collInfoPtr->guard); + *collInfo = collInfoPtr; +} + +/* + * Description: + * + * Initializes a new inspectorKernelChInfo structure for a kernel + * channel event. + * + * Thread Safety: + * Not thread-safe (initializes kernel channel info within a + * collective info structure). + * + * Input: + * struct inspectorKernelChInfo **kernelChInfo - pointer to output + * kernel channel info struct. + * ncclProfilerEventDescr_t *eDescr - event descriptor. + * + * Output: + * + * kernelChInfo is set to the new kernel channel info struct. + * + * Return: + * None. 
+ */ +static void inspectorPluginKernelChInfoInit(struct inspectorKernelChInfo **kernelChInfo, + ncclProfilerEventDescr_t *eDescr) { + if (eDescr->parentObj) { + uint64_t parentType=*(uint64_t*)eDescr->parentObj; + if (parentType == ncclProfileColl) { + struct inspectorCollInfo *collInfo = (struct inspectorCollInfo*)eDescr->parentObj; + if (collInfo && collInfo->type == ncclProfileColl) { + inspectorLockWr(&collInfo->guard); + struct inspectorEventTraceInfo *krnlEvtTrk = + collInfo->collEvtTrk.kernelCh[eDescr->kernelCh.channelId].evntTrace; + inspectorRecordEventTrace(krnlEvtTrk, + NCCL_INSP_EVT_TRK_KERNEL_START, + collInfo); + struct inspectorKernelChInfo *kernelChInfoPtr + = &collInfo->kernelCh[eDescr->kernelCh.channelId]; + kernelChInfoPtr->type = ncclProfileKernelCh; + kernelChInfoPtr->channelId = eDescr->kernelCh.channelId; + kernelChInfoPtr->startGpuClk = eDescr->kernelCh.pTimer; + if (kernelChInfoPtr->stopGpuClk == 0) { + inspectorPluginCollInfoRef(collInfo); //Pairs with Record Kernel Stop event + } + kernelChInfoPtr->tsStartUsec = inspectorGetTime(); + if (collInfo->nKernelChStarted == 0) { + collInfo->tsStartUsec = kernelChInfoPtr->tsStartUsec; + } + collInfo->nKernelChStarted += 1; + inspectorPluginCollInfoRef(collInfo); //Pairs with Stop Kernel Event + kernelChInfoPtr->collInfo = collInfo; + + *kernelChInfo = kernelChInfoPtr; + inspectorUnlockRWLock(&collInfo->guard); + } + } + } +} +/* + * Description: + * + * Starts a profiling event for the NCCL Inspector plugin. + * + * Thread Safety: + * Thread-safe (allocates and initializes event structures). + * + * Input: + * void* context - plugin context. + * void** eHandle - pointer to event handle output. + * ncclProfilerEventDescr_t* eDescr - event descriptor. + * + * Output: + * eHandle is set to the new event structure. + * + * Return: + * ncclResult_t - success or error code. + * + */ +__hidden ncclResult_t inspectorPluginStartEvent(void* context, + void** eHandle, + ncclProfilerEventDescr_t* eDescr) { + if (context == nullptr || eDescr == nullptr) { + INFO(NCCL_INIT, "Profiler/Plugin: context/eDescr NULL for start event %s", __func__); + return ncclSuccess; + } + *eHandle = nullptr; + if (eDescr->type == ncclProfileColl) { + struct inspectorCollInfo *collEvent = nullptr; + struct inspectorCommInfo *commInfoCtx = (struct inspectorCommInfo*)context; + inspectorPluginCollInfoInit(&collEvent, eDescr, commInfoCtx); + *eHandle = collEvent; + } else if (eDescr->type == ncclProfileKernelCh) { + struct inspectorKernelChInfo *kernelChEvent = nullptr; + inspectorPluginKernelChInfoInit(&kernelChEvent, eDescr); + *eHandle = kernelChEvent; + } else { + return ncclSuccess; + } + return ncclSuccess; +} + +/* + * Description: + * + * Stops a profiling event for the NCCL Inspector plugin. + * + * Thread Safety: + * + * Thread-safe (updates event state and performance info). + * + * Input: + * + * void *eHandle - event handle. + * + * Output: + * + * Event is stopped and performance info may be updated. + * + * Return: + * ncclResult_t - success or error code. 
+ * + */ +__hidden ncclResult_t inspectorPluginStopEvent(void *eHandle) { + + if (eHandle == nullptr) { + INFO(NCCL_INIT, + "Profiler/Plugin: Event Handle NULL for start event %s", __func__); + return ncclSuccess; + } + uint64_t type = *(uint64_t *)eHandle; + inspectorResult_t res = inspectorSuccess; + + if (type == ncclProfileColl) { + struct inspectorCollInfo *collInfo = (struct inspectorCollInfo *)eHandle; + // Record collective stop event + inspectorLockWr(&collInfo->guard); + inspectorRecordEventTrace(collInfo->collEvtTrk.evntTrace, + NCCL_INSP_EVT_TRK_COLL_STOP, + collInfo); + res = inspectorPluginCollInfoDeRef(collInfo); + if (res == inspectorReturn) { + // WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileColl"); + return ncclSuccess; + } + inspectorUnlockRWLock(&collInfo->guard); + return ncclSuccess; + } else if (type == ncclProfileKernelCh) { + struct inspectorKernelChInfo *kernelChInfo + = (struct inspectorKernelChInfo *)eHandle; + struct inspectorCollInfo *collInfo = kernelChInfo->collInfo; + if (collInfo && collInfo->type == ncclProfileColl) { + inspectorLockWr(&collInfo->guard); + struct inspectorEventTraceInfo *krnlEvtTrk = + collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace; + inspectorRecordEventTrace(krnlEvtTrk, + NCCL_INSP_EVT_TRK_KERNEL_STOP, + collInfo); + kernelChInfo->tsCompletedUsec = inspectorGetTime(); + collInfo->nKernelChCompleted += 1; + + res = inspectorPluginCollInfoDeRef(collInfo); + if (res == inspectorReturn) { + WARN("NCCL Inspector unnatural return: inspectorPluginStopEvent:ncclProfileKernelCh"); + return ncclSuccess; + } + if ((collInfo->nKernelChCompleted == collInfo->nKernelChStarted) + && (collInfo->nKernelChCompleted == collInfo->nChannels)) { + struct inspectorCompletedCollInfo completedColl; + struct inspectorCommInfo *commInfo = collInfo->commInfo; + collInfo->tsCompletedUsec = kernelChInfo->tsCompletedUsec; + inspectorUpdateCollPerf(&completedColl, collInfo); + + res = inspectorPluginCollInfoDeRef(collInfo); + if (res != inspectorReturn) { + inspectorUnlockRWLock(&collInfo->guard); + } + if (commInfo != nullptr) { + inspectorLockWr(&commInfo->guard); + inspectorComputeCollBw(commInfo, + &completedColl, + completedColl.func); + memcpy(&commInfo->completedCollInfo, + &completedColl, + sizeof(struct inspectorCompletedCollInfo)); + commInfo->dump = true; + inspectorUnlockRWLock(&commInfo->guard); + } + return ncclSuccess; + } + inspectorUnlockRWLock(&collInfo->guard); + } + return ncclSuccess; + } + return ncclSuccess; +} + +/* + * Description: + * + * Records the state of a profiling event for the NCCL Inspector + * plugin. + * + * Thread Safety: + * + * Thread-safe (updates event state as needed). + * + * Input: + * void* eHandle - event handle. + * ncclProfilerEventState_t eState - event state. + * ncclProfilerEventStateArgs_t* eStateArgs - event state arguments. + * + * Output: + * Event state is updated as needed. + * + * Return: + * ncclResult_t - success or error code. 
+ * + */ +__hidden ncclResult_t inspectorPluginRecordEventState(void* eHandle, + ncclProfilerEventState_t eState, + ncclProfilerEventStateArgs_t* eStateArgs) { + if (eHandle == nullptr || eStateArgs == nullptr) + return ncclSuccess; + + uint64_t type = *(uint64_t *)eHandle; + + if (type == ncclProfileKernelCh && eState == ncclProfilerKernelChStop) { + struct inspectorKernelChInfo *kernelChInfo = (struct inspectorKernelChInfo *)eHandle; + struct inspectorCollInfo *collInfo = kernelChInfo->collInfo; + inspectorResult_t res = inspectorSuccess; + if (collInfo && collInfo->type == ncclProfileColl) { + inspectorLockWr(&collInfo->guard); + struct inspectorEventTraceInfo *krnlEvtTrk + = collInfo->collEvtTrk.kernelCh[kernelChInfo->channelId].evntTrace; + inspectorRecordEventTrace(krnlEvtTrk, + NCCL_INSP_EVT_TRK_KERNEL_RECORD, + collInfo); + kernelChInfo->stopGpuClk = eStateArgs->kernelCh.pTimer; + if (kernelChInfo->startGpuClk != 0) { + res = inspectorPluginCollInfoDeRef(collInfo); + if (res == inspectorReturn) { + WARN("NCCL Inspector unnatural return: inspectorPluginRecordEventState"); + return ncclSuccess; + } + } + inspectorUnlockRWLock(&collInfo->guard); + } + } + return ncclSuccess; +} + +ncclProfiler_t ncclProfiler_v5 = { + "Inspector", + inspectorPluginInit, + inspectorPluginStartEvent, + inspectorPluginStopEvent, + inspectorPluginRecordEventState, + inspectorPluginFinalize, +}; diff --git a/ext-profiler/inspector/json.cc b/ext-profiler/inspector/json.cc new file mode 100644 index 000000000..e95d98d18 --- /dev/null +++ b/ext-profiler/inspector/json.cc @@ -0,0 +1,496 @@ +#include "json.h" +#include +#include +#include +#include +#include +#include + +const char* jsonErrorString(jsonResult_t res) { + switch (res) { + case jsonSuccess: + return "jsonSuccess"; + case jsonFileError: + return "jsonFileError"; + case jsonUnknownStateError: + return "jsonUnknownStateError"; + case jsonEmptyStateError: + return "jsonEmptyStateError"; + case jsonExpectedNonNoneStateError: + return "jsonExpectedNonNoneStateError"; + case jsonMemoryError: + return "jsonMemoryError"; + case jsonStringOverflowError: + return "jsonStringOverflowError"; + case jsonStringBadChar: + return "jsonStringBadChar"; + case jsonLockError: + return "jsonLockError"; + default: + return "unknown json error"; + } +} + +// We use these statics to mantain a stack of states where we are writing. +typedef struct jsonFileOutput { + jsonState_t* states; + size_t state_cap; // Allocated stack capacity + size_t state_n; // # of items in the stack. 
+ FILE* fp; + pthread_mutex_t mutex; +} jsonFileOutput; + +jsonResult_t jsonInitFileOutput(jsonFileOutput** jfo, const char* outfile) { + jsonFileOutput* new_jfo = (jsonFileOutput*)malloc(sizeof(jsonFileOutput)); + if (new_jfo == NULL) { + return jsonMemoryError; + } + if (pthread_mutex_init(&new_jfo->mutex, NULL) != 0) { + free(new_jfo); + *jfo = 0; + return jsonLockError; + } + new_jfo->states = NULL; + new_jfo->state_cap = 0; + new_jfo->state_n = 0; + new_jfo->fp = fopen(outfile, "w"); + if (new_jfo->fp == NULL) { + free(new_jfo); + *jfo = 0; + return jsonFileError; + } + *jfo = new_jfo; + return jsonSuccess; +} + +jsonResult_t jsonNewline(jsonFileOutput* jfo) { + fprintf(jfo->fp, "\n"); + return jsonSuccess; +} + +jsonResult_t jsonFlushOutput(jsonFileOutput* jfo) { + fflush(jfo->fp); + return jsonSuccess; +} + +jsonResult_t jsonLockOutput(jsonFileOutput* jfo) { + if (pthread_mutex_lock(&jfo->mutex) != 0) { + return jsonLockError; + } + return jsonSuccess; +} + +jsonResult_t jsonUnlockOutput(jsonFileOutput* jfo) { + if (pthread_mutex_unlock(&jfo->mutex) != 0) { + return jsonLockError; + } + return jsonSuccess; +} + +jsonResult_t jsonFinalizeFileOutput(jsonFileOutput* jfo) { + // Really should probably complain if we aren't in a valid state + + if (pthread_mutex_destroy(&jfo->mutex) != 0) { + free(jfo); + return jsonLockError; + } + if (jfo->states != NULL) { + free(jfo->states); + } + jfo->states = NULL; + jfo->state_cap = 0; + jfo->state_n = 0; + if (jfo->fp) { + fclose(jfo->fp); + jfo->fp = 0; + } + + free(jfo); + return jsonSuccess; +} + +static int utf8copy(unsigned char* out, int out_lim, const unsigned char* in) { + int copy_len; + if ((in[0] & 0xE0) == 0xC0) { + // 2-byte sequence + if ((in[1] & 0xC0) != 0x80 || out_lim < 2) { + return 0; + } + copy_len = 2; + } else if ((in[0] & 0xF0) == 0xE0) { + // 3-byte sequence + if ((in[1] & 0xC0) != 0x80 || (in[2] & 0xC0) != 0x80 || out_lim < 3) { + return 0; + } + copy_len = 3; + } else if ((in[0] & 0xF8) == 0xF0) { + // 4-byte sequence + if ((in[1] & 0xC0) != 0x80 || (in[2] & 0xC0) != 0x80 || (in[3] & 0xC0) != 0x80 || out_lim < 4) { + return 0; + } + copy_len = 4; + } else { + // Invalid start byte + return 0; + } + + for (int i = 0; i < copy_len; ++i) { + out[i] = in[i]; + } + + return copy_len; +} + +// This tries to sanitize/quote a string from 'in' into 'out', +// assuming 'out' has length 'lim'. We mainly quote ",/,\,\t,\n, and +// bail if we encounter non-printable stuff or non-ASCII stuff. +// 'in' should be null-terminated, of course. +// +// We return false if we were not able to copy all of 'in', either for +// length reasons or for unhandled characters. 
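+// (More precisely: the return value is a jsonResult_t. jsonStringOverflowError is
+// returned when 'out' runs out of space, jsonStringBadChar for control characters
+// or malformed UTF-8; valid multi-byte UTF-8 sequences are copied through as-is
+// via utf8copy, and jsonSuccess is returned when all of 'in' was copied.)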
+static jsonResult_t sanitizeJson(unsigned char out[], int lim, const unsigned char* in) { + int c = 0; + while (*in) { + if (c + 1 >= lim) { + out[c] = 0; + return jsonStringOverflowError; + } + switch (*in) { + case '"': + case '\\': + case '/': + case '\t': + case '\n': + if (c + 2 > lim) { + out[c] = 0; + return jsonStringOverflowError; + } + + out[c++] = '\\'; + if (*in == '\n') { + out[c++] = 'n'; + } else if (*in == '\t') { + out[c++] = 't'; + } else { + out[c++] = *in; + } + ++in; + break; + default: + if (*in <= 0x1F) { + out[c] = 0; + return jsonStringBadChar; + } else if (*in <= 0x7F) { + out[c++] = *in; + ++in; + } else { + const int utf8len = utf8copy(out + c, lim - c - 1, in); + if (utf8len == 0) { + out[c] = 0; + return jsonStringBadChar; + } + c += utf8len; + in += utf8len; + } + break; + } + } + out[c] = 0; + return jsonSuccess; +} + +static size_t max(size_t a, size_t b) { + if (a < b) { + return b; + } + return a; +} + +// Push state onto the state stack. Reallocate for extra storage if needed. +// Because JSON_NONE is a pseudo-state, don't allow it to be pushed. +static jsonResult_t jsonPushState(jsonFileOutput* jfo, jsonState_t state) { + if (state == JSON_NONE) { + return jsonExpectedNonNoneStateError; + } + if (jfo->state_cap <= (jfo->state_n + 1)) { + jfo->state_cap = max((size_t)16, jfo->state_cap * 2); + jfo->states = (jsonState_t*)realloc(jfo->states, sizeof(jsonState_t) * jfo->state_cap); + if (jfo->states == 0) { + return jsonMemoryError; + } + } + jfo->states[jfo->state_n++] = state; + return jsonSuccess; +} + +// Return the current state at the top of the stack +static jsonState_t jsonCurrState(const jsonFileOutput* jfo) { + if (jfo->state_n == 0) { + return JSON_NONE; + } + return jfo->states[jfo->state_n - 1]; +} + +// Replace the stack with state (equivalent to a pop & push if stack is not empty) +static jsonResult_t jsonReplaceState(jsonFileOutput* jfo, jsonState_t state) { + if (state == JSON_NONE) { + return jsonExpectedNonNoneStateError; + } + if (jfo->state_n == 0) { + return jsonEmptyStateError; + } + jfo->states[jfo->state_n - 1] = state; + return jsonSuccess; +} + +// Pop the top state off the stack, or return that the state is empty +static jsonState_t jsonPopState(jsonFileOutput* jfo) { + if (jfo->state_n == 0) { + return JSON_NONE; + } + return jfo->states[--jfo->state_n]; +} + +// Emit a key and separator. Santize the key. +// This is only acceptable if the top state is an object +// Emit a ',' separator of we aren't the first item. +jsonResult_t jsonKey(jsonFileOutput* jfo, const char* name) { + switch (jsonCurrState(jfo)) { + case JSON_OBJECT_EMPTY: + jsonReplaceState(jfo, JSON_OBJECT_SOME); + break; + case JSON_OBJECT_SOME: + fprintf(jfo->fp, ","); + break; + default: + return jsonUnknownStateError; + } + unsigned char tmp[2048]; + const jsonResult_t res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)name); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "\"%s\":", tmp); + jsonPushState(jfo, JSON_KEY); + return jsonSuccess; +} + +// Helper function for inserting values. +// Only acceptable after keys, top-level, or in lists. +// Emit preceeding ',' if in a list and not first item. 
+static jsonResult_t jsonValHelper(jsonFileOutput* jfo) { + switch (jsonCurrState(jfo)) { + case JSON_LIST_EMPTY: + jsonReplaceState(jfo, JSON_LIST_SOME); + break; + case JSON_LIST_SOME: + fprintf(jfo->fp, ","); + break; + case JSON_KEY: + jsonPopState(jfo); + break; + case JSON_NONE: + break; + default: + return jsonUnknownStateError; + } + return jsonSuccess; +} + +// Start an object +jsonResult_t jsonStartObject(jsonFileOutput* jfo) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "{"); + return jsonPushState(jfo, JSON_OBJECT_EMPTY); +} + +// Close an object +jsonResult_t jsonFinishObject(jsonFileOutput* jfo) { + switch (jsonPopState(jfo)) { + case JSON_OBJECT_EMPTY: + case JSON_OBJECT_SOME: + break; + default: + return jsonUnknownStateError; + } + fprintf(jfo->fp, "}"); + return jsonSuccess; +} + +// Start a list +jsonResult_t jsonStartList(jsonFileOutput* jfo) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "["); + return jsonPushState(jfo, JSON_LIST_EMPTY); +} + +// Close a list +jsonResult_t jsonFinishList(jsonFileOutput* jfo) { + switch (jsonPopState(jfo)) { + case JSON_LIST_EMPTY: + case JSON_LIST_SOME: + break; + default: + return jsonUnknownStateError; + } + fprintf(jfo->fp, "]"); + return jsonSuccess; +} + +// Write a null value +jsonResult_t jsonNull(jsonFileOutput* jfo) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "null"); + return jsonSuccess; +} + +// Write a (sanititzed) string +jsonResult_t jsonStr(jsonFileOutput* jfo, const char* str) { + if (str == NULL) { + jsonNull(jfo); + return jsonSuccess; + } + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + unsigned char tmp[2048]; + const jsonResult_t san_res = sanitizeJson(tmp, sizeof(tmp), (const unsigned char*)str); + if (san_res != jsonSuccess) { + return san_res; + } + fprintf(jfo->fp, "\"%s\"", tmp); + return jsonSuccess; +} + +// Write a bool as "true" or "false" strings. +jsonResult_t jsonBool(jsonFileOutput* jfo, bool val) { + return jsonStr(jfo, val ? 
"true" : "false"); +} + +// Write an integer value +jsonResult_t jsonInt(jsonFileOutput* jfo, const int val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "%d", val); + return jsonSuccess; +} + +// Write an integer value +jsonResult_t jsonUint32(jsonFileOutput* jfo, const uint32_t val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "%u", val); + return jsonSuccess; +} + + +// Write an integer value +jsonResult_t jsonUint64(jsonFileOutput* jfo, const uint64_t val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "%lu", val); + return jsonSuccess; +} + +// Write a size_t value +jsonResult_t jsonSize_t(jsonFileOutput* jfo, const size_t val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + fprintf(jfo->fp, "%zu", val); + return jsonSuccess; +} + +// Write a double value +jsonResult_t jsonDouble(jsonFileOutput* jfo, const double val) { + const jsonResult_t res = jsonValHelper(jfo); + if (res != jsonSuccess) { + return res; + } + if (val != val) { + fprintf(jfo->fp, "\"nan\""); + } else { + fprintf(jfo->fp, "%lf", val); + } + return jsonSuccess; +} + +#ifdef DO_JSON_TEST +// compile with +// gcc json.cc -Iinclude/ -DDO_JSON_TEST -o json_test +// run with: +// ./json_test +// if something fails, it will print out the error +// if it all works, print out "output matches reference" +#define JSONCHECK(expr) \ + do { \ + const jsonResult_t res = (expr); \ + if (res != jsonSuccess) { \ + fprintf(stderr, "jsonError: %s\n", jsonErrorString(res)); \ + exit(1); \ + } \ + } while (0) + +int main() { + + const char refstr[] = + "{\"number\":123,\"utfstring\":\"∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ " + "¬β = ¬(¬α ∨ β),\",\"list\":[\"true\",null,9423812381231,3123111,0.694234]}"; + + jsonFileOutput* jfo; + JSONCHECK(jsonInitFileOutput(&jfo, "test.json")); + JSONCHECK(jsonStartObject(jfo)); + JSONCHECK(jsonKey(jfo, "number")); + JSONCHECK(jsonInt(jfo, 123)); + JSONCHECK(jsonKey(jfo, "utfstring")); + JSONCHECK( + jsonStr(jfo, "∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),")); + JSONCHECK(jsonKey(jfo, "list")); + JSONCHECK(jsonStartList(jfo)); + JSONCHECK(jsonBool(jfo, true)); + JSONCHECK(jsonNull(jfo)); + JSONCHECK(jsonUint64(jfo, 9423812381231ULL)); + JSONCHECK(jsonSize_t(jfo, 3123111)); + JSONCHECK(jsonDouble(jfo, 0.69423413)); + JSONCHECK(jsonFinishList(jfo)); + JSONCHECK(jsonFinishObject(jfo)); + JSONCHECK(jsonFinalizeFileOutput(jfo)); + + FILE* fp = fopen("test.json", "r"); + + const size_t reflen = sizeof(refstr) / sizeof(char); + + char buffer[reflen]; + + fread(buffer, sizeof(char), reflen, fp); + + fclose(fp); + + if (memcmp(buffer, refstr, reflen) == 0) { + printf("output matches reference\n"); + } else { + printf("output %s\nreference %s\n", buffer, refstr); + return 1; + } + + return 0; +} + +#endif diff --git a/ext-profiler/inspector/json.h b/ext-profiler/inspector/json.h new file mode 100644 index 000000000..a0b684843 --- /dev/null +++ b/ext-profiler/inspector/json.h @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include + +typedef enum { + JSON_NONE, // A pseudo-state meaning that the document is empty + JSON_KEY, + JSON_OBJECT_EMPTY, + JSON_OBJECT_SOME, + JSON_LIST_EMPTY, + JSON_LIST_SOME, +} jsonState_t; + +typedef enum { + jsonSuccess, + jsonFileError, + jsonUnknownStateError, + 
jsonEmptyStateError, + jsonExpectedNonNoneStateError, + jsonStringOverflowError, + jsonStringBadChar, + jsonMemoryError, + jsonLockError, +} jsonResult_t; + +const char *jsonErrorString(jsonResult_t res); + +typedef struct jsonFileOutput jsonFileOutput; + +jsonResult_t jsonLockOutput(jsonFileOutput *jfo); + +jsonResult_t jsonUnlockOutput(jsonFileOutput *jfo); + +jsonResult_t jsonInitFileOutput(jsonFileOutput **jfo, + const char *outfile); + +jsonResult_t jsonFinalizeFileOutput(jsonFileOutput *jfo); + +jsonResult_t jsonNewline(jsonFileOutput *jfo); +jsonResult_t jsonFlushOutput(jsonFileOutput *jfo); + +// Emit a key and separator. Santize the key. +// This is only acceptable if the top state is an object +// Emit a ',' separator of we aren't the first item. +jsonResult_t jsonKey(jsonFileOutput *jfo, const char *name); + +// Start an object +jsonResult_t jsonStartObject(jsonFileOutput *jfo); + +// Close an object +jsonResult_t jsonFinishObject(jsonFileOutput *jfo); + +// Start a list +jsonResult_t jsonStartList(jsonFileOutput *jfo); + +// Close a list +jsonResult_t jsonFinishList(jsonFileOutput *jfo); + +// Emit a null value +jsonResult_t jsonNull(jsonFileOutput *jfo); + +// Write a (sanititzed) string +jsonResult_t jsonStr(jsonFileOutput *jfo, const char *str); + +// Write a bool as "true" or "false" strings. +jsonResult_t jsonBool(jsonFileOutput *jfo, bool val); + +// Write an integer value +jsonResult_t jsonInt(jsonFileOutput *jfo, const int val); + +//Write an unsigned int value +jsonResult_t jsonUint32(jsonFileOutput *jfo, const uint32_t val); + +// Write an integer value +jsonResult_t jsonUint64(jsonFileOutput *jfo, const uint64_t val); + +// Write a size_t value +jsonResult_t jsonSize_t(jsonFileOutput *jfo, const size_t val); + +// Write a double value +jsonResult_t jsonDouble(jsonFileOutput *jfo, const double val); diff --git a/ext-profiler/inspector/nccl/common.h b/ext-profiler/inspector/nccl/common.h new file mode 100644 index 000000000..f8ab7e9e6 --- /dev/null +++ b/ext-profiler/inspector/nccl/common.h @@ -0,0 +1,73 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COMMON_H_ +#define COMMON_H_ + +/* typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; */ +/* typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_BOOTSTRAP=4096, NCCL_REG=8192, NCCL_ALL=~0} ncclDebugLogSubSys; */ + +/* Data types */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclBfloat16 = 9, + ncclFloat8e4m3 = 10, + ncclFloat8e5m2 = 11, + ncclNumTypes = 12 +} ncclDataType_t; + +typedef enum { + NCCL_LOG_NONE = 0, + NCCL_LOG_VERSION = 1, + NCCL_LOG_WARN = 2, + NCCL_LOG_INFO = 3, + NCCL_LOG_ABORT = 4, + NCCL_LOG_TRACE = 5 +} ncclDebugLogLevel; + +typedef enum { ncclSuccess = 0, + ncclUnhandledCudaError = 1, + ncclSystemError = 2, + ncclInternalError = 3, + ncclInvalidArgument = 4, + ncclInvalidUsage = 5, + ncclRemoteError = 6, + ncclInProgress = 7, + ncclNumResults = 8 } ncclResult_t; + + +typedef enum { + NCCL_INIT = 0x1, + NCCL_COLL = 0x2, + NCCL_P2P = 0x4, + NCCL_SHM = 0x8, + NCCL_NET = 0x10, + NCCL_GRAPH = 0x20, + NCCL_TUNING = 0x40, + NCCL_ENV = 0x80, + NCCL_ALLOC = 0x100, + NCCL_CALL = 0x200, + NCCL_PROXY = 0x400, + NCCL_NVLS = 0x800, + NCCL_BOOTSTRAP = 0x1000, + NCCL_REG = 0x2000, + NCCL_PROFILE = 0x4000, + NCCL_RAS = 0x8000, + NCCL_INSPECTOR = 0x100000, // big number to avoid short-term conflicts + NCCL_ALL = ~0 +} ncclDebugLogSubSys; + + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#endif diff --git a/ext-profiler/inspector/nccl/profiler.h b/ext-profiler/inspector/nccl/profiler.h new file mode 100644 index 000000000..715885f72 --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler.h @@ -0,0 +1,85 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_H_ +#define PROFILER_H_ + +#include +#include + +#include "common.h" + +enum { + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroupApi = (1 << 8), // Group API events + ncclProfileCollApi = (1 << 9), // Collective API events + ncclProfileP2pApi = (1 << 10), // Point-to-Point API events + ncclProfileKernelLaunch = (1 << 11), // Kernel launch events +}; + +typedef enum { + ncclProfilerProxyOpSendPosted = 0, // deprecated in v4 + ncclProfilerProxyOpSendRemFifoWait = 1, // deprecated in v4 + ncclProfilerProxyOpSendTransmitted = 2, // deprecated in v4 + ncclProfilerProxyOpSendDone = 3, // deprecated in v4 + ncclProfilerProxyOpRecvPosted = 4, // deprecated in v4 + ncclProfilerProxyOpRecvReceived = 5, // deprecated in v4 + ncclProfilerProxyOpRecvTransmitted = 6, // deprecated in v4 + ncclProfilerProxyOpRecvDone = 7, // deprecated in v4 + ncclProfilerProxyOpInProgress_v4 = 19, + + /* Legacy proxy profiler states */ + ncclProfilerProxyStepSendGPUWait = 8, + ncclProfilerProxyStepSendPeerWait_v4 = 20, + ncclProfilerProxyStepSendWait = 9, + ncclProfilerProxyStepRecvWait = 10, + ncclProfilerProxyStepRecvFlushWait = 11, + ncclProfilerProxyStepRecvGPUWait = 12, + + /* Legacy proxy control states */ + ncclProfilerProxyCtrlIdle = 13, + ncclProfilerProxyCtrlActive = 14, + ncclProfilerProxyCtrlSleep = 15, + ncclProfilerProxyCtrlWakeup = 16, + ncclProfilerProxyCtrlAppend = 17, + ncclProfilerProxyCtrlAppendEnd = 18, + + /* Network defined events states */ + ncclProfilerNetPluginUpdate = 21, + + /* Kernel event states */ + ncclProfilerKernelChStop = 22, + + /* Group API States */ + ncclProfilerEndGroupApiStart = 23, + ncclProfilerBeginGroupApiEnd = 24 +} ncclProfilerEventState_t; + +typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t; + +#include "profiler_v5.h" +#include "profiler_v4.h" +#include "profiler_v3.h" +#include "profiler_v2.h" +#include "profiler_v1.h" +#include "profiler_net.h" + +typedef ncclProfiler_v5_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t; + +#endif // end include guard diff --git a/ext-profiler/inspector/nccl/profiler_net.h b/ext-profiler/inspector/nccl/profiler_net.h new file mode 100644 index 000000000..4f0a4182c --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_net.h @@ -0,0 +1,19 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_NET_H_ +#define PROFILER_NET_H_ + +#define NCCL_PROFILER_NET_VER_BITS (16) +#define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) +#define NCCL_PROFILER_NET_TYPE_MASK (~0U << NCCL_PROFILER_NET_VER_BITS) + +typedef enum { + NCCL_PROFILER_NET_TYPE_IB = (1U << NCCL_PROFILER_NET_VER_BITS), + NCCL_PROFILER_NET_TYPE_SOCK = (2U << NCCL_PROFILER_NET_VER_BITS), +} ncclProfilerNetType; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v1.h b/ext-profiler/inspector/nccl/profiler_v1.h new file mode 100644 index 000000000..9abcea76d --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v1.h @@ -0,0 +1,112 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V1_H_ +#define PROFILER_V1_H_ + +#include +#include +#include + + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + uint8_t func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + uint8_t datatype; + uint32_t op; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + uint8_t algo; + uint8_t proto; + int isCollnet; + int isNvls; + } coll; + + struct { + const char* name; + uint64_t commHash; + uint8_t func; + void* buff; + uint8_t datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v1_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v1_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v1_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState 
: event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v1_t eState, ncclProfilerEventStateArgs_v1_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v1_t; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v2.h b/ext-profiler/inspector/nccl/profiler_v2.h new file mode 100644 index 000000000..6a2699b58 --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v2.h @@ -0,0 +1,108 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V2_H_ +#define PROFILER_V2_H_ + +#include +#include +#include + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + size_t trafficBytes; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + }; +} ncclProfilerEventDescr_v2_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v2_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v2_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v2_t eState, ncclProfilerEventStateArgs_v2_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque 
profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v2_t; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v3.h b/ext-profiler/inspector/nccl/profiler_v3.h new file mode 100644 index 000000000..d4def08e1 --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v3.h @@ -0,0 +1,116 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V3_H_ +#define PROFILER_V3_H_ + +#include +#include +#include + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + const char* name; + uint64_t commHash; + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nMaxChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* name; + uint64_t commHash; + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v3_t; + +typedef union { + struct { + size_t transSize; + int steps; + } proxyOp; + + struct { + int appendedProxyOps; + } proxyCtrl; +} ncclProfilerEventStateArgs_v3_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v3_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v3_t eState, ncclProfilerEventStateArgs_v3_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v3_t; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v4.h 
b/ext-profiler/inspector/nccl/profiler_v4.h new file mode 100644 index 000000000..75f00548d --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v4.h @@ -0,0 +1,127 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V4_H_ +#define PROFILER_V4_H_ + +#include +#include +#include + +typedef struct { + uint8_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v4_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v4_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commName : user assigned communicator name + // - commHash : communicator id + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v4_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v4_t eState, 
ncclProfilerEventStateArgs_v4_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v4_t; + +#endif diff --git a/ext-profiler/inspector/nccl/profiler_v5.h b/ext-profiler/inspector/nccl/profiler_v5.h new file mode 100644 index 000000000..dab1db9e1 --- /dev/null +++ b/ext-profiler/inspector/nccl/profiler_v5.h @@ -0,0 +1,151 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V5_H_ +#define PROFILER_V5_H_ + +typedef struct { + uint64_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + bool graphCaptured; + int groupDepth; + } groupApi; + + struct { + const char* func; + size_t count; + const char* datatype; + int root; + void* stream; + bool graphCaptured; + } collApi; + + struct { + const char* func; + size_t count; + const char* datatype; + void* stream; + bool graphCaptured; + } p2pApi; + + struct { + void* stream; + } kernelLaunch; + + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + void* parentGroup; // for backward compatibility with v4 + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + void* parentGroup; // for backward compatibility with v4 + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v5_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v5_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commId : communicator id + // - commName : user assigned communicator name + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + 
// - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v5_t; + +#endif diff --git a/ext-profiler/inspector/nccl/types.h b/ext-profiler/inspector/nccl/types.h new file mode 100644 index 000000000..f43fdc163 --- /dev/null +++ b/ext-profiler/inspector/nccl/types.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NCCL_TYPES_H_ +#define NCCL_TYPES_H_ + +/* Data types */ +typedef enum { ncclInt8 = 0, ncclChar = 0, + ncclUint8 = 1, + ncclInt32 = 2, ncclInt = 2, + ncclUint32 = 3, + ncclInt64 = 4, + ncclUint64 = 5, + ncclFloat16 = 6, ncclHalf = 6, + ncclFloat32 = 7, ncclFloat = 7, + ncclFloat64 = 8, ncclDouble = 8, + ncclBfloat16 = 9, +} ncclDataType_t; + +#endif diff --git a/ext-profiler/inspector/version.h b/ext-profiler/inspector/version.h new file mode 100644 index 000000000..347757dfc --- /dev/null +++ b/ext-profiler/inspector/version.h @@ -0,0 +1,12 @@ +#ifndef VERSION_H +#define VERSION_H + +#ifdef __cplusplus +extern "C" { +#endif +const char* get_git_version_info(); +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // VERSION_H diff --git a/ext-tuner/README.md b/ext-tuner/README.md index 67a743a12..7595e03ba 100644 --- a/ext-tuner/README.md +++ b/ext-tuner/README.md @@ -179,4 +179,4 @@ When developing new tuner plugins: - [NCCL Documentation](https://docs.nvidia.com/deeplearning/nccl/) - Example plugin implementations in this directory -For questions and support, refer to the NCCL community resources and documentation. \ No newline at end of file +For questions and support, refer to the NCCL community resources and documentation. 
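For reference, here is a minimal sketch of a v5 profiler plugin built against the nccl/profiler.h headers added above. The context and event structs, the chosen activation mask, and the "ExampleSkeleton" name are illustrative assumptions, not part of this patch; the exported symbol name ncclProfiler_v5 and the callback signatures follow the ncclProfiler_v5_t definition shown earlier. The leading uint64_t type field on the event handle mirrors the convention the inspector structs use so that a parent object's type can be read through eDescr->parentObj.

```c
// Minimal sketch of a v5 profiler plugin (assumed file layout: built next to the
// nccl/profiler.h headers added by this patch). Bookkeeping here is illustrative.
#include <stdlib.h>
#include <stdint.h>
#include "profiler.h"

struct exampleCtx   { uint64_t commId; int rank; };
struct exampleEvent { uint64_t type; };  // leading type tag, same convention as the inspector structs

static ncclResult_t exampleInit(void** context, uint64_t commId, int* eActivationMask,
                                const char* commName, int nNodes, int nranks, int rank,
                                ncclDebugLogger_t logfn) {
  struct exampleCtx* ctx = (struct exampleCtx*)calloc(1, sizeof(*ctx));
  if (ctx == NULL) return ncclSystemError;
  ctx->commId = commId; ctx->rank = rank;
  // Only ask NCCL for the event types this plugin actually handles.
  *eActivationMask = ncclProfileColl | ncclProfileKernelCh;
  *context = ctx;
  (void)commName; (void)nNodes; (void)nranks; (void)logfn;
  return ncclSuccess;
}

static ncclResult_t exampleStartEvent(void* context, void** eHandle,
                                      ncclProfilerEventDescr_v5_t* eDescr) {
  *eHandle = NULL;
  if (eDescr->type != ncclProfileColl) return ncclSuccess;  // ignore other event types
  struct exampleEvent* ev = (struct exampleEvent*)calloc(1, sizeof(*ev));
  if (ev == NULL) return ncclSuccess;                       // never fail the collective for profiling
  ev->type = eDescr->type;                                  // first field doubles as the type tag
  *eHandle = ev;
  (void)context;
  return ncclSuccess;
}

static ncclResult_t exampleStopEvent(void* eHandle) {
  free(eHandle);
  return ncclSuccess;
}

static ncclResult_t exampleRecordEventState(void* eHandle, ncclProfilerEventState_v5_t eState,
                                            ncclProfilerEventStateArgs_v5_t* eStateArgs) {
  (void)eHandle; (void)eState; (void)eStateArgs;
  return ncclSuccess;
}

static ncclResult_t exampleFinalize(void* context) {
  free(context);
  return ncclSuccess;
}

ncclProfiler_v5_t ncclProfiler_v5 = {
  "ExampleSkeleton",
  exampleInit, exampleStartEvent, exampleStopEvent,
  exampleRecordEventState, exampleFinalize,
};
```

NCCL resolves the plugin through the ncclProfiler_v5 symbol, so the struct field order must match ncclProfiler_v5_t exactly, as in the inspector's own ncclProfiler_v5 definition above.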
diff --git a/ext-tuner/example/.gitignore b/ext-tuner/example/.gitignore new file mode 100644 index 000000000..a3d6f635f --- /dev/null +++ b/ext-tuner/example/.gitignore @@ -0,0 +1,49 @@ +# Compiled shared objects and binaries +*.so +*.o +*.a +*.out +*.exe +*.dll +*.dylib +*.bin +*.elf + +# Python cache +__pycache__/ +*.pyc +*.pyo + +# Build and test artifacts +/build/ +*.log +*.tmp +*.swp + +# Ignore all CSV files except scripts/sample_performance_data.csv +*.csv +!scripts/sample_performance_data.csv + +# Ignore all .conf files except nccl_tuner.conf +*.conf +!nccl_tuner.conf + +my_configs + +# Ignore test binary +test/test_plugin + +# Editor/OS files +.DS_Store +Thumbs.db + +# Backup files +*~ +*.bak + +# Ignore by convention +*.old +*.orig + +# Git +.git/ diff --git a/ext-tuner/example/CMakeLists.txt b/ext-tuner/example/CMakeLists.txt new file mode 100644 index 000000000..1c116b446 --- /dev/null +++ b/ext-tuner/example/CMakeLists.txt @@ -0,0 +1,26 @@ +# Find all C source files in current directory +set(SRC_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/plugin.c +) + +# Create shared library +add_library(nccl-tuner-example SHARED ${SRC_FILES}) + +# Set include directories +target_include_directories(nccl-tuner-example PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/nccl +) + +# Set output name to match Makefile +set_target_properties(nccl-tuner-example PROPERTIES + OUTPUT_NAME "nccl-tuner-example" + PREFIX "lib" + POSITION_INDEPENDENT_CODE ON + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/test/unit/plugins +) + +# Add custom target for clean (equivalent to Makefile clean target) +add_custom_target(clean-tuner-lib + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/libnccl-tuner-example.so + COMMENT "Cleaning libnccl-tuner-example.so" +) diff --git a/ext-tuner/example/nccl/tuner.h b/ext-tuner/example/nccl/tuner.h index 77b543d12..dc956b1c0 100644 --- a/ext-tuner/example/nccl/tuner.h +++ b/ext-tuner/example/nccl/tuner.h @@ -45,6 +45,40 @@ typedef enum { #define NCCL_ALGO_PROTO_IGNORE -1.0 +#define NCCL_HW_NVLINK 0 +#define NCCL_HW_PCI 1 +#define NCCL_HW_NET 2 +#define NCCL_NUM_HW_LINKS 3 + +#define NCCL_VOLTA_COMPCAP_IDX 0 +#define NCCL_AMPERE_COMPCAP_IDX 1 +#define NCCL_HOPPER_COMPCAP_IDX 2 +#define NCCL_BLACKWELL_COMPCAP_IDX 3 +#define NCCL_NUM_COMPCAPS 4 + +#define NCCL_TUNING_SCALE_1NODE 0 +#define NCCL_TUNING_SCALE_2NODES 1 +#define NCCL_TUNING_SCALE_4NODES 2 +#define NCCL_NUM_TUNING_SCALES 3 + +typedef struct { + int nNvlDomains; // number of NVLink domains + int minRanksPerNvlDomain; // minimum ranks across all NVLink domains + int maxRanksPerNvlDomain; // maximum ranks across all NVLink domains +} ncclNvlDomainInfo_v5_t; + +typedef struct { + double baseLatencies [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + double hwLatencies [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + + double llMaxBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]; + double perChMaxRingLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]; + double perChMaxTreeLL128Bws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]; + double perChMaxTreeBws [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES]; + + +} ncclTunerConstants_v5_t; + // API to be implemented by external tuner typedef struct { // Name of the tuner @@ -52,12 +86,17 @@ typedef struct { // Initializes tuner states. // Inputs: + // - commId: communicator identifier // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. // - nNodes: number of nodes in current communicator. 
// - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // - nvlDomainInfo: NVL domain information struct // Outputs: // - context: tuner context object - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context); + // Input/Output: + // - constants: tuner constants + ncclResult_t (*init)(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants); // Gets info (algo, protocol, number of ctas and threads) for a given collective. // Inputs: @@ -87,11 +126,13 @@ typedef struct { // Terminates the plugin and cleans up any resources that the plugin allocated. // context: tuner context object - ncclResult_t (*destroy)(void* context); -} ncclTuner_v4_t; + ncclResult_t (*finalize)(void* context); +} ncclTuner_v5_t; -typedef ncclTuner_v4_t ncclTuner_t; +typedef ncclTuner_v5_t ncclTuner_t; +typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t; +typedef ncclTunerConstants_v5_t ncclTunerConstants_t; -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v5" #endif diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index 1b8031ed1..af813495a 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -51,6 +51,7 @@ typedef struct { size_t nRanks; size_t nNodes; ncclDebugLogger_t logFunction; + ncclNvlDomainInfo_v5_t nvlDomainInfo; } TunerContext; // Parse collective type from string @@ -289,7 +290,25 @@ static ncclResult_t loadConfig(TunerContext* ctx, const char* filename) { return ncclSuccess; } -__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) { +__hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants) { + + if (NULL != constants) { + // NCCL constants tuning + // Tune NCCL's internal tuning model to improve base algo/proto selection. + // Note: Example numbers are for reference only. + // Actual numbers may vary depending on the hardware and network topology. + // These numbers are not guaranteed to be optimal for all cases. 
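+ // Indexing convention (see nccl/tuner.h above): the bandwidth tables are
+ // [compute capability index][tuning scale], e.g. NCCL_BLACKWELL_COMPCAP_IDX and
+ // NCCL_TUNING_SCALE_4NODES, while hwLatencies is [hw link][algorithm][protocol].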
+ // Limit the tree bandwidth to 15GB/s + constants->perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 15.0; + + // Limit the ring bandwidth to 20GB/s + constants->perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] = 20.0; + + // Set NVLSTree base network latency to 24us + constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = 24.0; + } + TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext)); if (!ctx) return ncclSystemError; @@ -299,10 +318,16 @@ __hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t ctx->nRanks = nRanks; ctx->nNodes = nNodes; ctx->logFunction = logFunction; + if (nvlDomainInfo) { + ctx->nvlDomainInfo = *nvlDomainInfo; + } else { + memset(&ctx->nvlDomainInfo, 0, sizeof(ncclNvlDomainInfo_v5_t)); + } if (logFunction) { logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__, - "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks", nNodes, nRanks); + "TUNER/ExamplePlugin: Initializing tuner for %zu nodes, %zu ranks, %d NVL domains", + nNodes, nRanks, ctx->nvlDomainInfo.nNvlDomains); } // Try to load config file from environment variable or default location @@ -432,7 +457,7 @@ __hidden ncclResult_t pluginGetCollInfo(void* context, ncclFunc_t collType, size return ncclSuccess; } -__hidden ncclResult_t pluginDestroy(void* context) { +__hidden ncclResult_t pluginFinalize(void* context) { if (context) { TunerContext* ctx = (TunerContext*)context; if (ctx->configs) { @@ -443,11 +468,12 @@ __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; } + #define PLUGIN_NAME "Example" -const ncclTuner_v4_t ncclTunerPlugin_v4 = { +const ncclTuner_v5_t ncclTunerPlugin_v5 = { .name = PLUGIN_NAME, .init = pluginInit, .getCollInfo = pluginGetCollInfo, - .destroy = pluginDestroy + .finalize = pluginFinalize }; diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c index 28897c449..c0300d51c 100644 --- a/ext-tuner/example/test/test_plugin.c +++ b/ext-tuner/example/test/test_plugin.c @@ -97,12 +97,12 @@ int test_plugin_init() { void* context = NULL; // Test successful initialization - ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed"); TEST_ASSERT(context != NULL, "Context should be allocated"); // Clean up - pluginDestroy(context); + pluginFinalize(context); TEST_PASS(); } @@ -122,11 +122,11 @@ int test_config_parsing_valid() { setenv("NCCL_TUNER_CONFIG_FILE", "test_valid.conf", 1); void* context = NULL; - ncclResult_t result = pluginInit(16, 2, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 16, 2, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin init with valid config should succeed"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_valid.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -143,12 +143,12 @@ int test_config_parsing_invalid() { setenv("NCCL_TUNER_CONFIG_FILE", "test_invalid.conf", 1); void* context = NULL; - ncclResult_t result = pluginInit(8, 1, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); // Should still succeed but with no valid configs loaded TEST_ASSERT(result == ncclSuccess, "Plugin init should succeed even with invalid config"); // Clean up - pluginDestroy(context); + pluginFinalize(context); 
unlink("test_invalid.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -164,7 +164,7 @@ int test_collective_matching() { setenv("NCCL_TUNER_CONFIG_FILE", "test_match.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); // Create mock cost table float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; @@ -208,7 +208,7 @@ int test_collective_matching() { TEST_ASSERT(nChannels == 4, "Should set 4 channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_match.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -225,7 +225,7 @@ int test_size_matching() { setenv("NCCL_TUNER_CONFIG_FILE", "test_size.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -279,7 +279,7 @@ int test_size_matching() { TEST_ASSERT(nChannels == 8, "Large: Should set 8 channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_size.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -297,7 +297,7 @@ int test_topology_matching() { // Test with single node setup void* context1 = NULL; - pluginInit(8, 1, mock_logger, &context1); // 8 ranks, 1 node + pluginInit(&context1, 0, 8, 1, mock_logger, NULL, NULL); // 8 ranks, 1 node float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -315,11 +315,11 @@ int test_topology_matching() { TEST_ASSERT(cost_table[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] == 0.0, "Single node: Should match tree config"); TEST_ASSERT(nChannels == 2, "Single node: Should set 2 channels"); - pluginDestroy(context1); + pluginFinalize(context1); // Test with 4 nodes, 32 ranks setup void* context2 = NULL; - pluginInit(32, 4, mock_logger, &context2); // 32 ranks, 4 nodes + pluginInit(&context2, 0, 32, 4, mock_logger, NULL, NULL); // 32 ranks, 4 nodes for (int i = 0; i < NCCL_NUM_ALGORITHMS; i++) { for (int j = 0; j < NCCL_NUM_PROTOCOLS; j++) { @@ -348,7 +348,7 @@ int test_default_channels() { setenv("NCCL_TUNER_CONFIG_FILE", "test_default.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -368,7 +368,7 @@ int test_default_channels() { TEST_ASSERT(nChannels == 1, "Should keep default channels (1) when config has -1"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_default.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -385,7 +385,7 @@ int test_regbuff_matching() { setenv("NCCL_TUNER_CONFIG_FILE", "test_regbuff.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -436,7 +436,7 @@ int test_regbuff_matching() { TEST_ASSERT(nChannels == 8, "Any regBuff: Should set 8 channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_regbuff.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -453,7 +453,7 @@ int test_pipeops_matching() { setenv("NCCL_TUNER_CONFIG_FILE", "test_pipeops.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 
0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -503,7 +503,7 @@ int test_pipeops_matching() { TEST_ASSERT(nChannels == 8, "Any pipeOps: Should set 8 channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_pipeops.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -518,7 +518,7 @@ int test_no_match_fallback() { setenv("NCCL_TUNER_CONFIG_FILE", "test_fallback.conf", 1); void* context = NULL; - pluginInit(8, 1, mock_logger, &context); + pluginInit(&context, 0, 8, 1, mock_logger, NULL, NULL); float cost_table[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float* cost_table_ptr[NCCL_NUM_ALGORITHMS]; @@ -542,7 +542,7 @@ int test_no_match_fallback() { TEST_ASSERT(nChannels == 1, "Should use default channels"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink("test_fallback.conf"); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); @@ -592,7 +592,7 @@ int test_large_config() { // Initialize plugin with large config void* context = NULL; - ncclResult_t result = pluginInit(16, 4, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 16, 4, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin init with large config should succeed"); TEST_ASSERT(context != NULL, "Context should be allocated"); @@ -651,7 +651,7 @@ int test_large_config() { TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with large config set"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink(large_config_file); unsetenv("NCCL_TUNER_CONFIG_FILE"); @@ -683,7 +683,7 @@ int test_very_large_config_stress() { // Test initialization with stress config void* context = NULL; - ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin should handle very large config files"); TunerContext* ctx = (TunerContext*)context; @@ -704,7 +704,7 @@ int test_very_large_config_stress() { } // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink(stress_config_file); unsetenv("NCCL_TUNER_CONFIG_FILE"); @@ -725,7 +725,7 @@ int test_empty_config() { setenv("NCCL_TUNER_CONFIG_FILE", empty_config_file, 1); void* context = NULL; - ncclResult_t result = pluginInit(8, 2, mock_logger, &context); + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin should handle empty config files"); TunerContext* ctx = (TunerContext*)context; @@ -750,13 +750,134 @@ int test_empty_config() { TEST_ASSERT(result == ncclSuccess, "GetCollInfo should work with empty config"); // Clean up - pluginDestroy(context); + pluginFinalize(context); unlink(empty_config_file); unsetenv("NCCL_TUNER_CONFIG_FILE"); TEST_PASS(); } +// Test NVLink domain info handling +int test_nvl_domain_info() { + printf("Testing NVLink domain info handling...\n"); + + // Test NVLink domain structure with min/max ranks per domain + ncclNvlDomainInfo_v5_t nvl_domain = { + .nNvlDomains = 2, // 2 nodes = 2 domains + .minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck) + .maxRanksPerNvlDomain = 5 // maximum ranks across all domains (capacity) + }; + + void* context = NULL; + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL); + TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed"); + + // Validate NVLD info 
structure + TEST_ASSERT(nvl_domain.nNvlDomains == 2, "Should have 2 domains (nodes)"); + TEST_ASSERT(nvl_domain.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain"); + TEST_ASSERT(nvl_domain.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain"); + + // Clean up + pluginFinalize(context); + printf("NVLink domain info test passed!\n"); + TEST_PASS(); +} + +int test_tuner_constants() { + // Initialize constants to -1.0 for testing purposes + ncclTunerConstants_v5_t constants = { + // Base latencies: [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] + .baseLatencies = { + {-1.0, -1.0, -1.0}, // NCCL_ALGO_TREE: LL, LL128, Simple + {-1.0, -1.0, -1.0}, // NCCL_ALGO_RING: LL, LL128, Simple + {-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_DIRECT + {-1.0, -1.0, -1.0}, // NCCL_ALGO_COLLNET_CHAIN + {-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS + {-1.0, -1.0, -1.0}, // NCCL_ALGO_NVLS_TREE + {-1.0, -1.0, -1.0} // NCCL_ALGO_PAT + }, + + // Hardware latencies: [NCCL_NUM_HW_LINKS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] + .hwLatencies = { + // NCCL_HW_NVLINK + { + {-1.0, -1.0, -1.0}, // TREE + {-1.0, -1.0, -1.0}, // RING + {-1.0, -1.0, -1.0}, // COLLNET_DIRECT + {-1.0, -1.0, -1.0}, // COLLNET_CHAIN + {-1.0, -1.0, -1.0}, // NVLS + {-1.0, -1.0, -1.0}, // NVLS_TREE + {-1.0, -1.0, -1.0} // PAT + }, + // NCCL_HW_PCI + { + {-1.0, -1.0, -1.0}, // TREE + {-1.0, -1.0, -1.0}, // RING + {-1.0, -1.0, -1.0}, // COLLNET_DIRECT + {-1.0, -1.0, -1.0}, // COLLNET_CHAIN + {-1.0, -1.0, -1.0}, // NVLS + {-1.0, -1.0, -1.0}, // NVLS_TREE + {-1.0, -1.0, -1.0} // PAT + }, + // NCCL_HW_NET + { + {-1.0, -1.0, -1.0}, // TREE + {-1.0, -1.0, -1.0}, // RING + {-1.0, -1.0, -1.0}, // COLLNET_DIRECT + {-1.0, -1.0, -1.0}, // COLLNET_CHAIN + {-1.0, -1.0, -1.0}, // NVLS + {-1.0, -1.0, -1.0}, // NVLS_TREE + {-1.0, -1.0, -1.0} // PAT + } + }, + + // LL maximum bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES] + .llMaxBws = { + {-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes + }, + + // Per-channel maximum Ring LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES] + .perChMaxRingLL128Bws = { + {-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes + }, + + // Per-channel maximum Tree LL128 bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES] + .perChMaxTreeLL128Bws = { + {-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes + }, + + // Per-channel maximum Tree bandwidths: [NCCL_NUM_COMPCAPS][NCCL_NUM_TUNING_SCALES] + .perChMaxTreeBws = { + {-1.0, -1.0, -1.0}, // Volta: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Ampere: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0}, // Hopper: 1node, 2nodes, 4nodes + {-1.0, -1.0, -1.0} // Blackwell: 1node, 2nodes, 4nodes + } + }; + + void* context = NULL; + ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, NULL, &constants); + TEST_ASSERT(result == ncclSuccess, "Plugin init with constants should succeed"); + + // Test that the constants were set correctly + TEST_ASSERT(constants.perChMaxTreeBws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 15.0, "Tree bandwidth should 
be 15GB/s"); + TEST_ASSERT(constants.perChMaxRingLL128Bws[NCCL_BLACKWELL_COMPCAP_IDX][NCCL_TUNING_SCALE_4NODES] == 20.0, "Ring bandwidth should be 20GB/s"); + TEST_ASSERT(constants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] == 24.0, "NVLSTree base network latency should be 24us"); + + // Clean up + pluginFinalize(context); + TEST_PASS(); +} + // Test runner function pointer type typedef int (*TestFunction)(void); @@ -782,6 +903,8 @@ TestCase test_cases[] = { {"large-config", test_large_config, "Large configuration files (dynamic allocation)"}, {"stress-config", test_very_large_config_stress, "Very large configuration stress test"}, {"empty-config", test_empty_config, "Empty configuration file handling"}, + {"nvl-domain", test_nvl_domain_info, "NVL domain info handling"}, + {"constants", test_tuner_constants, "Tuner constants initialization"}, {NULL, NULL, NULL} // End marker }; @@ -825,6 +948,7 @@ int main(int argc, char* argv[]) { if (argc == 1) { // No arguments - run all tests for (int i = 0; test_cases[i].name != NULL; i++) { + printf("Running test: %s\n", test_cases[i].name); total++; passed += test_cases[i].func(); } diff --git a/makefiles/common.mk b/makefiles/common.mk index 0f01671b6..f8f455dec 100644 --- a/makefiles/common.mk +++ b/makefiles/common.mk @@ -32,13 +32,8 @@ CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2) # You should define NVCC_GENCODE in your environment to the minimal set # of archs to reduce compile time. -CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \ - -gencode=arch=compute_60,code=sm_60 \ +CUDA8_GENCODE = -gencode=arch=compute_60,code=sm_60 \ -gencode=arch=compute_61,code=sm_61 -ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0) -# SM35 is deprecated from CUDA12.0 onwards -CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35 -endif CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70 CUDA10_GENCODE = -gencode=arch=compute_75,code=sm_75 CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80 diff --git a/makefiles/version.mk b/makefiles/version.mk index 3b182d61b..d0e97c065 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 27 -NCCL_PATCH := 7 +NCCL_MINOR := 28 +NCCL_PATCH := 3 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/pkg/Makefile b/pkg/Makefile index ab6487be9..cffd5d76f 100644 --- a/pkg/Makefile +++ b/pkg/Makefile @@ -10,7 +10,7 @@ build : debian.build txz.build BUILDDIR ?= $(abspath ../build) ABSBUILDDIR := $(abspath $(BUILDDIR)) -TARGETS := debian txz +TARGETS := debian txz doc all: ${TARGETS:%=%.build} prep: ${TARGETS:%=%.prep} build: ${TARGETS:%=%.build} diff --git a/pkg/debian/libnccl-dev.install.in b/pkg/debian/libnccl-dev.install.in index 45120e6de..b656e63ab 100644 --- a/pkg/debian/libnccl-dev.install.in +++ b/pkg/debian/libnccl-dev.install.in @@ -1,4 +1,4 @@ bin/ncclras /usr/bin -include/nccl.h /usr/include +include/* /usr/include lib/libnccl.so /usr/lib/${pkg:MultiArch} lib/libnccl_static.a /usr/lib/${pkg:MultiArch} diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in index d62955592..30ddbf19f 100644 --- a/pkg/redhat/nccl.spec.in +++ b/pkg/redhat/nccl.spec.in @@ -47,8 +47,8 @@ ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_li # devel install -m 755 -d $RPM_BUILD_ROOT/%{_bindir} install -m 755 -d $RPM_BUILD_ROOT/%{_includedir} +cp -a include/* $RPM_BUILD_ROOT/%{_includedir}/ install -m 755 bin/ncclras $RPM_BUILD_ROOT/%{_bindir} -install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir} ln -s 
libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so # static @@ -67,7 +67,7 @@ rm -rf $RPM_BUILD_ROOT %doc LICENSE.txt %defattr(-,root,root,-) %{_bindir}/ncclras -%{_includedir}/nccl.h +%{_includedir}/* %{_libdir}/libnccl.so %files static diff --git a/pkg/srctxz/Makefile b/pkg/srctxz/Makefile index 01cab95a4..a8d9e0da9 100644 --- a/pkg/srctxz/Makefile +++ b/pkg/srctxz/Makefile @@ -22,7 +22,7 @@ prep: $(TXZTARGETS) build: prep $(MAKE) -C ../../src clean @printf "Building source tar.xz package\n" - (cd $(BUILDDIR); bash srctxz/create_srctxz.sh) + (cd $(BUILDDIR); SRCTXZ_APITESTS=$(SRCTXZ_APITESTS) bash srctxz/create_srctxz.sh) mkdir -p $(PKGDIR) mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR) diff --git a/pkg/srctxz/create_srctxz.sh.in b/pkg/srctxz/create_srctxz.sh.in index 11bdd52db..0e627dd25 100644 --- a/pkg/srctxz/create_srctxz.sh.in +++ b/pkg/srctxz/create_srctxz.sh.in @@ -28,8 +28,34 @@ NCCL_SUFFIX=${nccl:Suffix} NCCL_BUILD=${pkg:Revision} NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" +if [ "${SRCTXZ_APITESTS}" = "1" ]; then + NCCLNAME+="-apitest" +fi + + +INCLUDE_TEST_ENTRIES=("apitest" "googletest" "gtest.mk") + +if [ "${SRCTXZ_APITESTS}" = "1" ]; then + # Exclude all entries inside test folder except those in INCLUDE_TEST_ENTRIES + for entry in $(ls $NCCLDIR/test); do + if [[ ! " ${INCLUDE_TEST_ENTRIES[@]} " =~ " $entry " ]]; then + EXCLUDE_TEST+=" --exclude $NCCLDIR/test/$entry" + fi + done +else + # Exclude the entire test directory + EXCLUDE_TEST+=" --exclude test" +fi -tar --exclude build \ +tar --exclude fortran \ + --exclude doc \ + --exclude plc \ + --exclude build \ --exclude ".git*" \ + --exclude share \ + --exclude ompi \ + --exclude ext-net \ --exclude pkg/srctxz \ + --exclude docker \ + $EXCLUDE_TEST \ --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf $NCCLNAME.txz --owner=0 --group=0 $NCCLDIR diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 000000000..5ab69dc92 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,180 @@ +# Source files +set(LIBSRCFILES + bootstrap.cc + channel.cc + ce_coll.cc + collectives.cc + debug.cc + enqueue.cc + group.cc + init.cc + init_nvtx.cc + proxy.cc + transport.cc + mnnvl.cc + allocator.cc + sym_kernels.cc + dev_runtime.cc +) + +# Add compatibility shim if using static cudart +if(CUDARTLIB STREQUAL "cudart_static") + list(APPEND LIBSRCFILES enhcompat.cc) +endif() + +# Configure pkg-config file +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/nccl.pc.in + ${CMAKE_BINARY_DIR}/lib/pkgconfig/nccl.pc + @ONLY +) + +# Add files from subdirectories +add_subdirectory(transport) +add_subdirectory(misc) +add_subdirectory(register) +add_subdirectory(graph) +add_subdirectory(plugin) +add_subdirectory(device) +add_subdirectory(nccl_device) +add_subdirectory(ras) +add_subdirectory(scheduler) + +add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=) + +# Add all source files +list(APPEND LIBSRCFILES + ${TRANSPORT_SOURCES} + ${MISC_SOURCES} + ${REGISTER_SOURCES} + ${GRAPH_SOURCES} + ${PLUGIN_SOURCES} + ${RAS_SOURCES} + ${SYM_SOURCES} + ${SCHEDULER_SOURCES} +) + +###################### Create a shared NCCL library ############################ +add_library(nccl SHARED) + +target_sources(nccl PRIVATE ${LIBSRCFILES}) + +# Include directories +target_include_directories(nccl PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/device + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS}/cccl +) + 
+add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/include/nccl.h + COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/include + COMMAND sed -e "s/\\\$$\\{nccl:Major\\}/${NCCL_MAJOR}/g" + -e "s/\\\$$\\{nccl:Minor\\}/${NCCL_MINOR}/g" + -e "s/\\\$$\\{nccl:Patch\\}/${NCCL_PATCH}/g" + -e "s/\\\$$\\{nccl:Suffix\\}/${NCCL_SUFFIX}/g" + -e "s/\\\$$\\{nccl:Version\\}/${NCCL_VERSION_CODE}/g" + ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in > ${CMAKE_BINARY_DIR}/include/nccl.h + BYPRODUCTS ${CMAKE_BINARY_DIR}/include/nccl.h +) + +add_custom_target(nccl_header DEPENDS ${CMAKE_BINARY_DIR}/include/nccl.h) + +add_dependencies(nccl nccl_header) + +# Set version and output name +set_target_properties(nccl PROPERTIES + VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH} + SOVERSION ${NCCL_MAJOR} + OUTPUT_NAME "nccl" + PREFIX "lib" +) + +# Set CUDA specific flags +set_target_properties(nccl PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" + POSITION_INDEPENDENT_CODE ON +) + +# Link libraries +target_link_libraries(nccl + PRIVATE + nccl_device + pthread + rt + dl + ${CUDAToolkit_LIBRARIES} + ${EXTRA_LIBS} +) + +# Set output directories for nccl shared library +set_target_properties(nccl PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) + +###################### Create a ras binary executable ############################ +set(RAS_BINSRCFILES ras/client.cc) + +add_executable(ncclras ${RAS_BINSRCFILES}) + +target_include_directories(ncclras PUBLIC + ${CMAKE_BINARY_DIR}/include + ${CUDAToolkit_INCLUDE_DIRS} +) + +add_dependencies(ncclras nccl_header) + +target_link_libraries(ncclras + PRIVATE + pthread + rt + dl +) + +# Set output directory for ncclras executable +set_target_properties(ncclras PROPERTIES + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" +) + +###################### Create a static NCCL library ############################ +add_library(nccl_static STATIC ${LIBSRCFILES}) + +# Include directories +target_include_directories(nccl_static PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/device + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS}/cccl +) + +# Add dependency on nccl_header +add_dependencies(nccl_static nccl_header) + +# Link libraries +target_link_libraries(nccl_static + PRIVATE + nccl_device + pthread + rt + dl + ${CUDAToolkit_LIBRARIES} + ${EXTRA_LIBS} +) + +# Set CUDA specific flags +set_target_properties(nccl_static PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" + POSITION_INDEPENDENT_CODE ON +) + +# Set output directory for nccl_static library +set_target_properties(nccl_static PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" +) diff --git a/src/Makefile b/src/Makefile index eab662ef9..be026cc26 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,10 +7,12 @@ include ../makefiles/common.mk include ../makefiles/version.mk ##### src files -INCEXPORTS := nccl.h +INCEXPORTS := nccl.h nccl_device.h \ + $(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h)) + LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ - init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc symmetric.cc \ + init.cc init_nvtx.cc proxy.cc transport.cc mnnvl.cc allocator.cc dev_runtime.cc sym_kernels.cc ce_coll.cc \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard 
transport/*.cc) \ @@ -19,6 +21,8 @@ LIBSRCFILES := \ $(wildcard plugin/net/*.cc) \ $(wildcard plugin/tuner/*.cc) \ $(wildcard plugin/profiler/*.cc) \ + $(wildcard nccl_device/*.cc) \ + $(wildcard scheduler/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -123,6 +127,16 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h mkdir -p $(INCDIR) install -m 644 $< $@ +$(INCDIR)/nccl_device/%.h: include/nccl_device/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device + install -m 644 $< $@ + +$(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/impl + install -m 644 $< $@ + $(PKGDIR)/%.pc : %.pc @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(PKGDIR) @@ -149,7 +163,7 @@ install : build mkdir -p $(PREFIX)/bin cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ - cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ + cp -v -r $(BUILDDIR)/include/* $(PREFIX)/include/ cp -v $(BUILDDIR)/bin/ncclras $(PREFIX)/bin/ FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h') diff --git a/src/allocator.cc b/src/allocator.cc index c58181948..f5638b92d 100644 --- a/src/allocator.cc +++ b/src/allocator.cc @@ -7,10 +7,11 @@ #include "comm.h" #include "transport.h" #include "group.h" +#include "nvtx.h" NCCL_API(ncclResult_t, ncclMemAlloc, void **ptr, size_t size); ncclResult_t ncclMemAlloc(void **ptr, size_t size) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; ncclResult_t ret = ncclSuccess; #if CUDART_VERSION >= 12010 @@ -98,7 +99,7 @@ ncclResult_t ncclMemAlloc(void **ptr, size_t size) { NCCL_API(ncclResult_t, ncclMemFree, void *ptr); ncclResult_t ncclMemFree(void *ptr) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; ncclResult_t ret = ncclSuccess; int saveDevice; @@ -127,70 +128,339 @@ ncclResult_t ncclMemFree(void *ptr) { goto exit; } -// This is a collective function and should be called by all ranks in the communicator -ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr) { - ncclResult_t ret = ncclSuccess; - void* regSymAddr = NULL; - size_t allocSize = size; - size_t granularity; - CUdevice cuDev; - CUmemAllocationProp memprop = {}; - CUmemGenericAllocationHandle memHandle; - int bit = 0, cnt = 0; - - // aligment must be power of 2 as an input - while (bit < sizeof(size_t) * 8) { - if (alignment & (1L << bit)) cnt++; - if (cnt == 2) { - WARN("rank %d alignment %ld is not power of 2", comm->rank, alignment); - goto fail; +//////////////////////////////////////////////////////////////////////////////// +// ncclSpace: +// +// This datastructure "cuts" the line of non-negative integers into segments +// which alternate between "full" (allocated) and "empty" (not allocated). The +// cuts are sorted ascending. The segment after the last cut must be empty +// (the unallocated frontier). Knwoing this we can deduce whether the segment +// ending at cut[i] is full or empty with this formula: +// isFull(i) = (i%2 != ncuts%2) + +void ncclSpaceConstruct(struct ncclSpace* a) { + memset(a, 0, sizeof(*a)); +} + +void ncclSpaceDestruct(struct ncclSpace* a) { + free(a->cuts); +} + +static void insertSegment(struct ncclSpace* a, int index, int64_t lo, int64_t hi) { + // Insert space for two cuts in `a->cuts[]` before `index`. 
+ if (a->count + 2 > a->capacity) { + a->capacity *= 2; + if (a->capacity == 0) a->capacity = 16; + int64_t* cuts1 = (int64_t*)malloc(a->capacity*sizeof(int64_t)); + for (int i=0; i < index; i++) cuts1[i] = a->cuts[i]; + for (int i=index; i < a->count; i++) cuts1[i+2] = a->cuts[i]; + free(a->cuts); + a->cuts = cuts1; + } else { + for (int i=a->count-1; index <= i; i--) a->cuts[i+2] = a->cuts[i]; + } + a->cuts[index+0] = lo; + a->cuts[index+1] = hi; + a->count += 2; + + // Filter pairs of adjacent repeated values from cuts[]. Since these mark + // boundaries where segments transition between full<->empty, dropping such a + // pair fuses two adjacent segments together. Examples: + // [1,2,3,3,4] -> [1,2,4] + // [1,2,3,3,3,4] -> [1,2,3,4] // have to leave one 3 because its a full<->empty transition + // [1,2,3,3,3,3,4] -> [1,2,4] + // Leading zeros don't have to be in pairs, they are always dropped: + // [0,1,2] -> [1,2] + // [0,0,1,2] -> [1,2] + int r = index, w = index; // Read and write cursors. + int64_t prev = r==0 ? 0 : a->cuts[r-1]; + while (r < a->count) { + int64_t cur = a->cuts[r++]; + a->cuts[w++] = cur; + if (prev == cur) { // Repeated value is an empty segment which can be deleted. + // Erase last two cuts or just one if we're at the start. + w -= w==1 ? 1 : 2; + // Zeros can only occur at the beginning (due to being sorted). We want to + // drop any number of zeros, but only even numbers of other repeated values. + // So set to zero here, which will make prev=0, thus if next value is zero + // it will be dropped but if its not zero then it will need to begin a new + // pair to be dropped. + cur = 0; } - bit++; - } - // temporarily align the alignment to NCCL_REC_PAGE_SIZE - ALIGN_SIZE(alignment, NCCL_REC_PAGE_SIZE); - - CUCHECKGOTO(cuDeviceGet(&cuDev, comm->cudaDev), ret, fail); - memprop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - memprop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - memprop.requestedHandleTypes = ncclCuMemHandleType; - memprop.location.id = cuDev; - CUCHECKGOTO(cuMemGetAllocationGranularity(&granularity, &memprop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail); - ALIGN_SIZE(allocSize, granularity); - - CUCHECKGOTO(cuMemCreate(&memHandle, allocSize, &memprop, 0), ret, fail); - ALIGN_SIZE(comm->symAllocHead, alignment); - NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, allocSize, memHandle, ®SymAddr), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, allocSize, regSymAddr), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); - comm->symAllocHead += allocSize; - *symPtr = regSymAddr; + prev = cur; + } + a->count = w; +} -exit: - return ret; -fail: - *symPtr = NULL; - goto exit; +ncclResult_t ncclSpaceAlloc( + struct ncclSpace* a, int64_t limit, int64_t size, int align, + int64_t* outOffset + ) { + // When allocating we try to locate the first empty segment which can hold + // the allocation and move its lower cut upward. + int i = a->count%2; // First empty segment ends at cuts[i] + size_t off; + while (i <= a->count) { + size_t lo = i == 0 ? 0 : a->cuts[i-1]; + size_t hi = i == a->count ? limit : a->cuts[i]; + off = alignUp(lo, align); + if (off + size <= hi) { + *outOffset = off; + if (i == 0 || off + size == hi) { // Slow path required. + insertSegment(a, i, off, off+size); + } else { // We can just append to the end of a full segment. 
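
To make the cut-based bookkeeping described in the comment above concrete, here is a minimal standalone sketch (illustrative names only, not the ncclSpace API from this patch): a sorted list of cut points where segments alternate full/empty, the region past the last cut is always empty, a segment ending at cuts[i] is full iff i%2 != ncuts%2, and allocation is first-fit over the empty segments. The real ncclSpaceAlloc additionally aligns offsets and insertSegment fuses repeated cuts so adjacent segments coalesce; this toy skips both.

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy model of the alternating-cuts idea: `cuts` is sorted ascending, segments
// alternate full/empty, and the segment after the last cut is always empty.
struct ToySpace {
  std::vector<int64_t> cuts;

  // Segment ending at cuts[i] is full iff the parities of i and cuts.size() differ.
  bool isFull(size_t i) const { return (i % 2) != (cuts.size() % 2); }

  // First-fit allocation: walk the empty segments and carve `size` bytes out of
  // the first one that can hold it. Returns false if nothing fits below `limit`.
  bool alloc(int64_t limit, int64_t size, int64_t* outOffset) {
    size_t i = cuts.size() % 2;              // first empty segment ends at cuts[i]
    for (; i <= cuts.size(); i += 2) {
      int64_t lo = (i == 0) ? 0 : cuts[i - 1];
      int64_t hi = (i == cuts.size()) ? limit : cuts[i];
      if (lo + size <= hi) {
        *outOffset = lo;
        // Mark [lo, lo+size) as full by adding two cuts (no coalescing in this toy).
        cuts.insert(cuts.begin() + i, {lo, lo + size});
        return true;
      }
    }
    return false;
  }
};

int main() {
  ToySpace s;
  int64_t a, b;
  s.alloc(1 << 20, 256, &a);   // a == 0
  s.alloc(1 << 20, 512, &b);   // b == 256 here (the real code also aligns and fuses cuts)
  printf("a=%ld b=%ld cuts=%zu\n", (long)a, (long)b, s.cuts.size());
  return 0;
}
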
+ a->cuts[i-1] = off + size; + } + return ncclSuccess; + } + i += 2; // Next empty segment + } + WARN("Allocation failed. No suitable space found to accommodate size=0x%lx within limit=0x%lx", (long)size, (long)limit); + return ncclInternalError; } -ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr) { - CUmemGenericAllocationHandle handle; - size_t size = 0; - ncclResult_t ret = ncclSuccess; - int saveDev = comm->cudaDev; - CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail); - if (ncclCuMemEnable()) { - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - CUCHECKGOTO(cuMemRetainAllocationHandle(&handle, symPtr), ret, fail); - CUCHECKGOTO(cuMemRelease(handle), ret, fail); - CUCHECKGOTO(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)symPtr), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, size, symPtr), ret, fail); - NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, size, symPtr), ret, fail); - CUCHECKGOTO(cuMemRelease(handle), ret, fail); +ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t offset, int64_t size) { + if (a->count == 0 || a->cuts[a->count-1] <= offset) { + WARN("No allocation found at offset=0x%lx", (long)offset); + return ncclInternalError; } -exit: - CUDACHECK(cudaSetDevice(saveDev)); - return ret; -fail: - goto exit; + + // This could be binary search, but since allocate is linear there's no point. + int i = 1 - a->count%2; // First full segment ends at cuts[i] + while (a->cuts[i] <= offset) i += 2; + + int64_t lo = i==0 ? 0 : a->cuts[i-1]; + int64_t hi = a->cuts[i]; + + if (offset < lo || hi < offset + size) { + WARN("Given size=0x%lx extends beyond allocation.", (long)size); + return ncclInternalError; + } + + // First try the two fast cases which just shrink a segment from one side. + if (i != 0 && lo == offset && offset + size != hi) { + a->cuts[i-1] = offset + size; // Bring bottom up. + } else if (lo != offset && offset + size == hi) { + a->cuts[i] = offset; // Bring top down. + } else { // Slow path. + insertSegment(a, i, offset, offset+size); + } + return ncclSuccess; +} + +//////////////////////////////////////////////////////////////////////////////// +// ncclShadowPool: + +struct ncclShadowPage { // A contiguous block of (at most) 64 objects + struct ncclShadowPage* next; + int objSize; + uint64_t freeMask; + void* devObjs; +}; +struct ncclShadowObject { + struct ncclShadowObject* next; + void* devObj; + void* hostObj; + struct ncclShadowPage* page; // null if not allocated in page but directly in CUDA mempool. +}; + +void ncclShadowPoolConstruct(struct ncclShadowPool* pool) { + pool->hbits = 0; + pool->count = 0; + pool->table = nullptr; + pool->pages = nullptr; +} + +ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool* pool) { + if (pool->hbits != 0) { + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + if (pool->count != 0) { + for (int i=0; i < 1<hbits; i++) { + struct ncclShadowObject* obj = pool->table[i]; + while (obj != nullptr) { + struct ncclShadowPage* page = obj->page; + if (page != nullptr) { + if (page->freeMask == 0) { // Put full pages back into page list. 
+ page->freeMask = 1; + page->next = pool->pages; + pool->pages = page; + } + } else { + cudaFreeAsync(obj->devObj, stream); + } + struct ncclShadowObject* next = obj->next; + free(obj); + obj = next; + } + } + } + free(pool->table); + + while (pool->pages != nullptr) { + cudaFreeAsync(pool->pages->devObjs, stream); + struct ncclShadowPage* next = pool->pages->next; + free(pool->pages); + pool->pages = next; + } + + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + cudaMemPoolDestroy(pool->memPool); + } + return ncclSuccess; +} + +static int hashBucket(int hbits, void* devObj) { + uintptr_t h = reinterpret_cast(devObj); + h ^= h>>32; + h *= 0x9e3779b97f4a7c13; + return (uint64_t)h >> (64-hbits); +} + +static void hashInsert(struct ncclShadowPool* pool, struct ncclShadowObject* obj) { + int b = hashBucket(pool->hbits, obj->devObj); + obj->next = pool->table[b]; + pool->table[b] = obj; +} + +ncclResult_t ncclShadowPoolAlloc( + struct ncclShadowPool* pool, size_t size, void** outDevObj, void** outHostObj, + cudaStream_t stream + ) { + if (size == 0) { + if (outDevObj) *outDevObj = nullptr; + if (outHostObj) *outHostObj = nullptr; + return ncclSuccess; + } + + int hbits = pool->hbits; + if (hbits == 0) { + cudaMemPoolProps props = {}; + props.allocType = cudaMemAllocationTypePinned; + props.handleTypes = cudaMemHandleTypeNone; + props.location.type = cudaMemLocationTypeDevice; + cudaGetDevice(&props.location.id); + CUDACHECK(cudaMemPoolCreate(&pool->memPool, &props)); + + pool->hbits = hbits = 4; + pool->table = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<table[i] = nullptr; + } + + // Check for hash table size increase before inserting. Maintain 2:1 object:bucket ratio. + if (pool->count+1 > 2<table; + struct ncclShadowObject** table1 = (struct ncclShadowObject**)malloc(sizeof(struct ncclShadowObject*)<<(hbits+1)); + pool->table = table1; + pool->hbits = hbits+1; + for (int i1=0; i1 < 2<next; + hashInsert(pool, obj); + obj = next; + } + } + hbits += 1; // match pool->hbits + free(table0); + } + + struct ncclShadowPage* page; + void *devObj; + if ((64<<10)/size >= 3) { + int shift = std::max(0, (int)log2Down(size) + 1 - 4); + int pageObjSize = ((size + (1<>shift)<pages; + while (true) { + page = *pagePtr; + if (page == nullptr) { + size_t pageSize = std::min(64<<10, 64*pageObjSize); + page = (struct ncclShadowPage*)malloc(sizeof(struct ncclShadowPage)); + page->objSize = pageObjSize; + page->freeMask = uint64_t(-1)>>(64 - pageSize/pageObjSize); + page->next = pool->pages; + pool->pages = page; + CUDACHECK(cudaMallocFromPoolAsync(&page->devObjs, pageSize, pool->memPool, stream)); + CUDACHECK(cudaMemsetAsync(page->devObjs, 0, pageSize, stream)); + // fall through... + } + if (page->objSize == pageObjSize) { + int slot = popFirstOneBit(&page->freeMask); + devObj = (char*)page->devObjs + slot*pageObjSize; + if (page->freeMask == 0) *pagePtr = page->next; // Remove full page from list. 
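
Two small techniques carry most of the shadow-pool logic above: a multiplicative hash over the device pointer that keeps the top hbits bits, and pages of at most 64 objects whose free slots are tracked in a single 64-bit freeMask. A minimal sketch of both follows (illustrative names, not the NCCL internals; __builtin_ctzll is a GCC/Clang builtin playing the role of popFirstOneBit in the patch).

#include <cstdint>
#include <cstdio>

// 1) Multiplicative hash keeping the top `hbits` bits of the mixed pointer value.
static int hashBucket(int hbits, const void* p) {
  uint64_t h = (uint64_t)(uintptr_t)p;
  h ^= h >> 32;
  h *= 0x9e3779b97f4a7c13ull;                  // 2^64 / golden ratio
  return (int)(h >> (64 - hbits));
}

// 2) A 64-slot page whose free slots live in one 64-bit mask (bit i set => slot i free).
struct ToyPage {
  uint64_t freeMask;
};

// Grab the lowest free slot, or -1 if the page is full.
static int pageAlloc(ToyPage* page) {
  if (page->freeMask == 0) return -1;
  int slot = __builtin_ctzll(page->freeMask);  // index of lowest set bit
  page->freeMask &= page->freeMask - 1;        // clear it
  return slot;
}

static void pageFree(ToyPage* page, int slot) {
  page->freeMask |= (uint64_t)1 << slot;
}

int main() {
  ToyPage page = { ~0ull };                    // 64 free slots
  int a = pageAlloc(&page);                    // 0
  int b = pageAlloc(&page);                    // 1
  pageFree(&page, a);
  printf("a=%d b=%d bucket=%d\n", a, b, hashBucket(4, &page));
  return 0;
}

In the patch, a page whose freeMask reaches zero is unlinked from the page list and relinked on the first free, so allocation never scans full pages.
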
+ break; + } + pagePtr = &page->next; + } + } else { + page = nullptr; + CUDACHECK(cudaMallocFromPoolAsync(&devObj, size, pool->memPool, stream)); + CUDACHECK(cudaMemsetAsync(devObj, 0, size, stream)); + } + + struct ncclShadowObject* obj = (struct ncclShadowObject*)malloc( + sizeof(struct ncclShadowObject) + /*padding=*/alignof(max_align_t)-1 + size + ); + obj->page = page; + obj->devObj = devObj; + obj->hostObj = alignUp((char*)(obj+1), alignof(max_align_t)); + memset(obj->hostObj, 0, size); + hashInsert(pool, obj); + pool->count += 1; + if (outDevObj) *outDevObj = devObj; + if (outHostObj) *outHostObj = obj->hostObj; + return ncclSuccess; +} + +ncclResult_t ncclShadowPoolFree(struct ncclShadowPool* pool, void* devObj, cudaStream_t stream) { + if (devObj == nullptr) return ncclSuccess; + + int b = hashBucket(pool->hbits, devObj); + struct ncclShadowObject** pobj = &pool->table[b]; + while (true) { + if (*pobj == nullptr) { + WARN("Device object does not exist in shadow pool."); + return ncclInternalError; + } + if ((*pobj)->devObj == devObj) break; + pobj = &(*pobj)->next; + } + struct ncclShadowObject* obj = *pobj; + *pobj = obj->next; + if (obj->page != nullptr) { + if (obj->page->freeMask == 0) { + obj->page->next = pool->pages; + pool->pages = obj->page; + } + int slot = ((char*)obj->devObj - (char*)obj->page->devObjs)/obj->page->objSize; + obj->page->freeMask |= uint64_t(1)<count -= 1; + return ncclSuccess; +} + +ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, void* devObj, void** hostObj) { + if (devObj == nullptr) { + *hostObj = nullptr; + return ncclSuccess; + } + + int b = hashBucket(pool->hbits, devObj); + struct ncclShadowObject* obj = pool->table[b]; + while (true) { + if (obj == nullptr) { + WARN("Device object does not exist in shadow pool."); + return ncclInternalError; + } + if (obj->devObj == devObj) break; + obj = obj->next; + } + *hostObj = obj->hostObj; + return ncclSuccess; } diff --git a/src/bootstrap.cc b/src/bootstrap.cc index f05337249..7615b9c52 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -14,6 +14,7 @@ #include "proxy.h" #include "param.h" #include "ras.h" +#include #define BOOTSTRAP_N_CHECK_ABORT 10000 #define BOOTSTRAP_TAG_CONNECT (0x1 << 31) @@ -85,13 +86,13 @@ struct bootstrapRootArgs { static char bootstrapNetIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress bootstrapNetIfAddr; static int bootstrapNetInitDone = 0; -pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex bootstrapNetMutex; NCCL_PARAM(BootstrapNetEnable,"OOB_NET_ENABLE", 0); ncclResult_t bootstrapNetInit() { if (bootstrapNetInitDone == 0) { - pthread_mutex_lock(&bootstrapNetLock); + std::lock_guard lock(bootstrapNetMutex); if (bootstrapNetInitDone == 0) { const char* env = ncclGetEnv("NCCL_COMM_ID"); int nIfs = 0; @@ -99,21 +100,18 @@ ncclResult_t bootstrapNetInit() { union ncclSocketAddress remoteAddr; if (ncclSocketGetAddrFromString(&remoteAddr, env) != ncclSuccess) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); - pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidArgument; } NCCLCHECK(ncclFindInterfaceMatchSubnet(bootstrapNetIfName, &bootstrapNetIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, &nIfs)); if (nIfs <= 0) { WARN("NET/Socket : No usable listening interface found"); - pthread_mutex_unlock(&bootstrapNetLock); return ncclSystemError; } } else { NCCLCHECK(ncclFindInterfaces(bootstrapNetIfName, &bootstrapNetIfAddr, MAX_IF_NAME_SIZE, 1, &nIfs)); if (nIfs <= 0) { WARN("Bootstrap : no socket interface 
found"); - pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidUsage; } } @@ -123,7 +121,6 @@ ncclResult_t bootstrapNetInit() { INFO(NCCL_BOOTSTRAP, "Bootstrap: Using%s", line); bootstrapNetInitDone = 1; } - pthread_mutex_unlock(&bootstrapNetLock); } return ncclSuccess; } @@ -485,7 +482,7 @@ static ncclResult_t getUDS(uint64_t* peerUDS) { static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { static int devOOB = -1; if (devOOB < 0) { - pthread_mutex_lock(&bootstrapNetLock); + std::lock_guard lock(bootstrapNetMutex); if (devOOB < 0) { const char* userIfEnv = ncclGetEnv("NCCL_OOB_NET_IFNAME"); if (userIfEnv && strlen(userIfEnv) > 0) { @@ -516,7 +513,6 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { WARN("no device found matching %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv); else WARN("no device found after excluding %s%s, verify NCCL_OOB_NET_IFNAME", searchExact ? "exactly " : "", userIfEnv); - pthread_mutex_unlock(&bootstrapNetLock); return ncclInvalidArgument; } } else { @@ -529,13 +525,12 @@ static ncclResult_t netGetDevice(int rank, struct ncclComm* comm, int* dev) { bool hasProp = res == ncclSuccess; INFO(NCCL_BOOTSTRAP, "Bootstrap: Using %s:%d", (hasProp) ? props.name : "N/A", (hasProp) ? props.port : -1); } - pthread_mutex_unlock(&bootstrapNetLock); } *dev = devOOB; return ncclSuccess; } -static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE], +static ncclResult_t netRingConnect(void* ctx, ncclNet_t* net, struct bootstrapListen_t* listen, char peerHandle[NCCL_NET_HANDLE_MAXSIZE], void** sendComm, ncclNetDeviceHandle_t** sendDevHandle, void** recvComm, ncclNetDeviceHandle_t** recvDevHandle, volatile uint32_t* abortFlag) { @@ -543,7 +538,7 @@ static ncclResult_t netRingConnect(ncclNet_t* net, struct bootstrapListen_t* lis do { NCCLCHECK(checkAbort(abortFlag, &abortCounter)); if (!*sendComm) - NCCLCHECK(net->connect(listen->net.dev, NULL, peerHandle, sendComm, sendDevHandle)); + NCCLCHECK(net->connect(ctx, listen->net.dev, peerHandle, sendComm, sendDevHandle)); if (!*recvComm) NCCLCHECK(net->accept(listen->net.comm, recvComm, recvDevHandle)); } while (!*sendComm || !*recvComm); @@ -655,7 +650,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { if (ncclParamBootstrapNetEnable()) { // Create net interface for other ranks to contact me (all gather) NCCLCHECK(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev))); - NCCLCHECK(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm))); + NCCLCHECK(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm))); memcpy(info.connectInfo.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE); } else { // create socket for ring neightbor to contact mee @@ -709,7 +704,7 @@ ncclResult_t bootstrapInit(int nHandles, void* handles, struct ncclComm* comm) { // accept and connect the ring network if (ncclParamBootstrapNetEnable()) { - NCCLCHECK(netRingConnect(state->net, &state->listen, nextPeer.handle, + NCCLCHECK(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle, &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle), &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag)); } else { @@ -802,7 +797,7 @@ ncclResult_t bootstrapSplit(uint64_t 
magic, struct ncclComm* comm, struct ncclCo // create a handle for the others to reach out to me if (ncclParamBootstrapNetEnable()) { NCCLCHECKGOTO(netGetDevice(rank, comm, &STATE_LISTEN(state, net.dev)), ret, fail); - NCCLCHECKGOTO(state->net->listen(STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail); + NCCLCHECKGOTO(state->net->listen(comm->netContext, STATE_LISTEN(state, net.dev), STATE_LISTEN(state, net.handle), &STATE_LISTEN(state, net.comm)), ret, fail); memcpy(info.handle, STATE_LISTEN(state, net.handle), NCCL_NET_HANDLE_MAXSIZE); } else { // create socket for ring neightbor to contact mee @@ -821,7 +816,7 @@ ncclResult_t bootstrapSplit(uint64_t magic, struct ncclComm* comm, struct ncclCo NCCLCHECKGOTO(bootstrapSend(parent->bootstrap, prev, BOOTSTRAP_TAG_COMMSPLIT, &info, sizeof(union ringConnectInfo)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(parent->bootstrap, next, BOOTSTRAP_TAG_COMMSPLIT, &nextPeer, sizeof(union ringConnectInfo)), ret, fail); if (ncclParamBootstrapNetEnable()) { - NCCLCHECKGOTO(netRingConnect(state->net, &state->listen, nextPeer.handle, + NCCLCHECKGOTO(netRingConnect(comm->netContext, state->net, &state->listen, nextPeer.handle, &STATE_RING(state, net.sendComm), &STATE_RING(state, net.sendDevHandle), &STATE_RING(state, net.recvComm), &STATE_RING(state, net.recvDevHandle), state->abortFlag), ret, fail); diff --git a/src/ce_coll.cc b/src/ce_coll.cc new file mode 100644 index 000000000..3f3dcbd7f --- /dev/null +++ b/src/ce_coll.cc @@ -0,0 +1,615 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "register_inline.h" +#include +#include "cudawrap.h" +#include "ce_coll.h" +#include "alloc.h" + +// Static constant for graph synchronization +static const uint32_t GRAPH_SYNC_VALUE = 1; + +// Static constants for intra-batch synchronization to improve CE collective performance with large scale +// Frequency of intra-batch synchronization +static const uint32_t CE_COLL_INTRA_BATCH_SYNC_FREQ = 8; +// Message threshold for intra-batch synchronization +static const uint64_t CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD = 512*1024*1024; + +ncclResult_t ncclCeInit(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + + uint8_t* ceDevBase; + size_t ceDevBaseSize = alignUp(comm->nRanks*sizeof(uint32_t), 16) * 2; + ncclWindow_vidmem* ceWinDev; + ncclWindow_vidmem* ceWinDevHost; + + // Ensure symmetric memory runtime is initialized + NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail); + // Allocate and register memory for the symmetric memory + NCCLCHECKGOTO(ncclMemAlloc((void**)&ceDevBase, ceDevBaseSize), ret, fail); + NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup(comm, ceDevBase, ceDevBaseSize, NCCL_WIN_COLL_SYMMETRIC, &ceWinDev), ret, fail); + NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, ceWinDev, &ceWinDevHost), ret, fail); + // Get the ncclDevrWindow from the winHost field + comm->ceColl.ceSyncWin = (struct ncclDevrWindow*)ceWinDevHost->winHost; + + comm->ceColl.baseUCSymReadyOffset = 0; + comm->ceColl.baseUCSymComplOffset = alignUp(comm->nRanks*sizeof(uint32_t), 16); + comm->ceColl.baseUCSymReadyPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + comm->ceColl.baseUCSymReadyOffset; + comm->ceColl.baseUCSymComplPtr = (uint8_t*)comm->ceColl.ceSyncWin->userPtr + 
comm->ceColl.baseUCSymComplOffset; + comm->ceColl.ceSeqNum = 0; + comm->ceColl.useCompletePtr = false; + comm->ceColl.intraBatchSyncFreq = CE_COLL_INTRA_BATCH_SYNC_FREQ; + comm->ceColl.intraBatchSyncMsgThreshold = CE_COLL_INTRA_BATCH_SYNC_MSG_THRESHOLD; + INFO(NCCL_INIT, "Init CE, rank %d baseUCSymReadyPtr %p, baseUCSymComplPtr %p, seq num %d", comm->rank, comm->ceColl.baseUCSymReadyPtr, comm->ceColl.baseUCSymComplPtr, comm->ceColl.ceSeqNum); + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeFinalize(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + + // Clean up ceInitTaskQueue + while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { + struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue); + free(task); + } + + // Clean up CE resources + if (comm->ceColl.baseUCSymReadyPtr != NULL) { + if (comm->ceColl.ceSyncWin && comm->ceColl.ceSyncWin->vidmem) { + NCCLCHECKGOTO(ncclCommWindowDeregister(comm, comm->ceColl.ceSyncWin->vidmem), ret, fail); + NCCLCHECKGOTO(ncclMemFree(comm->ceColl.baseUCSymReadyPtr), ret, fail); + } + comm->ceColl.baseUCSymReadyPtr = NULL; + comm->ceColl.baseUCSymComplPtr = NULL; + comm->ceColl.ceSyncWin = NULL; + } + +exit: + return ret; +fail: + goto exit; +} + +bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) { + int driverVersion; + if (ncclCudaDriverVersion(&driverVersion) != ncclSuccess) return false; + + // CE is supported in CUDA 12.5 and later + if (driverVersion >= 12050) { + switch (coll) { + case ncclFuncAllGather: + case ncclFuncAlltoAll: + case ncclFuncScatter: + case ncclFuncGather: + return true; + default: + return false; + } + } + return false; +} + +ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, CUstreamBatchMemOpParams* batchParams, size_t* opIdx, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr; + uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr; + + bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph); + uint32_t currentSeq = ++comm->ceColl.ceSeqNum; + + // Source pointer is either the constant graph sync value or the sequence number + void* srcPtr = capturing ? (void*)&GRAPH_SYNC_VALUE : (void*)¤tSeq; + // Wait value is either the constant graph sync value or the sequence number + uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq; + + // Use multi-cast address as destination pointer + void* mcDstPtr; + void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank]; + size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail); + + // Write our own ready/complete flag to the multi-cast address + CUDACHECKGOTO(cudaMemcpyAsync( + mcDstPtr, + srcPtr, + sizeof(uint32_t), + cudaMemcpyHostToDevice, + stream), ret, fail); + + // Add local wait operations for every other rank + for (int r = 0; r < comm->nRanks; ++r) { + if (r == comm->rank) continue; + batchParams[*opIdx] = {}; + batchParams[*opIdx].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32; + batchParams[*opIdx].waitValue.address = (CUdeviceptr)(isComplete ? 
(void*)&completePtrs[r] : (void*)&readyPtrs[r]); + batchParams[*opIdx].waitValue.value = waitValue; + batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ; + (*opIdx)++; + } + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclPrepUCSync(struct ncclComm* comm, bool isComplete, + CUstreamBatchMemOpParams* batchParams, + size_t* opIdx) { + ncclResult_t ret = ncclSuccess; + + uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr; + uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr; + + bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph); + uint32_t currentSeq = ++comm->ceColl.ceSeqNum; + + // Write our own ready/complete flag to remote ranks + uint32_t waitValue = capturing ? GRAPH_SYNC_VALUE : currentSeq; + for (int r = 0; r < comm->nRanks; ++r) { + if (r == comm->rank) continue; + void * peerDstPtr; + void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank]; + size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, comm->ceColl.ceSyncWin, offset, r, &peerDstPtr), ret, fail); + batchParams[*opIdx] = {}; + batchParams[*opIdx].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32; + batchParams[*opIdx].writeValue.address = (CUdeviceptr)peerDstPtr; + batchParams[*opIdx].writeValue.value = waitValue; + batchParams[*opIdx].writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT; + (*opIdx)++; + } + + // Add local wait operations for every other rank + for (int r = 0; r < comm->nRanks; ++r) { + if (r == comm->rank) continue; + batchParams[*opIdx] = {}; + batchParams[*opIdx].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32; + batchParams[*opIdx].waitValue.address = (CUdeviceptr)(isComplete ? (void*)&completePtrs[r] : (void*)&readyPtrs[r]); + batchParams[*opIdx].waitValue.value = waitValue; + batchParams[*opIdx].waitValue.flags = CU_STREAM_WAIT_VALUE_EQ; + (*opIdx)++; + } + +exit: + return ret; +fail: + goto exit; +} + + +ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Get pointers to the ready and complete synchronization arrays + uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr; + uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr; + + // Allocate enough slots for all possible ops + size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks; + size_t opIdx = 0; + + // Prepare batch memory operations for synchronization + CUstreamBatchMemOpParams* batchParams = nullptr; + NCCLCHECKGOTO(ncclCalloc(&batchParams, batchSize), ret, fail); + + if (comm->nvlsSupport) { + NCCLCHECKGOTO(ncclPrepMCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx, stream), ret, fail); + } else { + NCCLCHECKGOTO(ncclPrepUCSync(comm, comm->ceColl.useCompletePtr, batchParams, &opIdx), ret, fail); + } + + // For CUDA graph capture, add reset operation + if (ncclCudaGraphValid(comm->planner.capturingGraph)) { + for (int i = 0; i < comm->nRanks; i++) { + batchParams[opIdx] = {}; + batchParams[opIdx].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32; + batchParams[opIdx].writeValue.address = (CUdeviceptr)(comm->ceColl.useCompletePtr ? 
(void*)&completePtrs[i] : (void*)&readyPtrs[i]); + batchParams[opIdx].writeValue.value = 0; + batchParams[opIdx].writeValue.flags = CU_STREAM_WRITE_VALUE_DEFAULT; + opIdx++; + } + } + + // Execute all memory operations in a single batch + CUCHECKGOTO(cuStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail); + + // Toggle the flag for next call + comm->ceColl.useCompletePtr = !comm->ceColl.useCompletePtr; + +exit: + if (batchParams) free(batchParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) { + ncclResult_t ret = ncclSuccess; + + params->srcs = nullptr; + params->dsts = nullptr; + params->sizes = nullptr; + params->numOps = 0; + params->intraBatchSync = false; +#if CUDART_VERSION >= 12080 + params->attrs = nullptr; + params->attrIdxs = nullptr; + params->numAttrs = 0; +#endif + + NCCLCHECKGOTO(ncclCalloc(¶ms->srcs, nRanks), ret, fail); + NCCLCHECKGOTO(ncclCalloc(¶ms->dsts, nRanks), ret, fail); + NCCLCHECKGOTO(ncclCalloc(¶ms->sizes, nRanks), ret, fail); +#if CUDART_VERSION >= 12080 + NCCLCHECKGOTO(ncclCalloc(¶ms->attrs, nRanks), ret, fail); + NCCLCHECKGOTO(ncclCalloc(¶ms->attrIdxs, nRanks), ret, fail); +#endif +exit: + return ret; +fail: + goto exit; +} + +void ncclCeFreeBatchOpsParams(struct ncclCeBatchOpsParams* params) { + if (params->srcs) free(params->srcs); + if (params->dsts) free(params->dsts); + if (params->sizes) free(params->sizes); +#if CUDART_VERSION >= 12080 + if (params->attrs) free(params->attrs); + if (params->attrIdxs) free(params->attrIdxs); +#endif +} + +ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsParams* params, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Check if there are any operations to perform + if (params->numOps == 0) { + return ncclSuccess; + } + + // Check if we are in a CUDA graph capture + bool capturing = ncclCudaGraphValid(comm->planner.capturingGraph); + + int driverVersion; + NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail); + + //--------------Graph capture-------------- + // cudaMemcpyBatchAsync is not supported during CUDA graph capture + if (capturing) { + for (int i =0; i < params->numOps; i++) { + CUDACHECKGOTO(cudaMemcpyAsync( + (void*)params->dsts[i], + (void*)params->srcs[i], + params->sizes[i], + cudaMemcpyDeviceToDevice, + stream), ret, fail); + + if (params->intraBatchSync && ((i+1) % comm->ceColl.intraBatchSyncFreq == 0) && ((i+1) < params->numOps)) { + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + } + } + } + //--------------No graph capture-------------- + else { + if (CUDART_VERSION >= 12080 && driverVersion >= 12080) { +#if CUDART_VERSION >= 12080 + // For CUDA 12.8+, use batch memory copy for better performance + params->attrs[0] = {}; + params->attrs[0].srcAccessOrder = cudaMemcpySrcAccessOrderStream; + params->attrs[0].flags = cudaMemcpyFlagPreferOverlapWithCompute; + params->attrIdxs[0] = 0; + params->numAttrs = 1; + + if (params->intraBatchSync) { + // Break into multiple batches with sync between them + int batchSize = comm->ceColl.intraBatchSyncFreq; + for (int i = 0; i < params->numOps; i += batchSize) { + int currentBatchSize = (i + batchSize <= params->numOps) ? 
batchSize : params->numOps - i; + + #if CUDART_VERSION >= 13000 + CUDACHECKGOTO(cudaMemcpyBatchAsync( + ¶ms->dsts[i], ¶ms->srcs[i], ¶ms->sizes[i], currentBatchSize, + params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail); + #else + CUDACHECKGOTO(cudaMemcpyBatchAsync( + ¶ms->dsts[i], ¶ms->srcs[i], ¶ms->sizes[i], currentBatchSize, + params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail); + #endif + + // Sync after each batch + if (i + batchSize < params->numOps) { + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + } + } + } else { + // Use single batch for all operations + #if CUDART_VERSION >= 13000 + CUDACHECKGOTO(cudaMemcpyBatchAsync( + params->dsts, params->srcs, params->sizes, params->numOps, + params->attrs, params->attrIdxs, params->numAttrs, stream), ret, fail); + #else + CUDACHECKGOTO(cudaMemcpyBatchAsync( + params->dsts, params->srcs, params->sizes, params->numOps, + params->attrs, params->attrIdxs, params->numAttrs, nullptr, stream), ret, fail); + #endif + } +#endif + } else { + // For older CUDA versions, fall back to individual transfers + for (int i = 0; i < params->numOps; i++) { + CUDACHECKGOTO(cudaMemcpyAsync( + (void*)params->dsts[i], + (void*)params->srcs[i], + params->sizes[i], + cudaMemcpyDeviceToDevice, + stream), ret, fail); + + if (params->intraBatchSync && ((i+1) % comm->ceColl.intraBatchSyncFreq == 0) && ((i+1) < params->numOps)) { + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + } + } + } + } + +exit: + return ret; +fail: + goto exit; +} + + +ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Calculate the size of each rank's data chunk + const size_t chunkBytes = args->nElts * args->eltSize; + uint8_t* mySendBuff = (uint8_t*)args->sendBuff; + uint8_t* myRecvBuff = (uint8_t*)args->recvBuff + comm->rank * chunkBytes; + void* peerRecvBuff; + size_t offset; + + struct ncclCeBatchOpsParams batchOpsParams = {}; + NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail); + + // Ensure all ranks are ready before starting transfers + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + + // Copy own data to receive buffer if operation is out-of-place + if (myRecvBuff != mySendBuff) { + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)myRecvBuff; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + + // Copy data to other ranks + for (int r = 1; r < comm->nRanks; r++) { + int targetRank = (comm->rank + r) % comm->nRanks; + offset = myRecvBuff - (uint8_t*)args->recvWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, targetRank, &peerRecvBuff), ret, fail); + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerRecvBuff; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + + // Check if we need to perform intra-batch synchronization + batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold); + + // Launch the batch operations + NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail); + + // Ensure all transfers are complete across all ranks + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + +exit: + 
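
The ready/complete handshake issued by ncclMemOpSync above boils down to a batch of write-value and wait-value operations submitted with cuStreamBatchMemOp. Below is a minimal single-process sketch of that pattern (illustrative only: the patch points these operations at flags living in peer-accessible symmetric windows, and batched stream memory operations require driver support).

#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  cudaSetDevice(0);                          // also makes the primary context current
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  uint32_t* flags;                           // flags[0] = "mine", flags[1] = "peer"
  cudaMalloc(&flags, 2 * sizeof(uint32_t));
  cudaMemset(flags, 0, 2 * sizeof(uint32_t));

  // Pretend the peer already published its flag; in the real code this write
  // arrives from another rank while our stream is blocked on the wait op.
  uint32_t one = 1;
  cudaMemcpy(&flags[1], &one, sizeof(one), cudaMemcpyHostToDevice);

  CUstreamBatchMemOpParams ops[2] = {};
  ops[0].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;   // publish our flag
  ops[0].writeValue.address   = (CUdeviceptr)(uintptr_t)&flags[0];
  ops[0].writeValue.value     = 1;
  ops[0].writeValue.flags     = CU_STREAM_WRITE_VALUE_DEFAULT;
  ops[1].waitValue.operation  = CU_STREAM_MEM_OP_WAIT_VALUE_32;    // wait for the peer's
  ops[1].waitValue.address    = (CUdeviceptr)(uintptr_t)&flags[1];
  ops[1].waitValue.value      = 1;
  ops[1].waitValue.flags      = CU_STREAM_WAIT_VALUE_EQ;

  CUresult res = cuStreamBatchMemOp(stream, 2, ops, 0);
  cudaStreamSynchronize(stream);
  printf("cuStreamBatchMemOp returned %d\n", (int)res);

  cudaFree(flags);
  cudaStreamDestroy(stream);
  return 0;
}

Outside of graph capture the patch uses a per-call sequence number as the written/awaited value; under capture it falls back to a constant and resets the flags at the end of the batch, since the captured work replays with the same value each launch.
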
ncclCeFreeBatchOpsParams(&batchOpsParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Calculate the size of data each rank sends to every other rank + const size_t chunkBytes = args->nElts * args->eltSize; + uint8_t* mySendBuff = (uint8_t*)args->sendBuff; + uint8_t* myRecvBuff = (uint8_t*)args->recvBuff; + void* peerRecvBuff; + size_t offset; + + struct ncclCeBatchOpsParams batchOpsParams = {}; + NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks * comm->nRanks), ret, fail); + + // Ensure all ranks are ready before starting transfers + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + + // Copy data to other ranks: send data chunk for each destination rank + for (int r = 0; r < comm->nRanks; r++) { + int dstRank = (comm->rank + r) % comm->nRanks; + uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes; + uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes; + + if (dstRank == comm->rank) { + // Local copy for own data + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)dstPtr; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } else { + // Remote copy to other ranks: send to rank dstRank's receive buffer at position comm->rank + offset = dstPtr - (uint8_t*)args->recvWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerRecvBuff), ret, fail); + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerRecvBuff; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + } + + // Check if we need to perform intra-batch synchronization + batchOpsParams.intraBatchSync = (batchOpsParams.numOps > comm->ceColl.intraBatchSyncFreq && chunkBytes*batchOpsParams.numOps >= comm->ceColl.intraBatchSyncMsgThreshold); + + // Launch the batch operations + NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail); + + // Ensure all transfers are complete across all ranks + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + +exit: + ncclCeFreeBatchOpsParams(&batchOpsParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Calculate the size of data root sends to each rank + const size_t chunkBytes = args->nElts * args->eltSize; + uint8_t* mySendBuff = (uint8_t*)args->sendBuff; + uint8_t* myRecvBuff = (uint8_t*)args->recvBuff; + int rootRank = args->rootRank; + void* peerDstPtr; + size_t offset; + + struct ncclCeBatchOpsParams batchOpsParams = {}; + NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, comm->nRanks), ret, fail); + + // Ensure all ranks are ready before starting transfers + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + + if (comm->rank == rootRank) { + // Check if this is an in-place scatter operation + bool isInPlace = (myRecvBuff == mySendBuff + comm->rank * chunkBytes); + + // Copy root's own data first if not in-place + if (!isInPlace) { + uint8_t* srcPtr = mySendBuff + comm->rank * chunkBytes; + uint8_t* dstPtr = myRecvBuff; + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)dstPtr; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + 
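
The chunk addressing used by ncclCeAllGather and ncclCeAlltoAll above is plain offset arithmetic over equally sized per-rank chunks; only the resolution of the destination pointer goes through the peer's registered window (ncclDevrGetLsaRankPtr). A host-only illustration of the offsets, with made-up rank and chunk values:

#include <cstdio>
#include <cstddef>

int main() {
  const int nRanks = 4, myRank = 1;
  const size_t chunkBytes = 1024;

  // AllGather: every rank pushes its send chunk to offset myRank*chunkBytes
  // of every peer's receive buffer.
  for (int r = 0; r < nRanks; r++) {
    size_t dstOff = (size_t)myRank * chunkBytes;
    printf("allgather: rank %d -> peer %d recvBuff + %zu\n", myRank, r, dstOff);
  }

  // AlltoAll: the chunk destined for rank d comes from offset d*chunkBytes of
  // our send buffer and lands at offset myRank*chunkBytes of d's receive buffer.
  for (int d = 0; d < nRanks; d++) {
    size_t srcOff = (size_t)d * chunkBytes;
    size_t dstOff = (size_t)myRank * chunkBytes;
    printf("alltoall: send + %zu -> peer %d recvBuff + %zu\n", srcOff, d, dstOff);
  }
  return 0;
}
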
batchOpsParams.numOps++; + } + + // Root rank distributes data to other ranks + for (int r = 1; r < comm->nRanks; r++) { + int dstRank = (comm->rank + r) % comm->nRanks; + uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes; + uint8_t* dstPtr = isInPlace ? myRecvBuff + dstRank * chunkBytes : myRecvBuff; + + offset = dstPtr - (uint8_t*)args->recvWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, dstRank, &peerDstPtr), ret, fail); + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerDstPtr; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + } + // Non-root ranks don't need to perform any copy operations + + // Launch the batch operations + NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail); + + // Ensure all transfers are complete across all ranks + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + +exit: + ncclCeFreeBatchOpsParams(&batchOpsParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + + // Calculate the size of data each rank sends to root + const size_t chunkBytes = args->nElts * args->eltSize; + uint8_t* mySendBuff = (uint8_t*)args->sendBuff; + uint8_t* myRecvBuff = (uint8_t*)args->recvBuff; + int rootRank = args->rootRank; + void* peerRecvBuff; + size_t offset; + + struct ncclCeBatchOpsParams batchOpsParams = {}; + NCCLCHECKGOTO(ncclCeInitBatchOpsParams(&batchOpsParams, 1), ret, fail); + + // Ensure all ranks are ready before starting transfers + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + + if (comm->rank == rootRank) { + // Root rank copies its own data to the correct position in receive buffer + uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes; + if (mySendBuff != dstPtr) { + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)dstPtr; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + } else { + // Non-root ranks send their data to root's receive buffer + uint8_t* rootRecvPtr = (uint8_t*)args->recvBuff + comm->rank * chunkBytes; + offset = rootRecvPtr - (uint8_t*)args->recvWin->userPtr; + NCCLCHECKGOTO(ncclDevrGetLsaRankPtr(comm, args->recvWin, offset, rootRank, &peerRecvBuff), ret, fail); + batchOpsParams.srcs[batchOpsParams.numOps] = (void*)mySendBuff; + batchOpsParams.dsts[batchOpsParams.numOps] = (void*)peerRecvBuff; + batchOpsParams.sizes[batchOpsParams.numOps] = chunkBytes; + batchOpsParams.numOps++; + } + + // Launch the batch operations + NCCLCHECKGOTO(ncclCeLaunchBatchOps(comm, &batchOpsParams, stream), ret, fail); + + // Ensure all transfers are complete across all ranks + NCCLCHECKGOTO(ncclMemOpSync(comm, stream), ret, fail); + +exit: + ncclCeFreeBatchOpsParams(&batchOpsParams); + return ret; +fail: + goto exit; +} + +ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan) { + ncclResult_t ret = ncclSuccess; + cudaStream_t stream = comm->planner.streams->stream; + struct ncclCeCollArgs* args = plan->ceCollArgs; + + switch (args->func) { + case ncclFuncAllGather: + NCCLCHECKGOTO(ncclCeAllGather(comm, args, stream), ret, fail); + break; + case ncclFuncAlltoAll: + NCCLCHECKGOTO(ncclCeAlltoAll(comm, args, stream), ret, fail); + break; + case ncclFuncScatter: + NCCLCHECKGOTO(ncclCeScatter(comm, args, stream), ret, fail); 
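+      // (Each of these CE paths builds an ncclCeBatchOpsParams list and funnels into
+      // ncclCeLaunchBatchOps, which batches via cudaMemcpyBatchAsync when available and
+      // otherwise falls back to per-operation cudaMemcpyAsync; for Scatter only the root
+      // enqueues copies, the other ranks just participate in the ncclMemOpSync syncs.)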
+ break; + case ncclFuncGather: + NCCLCHECKGOTO(ncclCeGather(comm, args, stream), ret, fail); + break; + default: + ret = ncclInvalidUsage; + } + +exit: + return ret; +fail: + goto exit; +} diff --git a/src/collectives.cc b/src/collectives.cc index 03122f8a7..ca69c9a78 100644 --- a/src/collectives.cc +++ b/src/collectives.cc @@ -14,10 +14,13 @@ const char* ncclFuncToString(ncclFunc_t fn) { switch (fn) { case ncclFuncAllGather: return "AllGather"; case ncclFuncAllReduce: return "AllReduce"; + case ncclFuncAlltoAll: return "AlltoAll"; case ncclFuncBroadcast: return "Broadcast"; + case ncclFuncGather: return "Gather"; case ncclFuncRecv: return "Recv"; case ncclFuncReduce: return "Reduce"; case ncclFuncReduceScatter: return "ReduceScatter"; + case ncclFuncScatter: return "Scatter"; case ncclFuncSendRecv: return "SendRecv"; case ncclFuncSend: return "Send"; default: return "Invalid"; @@ -88,6 +91,19 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcoun return ncclEnqueueCheck(&info); } +NCCL_API(ncclResult_t, ncclAlltoAll, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream); +ncclResult_t ncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm* comm, cudaStream_t stream) { + NVTX3_FUNC_WITH_PARAMS(AlltoAll, NcclNvtxParamsAlltoAll, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype))); + + struct ncclInfo info = { ncclFuncAlltoAll, "AlltoAll", + sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */ + ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, @@ -121,6 +137,19 @@ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int ro return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); } +NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm* comm, cudaStream_t stream); +ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, + ncclComm* comm, cudaStream_t stream) { + NVTX3_FUNC_WITH_PARAMS(Gather, NcclNvtxParamsGather, + NVTX3_PAYLOAD(comm ? 
comm->commHash : 0, count * ncclTypeSize(datatype), root)); + + struct ncclInfo info = { ncclFuncGather, "Gather", + sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ + GATHER_CHUNKSTEPS, GATHER_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, @@ -147,6 +176,19 @@ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recv return ncclEnqueueCheck(&info); } +NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream); +ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm* comm, cudaStream_t stream) { + NVTX3_FUNC_WITH_PARAMS(Scatter, NcclNvtxParamsScatter, + NVTX3_PAYLOAD(comm ? comm->commHash : 0, count * ncclTypeSize(datatype), root)); + + struct ncclInfo info = { ncclFuncScatter, "Scatter", + sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ + SCATTER_CHUNKSTEPS, SCATTER_SLICESTEPS }; + return ncclEnqueueCheck(&info); +} + NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, diff --git a/src/debug.cc b/src/debug.cc index f034bc7e0..0d6ed8400 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -15,6 +15,7 @@ #include #include #include "param.h" +#include #define NCCL_DEBUG_RESET_TRIGGERED (-2) @@ -28,9 +29,9 @@ static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form -static uint64_t ncclDebugMask = 0; +uint64_t ncclDebugMask = 0; FILE *ncclDebugFile = stdout; -static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex ncclDebugMutex; static std::chrono::steady_clock::time_point ncclEpoch; static bool ncclWarnSetDebugInfo = false; @@ -269,15 +270,13 @@ static void ncclDebugInit() { * they can share the debugging mechanisms and output files */ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { - bool locked = false; // Keeps track of the ncclDebugLock state. int gotLevel = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; } // Save the last error (WARN) as a human readable string if (level == NCCL_LOG_WARN) { - pthread_mutex_lock(&ncclDebugLock); - locked = true; + std::lock_guard lock(ncclDebugMutex); va_list vargs; va_start(vargs, fmt); (void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs); @@ -285,20 +284,13 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file } if (gotLevel >= 0 && (gotLevel < level || (flags & ncclDebugMask) == 0)) { - if (locked) - pthread_mutex_unlock(&ncclDebugLock); return; } - if (!locked) { - pthread_mutex_lock(&ncclDebugLock); - locked = true; - } - // From this point on ncclDebugLock is always locked so we don't need to check "locked" anymore. 
+ std::lock_guard lock(ncclDebugMutex); if (ncclDebugLevel < 0) ncclDebugInit(); if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) { - pthread_mutex_unlock(&ncclDebugLock); return; } @@ -386,17 +378,35 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file // necessary since we write bytes instead of the string. buffer[len++] = '\n'; fwrite(buffer, 1, len, ncclDebugFile); - pthread_mutex_unlock(&ncclDebugLock); } -NCCL_API(void, ncclResetDebugInit); -void ncclResetDebugInit() { +// Non-deprecated version for internal use. +extern "C" +__attribute__ ((visibility("default"))) +void ncclResetDebugInitInternal() { // Cleans up from a previous ncclDebugInit() and reruns. // Use this after changing NCCL_DEBUG and related parameters in the environment. - pthread_mutex_lock(&ncclDebugLock); + std::lock_guard lock(ncclDebugMutex); // Let ncclDebugInit() know to complete the reset. __atomic_store_n(&ncclDebugLevel, NCCL_DEBUG_RESET_TRIGGERED, __ATOMIC_RELEASE); - pthread_mutex_unlock(&ncclDebugLock); +} + +// In place of: NCCL_API(void, ncclResetDebugInit); +__attribute__ ((visibility("default"))) +__attribute__ ((alias("ncclResetDebugInit"))) +void pncclResetDebugInit(); +extern "C" +__attribute__ ((visibility("default"))) +__attribute__ ((weak)) +__attribute__ ((deprecated("ncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future"))) +void ncclResetDebugInit(); + + +void ncclResetDebugInit() { + // This is now deprecated as part of the NCCL API. It will be removed + // from the API in the future. It is still available as an + // exported symbol. + ncclResetDebugInitInternal(); } NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0); diff --git a/src/dev_runtime.cc b/src/dev_runtime.cc new file mode 100644 index 000000000..54e6e01bf --- /dev/null +++ b/src/dev_runtime.cc @@ -0,0 +1,995 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "dev_runtime.h" +#include "comm.h" +#include "device.h" +#include "transport.h" +#include "group.h" +#include "nccl_device.h" + +NCCL_PARAM(WinStride, "WIN_STRIDE", -1); + +// Complete types from src/include/dev_runtime.h +struct ncclDevrMemory { + int refCount; + struct ncclDevrMemory* next; + CUmemGenericAllocationHandle memHandle; + size_t size; + size_t bigOffset; // offset in big VA space +}; + +struct ncclDevrWindowSorted { + uintptr_t userAddr; + size_t size; + struct ncclDevrWindow* win; +}; + +struct ncclDevrTeam { + struct ncclDevrTeam* next; + struct ncclTeam team; + CUmemGenericAllocationHandle mcHandle; + void* mcBasePtr; + int worldRankList[]; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Helpers at the bottom: + +// Find least index such that `arg < sorted[i].key` (least upper bound) +template +static int listFindSortedLub(Key Obj::*key, Obj* sorted, int count, Key arg); + +template +static void listInsert(Obj** list, int* capacity, int* count, int index, Obj val); + +template +static void listRemove(Obj* list, int* count, int index); + +//////////////////////////////////////////////////////////////////////////////// + +ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + if (devr->bigSize != 0) return ncclSuccess; + + bool lsaIsLocal = true; + for (int i=0; i < comm->localRanks; i++) { + lsaIsLocal &= comm->localRankToRank[i] == comm->localRankToRank[0] + i; + } + devr->lsaSelf = lsaIsLocal ? comm->localRank : 0; + devr->lsaSize = lsaIsLocal ? comm->localRanks : 1; + devr->lsaRankList = (int*)malloc(devr->lsaSize*sizeof(int)); + for (int i=0; i < devr->lsaSize; i++) { + devr->lsaRankList[i] = comm->rank + (i - devr->lsaSelf); + } + + CUmemAllocationProp memProp = {}; + memProp.type = CU_MEM_ALLOCATION_TYPE_PINNED; + memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + memProp.requestedHandleTypes = ncclCuMemHandleType; + memProp.location.id = comm->cudaDev; + CUCHECKGOTO(cuMemGetAllocationGranularity(&devr->granularity, &memProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED), ret, fail_lsaRankList); + + devr->bigSize = ncclParamWinStride(); + if (-devr->bigSize <= 1) { + devr->bigSize = 1; + for (int r=0; r < comm->nRanks; ++r) { + devr->bigSize = std::max(devr->bigSize, comm->peerInfo[r].totalGlobalMem); + } + } + devr->bigSize = alignUp(devr->bigSize, size_t(1)<<32); + INFO(NCCL_INIT, "Symmetric VA size=%ldGB", (long)devr->bigSize>>30); + + ncclSpaceConstruct(&devr->bigSpace); + ncclShadowPoolConstruct(&devr->shadows); + return ncclSuccess; + +fail_lsaRankList: + free(devr->lsaRankList); + return ret; +} + +static void symTeamDestroyAll(struct ncclComm* comm); // Further down + +ncclResult_t ncclDevrFinalize(struct ncclComm* comm) { + struct ncclDevrState* devr = &comm->devrState; + if (devr->bigSize == 0) return ncclSuccess; + + while (!ncclIntruQueueEmpty(&devr->regTaskQueue)) { + struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&devr->regTaskQueue); + free(task); + } + + symTeamDestroyAll(comm); + { // delete windowTable + cudaStream_t stream; + if (cudaSuccess == cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)) { + struct ncclDevCommWindowTable* tableDev = devr->windowTable; + while (tableDev != nullptr) { + struct ncclDevCommWindowTable* tableHost; + if (ncclSuccess != 
ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)) break; + struct ncclDevCommWindowTable* next = tableHost->next; + ncclShadowPoolFree(&devr->shadows, tableDev, stream); + tableDev = next; + } + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); + } + } + CUdeviceptr flatAddr = reinterpret_cast(devr->lsaFlatBase); + CUCHECKIGNORE(cuMemUnmap(flatAddr, devr->lsaSize*devr->bigSize)); + CUCHECKIGNORE(cuMemAddressFree(flatAddr, devr->lsaSize*devr->bigSize)); + ncclShadowPoolDestruct(&devr->shadows); + ncclSpaceDestruct(&devr->bigSpace); + free(devr->lsaRankList); + free(devr->winSorted); + return ncclSuccess; +} + +//////////////////////////////////////////////////////////////////////////////// + +static ncclResult_t symMemoryMapLsaTeam( + struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, size_t size, size_t bigOffset + ) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + CUmemAccessDesc accessDesc = {}; + union Message { + CUmemGenericAllocationHandle memHandle; + CUmemFabricHandle fabricHandle; + }; + + Message* messages = (Message*)calloc(devr->lsaSize, sizeof(Message)); + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + messages[devr->lsaSelf].memHandle = memHandle; + } else { + CUCHECKGOTO(cuMemExportToShareableHandle(&messages[devr->lsaSelf].fabricHandle, memHandle, ncclCuMemHandleType, 0), ret, fail); + } + + NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, devr->lsaRankList, devr->lsaSelf, devr->lsaSize, messages, sizeof(Message)), ret, fail); + + if (devr->lsaFlatBase == nullptr) { // Create on first need. + CUdeviceptr addr; + CUCHECKGOTO(cuMemAddressReserve(&addr, devr->lsaSize*devr->bigSize, NCCL_MAX_PAGE_SIZE, 0, 0), ret, fail); + devr->lsaFlatBase = reinterpret_cast(addr); + } + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = comm->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + for (int r = 0; r < devr->lsaSize; r++) { + CUmemGenericAllocationHandle impHandle; + if (r == devr->lsaSelf) { + impHandle = memHandle; + } else { + if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { + int fd = -1; + NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, devr->lsaRankList[r], &messages[r], &fd), ret, fail); + CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, reinterpret_cast((uintptr_t)fd), ncclCuMemHandleType), ret, fail); + SYSCHECKGOTO(close(fd), "close", ret, fail); + } else { + CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)&messages[r].fabricHandle, ncclCuMemHandleType), ret, fail); + } + } + CUdeviceptr addr = reinterpret_cast((char*)devr->lsaFlatBase + r*devr->bigSize + bigOffset); + CUCHECKGOTO(cuMemMap(addr, size, 0, impHandle, 0), ret, fail); + CUCHECKGOTO(cuMemSetAccess(addr, size, &accessDesc, 1), ret, fail); + if (r != devr->lsaSelf) { + CUCHECKGOTO(cuMemRelease(impHandle), ret, fail); + } + } + // Ensure everyone has imported my mem handle. 
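+  // After the loop above, LSA rank r's copy of this allocation is mapped at
+  // lsaFlatBase + r*bigSize + bigOffset, i.e. peers sit a fixed bigSize stride apart
+  // in the flat VA space (e.g. with bigSize = 4GiB, rank 2's copy of an allocation at
+  // bigOffset 0x1000 is at lsaFlatBase + 2*(4<<30) + 0x1000). ncclDevrGetLsaRankPtr
+  // relies on exactly this layout.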
+ NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, devr->lsaRankList, devr->lsaSelf, devr->lsaSize, 0xbeef), ret, fail); +leave: + free(messages); + return ret; +fail: + goto leave; +} + +static ncclResult_t symBindTeamMemory( + struct ncclComm* comm, struct ncclDevrTeam* tm, struct ncclDevrMemory* mem + ) { + if (comm->nvlsSupport && tm->mcBasePtr != nullptr) { + #if CUDART_VERSION >= 12010 + INFO(NCCL_NVLS, "Binding multicast memory at big=%lx to team {%d x %d}", mem->bigOffset, tm->team.nRanks, tm->team.stride); + CUCHECK(cuMulticastBindMem(tm->mcHandle, mem->bigOffset, mem->memHandle, 0, mem->size, 0)); + #endif + } + return ncclSuccess; +} + +static ncclResult_t symUnbindTeamMemory( + struct ncclComm* comm, struct ncclDevrTeam* tm, struct ncclDevrMemory* mem + ) { + if (comm->nvlsSupport && tm->mcBasePtr != nullptr) { + #if CUDART_VERSION >= 12010 + CUCHECK(cuMulticastUnbind(tm->mcHandle, comm->cudaDev, mem->bigOffset, mem->size)); + #endif + } + return ncclSuccess; +} + +// Caller must barrier the team afterward. +static ncclResult_t symTeamObtain( + struct ncclComm* comm, struct ncclTeam team, bool multimem, + struct ncclDevrTeam** outTeam + ) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + struct ncclDevrTeam* t = devr->teamHead; + bool teamIsNew = false; + while (true) { + if (t == nullptr) { + teamIsNew = true; + t = (struct ncclDevrTeam*)malloc(sizeof(struct ncclDevrTeam) + team.nRanks*sizeof(int)); + t->team = team; + t->mcHandle = 0x0; + t->mcBasePtr = nullptr; + for (int i=0; i < team.nRanks; i++) { + t->worldRankList[i] = comm->rank + (i - team.rank)*team.stride; + } + break; + } else if (t->team.rank == team.rank && t->team.nRanks == team.nRanks && t->team.stride == team.stride) { + if (!multimem || t->mcBasePtr != nullptr) { + // Matching team is sufficient + if (outTeam) *outTeam = t; + return ncclSuccess; + } + break; // Need to enable multimem + } + } + + if (multimem) { + if (!comm->nvlsSupport) { + WARN("Multicast support requested for team but none available on system."); + ret = ncclInvalidArgument; + goto fail; + } else { + #if CUDART_VERSION >= 12010 + CUmemGenericAllocationHandle mcHandle = 0; + CUdeviceptr mcAddr = 0; + CUmulticastObjectProp mcProp = {}; + char shareableHandle[NVLS_HANDLE_SIZE]; + + mcProp.numDevices = team.nRanks; + mcProp.handleTypes = ncclCuMemHandleType; + mcProp.flags = 0; + mcProp.size = devr->bigSize; + if (team.rank == 0) { + NCCLCHECKGOTO(ncclNvlsGroupCreate(comm, &mcProp, team.rank, team.nRanks, &mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, t->worldRankList, team.rank, team.nRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail_mcHandle); + } else { + NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, t->worldRankList, team.rank, team.nRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupConnect(comm, shareableHandle, t->worldRankList[0], &mcHandle), ret, fail); + } + + CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->cudaDev), ret, fail_mcHandle); + CUCHECKGOTO(cuMemAddressReserve(&mcAddr, devr->bigSize, NCCL_MAX_PAGE_SIZE, 0, 0), ret, fail_mcHandle); + CUCHECKGOTO(cuMemMap(mcAddr, devr->bigSize, 0, mcHandle, 0), ret, fail_mcHandle_mcAddr); + { CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = comm->cudaDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECKGOTO(cuMemSetAccess(mcAddr, devr->bigSize, 
&accessDesc, 1), ret, fail_mcHandle_mcAddr_unmap); + } + t->mcHandle = mcHandle; + t->mcBasePtr = reinterpret_cast(mcAddr); + + // Bind new team with all existing memories. + for (struct ncclDevrMemory* mem = devr->memHead; mem != nullptr; mem = mem->next) { + NCCLCHECKGOTO(symBindTeamMemory(comm, t, mem), ret, fail_mcHandle_mcAddr_unmap_mems); + } + + if (false) { // Error labels: + fail_mcHandle_mcAddr_unmap_mems: + for (struct ncclDevrMemory* mem = devr->memHead; mem != nullptr; mem = mem->next) { + symUnbindTeamMemory(comm, t, mem); + } + fail_mcHandle_mcAddr_unmap: + CUCHECKIGNORE(cuMemUnmap(mcAddr, devr->bigSize)); + goto fail_mcHandle_mcAddr; // silence unused label warning + fail_mcHandle_mcAddr: + CUCHECKIGNORE(cuMemAddressFree(mcAddr, devr->bigSize)); + goto fail_mcHandle; // silence unused label warning + fail_mcHandle: + CUCHECKIGNORE(cuMemRelease(mcHandle)); + goto fail; // silence unused label warning + } + #else + goto fail; // silence unused label warning + #endif + } + } + + if (teamIsNew) { + // Add to list + t->next = devr->teamHead; + devr->teamHead = t; + } + if (outTeam) *outTeam = t; + return ret; + +fail: + if (teamIsNew) free(t); + return ret; +} + +static void symTeamDestroyAll(struct ncclComm* comm) { + struct ncclDevrState* devr = &comm->devrState; + while (devr->teamHead != nullptr) { + struct ncclDevrTeam* t = devr->teamHead; + devr->teamHead = t->next; + if (t->mcBasePtr != nullptr) { + for (struct ncclDevrMemory* m = devr->memHead; m != nullptr; m = m->next) { + symUnbindTeamMemory(comm, t, m); + } + CUdeviceptr mcAddr = reinterpret_cast(t->mcBasePtr); + CUCHECKIGNORE(cuMemUnmap(mcAddr, devr->bigSize)); + CUCHECKIGNORE(cuMemAddressFree(mcAddr, devr->bigSize)); + CUCHECKIGNORE(cuMemRelease(t->mcHandle)); + } + free(t); + } +} + +// On success we take caller's reference on memHandle. +// Due to multicast binds for each pre-exiting team, this function requires +// caller do a world barrier before returning to user. +static ncclResult_t symMemoryObtain( + struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, size_t size, + struct ncclDevrMemory** outMem + ) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + int64_t bigOffset = 0; + + struct ncclDevrMemory* mem = devr->memHead; + while (mem != nullptr) { + if (mem->memHandle == memHandle) { + CUCHECKIGNORE(cuMemRelease(memHandle)); + goto leave; + } + mem = mem->next; + } + // New memory. + mem = (struct ncclDevrMemory*)malloc(sizeof(struct ncclDevrMemory)); + mem->refCount = 0; + mem->memHandle = memHandle; + mem->size = size; + + // Grab offset in the big space. + NCCLCHECKGOTO(ncclSpaceAlloc(&devr->bigSpace, devr->bigSize, size, devr->granularity, &bigOffset), ret, fail_mem); + mem->bigOffset = bigOffset; + + // Map unicast addresses into flat VA space for lsa team. + NCCLCHECKGOTO(symMemoryMapLsaTeam(comm, memHandle, size, bigOffset), ret, fail_mem_space); + + // Bind new memory with each existing team. + for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { + NCCLCHECKGOTO(symBindTeamMemory(comm, t, mem), ret, fail_mem_space_teams); + } + // Add to list of mems. 
+ mem->next = devr->memHead; + devr->memHead = mem; + +leave: + mem->refCount += 1; + *outMem = mem; + return ret; + +fail_mem_space_teams: + for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { + symUnbindTeamMemory(comm, t, mem); + } +fail_mem_space: + ncclSpaceFree(&devr->bigSpace, bigOffset, size); +fail_mem: + free(mem); +//fail: + return ret; +} + +static void symMemoryDropRef( + struct ncclComm* comm, struct ncclDevrMemory* mem + ) { + if (mem != nullptr && 0 == --mem->refCount) { + struct ncclDevrState* devr = &comm->devrState; + for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { + symUnbindTeamMemory(comm, t, mem); + } + for (int r = 0; r < devr->lsaSize; r++) { + CUdeviceptr addr = reinterpret_cast((char*)devr->lsaFlatBase + r*devr->bigSize + mem->bigOffset); + CUCHECKIGNORE(cuMemUnmap(addr, mem->size)); + } + ncclSpaceFree(&devr->bigSpace, mem->bigOffset, mem->size); + CUCHECKIGNORE(cuMemRelease(mem->memHandle)); + + struct ncclDevrMemory** ptr = &devr->memHead; + while (*ptr != mem) ptr = &(*ptr)->next; + *ptr = mem->next; // Remove from list. + + free(mem); + } +} + +static ncclResult_t symWindowTableInitOnce(struct ncclComm* comm, cudaStream_t stream) { + struct ncclDevrState* devr = &comm->devrState; + struct ncclDevCommWindowTable* tableDev = devr->windowTable; + if (tableDev == nullptr) { // Create on first need. + NCCLCHECK(ncclShadowPoolAlloc(&devr->shadows, &tableDev, nullptr, stream)); + devr->windowTable = tableDev; + } + return ncclSuccess; +} + +// On success we take callers reference on `mem`. +static ncclResult_t symWindowCreate( + struct ncclComm* comm, struct ncclDevrMemory* mem, + size_t memOffset, void* userPtr, size_t userSize, int winFlags, void* localReg, + struct ncclWindow_vidmem** outWinDev, struct ncclDevrWindow** outWin, + cudaStream_t stream + ) { + uintptr_t userAddr = reinterpret_cast(userPtr); + struct ncclDevrState* devr = &comm->devrState; + struct ncclDevrWindow* win; + + win = (struct ncclDevrWindow*)malloc(sizeof(struct ncclDevrWindow)); + memset(win, 0, sizeof(*win)); + win->memory = mem; + win->size = userSize; + win->bigOffset = mem->bigOffset + memOffset; + win->winFlags = winFlags; + win->localRegHandle = localReg; + if (userPtr == nullptr) { + // Null means caller has no VA and will use the lsa team flat VA address. 
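+    // That is lsaFlatBase + lsaSelf*bigSize + bigOffset: this rank's own slot in the
+    // flat LSA mapping established by symMemoryMapLsaTeam.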
+ win->userPtr = (char*)devr->lsaFlatBase + (devr->lsaSelf*devr->bigSize) + mem->bigOffset; + } else { + win->userPtr = userPtr; + } + + struct ncclWindow_vidmem* winDev; + struct ncclWindow_vidmem* winDevHost; + NCCLCHECK(ncclShadowPoolAlloc(&devr->shadows, &winDev, &winDevHost, stream)); + win->vidmem = winDev; + winDevHost->lsaFlatBase = (char*)devr->lsaFlatBase + win->bigOffset; + winDevHost->mcOffset4K = win->bigOffset>>12; + winDevHost->stride4G = devr->bigSize>>32; + winDevHost->lsaRank = devr->lsaSelf; + winDevHost->worldRank = comm->rank; + winDevHost->winHost = (void*)win; + CUDACHECK(cudaMemcpyAsync(winDev, winDevHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream)); + + NCCLCHECK(symWindowTableInitOnce(comm, stream)); // ensure devr->windowTable exists + struct ncclDevCommWindowTable* tableDev = devr->windowTable; + struct ncclDevCommWindowTable* tableHost; + NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)); + while (true) { + int i = 0; + while (i < 32 && tableHost->entries[i].window != nullptr) i += 1; + if (i < 32) { + tableHost->entries[i].base = userAddr; + tableHost->entries[i].size = userAddr + userSize; + tableHost->entries[i].window = winDev; + CUDACHECK(cudaMemcpyAsync(&tableDev->entries[i], &tableHost->entries[i], sizeof(tableHost->entries[i]), cudaMemcpyHostToDevice, stream)); + break; + } + if (tableHost->next == nullptr) { + NCCLCHECK(ncclShadowPoolAlloc(&devr->shadows, &tableHost->next, nullptr, stream)); + CUDACHECK(cudaMemcpyAsync(&tableDev->next, &tableHost->next, sizeof(tableHost->next), cudaMemcpyHostToDevice, stream)); + } + tableDev = tableHost->next; + NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost)); + } + + { // insert into winSorted[] + int i = listFindSortedLub(&ncclDevrWindowSorted::userAddr, devr->winSorted, devr->winSortedCount, userAddr); + struct ncclDevrWindowSorted winSort; + winSort.userAddr = userAddr; + winSort.size = userSize; + winSort.win = win; + listInsert(&devr->winSorted, &devr->winSortedCapacity, &devr->winSortedCount, i, winSort); + } + + if (outWinDev) *outWinDev = winDev; + if (outWin) *outWin = win; + return ncclSuccess; +} + +static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vidmem* winDev, cudaStream_t stream) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + struct ncclWindow_vidmem* winDevHost; + struct ncclDevrWindow* winHost; + + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, winDev, &winDevHost), ret, fail); + winHost = (struct ncclDevrWindow*)winDevHost->winHost; + + symMemoryDropRef(comm, winHost->memory); + + { struct ncclDevCommWindowTable* tableDev = devr->windowTable; + struct ncclDevCommWindowTable* tableHost; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted); + while (true) { + int i = 0; + while (i < 32 && tableHost->entries[i].window != winDev) i += 1; + if (i < 32) { + memset(&tableHost->entries[i], 0, sizeof(tableHost->entries[i])); + CUDACHECKGOTO(cudaMemsetAsync(&tableDev->entries[i], 0, sizeof(tableDev->entries[i]), stream), ret, remove_winSorted); + break; + } + if (tableHost->next == nullptr) break; // Error didn't find window in table + tableDev = tableHost->next; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost), ret, remove_winSorted); + } + } + NCCLCHECKGOTO(ncclShadowPoolFree(&devr->shadows, winDev, stream), ret, remove_winSorted); + + NCCLCHECKGOTO(ncclCommDeregister(comm, 
winHost->localRegHandle), ret, remove_winSorted); + +remove_winSorted: + { int i = listFindSortedLub(&ncclDevrWindowSorted::userAddr, devr->winSorted, devr->winSortedCount, reinterpret_cast(winHost->userPtr)); + i -= 1; // least upper bound is just after ours. + listRemove(devr->winSorted, &devr->winSortedCount, i); + } + free(winHost); +fail: + return ret; +} + +ncclResult_t ncclDevrWindowRegisterInGroup( + struct ncclComm* comm, + void* userPtr, size_t userSize, int winFlags, ncclWindow_t* outWinDev + ) { + ncclResult_t ret = ncclSuccess; + CUdeviceptr memAddr = 0; + size_t memSize = 0; + CUmemGenericAllocationHandle memHandle = 0x0; + size_t memOffset; + struct ncclDevrMemory* mem = nullptr; + cudaStream_t stream = nullptr; + void* localRegHandle = nullptr; + + NCCLCHECKGOTO(ncclCommRegister(comm, userPtr, userSize, &localRegHandle), ret, fail); + + if (!comm->symmetricSupport) { + // We just return the local registration handle directly in this case, as there's no reason to allocate the + // ncclWindow_vidmem structure on the device, etc. + *outWinDev = reinterpret_cast(localRegHandle); + return ncclSuccess; + } + if (winFlags & NCCL_WIN_COLL_SYMMETRIC) { + // Defer symmetric kernel init until at least one window with that flag exists. + NCCLCHECKGOTO(ncclSymkInitOnce(comm), ret, fail); + } + + // Get underlying cumem handle: + CUCHECKGOTO(cuMemGetAddressRange(&memAddr, &memSize, reinterpret_cast(userPtr)), ret, fail_locReg); + memOffset = reinterpret_cast(userPtr) - memAddr; + if (memOffset%NCCL_WIN_REQUIRED_ALIGNMENT != 0) { + WARN("Window address must be suitably aligned."); + ret = ncclInvalidArgument; + goto fail; + } + + CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, reinterpret_cast(memAddr)), ret, fail_locReg); + + // Trade cumem handle for ncclDevrMemory* + NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, memSize, &mem), ret, fail_locReg_memHandle); + memHandle = 0x0; // symMemoryObtain took our reference + + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); + + NCCLCHECKGOTO(symWindowCreate( + comm, mem, memOffset, userPtr, userSize, winFlags, localRegHandle, outWinDev, nullptr, stream + ), ret, fail_locReg_memHandle_mem_stream); + mem = nullptr; // symWindowCreate took our reference + + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_locReg_memHandle_mem_stream_win); + + // symWindowCreate needs barrier. 
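+  // The peer mappings (and any multicast binds) performed during registration are only
+  // guaranteed complete on every rank once this world barrier has passed, so the window
+  // should not be used before this point.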
+ NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail_locReg_memHandle_mem_stream_win); + + cudaStreamDestroy(stream); + return ret; + +fail_locReg_memHandle_mem_stream_win: + symWindowDestroy(comm, *outWinDev, stream); + *outWinDev = nullptr; + cudaStreamSynchronize(stream); +fail_locReg_memHandle_mem_stream: + cudaStreamDestroy(stream); + symMemoryDropRef(comm, mem); +fail_locReg_memHandle: + if (memHandle != 0x0) { CUCHECKIGNORE(cuMemRelease(memHandle)); } +fail_locReg: + ncclCommDeregister(comm, localRegHandle); +fail: + *outWinDev = nullptr; + return ret; +} + +static ncclResult_t deepCopyDevCommRequirements( + struct ncclDevCommRequirements const* src, + struct ncclDevCommRequirements** dst +) { + ncclResult_t ret = ncclSuccess; + struct ncclDevResourceRequirements **dstRes; + struct ncclTeamRequirements **dstTeam; + + NCCLCHECK(ncclCalloc(dst, 1)); + + /* copy the entire struct now and update linked lists later */ + **dst = *src; + + dstRes = &(*dst)->resourceRequirementsList; + for (struct ncclDevResourceRequirements* rr = src->resourceRequirementsList; rr != nullptr; rr = rr->next) { + NCCLCHECKGOTO(ncclCalloc(dstRes, 1), ret, fail); + (*dstRes)->bufferSize = rr->bufferSize; + (*dstRes)->bufferAlign = rr->bufferAlign; + (*dstRes)->outBufferHandle = rr->outBufferHandle; + dstRes = &(*dstRes)->next; + } + + dstTeam = &(*dst)->teamRequirementsList; + for (struct ncclTeamRequirements* tr = src->teamRequirementsList; tr != nullptr; tr = tr->next) { + NCCLCHECKGOTO(ncclCalloc(dstTeam, 1), ret, fail); + (*dstTeam)->team = tr->team; + (*dstTeam)->multimem = tr->multimem; + (*dstTeam)->outMultimemHandle = tr->outMultimemHandle; + dstTeam = &(*dstTeam)->next; + } + +exit: + return ret; +fail: + freeDevCommRequirements(*dst); + *dst = nullptr; + goto exit; +} + +void freeDevCommRequirements( + struct ncclDevCommRequirements* reqs +) { + if (reqs) { + while (reqs->resourceRequirementsList) { + struct ncclDevResourceRequirements* rr_next = reqs->resourceRequirementsList->next; + free(reqs->resourceRequirementsList); + reqs->resourceRequirementsList = rr_next; + } + + while (reqs->teamRequirementsList) { + struct ncclTeamRequirements* tr_next = reqs->teamRequirementsList->next; + free(reqs->teamRequirementsList); + reqs->teamRequirementsList = tr_next; + } + + free(reqs); + } +} + +ncclResult_t ncclDevrCommCreateInternal( + struct ncclComm* comm, + struct ncclDevCommRequirements const* reqs, struct ncclDevComm* outDevComm + ) { + ncclResult_t ret = ncclSuccess; + struct ncclDevrState* devr = &comm->devrState; + struct ncclTeam world = ncclTeamWorld(comm); + struct ncclTeam lsa = ncclTeamInnerFactor(world, devr->lsaSize); + struct ncclDevrTeam* tmLsa; + size_t bufSizeTotal; + struct ncclDevResourceRequirements* resReqsHead; + struct ncclDevResourceRequirements lsaBarReq; + cudaStream_t stream = nullptr; + CUmemGenericAllocationHandle memHandle = 0x0; + struct ncclDevrMemory* mem = nullptr; + struct ncclDevrWindow* win = nullptr; + struct ncclWindow_vidmem* winHost = nullptr; + + memset(outDevComm, 0, sizeof(*outDevComm)); + outDevComm->rank = comm->rank; + outDevComm->nRanks = comm->nRanks; + outDevComm->nRanks_rcp32 = idivRcp32(comm->nRanks); + outDevComm->lsaRank = devr->lsaSelf; + outDevComm->lsaSize = devr->lsaSize; + outDevComm->lsaSize_rcp32 = idivRcp32(devr->lsaSize); + + NCCLCHECKGOTO(symTeamObtain(comm, lsa, /*multicast=*/reqs->lsaMultimem, &tmLsa), ret, fail); + outDevComm->lsaMultimem.mcBasePtr = tmLsa->mcBasePtr; + + { struct 
ncclTeamRequirements* tr = reqs->teamRequirementsList; + while (tr != nullptr) { + if (tr->multimem) { + struct ncclDevrTeam* tm; + NCCLCHECKGOTO(symTeamObtain(comm, tr->team, tr->multimem, &tm), ret, fail); + if (tr->outMultimemHandle != nullptr) tr->outMultimemHandle->mcBasePtr = tm->mcBasePtr; + } + tr = tr->next; + } + } + + resReqsHead = reqs->resourceRequirementsList; + + ncclLsaBarrierCreateRequirement(lsa, reqs->lsaBarrierCount, &outDevComm->lsaBarrier, &lsaBarReq); + lsaBarReq.next = resReqsHead; + resReqsHead = &lsaBarReq; + + { struct ncclDevResourceRequirements* rr = resReqsHead; + bufSizeTotal = 0; + while (rr != nullptr) { + bufSizeTotal = alignUp(bufSizeTotal, std::max(128, rr->bufferAlign)); + if (rr->outBufferHandle != nullptr) *rr->outBufferHandle = bufSizeTotal/128; + bufSizeTotal += rr->bufferSize; + rr = rr->next; + } + bufSizeTotal = alignUp(bufSizeTotal, devr->granularity); + } + + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); + + NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail); // ensure devr->windowTable exists + outDevComm->windowTable = comm->devrState.windowTable; + + if (bufSizeTotal == 0) { + outDevComm->resourceWindow = nullptr; + outDevComm->resourceWindow_inlined = {}; + } else { + CUmemAllocationProp memProp = {}; + memProp.type = CU_MEM_ALLOCATION_TYPE_PINNED; + memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + memProp.requestedHandleTypes = ncclCuMemHandleType; + memProp.location.id = comm->cudaDev; + + CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail); + + NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, bufSizeTotal, &mem), ret, fail); + memHandle = 0x0; // Reference given to symMemoryObtain + + NCCLCHECKGOTO(symWindowCreate( // Requires world barrier afterward. 
+ comm, mem, /*memOffset=*/0, nullptr, bufSizeTotal, /*winFlags=*/0, + /*localReg=*/nullptr, &outDevComm->resourceWindow, &win, + stream), ret, fail); + mem = nullptr; // Reference given to symWindowCreate + NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, win->vidmem, &winHost), ret, fail); + outDevComm->resourceWindow_inlined = *winHost; + + CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail); + } + + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail); + + NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail); + + cudaStreamDestroy(stream); + return ret; + +fail: + if (win != nullptr) { + symWindowDestroy(comm, win->vidmem, stream); + cudaStreamSynchronize(stream); + } + if (mem != nullptr) { + symMemoryDropRef(comm, mem); + } + if (memHandle != 0x0) { + CUCHECKIGNORE(cuMemRelease(memHandle)); + } + if (stream != nullptr) { + cudaStreamDestroy(stream); + } + return ret; +} + +//////////////////////////////////////////////////////////////////////////////// + +NCCL_API(ncclResult_t, ncclCommWindowRegister, ncclComm_t comm, void* ptr, size_t size, ncclWindow_t* win, int winFlags); +ncclResult_t ncclCommWindowRegister( + struct ncclComm* comm, void* userPtr, size_t userSize, + struct ncclWindow_vidmem** outWinDev, int winFlags + ) { + ncclResult_t ret = ncclSuccess; + int saveDev; + struct ncclDevrRegTask* task; + + CUDACHECK(cudaGetDevice(&saveDev)); + NCCLCHECK(ncclGroupStartInternal()); + + if (userPtr == nullptr || userSize == 0 || !(comm->symmetricSupport || ncclParamLocalRegister())) goto exit; + + NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + + NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail); + + NCCLCHECKGOTO(ncclCalloc(&task, 1), ret, fail); + task->userPtr = userPtr; + task->userSize = userSize; + task->winFlags = winFlags; + task->outWinDev = outWinDev; + ncclIntruQueueEnqueue(&comm->devrState.regTaskQueue, task); + ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); + cudaSetDevice(saveDev); + return ret; +fail: + goto exit; +} + +NCCL_API(ncclResult_t, ncclCommWindowDeregister, ncclComm_t comm, ncclWindow_t win); +ncclResult_t ncclCommWindowDeregister(struct ncclComm* comm, struct ncclWindow_vidmem* winDev) { + ncclResult_t ret = ncclSuccess; + int saveDev; + cudaStream_t stream; + + if (winDev == nullptr) goto exit; + + if (!comm->symmetricSupport) { + NCCLCHECKGOTO(ncclCommDeregister(comm, winDev), ret, fail); + goto exit; + } + CUDACHECKGOTO(cudaGetDevice(&saveDev), ret, fail); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail_dev); + NCCLCHECKGOTO(symWindowDestroy(comm, winDev, stream), ret, fail_dev_stream); +fail_dev_stream: + cudaStreamSynchronize(stream); + cudaStreamDestroy(stream); +fail_dev: + cudaSetDevice(saveDev); +fail: +exit: + return ret; +} + +ncclResult_t ncclDevrFindWindow( + struct ncclComm* comm, void const* userPtr, struct ncclDevrWindow** outWin + ) { + struct ncclDevrState* devr = &comm->devrState; + uintptr_t userAddr = reinterpret_cast(userPtr); + int i = listFindSortedLub(&ncclDevrWindowSorted::userAddr, devr->winSorted, devr->winSortedCount, userAddr); + if (0 < i && (userAddr - devr->winSorted[i-1].userAddr < devr->winSorted[i-1].size)) { + *outWin = devr->winSorted[i-1].win; + } else { + *outWin = nullptr; + } + return ncclSuccess; 
+} + +NCCL_API(ncclResult_t, ncclDevCommCreate, ncclComm_t comm, ncclDevCommRequirements_t const* reqs, ncclDevComm_t* outDevComm); +ncclResult_t ncclDevCommCreate( + ncclComm_t comm, struct ncclDevCommRequirements const* reqs, + struct ncclDevComm* outDevComm + ) { + ncclResult_t ret = ncclSuccess; + int saveDev; + struct ncclDevrCommCreateTask* task = nullptr; + + CUDACHECK(cudaGetDevice(&saveDev)); + NCCLCHECK(ncclGroupStartInternal()); + + if (!comm->symmetricSupport) { + WARN("Communicator does not support symmetric memory!"); + ret = ncclInvalidUsage; + goto fail; + } + + NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); + CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); + + NCCLCHECKGOTO(ncclDevrInitOnce(comm), ret, fail); + + NCCLCHECKGOTO(ncclCalloc(&task, 1), ret, fail); + // reqs must be deep copied to the task so background threads can safely access it + NCCLCHECKGOTO(deepCopyDevCommRequirements(reqs, &task->reqs), ret, fail); + task->outDevComm = outDevComm; + ncclIntruQueueEnqueue(&comm->devrState.commCreateTaskQueue, task); + ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); + +exit: + ncclGroupErrCheck(ret); + NCCLCHECK(ncclGroupEndInternal()); + cudaSetDevice(saveDev); + return ret; +fail: + free(task); + goto exit; +} + +NCCL_API(ncclResult_t, ncclDevCommDestroy, ncclComm_t comm, ncclDevComm_t const* devComm); +ncclResult_t ncclDevCommDestroy( + struct ncclComm* comm, struct ncclDevComm const* devComm + ) { + //struct ncclDevrState* devr = &comm->devrState; + if (devComm->resourceWindow != nullptr) { + NCCLCHECK(ncclCommWindowDeregister(comm, devComm->resourceWindow)); + } + return ncclSuccess; +} + + +// Get the corresponding pointer in another lsa rank's symmetric memory window +ncclResult_t ncclDevrGetLsaRankPtr(struct ncclComm* comm, struct ncclDevrWindow* winHost, size_t offset, int lsaRank, void** outPtr) { + if (winHost == nullptr || outPtr == nullptr) { + return ncclInvalidArgument; + } + + struct ncclDevrState* devr = &comm->devrState; + + // Validate lsaRank is within bounds + if (lsaRank < 0 || lsaRank >= devr->lsaSize) { + return ncclInvalidArgument; + } + + // Validate offset is within bounds + if (offset < 0 || offset >= winHost->size) { + return ncclInvalidArgument; + } + + // Calculate the address with offset for the specified lsa rank + *outPtr = (void*)((uintptr_t)devr->lsaFlatBase + lsaRank * devr->bigSize + winHost->bigOffset + offset); + return ncclSuccess; +} + +// Get the multicast address for a given team +ncclResult_t ncclDevrGetLsaTeamPtrMC(struct ncclComm* comm, struct ncclDevrWindow* winHost, size_t offset, struct ncclTeam lsaTeam, void** outPtr){ + if (winHost == nullptr || outPtr == nullptr) { + return ncclInvalidArgument; + } + + if (!comm->nvlsSupport) { + return ncclInvalidUsage; + } + + bool multimem = true; + struct ncclDevrTeam* tm; + NCCLCHECK(symTeamObtain(comm, lsaTeam, multimem, &tm)); + + // Return the base multicast address for this team with offset + *outPtr = (void*)((uintptr_t)tm->mcBasePtr + winHost->bigOffset + offset); + return ncclSuccess; +} + +//////////////////////////////////////////////////////////////////////////////// + +// Find the least index strictly greater than arg. 
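+// For example, with keys {3, 7, 7, 10} and arg = 7 this returns index 3, the first
+// entry whose key is strictly greater than arg; if no such entry exists it returns count.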
+template +static int listFindSortedLub(Key Obj::*key, Obj* sorted, int count, Key arg) { + int lo = 0, hi = count; + while (lo + 16 < hi) { + int i = (lo + hi)/2; + if (sorted[i].*key <= arg) lo = i+1; + else hi = i; + } + int i = lo; + while (i < hi && sorted[i].*key <= arg) i++; + return i; +} + +template +static void listInsert(Obj** list, int* capacity, int* count, int index, Obj val) { + if (*capacity < *count + 1) { + *capacity *= 2; + if (*capacity == 0) *capacity = 16; + *list = (Obj*)realloc(*list, (*capacity)*sizeof(Obj)); + } + for (int j = *count; j != index; j--) { + (*list)[j] = (*list)[j-1]; + } + (*list)[index] = val; + *count += 1; +} + +template +static void listRemove(Obj* list, int* count, int index) { + for (int i = index; i+1 < *count; i++) { + list[i] = list[i+1]; + } + *count -= 1; +} + diff --git a/src/device/CMakeLists.txt b/src/device/CMakeLists.txt new file mode 100644 index 000000000..98447428d --- /dev/null +++ b/src/device/CMakeLists.txt @@ -0,0 +1,60 @@ +# Run the scripts once during configuration to get the file lists +execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}" + OUTPUT_VARIABLE files + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) +string(STRIP "${files}" files) +list(TRANSFORM files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/) + +execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}" + OUTPUT_VARIABLE symmetric_files + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} +) +string(STRIP "${symmetric_files}" symmetric_files) +list(TRANSFORM symmetric_files PREPEND ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric/) + +# Create custom commands to generate source files with proper dependencies +add_custom_command( + OUTPUT ${files} + BYPRODUCTS ${files} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc "${ONLY_FUNCS}" + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/generate.py + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Generating device source files" +) + +add_custom_command( + OUTPUT ${symmetric_files} + BYPRODUCTS ${symmetric_files} + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py ${CMAKE_CURRENT_BINARY_DIR}/gensrc/symmetric "${ONLY_FUNCS}" + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/symmetric/generate.py + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Generating symmetric device source files" +) + +# Add library target +add_library(nccl_device OBJECT + ${files} + ${symmetric_files} + ${CMAKE_CURRENT_SOURCE_DIR}/common.cu + ${CMAKE_CURRENT_SOURCE_DIR}/onerank.cu +) + +set_target_properties(nccl_device PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON +) + +# Set include directories for the target +target_include_directories(nccl_device PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/src/include + ${CMAKE_SOURCE_DIR}/src/include/plugin + ${CMAKE_BINARY_DIR}/include + ${CUDAToolkit_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS}/cccl +) + +add_dependencies(nccl_device nccl_header) diff --git a/src/device/Makefile b/src/device/Makefile index 67ab176ca..fd8f2759d 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -19,7 +19,7 @@ OBJDIR := $(BUILDDIR)/obj/device MANIFEST := $(OBJDIR)/manifest DEVGLUE_OBJ := $(OBJDIR)/device_glue.o -INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include +INCFLAGS = -I. -I.. 
-I$(BUILDDIR)/include -I../include -I../include/plugin NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" CXXFLAGS += $(INCFLAGS) @@ -47,7 +47,11 @@ endif define COMPILE_SYM @$(SAY) "Compiling" $2;\ mkdir -p $(dir $1);\ - $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1 + if [[ -n "$3" ]]; then\ + $(NVCC) $(NVCUFLAGS_SYM) $3 -dw $2 -o $1;\ + else\ + touch $2.empty.cu; $(NVCC) $(NVCUFLAGS_SYM) -dw $2.empty.cu -o $1; rm $2.empty.cu;\ + fi endef DEPENDS.cu = $(NVCC) $(NVCUFLAGS) -M -dc $1 diff --git a/src/device/common.h b/src/device/common.h index a2884b50c..a31cf5f8e 100644 --- a/src/device/common.h +++ b/src/device/common.h @@ -43,7 +43,7 @@ struct ncclShmemData { struct ncclDevKernelArgs args; int channelId; int aborted; - alignas(16) struct ncclDevComm comm; + alignas(16) struct ncclKernelComm comm; alignas(16) struct ncclDevChannel channel; int batchIx, nextBatchIx; @@ -323,7 +323,7 @@ __device__ __forceinline__ void profiler(int action) { ncclShmem.comm.workCompleted[ncclShmem.channelId].data[wc%MAX_PROFILER_EVENTS_PER_CHANNEL].counter = wc; } ncclShmem.channel.workCounter += ncclShmem.nWorks; - if (action == FINI) ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; + if (action == FINI) ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter = ncclShmem.channel.workCounter; } } } @@ -351,7 +351,7 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a /* set abort flag to 0 */ if (tid == 0) { ncclShmem.aborted = 0; - ncclShmem.channel.workCounter = ((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter; + ncclShmem.channel.workCounter = ((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId].workCounter; } // Use first 2 warps to load comm and channel, and remaining load work batch. 
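The two-warp load described in the comment above relies on each thread of a warp moving a single 16-byte vector per pass, which is why the hunk below bounds sizeof(ncclKernelComm) and sizeof(ncclDevChannel) to 16*WARP_SIZE bytes (512 bytes for a 32-thread warp). A minimal sketch of that pattern, using a hypothetical copy16PerThread helper rather than the real copyToShmem16:

__device__ inline void copy16PerThread(int tid, void* dst, const void* src, int bytes) {
  // Illustration only: each thread moves one 16-byte int4, so a 32-thread warp
  // covers at most 16*WARP_SIZE = 512 bytes per call (bytes assumed a multiple of 16).
  int offset = tid * 16;
  if (offset < bytes) {
    reinterpret_cast<int4*>(dst)[tid] = reinterpret_cast<const int4*>(src)[tid];
  }
}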
@@ -359,14 +359,14 @@ __device__ __forceinline__ void ncclKernelMain(struct ncclDevKernelArgs const* a case 0: { void* dst = &ncclShmem.comm; void* src = ncclShmem.args.comm; - int bytes = sizeof(ncclDevComm); - static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn."); + int bytes = sizeof(ncclKernelComm); + static_assert(sizeof(ncclKernelComm) <= 16*WARP_SIZE, "ncclKernelComm cannot be loaded by a single warp in one insn."); copyToShmem16(tid, dst, src, bytes); } break; case 1: - { // Get address of channel without incurring indirect load from ncclDevComm::channels + { // Get address of channel without incurring indirect load from ncclKernelComm::channels void* dst = &ncclShmem.channel; - void* src = &((ncclDevCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId]; + void* src = &((ncclKernelCommAndChannels*)ncclShmem.args.comm)->channels[ncclShmem.channelId]; int bytes = sizeof(ncclDevChannel); static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn."); copyToShmem16(tid-WARP_SIZE, dst, src, bytes); diff --git a/src/device/generate.py b/src/device/generate.py index f9c3a0e79..aefba9422 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os import sys +import shutil # Order of redops, tys, protos, algos must match src/include/device.h all_colls = ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"] @@ -17,8 +18,11 @@ if os.path.exists(gensrc): for name in os.listdir(gensrc): - os.remove(os.path.join(gensrc, name)) - #os.truncate(os.path.join(gensrc, name), 0) + path = os.path.join(gensrc, name) + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + shutil.rmtree(path) else: os.mkdir(gensrc) @@ -322,6 +326,16 @@ def partition_by_name(fns): name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop") name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic") +files = "" +for name in sorted(name_to_funcs.keys()): + files += name + ";" +files += "device_table.cu;" +files += "host_table.cc" + +# Do not print files when running make +if os.environ.get("NCCL_USE_CMAKE", "0") == "1": + print(files) + # Generate /rules.mk with open(os.path.join(gensrc, "rules.mk"), "w") as f: out = f.write diff --git a/src/device/symmetric/all_gather.cuh b/src/device/symmetric/all_gather.cuh index 8f81347ec..9f050836c 100644 --- a/src/device/symmetric/all_gather.cuh +++ b/src/device/symmetric/all_gather.cuh @@ -1,32 +1,33 @@ -#include "symmetric.h" -#include "symmetric/kernel.cuh" -#include "symmetric/primitives.cuh" +#include "sym_kernels.h" +#include "kernel.cuh" +#include "primitives.cuh" template static __device__ void bcastDeep( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - char* inputHere, char* outputRank0, bool inPlace, int nIters + ncclSymkArgsHandler const& handler, int tn, int t, + bool waitNeeded, ncclLsaBarrierSession& bar, + ncclSymPtr input, ncclSymPtr output, bool inPlace, int nIters ) { using Pack = BytePack; int wn = tn/WARP_SIZE; int w = t/WARP_SIZE; int lane = t%WARP_SIZE; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - Pack* inpHere = (Pack*)inputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; - Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + int const& rank = handler.comm.rank; + int const& nRanks = 
handler.comm.nRanks; + + Pack* inpPacks = (Pack*)input.localPtr() + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + ncclSymPtr outPacks = (ncclSymPtr)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; Pack tmp[UnrollPacks]; nIters -= w; if (0 < nIters) { #pragma unroll for (int u=0; u < UnrollPacks; u++) { - tmp[u] = inpHere[u*WARP_SIZE]; + tmp[u] = inpPacks[u*WARP_SIZE]; } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); if (0 < nIters) { while (true) { @@ -44,21 +45,21 @@ static __device__ void bcastDeep( if (partial && dr == nRanks) break; #pragma unroll UnrollPacks for (int u=0; u < UnrollPacks; u++) { - add4G(outRank0, r*stride4G)[u*WARP_SIZE] = tmp[u]; + outPacks.lsaPtr(r)[u*WARP_SIZE] = tmp[u]; } if (++r == nRanks) r = 0; } } } - inpHere += intptr_t(wn)*UnrollPacks*WARP_SIZE; - outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; nIters -= wn; if (nIters <= 0) break; // Load data for next iteration. #pragma unroll for (int u=0; u < UnrollPacks; u++) { - tmp[u] = inpHere[u*WARP_SIZE]; + tmp[u] = inpPacks[u*WARP_SIZE]; } } } @@ -66,18 +67,17 @@ static __device__ void bcastDeep( template static __device__ void bcastEnds( - ncclSymPrims& prim, int tn, int t, - T* inputHere, T* outputRank0, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts + ncclSymkArgsHandler const& handler, int tn, int t, + ncclSymPtr input, ncclSymPtr output, bool inPlace, size_t nElts, uint32_t nPreElts, size_t nSufElts ) { - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - BytePack* inpHere = (BytePack*)inputHere; - BytePack* outRank0 = (BytePack*)outputRank0; + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + BytePack* inpPacks = (BytePack*)input.localPtr(); + ncclSymPtr> outPacks = (ncclSymPtr>)output; #pragma unroll 1 for (size_t i = t; i < nPreElts+nSufElts; i += tn) { size_t elt = i < nPreElts ? i : nElts-nPreElts-nSufElts+i; - BytePack tmp = inpHere[elt]; + BytePack tmp = inpPacks[elt]; int dr = inPlace ? 
1 : 0; int r = rank + dr; if (r == nRanks) r = 0; @@ -85,14 +85,14 @@ static __device__ void bcastEnds( for (; dr + UnrollPeers <= nRanks; dr += UnrollPeers) { #pragma unroll UnrollPeers for (int u=0; u < UnrollPeers; u++) { - *add4G(outRank0+elt, r*stride4G) = tmp; + outPacks.lsaPtr(r)[elt] = tmp; if (++r == nRanks) r = 0; } } #pragma unroll UnrollPeers for (int u=0; u < UnrollPeers; u++) { if (dr+u == nRanks) break; - *add4G(outRank0+elt, r*stride4G) = tmp; + outPacks.lsaPtr(r)[elt] = tmp; if (++r == nRanks) r = 0; } } @@ -100,95 +100,99 @@ static __device__ void bcastEnds( template static __device__ void bcast( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, T* input, T* output, size_t nElts + ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks, + bool waitNeeded, ncclLsaBarrierSession& bar, + ncclSymPtr input, ncclSymPtr output, size_t nElts ) { bool inPlace = (input == output); - // Mpve to rank=0 - output = prim.peerPtr(0, output); - - uintptr_t inputUptr = reinterpret_cast(input); - uintptr_t outputUptr = reinterpret_cast(output); size_t nBytes = nElts*sizeof(T); + uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks); - uint32_t nPreBytes = (128u - inputUptr)%128u; + uint32_t nPreBytes = (16 - input.offset)%16; nPreBytes = min((size_t)nPreBytes, nBytes); uintptr_t cursor = nPreBytes; constexpr int MinWarpPerBlock = 4; - if ((inputUptr-outputUptr)%16 == 0) { + if ((input.offset - output.offset)%16 == 0) { constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32); + chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; bcastDeep( - prim, tn, t, waitNeeded, - (char*)input + cursor, (char*)output + cursor, inPlace, - chunks*MinWarpPerBlock + handler, tn, t, waitNeeded, bar, + (ncclSymPtr)input + cursor, + (ncclSymPtr)output + cursor, + inPlace, chunks*MinWarpPerBlock ); cursor = cursorAfter; waitNeeded = false; } } - if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) { + if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) { constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, prim.nBlocks, prim.nBlocks_rcp32); + chunks -= imodFast32(chunks, nBlocks, nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; bcastDeep<(sizeof(T) <= BytePerPack ? 
BytePerPack : 0), UnrollPacks, UnrollPeers>( - prim, tn, t, waitNeeded, - (char*)input + cursor, (char*)output + cursor, inPlace, - chunks*MinWarpPerBlock + handler, tn, t, waitNeeded, bar, + (ncclSymPtr)input + cursor, + (ncclSymPtr)output + cursor, + inPlace, chunks*MinWarpPerBlock ); cursor = cursorAfter; waitNeeded = false; } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); constexpr int UnrollPeers = 8; size_t nSufElts = (nBytes-cursor)/sizeof(T); - bcastEnds(prim, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts); + bcastEnds(handler, tn, t, input, output, inPlace, nElts, nPreBytes/sizeof(T), nSufElts); } -__device__ __forceinline__ void ncclSymRun_AllGather_ST(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); - int const& rank = prim.rank; +__device__ __forceinline__ void ncclSymkRun_AllGather_ST(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x + }; + int const& rank = handler.comm.rank; - // Threads numbered over rank. - int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int btn = prim.nBlocks*blockDim.x; + bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed); - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bool waitNeeded = true; + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Threads numbered over rank. + int bt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int btn = nBlocks*blockDim.x; - bcast(prim, btn, bt, /*waitNeeded=*/true, (char*)args->input, (char*)args->output + rank*args->nElts, args->nElts); + bcast(handler, btn, bt, nBlocks, waitNeeded, bar, input, output + rank*nAllElts, nElts); - prim.barrierArrive(ncclCoopCta(), /*release=*/true); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); -} + waitNeeded = false; + } + ); + bar.sync(ncclCoopCta(), cuda::memory_order_release); +} template static __device__ void bcastMultimem( - ncclSymPrims& prim, int tn, int t, T* input, T* output, size_t nElts + ncclSymkArgsHandler& handler, int tn, int t, ncclSymPtr input, ncclSymPtr output, size_t nElts ) { - // Move output to multimem - output = prim.multimemPtr(output); - - uintptr_t inputUptr = reinterpret_cast(input); - uintptr_t outputUptr = reinterpret_cast(output); size_t nBytes = nElts*sizeof(T); - - uint32_t nPreBytes = (16-inputUptr)%16; + uintptr_t inputUptr = reinterpret_cast(input.localPtr()); + uintptr_t outputUptr = reinterpret_cast(output.multimemPtr(handler.comm.lsaMultimem)); + uint32_t nPreBytes = (16 - input.offset)%16; nPreBytes = min((size_t)nPreBytes, nBytes); uintptr_t nSufBytes; @@ -227,51 +231,52 @@ static __device__ void bcastMultimem( uintptr_t cursor = i < nPreBytes ? 
i : nBytes-nSufBytes+(i-nPreBytes); BytePack val = *reinterpret_cast*>(inputUptr + cursor); multimem_st_global(outputUptr + cursor, val); - cursor += tn*sizeof(T); } } -__device__ __forceinline__ void ncclSymRun_AllGather_STMC(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); - int const& rank = prim.rank; - - char* input = args->input; - char* output = args->output; - size_t bytes = args->nElts; - // Round robin memory to blocks. - int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int tn = prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - bcastMultimem(prim, tn, t, input, output + rank*bytes, bytes); +__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar( + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true + ); + int const& rank = handler.comm.rank; + + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Round robin memory to blocks. + int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = nBlocks*blockDim.x; + + bcastMultimem(handler, tn, t, input, output + rank*nAllElts, nElts); + } + ); - prim.barrierArrive(ncclCoopCta(), /*release=*/true); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bar.sync(ncclCoopCta(), cuda::memory_order_release); } template static __device__ void allgather_LL_body( - ncclSymPrims &prim, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts + ncclSymkArgsHandler& handler, ncclLLA2ASession& lla2a, + EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts ) { using Pack = BytePack<8>; constexpr int EltPerPack = 8/sizeof(EltType); - - ncclCoopCta cta; - int rank = prim.rank; - int nRanks = prim.nRanks; - constexpr int tn = ncclSymMaxThreads; + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; int t = threadIdx.x; + constexpr int tn = ncclSymkMaxThreads; #pragma unroll 1 while (0 < nElts) { int nIterPacks = min(nPacks, tn); if (t < nIterPacks) { Pack x = loadPack(input, t*EltPerPack, nElts); - prim.bcastLL(/*slot=*/nIterPacks*rank + t, x); + lla2a.bcast(/*slot=*/nIterPacks*rank + t, x); } int tn_div_nPacks = tn/nIterPacks; @@ -284,7 +289,7 @@ static __device__ void allgather_LL_body( #pragma unroll 1 for (int i = t; i < (nRanks*nIterPacks & -(Unroll*tn)); i += Unroll*tn) { Pack got[Unroll]; - prim.template recvLL(i, Unroll, tn, /*&*/got); + lla2a.template recvUnrolled(i, Unroll, tn, /*&*/got); #pragma unroll for (int u=0; u < Unroll; u++) { storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got[u]); @@ -299,7 +304,7 @@ static __device__ void allgather_LL_body( if (i + n*tn < nRanks*nIterPacks) n += 1; if (n != 0) { Pack got[Unroll]; - prim.template recvLL<1, Unroll>(i, n, tn, /*&*/got); + lla2a.template recvUnrolled<1, Unroll>(i, n, tn, /*&*/got); #pragma unroll for (int u=0; u < Unroll; u++) { if (u != 0 && u == n) break; @@ -313,7 +318,7 @@ static __device__ void allgather_LL_body( // The non-unrolled but "obviously correct" implementation for reference. 
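        // Both the unrolled path above and the reference loop below rely on the same slot
        // layout: each rank publishes its nIterPacks packs at slots
        // [rank*nIterPacks, (rank+1)*nIterPacks), so a flat slot index i decodes as
        // peer = i/nIterPacks and pack = i%nIterPacks; the loops carry peer/pack
        // incrementally (peer += tn/nIterPacks, pack += tn%nIterPacks) to avoid divisions.
        // A rough decoding sketch, with the incremental bookkeeping replaced by explicit
        // division purely for illustration:
        //   for (int i = t; i < nRanks*nIterPacks; i += tn) {
        //     int peer = i / nIterPacks, pack = i % nIterPacks; // producing rank, and which of its packs
        //     Pack got = /* receive slot i, as in the loop below */;
        //     storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got);
        //   }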
#pragma unroll 1 for (int i = t; i < nRanks*nIterPacks; i += tn) { - Pack got = prim.template recvLL(i); + Pack got = lla2a.template recv(i); storePack(output + peer*nStrideElts, pack*EltPerPack, nElts, got); peer += tn_div_nPacks; pack += tn_mod_nPacks; @@ -321,7 +326,7 @@ static __device__ void allgather_LL_body( } #endif - prim.endLL(cta); + lla2a.endEpoch(ncclCoopCta()); input += tn*EltPerPack; output += tn*EltPerPack; @@ -330,38 +335,41 @@ static __device__ void allgather_LL_body( } } -static __device__ void ncclSymRun_AllGather_LL_impl(ncclSymDevArgs const* args, bool multimem) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem); +static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* args, bool multimem) { + ncclSymkArgsHandler handler{args}; + ncclLLA2ASession lla2a( + ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, /*maxElts=*/ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem + ); + using Pack = BytePack<8>; constexpr int BytePerPack = 8; - int nElts = args->nElts; - int nPacks = divUp(nElts, BytePerPack); - - uint32_t nPackPerBlock, nPackModBlock; - idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32); - int blockPackBegin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); - int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0); - int nBlockPacks = blockPackEnd - blockPackBegin; - int nBlockElts = nElts - blockPackBegin*BytePerPack; - nBlockElts = min(nBlockElts, nBlockPacks*BytePerPack); - char* blockInput = args->input + blockPackBegin*BytePerPack; - char* blockOutput = args->output + blockPackBegin*BytePerPack; - - uint32_t lowBits = args->nElts; - lowBits |= (uint32_t)reinterpret_cast(args->input); - lowBits |= (uint32_t)reinterpret_cast(args->output); - if (__builtin_expect(lowBits%8 == 0, true)) { - // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us - allgather_LL_body(prim, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, nBlockElts/8, nBlockPacks, nElts/8); - } else { - allgather_LL_body(prim, blockInput, blockOutput, nBlockElts, nBlockPacks, nElts); - } + + handler.singleWork( + [&]__device__(int nElts, int nAllElts, + ncclSymPtr input, ncclSymPtr output) { + int nPacks = divUp(nElts, BytePerPack); + + char* blockInput = input.localPtr(); + char* blockOutput = output.localPtr(); + + uint32_t lowBits = nElts; + lowBits |= (uintptr_t)blockInput; + lowBits |= (uintptr_t)blockOutput; + if (__builtin_expect(lowBits%8 == 0, true)) { + // NOTE: Specializing for 8-byte alignment in one case help at size=65K: 8.9us vs 5.6us + allgather_LL_body(handler, lla2a, (BytePack<8>*)blockInput, (BytePack<8>*)blockOutput, + nElts/8, nPacks, nAllElts/8); + } else { + allgather_LL_body(handler, lla2a, blockInput, blockOutput, nElts, nPacks, nAllElts); + } + } + ); } -__device__ __forceinline__ void ncclSymRun_AllGather_LL(ncclSymDevArgs const* args) { - ncclSymRun_AllGather_LL_impl(args, /*multimem=*/false); +__device__ __forceinline__ void ncclSymkRun_AllGather_LL(ncclSymkDevWorkArgs const* args) { + ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/false); } -__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(ncclSymDevArgs const* args) { - ncclSymRun_AllGather_LL_impl(args, /*multimem=*/true); +__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(ncclSymkDevWorkArgs const* args) { + ncclSymkRun_AllGather_LL_impl(args, /*multimem=*/true); } diff --git 
a/src/device/symmetric/all_reduce.cuh b/src/device/symmetric/all_reduce.cuh index 6c5219784..94e40babb 100644 --- a/src/device/symmetric/all_reduce.cuh +++ b/src/device/symmetric/all_reduce.cuh @@ -1,35 +1,38 @@ -#include "symmetric.h" -#include "symmetric/kernel.cuh" -#include "symmetric/primitives.cuh" +#include "sym_kernels.h" +#include "nccl_device.h" +#include "kernel.cuh" +#include "primitives.cuh" template static __device__ __forceinline__ void allreduceDeep( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - Red red, char* inputRank0, char* outputRank0, int32_t nIters + ncclSymkArgsHandler const& handler, int tn, int t, + bool waitNeeded, ncclLsaBarrierSession& bar, + Red red, ncclSymPtr input, ncclSymPtr output, int32_t nIters ) { using Pack = BytePack; using Acc = typename Red::EltType; using AccPack = BytePack; + ncclTeam world = ncclTeamWorld(handler.comm); int wn = tn/WARP_SIZE; int w = t/WARP_SIZE; int lane = t%WARP_SIZE; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; - Pack* outRank0 = (Pack*)outputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + + ncclSymPtr inpPacks = (ncclSymPtr)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + ncclSymPtr outPacks = (ncclSymPtr)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; Pack acc0[UnrollPacks]; nIters -= w; if (0 < nIters) { #pragma unroll for (int u=0; u < UnrollPacks; u++) { - acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE]; } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); if (0 < nIters) { while (true) { @@ -39,7 +42,7 @@ static __device__ __forceinline__ void allreduceDeep( { Pack tmp1[UnrollPacks]; #pragma unroll for (int u=0; u < UnrollPacks; u++) { - tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE]; } #pragma unroll for (int u=0; u < UnrollPacks; u++) { @@ -64,7 +67,7 @@ static __device__ __forceinline__ void allreduceDeep( if (partial && ur!=0 && dr+ur == nRanks) break; #pragma unroll UnrollPacks for (int u=0; u < UnrollPacks; u++) { - tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE]; } if (++r == nRanks) r = 0; } @@ -95,22 +98,22 @@ static __device__ __forceinline__ void allreduceDeep( if (partial && dr == nRanks) break; #pragma unroll UnrollPacks for (int u=0; u < UnrollPacks; u++) { - add4G(outRank0, r*stride4G)[u*WARP_SIZE] = acc0[u]; + outPacks.peerPtr(world, r)[u*WARP_SIZE] = acc0[u]; } if (++r == nRanks) r = 0; } } } - inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; - outRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; + inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; nIters -= wn; if (nIters <= 0) break; // Load data for next iteration. 
#pragma unroll for (int u=0; u < UnrollPacks; u++) { - acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE]; } } } @@ -118,21 +121,23 @@ static __device__ __forceinline__ void allreduceDeep( template static __device__ __forceinline__ void allreduceEnds( - ncclSymPrims& prim, int tn, int t, Red red, - T* inputRank0, T* outputRank0, size_t nElts, uint32_t nPreElts, size_t nSufElts + ncclSymkArgsHandler const& handler, int tn, int t, Red red, + ncclSymPtr input, ncclSymPtr output, + size_t nElts, uint32_t nPreElts, size_t nSufElts ) { using Acc = typename Red::EltType; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - BytePack* inpRank0 = (BytePack*)inputRank0; - BytePack* outRank0 = (BytePack*)outputRank0; + ncclTeam world = ncclTeamWorld(handler.comm); + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + + ncclSymPtr> inpPacks = (ncclSymPtr>)input; + ncclSymPtr> outPacks = (ncclSymPtr>)output; #pragma unroll 1 for (size_t i = t; i < nPreElts+nSufElts; i += tn) { size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i; - BytePack acc0 = *add4G(inpRank0+elt, rank*stride4G); + BytePack acc0 = inpPacks.peerPtr(world, rank)[elt]; BytePack acc1; BytePack tmp[UnrollPeers]; int dr = 1; @@ -151,7 +156,7 @@ static __device__ __forceinline__ void allreduceEnds( #pragma unroll for (int u=0; u < UnrollPeers-partial; u++) { if (partial && u!=0 && dr+u == nRanks) break; - tmp[u] = *add4G(inpRank0+elt, r*stride4G); + tmp[u] = inpPacks.peerPtr(world, r)[elt]; r += 1; if (r == nRanks) r = 0; } @@ -179,7 +184,7 @@ static __device__ __forceinline__ void allreduceEnds( #pragma unroll for (int u=0; u < UnrollPeers-partial; u++) { if (partial && dr+u == nRanks) break; - *add4G(outRank0+elt, r*stride4G) = acc0; + outPacks.peerPtr(world, r)[elt] = acc0; r += 1; if (r == nRanks) r = 0; } @@ -190,35 +195,33 @@ static __device__ __forceinline__ void allreduceEnds( template static __device__ void allreduce( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - Red red, T* input, T* output, size_t nElts + ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks, + bool waitNeeded, ncclLsaBarrierSession& bar, + Red red, ncclSymPtr input, ncclSymPtr output, size_t nElts ) { - int nRanks = prim.nRanks; - int nBlocks = prim.nBlocks; - // Mpve to rank=0 - input = prim.peerPtr(0, input); - output = prim.peerPtr(0, output); - - uintptr_t inputUptr = reinterpret_cast(input); - uintptr_t outputUptr = reinterpret_cast(output); + int const& nRanks = handler.comm.nRanks; + int const& nRanks_rcp32 = handler.nRanks_rcp32; size_t nBytes = nElts*sizeof(T); + uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks); + uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32); - uint32_t nPreBytes = (16u - inputUptr)%16u; + uint32_t nPreBytes = (16u - input.offset)%16u; nPreBytes = min((size_t)nPreBytes, nBytes); uintptr_t cursor = nPreBytes; constexpr int MinWarpPerBlock = 4; - if ((inputUptr-outputUptr)%16 == 0) { + if ((input.offset - output.offset)%16 == 0) { constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32); if (chunks != 0) { uintptr_t 
cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; allreduceDeep( - prim, tn, t, waitNeeded, red, - (char*)input + cursor, (char*)output + cursor, + handler, tn, t, waitNeeded, bar, red, + (ncclSymPtr)input + cursor, + (ncclSymPtr)output + cursor, chunks*MinWarpPerBlock ); cursor = cursorAfter; @@ -226,16 +229,17 @@ static __device__ void allreduce( } } - if (sizeof(T) == 4 || (sizeof(T) < 4 && (inputUptr-outputUptr)%4 == 0)) { + if (sizeof(T) == 4 || (sizeof(T) < 4 && (input.offset - output.offset)%4 == 0)) { constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; allreduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>( - prim, tn, t, waitNeeded, red, - (char*)input + cursor, (char*)output + cursor, + handler, tn, t, waitNeeded, bar, red, + (ncclSymPtr)input + cursor, + (ncclSymPtr)output + cursor, chunks*MinWarpPerBlock ); cursor = cursorAfter; @@ -243,46 +247,51 @@ static __device__ void allreduce( } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); constexpr int UnrollPeers = 8; size_t nSufElts = (nBytes-cursor)/sizeof(T); - allreduceEnds(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); + allreduceEnds(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); } - template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); - int /*const&*/ rank = prim.rank; - int /*const&*/ nRanks = prim.nRanks; - Red::Type> red(args->redOpArg); - - // Threads numbered globally such that we round robin warps by rank then block. - int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - rank, nRanks, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int gtn = nRanks*prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - allreduce(prim, gtn, gt, /*waitNeeded=*/true, red, (T*)args->input, (T*)args->output, args->nElts); - - prim.barrierArrive(ncclCoopCta(), /*release=*/true); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); -} +__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x + }; + + Red::Type> red(handler.devWork->redOpArg); + + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed); + + bool waitNeeded = true; + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Threads numbered globally such that we round robin warps by rank then block. 
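      // Given flattenIx(pos, size, more...) == pos + size*flattenIx(more...), the call
      // below expands (innermost index varying fastest) to
      //   gt = lane + WARP_SIZE*(rank + nRanks*(block + nBlocks*warp))
      // with lane = threadIdx.x%WARP_SIZE and warp = threadIdx.x/WARP_SIZE, so the
      // gtn = nRanks*nBlocks*blockDim.x global threads walk ranks first, then blocks,
      // then warps. A minimal sketch of the same mapping with hypothetical fixed sizes
      // (nRanks = 8, nBlocks = 4), for illustration only:
      //   __device__ int globalThread(int lane, int rank, int block, int warp) {
      //     return lane + WARP_SIZE*(rank + 8*(block + 4*warp));
      //   }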
+ int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + rank, nRanks, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int gtn = nRanks*nBlocks*blockDim.x; + + allreduce(handler, gtn, gt, nBlocks, waitNeeded, bar, red, input, output, nElts); + + waitNeeded = false; + } + ); + + bar.sync(ncclCoopCta(), cuda::memory_order_release); +} template static __device__ void allreduceMultimem( - ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts + int tn, int t, Red red, T* input, T* output, size_t nElts ) { - // Mpve to multimem - input = prim.multimemPtr(input); - output = prim.multimemPtr(output); - uintptr_t inputUptr = reinterpret_cast(input); uintptr_t outputUptr = reinterpret_cast(output); size_t nBytes = nElts*sizeof(T); @@ -327,106 +336,132 @@ static __device__ void allreduceMultimem( uintptr_t cursor = i < nPreBytes ? i : nBytes-nSufBytes+(i-nPreBytes); BytePack val = applyLoadMultimem(red, inputUptr + cursor); multimem_st_global(outputUptr + cursor, val); - cursor += tn*sizeof(T); } } template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); - Red::Type> red(args->redOpArg); - - // Threads numbered globally such that we round robin warps by rank then block. - int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.rank, prim.nRanks, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int gtn = prim.nRanks*prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - allreduceMultimem(prim, gtn, gt, red, (T*)args->input, (T*)args->output, args->nElts); +__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true + }; + + Red::Type> red(handler.devWork->redOpArg); + + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + auto const& multimem = handler.comm.lsaMultimem; + + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Threads numbered globally such that we round robin warps by rank then block. 
+ int gt = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + rank, nRanks, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int gtn = nRanks*nBlocks*blockDim.x; + + allreduceMultimem(gtn, gt, red, input.multimemPtr(multimem), output.multimemPtr(multimem), nElts); + } + ); - prim.barrierArrive(ncclCoopCta(), /*release=*/true); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bar.sync(ncclCoopCta(), cuda::memory_order_release); } template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R_impl(ncclSymDevArgs const* args, bool multimem) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseLL | multimem*ncclSymPrims_UseMultimem); - int /*const&*/ rank = prim.rank; - using Acc = typename ncclSymAccumType::Type; - Red red(args->redOpArg); +__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R_impl(ncclSymkDevWorkArgs const* args, bool multimem) { + ncclSymkArgsHandler handler{args}; + ncclLLA2ASession lla2a( + ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, + blockIdx.x, ncclSymkMaxThreads, multimem, handler.comm.lsaMultimem + ); + + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + using Acc = typename ncclSymkAccumType::Type; + Red red(handler.devWork->redOpArg); using Pack = BytePack<8>; using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>; constexpr int EltPerPack = 8/sizeof(T); - int nElts = args->nElts; - int nPacks = divUp(nElts, EltPerPack); - - bool packAligned = 8 <= alignof(T) || ( - args->nElts*sizeof(T) | - (uint32_t)reinterpret_cast(args->input) | - (uint32_t)reinterpret_cast(args->output) - )%8 == 0; - - uint32_t nPackPerBlock, nPackModBlock; - idivmodFast32(&nPackPerBlock, &nPackModBlock, nPacks, prim.nBlocks, prim.nBlocks_rcp32); - int begin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); - int end = begin + nPackPerBlock + (prim.block < nPackModBlock ? 
1 : 0); - - nPacks = end - begin; - nElts -= begin*EltPerPack; - nElts = min(nElts, nPacks*EltPerPack); - T* input = (T*)args->input + begin*EltPerPack; - T* output = (T*)args->output + begin*EltPerPack; - - ncclCoopCta cta; - int t = threadIdx.x; - int tn = ncclSymMaxThreads; - - if (__builtin_expect(packAligned, true)) { - #pragma unroll 1 - while (0 < nPacks) { - if (t < nPacks) { - int nIterPacks = min(nPacks, tn); - Pack inp = loadPack((Pack*)input, t, nPacks); - prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp); - Pack out = prim.template recvReduceLL(t, nIterPacks, red); - storePack((Pack*)output, t, nPacks, out); - } - prim.endLL(cta); - input += tn*EltPerPack; - output += tn*EltPerPack; - nPacks -= tn; - } - } else { - #pragma unroll 1 - while (0 < nElts) { - if (t*EltPerPack < nElts) { - int nIterPacks = min(nPacks, tn); - Pack inp = loadPack(input, t*EltPerPack, nElts); - prim.bcastLL(/*slot=*/nIterPacks*rank + t, inp); - Pack out = prim.template recvReduceLL(t, nIterPacks, red); - storePack(output, t*EltPerPack, nElts, out); - } - prim.endLL(cta); + handler.singleWork( + [&]__device__(int nElts, int nAllElts, + ncclSymPtr inputPtr, ncclSymPtr outputPtr) { + int nPacks = divUp(nElts, EltPerPack); + + T* input = (T*)inputPtr.localPtr(); + T* output = (T*)outputPtr.localPtr(); + + bool packAligned = 8 <= alignof(T) || (nElts*sizeof(T) | (uintptr_t)input | (uintptr_t)output)%8 == 0; + + ncclCoopCta cta; + int t = threadIdx.x; + int tn = ncclSymkMaxThreads; + + if (__builtin_expect(packAligned, true)) { + #pragma unroll 1 + while (0 < nPacks) { + if (t < nPacks) { + int nIterPacks = min(nPacks, tn); + Pack inp = loadPack((Pack*)input, t, nPacks); + lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp); + AccPack out = lla2a.template recvReduce( + /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks, + /*eltToAcc=*/[&] __device__ (Pack x)->AccPack { + return applyCast(x); + }, + /*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack { + return applyReduce(red, a, b); + } + ); + storePack((Pack*)output, t, nPacks, applyCast(out)); + } + lla2a.endEpoch(cta); - input += tn*EltPerPack; - output += tn*EltPerPack; - nElts -= tn*EltPerPack; - nPacks -= tn; - } - } + input += tn*EltPerPack; + output += tn*EltPerPack; + nPacks -= tn; + } + } else { + #pragma unroll 1 + while (0 < nElts) { + if (t*EltPerPack < nElts) { + int nIterPacks = min(nPacks, tn); + Pack inp = loadPack(input, t*EltPerPack, nElts); + lla2a.bcast(/*slot=*/nIterPacks*rank + t, inp); + AccPack out = lla2a.template recvReduce( + /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks, + /*eltToAcc=*/[&] __device__ (Pack x)->AccPack { + return applyCast(x); + }, + /*reduce=*/[&] __device__ (AccPack a, AccPack b)->AccPack { + return applyReduce(red, a, b); + } + ); + storePack(output, t*EltPerPack, nElts, applyCast(out)); + } + lla2a.endEpoch(cta); + + input += tn*EltPerPack; + output += tn*EltPerPack; + nElts -= tn*EltPerPack; + nPacks -= tn; + } + } + } + ); } template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(ncclSymDevArgs const* args) { - ncclSymRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/false); +__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(ncclSymkDevWorkArgs const* args) { + ncclSymkRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/false); } + template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(ncclSymDevArgs const* args) { - ncclSymRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/true); +__device__ 
__forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(ncclSymkDevWorkArgs const* args) { + ncclSymkRun_AllReduce_AGxLL_R_impl(args, /*multimem=*/true); } diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py index 8fcb9a425..8e62bda5b 100755 --- a/src/device/symmetric/generate.py +++ b/src/device/symmetric/generate.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import os import sys +import shutil ################################################################################ # The first command line argument is the path to the directory to generate and @@ -10,8 +11,11 @@ if os.path.exists(gensrc): for name in os.listdir(gensrc): - os.remove(os.path.join(gensrc, name)) - #os.truncate(os.path.join(gensrc, name), 0) + path = os.path.join(gensrc, name) + if os.path.isfile(path): + os.remove(path) + elif os.path.isdir(path): + shutil.rmtree(path) else: os.mkdir(gensrc) @@ -94,7 +98,7 @@ def enumerate_kernels(): yield Rec(coll="ReduceScatter", algo=algo, red=red, ty=ty) def required_cuda(k): - cudart, arch, specific_sms = 0, 0, None + cudart, arch, specific_sms = 0, 600, None is_nvls = k.algo in nvls_algos_by_coll.get(k.coll, []) if is_nvls: cudart = max(cudart, 12010) @@ -133,13 +137,13 @@ def kernel_gencode(k): def kernel_cname(k): if k.coll in reductions: - return paste("_", "ncclSymDevKernel", k.coll, k.algo, k.red, k.ty) + return paste("_", "ncclSymkDevKernel", k.coll, k.algo, k.red, k.ty) else: - return paste("_", "ncclSymDevKernel", k.coll, k.algo) + return paste("_", "ncclSymkDevKernel", k.coll, k.algo) def kernel_conds(k): cudart, arch, specific_sms = required_cuda(k) - if cudart == 0: return (None, None) + if cudart == 0 and arch == 0: return (None, None) cudart_cond = "CUDART_VERSION >= %d"%cudart if not specific_sms: @@ -152,30 +156,30 @@ def instantiate(k): cudart_cond, arch_cond = kernel_conds(k) if (cudart_cond, arch_cond) == (None, None): form_red_ty = ( - "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" - " ncclSymRun_{id}<{red}, {ty}>(&args);\n" + "__global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const args4K) {{\n" + " ncclSymkRun_{id}<{red}, {ty}>(&args4K.args);\n" "}}" ) form = ( - "__global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" - " ncclSymRun_{id}(&args);\n" + "__global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const args4K) {{\n" + " ncclSymkRun_{id}(&args4K.args);\n" "}}" ) else: form_red_ty = ( "#if {cudart_cond}\n" - " __global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " __global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const args4K) {{\n" " #if {arch_cond}\n" - " ncclSymRun_{id}<{red}, {ty}>(&args);\n" + " ncclSymkRun_{id}<{red}, {ty}>(&args4K.args);\n" " #endif\n" " }}\n" "#endif" ) form = ( "#if {cudart_cond}\n" - " __global__ void {cname}(ncclSymDevArgs NCCL_GRID_CONSTANT const args) {{\n" + " __global__ void {cname}(ncclSymkDevWorkArgs4K NCCL_GRID_CONSTANT const args4K) {{\n" " #if {arch_cond}\n" - " ncclSymRun_{id}(&args);\n" + " ncclSymkRun_{id}(&args4K.args);\n" " #endif\n" " }}\n" "#endif" @@ -192,11 +196,11 @@ def instantiate(k): def prototype(k): cudart_cond, arch_cond = kernel_conds(k) if cudart_cond is None: - form = "__global__ void {cname}(ncclSymDevArgs const);" + form = "__global__ void {cname}(ncclSymkDevWorkArgs4K const);" else: form = ( "#if {cudart_cond}\n" - " __global__ void {cname}(ncclSymDevArgs const);\n" + " __global__ void {cname}(ncclSymkDevWorkArgs4K const);\n" "#else\n" " constexpr void* 
{cname} = nullptr;\n" "#endif" @@ -223,18 +227,20 @@ def partition(vals, keyfn): if (fname, coll) not in kernels_by_file: kernels_by_file[fname, coll] = [] +files_to_print = "" # Generate each kernel instantiation file for (fname, coll), ks in kernels_by_file.items(): + files_to_print += fname + ";" with open(os.path.join(gensrc, fname), "w") as f: - emitln(f, '#include "symmetric.h"') + emitln(f, '#include "sym_kernels.h"') emitln(f, '#include "symmetric/kernel.cuh"') emitln(f, '#include "symmetric/{coll}.cuh"'.format(coll=coll_to_lower[coll])) for k in ks: emitln(f, instantiate(k)) -# Generate /symmetric_host.cc -with open(os.path.join(gensrc, "symmetric_kernels.cc"), "w") as f: - emitln(f, '#include "symmetric.h"') +# Generate /sym_kernels_host.cc +with open(os.path.join(gensrc, "sym_kernels_host.cc"), "w") as f: + emitln(f, '#include "sym_kernels.h"') emitln(f, '#include "device.h"') emitln(f, '') @@ -242,19 +248,19 @@ def partition(vals, keyfn): emitln(f, prototype(k)) emitln(f, '') - emitln(f, 'extern int const ncclSymKernelCount = %d;' % len(list(enumerate_kernels()))) - emitln(f, 'extern void* const ncclSymKernelList[] = {') + emitln(f, 'extern int const ncclSymkKernelCount = %d;' % len(list(enumerate_kernels()))) + emitln(f, 'extern void* const ncclSymkKernelList[] = {') for k in enumerate_kernels(): emitln(f, '(void*){cname},'.format(cname=kernel_cname(k))) emitln(f, 'nullptr};') emitln(f, '') - emitln(f, 'void* ncclSymGetKernelPtr(ncclSymKernelId id, int red, ncclDataType_t ty) {') + emitln(f, 'void* ncclSymkGetKernelPtr(ncclSymkKernelId id, int red, ncclDataType_t ty) {') indents += 1 emitln(f, 'switch (id) {') emitln(f, 'default: return nullptr;') for (coll, algo), coll_algo_ks in partition(enumerate_kernels(), lambda k: (k.coll, k.algo)).items(): - emitln(f, 'case ncclSymKernelId_'+coll+'_'+algo+':') + emitln(f, 'case ncclSymkKernelId_'+coll+'_'+algo+':') indents += 1 if len(coll_algo_ks) == 1: emitln(f, 'return (void*)&'+kernel_cname(coll_algo_ks[0])+';') @@ -277,9 +283,15 @@ def partition(vals, keyfn): emitln(f, '}') # Generate /rules.mk +files_to_print += "rules.mk;" +files_to_print += "sym_kernels_host.cc;" + +if os.environ.get("NCCL_USE_CMAKE", "0") == "1": + print(files_to_print) + with open(os.path.join(gensrc, "rules.mk"), "w") as f: inst_names = sorted(set(kernel_fname(k) for k in enumerate_kernels())) - names = inst_names + ["symmetric_kernels.cc"] + names = inst_names + ["sym_kernels_host.cc"] f.write("LIB_OBJS_SYM_GEN = $(patsubst %,$(OBJDIR)/genobj/symmetric/%.o,{names})\n" .format(names=" ".join(names))) f.write("\n") diff --git a/src/device/symmetric/kernel.cuh b/src/device/symmetric/kernel.cuh index f631d51d9..bff67e460 100644 --- a/src/device/symmetric/kernel.cuh +++ b/src/device/symmetric/kernel.cuh @@ -1,27 +1,27 @@ #ifndef NCCL_DEVICE_SYMMETRIC_KERNEL_H_ #define NCCL_DEVICE_SYMMETRIC_KERNEL_H_ -#include "symmetric.h" +#include "sym_kernels.h" template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLL_R(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLL_R(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_AGxLLMC_R(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllReduce_AGxLLMC_R(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLD_AGxST(struct ncclSymDevArgs const* args); 
+__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLD_AGxST(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllReduce_RSxLDMC_AGxSTMC(struct ncclSymkDevWorkArgs const* args); -__device__ __forceinline__ void ncclSymRun_AllGather_LL(struct ncclSymDevArgs const* args); -__device__ __forceinline__ void ncclSymRun_AllGather_LLMC(struct ncclSymDevArgs const* args); -__device__ __forceinline__ void ncclSymRun_AllGather_ST(struct ncclSymDevArgs const* args); -__device__ __forceinline__ void ncclSymRun_AllGather_STMC(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllGather_LL(struct ncclSymkDevWorkArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllGather_LLMC(struct ncclSymkDevWorkArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllGather_ST(struct ncclSymkDevWorkArgs const* args); +__device__ __forceinline__ void ncclSymkRun_AllGather_STMC(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(struct ncclSymkDevWorkArgs const* args); template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(struct ncclSymDevArgs const* args); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(struct ncclSymkDevWorkArgs const* args); #endif diff --git a/src/device/symmetric/primitives.cuh b/src/device/symmetric/primitives.cuh index 167024400..73305d54c 100644 --- a/src/device/symmetric/primitives.cuh +++ b/src/device/symmetric/primitives.cuh @@ -1,11 +1,11 @@ #ifndef NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_ #define NCCL_DEVICE_SYMMETRIC_PRIMITIVES_H_ -#include "symmetric.h" +#include "sym_kernels.h" #include "bitops.h" #include "collectives.h" -#include "op128.h" -#include "reduce_kernel.h" +#include "../op128.h" +#include "../reduce_kernel.h" #if __CUDA_ARCH__ >= 700 // __grid_constant__ appears to break cuda-gdb @@ -24,397 +24,124 @@ static __device__ Int0 flattenIx(Int0 pos, Int1 size, Ints ...more) { return pos + size*flattenIx(more...); } -// Precomputed integer reciprocoals for denominator values 1..64 inclusive. -// Pass these to idivFast64() for fast division on the GPU. 
-static __device__ uint64_t idivRcp64_upto64(int x) { - static constexpr uint64_t table[65] = { - idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03), - idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07), - idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b), - idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f), - idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13), - idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17), - idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b), - idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f), - idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23), - idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27), - idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b), - idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f), - idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33), - idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37), - idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b), - idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f), - idivRcp64(0x40) - }; - return table[x]; -} - -static __device__ uint32_t idivRcp32_upto64(int x) { - return idivRcp64_upto64(x)>>32; -} - namespace { -struct ncclCoopCta { - __device__ void sync() { __syncthreads(); } - __device__ int self() { return threadIdx.x; } - __device__ int count() { return blockDim.x; } -}; -struct ncclCoopWarps { - int log2_nWarps; - __device__ void sync() { - asm volatile("barrier.sync %0, %1;" :: "r"(1 + (threadIdx.x>>(5+log2_nWarps))), "r"(32<kcomm.devComm), + lsaLLA2A(args->kcomm.lsaLLA2A) { + channelWorkRange = args->getWorkRange(); + + devWork = args->getWorks(args->nMaxChannels); + nRanks_rcp32 = comm.nRanks_rcp32; } - __device__ int self() { return threadIdx.x & ((32<= 12030 && __CUDA_ARCH__ >= 900 - cudaGridDependencySynchronize(); - #endif - - if ((flags & ncclSymPrims_UseBarrier) && threadIdx.x < nRanks) { - barEpoch = (flags & ncclSymPrims_UseMultimem) ? base->barEpochMc[block] : base->barEpochUc[block]; - } - if (flags & ncclSymPrims_UseLL) llEpoch = base->llEpoch[block] + 2; - } - __device__ ~ncclSymPrims() { - if (threadIdx.x == 0) { - if (flags & ncclSymPrims_UseBarrier) { - ((flags & ncclSymPrims_UseMultimem) ? base->barEpochMc : base->barEpochUc)[block] = barEpoch; - } - if (flags & ncclSymPrims_UseLL) base->llEpoch[block] = llEpoch - 2; + template + __device__ void getWorkRange(int block, + uint16_t& workLo, size_t& indexLo, uint16_t& workHi, size_t& indexHi) { + constexpr int EltPerCell = NCCL_SYM_KERNEL_CELL_SIZE / sizeof(T); + uint32_t fracLo, fracHi; + + // Where the work begins + workLo = (block==0) ? 0 : channelWorkRange[block-1].workHi; // start where predecessor ends + fracLo = (block==0) ? 0 : channelWorkRange[block-1].fracHi + 1; + // If the predecessor ended on the work boundary, then we step to the beginning of the next work. + // This ensures we never have empty parts. 
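    // The fracLo/fracHi values are 16.16 fixed-point positions within the work: a
    // fraction f in [0, 0x10000] maps to the cell-aligned element index
    //   index(f) = ((f * divUp(nElts, EltPerCell)) >> 16) * EltPerCell
    // as computed below. A worked example with illustrative values (assuming
    // EltPerCell = 128 and nElts = 1000, so divUp(nElts, EltPerCell) = 8 cells):
    //   f = 0x0000  -> (0x0000*8)>>16  = 0 cells -> index 0
    //   f = 0x8000  -> (0x8000*8)>>16  = 4 cells -> index 512
    //   f = 0x10000 -> (0x10000*8)>>16 = 8 cells -> min(1024, nElts) = 1000
    // so channel boundaries always fall on cell multiples, apart from the final
    // clamp to nElts on the high end.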
+ if (fracLo == 0x10000) { + workLo++; + fracLo = 0; } - } + struct ncclSymkDevWork const& dw = devWork[workLo]; + indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; - template - __device__ T* peerPtr(int peer, T* selfPtr) { - return add4G(selfPtr, (peer-rank)*stride4G); + // Where the work ends + workHi = channelWorkRange[block].workHi; + fracHi = channelWorkRange[block].fracHi + 1; + indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); } template - __device__ T* multimemPtr(T* selfPtr) { - return reinterpret_cast(reinterpret_cast(selfPtr) + offsetMc); + __device__ void getWorkRangeFused(int blockIdx, int w, + int& block, int& nBlocks, size_t& indexLo, size_t& indexHi) { + constexpr int EltPerCell = NCCL_SYM_KERNEL_CELL_SIZE / sizeof(T); + struct ncclSymkDevWork const& dw = devWork[w]; + uint32_t fracLo, fracHi; + int lastBlock; + + block = blockIdx - dw.sChannelId; + nBlocks = dw.nChannels; + lastBlock = dw.sChannelId+dw.nChannels-1; + + // Where the work begins + fracLo = (dw.sChannelId==0) ? 0 : ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF); + indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; + fracHi = (channelWorkRange[lastBlock].workHi == w) ? channelWorkRange[lastBlock].fracHi + 1 : 0x10000; + indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); } - __device__ void barrierArrive(ncclCoopCta cta, bool release) { - cta.sync(); - #if __CUDA_ARCH__ < 700 - if (release) { - if (cta.self() == 0) __threadfence_system(); - cta.sync(); - } - #endif - if (flags & ncclSymPrims_UseMultimem) { - #if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010 - if (cta.self() == 0) { - uint32_t* inbox = &multimemPtr(base)->barInboxMc[block]; - if (release) { - asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox)); - } else { - asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox)); - } - } - #endif - } else { - int r = cta.self(); - if (r != rank && r < nRanks) { - uint32_t* inbox = &peerPtr(r, base)->barInboxPerPeer[block*nRanks + rank]; - #if __CUDA_ARCH__ >= 700 - if (release) { - asm volatile("st.release.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); - } else { - asm volatile("st.relaxed.sys.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); - } - #else - asm volatile("st.volatile.u32 [%0],%1;" :: "l"(inbox), "r"(barEpoch+1)); - #endif - } - } - } + template + __device__ void forEachWork(Fn const& fn) { + uint16_t workLo, workHi; + size_t indexLo, indexHi; - __device__ void barrierWait(ncclCoopCta cta, bool acquire) { - if (flags & ncclSymPrims_UseMultimem) { - #if __CUDA_ARCH__ >= 900 - if (cta.self() == 0) { - uint32_t* inbox = &base->barInboxMc[block]; - while (true) { - uint32_t got; - if (acquire) { - asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - } else { - asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - } - if (got-(barEpoch+nRanks) <= uint32_t(-1)>>1) break; - } - barEpoch += nRanks; - } - #endif - } else { - int r = cta.self(); - if (r != rank && r < nRanks) { - uint32_t* inbox = &base->barInboxPerPeer[block*nRanks + r]; - while (true) { - uint32_t got; - #if __CUDA_ARCH__ >= 700 - if (acquire) { - asm volatile("ld.acquire.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - } else { - asm volatile("ld.relaxed.sys.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - } - #else - asm volatile("ld.volatile.u32 %0,[%1];" : "=r"(got) : "l"(inbox)); - #endif - if (got-(barEpoch+1) <= uint32_t(-1)>>1) break; - } - } - 
#if __CUDA_ARCH__ < 700 - if (acquire) { - cta.sync(); - if (cta.self() == 0) __threadfence(); - } - #endif - barEpoch += 1; - } - cta.sync(); - } + getWorkRange(blockIdx.x, workLo, indexLo, workHi, indexHi); - __device__ void endLL(ncclCoopCta cta) { - if (__builtin_expect(llEpoch >= -2u, false)) { - cta.sync(); - uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch); - int epochSize = ncclSymLLEpochSize(nRanks); - #pragma unroll 4 - for (int i=cta.self(); i*16 < epochSize; i += cta.count()) { - buf[i] = uint4{0, 0, 0, 0}; - } - } - cta.sync(); - llEpoch += (llEpoch == -1u) ? 3 : 1; - } - - template - __device__ void sendLL(int peer, int slot, T val) { - union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; - tmp = val; - uint4* buf = ncclSymDevBase_getLLBuf(peerPtr(peer, base), nRanks, block, llEpoch) + slot; - #pragma unroll - for (int u=0; u < divUp(sizeof(T),8); u++) { - asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); - } - } - - template - __device__ void bcastLL(int slot, T val) { - if (flags & ncclSymPrims_UseMultimem) { - union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; - tmp = val; - uint4* bufmc = ncclSymDevBase_getLLBuf(multimemPtr(base), nRanks, block, llEpoch) + slot; - #pragma unroll - for (int u=0; u < divUp(sizeof(T),8); u++) { - asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(bufmc + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); - } - } else { - union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; - tmp = val; - uint4* buf0 = ncclSymDevBase_getLLBuf(peerPtr(0, base), nRanks, block, llEpoch) + slot; - int dr = 0; - int r = rank; + size_t currentIndexLo = indexLo; #pragma unroll 1 - for (; dr+8 <= nRanks; dr += 8) { - #pragma unroll - for (int ur=0; ur < 8; ur++) { - uint4* buf = add4G(buf0, r*stride4G); - #pragma unroll - for (int u=0; u < divUp(sizeof(T),8); u++) { - asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); - } - r += 1; - if (r == nRanks) r = 0; - } - } - #pragma unroll - for (int ur=0; ur < 8; ur++, dr++) { - if (dr == nRanks) break; - uint4* buf = add4G(buf0, r*stride4G); - #pragma unroll - for (int u=0; u < divUp(sizeof(T),8); u++) { - asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: "l"(buf + ncclSymLLMaxSlots(sizeof(T))*u), "r"(u32[u][0]), "r"(u32[u][1]), "r"(llEpoch)); + for (int w = workLo; w <= workHi; w++) { + struct ncclSymkDevWork const& dw = devWork[w]; + size_t const& nAllElts = dw.nElts; + size_t currentIndexHi; + int block, nBlocks; + if (blockIdx.x >= dw.sChannelId && blockIdx.x < dw.sChannelId + dw.nChannels) { + getWorkRangeFused(blockIdx.x, w, block, nBlocks, currentIndexLo, currentIndexHi); + } else { + currentIndexHi = (w < workHi) ? 
nAllElts : indexHi; + block = 0; + nBlocks = 1; } - r += 1; - if (r == nRanks) r = 0; - } - } - } - template - __device__ void recvLL(int slot0, int nSlots, int stride, T(&elts)[nSlotsMax]) { - uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0; - uint4 tmp[nSlotsMax][divUp(sizeof(T),8)]; - //int spins=0; - while (true) { - #pragma unroll - for (int u=0; u < nSlotsMax; u++) { - if (u < nSlotsMin || u < nSlots) { - #pragma unroll - for (int v=0; v < divUp(sizeof(T),8); v++) { - asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w) : "l"(buf + u*stride + v*ncclSymLLMaxSlots(sizeof(T)))); - } - } - } - bool okAll = true; - #pragma unroll - for (int u=0; u < nSlotsMax; u++) { - #pragma unroll - for (int v=0; v < divUp(sizeof(T),8); v++) { - if (u < nSlotsMin || u < nSlots) { - bool ok = tmp[u][v].y == llEpoch && - tmp[u][v].w == llEpoch; - okAll &= ok; - } - } - } - if (__builtin_expect(okAll, true)) break; - //if (spins++ == 10<<20) spins=0; - } - #pragma unroll - for (int u=0; u < nSlotsMax; u++) { - if (nSlotsMin <= u && u == nSlots) break; - union { T val; uint32_t u32[divUp(sizeof(T),8)][2]; }; - #pragma unroll - for (int v=0; v < divUp(sizeof(T),8); v++) { - u32[v][0] = tmp[u][v].x; - u32[v][1] = tmp[u][v].z; - } - elts[u] = val; - } - } + fn(block, nBlocks, currentIndexHi - currentIndexLo, nAllElts, + ncclSymPtr(dw.inputWin, dw.inputOff) + currentIndexLo, + ncclSymPtr(dw.outputWin, dw.outputOff) + currentIndexLo); - template - __device__ Pack recvReduceLL(int slot, int stride, Red red) { - using Acc = typename Red::EltType; - using AccPack = BytePack; - AccPack acc; - bool first = true; - int r = 0; - #pragma unroll 1 - for (; r+Unroll <= nRanks; r += Unroll) { - Pack got[Unroll]; - this->template recvLL(slot + r*stride, Unroll, stride, got); - AccPack acc0 = applyCast(got[0]); - acc = first ? acc0 : applyReduce(red, acc, acc0); - first = false; - #pragma unroll - for (int i=1; i < Unroll; i++) acc = applyReduce(red, acc, applyCast(got[i])); - } - if (r < nRanks) { - Pack got[Unroll]; - this->template recvLL(slot + r*stride, nRanks-r, stride, got); - AccPack acc0 = applyCast(got[0]); - acc = first ? 
acc0 : applyReduce(red, acc, acc0); - #pragma unroll - for (int i=1; i < Unroll-1; i++) { - if (r+i < nRanks) acc = applyReduce(red, acc, applyCast(got[i])); + currentIndexLo = 0; } - } - return applyCast(acc); } - template - __device__ T recvLL(int slot) { - T one[1]; - this->template recvLL<1, 1, T>(slot, 1, 0, one); - return one[0]; - } + template + __device__ void singleWork(Fn const& fn) { + uint16_t w; + size_t indexLo, indexHi; - template - __device__ void coopRecvLL(Coop coop, int slot0, int nSlots, T* dst) { - int me = coop.self(); - if (me < nSlots) { - uint4* buf = ncclSymDevBase_getLLBuf(base, nRanks, block, llEpoch) + slot0 + me; - uint4 got[divUp(sizeof(T), 8)]; - //int spins=0; - #pragma unroll 1 - while (true) { - #pragma unroll - for (int u=0; u < divUp(sizeof(T), 8); u++) { - asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" : "=r"(got[u].x), "=r"(got[u].y), "=r"(got[u].z), "=r"(got[u].w) : "l"(buf + u*ncclSymLLMaxSlots(sizeof(T)))); - } - bool ok = true; - #pragma unroll - for (int u=0; u < divUp(sizeof(T), 8); u++) { - ok &= got[u].y == llEpoch; - ok &= got[u].w == llEpoch; - } - if (__builtin_expect(ok, true)) break; - //if (++spins == 10<<20) { spins=0; printf("r=%d LL spin @ ix=%d got=%d want=%d\n", rank, slot0+me, got[0].y, llEpoch); } - } - union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; }; - #pragma unroll - for (int u=0; u < divUp(sizeof(T), 8); u++) { - u32[u][0] = got[u].x; - u32[u][1] = got[u].z; - } - dst[slot0 + me] = val; - } + getWorkRange(blockIdx.x, w, indexLo, w, indexHi); + + struct ncclSymkDevWork const& dw = devWork[w]; + + fn(indexHi - indexLo, dw.nElts, + ncclSymPtr(dw.inputWin, dw.inputOff) + indexLo, + ncclSymPtr(dw.outputWin, dw.outputOff) + indexLo); } }; } template typename Red, typename T, bool nvls> -struct ncclSymAccumType { using Type = T; }; +struct ncclSymkAccumType { using Type = T; }; // Only Red's whose opArg is invariant w.r.t. the datatype can have a different // accumulator type. At the moment this excludes integer min/max, sumpostdiv, // and premulsum. 
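// For such ops the kernels load packs of T, cast them to the wider accumulator,
// reduce in Acc, and cast back to T only when storing (see the applyCast/applyReduce
// lambdas in the LL paths). A scalar-only sketch of that pattern, using half with a
// float accumulator purely as an illustration (cuda_fp16.h types):
//   __device__ __half sumWithFloatAccum(const __half* vals, int n) {
//     float acc = __half2float(vals[0]);           // widen once on load
//     for (int i = 1; i < n; i++) acc += __half2float(vals[i]);
//     return __float2half(acc);                    // narrow once on store
//   }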
-template<> struct ncclSymAccumType { using Type = float; }; +template<> struct ncclSymkAccumType { using Type = float; }; #if defined(__CUDA_BF16_TYPES_EXIST__) -template<> struct ncclSymAccumType { using Type = float; }; +template<> struct ncclSymkAccumType { using Type = float; }; #endif #if defined(__CUDA_FP8_TYPES_EXIST__) -template<> struct ncclSymAccumType { using Type = float; }; -template<> struct ncclSymAccumType { using Type = float; }; +template<> struct ncclSymkAccumType { using Type = float; }; +template<> struct ncclSymkAccumType { using Type = float; }; #endif #endif diff --git a/src/device/symmetric/reduce_scatter.cuh b/src/device/symmetric/reduce_scatter.cuh index 4fd96093e..8f79b3990 100644 --- a/src/device/symmetric/reduce_scatter.cuh +++ b/src/device/symmetric/reduce_scatter.cuh @@ -1,35 +1,36 @@ -#include "symmetric.h" -#include "symmetric/kernel.cuh" -#include "symmetric/primitives.cuh" +#include "sym_kernels.h" +#include "kernel.cuh" +#include "primitives.cuh" template static __device__ void reduceDeep( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - Red red, char* inputRank0, char* outputHere, int32_t nIters + ncclSymkArgsHandler const& handler, int tn, int t, + bool waitNeeded, ncclLsaBarrierSession& bar, + Red red, ncclSymPtr input, ncclSymPtr output, int32_t nIters ) { using Pack = BytePack; using Acc = typename Red::EltType; using AccPack = BytePack; + ncclTeam world = ncclTeamWorld(handler.comm); int wn = tn/WARP_SIZE; int w = t/WARP_SIZE; int lane = t%WARP_SIZE; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - Pack* inpRank0 = (Pack*)inputRank0 + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; - Pack* outHere = (Pack*)outputHere + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + ncclSymPtr inpPacks = (ncclSymPtr)input + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; + ncclSymPtr outPacks = (ncclSymPtr)output + intptr_t(w)*UnrollPacks*WARP_SIZE + lane; Pack acc0[UnrollPacks]; nIters -= w; if (0 < nIters) { #pragma unroll for (int u=0; u < UnrollPacks; u++) { - acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE]; } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); if (0 < nIters) { while (true) { @@ -39,7 +40,7 @@ static __device__ void reduceDeep( { Pack tmp1[UnrollPacks]; #pragma unroll for (int u=0; u < UnrollPacks; u++) { - tmp1[u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + tmp1[u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE]; } #pragma unroll for (int u=0; u < UnrollPacks; u++) { @@ -65,7 +66,7 @@ static __device__ void reduceDeep( if (partial && ur!=0 && dr+ur == nRanks) break; #pragma unroll UnrollPacks for (int u=0; u < UnrollPacks; u++) { - tmp1[ur][u] = add4G(inpRank0, r*stride4G)[u*WARP_SIZE]; + tmp1[ur][u] = inpPacks.peerPtr(world, r)[u*WARP_SIZE]; } r += 1; if (r == nRanks) r = 0; @@ -85,17 +86,17 @@ static __device__ void reduceDeep( for (int u=0; u < UnrollPacks; u++) acc0[u] = applyCast(acc1[u]); #pragma unroll UnrollPacks - for (int u=0; u < UnrollPacks; u++) outHere[u*WARP_SIZE] = acc0[u]; + for (int u=0; u < UnrollPacks; u++) outPacks.localPtr()[u*WARP_SIZE] = acc0[u]; - inpRank0 += intptr_t(wn)*UnrollPacks*WARP_SIZE; - outHere += intptr_t(wn)*UnrollPacks*WARP_SIZE; + inpPacks += intptr_t(wn)*UnrollPacks*WARP_SIZE; + outPacks += 
intptr_t(wn)*UnrollPacks*WARP_SIZE; nIters -= wn; if (nIters <= 0) break; // Load data for next iteration. #pragma unroll for (int u=0; u < UnrollPacks; u++) { - acc0[u] = add4G(inpRank0, rank*stride4G)[u*WARP_SIZE]; + acc0[u] = inpPacks.peerPtr(world, rank)[u*WARP_SIZE]; } } } @@ -103,20 +104,22 @@ static __device__ void reduceDeep( template static __device__ void reduceEnds( - ncclSymPrims& prim, int tn, int t, Red red, - T* inputRank0, T* outputHere, size_t nElts, uint32_t nPreElts, size_t nSufElts + ncclSymkArgsHandler const& handler, int tn, int t, Red red, + ncclSymPtr input, ncclSymPtr output, + size_t nElts, uint32_t nPreElts, size_t nSufElts ) { using Acc = typename Red::EltType; - int const& rank = prim.rank; - int const& nRanks = prim.nRanks; - uint32_t const& stride4G = prim.stride4G; - BytePack* inpRank0 = (BytePack*)inputRank0; - BytePack* outHere = (BytePack*)outputHere; + ncclTeam world = ncclTeamWorld(handler.comm); + int const& rank = handler.comm.rank; + int const& nRanks = handler.comm.nRanks; + + ncclSymPtr> inpPacks = (ncclSymPtr>)input; + ncclSymPtr> outPacks = (ncclSymPtr>)output; #pragma unroll 1 for (size_t i = t; i < nPreElts+nSufElts; i += tn) { size_t elt = i < nPreElts ? i : nElts-nSufElts-nPreElts+i; - BytePack acc0 = *add4G(inpRank0+elt, rank*stride4G); + BytePack acc0 = inpPacks.peerPtr(world, rank)[elt]; BytePack acc1; BytePack tmp[UnrollPeers]; int dr = 1; @@ -135,7 +138,7 @@ static __device__ void reduceEnds( #pragma unroll for (int u=0; u < UnrollPeers-partial; u++) { if (partial && u!=0 && dr+u == nRanks) break; - tmp[u] = *add4G(inpRank0+elt, r*stride4G); + tmp[u] = inpPacks.peerPtr(world, r)[elt]; r += 1; if (r == nRanks) r = 0; } @@ -152,26 +155,25 @@ static __device__ void reduceEnds( } acc0 = applyCast(acc1); - outHere[elt] = acc0; + outPacks.localPtr()[elt] = acc0; } } template static __device__ void reduce( - ncclSymPrims& prim, int tn, int t, bool waitNeeded, - Red red, T* input, T* output, size_t nElts + ncclSymkArgsHandler const& handler, int tn, int t, int nBlocks, + bool waitNeeded, ncclLsaBarrierSession& bar, + Red red, ncclSymPtr input, ncclSymPtr output, size_t nElts ) { - int nRanks = prim.nRanks; - int nBlocks = prim.nBlocks; - // Mpve input to rank=0 - input = prim.peerPtr(0, input); + int const& nRanks = handler.comm.nRanks; + int const& nRanks_rcp32 = handler.nRanks_rcp32; + uint32_t nBlocks_rcp32 = nccl::utility::idivRcp32_upto64(nBlocks); + uint32_t nRanks_nBlocks_rcp32 = nccl::utility::imulRcp32(nRanks, nRanks_rcp32, nBlocks, nBlocks_rcp32); - uintptr_t inputUptr = reinterpret_cast(input); - uintptr_t outputUptr = reinterpret_cast(output); - uint32_t alignment = uint32_t(inputUptr - outputUptr); + uint32_t alignment = uint32_t(input.offset - output.offset); size_t nBytes = nElts*sizeof(T); - uint32_t nPreBytes = (16u - inputUptr)%16u; + uint32_t nPreBytes = (16u - input.offset)%16u; nPreBytes = min((size_t)nPreBytes, nBytes); uintptr_t cursor = nPreBytes; @@ -181,12 +183,12 @@ static __device__ void reduce( constexpr int BytePerPack = 16, UnrollPacks = 4, UnrollPeers = 2; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; reduceDeep( - prim, tn, t, waitNeeded, red, - (char*)input + cursor, (char*)output + cursor, + handler, tn, 
t, waitNeeded, bar, red, + (ncclSymPtr)input + cursor, (ncclSymPtr)output + cursor, chunks*MinWarpPerBlock ); cursor = cursorAfter; @@ -198,12 +200,12 @@ static __device__ void reduce( constexpr int BytePerPack = 4, UnrollPacks = 4, UnrollPeers = 4; constexpr int BytePerChunk = MinWarpPerBlock*UnrollPacks*WARP_SIZE*BytePerPack; uint32_t chunks = (nBytes-cursor)/BytePerChunk; - chunks -= imodFast32(chunks, nRanks*nBlocks, prim.nRanks_nBlocks_rcp32); + chunks -= imodFast32(chunks, nRanks*nBlocks, nRanks_nBlocks_rcp32); if (chunks != 0) { uintptr_t cursorAfter = cursor + uintptr_t(chunks)*BytePerChunk; reduceDeep<(sizeof(T) <= BytePerPack ? BytePerPack : 0), UnrollPacks, UnrollPeers, T>( - prim, tn, t, waitNeeded, red, - (char*)input + cursor, (char*)output + cursor, + handler, tn, t, waitNeeded, bar, red, + (ncclSymPtr)input + cursor, (ncclSymPtr)output + cursor, chunks*MinWarpPerBlock ); cursor = cursorAfter; @@ -211,42 +213,47 @@ static __device__ void reduce( } } - if (waitNeeded) prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + if (waitNeeded) bar.wait(ncclCoopCta(), cuda::memory_order_relaxed); constexpr int UnrollPeers = 8; size_t nSufElts = (nBytes-cursor)/sizeof(T); - reduceEnds(prim, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); + reduceEnds(handler, tn, t, red, input, output, nElts, nPreBytes/sizeof(T), nSufElts); } - template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LD(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier); - Red::Type> red(args->redOpArg); - - // Round robin warps over blocks. - int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int tn = prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - //prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - reduce(prim, tn, t, /*waitNeeded=*/true, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x + }; + Red::Type> red(handler.devWork->redOpArg); + int const& rank = handler.comm.rank; + + bar.arrive(ncclCoopCta(), cuda::memory_order_relaxed); + + bool waitNeeded = true; + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Round robin warps over blocks. + int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = nBlocks*blockDim.x; + + reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nElts, output, nElts); + + waitNeeded = false; + } + ); - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); } - template static __device__ void reduceMultimem( - ncclSymPrims& prim, int tn, int t, Red red, T* input, T* output, size_t nElts + int tn, int t, Red red, T* input, T* output, size_t nElts ) { - // Mpve input to multimem - input = prim.multimemPtr(input); - uintptr_t inputUptr = reinterpret_cast(input); uintptr_t outputUptr = reinterpret_cast(output); size_t nBytes = nElts*sizeof(T); @@ -291,41 +298,52 @@ static __device__ void reduceMultimem( uintptr_t cursor = i < nPreBytes ? 
i : nBytes-nSufBytes+(i-nPreBytes); BytePack val = applyLoadMultimem(red, inputUptr + cursor); *reinterpret_cast*>(outputUptr + cursor) = val; - cursor += tn*sizeof(T); } } template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LDMC(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseBarrier|ncclSymPrims_UseMultimem); - Red::Type> red(args->redOpArg); - - // Round robin warps over blocks. - int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, - prim.block, prim.nBlocks, - threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); - int tn = prim.nBlocks*blockDim.x; - - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); - - reduceMultimem(prim, tn, t, red, (T*)args->input + prim.rank*args->nElts, (T*)args->output, args->nElts); +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLsaBarrierSession bar{ + ncclCoopCta(), handler.comm, ncclTeamTagLsa(), blockIdx.x, /*multimem=*/true + }; + Red::Type> red(handler.devWork->redOpArg); + + int const& rank = handler.comm.rank; + auto const& multimem = handler.comm.lsaMultimem; + + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + + handler.forEachWork( + [&]__device__(int block, int nBlocks, size_t nElts, size_t nAllElts, + ncclSymPtr input, ncclSymPtr output) { + // Round robin warps over blocks. + int t = flattenIx(threadIdx.x%WARP_SIZE, WARP_SIZE, + block, nBlocks, + threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); + int tn = nBlocks*blockDim.x; + + reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nElts, output.localPtr(), nElts); + } + ); - prim.barrierArrive(ncclCoopCta(), /*release=*/false); - prim.barrierWait(ncclCoopCta(), /*acquire=*/false); + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); } // T is user type, EltType is the most aligned type template -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body( - ncclSymPrims &prim, Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) { +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL_body( + ncclSymkArgsHandler& handler, ncclLLA2ASession& lla2a, + Red red, EltType* input, EltType* output, int nElts, int nPacks, int nStrideElts) { using Pack = BytePack<8>; + using Acc = typename Red::EltType; + using AccPack = BytePack<8*sizeof(Acc)/sizeof(T)>; constexpr int EltPerPack = 8/sizeof(EltType); - int nRanks = prim.nRanks; - int rank = prim.rank; + int const& nRanks = handler.comm.nRanks; + int const& rank = handler.comm.rank; int t = threadIdx.x; - int tn = ncclSymMaxThreads; + constexpr int tn = ncclSymkMaxThreads; ncclCoopCta cta; #pragma unroll 1 @@ -339,17 +357,25 @@ __device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body( #pragma unroll 1 for (int i = t; i < nRanks*nIterPacks; i += tn) { Pack got = loadPack(input + peer*nStrideElts, pack*EltPerPack, nElts); - prim.sendLL(peer, rank*nIterPacks + pack, got); + lla2a.send(peer, rank*nIterPacks + pack, got); peer += tn_div_nPacks; pack += tn_mod_nPacks; if (nIterPacks <= pack) { peer += 1; pack -= nIterPacks; } } if (t < nIterPacks) { - Pack got = prim.template recvReduceLL(t, nIterPacks, red); - storePack(output, t*EltPerPack, nElts, got); + AccPack got = lla2a.template recvReduce( + /*slotStart=*/t, /*slotCount=*/nRanks, /*slotStride=*/nIterPacks, + /*eltToAcc=*/[&] __device__ (Pack x)->AccPack { + return applyCast(x); + }, + /*reduce=*/[&] __device__ (AccPack a, AccPack 
b)->AccPack { + return applyReduce(red, a, b); + } + ); + storePack(output, t*EltPerPack, nElts, applyCast(got)); } - prim.endLL(cta); + lla2a.endEpoch(cta); input += tn*EltPerPack; output += tn*EltPerPack; @@ -357,31 +383,34 @@ __device__ __forceinline__ void ncclSymRun_ReduceScatter_LL_body( nPacks -= tn; } } -template typename Red, typename T> -__device__ __forceinline__ void ncclSymRun_ReduceScatter_LL(ncclSymDevArgs const* args) { - ncclSymPrims prim(args->comm, ncclSymPrims_UseLL); - Red::Type> red(args->redOpArg); +template typename Red, typename T> +__device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(ncclSymkDevWorkArgs const* args) { + ncclSymkArgsHandler handler{args}; + ncclLLA2ASession lla2a( + ncclCoopCta(), handler.comm, ncclTeamLsa(handler.comm), handler.lsaLLA2A, blockIdx.x, ncclSymkMaxThreads + ); + Red::Type> red(handler.devWork->redOpArg); using Pack = BytePack<8>; constexpr int EltPerPack = 8/sizeof(T); - int nAllElts = args->nElts; - int nAllPacks = divUp(nAllElts, EltPerPack); - uint32_t nPackPerBlock, nPackModBlock; - idivmodFast32(&nPackPerBlock, &nPackModBlock, nAllPacks, prim.nBlocks, prim.nBlocks_rcp32); - int blockPackBegin = prim.block*nPackPerBlock + minval(prim.block, nPackModBlock); - int blockPackEnd = blockPackBegin + nPackPerBlock + (prim.block < nPackModBlock ? 1 : 0); - int nPacks = blockPackEnd - blockPackBegin; - int nElts = nAllElts - blockPackBegin*EltPerPack; - nElts = min(nElts, nPacks*EltPerPack); - T* input = (T*)args->input + blockPackBegin*EltPerPack; - T* output = (T*)args->output + blockPackBegin*EltPerPack; - - uint32_t lowBits = args->nElts*sizeof(T); - lowBits |= (uint32_t)reinterpret_cast(args->input); - lowBits |= (uint32_t)reinterpret_cast(args->output); - if (__builtin_expect(lowBits%8 == 0, true)) { - ncclSymRun_ReduceScatter_LL_body(prim, red, (Pack*)input, (Pack*)output, nPacks, nPacks, nAllElts/EltPerPack); - } else { - ncclSymRun_ReduceScatter_LL_body(prim, red, input, output, nElts, nPacks, nAllElts); - } + + handler.singleWork( + [&]__device__(int nElts, int nAllElts, + ncclSymPtr inputPtr, ncclSymPtr outputPtr) { + int nPacks = divUp(nElts, EltPerPack); + + T* input = (T*)inputPtr.localPtr(); + T* output = (T*)outputPtr.localPtr(); + + uint32_t lowBits = nElts*sizeof(T); + lowBits |= (uintptr_t)input; + lowBits |= (uintptr_t)output; + if (__builtin_expect(lowBits%8 == 0, true)) { + ncclSymkRun_ReduceScatter_LL_body(handler, lla2a, red, (Pack*)input, (Pack*)output, + nPacks, nPacks, divUp(nAllElts, EltPerPack)); + } else { + ncclSymkRun_ReduceScatter_LL_body(handler, lla2a, red, input, output, nElts, nPacks, nAllElts); + } + } + ); } diff --git a/src/enqueue.cc b/src/enqueue.cc index 225a4cffc..00a0ef8da 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -14,6 +14,9 @@ #include "profiler.h" #include "transport.h" #include "register_inline.h" +#include "ce_coll.h" +#include "nvtx.h" +#include "scheduler.h" #include // std::memcpy #include // PRIx64 @@ -30,8 +33,8 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); for (int sym=0; sym <= 1; sym++) { - int kcount = sym==0 ? ncclDevKernelCount : ncclSymKernelCount; - void* const* kptrs = sym==0 ? ncclDevKernelList : ncclSymKernelList; + int kcount = sym==0 ? ncclDevKernelCount : ncclSymkKernelCount; + void* const* kptrs = sym==0 ? 
ncclDevKernelList : ncclSymkKernelList; for (int k=0; k < kcount; k++) { void* fn = kptrs[k]; cudaFuncAttributes attr = {0}; @@ -164,6 +167,7 @@ static void finishPlan(struct ncclComm* comm, struct ncclKernelPlan* plan) { size_t workBytes = plan->workBytes; size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); + if (plan->isSymColl) return; plan->threadPerBlock = std::max(plan->threadPerBlock, NCCL_MIN_NTHREADS); // If we can fit everything into the kernel args we do so. @@ -263,7 +267,6 @@ static bool testBudget( ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { struct ncclKernelPlanner* planner = &comm->planner; - if (planner->isSymColl) return ncclSuccess; struct ncclTaskColl *task; task = ncclIntruQueueHead(&planner->collTaskQueue); while (task != nullptr) { @@ -328,6 +331,7 @@ ncclResult_t ncclTasksRegAndEnqueue(struct ncclComm* comm) { ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool* needConnect, ncclSimInfo_t* simInfo) { struct ncclKernelPlanner* planner = &comm->planner; planner->persistent = ncclCudaGraphValid(planner->capturingGraph); + // Tasks from the sorter come out ordered size descending. struct ncclTaskColl* task = ncclTaskCollSorterDequeueAll(&planner->collSorter); // Tasks are assembled by (fn,op,ty) size ascending. @@ -336,36 +340,8 @@ ncclResult_t ncclPrepareTasks(struct ncclComm* comm, bool* algoNeedConnect, bool int fnOpTyIndices[ncclNumFuncs*ncclNumDevRedOps*ncclNumTypes]; int fnOpTyCount = 0; - if (comm->nNodes == 1 && planner->nTasksColl == 1 && planner->nTasksP2p == 0) { - void* sendSymPtr; - void* recvSymPtr; - struct ncclReg* sendReg; - struct ncclReg* recvReg; - size_t size = task->count*ncclTypeSize(task->datatype); - NCCLCHECK(ncclRegFindSymmetric(comm, task->sendbuff, size, &sendSymPtr, &sendReg)); - NCCLCHECK(ncclRegFindSymmetric(comm, task->recvbuff, size, &recvSymPtr, &recvReg)); - bool implemented = ncclSymImplemented(task->func, task->opDev.op, task->datatype); - - if (sendReg && recvReg && (sendReg->winFlags & recvReg->winFlags & NCCL_WIN_COLL_SYMMETRIC) && implemented) { - enum ncclSymKernelId kernel; - int nChannels, nWarps; - float estTimeUs = 1.e18; - NCCLCHECK(ncclSymPickKernel(comm, task->func, task->opDev.op, task->datatype, task->count, &estTimeUs, &kernel, &nChannels, &nWarps)); - - // We should only use symmetric kernel if it beats the asymmetric kernel. But the - // perf model accuracy from asymmetric kernels is too inaccurate and reports too high - // of a bandwidth. For now just always use symmetric if available. - if (kernel != ncclSymKernelId_Count) { - task->sendbuff = sendSymPtr; - task->recvbuff = recvSymPtr; - task->devFuncId = (int)kernel; - task->nMaxChannels = nChannels; - task->nWarps = nWarps; - ncclIntruQueueEnqueue(&planner->collTaskQueue, task); - planner->isSymColl = true; - return ncclSuccess; - } - } + if (comm->symmetricSupport) { + NCCLCHECK(ncclMakeSymmetricTaskList(comm, task, &planner->collSymTaskQueue, &task)); } // Walk the size sorted tasks, binning them by (fn,op,ty). 
@@ -532,7 +508,7 @@ static ncclResult_t scheduleCollTasksToPlan( size_t trafficBytes[2*2] = {0, 0, 0, 0}; // [collnet][nvls] int nChannels[2*2] = {0, 0, 0, 0}; // [collnet][nvls] int const nMaxChannels[2*2] = {comm->nChannels, comm->nvlsChannels, // [collnet][nvls] - comm->nChannels, comm->nvlsChannels}; + comm->nChannels, std::min(comm->nChannels, comm->nvlsChannels)}; constexpr size_t MinTrafficPerChannel = 16 << 10; // 16K traffic as minimal do { size_t workBytes = 0; @@ -725,6 +701,7 @@ static ncclResult_t scheduleCollTasksToPlan( } proxyOp->eActivationMask = task->eActivationMask; proxyOp->incWorkCounter = true; + proxyOp->nChannels = nChannels; addWorkBatchToPlan(comm, plan, c, workNode->workType, task->devFuncId, plan->workBytes); // Coverity reports "proxyOp->connection" as being possibly uninitialized. It's hard to // determine if that's actually true but it's also not clear if that would be an issue. @@ -740,6 +717,8 @@ static ncclResult_t scheduleCollTasksToPlan( plan->kernelFn = ncclDevKernelForFunc[task->devFuncId]; plan->kernelSpecialized = ncclDevKernelForFuncIsSpecialized[task->devFuncId]; } + // Profiler + plan->groupApiEventHandle = task->groupApiEventHandle; if (comm->rank == 0) { INFO(NCCL_TUNING, "%s: %ld Bytes -> Algo %s proto %s channel{Lo..Hi}={%d..%d}", @@ -792,8 +771,9 @@ static ncclResult_t addP2pToPlan( int nChannelsMin, int nChannelsMax, int p2pRound, int sendRank, void* sendAddr, ssize_t sendBytes, int recvRank, void* recvAddr, ssize_t recvBytes, - struct ncclTaskP2p** p2pTasks + const int planTotalTasks[], struct ncclTaskP2p** p2pTasks ) { + ncclResult_t ret = ncclSuccess; constexpr int connIndex = 1; bool selfSend = (sendRank == comm->rank); // recv: dir=0, send: dir=1 @@ -804,6 +784,8 @@ static ncclResult_t addP2pToPlan( bool proxySameProcess[2] = {true, true}; void** handles[2] = {NULL, NULL}; uint8_t base = ncclP2pChannelBaseForRound(comm, p2pRound); + struct ncclProxyOp proxyOps[2] = {}; + int nProxyOps = selfSend ? 
0 : 2; if (!selfSend) { for (int part=0; part < nChannelsMax; part++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); @@ -857,7 +839,7 @@ static ncclResult_t addP2pToPlan( bool pxnUsed = !ncclPxnDisable(comm) && comm->isAllNvlink && comm->maxLocalRanks > 1; if (bytes[dir] > 0 && proxySameProcess[dir] && protocol[dir] == NCCL_PROTO_SIMPLE && (!pxnUsed)) { int regFlag = 0; - NCCLCHECK(ncclCalloc(&handles[dir], nChannelsMax)); + NCCLCHECKGOTO(ncclCalloc(&handles[dir], nChannelsMax), ret, cleanup); for (int part = 0; part < nChannelsMax; part++) { int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); struct ncclChannelPeer** channelPeers = comm->channels[channelId].peers; @@ -880,7 +862,7 @@ static ncclResult_t addP2pToPlan( void* regAddr = NULL; if (conn->conn.flags & (NCCL_P2P_WRITE | NCCL_P2P_READ)) { // We require users registering buffers on both sides - NCCLCHECK(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, ®Flag, ®Addr, &plan->cleanupQueue)); + NCCLCHECKGOTO(ncclRegisterP2pIpcBuffer(comm, addrs[dir], bytes[dir], peerRank, ®Flag, ®Addr, &plan->cleanupQueue), ret, cleanup); if (regFlag) { if (dir == 0 && (conn->conn.flags & NCCL_P2P_WRITE)) recvAddr = regAddr; else if (dir == 1 && (conn->conn.flags & NCCL_P2P_READ)) sendAddr = regAddr; @@ -905,14 +887,17 @@ static ncclResult_t addP2pToPlan( if (p2pTasks[dir]) p2pTasks[dir]->nChannels = nChannels[dir]; } - struct ncclWorkList* workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); + struct ncclWorkList* workNode; + workNode = ncclMemoryStackAllocInlineArray(&comm->memScoped, 1); workNode->workType = ncclDevWorkTypeP2p; workNode->size = sizeof(struct ncclDevWorkP2p); ncclIntruQueueEnqueue(&plan->workQueue, workNode); - uint32_t workOffset = plan->workBytes; + uint32_t workOffset; + workOffset = plan->workBytes; plan->workBytes += sizeof(struct ncclDevWorkP2p); - struct ncclDevWorkP2p* work = (struct ncclDevWorkP2p*)(workNode+1); + struct ncclDevWorkP2p* work; + work = (struct ncclDevWorkP2p*)(workNode+1); work->nP2pChannels = comm->p2pnChannels; work->channelBase = base; work->nSendChannels = nChannels[1]; @@ -933,8 +918,6 @@ static ncclResult_t addP2pToPlan( work->recvBytes = recvBytes==-1 ? 0 : recvBytes; work->profilerEnabled = ncclProfilerPluginLoaded() && ((p2pTasks[0] ? p2pTasks[0] : p2pTasks[1])->eActivationMask & ncclProfileKernelCh); - struct ncclProxyOp proxyOps[2] = {}; - int nProxyOps = selfSend ? 0 : 2; for (int dir=0; dir < nProxyOps; dir++) { struct ncclProxyOp* op = &proxyOps[dir]; op->root = dir ? sendRank : recvRank; @@ -947,6 +930,7 @@ static ncclResult_t addP2pToPlan( op->chunkSize = chunkSize[dir]; op->reg = netRegistered[dir]; op->coll = p2pTasks[dir] ? p2pTasks[dir]->func : 0; + op->collAPI = p2pTasks[dir] ? p2pTasks[dir]->collAPI : 0; op->task.p2p = p2pTasks[dir]; op->rank = comm->rank; op->eActivationMask = p2pTasks[dir] ? p2pTasks[dir]->eActivationMask : 0; @@ -955,6 +939,15 @@ static ncclResult_t addP2pToPlan( } nChannelsMax = std::max(nChannels[0], nChannels[1]); + // Determine how many peers this plan will target concurrently. Make a + // simplifying assumption that each task targets a different peer. + // Each task is striped across 'nChannelsMax' of 'p2pnChannels' channels. + // Each channel runs up to NCCL_MAX_DEV_WORK_P2P_PER_BATCH tasks concurrently. 
+ int maxConcurrent; + int concurrentTasks[2]; + maxConcurrent = comm->p2pnChannels / nChannelsMax * NCCL_MAX_DEV_WORK_P2P_PER_BATCH; + concurrentTasks[0] = std::min(planTotalTasks[0], maxConcurrent); + concurrentTasks[1] = std::min(planTotalTasks[1], maxConcurrent); for (int part=0; part < nChannelsMax; part++) { int incWorkCounter = -1; int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, part); @@ -1003,13 +996,17 @@ static ncclResult_t addP2pToPlan( // equal one plus the batch index this p2p settled in. proxyOps[dir].channelId = channelId; proxyOps[dir].opCount = uint64_t(comm->planner.wipPlan.channels[channelId].nWorkBatchesP2p)<<1 | 1; - NCCLCHECK(addProxyOpIfNeeded(comm, plan, &proxyOps[dir])); - NCCLCHECK(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir])); + proxyOps[dir].nChannels = nChannels[dir]; + proxyOps[dir].nPeers = concurrentTasks[dir]; + NCCLCHECKGOTO(addProxyOpIfNeeded(comm, plan, &proxyOps[dir]), ret, cleanup); + NCCLCHECKGOTO(addProfilerProxyOpIfNeeded(comm, plan, &proxyOps[dir]), ret, cleanup); } } } - - return ncclSuccess; +cleanup: + free(handles[0]); + free(handles[1]); + return ret; } static int calcP2pChannelCount(size_t totalSize, int minChannels, int maxChannels, size_t minSize, size_t maxSize) { @@ -1041,6 +1038,8 @@ static ncclResult_t scheduleP2pTasksToPlan( // Try to use all channels, but one channel per operation. while (nChannelsMin*nRanks > comm->p2pnChannels && nChannelsMin > 1) nChannelsMin /= 2; + // Save the total count of send/recv tasks in the plan + int planTotalTasks[2] = {comm->planner.nTasksP2pRecv, comm->planner.nTasksP2pSend}; while (comm->planner.nTasksP2p != 0) { for (int round=0; round < nRanks; round++) { int sendRank = comm->p2pSchedule[round].sendRank; @@ -1071,22 +1070,30 @@ static ncclResult_t scheduleP2pTasksToPlan( ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, send); ncclMemoryPoolFree(&comm->memPool_ncclTaskP2p, recv); comm->planner.nTasksP2p -= 2; + comm->planner.nTasksP2pSend -= 1; + comm->planner.nTasksP2pRecv -= 1; } else { // Ensure room for worst case of one new batch per channel. 
if (!testBudget(budget, plan->nWorkBatches+nChannelsMax, plan->workBytes + sizeof(struct ncclDevWorkP2p))) { return ncclSuccess; } struct ncclTaskP2p* p2pTasks[2] = { recv, send }; - NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, p2pTasks)); + NCCLCHECK(addP2pToPlan(comm, plan, nChannelsMin, nChannelsMax, round, sendRank, sendBuff, sendBytes, recvRank, recvBuff, recvBytes, planTotalTasks, p2pTasks)); if (send != nullptr) { ncclIntruQueueDequeue(&peers[sendRank].sendQueue); + // Profiler - We can overwrite groupAPI event handles here since all operations here belong to the same group + plan->groupApiEventHandle = send->groupApiEventHandle; ncclIntruQueueEnqueue(&plan->p2pTaskQueue, send); comm->planner.nTasksP2p -= 1; + comm->planner.nTasksP2pSend -= 1; } if (recv != nullptr) { ncclIntruQueueDequeue(&peers[recvRank].recvQueue); + // Profiler - We can overwrite groupAPI event handles here since all operations here belong to the same group + plan->groupApiEventHandle = recv->groupApiEventHandle; ncclIntruQueueEnqueue(&plan->p2pTaskQueue, recv); comm->planner.nTasksP2p -= 1; + comm->planner.nTasksP2pRecv -= 1; } } } @@ -1125,7 +1132,7 @@ namespace { } static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* plan) { - if (plan->isSymColl) return ncclSuccess; + if (plan->isSymColl || plan->isCeColl) return ncclSuccess; size_t workBytes = plan->workBytes; size_t batchBytes = plan->nWorkBatches*sizeof(struct ncclDevWorkBatch); @@ -1297,7 +1304,7 @@ static ncclResult_t hostStreamPlanTask(struct ncclComm* comm, struct ncclKernelP } static void CUDART_CB hostStreamPlanCallback(void *plan_) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; struct ncclKernelPlan* plan = (struct ncclKernelPlan*)plan_; ncclResult_t result = hostStreamPlanTask(plan->comm, plan); if (result != ncclSuccess) { @@ -1318,6 +1325,9 @@ static ncclResult_t reclaimPlan(struct ncclComm* comm, struct ncclCommCallback* CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); } } + if (plan->isSymColl) { + free(plan->kernelSymArgs); + } // Free coll tasks struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); while (ct != nullptr) { @@ -1394,7 +1404,9 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { planner->persistent = persistent; int nPlans = 0; - if (planner->nTasksColl + planner->nTasksP2p != 0) { + if (planner->nTasksColl + planner->nTasksP2p != 0 || + !ncclIntruQueueEmpty(&planner->collSymTaskQueue) || + !ncclIntruQueueEmpty(&planner->collCeTaskQueue)) { do { memset(&planner->wipPlan, 0, sizeof(planner->wipPlan)); @@ -1406,53 +1418,55 @@ ncclResult_t ncclLaunchPrepare(struct ncclComm* comm) { plan->workStorageType = persistent ? 
ncclDevWorkStorageTypePersistent : ncclDevWorkStorageTypeFifo; - if (planner->isSymColl) { - plan->workStorageType = ncclDevWorkStorageTypeArgs; - - struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collTaskQueue); - plan->isSymColl = true; - plan->kernelFn = ncclSymGetKernelPtr((ncclSymKernelId)task->devFuncId, task->opDev.op, task->datatype); - plan->threadPerBlock = task->nWarps*WARP_SIZE; - plan->channelMask = uint64_t(-1) >> (64-task->nMaxChannels); - - plan->kernelArgsSize = sizeof(struct ncclSymDevArgs); - plan->kernelSymArgs = ncclMemoryStackAlloc(&comm->memScoped); - plan->kernelSymArgs->comm = comm->symDevComm; - plan->kernelSymArgs->rootRank = task->root; - plan->kernelSymArgs->redOpArg = task->opDev.scalarArg; - plan->kernelSymArgs->nElts = task->count; - plan->kernelSymArgs->input = (char*)task->sendbuff; - plan->kernelSymArgs->output = (char*)task->recvbuff; - - planner->nTasksColl -= 1; + if (!ncclIntruQueueEmpty(&planner->collCeTaskQueue)) { + struct ncclTaskColl* task = ncclIntruQueueHead(&planner->collCeTaskQueue); + plan->isCeColl = true; + plan->ceCollArgs = ncclMemoryStackAlloc(&comm->memScoped); + plan->ceCollArgs->rootRank = task->root; + plan->ceCollArgs->nElts = task->count; + plan->ceCollArgs->eltSize = ncclTypeSize(task->datatype); + plan->ceCollArgs->sendBuff = (uint8_t*)task->sendbuff; + plan->ceCollArgs->recvBuff = (uint8_t*)task->recvbuff; + plan->ceCollArgs->func = task->func; + plan->ceCollArgs->sendWin = task->sendWin; + plan->ceCollArgs->recvWin = task->recvWin; + ncclIntruQueueEnqueue(&planner->planQueue, plan); - INFO(NCCL_TUNING, "%s [Symmetric]: %ld Bytes -> Kernel %s nchannels %d nthreads %d", - ncclFuncToString(task->func), task->count * ncclTypeSize(task->datatype), ncclSymKernelIdToString(task->devFuncId), task->nMaxChannels, plan->threadPerBlock); + ncclIntruQueueDequeue(&planner->collCeTaskQueue); + ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, task); nPlans += 1; } else { - struct ncclKernelPlanBudget budget; - budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); - // Non-persistent kernels fill up at most half of our fifo per kernel. - budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; - - // Drain coll tasks first. This is essential since we partition tasks based - // on the work budget and p2p work isn't collective. If we were to drain p2p - // first, the place where we cut the kernel could vary by rank which would - // cause the "shortest channel first" channel picker to have divergent results. - if (planner->nTasksColl != 0) { - NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); + if (!ncclIntruQueueEmpty(&planner->collSymTaskQueue)) { + NCCLCHECKGOTO(ncclSymmetricTaskScheduler(comm, &planner->collSymTaskQueue, plan), result, failure); } - // And only drain p2p tasks once colls are depleted. - if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { - NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); + else { + struct ncclKernelPlanBudget budget; + budget.inArgsBytes = comm->workArgsBytes - sizeof(struct ncclDevKernelArgs); + // Non-persistent kernels fill up at most half of our fifo per kernel. + budget.outArgsBytes = plan->persistent ? (1<<30) : comm->workFifoBytes/2; + + // Drain coll tasks first. This is essential since we partition tasks based + // on the work budget and p2p work isn't collective. 
If we were to drain p2p + // first, the place where we cut the kernel could vary by rank which would + // cause the "shortest channel first" channel picker to have divergent results. + if (planner->nTasksColl != 0) { + NCCLCHECKGOTO(scheduleCollTasksToPlan(comm, plan, &budget), result, failure); + } + // And only drain p2p tasks once colls are depleted. + if (planner->nTasksColl == 0 && planner->nTasksP2p != 0) { + NCCLCHECKGOTO(scheduleP2pTasksToPlan(comm, plan, &budget), result, failure); + } } + finishPlan(comm, plan); if (plan->workBytes != 0) { ncclIntruQueueEnqueue(&planner->planQueue, plan); nPlans += 1; } } - } while (planner->nTasksColl + planner->nTasksP2p != 0); + } while (planner->nTasksColl + planner->nTasksP2p != 0 || + !ncclIntruQueueEmpty(&planner->collSymTaskQueue) || + !ncclIntruQueueEmpty(&planner->collCeTaskQueue)); struct ncclKernelPlan* planHead = ncclIntruQueueHead(&planner->planQueue); planner->unlaunchedPlansHead = planHead; @@ -1531,8 +1545,6 @@ ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, stru NCCL_PARAM(MemSyncDomain, "MEM_SYNC_DOMAIN", cudaLaunchMemSyncDomainRemote); #endif -NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0); - ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan) { ncclResult_t ret = ncclSuccess; struct ncclKernelPlanner* planner = &comm->planner; @@ -1542,6 +1554,9 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan dim3 block = {(unsigned)plan->threadPerBlock, 1, 1}; int smem = ncclShmemDynamicSize(comm->cudaArch); cudaStream_t launchStream = planner->streams->stream; + + NCCLCHECK(ncclProfilerStartKernelLaunchEvent(plan, launchStream)); + void* extra[] = { CU_LAUNCH_PARAM_BUFFER_POINTER, plan->kernelArgs, CU_LAUNCH_PARAM_BUFFER_SIZE, &plan->kernelArgsSize, @@ -1588,25 +1603,24 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan } #endif #if CUDART_VERSION >= 12030 - bool capturing = ncclCudaGraphValid(planner->capturingGraph); enum ncclImplicitOrder implicitOrder; - NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, capturing, driverVersion), ret, do_return); + NCCLCHECKGOTO(getImplicitOrder(&implicitOrder, plan->persistent, driverVersion), ret, do_return); if (implicitOrder == ncclImplicitOrderLaunch) { launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT; launchAttrs[attrs].value.launchCompletionEvent.event = comm->sharedRes->launchEvent; launchAttrs[attrs].value.launchCompletionEvent.flags = 0; attrs++; } - if (comm->planner.isSymColl && compCap >= 90 && driverVersion >= 12030) { + if (plan->isSymColl && compCap >= 90 && driverVersion >= 12030) { launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION; launchAttrs[attrs].value.programmaticStreamSerializationAllowed = 1; attrs++; } #endif #if CUDART_VERSION >= 13000 - if (compCap >= 90 && driverVersion >= 13000) { + if (compCap >= 100 && driverVersion >= 13000) { launchAttrs[attrs].id = CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING; - launchAttrs[attrs].value.nvlinkUtilCentricScheduling = ncclParamNvlinkUtilCentricSchedEnable(); + launchAttrs[attrs].value.nvlinkUtilCentricScheduling = comm->config.nvlinkCentricSched; attrs++; } #endif @@ -1628,6 +1642,7 @@ ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan } do_return: + NCCLCHECK(ncclProfilerStopKernelLaunchEvent(plan)); return ret; } @@ -1765,6 +1780,8 @@ static ncclResult_t updateCollCostTable( if ((a == 
NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) && collNetSupport != 1) continue; // CollNetDirect is only supported for up to 8 local GPUs if (a == NCCL_ALGO_COLLNET_DIRECT && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue; + // Disable CollNet Chain for more than 8 local GPUs + if (a == NCCL_ALGO_COLLNET_CHAIN && comm->maxLocalRanks > NCCL_MAX_DIRECT_ARITY+1) continue; if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && (!nvlsSupport || (info->func != ncclFuncAllReduce && comm->localRanks > NCCL_MAX_NVLS_ARITY))) continue; if (a == NCCL_ALGO_NVLS && collNetSupport != 1 && comm->nNodes > 1) continue; /* Tree reduceScatter doesn't support scaling yet */ @@ -1844,7 +1861,11 @@ static ncclResult_t topoGetAlgoInfo( } } else if (info->algorithm == NCCL_ALGO_NVLS || info->algorithm == NCCL_ALGO_NVLS_TREE) { // NVLS should not need more than 16 channels to get peak BW. - nc = comm->nvlsChannels; + if (comm->nNodes > 1 && info->algorithm == NCCL_ALGO_NVLS) { + nc = std::min(comm->nvlsChannels, comm->nChannels); + } else { + nc = comm->nvlsChannels; + } } else { // Ring/Tree channel tuning while (nBytes < nc * nt * threadThreshold) { @@ -2107,6 +2128,7 @@ static ncclResult_t calcCollChunking( } proxyOp->pattern = pattern; proxyOp->coll = info->func; + proxyOp->collAPI = info->func; proxyOp->root = info->root; proxyOp->isOneRPN = comm->isOneRPN; // This is used by P2P to reduce the receive buffer size. We don't use it in collectives @@ -2170,6 +2192,35 @@ static ncclResult_t calcCollChunking( proxyOp->nbytes = DIVUP(nBytes, nChannels); } + // Set peer count hints used by network plugin + switch (proxyOp->pattern) { + case ncclPatternRing: + case ncclPatternRingTwice: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + case ncclPatternPatUp: + case ncclPatternPatDown: + proxyOp->nPeers = 1; + break; + case ncclPatternTreeUp: + case ncclPatternTreeDown: + case ncclPatternTreeUpDown: + case ncclPatternNvlsTree: + proxyOp->nPeers = (NCCL_MAX_TREE_ARITY - 1) * 2; + break; + case ncclPatternCollnetChain: + case ncclPatternCollnetDirect: + case ncclPatternNvls: + case ncclPatternProfiler: + // Peer count hints unused + break; + case ncclPatternSend: + case ncclPatternRecv: + default: + WARN("Unknown pattern %d", pattern); + return ncclInternalError; + } + *outChunkSize = proxyOp->chunkSize; return ncclSuccess; } @@ -2269,70 +2320,225 @@ static ncclResult_t hostToDevRedOp( return ncclSuccess; } -// Converts `info` to a task and adds it to `comm->planner`. The exception is with -// single rank communicators, collectives are issued as `ncclMemcpyAsync`s and -// thus don't need a task. -static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { +static ncclResult_t ncclPlannerSetCapturingGraph(struct ncclComm* comm, struct ncclInfo* info) { struct ncclKernelPlanner *planner = &comm->planner; - - if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { - int peer = info->root; - ssize_t nBytes = info->count*ncclTypeSize(info->datatype); - bool isSendNotRecv = info->coll == ncclFuncSend; - - // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. 
- ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); - struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskP2p, &comm->memPermanent); - p2p->func = info->coll; - p2p->buff = (void*)info->recvbuff; - p2p->count = info->count; - p2p->datatype = info->datatype; - p2p->root = info->root; - p2p->bytes = nBytes; - p2p->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); - ncclIntruQueueEnqueue( - isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, - p2p); - planner->nTasksP2p += 1; - - // Mark channels that need pre-connect - if (comm->rank != peer) { - if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { - // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway. - (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; - int round = 0; - while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank - : comm->p2pSchedule[round].recvRank)) { - round += 1; + if (info->stream != planner->streamRecent || planner->streams == nullptr) { + planner->streamRecent = info->stream; + struct ncclCudaStreamList* l = planner->streams; + while (true) { + if (l == nullptr) { // Got to the end, this must be a new stream. + struct ncclCudaGraph graph; + NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)); + if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) { + WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); + return ncclInvalidUsage; } - uint8_t base = ncclP2pChannelBaseForRound(comm, round); - for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { - int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); - if (isSendNotRecv) { - if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector - // the send/recv connector is shared among split shared comms. We need to set hasSeen to - // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split - // shared comms together. - comm->channels[channelId].peers[peer]->send[1].hasSeen = 1; - comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector - comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1; - comm->connectRecv[peer] |= (1UL<capturingGraph = graph; // C++ struct assignment + // Add stream to list + l = ncclMemoryStackAlloc(&comm->memScoped); + l->stream = info->stream; + l->next = planner->streams; + planner->streams = l; + break; + } + if (l->stream == info->stream) + break; // Already seen stream. + l = l->next; + } + } + return ncclSuccess; +} + +static ncclResult_t p2pTaskAppend( + struct ncclComm* comm, + struct ncclInfo* info, + ncclFunc_t coll, + ncclFunc_t collAPI, + void* buff, + size_t count, + ncclDataType_t datatype, + int peer) { + struct ncclKernelPlanner *planner = &comm->planner; + + // Determine peer and basic parameters. + ssize_t nBytes = count*ncclTypeSize(datatype); + bool isSendNotRecv = coll == ncclFuncSend; + + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. + ncclGroupCommJoin(comm, ncclGroupTaskTypeCollective); + info->coll = coll; + // Set capturing graph. 
Called here so that profiler can emit a group API event with this information + NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info)); + bool isGraphCaptured = ncclCudaGraphValid(planner->capturingGraph); + NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured)); + NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop)); + + NCCLCHECK(ncclProfilerStartP2pApiEvent(info, isGraphCaptured)); + + struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskP2p, &comm->memPermanent); + p2p->func = coll; + p2p->collAPI = collAPI; + p2p->buff = buff; + p2p->count = count; + p2p->datatype = datatype; + p2p->root = peer; + p2p->bytes = nBytes; + p2p->eActivationMask = ncclProfilerApiState.eActivationMask; + p2p->groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle; + p2p->p2pApiEventHandle = ncclProfilerApiState.p2pApiEventHandle; + ncclIntruQueueEnqueue( + isSendNotRecv ? &planner->peers[peer].sendQueue : &planner->peers[peer].recvQueue, + p2p); + planner->nTasksP2p += 1; + if (isSendNotRecv) + planner->nTasksP2pSend += 1; + else + planner->nTasksP2pRecv += 1; + + // Mark channels that need pre-connect + if (comm->rank != peer) { + if (!(isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen)) { + // planner->peers[peer].send/recvSeen is private to each comm, so we need to set it anyway. + (isSendNotRecv ? planner->peers[peer].sendSeen : planner->peers[peer].recvSeen) = true; + int round = 0; + while (peer != (isSendNotRecv ? comm->p2pSchedule[round].sendRank + : comm->p2pSchedule[round].recvRank)) { + round += 1; + } + uint8_t base = ncclP2pChannelBaseForRound(comm, round); + for (int c=0; c < comm->p2pnChannelsPerPeer; c++) { + int channelId = ncclP2pChannelForPart(comm->p2pnChannels, base, c); + if (isSendNotRecv) { + if (comm->channels[channelId].peers[peer]->send[1].hasSeen == 0) { // P2P uses only 1 connector + // the send/recv connector is shared among split shared comms. We need to set hasSeen to + // 1 in order to avoid duplicate connection setup if user group sendrecv ops with split + // shared comms together. + comm->channels[channelId].peers[peer]->send[1].hasSeen = 1; + comm->channels[channelId].peers[peer]->send[1].p2pOnly = 1; + comm->connectSend[peer] |= (1UL<channels[channelId].peers[peer]->recv[1].hasSeen == 0) { // P2P uses only 1 connector + comm->channels[channelId].peers[peer]->recv[1].hasSeen = 1; + comm->channels[channelId].peers[peer]->recv[1].p2pOnly = 1; + comm->connectRecv[peer] |= (1UL<planner; + + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. + ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); + // Set capturing graph. 
Called here so that profiler can emit a group API event with this information + NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info)); + bool isGraphCaptured = ncclCudaGraphValid(planner->capturingGraph); + NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured)); + NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop)); + NCCLCHECK(ncclProfilerStartCollApiEvent(info, isGraphCaptured)); + + struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); + t->func = info->coll; + t->sendbuff = info->sendbuff; + t->recvbuff = info->recvbuff; + t->count = info->count; + t->root = info->root; + t->datatype = info->datatype; + size_t elementSize = ncclTypeSize(t->datatype); + if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast) { + t->count *= elementSize; + t->datatype = ncclInt8; + elementSize = 1; + } + t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks); + t->opHost = info->op; + t->opDev = opDev; // C++ struct assignment + t->chunkSteps = info->chunkSteps; + t->sliceSteps = info->sliceSteps; + t->eActivationMask = ncclProfilerApiState.eActivationMask; + t->groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle; + t->collApiEventHandle = ncclProfilerApiState.collApiEventHandle; + + planner->nTasksColl += 1; + ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); + + ncclProfilerStopCollApiEvent(); + return ncclSuccess; +} + +static ncclResult_t ceCollTaskAppend( + struct ncclComm* comm, + struct ncclInfo* info, + struct ncclDevrWindow* sendWin, + struct ncclDevrWindow* recvWin, + struct ncclDevRedOpFull opDev) { + struct ncclKernelPlanner *planner = &comm->planner; + + // Check if CE needs initialization + if (comm->ceColl.baseUCSymReadyPtr == NULL && ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { + struct ncclCeInitTask* ceTask; + NCCLCHECK(ncclCalloc(&ceTask, 1)); + ceTask->comm = comm; + ncclIntruQueueEnqueue(&comm->ceInitTaskQueue, ceTask); + ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); + } + + // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. + ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); + NCCLCHECK(ncclPlannerSetCapturingGraph(comm, info)); + struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); + + t->func = info->coll; + t->sendbuff = info->sendbuff; + t->recvbuff = info->recvbuff; + t->count = info->count; + t->root = info->root; + t->datatype = info->datatype; + size_t elementSize = ncclTypeSize(t->datatype); + if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast) { + t->count *= elementSize; + t->datatype = ncclInt8; + elementSize = 1; + } + t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks); + t->opHost = info->op; + t->opDev = opDev; // C++ struct assignment + t->chunkSteps = info->chunkSteps; + t->sliceSteps = info->sliceSteps; + t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); + t->sendWin = sendWin; + t->recvWin = recvWin; + + ncclIntruQueueEnqueue(&planner->collCeTaskQueue, t); + + return ncclSuccess; +} + +// Converts `info` to a task and adds it to `comm->planner`. The exception is with +// single rank communicators, collectives are issued as `ncclMemcpyAsync`s and +// thus don't need a task. 
+static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { + ncclFunc_t collAPI = info->coll; + + if (info->coll == ncclFuncSend || info->coll == ncclFuncRecv) { + NCCLCHECK(p2pTaskAppend(comm, info, info->coll, collAPI, (void*)info->recvbuff, info->count, info->datatype, info->root)); } else { // Empty collectives can be discarded. if (info->count == 0) return ncclSuccess; if (info->datatype == ncclFloat8e4m3 || info->datatype == ncclFloat8e5m2) { - if (comm->minCompCap < 90) { + if (comm->minCompCap < 90 && info->coll != ncclFuncAllGather && info->coll != ncclFuncBroadcast && info->coll != ncclFuncAlltoAll && info->coll != ncclFuncScatter && info->coll != ncclFuncGather) { WARN("FP8 reduction support begins with sm90 capable devices."); return ncclInvalidArgument; } @@ -2347,61 +2553,59 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { NCCLCHECK(ncclLaunchOneRank(info->recvbuff, info->sendbuff, info->count, opDev, info->datatype, info->stream)); return ncclSuccess; } else { - // Must be in thread local group before tasks can be alloc'd in `comm->memScoped`. - ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective); - struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); - t->func = info->coll; - t->sendbuff = info->sendbuff; - t->recvbuff = info->recvbuff; - t->count = info->count; - t->root = info->root; - t->datatype = info->datatype; - size_t elementSize = ncclTypeSize(t->datatype); - if (t->func == ncclFuncAllGather || t->func == ncclFuncBroadcast) { - t->count *= elementSize; - t->datatype = ncclInt8; - elementSize = 1; + struct ncclDevrWindow* sendWin; + struct ncclDevrWindow* recvWin; + ncclDevrFindWindow(comm, info->sendbuff, &sendWin); + ncclDevrFindWindow(comm, info->recvbuff, &recvWin); + bool ceImplemented = ncclCeImplemented(info->coll, info->op, info->datatype); + + // Append CE collective task if CE is supported and requested by user + if (comm->symmetricSupport && comm->nNodes == 1 && sendWin && recvWin && (sendWin->winFlags & recvWin->winFlags & NCCL_WIN_COLL_SYMMETRIC) && comm->config.CTAPolicy == NCCL_CTA_POLICY_ZERO && ceImplemented) { + NCCLCHECK(ceCollTaskAppend(comm, info, sendWin, recvWin, opDev)); } - t->trafficBytes = t->count*elementSize*ncclFuncTrafficPerByte(t->func, comm->nRanks); - t->opHost = info->op; - t->opDev = opDev; // C++ struct assignment - t->chunkSteps = info->chunkSteps; - t->sliceSteps = info->sliceSteps; - t->eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); - - planner->nTasksColl += 1; - ncclTaskCollSorterInsert(&planner->collSorter, t, t->trafficBytes); - } - } - - if (info->stream != planner->streamRecent || planner->streams == nullptr) { - planner->streamRecent = info->stream; - struct ncclCudaStreamList* l = planner->streams; - while (true) { - if (l == nullptr) { // Got to the end, this must be a new stream. 
- struct ncclCudaGraph graph; - NCCLCHECK(ncclCudaGetCapturingGraph(&graph, info->stream)); - if (planner->streams != nullptr && !ncclCudaGraphSame(planner->capturingGraph, graph)) { - WARN("Streams given to a communicator within a NCCL group must either be all uncaptured or all captured by the same graph."); - return ncclInvalidUsage; + // Append kernel-based collective + else { + if (info->coll == ncclFuncAlltoAll) { + for (int r=0; rnRanks; r++) { + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, (void*)((char*)info->sendbuff+r*info->count*ncclTypeSize(info->datatype)), info->count, info->datatype, r)); + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, (void*)((char*)info->recvbuff+r*info->count*ncclTypeSize(info->datatype)), info->count, info->datatype, r)); + } + } else if (info->coll == ncclFuncGather){ + size_t offset = 0; + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, (void*)info->sendbuff, info->count, info->datatype, info->root)); + if (comm->rank == info->root) { + for (int r=0; rnRanks; r++) { + void* buff = (void*)((char*)info->recvbuff + offset); + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, buff, info->count, info->datatype, r)); + offset += info->count * ncclTypeSize(info->datatype); + } + } + } else if (info->coll == ncclFuncScatter) { + size_t offset = 0; + if (comm->rank == info->root) { + for (int r = 0; r < comm->nRanks; r++) { + void* buff = (void*)((char*)info->sendbuff + offset); + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncSend, collAPI, buff, info->count, info->datatype, r)); + offset += info->count * ncclTypeSize(info->datatype); + } + } + NCCLCHECK(p2pTaskAppend(comm, info, ncclFuncRecv, collAPI, (void*)info->recvbuff, info->count, info->datatype, info->root)); + } else { + NCCLCHECK(collTaskAppend(comm, info, opDev)); } - planner->capturingGraph = graph; // C++ struct assignment - // Add stream to list - l = ncclMemoryStackAlloc(&comm->memScoped); - l->stream = info->stream; - l->next = planner->streams; - planner->streams = l; - break; } - if (l->stream == info->stream) - break; // Already seen stream. 
- l = l->next; } } + return ncclSuccess; } ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + // Profiler - If a group API event has already started, update the profilerGroupDepth so that the depth + // updates correctly for implicit ncclGroupStartInternal and ncclGroupEndInternal calls + if (ncclProfilerApiState.profilerGroupDepth > 0) { + ncclProfilerApiState.profilerGroupDepth++; + } NCCLCHECK(ncclGroupStartInternal()); ncclResult_t ret = ncclSuccess; int devOld = -1; diff --git a/src/graph/CMakeLists.txt b/src/graph/CMakeLists.txt new file mode 100644 index 000000000..1dec7cbf7 --- /dev/null +++ b/src/graph/CMakeLists.txt @@ -0,0 +1,14 @@ +# Graph sources +set(GRAPH_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/topo.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuning.cc + ${CMAKE_CURRENT_SOURCE_DIR}/xml.cc + ${CMAKE_CURRENT_SOURCE_DIR}/search.cc + ${CMAKE_CURRENT_SOURCE_DIR}/paths.cc + ${CMAKE_CURRENT_SOURCE_DIR}/connect.cc + ${CMAKE_CURRENT_SOURCE_DIR}/rings.cc + ${CMAKE_CURRENT_SOURCE_DIR}/trees.cc +) + +# Add graph sources to parent scope +set(GRAPH_SOURCES ${GRAPH_SOURCES} PARENT_SCOPE) diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 152739b0c..c5fe959ae 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -21,6 +21,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs int localRanks = comm->topo->nodes[GPU].count; int nChannels = comm->nChannels; + topoRanks->crossNicRing = graphs[NCCL_ALGO_RING]->crossNic; topoRanks->nvlsHeadNum = 0; for (int c=0; cchannels+c; @@ -232,7 +233,6 @@ static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads); sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collnetDirect.headRank, channel->collnetDirect.out, channel->collnetDirect.shift); INFO(NCCL_GRAPH, "%s", line); - channel->collnetChain.depth = comm->nRanks/comm->nNodes; } free(heads); return ncclSuccess; @@ -249,7 +249,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead if (nvlsHeads[h * comm->nNodes + comm->node] == comm->rank) headRank = h; } - for (int c=0; cnChannels; c++) { + for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.nHeads = nHeads; for (int h=0; hnvls.up[h] = comm->nRanks+1+h; @@ -301,7 +301,7 @@ static ncclResult_t connectNvls(struct ncclComm* comm, int* nvlsHeads, int nHead } // Set prev/next in all channels (NVLS compute channels work // orthogonally to NVLS search channels). 
- for (int c=0; cnChannels; c++) { + for (int c=0; cnvlsChannels; c++) { struct ncclChannel* channel = comm->channels+c; channel->nvls.treeUp = treeUp[c%2]; channel->nvls.treeDown[0] = channel->nvls.down; @@ -389,17 +389,17 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa NCCLCHECKGOTO(ncclCalloc(&treeToChild1, nNodes*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclCalloc(&nvlsHeads, nNodes*MAXCHANNELS), ret, fail); - // Alternate rings to avoid crossing rails - if (graphs[NCCL_ALGO_RING]->crossNic == 2 && (nChannels % 2) == 0) { - for (int r=0; rnRanks; r++) { - if (comm->rankToNode[r] % 2 == 1) { - // Exchange rings - for (int c=0; cringRecv+c, allTopoRanks[r]->ringRecv+(c^1)); - exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1)); - exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1)); - exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1)); - } + // Alternate rings to avoid crossing rails. + // CrossNic values could be not the same on all nodes as it depends on the number of net devs and the NVLink bandwidth. + // Therefore, it's only done if the rank obtained a solution with crossNic=2. + for (int r = 0; r < comm->nRanks; r++) { + if (allTopoRanks[r]->crossNicRing == 2 && (nChannels % 2) == 0 && (comm->rankToNode[r] % 2) == 1) { + // Exchange rings + for (int c=0; cringRecv+c, allTopoRanks[r]->ringRecv+(c^1)); + exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1)); + exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1)); + exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1)); } } } @@ -459,7 +459,14 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); } - NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail); + + for (int c = 0; c < comm->nChannels; c++) { + comm->channels[c].collnetChain.depth = comm->nRanks/comm->nNodes; + } + + if (comm->maxLocalRanks <= NCCL_MAX_DIRECT_ARITY+1) { + NCCLCHECKGOTO(connectCollNet(comm, graphs[NCCL_ALGO_COLLNET_DIRECT]), ret, fail); + } } // Use 4 compute channels per search channel to reach peak BW on <8 PPN @@ -490,9 +497,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) { comm->nvlsChannels = parent->nvlsResources->nChannels; } - if (comm->nChannels < comm->nvlsChannels) { - nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext); - } NCCLCHECKGOTO(connectNvls(comm, nvlsHeads, minHeadNum), ret, fail); #endif if (shared && comm->nChannels > parent->sharedRes->tpNChannels) { diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 82c0d9972..86d185bc0 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -375,11 +375,15 @@ ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerIn nvmlGpuFabricInfoV_t *fabricInfo1 = &info1->fabricInfo; nvmlGpuFabricInfoV_t *fabricInfo2 = &info2->fabricInfo; // A zero UUID means we don't have MNNVL fabric info - if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; + unsigned long uuid0 = 0; + unsigned long uuid1 = 0; + memcpy(&uuid0, fabricInfo2->clusterUuid, sizeof(uuid0)); + 
memcpy(&uuid1, fabricInfo2->clusterUuid + sizeof(uuid0), sizeof(uuid1)); + if ((uuid0 | uuid1) == 0) return ncclSuccess; if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { TRACE(NCCL_NET, "MNNVL matching peer 0x%lx UUID %lx.%lx cliqueId 0x%x", - info2->busId, ((long *)fabricInfo2->clusterUuid)[0], ((long *)fabricInfo2->clusterUuid)[1], fabricInfo2->cliqueId); + info2->busId, uuid0, uuid1, fabricInfo2->cliqueId); *ret = 1; } return ncclSuccess; @@ -613,7 +617,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, return ncclSuccess; } -NCCL_PARAM(PxnC2c, "PXN_C2C", 0); +NCCL_PARAM(PxnC2c, "PXN_C2C", 1); ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) { // Precompute paths between GPUs/NICs. @@ -793,7 +797,6 @@ void ncclTopoFree(struct ncclTopoSystem* system) { free(system); } -NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", -1); static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gpu index*/, int peerRank, int* nChannels) { int peer; @@ -815,8 +818,8 @@ static ncclResult_t ncclTopoGetNchannels(struct ncclComm* comm, int g /*local gp } } else { // Remote rank, use network - int nNetChannels = ncclParamNChannelsPerNetPeer(); - if (nNetChannels == -1) { + int nNetChannels = comm->config.nChannelsPerNetPeer; + if (nNetChannels == NCCL_CONFIG_UNDEF_INT) { //start from 2 channels per NIC and reduce with scale nNetChannels = 2; diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 8fdf54ea4..3a87725f1 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -8,6 +8,7 @@ #include "graph.h" #include "topo.h" #include "comm.h" +#include "nccl.h" #include "nvmlwrap.h" #include "coll_net.h" #include "transport.h" @@ -15,6 +16,7 @@ #include #include "cpuset.h" #include "bootstrap.h" +#include #define BUSID_SIZE (sizeof("0000:00:00.0")) #define BUSID_REDUCED_SIZE (sizeof("0000:00")) @@ -404,7 +406,7 @@ ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* s #define PCI_BRIDGE_DEVICE_CLASS "0x060400" -struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; +struct kvDict kvDictPciClass[] = { { PCI_BRIDGE_DEVICE_CLASS, PCI }, {"0x080100", /*CX8 data direct*/PCI}, { "0x068000", NVS }, { "0x068001", CPU }, { "0x03", GPU }, { "0x02", NIC }, { NULL, PCI /* Default fallback value */ } }; struct kvDict kvDictPciGen[] = { { "2.5 GT/s", 15 }, { "5 GT/s", 30 }, { "8 GT/s", 60 }, { "16 GT/s", 120 }, { "32 GT/s", 240 }, /* Kernel 5.6 and earlier */ { "2.5 GT/s PCIe", 15 }, { "5.0 GT/s PCIe", 30 }, { "8.0 GT/s PCIe", 60 }, { "16.0 GT/s PCIe", 120 }, { "32.0 GT/s PCIe", 240 }, { "64.0 GT/s PCIe", 480 }, @@ -982,8 +984,7 @@ ncclResult_t ncclTopoMakePciParent(struct ncclXml* xml, struct ncclXmlNode** par return ncclSuccess; } -ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, ncclNetVDeviceProps_t* vProps, -struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, ncclNetVDeviceProps_t* vProps, struct ncclXmlNode** physNetNodes) { if (vProps->ndevs > NCCL_NET_MAX_DEVS_PER_NIC) { WARN("TOPO/NET : Tried to merge too many NICs. 
%d > %d", vProps->ndevs, NCCL_NET_MAX_DEVS_PER_NIC); return ncclInternalError; @@ -997,7 +998,7 @@ struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDev // Trigger the merge, then get the new device's properties int vDevIndex = 0; - ncclResult_t ret = makeVDevice(&vDevIndex, vProps); + ncclResult_t ret = netInfo->makeVDevice(&vDevIndex, vProps); if (ret != ncclSuccess) { INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.", vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]); @@ -1015,9 +1016,10 @@ struct ncclXmlNode** physNetNodes, ncclResult_t (*makeVDevice)(int*, ncclNetVDev return ncclSuccess; } -ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs) { ncclResult_t ret = ncclSuccess; - INFO(NCCL_ENV|NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); + const char* str = netInfo->forceMerge; + INFO(NCCL_ENV | NCCL_NET, "TOPO/NET : Force-fusing NICs using NCCL_NET_FORCE_MERGE=%s", str); char* ncStr; NCCLCHECK(ncclCalloc(&ncStr, strlen(str)+1)); strcpy(ncStr, str); @@ -1053,7 +1055,7 @@ ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, goto fail; } - ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice); + ret = ncclTopoMakeVnic(xml, netInfo, &vProps, physNetNodes); if (ret == ncclSuccess) { // Only set that a device is "placed" after successfully making a vNic (it's possible to exit before this) for (int i = 0; i < vProps.ndevs; i++) { @@ -1075,7 +1077,7 @@ ncclResult_t ncclTopoForceMerge(struct ncclXml* xml, char* str, int* placedDevs, goto exit; } -ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*)) { +ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int* placedDevs, ncclNetProperties_t* propsList, struct ncclXmlNode** physNetNodes, int nPhysDevs) { // Compute the path type between each device int* paths = NULL; ncclResult_t res = ncclSuccess; @@ -1105,7 +1107,7 @@ ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedD // Select each unplaced device "j" which is at most "mergeLevel" distance from "i", but not equal to "i" // (Don't merge the same device with itself) for (int j = 0; j < nPhysDevs; j++) { - if (paths[i*nPhysDevs + j] <= mergeLevel && + if (paths[i*nPhysDevs + j] <= netInfo->mergeLevel && placedDevs[j] == 0 && j != i) { vProps.devs[vProps.ndevs++] = j; placedDevs[j] = 1; @@ -1119,7 +1121,7 @@ ncclResult_t ncclTopoAutoMerge(struct ncclXml* xml, int mergeLevel, int* placedD return ncclInternalError; } - ncclResult_t ret = ncclTopoMakeVnic(xml, &vProps, physNetNodes, makeVDevice); + ncclResult_t ret = ncclTopoMakeVnic(xml, netInfo, &vProps, physNetNodes); // Merging failed. 
// Mark all as unplaced and increase their distance to disconnected (PATH_DIS) @@ -1157,6 +1159,92 @@ struct kvDict nicPathKvList[] = { { NULL, 0 } }; + +ncclResult_t ncclTopoFindLinkWidthRec(ncclXmlNode* node, ncclXmlNode** physNetNodes, int ndevs, int* foundPhysNet, int* linkWidth) { + int myLinkWidth = 0; + if (strcmp(node->name, "pci") == 0) { + NCCLCHECK(xmlGetAttrInt(node, "link_width", &myLinkWidth)); +#ifdef ENABLE_TRACE + const char *busidAttr, *linkAttr; + NCCLCHECK(xmlGetAttrStr(node, "busid", &busidAttr)); + NCCLCHECK(xmlGetAttr(node, "link_width", &linkAttr)); + TRACE(NCCL_GRAPH, "Found link_width (%s)=%d for busid=%s", linkAttr, myLinkWidth, busidAttr); +#endif + } + + *foundPhysNet = 0; + // Detect if a physical child is found. This information will be propagated up the stack. + int devId = 0; + while (devId < ndevs && !(*foundPhysNet)) *foundPhysNet = (node == physNetNodes[devId++]); + + int totalChildLinkWidth = 0; + for (int i = 0; i < node->nSubs; i++) { + ncclXmlNode* child = node->subs[i]; + int found = 0; + int tempLinkWidth = 0; + NCCLCHECK(ncclTopoFindLinkWidthRec(child, physNetNodes, ndevs, &found, &tempLinkWidth)); + if (found) { + *foundPhysNet = 1; + totalChildLinkWidth += tempLinkWidth; + } + } + + if (*foundPhysNet == 0) { + // No child NICs were found, do not accrue any detected link_width + *linkWidth = 0; + INFO(NCCL_GRAPH, "Did not find child net device. Returning link_width=%d totalChildLinkWidth=%d", *linkWidth, totalChildLinkWidth); + } else if (totalChildLinkWidth == 0) { + // If A child NIC was found but no link_width was detected among children, assign the link_width to mine (I am the first pci node right above the physNetNode). + *linkWidth = myLinkWidth; + INFO(NCCL_GRAPH, "Found child net device for %s. Returning link_width=%d totalChildLinkWidth=%d", node->name, *linkWidth, totalChildLinkWidth); + } else { + // Standard recursive accrual of link_width. The link_width is either the bottleneck of this PCI node's width or the sum of its children's width. + *linkWidth = myLinkWidth > 0 ? std::min(myLinkWidth, totalChildLinkWidth) : totalChildLinkWidth; + INFO(NCCL_GRAPH, "Found child net device for %s. 
Returning link_width=%d totalChildLinkWidth=%d", node->name, *linkWidth, totalChildLinkWidth); + } + + return ncclSuccess; +} + +// DFS over nodes under common parent +// Exclude link widths of non-physNetNodes chains +ncclResult_t ncclTopoFindLinkWidth(ncclXmlNode* parent, ncclXmlNode** physNetNodes, int ndevs, int* linkWidth) { + *linkWidth = 0; + for (int i = 0; i < parent->nSubs; i++) { + ncclXmlNode* child = parent->subs[i]; + int foundPhysNet = 0; + int childLinkWidth = 0; + NCCLCHECK(ncclTopoFindLinkWidthRec(child, physNetNodes, ndevs, &foundPhysNet, &childLinkWidth)); + if (foundPhysNet) { + *linkWidth += childLinkWidth; + } + } + + return ncclSuccess; +} + +ncclResult_t ncclTopoWidenLinks(ncclXmlNode** physNetNodes, int ndevs, ncclXmlNode* parent) { + int sumLinkWidth = 0; + NCCLCHECK(ncclTopoFindLinkWidth(parent, physNetNodes, ndevs, &sumLinkWidth)); + for (int i = 0; i < ndevs; i++) { + ncclXmlNode* temp = physNetNodes[i]; + while (temp != parent) { + if (strcmp(temp->name, "pci") == 0) { + NCCLCHECK(xmlSetAttrInt(temp, "link_width", sumLinkWidth)); + TRACE(NCCL_GRAPH, "Set link_width to %d for node %s", sumLinkWidth, temp->name); + } + temp = temp->parent; + } + } + + if (strcmp(parent->name, "pci") == 0) { + NCCLCHECK(xmlSetAttrInt(parent, "link_width", sumLinkWidth)); + TRACE(NCCL_GRAPH, "Set link_width to %d for node %s", sumLinkWidth, parent->name); + } + + return ncclSuccess; +} + ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclNetVDeviceProps_t* vProps, ncclXmlNode** parent) { ncclNetProperties_t props[NCCL_NET_MAX_DEVS_PER_NIC]; ncclXmlNode* physNetNodes[NCCL_NET_MAX_DEVS_PER_NIC]; @@ -1170,54 +1258,50 @@ ncclResult_t ncclTopoGetVNicParent(struct ncclXml* xml, ncclResult_t (*getProper int path = PATH_LOC; NCCLCHECK(ncclTopoGetPath(physNetNodes, vProps->ndevs, &path, parent)); - if (path == PATH_LOC) { - *parent = NULL; - } else if (parent && strcmp((*parent)->name, "pci") == 0) { - // Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist - const char* c; - NCCLCHECK(xmlGetAttrStr(*parent, "class", &c)); - if (strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) { + if (path == PATH_PHB || path == PATH_PXB || path == PATH_PIX) { + INFO(NCCL_GRAPH, "Widening links"); + NCCLCHECK(ncclTopoWidenLinks(physNetNodes, vProps->ndevs, *parent)); + } + + if (*parent) { + if (strcmp((*parent)->name, "pci") == 0) { + // Compare PCI class here to avoid NCCL WARN when the "class" attribute doesn't exist + const char* c; + NCCLCHECK(xmlGetAttrStr(*parent, "class", &c)); + if (c && strcmp(c, PCI_BRIDGE_DEVICE_CLASS) == 0) { + // If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid + NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); + } + } else if (strcmp((*parent)->name, "cpu") == 0) { // If the common parent is a PCI switch, we must reparent the new NIC under a made up pci device with a unique busid NCCLCHECK(ncclTopoMakePciParent(xml, parent, physNetNodes[0])); } } + TRACE(NCCL_GRAPH, "Selected parent %s with path %d", (*parent)->name, path); return ncclSuccess; } -ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*getProperties)(int, ncclNetProperties_t*), int physicalDevs) { +ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, struct ncclTopoNetInfo* netInfo, int physicalDevs) { int* placedDevs = NULL; struct ncclXmlNode** physNetNodes = NULL; - if 
(physicalDevs == 0) return ncclSuccess; - - ncclCalloc(&physNetNodes, physicalDevs); + ncclNetProperties_t* props = NULL; ncclResult_t res = ncclSuccess; + if (physicalDevs == 0) return ncclSuccess; - ncclNetProperties_t* props = NULL; - ncclCalloc(&props, physicalDevs); + NCCLCHECK(ncclCalloc(&physNetNodes, physicalDevs)); + NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); + NCCLCHECK(ncclCalloc(&props, physicalDevs)); for (int i = 0; i < physicalDevs; i++) { - NCCLCHECKGOTO(getProperties(i, props + i), res, out); + NCCLCHECKGOTO(netInfo->getProperties(i, props + i), res, out); struct ncclXmlNode* physNetNode; NCCLCHECKGOTO(xmlFindTagKv(xml, "net", &physNetNode, "name", props[i].name), res, out); physNetNodes[i] = physNetNode; TRACE(NCCL_GRAPH, "Found physical ncclNet node %d %s", i, props[i].name); } - // By default, don't merge any devices - int mergeLevel; - mergeLevel = PATH_PORT; - { // Avoids warnings related to jumping to "out" - const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); - if (mergeLevelEnv) kvConvertToInt(mergeLevelEnv, &mergeLevel, nicPathKvList); - char* forceMerge = (char*) ncclGetEnv("NCCL_NET_FORCE_MERGE"); - NCCLCHECK(ncclCalloc(&placedDevs, physicalDevs)); - memset(placedDevs, 0, sizeof(int)*physicalDevs); - - if (forceMerge) { - NCCLCHECKGOTO(ncclTopoForceMerge(xml, forceMerge, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); - } - } - NCCLCHECKGOTO(ncclTopoAutoMerge(xml, mergeLevel, placedDevs, props, physNetNodes, physicalDevs, makeVDevice), res, out); + if (netInfo->forceMerge) NCCLCHECKGOTO(ncclTopoForceMerge(xml, netInfo, placedDevs, props, physNetNodes, physicalDevs), res, out); + NCCLCHECKGOTO(ncclTopoAutoMerge(xml, netInfo, placedDevs, props, physNetNodes, physicalDevs), res, out); out: free(physNetNodes); @@ -1226,10 +1310,10 @@ ncclResult_t ncclTopoMakeVNics(struct ncclXml* xml, ncclResult_t (*makeVDevice)( return res; } -static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), const char* netName, int coll, int virtualNics, bool dmaBufSupport) { +static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIndex, struct ncclTopoNetInfo* netInfo, int virtualNics) { for (int n = startIndex; n < endIndex; n++) { ncclNetProperties_t props; - NCCLCHECK(getProperties(n, &props)); + NCCLCHECK(netInfo->getProperties(n, &props)); struct ncclXmlNode* netNode = NULL; struct ncclXmlNode* parent = NULL; if (virtualNics) { @@ -1237,7 +1321,7 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn NCCLCHECK(xmlFindTagKv(xml, "net", &net, "name", props.name)); // In the event of multithreaded use case, we need to re-discover the shared parent of the given devices for this vNIC // Only run this if the net doesn't exist locally - this may alter the XML state - if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, getProperties, &props.vProps, &parent)); + if (net == NULL) NCCLCHECK(ncclTopoGetVNicParent(xml, netInfo->getProperties, &props.vProps, &parent)); } NCCLCHECK(ncclTopoFillNet(xml, props.pciPath, props.name, &netNode, parent)); @@ -1248,18 +1332,18 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn NCCLCHECK(xmlSetAttrInt(netNode, "keep", 1)); int dev; xmlGetAttrIntDefault(netNode, "dev", &dev, -1); - if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET : Changing %s dev index from %d to %d", netName, dev, n); + if (dev != -1 && dev != n) INFO(NCCL_GRAPH, "TOPO/NET 
: Changing %s dev index from %d to %d", netInfo->name, dev, n); NCCLCHECK(xmlSetAttrInt(netNode, "dev", n)); NCCLCHECK(xmlInitAttrInt(netNode, "latency", props.latency)); NCCLCHECK(xmlInitAttrInt(netNode, "speed", props.speed)); NCCLCHECK(xmlInitAttrInt(netNode, "port", props.port)); NCCLCHECK(xmlInitAttrUint64(netNode, "guid", props.guid)); NCCLCHECK(xmlInitAttrInt(netNode, "maxconn", props.maxComms)); - bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); - INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netName, gdrSupport ? "Enabled" : "Disabled", n, props.name); + bool gdrSupport = (props.ptrSupport & NCCL_PTR_CUDA) || (netInfo->dmaBufSupport && (props.ptrSupport & NCCL_PTR_DMABUF)); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA %s for HCA %d '%s'", netInfo->name, gdrSupport ? "Enabled" : "Disabled", n, props.name); NCCLCHECK(xmlInitAttrInt(netNode, "gdr", gdrSupport)); // Only set coll if it's not 0 - if (coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", coll)); + if (netInfo->coll) NCCLCHECK(xmlInitAttrInt(netNode, "coll", netInfo->coll)); const char* keepAttr; NCCLCHECK(xmlGetAttr(netNode, "coll", &colAttr)); @@ -1272,51 +1356,45 @@ static ncclResult_t ncclTopoPopulateNics(ncclXml* xml, int startIndex, int endIn } // Calls to network plugin APIs should be protected. This function should be called inside a per-process lock. -ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport) { - int usePhysicalDevices = (dumpXmlFile || makeVDevice == NULL); - if (state->nPhysicalNics == -1) NCCLCHECK(devices(&state->nPhysicalNics)); - // Enumerate physical devices - NCCLCHECK(ncclTopoPopulateNics(xml, 0, state->nPhysicalNics, getProperties, netName, coll, false, dmaBufSupport)); +ncclResult_t ncclTopoProcessNet(ncclXml* xml, const char* dumpXmlFile, struct ncclTopoNetInfo* net) { + bool usePhysicalDevices = (dumpXmlFile || net->makeVDevice == NULL); + int nPhysicalNics, nVirtualNics; + NCCLCHECK(net->getDevCount(net->netPluginIndex, &nPhysicalNics, &nVirtualNics)); + // List the physical devices in the topo + NCCLCHECK(ncclTopoPopulateNics(xml, 0, nPhysicalNics, net, /*virtual=*/false)); if (!usePhysicalDevices) { - if (state->nVirtualNics == -1) { - NCCLCHECK(ncclTopoMakeVNics(xml, makeVDevice, getProperties, state->nPhysicalNics)); + // Virtual devices are only created once per network + if (nVirtualNics == NCCL_UNDEF_DEV_COUNT) { + NCCLCHECK(ncclTopoMakeVNics(xml, net, nPhysicalNics)); + // Update the number of virtual devices both locally and in the state tracking the plugin. 
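The getDevCount/setVirtDevCount hooks above cache the virtual-device count per plugin, so NIC fusion runs only once and later communicators reuse the result (zero virtual devices is a valid, cached outcome). A toy sketch of that bookkeeping, with made-up helper names:

// Toy per-plugin device-count cache mirroring the getDevCount/setVirtDevCount idea.
// Plugin indices and the UNDEF sentinel here are illustrative only.
#include <cassert>

constexpr int kMaxPlugins = 4;
constexpr int kUndefDevCount = -1;

struct DevCounts { int physical = kUndefDevCount; int virt = kUndefDevCount; };
static DevCounts g_counts[kMaxPlugins];

// Record how many virtual devices were created for a plugin (0 is valid).
static void setVirtDevCountFor(int plugin, int nVirtual) { g_counts[plugin].virt = nVirtual; }

// Return cached counts; virt stays kUndefDevCount until fusion has run once.
static void getDevCountFor(int plugin, int* nPhysical, int* nVirtual) {
  *nPhysical = g_counts[plugin].physical;
  *nVirtual  = g_counts[plugin].virt;
}

int main() {
  g_counts[0].physical = 8;          // discovered by the plugin's devices() call
  int phys, virt;
  getDevCountFor(0, &phys, &virt);
  if (virt == kUndefDevCount) {      // first communicator: fuse NICs, then record the result
    int totalAfterFusion = 10;       // pretend devices() now reports 10 (8 physical + 2 fused)
    setVirtDevCountFor(0, totalAfterFusion - phys);
  }
  getDevCountFor(0, &phys, &virt);
  assert(virt == 2);
  return 0;
}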
+ // Note: 0 is a valid number of virtual devices int nDevs; - NCCLCHECK(devices(&nDevs)); - state->nVirtualNics = nDevs - state->nPhysicalNics; + NCCLCHECK(net->devices(&nDevs)); + nVirtualNics = nDevs - nPhysicalNics; + NCCLCHECK(net->setVirtDevCount(net->netPluginIndex, nVirtualNics)); } - if (state->nVirtualNics > 0) { - // Populate new devices - NCCLCHECK(ncclTopoPopulateNics(xml, state->nPhysicalNics, state->nPhysicalNics+state->nVirtualNics, getProperties, netName, coll, true, dmaBufSupport)); + // populate the virtual devices if any + if (nVirtualNics > 0) { + NCCLCHECK(ncclTopoPopulateNics(xml, nPhysicalNics, nPhysicalNics + nVirtualNics, net, /*virtual=*/true)); } } return ncclSuccess; } -static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER; -ncclTopoNetState netStates[NCCL_NET_MAX_PLUGINS] = {}; -ncclTopoNetState collNetStates[NCCL_NET_MAX_PLUGINS] = {}; -ncclResult_t ncclTopoGetSharedState(ncclTopoNetState** state, const char* name, ncclTopoNetState* states) { - INFO(NCCL_GRAPH, "Retrieving state for %s", name); - for (int i = 0; i < NCCL_NET_MAX_PLUGINS; i++) { - // Empty slot - if (states[i].name == NULL) { - states[i].nVirtualNics = -1; - states[i].nPhysicalNics = -1; - states[i].name = strdup(name); - *state = states + i; - INFO(NCCL_GRAPH, "Initialized state %d for %s", i, name); - return ncclSuccess; - // Found my slot - } else if (strcmp(states[i].name, name) == 0) { - *state = states + i; - return ncclSuccess; - } +ncclResult_t ncclTopoGetFusionEnv(int* mergeLevel, const char** forceMerge) { + if (forceMerge) *forceMerge = ncclGetEnv("NCCL_NET_FORCE_MERGE"); + const char* mergeLevelEnv = ncclGetEnv("NCCL_NET_MERGE_LEVEL"); + if (mergeLevelEnv) { + kvConvertToInt(mergeLevelEnv, mergeLevel, nicPathKvList); + } else { + *mergeLevel = PATH_PORT; } - WARN("NET/TOPO : Couldn't find net with name %s", name); - return ncclInternalError; + return ncclSuccess; } +static std::mutex netMutex; + ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system, const char* dumpXmlFile) { ncclResult_t ret = ncclSuccess; struct ncclXml* xml; @@ -1324,7 +1402,7 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy int* localRanks = NULL; struct ncclXml* rankXml; int localRank = -1, nLocalRanks = 0; - int netLockHeld = 0; + struct ncclTopoNetInfo netInfo = {0}; NCCLCHECK(xmlAlloc(&xml, NCCL_TOPO_XML_MAX_NODES)); const char* xmlTopoFile = ncclGetEnv("NCCL_TOPO_FILE"); if (xmlTopoFile) { @@ -1364,21 +1442,35 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy // Auto-detect NICs if needed. net/collnet share the same xml/graph nodes, // so we start with collnet so that it has precedence. 
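The hunk below replaces the pthread_mutex_lock/unlock pair (and the netLockHeld cleanup on the failure path) with a scoped std::lock_guard, so the lock is released on every exit path automatically. A minimal sketch of that RAII pattern:

// Sketch of the RAII locking pattern: a scoped std::lock_guard releases the
// mutex on every return path, so explicit "unlock on fail" bookkeeping
// (the old netLockHeld flag) is no longer needed.
#include <mutex>

static std::mutex g_netMutex;

enum Result { Success, Error };

static Result importPlugins(bool failEarly) {
  std::lock_guard<std::mutex> lock(g_netMutex);  // held for the whole block
  if (failEarly) return Error;                   // mutex released here too
  // ... populate topology from plugins ...
  return Success;                                // and here
}

int main() { return importPlugins(false) == Success ? 0 : 1; }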
- pthread_mutex_lock(&netLock); - netLockHeld = 1; - INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology"); - ncclTopoNetState* state; - state = NULL; - if (collNetSupport(comm)) { - NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclCollNet->name, collNetStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(xml, 1, dumpXmlFile, state, - comm->ncclCollNet->getProperties, comm->ncclCollNet->makeVDevice, comm->ncclCollNet->devices, comm->ncclCollNet->name, comm->dmaBufSupport), ret, fail); - } - NCCLCHECKGOTO(ncclTopoGetSharedState(&state, comm->ncclNet->name, netStates), ret, fail); - NCCLCHECKGOTO(ncclTopoProcessNet(xml, 0, dumpXmlFile, state, - comm->ncclNet->getProperties, comm->ncclNet->makeVDevice, comm->ncclNet->devices, comm->ncclNet->name, comm->dmaBufSupport), ret, fail); - pthread_mutex_unlock(&netLock); - netLockHeld = 0; + { + std::lock_guard lock(netMutex); + INFO(NCCL_GRAPH, "TOPO/NET : Importing network plugins to topology"); + if (collNetSupport(comm)) { + netInfo.coll = 1; + netInfo.netPluginIndex = comm->netPluginIndex; + netInfo.dmaBufSupport = comm->dmaBufSupport; + netInfo.getDevCount = ncclCollNetGetDevCount; + netInfo.setVirtDevCount = ncclCollNetSetVirtDevCount; + netInfo.name = comm->ncclCollNet->name; + netInfo.getProperties = comm->ncclCollNet->getProperties; + netInfo.makeVDevice = comm->ncclCollNet->makeVDevice; + netInfo.devices = comm->ncclCollNet->devices; + NCCLCHECK(ncclTopoGetFusionEnv(&netInfo.mergeLevel, &netInfo.forceMerge)); + NCCLCHECKGOTO(ncclTopoProcessNet(xml, dumpXmlFile, &netInfo), ret, fail); + } + + netInfo.coll = 0; + netInfo.netPluginIndex = comm->netPluginIndex; + netInfo.dmaBufSupport = comm->dmaBufSupport; + netInfo.getDevCount = ncclNetGetDevCount; + netInfo.setVirtDevCount = ncclNetSetVirtDevCount; + netInfo.name = comm->ncclNet->name; + netInfo.getProperties = comm->ncclNet->getProperties; + netInfo.makeVDevice = comm->ncclNet->makeVDevice; + netInfo.devices = comm->ncclNet->devices; + NCCLCHECK(ncclTopoGetFusionEnv(&netInfo.mergeLevel, &netInfo.forceMerge)); + NCCLCHECKGOTO(ncclTopoProcessNet(xml, dumpXmlFile, &netInfo), ret, fail); + } // Remove XML branches which don't have a node with keep="1" (typically when importing a topology) NCCLCHECKGOTO(ncclTopoTrimXml(xml), ret, fail); @@ -1436,7 +1528,6 @@ ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** sy free(xml); return ret; fail: - if (netLockHeld) pthread_mutex_unlock(&netLock); goto exit; } @@ -1491,6 +1582,38 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c return ncclSuccess; } +enum netDevsPolicy { + NETDEVS_POLICY_AUTO = 0x0, + NETDEVS_POLICY_ALL = 0x1, + NETDEVS_POLICY_MAX = 0x2, + NETDEVS_POLICY_UNDEF = 0xffffffff +}; + +static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF; +static int netDevsPolicyNum = -1; + +static void getNetDevsPolicyOnce() { + const char* envStr = ncclGetEnv("NCCL_NETDEVS_POLICY"); + if (envStr) { + if (strcasecmp(envStr, "AUTO") == 0) { + netDevsPolicy = NETDEVS_POLICY_AUTO; + } else if (strcasecmp(envStr, "ALL") == 0) { + netDevsPolicy = NETDEVS_POLICY_ALL; + } else if (strncasecmp(envStr, "MAX:", strlen("MAX:")) == 0) { + int envNum = atoi(envStr + strlen("MAX:")); + if (envNum > 0) { + netDevsPolicy = NETDEVS_POLICY_MAX; + netDevsPolicyNum = envNum; + } + } + if (netDevsPolicy == NETDEVS_POLICY_UNDEF) + INFO(NCCL_ENV, "Unable to recognize NCCL_NETDEVS_POLICY=%s, using NCCL_NETDEVS_POLICY_AUTO instead.", envStr); + else + INFO(NCCL_ENV, "NCCL_NETDEVS_POLICY 
set by environment to %s", envStr); + } + if (netDevsPolicy == NETDEVS_POLICY_UNDEF) netDevsPolicy = NETDEVS_POLICY_AUTO; +} + ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); @@ -1503,13 +1626,30 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch return ncclInternalError; } - int localGpus[NCCL_TOPO_MAX_NODES]; - int localGpuCount; - NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); + static pthread_once_t once = PTHREAD_ONCE_INIT; + pthread_once(&once,getNetDevsPolicyOnce); + int netsPerGpu = 0; + if (netDevsPolicy == NETDEVS_POLICY_AUTO) { + int localGpus[NCCL_TOPO_MAX_NODES]; + int localGpuCount; + NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); + netsPerGpu = DIVUP(localNetCount, localGpuCount); + } else if (netDevsPolicy == NETDEVS_POLICY_ALL) { + netsPerGpu = localNetCount; + } else if (netDevsPolicy == NETDEVS_POLICY_MAX) { + if (netDevsPolicyNum <= 0) { + WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum); + return ncclInternalError; + } + netsPerGpu = std::min(netDevsPolicyNum, localNetCount); + } else { + WARN("Unknown netDevs policy"); + return ncclInternalError; + } int net = system->nodes[GPU].nodes[gpu].gpu.dev; if (isPow2(localNetCount)) net = mirrorBits(net, localNetCount); - net += channelId%(DIVUP(localNetCount,localGpuCount)); + net += channelId%(netsPerGpu); if (id) *id = system->nodes[NET].nodes[localNets[net%localNetCount]].id; if (dev) *dev = system->nodes[NET].nodes[localNets[net%localNetCount]].net.dev; return ncclSuccess; @@ -1567,25 +1707,10 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu cpu_set_t mask; SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); -#ifdef ENABLE_TRACE - { - char affinityStr[sizeof(cpu_set_t)*2]; - TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", gpu->gpu.dev, - ncclCpusetToRangeStr(&mask, affinityStr, sizeof(affinityStr))); - } -#endif - // Get the affinity of the CPU close to our GPU. cpu_set_t cpuMask = cpu->cpu.affinity; -#ifdef ENABLE_TRACE - { - char affinityStr[sizeof(cpu_set_t)*2]; - TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", gpu->gpu.dev, - ncclCpusetToRangeStr(&cpuMask, affinityStr, sizeof(affinityStr))); - } -#endif - + // Get the final affinity cpu_set_t finalMask; if (ncclParamIgnoreCpuAffinity()) // Ignore the CPU affinity set and use the GPU one instead @@ -1596,12 +1721,22 @@ ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu memcpy(affinity, &finalMask, sizeof(cpu_set_t)); - // If there is a non empty set, use it to set affinity + // display the final affinity + char msg[1024] = ""; + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "Affinity for GPU %d is ", gpu->gpu.dev); if (CPU_COUNT(&finalMask)) { - char affinityStr[sizeof(cpu_set_t)*2]; - INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", gpu->gpu.dev, - ncclCpusetToRangeStr(&finalMask, affinityStr, sizeof(affinityStr))); + (void)ncclCpusetToRangeStr(&finalMask, msg + strlen(msg), sizeof(msg) - strlen(msg)); + } else { + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), "empty, ignoring"); + } + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ". 
(GPU affinity = "); + (void)ncclCpusetToRangeStr(&cpuMask, msg + strlen(msg), sizeof(msg) - strlen(msg)); + if (!ncclParamIgnoreCpuAffinity()) { + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " ; CPU affinity = "); + (void)ncclCpusetToRangeStr(&mask, msg + strlen(msg), sizeof(msg) - strlen(msg)); } + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), ")."); + INFO(NCCL_INIT, "%s: %s", __func__, msg); return ncclSuccess; } diff --git a/src/graph/topo.h b/src/graph/topo.h index 9ef10ff2d..49d408d95 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -190,12 +190,26 @@ ncclResult_t ncclTopoGetGpuMinPath(struct ncclTopoSystem* system, int type, int* ncclResult_t ncclTopoGetGpuMaxPath(struct ncclTopoSystem* system, int type, int* max); ncclResult_t ncclTopoSplitNvLink(struct ncclTopoSystem* system, int* splitNvLink); -struct ncclTopoNetState { - int nVirtualNics; - int nPhysicalNics; +struct ncclTopoNetInfo { + bool coll; + // communicator-specific information + int netPluginIndex; + bool dmaBufSupport; + // NIC fusion + int mergeLevel; + const char* forceMerge; + // dev count tracking functions (not part of ncclNet) + ncclResult_t (*getDevCount)(int, int*, int*); + ncclResult_t (*setVirtDevCount)(int, int); + // ncclNet API functions const char* name; + ncclResult_t (*getProperties)(int, ncclNetProperties_t*); + ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*); + ncclResult_t (*devices)(int*); }; -ncclResult_t ncclTopoProcessNet(ncclXml* xml, int coll, const char* dumpXmlFile, ncclTopoNetState* state, ncclResult_t (*getProperties)(int, ncclNetProperties_t*), ncclResult_t (*makeVDevice)(int*, ncclNetVDeviceProps_t*), ncclResult_t (*devices)(int*), const char* netName, bool dmaBufSupport); + +ncclResult_t ncclTopoProcessNet(ncclXml* xml, const char* dumpXmlFile, struct ncclTopoNetInfo* net); +ncclResult_t ncclTopoGetFusionEnv(int* mergeLevel, const char** forceMerge); #define NCCL_TOPO_XML_MAX_NODES 256 #define NCCL_GRAPH_XML_MAX_NODES 4096 @@ -240,6 +254,8 @@ static ncclResult_t ncclTopoDevToRank(struct ncclTopoSystem* system, int dev, in return ncclInternalError; } +extern struct kvDict nicPathKvList[]; + static ncclResult_t ncclTopoIdToNetDev(struct ncclTopoSystem* system, int64_t id, int* netDev) { *netDev = -1; for (int i=0; inodes[NET].count; i++) { diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 8e99f18c3..bfb279850 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -8,6 +8,7 @@ #include "device.h" #include "comm.h" #include "topo.h" +#include "nccl_tuner.h" NCCL_PARAM(Nthreads, "NTHREADS", -2); NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2); @@ -129,63 +130,72 @@ ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes goto exit; } -// Latencies in us, Bandwidths in GB/s -// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } -static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { - { 6.8, 14.0, 8.4 }, { 6.6, 14.0, 8.4 }, // Tree, Ring - { 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain - { 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree - -// NVLink, PCI, Network -#define NCCL_HW_NVLINK 0 -#define NCCL_HW_PCI 1 -#define NCCL_HW_NET 2 -static float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = -{ /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { .6, 1.25, 4.0 }, /* Ring (LL/LL128/Simple)*/ { .6, 1.9, 3.4 }, - /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 }, - /* NVLS */ { 0, 0, 25 }, /* NVLSTree */ { 0, 0, 25 } }, - /* PCI */ - { /* Tree 
(LL/LL128/Simple)*/ { 1.0, 1.9, 4.0 }, /* Ring (LL/LL128/Simple)*/ { 1.0, 2.5, 5.7 }, - /* CollNetDirect (Simple)*/ { 0, 0, 3.7 }, /* CollNetChain (Simple)*/ { 0, 0, 2.8 }, - /* NVLS */ { 0, 0, 0 }, /* NVLSTree */ { 0, 0, 0 } }, - /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 5.0, 8.5, 14 }, /* Ring (LL/LL128/Simple)*/ { 2.7, 4.0, 14.0 }, - /* CollNetDirect (Simple)*/ { 0, 0, 31 }, /* CollNetChain (Simple)*/ { 0, 0, 30 }, - /* NVLS */ { 0, 0, 18 }, /* NVLSTree */ { 0, 0, 14 } } -}; - -/* Array indexes used below */ -#define VOLTA_COMPCAP_IDX 0 -#define AMPERE_COMPCAP_IDX 1 -#define HOPPER_COMPCAP_IDX 2 -#define BLACKWELL_COMPCAP_IDX 3 - -// LL128 max BW per channel -static const double llMaxBws[][3] = { - /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, - /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0}, - /* Hopper-N1/AMD-N2/AMD-N4) */ {141.0, 45.0 /*avg of ring & tree*/, 35.0}, - /* Blackwell-N1/AMD-N2/AMD-N4) */ {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, +// NVLS efficiency factor. +static const float nvlsEfficiency[NCCL_NUM_COMPCAPS] = { + 0.0f, // Volta + 0.0f, // Ampere + 0.85f, // Hopper + 0.74f, // Blackwell }; -static const double perChMaxRingLL128Bws[][3] = { - /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, - /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, - /* Hopper (N1/N2/N4) */ {36.7, 36.7, 36.7}, - /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*36.7}, -}; -static const double perChMaxTreeLL128Bws[][3] = { - /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, - /* Ampere (N1/N2/N4) */ {20.0, 20.0, 20.0}, - /* Hopper (N1/N2/N4) */ {36.7, 36.7, 29.0}, - /* Blackwell (N1/N2/N4) */ {2*36.7, 2*36.7, 2*29.0}, -}; -static const double perChMaxTreeBws[][3] = { - /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, - /* Ampere (N1/N2/N4) */ {24.0, 23.6, 17.8}, - /* Hopper (N1/N2/N4) */ {38.7, 41.4, 36.0}, - /* Blackwell (N1/N2/N4) */ {2*38.7, 2*41.4, 2*36.0}, +// Default tuner constants +static const ncclTunerConstants_t ncclTunerConstantsDefaults = { + .baseLatencies = { + { 6.8, 14.0, 8.4 }, { 6.6, 14.0, 8.4 }, // Tree, Ring + { 0, 0, 0 }, { 0, 0, 0 }, // Collnet Direct, Chain + { 0, 0, 0 }, { 0, 0, 0 }, // NVLS, NVLS Tree + { 8.0, 8.0, 8.0 } // PAT + }, + .hwLatencies = { + /* NVLINK */ + { { .6, 1.25, 4.0 }, { .6, 1.9, 3.4 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/ + { 0, 0, 3.7 }, { 0, 0, 2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/ + { 0, 0, 25 }, { 0, 0, 25 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/ + { 0, 0, 4.0 } /* PAT (LL/LL128/Simple)*/ + }, + /* PCI */ + { { 1.0, 1.9, 4.0 }, { 1.0, 2.5, 5.7 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/ + { 0, 0, 3.7 }, { 0, 0, 2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/ + { 0, 0, 0 }, { 0, 0, 0 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/ + { 0, 0, 4.0 } /* PAT (LL/LL128/Simple)*/ + }, + /* NET */ + { { 5.0, 8.5, 14 }, { 2.7, 4.0, 14.0 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple)*/ + { 0, 0, 31 }, { 0, 0, 30 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple)*/ + { 0, 0, 18 }, { 0, 0, 14 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple)*/ + { 0, 0, 14 } /* PAT (LL/LL128/Simple)*/ + }, + }, + .llMaxBws = { + {39.0, 39.0, 20.4}, /* Volta-N1/Intel-N2/Intel-N4) */ + {87.7, 22.5 /*avg of ring & tree*/, 19.0}, /* Ampere-N1/AMD-N2/AMD-N4) */ + {141.0, 45.0 /*avg of ring & tree*/, 35.0}, /* Hopper-N1/AMD-N2/AMD-N4) */ + {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, /* 
Blackwell-N1/AMD-N2/AMD-N4) */ + }, + .perChMaxRingLL128Bws = { + {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */ + {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ + {36.7, 36.7, 36.7}, /* Hopper (N1/N2/N4) */ + {2*36.7, 2*36.7, 2*36.7}, /* Blackwell (N1/N2/N4) */ + }, + .perChMaxTreeLL128Bws = { + {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */ + {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ + {36.7, 36.7, 29.0}, /* Hopper (N1/N2/N4) */ + {55.6, 31.67, 20.0}, /* Blackwell (N1/N2/N4) */ + }, + .perChMaxTreeBws = { + {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */ + {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */ + {38.7, 41.4, 36.0}, /* Hopper (N1/N2/N4) */ + {70.0, 42.8, 24.0}, /* Blackwell (N1/N2/N4) */ + }, + .perChMaxNVLSTreeBws = { + {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */ + {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */ + {0.0, 57.7, 45.5}, /* Hopper (N1/N2/N4) */ + {0.0, 96.0, 43.1} /* Blackwell (N1/N2/N4) */ + } }; NCCL_PARAM(PatEnable, "PAT_ENABLE", 2); @@ -210,6 +220,13 @@ static float getNetOverhead(struct ncclComm* comm) { NCCL_PARAM(Ll128C2c, "LL128_C2C", 1); +ncclResult_t ncclTopoInitTunerConstants(struct ncclComm* comm) { + + comm->tunerConstants = ncclTunerConstantsDefaults; + + return ncclSuccess; +} + ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) { int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = @@ -229,17 +246,18 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int nRanks = comm->nRanks; if (nRanks <= 1) return ncclSuccess; - int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? HOPPER_COMPCAP_IDX : minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX); + int compCapIndex = minCompCap >= 100 ? NCCL_BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? NCCL_HOPPER_COMPCAP_IDX : minCompCap >= 80 ? NCCL_AMPERE_COMPCAP_IDX : NCCL_VOLTA_COMPCAP_IDX); int index2 = nNodes <= 2 ? nNodes-1 : 2; // LL: for single node, we look at GPU type; for multi-node, we look at CPU type int index1 = nNodes == 1 ? compCapIndex : (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 
1 : 0; - double llMaxBw = llMaxBws[index1][index2]; - double perChMaxTreeBw = perChMaxTreeBws[compCapIndex][index2]; - double perChMaxRingLL128Bw = perChMaxRingLL128Bws[compCapIndex][index2]; - double perChMaxTreeLL128Bw = perChMaxTreeLL128Bws[compCapIndex][index2]; + double llMaxBw = comm->tunerConstants.llMaxBws[index1][index2]; + double perChMaxTreeBw = comm->tunerConstants.perChMaxTreeBws[compCapIndex][index2]; + double perChMaxRingLL128Bw = comm->tunerConstants.perChMaxRingLL128Bws[compCapIndex][index2]; + double perChMaxTreeLL128Bw = comm->tunerConstants.perChMaxTreeLL128Bws[compCapIndex][index2]; + double perChMaxNVLSTreeBw = comm->tunerConstants.perChMaxNVLSTreeBws[compCapIndex][index2]; // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring - if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) hwLat[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = hwLat[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; + if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE]; float ppn = (float)nRanks / nNodes; int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; @@ -264,15 +282,22 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom && a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue; int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; - if (a == NCCL_ALGO_NVLS) { + if (a == NCCL_ALGO_NVLS_TREE || a == NCCL_ALGO_NVLS) + { + // NVLS/NVLStree needs at least 2 channels + if (graphs[a]->nChannels < 2 ) continue; + // Convert to NVLS busBW/channel + float intraBw = graphs[a]->bwIntra * nvlsEfficiency[compCapIndex] * (graphs[a]->nChannels - 1) / graphs[a]->nChannels; + // AllReduce pipelines two operations. if (coll == ncclFuncAllReduce) { - bw = std::min(graphs[a]->bwIntra, graphs[a]->bwInter); + intraBw *= 2.0f; } else { - // allgather and reducescatter - bw = std::min(graphs[a]->bwIntra * (ppn - 1.0f) / ppn, graphs[a]->bwInter * 0.9f); + intraBw *= (ppn - 1) / ppn; } - } - if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2); + // Handle 2 node case of NVLSTree + float interBw = graphs[a]->bwInter * ((nNodes <= 2 && a == NCCL_ALGO_NVLS_TREE) ? 2 : 1); + bw = std::min( {intraBw, interBw, a == NCCL_ALGO_NVLS_TREE ? 
(float)perChMaxNVLSTreeBw : std::numeric_limits::max()} ); + }; float busBw = graphs[a]->nChannels * bw; // Various model refinements @@ -320,27 +345,26 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Convert bus BW to algorithm BW if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) { float ratio = 1.0f; - if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps; - else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0; + if (a == NCCL_ALGO_RING || a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= (1.0 * nRanks) / nsteps; else ratio *= .5; busBw *= ratio; } comm->bandwidths[coll][a][p] = busBw; - comm->latencies[coll][a][p] = baseLat[a][p]; - float intraLat = hwLat[intraHw[a]][a][p]; + comm->latencies[coll][a][p] = comm->tunerConstants.baseLatencies[a][p]; + float intraLat = comm->tunerConstants.hwLatencies[intraHw[a]][a][p]; // With ppn=1 latencies are fully exposed, use the Tree network latency - float interLat = ppn == 1 ? hwLat[NCCL_HW_NET][NCCL_ALGO_TREE][p] : hwLat[NCCL_HW_NET][a][p]; + float interLat = ppn == 1 ? comm->tunerConstants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_TREE][p] : comm->tunerConstants.hwLatencies[NCCL_HW_NET][a][p]; interLat += graphs[a]->latencyInter; // Also add the flush extra latency if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter; if (a == NCCL_ALGO_RING) { - float lat = hwLat[hw[a]][a][p]; + float lat = comm->tunerConstants.hwLatencies[hw[a]][a][p]; if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) { if (graphs[a]->sameChannels) { comm->latencies[coll][a][p] += lat; } else { - if (p == NCCL_PROTO_SIMPLE) lat = hwLat[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling + if (p == NCCL_PROTO_SIMPLE) lat = comm->tunerConstants.hwLatencies[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling comm->latencies[coll][a][p] += nsteps*lat; } } else { @@ -371,8 +395,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat; } else if (a == NCCL_ALGO_PAT) { if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) { - comm->latencies[coll][a][p] = 8 // Base time - + log2i(nNodes) * (interLat/3.5) // Log latency + comm->latencies[coll][a][p] += log2i(nNodes) * (interLat/3.5) // Log latency + nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point. } } diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 96b0c9a7c..010120627 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -917,31 +917,33 @@ ncclResult_t ncclTopoFillNet(struct ncclXml* xml, const char* pciPath, const cha if (*netNode != NULL) return ncclSuccess; - const char* pciSysPath = pciPath; - if (pciSysPath) { - char subSystem[PATH_MAX]; - NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); - // This is not a PCI device (virtual, usb, ...). - if (strcmp(subSystem, "pci") != 0) { - INFO(NCCL_NET|NCCL_GRAPH, "Topology detection: network path %s is not a PCI device (%s). 
Attaching to first CPU", pciSysPath, subSystem); - pciSysPath = NULL; - } - } - struct ncclXmlNode* parent = NULL; if (forceParent) { parent = forceParent; - } else if (pciSysPath) { - int offset; - for (offset=strlen(pciSysPath)-1; pciSysPath[offset] != '/'; offset--); - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - strcpy(busId, pciSysPath+offset+1); - NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent)); - NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02")); - NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); } else { - // Virtual NIC, no PCI device, attach to first CPU - NCCLCHECK(xmlFindTag(xml, "cpu", &parent)); + const char* pciSysPath = pciPath; + if (pciSysPath) { + char subSystem[PATH_MAX]; + NCCLCHECK(ncclTopoGetSubsystem(pciSysPath, subSystem)); + // This is not a PCI device (virtual, usb, ...). + if (strcmp(subSystem, "pci") != 0 && !forceParent) { + INFO(NCCL_NET | NCCL_GRAPH, "Topology detection: network path (name = %s) %s is not a PCI device (%s). Attaching to first CPU", netName, pciSysPath, subSystem); + pciSysPath = NULL; + } + } + + if (pciSysPath) { + int offset; + for (offset = strlen(pciSysPath) - 1; pciSysPath[offset] != '/'; offset--); + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + strcpy(busId, pciSysPath + offset + 1); + NCCLCHECK(ncclTopoGetPciNode(xml, busId, &parent)); + NCCLCHECK(xmlSetAttrIfUnset(parent, "class", "0x02")); + NCCLCHECK(ncclTopoGetXmlFromSys(parent, xml)); + } else { + // Virtual NIC, no PCI device, attach to first CPU + NCCLCHECK(xmlFindTag(xml, "cpu", &parent)); + } } struct ncclXmlNode* nicNode = NULL; diff --git a/src/graph/xml.h b/src/graph/xml.h index ad9f0faff..ac9ef7286 100644 --- a/src/graph/xml.h +++ b/src/graph/xml.h @@ -124,6 +124,13 @@ static ncclResult_t xmlGetAttrUint64(struct ncclXmlNode* node, const char* attrN return ncclSuccess; } +static ncclResult_t xmlGetAttrUint64Default(struct ncclXmlNode* node, const char* attrName, uint64_t* value, uint64_t defaultValue) { + const char* str; + NCCLCHECK(xmlGetAttr(node, attrName, &str)); + *value = str ? 
strtoull(str, NULL, 0) : defaultValue; + return ncclSuccess; +} + static ncclResult_t xmlGetAttrLong(struct ncclXmlNode* node, const char* attrName, int64_t* value) { const char* str; NCCLCHECK(xmlGetAttrStr(node, attrName, &str)); diff --git a/src/group.cc b/src/group.cc index 08ac54e9e..aa2824412 100644 --- a/src/group.cc +++ b/src/group.cc @@ -11,6 +11,9 @@ #include "channel.h" #include #include "bootstrap.h" +#include "ce_coll.h" +#include "profiler.h" +#include "nvtx.h" #define GROUP_MAX_RECLAIM_STEPS 10 @@ -90,7 +93,7 @@ ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job) { NCCL_API(ncclResult_t, ncclGroupStart); ncclResult_t ncclGroupStart() { ncclResult_t ret = ncclSuccess; - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECK(ncclGroupStartInternal()); TRACE_CALL("ncclGroupStart()"); @@ -100,7 +103,7 @@ ncclResult_t ncclGroupStart() { NCCL_API(ncclResult_t, ncclGroupEnd); ncclResult_t ncclGroupEnd() { ncclResult_t ret = ncclSuccess; - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECKGOTO(ncclGroupEndInternal(), ret, exit); TRACE_CALL("ncclGroupEnd()"); exit: @@ -110,7 +113,7 @@ ncclResult_t ncclGroupEnd() { NCCL_API(ncclResult_t, ncclGroupSimulateEnd, ncclSimInfo_t* simInfo); ncclResult_t ncclGroupSimulateEnd(ncclSimInfo_t* simInfo) { ncclResult_t ret = ncclSuccess; - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECKGOTO(ncclGroupEndInternal(simInfo), ret, exit); TRACE_CALL("ncclGroupSimulateEnd()"); exit: @@ -123,64 +126,87 @@ struct ncclPreconnectJob { bool* algoNeedConnect; }; +struct ncclPrepareTasksAndCollPreconnectJob { + struct ncclAsyncJob base; + struct ncclComm* comm; + ncclSimInfo_t* simInfo; +}; + ncclResult_t ncclP2PPreconnectFunc(struct ncclAsyncJob* job_) { struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; struct ncclComm* comm = job->comm; CUDACHECK(cudaSetDevice(comm->cudaDev)); - if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); NCCLCHECK(ncclTransportP2pSetup(comm, NULL, 1)); return ncclSuccess; } -ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { - struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; - struct ncclComm* comm = job->comm; - ncclResult_t ret = ncclSuccess; - - CUDACHECK(cudaSetDevice(comm->cudaDev)); - if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); +static ncclResult_t ncclCollPreconnect(struct ncclComm* comm, bool* algoNeedConnect) { for (int i = 0; i < NCCL_NUM_ALGORITHMS; ++i) { - if (job->algoNeedConnect[i]) { + if (algoNeedConnect[i]) { switch (i) { case NCCL_ALGO_RING: { - NCCLCHECKGOTO(ncclTransportRingConnect(comm), ret, fail); + NCCLCHECK(ncclTransportRingConnect(comm)); break; } case NCCL_ALGO_TREE: { - NCCLCHECKGOTO(ncclTransportTreeConnect(comm), ret, fail); + NCCLCHECK(ncclTransportTreeConnect(comm)); break; } case NCCL_ALGO_NVLS: { /* If we are using NVLS_TREE algo, we must mark NVLS algo to set up * NVLS intra-node buffer */ - NCCLCHECKGOTO(ncclNvlsBufferSetup(comm), ret, fail); + NCCLCHECK(ncclNvlsBufferSetup(comm)); break; } case NCCL_ALGO_NVLS_TREE: { - NCCLCHECKGOTO(ncclNvlsTreeConnect(comm), ret, fail); + NCCLCHECK(ncclNvlsTreeConnect(comm)); break; } case NCCL_ALGO_COLLNET_CHAIN: { - NCCLCHECKGOTO(ncclCollNetChainBufferSetup(comm), ret, fail); + NCCLCHECK(ncclCollNetChainBufferSetup(comm)); break; } case 
NCCL_ALGO_COLLNET_DIRECT: { - NCCLCHECKGOTO(ncclCollNetDirectBufferSetup(comm), ret, fail); + NCCLCHECK(ncclCollNetDirectBufferSetup(comm)); break; } case NCCL_ALGO_PAT: { - NCCLCHECKGOTO(ncclTransportPatConnect(comm), ret, fail); + NCCLCHECK(ncclTransportPatConnect(comm)); break; } // Yes, it's a dead code. That's fine... // coverity[dead_error_begin] default: { - ret = ncclInternalError; - goto fail; + NCCLCHECK(ncclInternalError); } } } } + return ncclSuccess; +} + +ncclResult_t ncclPrepareTasksAndCollPreconnectFunc(struct ncclAsyncJob* job_) { + struct ncclPrepareTasksAndCollPreconnectJob* job = (ncclPrepareTasksAndCollPreconnectJob*)job_; + struct ncclComm* comm = job->comm; + bool needConnect; + bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; + memset(algoNeedConnect, 0, sizeof(bool)*NCCL_NUM_ALGORITHMS); + CUDACHECK(cudaSetDevice(comm->cudaDev)); + if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + NCCLCHECK(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, job->simInfo)); + if (comm->cuMemSupport && needConnect) NCCLCHECK(ncclCollPreconnect(comm, algoNeedConnect)); + return ncclSuccess; +} + +ncclResult_t ncclCollPreconnectFunc(struct ncclAsyncJob* job_) { + struct ncclPreconnectJob* job = (struct ncclPreconnectJob*)job_; + struct ncclComm* comm = job->comm; + ncclResult_t ret = ncclSuccess; + + if (!job_->isThreadMain) CUDACHECK(cudaSetDevice(comm->cudaDev)); + if (!job_->isThreadMain && CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + NCCLCHECKGOTO(ncclCollPreconnect(comm, job->algoNeedConnect), ret, fail); exit: free(job->algoNeedConnect); @@ -194,52 +220,33 @@ struct ncclGroupSymmetricJob { struct ncclComm* comm; }; -NCCL_PARAM(WinStride, "WIN_STRIDE", -1); - ncclResult_t ncclCommGroupRegisterSymmetric(struct ncclAsyncJob* job_) { struct ncclGroupSymmetricJob* job = (struct ncclGroupSymmetricJob*)job_; struct ncclComm* comm = job->comm; ncclResult_t ret = ncclSuccess; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - if (comm->baseStride == 0) { - cudaStream_t hostStream; - // first time to allocate symmetric VA space. - // calling into this function means symmetric is supported. 
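For context on the preconnect helpers above: ncclPrepareTasks marks which algorithms the queued collectives will actually use, and the preconnect step only sets up transports for those. A simplified sketch of that algoNeedConnect pattern (names and algorithm list are stand-ins):

// Sketch of the algoNeedConnect pattern: mark needed algorithms, then connect
// only those. The connect step is a stand-in for ncclTransport*Connect calls.
#include <cstring>

enum { ALGO_TREE, ALGO_RING, ALGO_NVLS, NUM_ALGOS };

static void markNeededAlgos(bool need[NUM_ALGOS]) {
  // Pretend the queued work needs Ring and NVLS only.
  need[ALGO_RING] = true;
  need[ALGO_NVLS] = true;
}

static int preconnect(const bool need[NUM_ALGOS]) {
  int connected = 0;
  for (int a = 0; a < NUM_ALGOS; a++) {
    if (!need[a]) continue;   // skip algorithms no queued task will use
    connected++;              // stand-in for the per-algorithm transport setup
  }
  return connected;
}

int main() {
  bool need[NUM_ALGOS];
  std::memset(need, 0, sizeof(need));
  markNeededAlgos(need);
  return preconnect(need) == 2 ? 0 : 1;
}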
- struct ncclSymDevBase* symBase = NULL; - size_t size = ncclSymDevBase::size(comm->localRanks); - if (ncclParamWinStride() != -1) { - comm->baseStride = ncclParamWinStride(); - } else { - size_t maxStride = 0; - for (int r = 0; r < comm->nRanks; ++r) - if (comm->peerInfo[r].totalGlobalMem > maxStride) maxStride = comm->peerInfo[r].totalGlobalMem; - comm->baseStride = maxStride; - } - INFO(NCCL_INIT, "rank %d base stride %zuGB total VM %zuGB", comm->rank, comm->baseStride >> 30, (comm->baseStride * comm->localRanks) >> 30); - NCCLCHECKGOTO(ncclIpcSymmetricInit(comm), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricInit(comm), ret, fail); - comm->symAllocHead = 0; - - // Allocate symmetric memory for NCCL internal usage - NCCLCHECKGOTO(ncclCommSymmetricAllocInternal(comm, size, alignof(struct ncclSymDevBase), (void**)&symBase), ret, fail); - assert((void*)symBase == (void*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride)); - NCCLCHECKGOTO(ncclStrongStreamAcquire(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false, &hostStream), ret, fail); - CUDACHECKGOTO(cudaMemsetAsync(symBase, 0, size, hostStream), ret, fail); - CUDACHECKGOTO(cudaStreamSynchronize(hostStream), ret, fail); - NCCLCHECKGOTO(ncclStrongStreamRelease(ncclCudaGraphNone(), &comm->sharedRes->hostStream, /*concurrent=*/false), ret, fail); - - comm->symDevComm.base = (struct ncclSymDevBase*)(comm->baseUCSymPtr + comm->localRank * comm->baseStride); - comm->symDevComm.baseMc = (struct ncclSymDevBase*)comm->baseMCSymPtr; - comm->symDevComm.nRanks = comm->localRanks; - comm->symDevComm.nRanks_rcp32 = idivRcp32(comm->localRanks); - comm->symDevComm.rank = comm->localRank; - comm->symDevComm.stride4G = comm->baseStride >> 32; + + while (!ncclIntruQueueEmpty(&comm->devrState.regTaskQueue)) { + struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&comm->devrState.regTaskQueue); + NCCLCHECKGOTO(ncclDevrWindowRegisterInGroup( + comm, task->userPtr, task->userSize, task->winFlags, task->outWinDev), + ret, fail); + free(task); } - while (!ncclIntruQueueEmpty(&comm->symRegTaskQueue)) { - struct ncclSymRegTask* task = ncclIntruQueueDequeue(&comm->symRegTaskQueue); - NCCLCHECKGOTO(ncclCommSymmetricRegisterInternal(comm, task->buff, task->baseSize, task->alignment, task->memHandle, task->regHandle), ret, fail); + while (!ncclIntruQueueEmpty(&comm->devrState.commCreateTaskQueue)) { + struct ncclDevrCommCreateTask* task = ncclIntruQueueDequeue(&comm->devrState.commCreateTaskQueue); + NCCLCHECKGOTO(ncclDevrCommCreateInternal( + comm, (struct ncclDevCommRequirements const*)task->reqs, task->outDevComm), + ret, fail); + freeDevCommRequirements(task->reqs); // free additional task memory for reqs + free(task); + } + + while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { + struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue); + NCCLCHECKGOTO(ncclCeInit(task->comm), ret, fail); free(task); } @@ -296,7 +303,11 @@ static ncclResult_t doLaunches(struct ncclComm* head) { comm->planner.unlaunchedPlansHead = plan->next; CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), result, failure); NCCLCHECKGOTO(ncclLaunchKernelBefore_NoUncapturedCuda(comm, plan), result, failure); - NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); + if (plan->isCeColl) { + NCCLCHECKGOTO(ncclLaunchCeColl(comm, plan), result, failure); + } else { + NCCLCHECKGOTO(ncclLaunchKernel(comm, plan), result, failure); + } } // Barrier reduction input indicates if we require further rounds. 
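The doLaunches() change above now branches per plan: plans flagged isCeColl are submitted through the copy-engine path (ncclLaunchCeColl) while the rest keep the normal kernel launch. A toy sketch of that dispatch, with stand-in types:

// Sketch of the per-plan launch dispatch; launchCe/launchKernel are stand-ins.
#include <cstdio>

struct Plan { bool isCeColl; };

static int launchCe(const Plan&)     { std::puts("launch via copy engine"); return 0; }
static int launchKernel(const Plan&) { std::puts("launch via CUDA kernel"); return 0; }

static int launch(const Plan& plan) {
  // Same shape as the doLaunches() change: branch once per unlaunched plan.
  return plan.isCeColl ? launchCe(plan) : launchKernel(plan);
}

int main() {
  Plan a{true}, b{false};
  return launch(a) | launch(b);
}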
if (useBarrier) ncclCommIntraBarrierIn(comm, comm->planner.unlaunchedPlansHead != nullptr ? 1 : 0); @@ -392,6 +403,12 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueuenext == nullptr) { + job->isThreadMain = true; + ncclAsyncJobMain(job); + job->state = ncclGroupJobJoined; + return job->result; + } do { PTHREADCHECKGOTO(pthread_create(&job->thread, nullptr, ncclAsyncJobMain, job), "pthread_create", ret, fail); job = job->next; @@ -444,6 +461,51 @@ static ncclResult_t asyncJobLaunch(struct ncclIntruQueue* asyncCollJobs) { + if (ncclParamSingleProcMemRegEnable()) { + struct ncclPrepareTasksAndCollPreconnectJob* job; + NCCLCHECK(ncclCalloc(&job, 1)); + job->base.func = ncclPrepareTasksAndCollPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + job->simInfo = simInfo; + ncclIntruQueueEnqueue(asyncCollJobs, &job->base); + } else { + bool needConnect = false; + bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; + memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); + + CUDACHECK(cudaSetDevice(comm->cudaDev)); + NCCLCHECK(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo)); + + if (comm->cuMemSupport && needConnect) { + ncclResult_t ret; + struct ncclPreconnectJob* job; + NCCLCHECK(ncclCalloc(&job, 1)); + job->base.func = ncclCollPreconnectFunc; + job->base.undo = nullptr; + job->base.destructor = free; + job->base.state = ncclGroupJobRunning; + job->base.abortFlag = comm->abortFlag; + job->base.abortFlagDev = comm->abortFlagDev; + job->comm = comm; + if ((ret = ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS))) { + free(job); + NCCLCHECK(ret); + } + memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); + ncclIntruQueueEnqueue(asyncCollJobs, &job->base); + } + } + return ncclSuccess; +} + static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInfo = NULL) { ncclResult_t ret = ncclSuccess; struct ncclGroupJob *gjob = (struct ncclGroupJob*) job_; @@ -518,27 +580,7 @@ static ncclResult_t groupLaunch(struct ncclAsyncJob *job_, ncclSimInfo_t* simInf // at the same time. 
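asyncJobLaunch above gained a fast path: when the queue holds exactly one job it runs it on the calling thread (isThreadMain) instead of spawning and joining a pthread, which is why the preconnect functions now skip cudaSetDevice and affinity changes in that case. A simplified sketch, with stand-in types:

// Sketch of the single-job fast path: run the only job inline on the caller's
// thread; otherwise spawn one worker per job. Fields are simplified stand-ins.
#include <pthread.h>
#include <cstdio>

struct Job {
  Job* next = nullptr;
  bool isThreadMain = false;
  int (*func)(Job*) = nullptr;
};

static void* jobMain(void* arg) { Job* j = (Job*)arg; j->func(j); return nullptr; }

static int launchJobs(Job* head) {
  if (head && head->next == nullptr) {   // exactly one job: no thread needed
    head->isThreadMain = true;           // lets the job skip setDevice/affinity changes
    jobMain(head);
    return 0;
  }
  for (Job* j = head; j; j = j->next) {  // otherwise, one worker thread per job
    pthread_t t;
    pthread_create(&t, nullptr, jobMain, j);
    pthread_join(t, nullptr);            // simplified: join immediately
  }
  return 0;
}

int main() {
  Job j; j.func = [](Job*) { std::puts("ran inline"); return 0; };
  return launchJobs(&j);
}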
comm = cliqueHead; do { - bool needConnect = false; - bool algoNeedConnect[NCCL_NUM_ALGORITHMS]; - memset(algoNeedConnect, 0, sizeof(bool) * NCCL_NUM_ALGORITHMS); - - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - NCCLCHECKGOTO(ncclPrepareTasks(comm, algoNeedConnect, &needConnect, simInfo), ret, fail); - - if (comm->cuMemSupport && needConnect) { - struct ncclPreconnectJob* job; - NCCLCHECKGOTO(ncclCalloc(&job, 1), ret, fail); - job->base.func = ncclCollPreconnectFunc; - job->base.undo = nullptr; - job->base.destructor = free; - job->base.state = ncclGroupJobRunning; - job->base.abortFlag = comm->abortFlag; - job->base.abortFlagDev = comm->abortFlagDev; - job->comm = comm; - NCCLCHECKGOTO(ncclCalloc(&job->algoNeedConnect, NCCL_NUM_ALGORITHMS), ret, fail); - memcpy(job->algoNeedConnect, algoNeedConnect, sizeof(bool) * NCCL_NUM_ALGORITHMS); - ncclIntruQueueEnqueue(&asyncCollJobs, &job->base); - } + NCCLCHECKGOTO(ncclPrepareTasksAndCollPreconnect(comm, simInfo, &asyncCollJobs), ret, fail); comm = comm->groupNext[ncclGroupTaskTypeCollective]; } while (comm != nullptr && comm->intraComm0 == cliqueHead->intraComm0); // connect @@ -617,6 +659,13 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { goto exit; } + if (ncclProfilerApiState.profilerGroupDepth > 0) { + ncclProfilerApiState.profilerGroupDepth--; + } + if (ncclProfilerApiState.profilerGroupDepth == 0) { + NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupEndApiStart)); + } + if ((--ncclGroupDepth) > 0) goto exit; if ((ret = ncclGroupError) != ncclSuccess) goto fail; @@ -701,6 +750,8 @@ ncclResult_t ncclGroupEndInternal(ncclSimInfo_t* simInfo) { groupLocalResetJobState(); exit: + // Profiler group API start is called inside taskAppend to get graph capture information for the event + NCCLCHECK(ncclProfilerStopGroupApiEvent()); return ret; fail: if (groupJob) { diff --git a/src/include/allocator.h b/src/include/allocator.h index 189c3d4e2..05da29a62 100644 --- a/src/include/allocator.h +++ b/src/include/allocator.h @@ -7,7 +7,55 @@ #ifndef NCCL_ALLOCATOR_H_ #define NCCL_ALLOCATOR_H_ -ncclResult_t ncclCommSymmetricAllocInternal(struct ncclComm* comm, size_t size, size_t alignment, void** symPtr); -ncclResult_t ncclCommSymmetricFreeInternal(struct ncclComm* comm, void* symPtr); +//////////////////////////////////////////////////////////////////////////////// +// ncclSpace: Allocates contiguous segments of non-negative integers. Useful +// as a memory allocator when we can't put allocator state within the memory +// being allocated. + +struct ncclSpace { + int count; + int capacity; + int64_t* cuts; +}; + +void ncclSpaceConstruct(struct ncclSpace* a); +void ncclSpaceDestruct(struct ncclSpace* a); +ncclResult_t ncclSpaceAlloc(struct ncclSpace* a, int64_t spaceLimit, int64_t objSize, int objAlign, int64_t* outObjOffset); +ncclResult_t ncclSpaceFree(struct ncclSpace* a, int64_t objOffset, int64_t objSize); + + +//////////////////////////////////////////////////////////////////////////////// +// ncclShadowPool: Allocates device-side objects, their host-side shadows, and +// maintains the device->host object address mapping. 
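The ncclSpace interface above hands out aligned offsets inside a flat window rather than pointers, which is what allows the allocator state to live outside the allocated memory. A standalone sketch with the same call shape (this simple first-fit list is a stand-in, not the real cuts-based structure):

// Illustrative offset allocator: returns aligned offsets inside a fixed window.
#include <cassert>
#include <cstdint>
#include <vector>

struct Range { int64_t off, end; };
static std::vector<Range> g_used;  // kept sorted by offset

static int64_t spaceAlloc(int64_t limit, int64_t size, int64_t align) {
  int64_t cur = 0;
  for (size_t i = 0; i <= g_used.size(); i++) {
    int64_t start = (cur + align - 1) / align * align;          // align up
    int64_t next = (i < g_used.size()) ? g_used[i].off : limit; // next occupied block
    if (start + size <= next) {
      g_used.insert(g_used.begin() + i, {start, start + size});
      return start;
    }
    if (i < g_used.size()) cur = g_used[i].end;
  }
  return -1;  // out of space
}

static void spaceFree(int64_t off) {
  for (size_t i = 0; i < g_used.size(); i++)
    if (g_used[i].off == off) { g_used.erase(g_used.begin() + i); return; }
}

int main() {
  int64_t a = spaceAlloc(1 << 20, 4096, 256);
  int64_t b = spaceAlloc(1 << 20, 512, 512);
  assert(a == 0 && b == 4096);
  spaceFree(a);
  assert(spaceAlloc(1 << 20, 256, 128) == 0);  // freed segment is reusable
  return 0;
}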
+ +struct ncclShadowObject; +struct ncclShadowPage; +struct ncclShadowPool { + int count, hbits; + struct ncclShadowObject** table; + cudaMemPool_t memPool; + struct ncclShadowPage* pages; +}; + +void ncclShadowPoolConstruct(struct ncclShadowPool*); +ncclResult_t ncclShadowPoolDestruct(struct ncclShadowPool*); +ncclResult_t ncclShadowPoolAlloc(struct ncclShadowPool*, size_t size, void** outDevObj, void** outHostObj, cudaStream_t stream); +ncclResult_t ncclShadowPoolFree(struct ncclShadowPool*, void* devObj, cudaStream_t stream); +ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool*, void* devObj, void** outHostObj); + +template +static inline ncclResult_t ncclShadowPoolAlloc(struct ncclShadowPool* pool, T** outDevObj, T** outHostObj, cudaStream_t stream) { + void* devObj; + void* hostObj; + ncclResult_t got = ncclShadowPoolAlloc(pool, sizeof(T), &devObj, &hostObj, stream); + if (outDevObj) *outDevObj = (T*)devObj; + if (outHostObj) *outHostObj = (T*)hostObj; + return got; +} + +template +static inline ncclResult_t ncclShadowPoolToHost(struct ncclShadowPool* pool, T* devObj, T** hostObj) { + return ncclShadowPoolToHost(pool, (void*)devObj, (void**)hostObj); +} #endif diff --git a/src/include/bitops.h b/src/include/bitops.h index 71053ed49..badc91b50 100644 --- a/src/include/bitops.h +++ b/src/include/bitops.h @@ -41,6 +41,9 @@ constexpr static __host__ __device__ Int maxval(Int a, Int b, More ...more) { #endif } +#define BIT(x) (1UL << (x)) +#define MASK(x) ((1UL << x) - 1UL) + #define DIVUP(x, y) \ (((x)+(y)-1)/(y)) @@ -68,14 +71,26 @@ static __host__ __device__ constexpr Z roundDown(X x, Y y) { } // assumes second argument is a power of 2 -template -static __host__ __device__ constexpr Z alignUp(X x, int a) { - return (x + a-1) & Z(-a); +template +static __host__ __device__ constexpr Z alignUp(X x, Y a) { + return (x + a-1) & -Z(a); } +template +static __host__ __device__ T* alignUp(T* x, size_t a) { + static_assert(sizeof(T) == 1, "Only single byte types allowed."); + return reinterpret_cast((reinterpret_cast(x) + a-1) & -uintptr_t(a)); +} + // assumes second argument is a power of 2 -template -static __host__ __device__ constexpr Z alignDown(X x, int a) { - return x & Z(-a); +template +static __host__ __device__ constexpr Z alignDown(X x, Y a) { + return x & -Z(a); +} + +template +static __host__ __device__ T* alignDown(T* x, size_t a) { + static_assert(sizeof(T) == 1, "Only single byte types allowed."); + return reinterpret_cast(reinterpret_cast(x) & -uintptr_t(a)); } template @@ -341,7 +356,7 @@ static __host__ UInt reverseSubBits(UInt x) { default: static_assert(8*sizeof(UInt) <= 64, "Unsupported integer type."); } return reverseSubBits(x); - } else if (nSubBits == 1) { + } else if (nSubBits <= 1) { return x; } else { UInt m = UInt(-1)/((UInt(1)<<(nSubBits/2))+1); diff --git a/src/include/ce_coll.h b/src/include/ce_coll.h new file mode 100644 index 000000000..e47effb8c --- /dev/null +++ b/src/include/ce_coll.h @@ -0,0 +1,76 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
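The allocator.h hunk above introduces ncclSpace, which hands out integer offsets into a region whose bookkeeping cannot live inside the region itself (for example device memory), plus templated ncclShadowPoolAlloc/ToHost convenience wrappers. The sketch below is a deliberately naive free-map version of the same idea, not NCCL's cut-based structure: it only mirrors the alloc(spaceLimit, objSize, objAlign) / free(offset) shape (the declared ncclSpaceFree also takes the object size), and ToySpace is an invented name.

```
#include <cstdint>
#include <cstdio>
#include <map>

// Toy offset allocator: all state lives outside the managed region.
// First-fit over a sorted map of allocated segments (offset -> size).
struct ToySpace {
  std::map<int64_t, int64_t> used;

  static int64_t alignUp(int64_t x, int64_t a) { return (x + a - 1) & -a; }

  // Find a free offset for objSize bytes, aligned to objAlign (power of 2),
  // inside [0, spaceLimit). Returns -1 when the region is exhausted.
  int64_t alloc(int64_t spaceLimit, int64_t objSize, int64_t objAlign) {
    int64_t cursor = 0;
    for (auto const& seg : used) {
      int64_t candidate = alignUp(cursor, objAlign);
      if (candidate + objSize <= seg.first) { used[candidate] = objSize; return candidate; }
      cursor = seg.first + seg.second;
    }
    int64_t candidate = alignUp(cursor, objAlign);
    if (candidate + objSize > spaceLimit) return -1;
    used[candidate] = objSize;
    return candidate;
  }

  void free(int64_t offset) { used.erase(offset); }
};

int main() {
  ToySpace s;
  int64_t a = s.alloc(1 << 20, 4096, 4096);  // 4KB object, 4KB aligned
  int64_t b = s.alloc(1 << 20, 100, 16);
  printf("a=%lld b=%lld\n", (long long)a, (long long)b);
  s.free(a);                                 // hole at offset a is reused below
  printf("reuse=%lld\n", (long long)s.alloc(1 << 20, 512, 256));
}
```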
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CE_COLL_H_ +#define NCCL_CE_COLL_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "bitops.h" + +// Memory operations per rank for different synchronization protocols +#define NCCL_CE_SYNC_OPS_PER_RANK_MC 2 +#define NCCL_CE_SYNC_OPS_PER_RANK_UC 3 + +struct ncclCeColl { + uint8_t* baseUCSymReadyPtr; + uint8_t* baseUCSymComplPtr; + size_t baseUCSymReadyOffset; + size_t baseUCSymComplOffset; + uint32_t ceSeqNum; + bool useCompletePtr; + uint32_t intraBatchSyncFreq; + uint64_t intraBatchSyncMsgThreshold; + struct ncclDevrWindow* ceSyncWin; +}; + +struct ncclCeInitTask { + struct ncclCeInitTask *next; + struct ncclComm* comm; +}; + +struct alignas(16) ncclCeCollArgs { + ncclFunc_t func; + int rootRank; + size_t nElts; + size_t eltSize; + uint8_t* sendBuff; + uint8_t* recvBuff; + struct ncclDevrWindow* sendWin; + struct ncclDevrWindow* recvWin; +}; + +struct ncclCeBatchOpsParams { + void** dsts; + void** srcs; + size_t* sizes; + size_t numOps; + bool intraBatchSync; +#if CUDART_VERSION >= 12080 + cudaMemcpyAttributes* attrs; + size_t* attrIdxs; + size_t numAttrs; +#endif +}; + +bool ncclCeImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); + +ncclResult_t ncclCeInit(struct ncclComm* comm); + +ncclResult_t ncclCeFinalize(struct ncclComm* comm); + +ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream); + +ncclResult_t ncclLaunchCeColl(struct ncclComm* comm, struct ncclKernelPlan* plan); + +ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream); + +ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream); + +ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream); + +ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream); +#endif /* NCCL_CE_COLL_H_ */ diff --git a/src/include/channel.h b/src/include/channel.h index ee9aa6d0b..bd34f54c1 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -17,15 +17,16 @@ ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncc ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { + int base; if (comm->nNodes > 1) { int nodeDelta = p2pRound/comm->maxLocalRanks; int localDelta = p2pRound%comm->maxLocalRanks; - int base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); + base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; - return base & 0xff; } else { - return p2pRound & 0xff; + base = p2pRound; } + return reverseBits(base, log2Up(comm->p2pnChannels)); } #endif diff --git a/src/include/coll_net.h b/src/include/coll_net.h index affbf0a24..574fd95eb 100644 --- a/src/include/coll_net.h +++ b/src/include/coll_net.h @@ -16,7 +16,7 @@ typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { 
NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } -static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(comm->collNetContext, dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, size_t size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } @@ -29,6 +29,7 @@ static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* d static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } +static ncclResult_t collNetFinalize(struct ncclComm* comm, void* ctx) { NCCLCHECK(comm->ncclCollNet->finalize(ctx)); return ncclSuccess; } static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } diff --git a/src/include/collectives.h b/src/include/collectives.h index c68b0418c..038eb8dd1 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -8,7 +8,7 @@ #define NCCL_COLLECTIVES_H_ #include "nccl.h" -#include "nccl_common.h" +#include "nccl_tuner.h" #include "device.h" #define NCCL_MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. 
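The channel.h hunk above now derives the p2p channel base by bit-reversing the round-derived index over log2Up(comm->p2pnChannels) bits, which spreads consecutive rounds across channels (the device.h hunk later in this patch drops the per-part reversal to match). A standalone illustration of why bit reversal spreads consecutive indices, assuming a power-of-two channel count; this reverseBits is a plain reimplementation for the demo, not NCCL's helper:

```
#include <cstdio>

// Reverse the low `bits` bits of x.
static unsigned reverseBits(unsigned x, int bits) {
  unsigned r = 0;
  for (int i = 0; i < bits; i++) r |= ((x >> i) & 1u) << (bits - 1 - i);
  return r;
}

int main() {
  const int nChannels = 8;   // must be a power of two for this mapping
  const int bits = 3;        // log2(nChannels)
  // Consecutive rounds land on channels 0,4,2,6,1,5,3,7: maximally spread.
  for (int round = 0; round < nChannels; round++)
    printf("round %d -> channel base %u\n", round, reverseBits(round, bits));
}
```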
@@ -18,10 +18,16 @@ #define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) #define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) #define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) +#define ALLTOALL_SLICESTEPS 1 +#define ALLTOALL_CHUNKSTEPS 1 #define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) #define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) #define BROADCAST_SLICESTEPS 1 #define BROADCAST_CHUNKSTEPS 1 +#define GATHER_SLICESTEPS 1 +#define GATHER_CHUNKSTEPS 1 +#define SCATTER_SLICESTEPS 1 +#define SCATTER_CHUNKSTEPS 1 #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 #define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above diff --git a/src/include/comm.h b/src/include/comm.h index 1378e0765..22faf3682 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -18,6 +18,9 @@ #include "graph.h" #include "profiler.h" #include "allocator.h" +#include "dev_runtime.h" +#include "sym_kernels.h" +#include "ce_coll.h" #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -198,12 +201,14 @@ struct ncclTaskColl { int32_t nMaxChannels:8; int32_t nWarps:8; int32_t algorithm:8, protocol:8; - uint32_t isCollnet:1, isNvls:1; - uint32_t devFuncId:30; + uint32_t isCollnet:1, isNvls:1, isSymLast:1; + uint32_t devFuncId:29; int regBufType; // number of elements in planner->ipcMemQueue associated with this collective int nCleanupQueueElts; + struct ncclDevrWindow* sendWin; + struct ncclDevrWindow* recvWin; void* sendMhandle; void* recvMhandle; void** sendNetHandles; @@ -217,12 +222,16 @@ struct ncclTaskColl { // Profiler plugin int eActivationMask; + void* groupApiEventHandle; + void* collApiEventHandle; void* eventHandle; uint8_t nChannels; }; + struct ncclTaskP2p { struct ncclTaskP2p* next; ncclFunc_t func; + ncclFunc_t collAPI; void* buff; size_t count; ncclDataType_t datatype; @@ -231,6 +240,8 @@ struct ncclTaskP2p { // Profiler plugin int eActivationMask; + void* groupApiEventHandle; + void* p2pApiEventHandle; void* eventHandle; uint8_t nChannels; }; @@ -246,12 +257,14 @@ struct ncclKernelPlan { bool persistent; // aka captured in a graph bool isHostCbEnq; bool isSymColl; + bool isCeColl; enum ncclDevWorkStorageType workStorageType; bool kernelSpecialized; void* kernelFn; union { struct ncclDevKernelArgs* kernelArgs; - struct ncclSymDevArgs* kernelSymArgs; + void* kernelSymArgs; + struct ncclCeCollArgs* ceCollArgs; }; size_t kernelArgsSize; uint64_t channelMask; // bitset of which channels are present @@ -270,6 +283,8 @@ struct ncclKernelPlan { struct ncclIntruQueue proxyOpQueue; // Profiler plugin + void* groupApiEventHandle; + void* kernelLaunchEventHandle; void* groupEventHandle; }; @@ -360,9 +375,8 @@ struct ncclKernelPlanner { struct ncclTaskCollSorter collSorter; struct Peer* peers/*[nRanks]*/; int nTasksColl, nTasksP2p; + int nTasksP2pSend, nTasksP2pRecv; bool persistent; - bool isSymColl; - // The list of user streams aggregated over all tasks present. struct ncclCudaStreamList* streams; // The most recent user stream. 
Ignored if streams==nullptr @@ -378,6 +392,8 @@ struct ncclKernelPlanner { ////////////////////////////////////////////////////////////////////////////// struct ncclIntruQueue collTaskQueue; + struct ncclIntruQueue collCeTaskQueue; + struct ncclIntruQueue collSymTaskQueue; struct ncclIntruQueue collWorkQueue; struct ncclIntruQueue tmpCollWorkQueue; struct ncclIntruQueue collCleanupQueue; @@ -417,6 +433,8 @@ typedef enum ncclGroupTaskType { ncclGroupTaskTypeNum = 2, } ncclGroupTaskType_t; +struct ncclCommSymTeams; + struct ncclComm { uint64_t startMagic; struct ncclMemoryStack memPermanent, memScoped; @@ -436,10 +454,12 @@ struct ncclComm { bool peerInfoValid; ncclNet_t* ncclNet; + void* netContext; int netPluginIndex; int ncclNetVer; ncclNetDeviceType netDeviceType; ncclCollNet_t* ncclCollNet; + void* collNetContext; void* bootstrap; // Bitmasks for ncclTransportP2pSetup uint64_t* connectSend; @@ -472,6 +492,7 @@ struct ncclComm { int localRank; int localRanks; int maxLocalRanks; + int minLocalRanks; int* rankToNode; int* rankToLocalRank; int* localRankToRank; @@ -482,6 +503,9 @@ struct ncclComm { struct cliqueInfo clique; // Our MNNVL clique information int cliqueRank; // Our rank within the MNNVL clique + // NVL Domain info + ncclNvlDomainInfo_v5_t nvlDomainInfo; + bool checkPointers; bool dmaBufSupport; @@ -508,7 +532,8 @@ struct ncclComm { int p2pChunkSize; int nvlsChunkSize; - // Algorithm/Protocols thresholds + // Tuner values + ncclTunerConstants_t tunerConstants; ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; @@ -527,8 +552,7 @@ struct ncclComm { uint32_t destroyFlag; // Device side of the communicator (for cudaFree's) - struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm - struct ncclSymDevComm symDevComm; + struct ncclKernelComm* devComm; // actually = &ncclKernelCommAndChannels::comm uint32_t workArgsBytes; // max size of kernel args uint32_t workFifoBytes; // size of workFifoBuf, power of 2 @@ -624,6 +648,10 @@ struct ncclComm { uint64_t seqNumber[NCCL_NUM_FUNCTIONS]; struct ncclProfilerProxy profiler; + // CE Collective + struct ncclCeColl ceColl; + struct ncclIntruQueue ceInitTaskQueue; + // buffer registration cache struct ncclRegCache regCache; int isAllNvlink; @@ -632,13 +660,10 @@ struct ncclComm { bool useNetPXN; bool useGdr; int splitCount; - // symmetric buffer - uint8_t* baseUCSymPtr; - uint8_t* baseMCSymPtr; - size_t baseStride; - size_t symAllocHead; - CUmemGenericAllocationHandle symMCHandle; - struct ncclIntruQueue symRegTaskQueue; + + struct ncclDevrState devrState; // The symmetric runtime state + struct ncclSymkState symkState; // The symmetric kernels state (built on previous) + uint64_t endMagic; }; diff --git a/src/include/core.h b/src/include/core.h index a1754beeb..2ce1d8e78 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -16,6 +16,7 @@ #ifdef PROFAPI #define NCCL_API(ret, func, args...) \ + extern "C" \ __attribute__ ((visibility("default"))) \ __attribute__ ((alias(#func))) \ ret p##func (args); \ diff --git a/src/include/cpuset.h b/src/include/cpuset.h index 99e3edf4d..df936a31e 100644 --- a/src/include/cpuset.h +++ b/src/include/cpuset.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. 
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -7,54 +7,38 @@ #ifndef NCCL_CPUSET_H_ #define NCCL_CPUSET_H_ -// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t +#include "nccl.h" +#include +#include +#include +#include -static int hexToInt(char c) { - int v = c - '0'; - if (v < 0) return -1; - if (v > 9) v = 10 + c - 'a'; - if ((v < 0) || (v > 15)) return -1; - return v; -} +// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t. +// The bitmask is divided into chunks of 32 bits, each of them represented by 8 hex number. +#define U32_LEN 32 // using uint32_t +#define CPU_SET_N_U32 (CPU_SETSIZE / U32_LEN) -#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) +static ncclResult_t ncclStrToCpuset(const char* maskStr, cpu_set_t* set) { + uint32_t cpumasks[CPU_SET_N_U32] = {0}; -static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { - uint32_t cpumasks[CPU_SET_N_U32]; - int m = CPU_SET_N_U32-1; - cpumasks[m] = 0; - for (int o=0; o 0) { + cpumasks[--m] = strtoul(token, NULL, /*base = hex*/ 16); + token = strtok(NULL, ","); } - // Copy cpumasks to mask - for (int a=0; m=0; o--) { - if (c == 0 && m8[o] == 0) continue; - sprintf(str+c, "%02x", m8[o]); - c+=2; - if (o && o%4 == 0) { - sprintf(str+c, ","); - c++; + // list all the CPUs as part of the CPU set, starting with the lowest mask (= current value of m) + CPU_ZERO(set); + for (int a = 0; (a + m) < CPU_SET_N_U32; a++) { + // each mask is U32_LEN CPUs, list them all if the bit is on + for (int i = 0; i < U32_LEN; ++i) { + if (cpumasks[a + m] & (1UL << i)) CPU_SET(i + a * U32_LEN, set); } } - str[c] = '\0'; return ncclSuccess; } @@ -83,4 +67,31 @@ static char* ncclCpusetToRangeStr(cpu_set_t* mask, char* str, size_t len) { return str; } +static ncclResult_t ncclStrListToCpuset(const char* userStr, cpu_set_t* mask) { + // reset the CPU set + CPU_ZERO(mask); + const char delim[] = ","; + char* str = strdup(userStr); + char* token = strtok(str, delim); + while (token != NULL) { + uint64_t cpu = strtoull(token, NULL, 0); + CPU_SET(cpu, mask); + token = strtok(NULL, delim); + } + free(str); + return ncclSuccess; +} + +static ncclResult_t ncclCpusetToStrList(cpu_set_t* mask, char* str, size_t len) { + if (len == 0) return ncclSuccess; + str[0] = '\0'; + int count = 0; + for (uint64_t id = 0; id < CPU_SETSIZE; ++id) { + if (CPU_ISSET(id, mask)) { + snprintf(str + strlen(str), len - strlen(str), "%s%lu", (count++ == 0) ? 
"" : ",", id); + } + } + return ncclSuccess; +} + #endif diff --git a/src/include/cudawrap.h b/src/include/cudawrap.h index 2edc60f21..f05f13e43 100644 --- a/src/include/cudawrap.h +++ b/src/include/cudawrap.h @@ -114,6 +114,12 @@ DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010); DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010); DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010); #endif +/* Stream-MemOp support */ +DECLARE_CUDA_PFN_EXTERN(cuStreamBatchMemOp, 11070); +DECLARE_CUDA_PFN_EXTERN(cuStreamWaitValue32, 11070); +DECLARE_CUDA_PFN_EXTERN(cuStreamWaitValue64, 11070); +DECLARE_CUDA_PFN_EXTERN(cuStreamWriteValue32, 11070); +DECLARE_CUDA_PFN_EXTERN(cuStreamWriteValue64, 11070); #endif ncclResult_t ncclCudaLibraryInit(void); diff --git a/src/include/debug.h b/src/include/debug.h index 4e50cbf5a..3822e8760 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -17,6 +17,7 @@ #define NCCL_THREAD_NAMELEN 16 extern int ncclDebugLevel; +extern uint64_t ncclDebugMask; extern FILE *ncclDebugFile; void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); @@ -27,11 +28,30 @@ extern char ncclLastError[]; #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) -#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) -#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) + +#define INFO(FLAGS, ...) \ + do{ \ + int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ + if((level >= NCCL_LOG_INFO && ((unsigned long)(FLAGS) & ncclDebugMask)) || (level < 0)) \ + ncclDebugLog(NCCL_LOG_INFO, (unsigned long)(FLAGS), __func__, __LINE__, __VA_ARGS__); \ + } while(0) + +#define TRACE_CALL(...) \ + do { \ + int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ + if((level >= NCCL_LOG_TRACE && (NCCL_CALL & ncclDebugMask)) || (level < 0)) { \ + ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__); \ + } \ + } while (0) #ifdef ENABLE_TRACE -#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) +#define TRACE(FLAGS, ...) \ + do { \ + int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ + if ((level >= NCCL_LOG_TRACE && ((unsigned long)(FLAGS) & ncclDebugMask)) || (level < 0)) { \ + ncclDebugLog(NCCL_LOG_TRACE, (unsigned long)(FLAGS), __func__, __LINE__, __VA_ARGS__); \ + } \ + } while (0) #else #define TRACE(...) #endif diff --git a/src/include/dev_runtime.h b/src/include/dev_runtime.h new file mode 100644 index 000000000..5f6e66e33 --- /dev/null +++ b/src/include/dev_runtime.h @@ -0,0 +1,92 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEVICE_RUNTIME_H_ +#define NCCL_DEVICE_RUNTIME_H_ +#include "nccl.h" +#include "nccl_device.h" +#include "nccl_common.h" +#include "allocator.h" +#include "bitops.h" +#include "utils.h" + +//////////////////////////////////////////////////////////////////////////////// +// ncclDevr[_]: runtime implements for symmetric API. 
+ +struct ncclDevrMemory; +struct ncclDevrWindow { + struct ncclDevrMemory* memory; + void* userPtr; + size_t size; + size_t bigOffset; // Offset in big VA space. + int winFlags; + void* localRegHandle; + struct ncclWindow_vidmem* vidmem; +}; +struct ncclDevrWindowSorted; +struct ncclDevrTeam; + +struct ncclDevrRegTask { + struct ncclDevrRegTask *next; + void* userPtr; + size_t userSize; + int winFlags; + ncclWindow_t* outWinDev; +}; + +struct ncclDevrCommCreateTask { + struct ncclDevrCommCreateTask *next; + struct ncclDevCommRequirements* reqs; + struct ncclDevComm* outDevComm; +}; + +struct ncclDevrState { + // Like localRank/localRanks except "lsa" ranks must be consecutive in the world + // and all lsa subsets have the same number of ranks. If any condition is + // false then the lsa team is just the singleton of self. + int lsaSelf; + int lsaSize; + int* lsaRankList; + + size_t granularity; // cuMemGetAllocationGranularity + struct ncclDevrMemory* memHead; + struct ncclDevrWindowSorted* winSorted; + int winSortedCapacity, winSortedCount; + struct ncclDevrTeam* teamHead; + size_t bigSize; // size of our big logical space (128GB?) + struct ncclSpace bigSpace; // allocates our big VA space. + void* lsaFlatBase; // base ptr for all lsa ranks big VA's concatenated together: size = lsaRanks*bigSize + struct ncclShadowPool shadows; + struct ncclDevCommWindowTable* windowTable; + + struct ncclIntruQueue regTaskQueue; + struct ncclIntruQueue commCreateTaskQueue; +}; + +// We assume ncclComm has a `ncclDevrState symState` member. +ncclResult_t ncclDevrInitOnce(struct ncclComm* comm); +ncclResult_t ncclDevrFinalize(struct ncclComm* comm); + +// If found *outWinHost will be populated and *outWinId >= 0, otherwise *outWinId == -1 +ncclResult_t ncclDevrFindWindow(struct ncclComm* comm, void const* userPtr, struct ncclDevrWindow** outWin); + +ncclResult_t ncclDevrWindowRegisterInGroup( + struct ncclComm* comm, void* ptr, size_t size, int winFlags, ncclWindow_t* outWinDev +); + +ncclResult_t ncclDevrCommCreateInternal( + struct ncclComm* comm, struct ncclDevCommRequirements const* reqs, struct ncclDevComm* outDevComm +); +void freeDevCommRequirements( + struct ncclDevCommRequirements* reqs +); + +// Get the corresponding pointer in another lsa rank's symmetric memory window +ncclResult_t ncclDevrGetLsaRankPtr(struct ncclComm* comm, struct ncclDevrWindow* winHost, size_t offset, int lsaRank, void** outPtr); + +// Get the multicast address for a given team +ncclResult_t ncclDevrGetLsaTeamPtrMC(struct ncclComm* comm, struct ncclDevrWindow* winHost, size_t offset, struct ncclTeam lsaTeam, void** outPtr); +#endif diff --git a/src/include/device.h b/src/include/device.h index 2c5ce1029..9ffc26095 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -8,9 +8,8 @@ #define NCCL_DEVICE_H_ #include "nccl.h" -#include "nccl_common.h" +#include "nccl_tuner.h" #include "bitops.h" -#include "symmetric.h" #include #include #include @@ -159,6 +158,7 @@ struct ncclProxyConnector { struct ncclConnector { int connected; int hasSeen; + int p2pOnly; struct ncclProxyConnector proxyConn; struct ncclTransportComm* transportComm; void* transportResources; @@ -228,7 +228,7 @@ struct ncclChannelPeer { int refCount; }; -struct ncclDevComm; +struct ncclKernelComm; struct alignas(16) ncclDevWorkP2p { void *sendAddr, *recvAddr; @@ -267,16 +267,10 @@ inline __host__ uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2 // ncclP2pChannelToPart and ncclP2pChannelForPart are inverses. 
The device code // uses ncclP2pChannelToPart to determine which part "this" channel is responsible for. inline __host__ int ncclP2pChannelForPart(int nP2pChannels, int base, int part) { - // Only works because nP2pChannels is pow2 - int nChannelsLog2 = countOneBits(nP2pChannels-1); - int delta = reverseBits(part, nChannelsLog2); - return (base + delta) & (nP2pChannels-1); + return (base + part) & (nP2pChannels-1); } inline __device__ int ncclP2pChannelToPart(int nP2pChannels, int base, int channel) { - // Only works because nP2pChannels is pow2 - int nChannelsLog2 = countOneBits(nP2pChannels-1); - int delta = (channel-base) & (nP2pChannels-1); - return reverseBits(delta, nChannelsLog2); + return (channel - base) & (nP2pChannels-1); } struct alignas(16) ncclDevWorkColl { @@ -413,7 +407,7 @@ struct ncclDevProfiler { } data[MAX_PROFILER_EVENTS_PER_CHANNEL]; }; -struct ncclDevComm { +struct ncclKernelComm { int rank; int nRanks; int node; @@ -436,8 +430,8 @@ struct ncclDevComm { struct ncclDevProfiler* workCompleted/*[MAXCHANNELS]*/; }; -struct alignas(16) ncclDevCommAndChannels { - struct ncclDevComm comm; +struct alignas(16) ncclKernelCommAndChannels { + struct ncclKernelComm comm; struct ncclDevChannel channels[MAXCHANNELS]; }; @@ -448,7 +442,7 @@ enum ncclDevWorkStorageType: uint8_t { }; struct alignas(16) ncclDevKernelArgs { - struct ncclDevComm* comm; + struct ncclKernelComm* comm; uint64_t channelMask; enum ncclDevWorkStorageType workStorageType; uint32_t workMask; diff --git a/src/include/graph.h b/src/include/graph.h index 7475e5a7b..6b926717e 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -115,6 +115,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); struct ncclTopoRanks { + int crossNicRing; int ringRecv[MAXCHANNELS]; int ringSend[MAXCHANNELS]; int ringPrev[MAXCHANNELS]; @@ -131,6 +132,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent); +ncclResult_t ncclTopoInitTunerConstants(struct ncclComm* comm); ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time); diff --git a/src/include/group.h b/src/include/group.h index 033a187da..6e317c6c4 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -43,6 +43,7 @@ struct ncclAsyncJob { uint32_t* childAbortFlagDev; /* point to child abortFlagDev */ ncclComm_t comm; int destroyFlag; + bool isThreadMain; }; ncclResult_t ncclAsyncLaunch( diff --git a/src/include/nccl_common.h b/src/include/nccl_common.h index 0f387c15e..0a3842151 100644 --- a/src/include/nccl_common.h +++ b/src/include/nccl_common.h @@ -7,6 +7,11 @@ #ifndef NCCL_DEBUG_H_ #define NCCL_DEBUG_H_ +// Workaround for libstdc++ trying to force public visibility of std:: symbols. We don't want to do that in libnccl.so. 
+#include <bits/c++config.h> +#undef _GLIBCXX_VISIBILITY +#define _GLIBCXX_VISIBILITY(V) + +#include typedef enum { @@ -60,24 +65,11 @@ typedef enum { ncclFuncSendRecv = 5, ncclFuncSend = 6, ncclFuncRecv = 7, - ncclNumFuncs = 8 + ncclFuncAlltoAll = 8, + ncclFuncScatter = 9, + ncclFuncGather = 10, + ncclNumFuncs = 11 } ncclFunc_t; -#define NCCL_NUM_ALGORITHMS 7 // Tree/Ring/CollNet*/PAT -#define NCCL_ALGO_UNDEF -1 -#define NCCL_ALGO_TREE 0 -#define NCCL_ALGO_RING 1 -#define NCCL_ALGO_COLLNET_DIRECT 2 -#define NCCL_ALGO_COLLNET_CHAIN 3 -#define NCCL_ALGO_NVLS 4 -#define NCCL_ALGO_NVLS_TREE 5 -#define NCCL_ALGO_PAT 6 - -#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 -#define NCCL_PROTO_UNDEF -1 -#define NCCL_PROTO_LL 0 -#define NCCL_PROTO_LL128 1 -#define NCCL_PROTO_SIMPLE 2 -#define NCCL_ALGO_PROTO_IGNORE -1.0 #endif diff --git a/src/include/nccl_device.h b/src/include/nccl_device.h new file mode 100644 index 000000000..88b2531d1 --- /dev/null +++ b/src/include/nccl_device.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_device/impl/comm__funcs.h" +#include "nccl_device/coop.h" +#include "nccl_device/impl/core__funcs.h" +#include "nccl_device/impl/ll_a2a__funcs.h" +#include "nccl_device/impl/mem_barrier__funcs.h" +//#include "nccl_device/net_barrier__funcs.h" +//#include "nccl_device/net_scratch_a2a__funcs.h" +//#include "nccl_device/barrier__funcs.h" +#include "nccl_device/impl/ptr__funcs.h" diff --git a/src/include/nccl_device/README.md b/src/include/nccl_device/README.md new file mode 100644 index 000000000..bf1728d47 --- /dev/null +++ b/src/include/nccl_device/README.md @@ -0,0 +1,32 @@ +This directory has been structured to make it easy for users to read the headers to learn the API. The files adjacent +to this README are meant for humans. They contain the essential declarations (which types exist, function prototypes) and comments +indicating the contract/usage. Everything else goes into the "impl/" subdirectory. Most modules are stratified into three layers: + +1) "foo.h" Public API declarations. +2) "impl/foo__types.h" struct definitions. Has #include of layer 1. +3) "impl/foo__funcs.h" inline functions. Has #include of layer 2. + +The include dependencies should be acyclic for layers 1 and 2 since order matters for declarations and types. Layer 3 though +can freely have cycles amongst itself ("impl/foo__funcs.h" and "impl/bar__funcs.h" can mutually include each other) since +functions can be defined in any order once declared. + +Translation units should just include "nccl_device.h" to ensure they get all the "impl/foo__funcs.h". But if a translation unit wants +to be more specific about which modules it pulls in, it should include the individual "impl/foo__funcs.h" headers. + +One of the nasty reasons this layering was required is C++ defaulted function parameters: + +``` +// +++ in foo.h +++ +struct Foo; // defined in some __types.h + +// +++ in "impl/foo__types.h" +++ +struct Foo { int x; }; + +// +++ in "bar.h" +++ +// Prototype function where default value is default construction of Foo. Since +// Foo would be incomplete if just including "foo.h" the compiler errors because +// it can't reason about the {}. +// I was able to solve this by including "impl/foo__types.h" instead.
+#include "impl/foo__types.h" +void bar(Foo arg = {}); +``` diff --git a/src/include/nccl_device/comm.h b/src/include/nccl_device/comm.h new file mode 100644 index 000000000..d989ce1f6 --- /dev/null +++ b/src/include/nccl_device/comm.h @@ -0,0 +1,10 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_COMM_H_ +#define _NCCL_DEVICE_COMM_H_ +#include "core.h" +#endif diff --git a/src/include/nccl_device/coop.h b/src/include/nccl_device/coop.h new file mode 100644 index 000000000..9a8d4b0a8 --- /dev/null +++ b/src/include/nccl_device/coop.h @@ -0,0 +1,152 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_COOP_H_ +#define _NCCL_DEVICE_COOP_H_ +#include "utility.h" + +// ncclCoop[Foo]: NCCL's versions of CUDA's Cooperative Groups. They conform +// to just this subset of the CUDA API: +// int Coop::thread_rank(); +// int Coop::size(); +// int Coop::num_threads(); // same as size() +// void Coop::sync(); + +#if __CUDACC__ +template +struct ncclCoopTile { // An aligned pow2 set of threads within the warp. + static_assert(nccl::utility::isPow2(nThreadsPow2) && nThreadsPow2 <= 32, "Condition required"); + + NCCL_DEVICE_INLINE int thread_rank() const { + return nccl::utility::lane() % nThreadsPow2; + } + NCCL_DEVICE_INLINE constexpr int size() const { return nThreadsPow2; } + NCCL_DEVICE_INLINE constexpr int num_threads() const { return nThreadsPow2; } + + NCCL_DEVICE_INLINE uint32_t laneMask() const { + return (-1u>>(32-nThreadsPow2))<<(nccl::utility::lane() & -nThreadsPow2); + } + NCCL_DEVICE_INLINE void sync() { + __syncwarp(laneMask()); + } +}; +#endif + +#if __CUDACC__ +typedef ncclCoopTile<1> ncclCoopThread; +typedef ncclCoopTile<32> ncclCoopWarp; +#endif + +#if __CUDACC__ +struct ncclCoopLanes { // Some lanes of this warp. + uint32_t lmask; + + NCCL_DEVICE_INLINE constexpr ncclCoopLanes(uint32_t lmask=-1u): lmask(lmask) {} + + NCCL_DEVICE_INLINE int thread_rank() const { + return __popc(lmask & nccl::utility::lanemask_lt()); + } + NCCL_DEVICE_INLINE int size() const { + return __popc(lmask); + } + NCCL_DEVICE_INLINE int num_threads() const { + return __popc(lmask); + } + NCCL_DEVICE_INLINE void sync() { + __syncwarp(lmask); + } +}; +#endif + +#if __CUDACC__ +// A set of consecutive warps that the user has also supplied with a unique +// id from [0..15]. It is an error for two different warp spans with the same +// id to be in a collective concurrently. 
+struct ncclCoopWarpSpan { + uint32_t warp0:8, nWarps:8, id:8; + + NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id): + warp0(warp0), nWarps(nWarps), id(id) { + } + + NCCL_DEVICE_INLINE int thread_rank() const { + return threadIdx.x - 32*warp0; + } + NCCL_DEVICE_INLINE int size() const { + return 32*nWarps; + } + NCCL_DEVICE_INLINE int num_threads() const { + return 32*nWarps; + } + + NCCL_DEVICE_INLINE void sync() { + //asm volatile("barrier.sync %0, %1;" :: "r"(1+id), "r"(32*nWarps) : "memory"); + __barrier_sync_count(1+id, 32*nWarps); + } +}; +#endif + +#if __CUDACC__ +struct ncclCoopCta { + NCCL_DEVICE_INLINE int thread_rank() const { return threadIdx.x; } + NCCL_DEVICE_INLINE int size() const { return blockDim.x; } + NCCL_DEVICE_INLINE int num_threads() const { return blockDim.x; } + NCCL_DEVICE_INLINE void sync() { __syncthreads(); } +}; +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopTile coop) { + return coop.laneMask(); +} +NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopLanes coop) { + return coop.lmask; +} +NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopWarpSpan coop) { + return -1u; +} +NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopCta coop) { + return -1u; +} +#endif + +#if __CUDACC__ +// ncclCoopIsThread: +// At compile time do we know the given coop is a single thread only. +template +NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopTile) { + return nThreads == 1; +} +NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopLanes) { return false; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return false; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; } +#endif + +#if __CUDACC__ +// Pick threads of our warp that are safe to use collectively. +NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() { + return ncclCoopLanes{__activemask()}; +} +#endif + +#if __CUDACC__ +// Pick threads of our warp that are safe to use collectively given that this +// is a collective on the provided cooperative group. +template +NCCL_DEVICE_INLINE ncclCoopTile<32> ncclCoopCoalesced(Coop) { + return ncclCoopTile<32>(); +} +NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced(ncclCoopLanes coop) { + return coop; +} +template +NCCL_DEVICE_INLINE ncclCoopTile ncclCoopCoalesced(ncclCoopTile coop) { + return coop; +} +#endif + +#endif diff --git a/src/include/nccl_device/core.h b/src/include/nccl_device/core.h new file mode 100644 index 000000000..dd41d6925 --- /dev/null +++ b/src/include/nccl_device/core.h @@ -0,0 +1,150 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
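coop.h above defines NCCL's lightweight cooperative groups, each exposing only thread_rank(), size()/num_threads() and sync(), so one device function can be written against any group shape. The CUDA sketch below imitates that contract with an invented ToyTile type loosely modeled on ncclCoopTile; it does not use the NCCL headers:

```
#include <cstdio>
#include <cuda_runtime.h>

// Toy warp tile: an aligned, power-of-two subset of a warp exposing the
// same thread_rank/size/sync contract as the coop types above.
template<int kThreads>
struct ToyTile {
  __device__ int thread_rank() const { return threadIdx.x % kThreads; }
  __device__ int size() const { return kThreads; }
  __device__ void sync() const {
    unsigned lanes = 0xffffffffu >> (32 - kThreads);            // kThreads consecutive lanes
    __syncwarp(lanes << ((threadIdx.x & 31) & ~(kThreads - 1)));
  }
};

// One function body works for any group honoring the contract.
template<typename Coop>
__device__ int tileSum(Coop coop, int* scratch, int v) {
  scratch[coop.thread_rank()] = v;
  coop.sync();
  int total = 0;
  for (int i = 0; i < coop.size(); i++) total += scratch[i];
  coop.sync();
  return total;
}

__global__ void demo() {
  __shared__ int scratch[32];
  ToyTile<8> tile;                       // four independent 8-thread tiles per warp
  int total = tileSum(tile, scratch + (threadIdx.x & ~7), 1);
  if (tile.thread_rank() == 0)
    printf("tile of %d threads: sum=%d\n", tile.size(), total);
}

int main() {
  demo<<<1, 32>>>();
  return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}
```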
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_CORE_H_ +#define _NCCL_DEVICE_CORE_H_ +#include +#include "coop.h" +#include "utility.h" + +struct ncclDevComm; +typedef struct ncclDevComm ncclDevComm_t; + +struct ncclTeam; +typedef struct ncclTeam ncclTeam_t; + +// typedef struct ncclWindow_vidmem* ncclWindow_t; // in nccl.h + +struct ncclMultimemHandle; +typedef struct ncclMultimemHandle ncclMultimemHandle_t; + +typedef uint32_t ncclDevResourceHandle; +typedef ncclDevResourceHandle ncclDevResourceHandle_t; + +struct ncclLsaBarrierHandle; +typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t; + +struct ncclLLA2AHandle; +typedef struct ncclLLA2AHandle ncclLLA2AHandle_t; + +struct ncclTeam { + int nRanks, rank, stride; +}; + +#if __cplusplus +template struct ncclSymPtr; +#endif + +#if __cplusplus +struct ncclTeamTagWorld {}; +struct ncclTeamTagLsa {}; +struct ncclTeamTagRail {}; +#endif + +struct ncclDevCommRequirements; +typedef struct ncclDevCommRequirements ncclDevCommRequirements_t; + +struct ncclDevResourceRequirements; +typedef struct ncclDevResourceRequirements ncclDevResourceRequirements_t; + +struct ncclTeamRequirements; +typedef struct ncclTeamRequirements ncclTeamRequirements_t; + +struct ncclDevCommRequirements { + ncclDevResourceRequirements_t* resourceRequirementsList; + ncclTeamRequirements_t* teamRequirementsList; + + bool lsaMultimem; // Enable multimem on lsa team + + int lsaBarrierCount; +}; + +struct ncclDevResourceRequirements { + ncclDevResourceRequirements_t* next; + size_t bufferSize, bufferAlign; + ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate. +}; + +struct ncclTeamRequirements { + ncclTeamRequirements_t* next; + ncclTeam_t team; + bool multimem; + ncclMultimemHandle_t* outMultimemHandle; // If non-null, target assigned during ncclDevCommCreate. +}; + +NCCL_EXTERN_C __host__ ncclResult_t ncclDevCommCreate(ncclComm_t, ncclDevCommRequirements_t const*, ncclDevComm_t* outDevComm); +NCCL_EXTERN_C __host__ ncclResult_t ncclDevCommDestroy(ncclComm_t, ncclDevComm_t const* devComm); + +//////////////////////////////////////////////////////////////////////////////// +// Team API: + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamWorld(ncclDevComm const&); +#endif +NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamWorld(ncclComm_t); + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamLsa(ncclDevComm const&); +#endif +NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamLsa(ncclComm_t); + +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE bool ncclTeamRankIsMember(ncclTeam_t a, ncclTeam_t b, int bPeer); +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE int ncclTeamRankToTeam(ncclTeam_t a, ncclTeam_t b, int bPeer); + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToWorld(ncclDevComm const&, ncclTeam, int rank); +#endif +NCCL_EXTERN_C __host__ int ncclTeamRankToWorld(ncclComm_t, ncclTeam_t, int rank); + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToLsa(ncclDevComm const&, ncclTeam, int rank); +#endif +NCCL_EXTERN_C __host__ int ncclTeamRankToLsa(ncclComm_t, ncclTeam_t, int rank); + +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamInnerFactor(ncclTeam_t parent, int innerSize); +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamOuterFactor(ncclTeam_t parent, int innerSize); + +// Interpret each team as a set of ranks. This function assumes that `subset` +// is a subset of `parent`. 
Thus the number of ranks in the set difference of +// `parent` minus `subset` is `super.nRanks - subset.nRanks`. Given `index` this +// function returns the index'th element of `parent` minus `subset`. +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE int ncclTeamRankInDifference(ncclTeam_t parent, ncclTeam_t subset, int index); + +// Equivalent to ncclTeamOuterFactor of lsa team. +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamRail(ncclDevComm const&); +#endif +NCCL_EXTERN_C __host__ ncclTeam_t ncclTeamRail(ncclComm_t); + +// Get offset of resource buffer within `comm.resourceWindow`. +NCCL_EXTERN_C NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResourceHandle_t); + +#if __CUDACC__ +NCCL_DEVICE_INLINE ncclSymPtr ncclGetResourceBuffer(ncclDevComm const&, ncclDevResourceHandle); +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Window API: + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offset); +NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset, int peer); +NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, int peer); +NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, ncclTeam tm, int peer); +NCCL_DEVICE_INLINE void* ncclGetMultimemPointer(ncclWindow_t w, size_t offset, ncclMultimemHandle mmHandle); +NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_t offset, ncclDevComm const&); +#endif + +#if __CUDACC__ +// Convenience for combining ncclGet***Pointer() with resource handle. +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLocalPointer(ncclDevComm const&, ncclDevResourceHandle); +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaPointer(ncclDevComm const&, ncclDevResourceHandle, int peer); +NCCL_DEVICE_INLINE void* ncclGetResourceBufferPeerPointer(ncclDevComm const&, ncclDevResourceHandle, ncclTeam, int peer); +NCCL_DEVICE_INLINE void* ncclGetResourceBufferMultimemPointer(ncclDevComm const&, ncclDevResourceHandle, ncclMultimemHandle); +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaMultimemPointer(ncclDevComm const&, ncclDevResourceHandle); +#endif + +#endif // _NCCL_DEVICE_CORE_H_ diff --git a/src/include/nccl_device/impl/comm__funcs.h b/src/include/nccl_device/impl/comm__funcs.h new file mode 100644 index 000000000..0bfe90c91 --- /dev/null +++ b/src/include/nccl_device/impl/comm__funcs.h @@ -0,0 +1,10 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_COMM__FUNCS_H_ +#define _NCCL_DEVICE_COMM__FUNCS_H_ +#include "comm__types.h" +#endif // _NCCL_DEVICE_COMM__FUNCS_H_ diff --git a/src/include/nccl_device/impl/comm__types.h b/src/include/nccl_device/impl/comm__types.h new file mode 100644 index 000000000..680d7055b --- /dev/null +++ b/src/include/nccl_device/impl/comm__types.h @@ -0,0 +1,40 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_COMM__TYPES_H_ +#define _NCCL_DEVICE_COMM__TYPES_H_ +#include "../comm.h" +#include "core__types.h" +#include "mem_barrier__types.h" +#include "ll_a2a__types.h" + +struct ncclDevCommWindowTable; +#if __cplusplus +struct ncclDevCommWindowTable { + struct Entry { + uintptr_t base, size; + ncclWindow_t window; + } entries[32]; + struct ncclDevCommWindowTable* next; +}; +#endif + +struct ncclDevComm { + int rank, nRanks; + uint32_t nRanks_rcp32; + int lsaRank, lsaSize; + uint32_t lsaSize_rcp32; + + struct ncclDevCommWindowTable* windowTable; + + ncclWindow_t resourceWindow; + struct ncclWindow_vidmem resourceWindow_inlined; + + ncclMultimemHandle_t lsaMultimem; + ncclLsaBarrierHandle_t lsaBarrier; +}; + +#endif // _NCCL_DEVICE_COMM__TYPES_H_ diff --git a/src/include/nccl_device/impl/core__funcs.h b/src/include/nccl_device/impl/core__funcs.h new file mode 100644 index 000000000..1087cd289 --- /dev/null +++ b/src/include/nccl_device/impl/core__funcs.h @@ -0,0 +1,210 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_CORE__FUNCS_H_ +#define _NCCL_DEVICE_CORE__FUNCS_H_ +#include "core__types.h" +#include "comm__types.h" +#include "ptr__types.h" + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamWorld(ncclDevComm const &comm) { + ncclTeam ans; + ans.nRanks = comm.nRanks; + ans.rank = comm.rank; + ans.stride = 1; + return ans; +} +#endif + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamLsa(ncclDevComm const &comm) { + ncclTeam ans; + ans.nRanks = comm.lsaSize; + ans.rank = comm.lsaRank; + ans.stride = 1; + return ans; +} +#endif + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE ncclTeam ncclTeamRail(ncclDevComm const& comm) { + ncclTeam ans; + ans.nRanks = nccl::utility::idivFast32(comm.nRanks, comm.lsaSize, comm.lsaSize_rcp32); + ans.rank = nccl::utility::idivFast32(comm.rank, comm.lsaSize, comm.lsaSize_rcp32); + ans.stride = comm.lsaSize; + return ans; +} +#endif + +NCCL_HOST_DEVICE_INLINE bool ncclTeamRankIsMember(ncclTeam_t a, ncclTeam_t b, int brank) { + int wrank = (brank - b.rank)*b.stride; + uint32_t adelta = wrank/a.stride; + uint32_t amod = wrank%a.stride; + int arank = a.rank + adelta; + return 0 <= arank && arank < a.nRanks && amod == 0; +} + +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToTeam(ncclTeam_t a, ncclTeam_t b, int brank) { + int wrank = (brank - b.rank)*b.stride; + uint32_t adelta = wrank/a.stride; + //uint32_t amod = wrank%a.stride; + int arank = a.rank + adelta; + return arank; +} + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToWorld(ncclDevComm const& comm, ncclTeam tm, int rank) { + return comm.rank + (rank - tm.rank)*tm.stride; +} +#endif + +#if __cplusplus +NCCL_HOST_DEVICE_INLINE int ncclTeamRankToLsa(ncclDevComm const& comm, ncclTeam tm, int rank) { + return comm.lsaRank + (rank - tm.rank)*tm.stride; +} +#endif + +NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamInnerFactor(ncclTeam_t parent, int innerSize) { + ncclTeam_t ans; + ans.nRanks = innerSize; + ans.rank = parent.rank%innerSize; + ans.stride = parent.stride; + return ans; +} + +NCCL_HOST_DEVICE_INLINE ncclTeam_t ncclTeamOuterFactor(ncclTeam_t parent, int innerSize) { + ncclTeam_t ans; + ans.nRanks = 
parent.nRanks/innerSize; + ans.rank = parent.rank/innerSize; + ans.stride = parent.stride*innerSize; + return ans; +} + +NCCL_HOST_DEVICE_INLINE int ncclTeamRankInDifference(ncclTeam_t parent, ncclTeam_t subset, int index) { + int stride = subset.stride/parent.stride; + int below = parent.rank - subset.rank*stride; + if (stride < 0) { + stride = -stride; + below -= (subset.nRanks-1)*stride; + } + if (index < below) { + return index; + } else if (index-below < (subset.nRanks-1)*(stride-1)) { + return below + 1 + ((index-below)/(stride-1))*stride + (index-below)%(stride-1); + } else { + return below + 1 + (subset.nRanks-1)*stride + (index - below - (subset.nRanks-1)*(stride-1)); + } +} + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetLocalPointer(ncclWindow_t w, size_t offset) { + char* base = nccl::utility::loadConst(&w->lsaFlatBase); + uint32_t stride4G = nccl::utility::loadConst(&w->stride4G); + int i = nccl::utility::loadConst(&w->lsaRank); + return (void*)(nccl::utility::add4G(base, i*stride4G) + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetLsaPointer(ncclWindow_t w, size_t offset, int peer) { + char* base = nccl::utility::loadConst(&w->lsaFlatBase); + uint32_t stride4G = nccl::utility::loadConst(&w->stride4G); + int i = peer; + return (void*)(nccl::utility::add4G(base, i*stride4G) + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, int peer) { + char* base = nccl::utility::loadConst(&w->lsaFlatBase); + uint32_t stride4G = nccl::utility::loadConst(&w->stride4G); + int worldRank = nccl::utility::loadConst(&w->worldRank); + int lsaRank = nccl::utility::loadConst(&w->lsaRank); + int i = lsaRank + (peer - worldRank); + return (void*)(nccl::utility::add4G(base, i*stride4G) + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetPeerPointer(ncclWindow_t w, size_t offset, ncclTeam tm, int peer) { + char* base = nccl::utility::loadConst(&w->lsaFlatBase); + uint32_t stride4G = nccl::utility::loadConst(&w->stride4G); + int lsaRank = nccl::utility::loadConst(&w->lsaRank); + int i = lsaRank + (peer - tm.rank)*tm.stride; + return (void*)(nccl::utility::add4G(base, i*stride4G) + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetMultimemPointer(ncclWindow_t w, size_t offset, ncclMultimemHandle mm) { + void* ptr = mm.mcBasePtr; + ptr = reinterpret_cast(ptr) + nccl::utility::loadConst(&w->mcOffset4K); + return (void*)((char*)ptr + offset); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_t offset, ncclDevComm const& comm) { + return ncclGetMultimemPointer(w, offset, comm.lsaMultimem); +} +#endif + +NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResourceHandle_t h) { + return ((size_t)h)*128; +} + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLocalPointer(ncclDevComm const& comm, ncclDevResourceHandle h) { + void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase; + uint32_t stride4G = comm.resourceWindow_inlined.stride4G; + void* local = nccl::utility::add4G(lsaFlatBase, comm.lsaRank*stride4G); + return (void*)(reinterpret_cast(local) + h); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaPointer(ncclDevComm const& comm, ncclDevResourceHandle h, int peer) { + int r = peer; + void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase; + uint32_t stride4G = comm.resourceWindow_inlined.stride4G; + void* local = nccl::utility::add4G(lsaFlatBase, r*stride4G); 
+ return (void*)(reinterpret_cast(local) + h); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferPeerPointer(ncclDevComm const& comm, ncclDevResourceHandle h, ncclTeam team, int peer) { + int r = comm.lsaRank + (peer - team.rank)*team.stride; + void* lsaFlatBase = comm.resourceWindow_inlined.lsaFlatBase; + uint32_t stride4G = comm.resourceWindow_inlined.stride4G; + void* local = nccl::utility::add4G(lsaFlatBase, r*stride4G); + return (void*)(reinterpret_cast(local) + h); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferMultimemPointer(ncclDevComm const& comm, ncclDevResourceHandle h, ncclMultimemHandle mm) { + void* ptr = mm.mcBasePtr; + ptr = reinterpret_cast(ptr) + comm.resourceWindow_inlined.mcOffset4K; + ptr = reinterpret_cast(ptr) + h; + return ptr; +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void* ncclGetResourceBufferLsaMultimemPointer(ncclDevComm const& comm, ncclDevResourceHandle h) { + return ncclGetResourceBufferMultimemPointer(comm, h, comm.lsaMultimem); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE ncclSymPtr ncclGetResourceBuffer(ncclDevComm const& comm, ncclDevResourceHandle h) { + return ncclSymPtr(comm.resourceWindow, size_t(h)*128); +} +#endif + +#endif diff --git a/src/include/nccl_device/impl/core__types.h b/src/include/nccl_device/impl/core__types.h new file mode 100644 index 000000000..d2d1350b1 --- /dev/null +++ b/src/include/nccl_device/impl/core__types.h @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_CORE__TYPES_H_ +#define _NCCL_DEVICE_CORE__TYPES_H_ +#include "../core.h" + +// nccl.h has: typedef ncclWindow_vidmem* ncclWindow_t; +struct ncclWindow_vidmem { + void* winHost; + //ncclGinWindow_t ginWin; + char* lsaFlatBase; // pointer to first byte for rank 0 of lsa team + int lsaRank; + int worldRank; + uint32_t stride4G; + uint32_t mcOffset4K; +}; + +struct ncclMultimemHandle { + void* mcBasePtr; +}; + +#endif diff --git a/src/include/nccl_device/impl/ll_a2a__funcs.h b/src/include/nccl_device/impl/ll_a2a__funcs.h new file mode 100644 index 000000000..39bdf7a29 --- /dev/null +++ b/src/include/nccl_device/impl/ll_a2a__funcs.h @@ -0,0 +1,229 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
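In the implementations above, every lsa rank's slice of a window is mapped into one flat VA range (lsaFlatBase) with a fixed per-rank stride (stride4G, counted in 4 GiB units and applied via add4G), so a peer pointer is plain arithmetic: base + peer*stride + offset. The host-only sketch below imitates that layout with an ordinary heap buffer and a byte stride; ToyWindow is an invented name and nothing here touches the CUDA VMM mapping the real runtime sets up:

```
#include <cstdio>
#include <cstdlib>
#include <cstring>

// Toy "flat" symmetric mapping: every rank's window slice lives at
// flatBase + rank*strideBytes, so peer addresses are pure arithmetic.
struct ToyWindow {
  char* flatBase;      // start of rank 0's slice
  size_t strideBytes;  // distance between consecutive ranks' slices
  int lsaRank;         // my position in the flat mapping

  void* localPtr(size_t offset) const { return flatBase + lsaRank * strideBytes + offset; }
  void* peerPtr(int peer, size_t offset) const { return flatBase + peer * strideBytes + offset; }
};

int main() {
  const int lsaSize = 4;
  const size_t strideBytes = 1 << 20;           // 1 MiB per rank in this toy
  char* flat = (char*)calloc(lsaSize, strideBytes);

  ToyWindow win = { flat, strideBytes, /*lsaRank=*/2 };
  strcpy((char*)win.localPtr(0), "hello from rank 2");

  // Any rank holding the same flat mapping reads rank 2's slice directly.
  ToyWindow peerView = { flat, strideBytes, /*lsaRank=*/0 };
  printf("peer 2 says: %s\n", (char*)peerView.peerPtr(2, 0));
  free(flat);
}
```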
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_LL_A2A__FUNCS_H_ +#define _NCCL_DEVICE_LL_A2A__FUNCS_H_ +#include "ll_a2a__types.h" +#include "comm__types.h" +#include "../utility.h" + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLLA2ASession::ncclLLA2ASession( + Coop coop, ncclDevComm const& comm, ncclTeam team, + ncclLLA2AHandle handle, uint32_t block, int maxElts, + bool multimem, ncclMultimemHandle mmHandle + ): + ncclLLA2ASession_internal{ + coop, comm, team, handle, (int)block, /*pitch=*/maxElts, + multimem, mmHandle, /*epoch=*/0, /*slotsOffset=*/0 + } { + uint4* line = (uint4*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle); + line += block*(1 + 2*handle.nSlots); + this->epoch = line->x + 2; + this->slotsOffset = this->calcSlotOffset(); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLLA2ASession::~ncclLLA2ASession() { + uint4* line = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle); + line += this->block*(1 + 2*this->handle.nSlots); + if (this->coop.thread_rank() == 0) line->x = this->epoch - 2; + this->coop.sync(); +} +#endif + +#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE void ncclLLA2ASession::send(int peer, int elt, T data) { + using nccl::utility::divUp; + union { T tmp; uint32_t u32[divUp(sizeof(T), 8)][2]; }; + tmp = data; + uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, peer); + buf += this->slotsOffset + elt; + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: + "l"(buf + u*this->pitch), + "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch) + ); + } +} +#endif + +#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE void ncclLLA2ASession::bcast(int elt, T data) { + using nccl::utility::divUp; + if (this->multimem) { + union { T tmp; uint32_t u32[divUp(sizeof(T),8)][2]; }; + tmp = data; + uint4* bufmc = (uint4*)ncclGetResourceBufferMultimemPointer(this->comm, this->handle.bufHandle, this->mmHandle); + bufmc += this->slotsOffset + elt; + #pragma unroll + for (int u=0; u < divUp(sizeof(T), 8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: + "l"(bufmc + this->pitch*u), + "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch) + ); + } + } else { + union { T tmp; uint32_t u32[divUp(sizeof(T), 8)][2]; }; + tmp = data; + int dr = 0; + int r = this->team.rank; + #pragma unroll 1 + for (; dr+8 <= this->team.nRanks; dr += 8) { + #pragma unroll + for (int ur=0; ur < 8; ur++) { + uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, r); + buf += this->slotsOffset + elt; + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: + "l"(buf + u*this->pitch), + "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch) + ); + } + r += 1; + if (r == this->team.nRanks) r = 0; + } + } + #pragma unroll + for (int ur=0; ur < 8; ur++, dr++) { + if (dr == this->team.nRanks) break; + uint4* buf = (uint4*)ncclGetResourceBufferPeerPointer(this->comm, this->handle.bufHandle, this->team, r); + buf += this->slotsOffset + elt; + #pragma unroll + for (int u=0; u < divUp(sizeof(T),8); u++) { + asm volatile("st.volatile.v4.u32 [%0],{%1,%3,%2,%3};" :: + "l"(buf + u*this->pitch), + "r"(u32[u][0]), "r"(u32[u][1]), "r"(this->epoch) + ); + } + r += 1; + if (r == this->team.nRanks) r = 0; + } + } +} +#endif + 
+#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE T ncclLLA2ASession::recv(int elt) { + T ret[1]; + this->template recvUnrolled(elt, 1, 0, ret); + return ret[0]; +} +#endif + +#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE void ncclLLA2ASession::recvUnrolled(int eltStart, int eltCount, int eltStride, T(&elts)[MaxEltCount]) { + using nccl::utility::divUp; + uint4* buf = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle); + buf += this->slotsOffset + eltStart; + + uint4 tmp[MaxEltCount][divUp(sizeof(T), 8)]; + #pragma unroll 1 + while (true) { + #pragma unroll + for (int u=0; u < MaxEltCount; u++) { + if (u < MinEltCount || u < eltCount) { + #pragma unroll + for (int v=0; v < divUp(sizeof(T), 8); v++) { + asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];" + : "=r"(tmp[u][v].x), "=r"(tmp[u][v].y), "=r"(tmp[u][v].z), "=r"(tmp[u][v].w) + : "l"(buf + u*eltStride + v*this->pitch)); + } + } + } + bool okAll = true; + #pragma unroll + for (int u=0; u < MaxEltCount; u++) { + #pragma unroll + for (int v=0; v < divUp(sizeof(T), 8); v++) { + if (u < MinEltCount || u < eltCount) { + bool ok = tmp[u][v].y == this->epoch && + tmp[u][v].w == this->epoch; + okAll &= ok; + } + } + } + if (__builtin_expect(okAll, true)) break; + } + + #pragma unroll + for (int u=0; u < MaxEltCount; u++) { + if (MinEltCount <= u && u == eltCount) break; + union { T val; uint32_t u32[divUp(sizeof(T), 8)][2]; }; + #pragma unroll + for (int v=0; v < divUp(sizeof(T), 8); v++) { + u32[v][0] = tmp[u][v].x; + u32[v][1] = tmp[u][v].z; + } + elts[u] = val; + } +} +#endif + +#if __CUDACC__ +template +template +NCCL_DEVICE_INLINE auto ncclLLA2ASession::recvReduce( + int eltStart, int eltCount, int eltStride, EltToAcc eltToAcc, Reduce reduce + ) -> decltype(eltToAcc(nccl::utility::declval())) { + using Acc = decltype(eltToAcc(nccl::utility::declval())); + Acc acc; + int i = 0; + #pragma unroll 1 + for (; i+Unroll <= eltCount; i += Unroll) { + Elt got[Unroll]; + this->template recvUnrolled(eltStart + i*eltStride, Unroll, eltStride, got); + Acc acc0 = eltToAcc(got[0]); + acc = i==0 ? acc0 : reduce(acc, acc0); + #pragma unroll + for (int j=1; j < Unroll; j++) acc = reduce(acc, eltToAcc(got[j])); + } + if (i < eltCount) { + Elt got[Unroll]; + this->template recvUnrolled(eltStart + i*eltStride, eltCount-i, eltStride, got); + Acc acc0 = eltToAcc(got[0]); + acc = i==0 ? acc0 : reduce(acc, acc0); + #pragma unroll + for (int j=1; j < Unroll-1; j++) { + if (i+j < eltCount) acc = reduce(acc, eltToAcc(got[j])); + } + } + return acc; +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE void ncclLLA2ASession::endEpoch(Coop) { + if (__builtin_expect(this->epoch >= -2u, false)) { + this->coop.sync(); + uint4* buf = (uint4*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle); + buf += this->slotsOffset; + #pragma unroll 4 + for (int i=this->coop.thread_rank(); i < this->handle.nSlots; i += this->coop.size()) { + buf[i] = uint4{0, 0, 0, 0}; + } + } + this->coop.sync(); + this->epoch += (this->epoch == -1u) ? 3 : 1; + this->slotsOffset = this->calcSlotOffset(); +} +#endif + +#endif // _NCCL_DEVICE_LL_A2A__FUNCS_H_ diff --git a/src/include/nccl_device/impl/ll_a2a__types.h b/src/include/nccl_device/impl/ll_a2a__types.h new file mode 100644 index 000000000..501777acf --- /dev/null +++ b/src/include/nccl_device/impl/ll_a2a__types.h @@ -0,0 +1,37 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_LL_A2A__TYPES_H_ +#define _NCCL_DEVICE_LL_A2A__TYPES_H_ +#include "../ll_a2a.h" +#include "core__types.h" + +struct ncclLLA2AHandle { + ncclDevResourceHandle_t bufHandle; + uint32_t nSlots; +}; + +#if __CUDACC__ +template +struct ncclLLA2ASession_internal { + Coop coop; + ncclDevComm const& comm; + ncclTeam team; + ncclLLA2AHandle handle; + int block; + int pitch; + bool multimem; + ncclMultimemHandle mmHandle; + uint32_t epoch; + uint32_t slotsOffset; + + NCCL_DEVICE_INLINE uint32_t calcSlotOffset() const { + return block*(1 + 2*handle.nSlots) + 1 + (epoch & 1)*handle.nSlots; + } +}; +#endif + +#endif // _NCCL_DEVICE_LL_A2A__TYPES_H_ diff --git a/src/include/nccl_device/impl/mem_barrier__funcs.h b/src/include/nccl_device/impl/mem_barrier__funcs.h new file mode 100644 index 000000000..86a5d0fbc --- /dev/null +++ b/src/include/nccl_device/impl/mem_barrier__funcs.h @@ -0,0 +1,126 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_ +#define _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_ +#include "mem_barrier__types.h" +#include "comm__types.h" + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLsaBarrierSession::ncclLsaBarrierSession( + Coop coop, ncclDevComm const& comm, ncclTeam team, + ncclLsaBarrierHandle handle, uint32_t index, + bool multimem, ncclMultimemHandle mmHandle + ): + ncclLsaBarrierSession_internal{ + coop, comm, team, handle, (int)index, +#if CUDART_VERSION >= 12060 + multimem, +#else // WAR for an issue with ptxas in CTK < 12.6 + /*multimem=*/false, +#endif + mmHandle, /*epoch=*/0 + } { + uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle); + this->epoch = state[(this->multimem ? 0 : 1)*this->handle.nBarriers + this->index]; +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLsaBarrierSession::ncclLsaBarrierSession( + Coop coop, ncclDevComm const& comm, ncclTeamTagLsa, uint32_t index, bool multimem + ): ncclLsaBarrierSession( + coop, comm, ncclTeamLsa(comm), comm.lsaBarrier, index, multimem, comm.lsaMultimem + ) { +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE ncclLsaBarrierSession::~ncclLsaBarrierSession() { + uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle); + if (this->coop.thread_rank() == 0) { +#if __CUDA_ARCH__ == 1200 && CUDART_VERSION < 13000 + // WAR for a compiler issue with CTK < 13.0 + if (this->index == 0) + state[(this->multimem ? 0 : 1)*this->handle.nBarriers] = this->epoch; + else +#endif + state[(this->multimem ? 
0 : 1)*this->handle.nBarriers + this->index] = this->epoch; + } + this->coop.sync(); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE void ncclLsaBarrierSession::arrive(Coop, cuda::memory_order order) { + this->coop.sync(); + if (this->multimem) { + #if __CUDA_ARCH__ >= 900 + if (this->coop.thread_rank() == 0) { + uint32_t* inbox = this->mcInbox(/*multimem=*/true); + if (nccl::utility::releaseOrderOf(order) != cuda::memory_order_relaxed) { + asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox)); + } else { + asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox)); + } + } + #endif + } else { + #pragma unroll 1 + for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) { + int peer = i + (this->team.rank <= i ? 1 : 0); + cuda::atomic_ref inbox(*this->ucInbox(peer, this->team.rank)); + inbox.store(this->epoch+1, nccl::utility::releaseOrderOf(order)); + } + } +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE void ncclLsaBarrierSession::wait(Coop, cuda::memory_order order) { + if (this->multimem) { + #if __CUDA_ARCH__ >= 900 + if (this->coop.thread_rank() == 0) { + cuda::atomic_ref inbox(*this->mcInbox(/*multimem=*/false)); + #pragma unroll 1 + while (true) { + uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order)); + if (got - (this->epoch + this->team.nRanks) <= uint32_t(-1)>>1) break; + } + this->epoch += this->team.nRanks; + } + #endif + } else { + #pragma unroll 1 + for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) { + int peer = i + (this->team.rank <= i ? 1 : 0); + cuda::atomic_ref inbox(*this->ucInbox(this->team.rank, peer)); + #pragma unroll 1 + while (true) { + uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order)); + if (got - (this->epoch + 1) <= uint32_t(-1)>>1) break; + } + } + this->epoch += 1; + } + this->coop.sync(); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE void ncclLsaBarrierSession::sync(Coop coop, cuda::memory_order order) { + this->arrive(coop, order); + this->wait(coop, order); +} +#endif + +#endif // _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_ diff --git a/src/include/nccl_device/impl/mem_barrier__types.h b/src/include/nccl_device/impl/mem_barrier__types.h new file mode 100644 index 000000000..8498cd6ba --- /dev/null +++ b/src/include/nccl_device/impl/mem_barrier__types.h @@ -0,0 +1,46 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_MEM_BARRIER__TYPES_H_ +#define _NCCL_DEVICE_MEM_BARRIER__TYPES_H_ +#include "../mem_barrier.h" +#include "core__types.h" + +struct ncclLsaBarrierHandle { + ncclDevResourceHandle_t bufHandle; + int nBarriers; +}; + +#if __CUDACC__ +template +struct ncclLsaBarrierSession_internal { + Coop coop; + ncclDevComm const& comm; + ncclTeam team; + ncclLsaBarrierHandle handle; + int index; + bool multimem; + ncclMultimemHandle mmHandle; + uint32_t epoch; + + NCCL_DEVICE_INLINE uint32_t* mcInbox(bool multimem) { + uint32_t* state; + if (multimem) { // multicast + state = (uint32_t*)ncclGetResourceBufferMultimemPointer(comm, handle.bufHandle, mmHandle); + } else { // unicast + state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle); + } + return state + 2*handle.nBarriers + index; + } + + NCCL_DEVICE_INLINE uint32_t* ucInbox(int owner, int peer) { + uint32_t* state = (uint32_t*)ncclGetResourceBufferPeerPointer(comm, handle.bufHandle, team, owner); + return state + 3*handle.nBarriers + index*team.nRanks + peer; + } +}; +#endif + +#endif // _NCCL_DEVICE_MEM_BARRIER__TYPES_H_ diff --git a/src/include/nccl_device/impl/ptr__funcs.h b/src/include/nccl_device/impl/ptr__funcs.h new file mode 100644 index 000000000..ef33634e4 --- /dev/null +++ b/src/include/nccl_device/impl/ptr__funcs.h @@ -0,0 +1,157 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_PTR__FUNCS_H_ +#define _NCCL_DEVICE_PTR__FUNCS_H_ +#include "ptr__types.h" +#include "core__funcs.h" +#include "comm__types.h" + +#if __cplusplus + +template +NCCL_HOST_DEVICE_INLINE constexpr ncclSymPtr::ncclSymPtr(ncclWindow_t window, size_t offset): + window(window), offset(offset) { +} + +template +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr::operator ncclSymPtr() const { + return {window, offset}; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(int d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(unsigned int d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(long d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(unsigned long d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(long long d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator+=(unsigned long long d) { + offset = reinterpret_cast(reinterpret_cast(offset) + d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(int d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(unsigned int d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& 
ncclSymPtr::operator-=(long d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(unsigned long d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(long long d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr& ncclSymPtr::operator-=(unsigned long long d) { + offset = reinterpret_cast(reinterpret_cast(offset) - d); + return *this; +} + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::localPtr() const { + return (T*)ncclGetLocalPointer(window, offset); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::lsaPtr(int peer) const { + return (T*)ncclGetLsaPointer(window, offset, peer); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::peerPtr(int peer) const { + return (T*)ncclGetPeerPointer(window, offset, peer); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::peerPtr(ncclTeam team, int peer) const { + return (T*)ncclGetPeerPointer(window, offset, team, peer); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::multimemPtr(ncclMultimemHandle mmHandle) const { + return (T*)ncclGetMultimemPointer(window, offset, mmHandle); +} +#endif + +#if __CUDACC__ +template +NCCL_DEVICE_INLINE T* ncclSymPtr::lsaMultimemPtr(ncclDevComm const& comm) const { + return (T*)ncclGetLsaMultimemPointer(window, offset, comm); +} +#endif + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator+(ncclSymPtr p, Int d) { + return p += d; +} +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator-(ncclSymPtr p, Int d) { + return p -= d; +} +template +NCCL_HOST_DEVICE_INLINE ptrdiff_t operator-(ncclSymPtr a, ncclSymPtr b) { + return reinterpret_cast(a.offset) - reinterpret_cast(b.offset); +} + +template +NCCL_HOST_DEVICE_INLINE bool operator==(ncclSymPtr a, ncclSymPtr b) { + return a.window == b.window && a.offset == b.offset; +} +template +NCCL_HOST_DEVICE_INLINE bool operator!=(ncclSymPtr a, ncclSymPtr b) { + return a.window != b.window || a.offset != b.offset; +} + +#endif // __cplusplus +#endif // _NCCL_DEVICE_PTR__FUNCS_H_ diff --git a/src/include/nccl_device/impl/ptr__types.h b/src/include/nccl_device/impl/ptr__types.h new file mode 100644 index 000000000..3f9a1a0f8 --- /dev/null +++ b/src/include/nccl_device/impl/ptr__types.h @@ -0,0 +1,11 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_PTR__TYPES_H_ +#define _NCCL_DEVICE_PTR__TYPES_H_ +#include "../ptr.h" +#include "core__types.h" +#endif // _NCCL_DEVICE_PTR__TYPES_H_ diff --git a/src/include/nccl_device/ll_a2a.h b/src/include/nccl_device/ll_a2a.h new file mode 100644 index 000000000..db3a517b7 --- /dev/null +++ b/src/include/nccl_device/ll_a2a.h @@ -0,0 +1,53 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_LL_A2A_H_ +#define _NCCL_DEVICE_LL_A2A_H_ +#include "impl/core__types.h" + +struct ncclLLA2AHandle; + +NCCL_EXTERN_C __host__ int ncclLLA2ACalcSlots(int maxElts, int maxEltSize); + +NCCL_EXTERN_C __host__ ncclResult_t ncclLLA2ACreateRequirement(int nBlocks, int nSlots, ncclLLA2AHandle_t* outHandle, ncclDevResourceRequirements_t* outReq); + +#if __CUDACC__ +template +struct ncclLLA2ASession_internal; + +template +struct ncclLLA2ASession: ncclLLA2ASession_internal { + NCCL_DEVICE_INLINE ncclLLA2ASession(Coop, ncclDevComm const&, ncclTeam, ncclLLA2AHandle, uint32_t block, int maxElts, bool multimem=false, ncclMultimemHandle mmHandle={}); + + NCCL_DEVICE_INLINE ~ncclLLA2ASession(); + + ncclLLA2ASession(ncclLLA2ASession const&) = delete; // Sessions are not copyable + + template + NCCL_DEVICE_INLINE void send(int peer, int slot, T data); + + template + NCCL_DEVICE_INLINE void bcast(int slot, T data); + + template + NCCL_DEVICE_INLINE T recv(int slot); + + template + NCCL_DEVICE_INLINE void recvUnrolled(int eltStart, int eltCount, int eltStride, T(&vals)[MaxEltCount]); + + template + NCCL_DEVICE_INLINE auto recvReduce(int eltStart, int eltCount, int eltStride, EltToAcc eltToAcc, Reduce red) + -> decltype(eltToAcc(nccl::utility::declval())) ; + + // End an alltoall region. For every peer in team you must have done both of the + // following each of which can be accomplished using any thread in coop: + // 1. Targeted that peer with at least one send(). + // 2. Received from a slot targeted by that peer. + NCCL_DEVICE_INLINE void endEpoch(Coop); +}; +#endif + +#endif // _NCCL_DEVICE_LL_A2A_H_ diff --git a/src/include/nccl_device/mem_barrier.h b/src/include/nccl_device/mem_barrier.h new file mode 100644 index 000000000..ea90cc6f6 --- /dev/null +++ b/src/include/nccl_device/mem_barrier.h @@ -0,0 +1,35 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_MEM_BARRIER_H_ +#define _NCCL_DEVICE_MEM_BARRIER_H_ +#include "impl/core__types.h" + +struct ncclLsaBarrierHandle; + +NCCL_EXTERN_C __host__ ncclResult_t ncclLsaBarrierCreateRequirement(ncclTeam_t, int nBarriers, ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq); + +#if __CUDACC__ +template +struct ncclLsaBarrierSession_internal; + +template +struct ncclLsaBarrierSession: ncclLsaBarrierSession_internal { + NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeam, ncclLsaBarrierHandle, uint32_t index, bool multimem=false, ncclMultimemHandle mmHandle={}); + + NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeamTagLsa, uint32_t index, bool multimem=false); + + NCCL_DEVICE_INLINE ~ncclLsaBarrierSession(); + + ncclLsaBarrierSession(ncclLsaBarrierSession const&) = delete; // Sessions are not copyable + + NCCL_DEVICE_INLINE void arrive(Coop, cuda::memory_order); + NCCL_DEVICE_INLINE void wait(Coop, cuda::memory_order); + NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order); +}; +#endif + +#endif // _NCCL_DEVICE_MEM_BARRIER_H_ diff --git a/src/include/nccl_device/ptr.h b/src/include/nccl_device/ptr.h new file mode 100644 index 000000000..4b8914c88 --- /dev/null +++ b/src/include/nccl_device/ptr.h @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_PTR_H_ +#define _NCCL_DEVICE_PTR_H_ +#include "core.h" +#include + +#if __cplusplus +template +struct ncclSymPtr { + using ElementType = T; + ncclWindow_t window; + size_t offset; + + NCCL_HOST_DEVICE_INLINE constexpr ncclSymPtr(ncclWindow_t window=nullptr, size_t offset=0); + + template + NCCL_HOST_DEVICE_INLINE operator ncclSymPtr() const; + + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(int d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(unsigned int d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(unsigned long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(long long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator+=(unsigned long long d); + + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(int d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(unsigned int d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(unsigned long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(long long d); + NCCL_HOST_DEVICE_INLINE ncclSymPtr& operator-=(unsigned long long d); + + #if __CUDACC__ + NCCL_DEVICE_INLINE T* localPtr() const; + NCCL_DEVICE_INLINE T* lsaPtr(int peer) const; + NCCL_DEVICE_INLINE T* peerPtr(int peer) const; + NCCL_DEVICE_INLINE T* peerPtr(ncclTeam team, int peer) const; + NCCL_DEVICE_INLINE T* multimemPtr(ncclMultimemHandle mmHandle) const; + NCCL_DEVICE_INLINE T* lsaMultimemPtr(ncclDevComm const&) const; + #endif +}; + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator+(ncclSymPtr p, Int d); +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator-(ncclSymPtr p, Int d); +template +NCCL_HOST_DEVICE_INLINE ptrdiff_t operator-(ncclSymPtr a, ncclSymPtr b); + +template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator==(ncclSymPtr a, ncclSymPtr b); 
+template +NCCL_HOST_DEVICE_INLINE ncclSymPtr operator!=(ncclSymPtr a, ncclSymPtr b); +#endif + +#endif diff --git a/src/include/nccl_device/utility.h b/src/include/nccl_device/utility.h new file mode 100644 index 000000000..b98a0d973 --- /dev/null +++ b/src/include/nccl_device/utility.h @@ -0,0 +1,352 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_UTILITY_H_ +#define _NCCL_DEVICE_UTILITY_H_ + +#if __CUDACC__ + #define NCCL_DEVICE_INLINE __device__ __forceinline__ + #define NCCL_HOST_DEVICE_INLINE __host__ __device__ __forceinline__ +#else + #ifndef __host__ + #define __host__ + #endif + #define NCCL_DEVICE_INLINE + #define NCCL_HOST_DEVICE_INLINE inline __attribute__((always_inline)) +#endif + +#if __cplusplus +#define NCCL_EXTERN_C extern "C" +#else +#define NCCL_EXTERN_C +#endif + +#include +#include + +#if __CUDACC__ +#include +#endif + +#if __cplusplus +namespace nccl { +namespace utility { + +template +T&& declval() noexcept { + static_assert(sizeof(T)!=sizeof(T), "You can't evaluate declval."); +} + +template +NCCL_HOST_DEVICE_INLINE constexpr Z divUp(X x, Y y) { + return (x+y-1)/y; +} + +template +NCCL_HOST_DEVICE_INLINE constexpr Z roundUp(X x, Y y) { + return (x+y-1) - (x+y-1)%y; +} +template +NCCL_HOST_DEVICE_INLINE constexpr Z roundDown(X x, Y y) { + return x - x%y; +} + +// assumes second argument is a power of 2 +template +NCCL_HOST_DEVICE_INLINE constexpr Z alignUp(X x, Y a) { + return (x + a-1) & -Z(a); +} +template +NCCL_HOST_DEVICE_INLINE T* alignUp(T* x, size_t a) { + static_assert(sizeof(T) == 1, "Only single byte types allowed."); + return reinterpret_cast((reinterpret_cast(x) + a-1) & -uintptr_t(a)); +} +template +NCCL_HOST_DEVICE_INLINE void* alignUp(void const* x, size_t a) { + return reinterpret_cast((reinterpret_cast(x) + a-1) & -uintptr_t(a)); +} + +// assumes second argument is a power of 2 +template +NCCL_HOST_DEVICE_INLINE constexpr Z alignDown(X x, Y a) { + return x & -Z(a); +} +template +NCCL_HOST_DEVICE_INLINE T* alignDown(T* x, size_t a) { + static_assert(sizeof(T) == 1, "Only single byte types allowed."); + return reinterpret_cast(reinterpret_cast(x) & -uintptr_t(a)); +} +template +NCCL_HOST_DEVICE_INLINE void* alignDown(void const* x, size_t a) { + return reinterpret_cast(reinterpret_cast(x) & -uintptr_t(a)); +} + +template +NCCL_HOST_DEVICE_INLINE T add4G(T base, int delta4G) { + union { uint32_t u32[2]; T tmp; }; + tmp = base; + u32[1] += delta4G; + return tmp; +} + + +template +NCCL_HOST_DEVICE_INLINE constexpr bool isPow2(Int x) { + return (x & (x-1)) == 0; +} + +// Produce the reciprocal of x for use in idivByRcp +NCCL_HOST_DEVICE_INLINE constexpr uint32_t idivRcp32(uint32_t x) { + return uint32_t(-1)/x + isPow2(x); +} +NCCL_HOST_DEVICE_INLINE constexpr uint64_t idivRcp64(uint64_t x) { + return uint64_t(-1)/x + isPow2(x); +} + +NCCL_HOST_DEVICE_INLINE uint32_t mul32hi(uint32_t a, uint32_t b) { +#if __CUDA_ARCH__ + return __umulhi(a, b); +#else + return uint64_t(a)*b >> 32; +#endif +} +NCCL_HOST_DEVICE_INLINE uint64_t mul64hi(uint64_t a, uint64_t b) { +#if __CUDA_ARCH__ + return __umul64hi(a, b); +#else + return (uint64_t)(((unsigned __int128)a)*b >> 64); +#endif +} + +// Produce the reciprocal of x*y given their respective reciprocals. This incurs +// no integer division on device. 
+NCCL_HOST_DEVICE_INLINE uint32_t imulRcp32(uint32_t x, uint32_t xrcp, uint32_t y, uint32_t yrcp) {
+  if (xrcp == 0) return yrcp;
+  if (yrcp == 0) return xrcp;
+  uint32_t rcp = mul32hi(xrcp, yrcp);
+  uint32_t rem = -x*y*rcp;
+  if (x*y <= rem) rcp += 1;
+  return rcp;
+}
+NCCL_HOST_DEVICE_INLINE uint64_t imulRcp64(uint64_t x, uint64_t xrcp, uint64_t y, uint64_t yrcp) {
+  if (xrcp == 0) return yrcp;
+  if (yrcp == 0) return xrcp;
+  uint64_t rcp = mul64hi(xrcp, yrcp);
+  uint64_t rem = -x*y*rcp;
+  if (x*y <= rem) rcp += 1;
+  return rcp;
+}
+
+// Fast unsigned integer division where the divisor has a precomputed reciprocal.
+// idivFast(x, y, idivRcp(y)) == x/y
+NCCL_HOST_DEVICE_INLINE void idivmodFast32(uint32_t *quo, uint32_t *rem, uint32_t x, uint32_t y, uint32_t yrcp) {
+  uint32_t q = yrcp == 0 ? x : mul32hi(x, yrcp);
+  uint32_t r = x - y*q;
+  if (r >= y) { q += 1; r -= y; }
+  *quo = q;
+  *rem = r;
+}
+NCCL_HOST_DEVICE_INLINE void idivmodFast64(uint64_t *quo, uint64_t *rem, uint64_t x, uint64_t y, uint64_t yrcp) {
+  uint64_t q = yrcp == 0 ? x : mul64hi(x, yrcp);
+  uint64_t r = x - y*q;
+  if (r >= y) { q += 1; r -= y; }
+  *quo = q;
+  *rem = r;
+}
+
+NCCL_HOST_DEVICE_INLINE uint32_t idivFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
+  uint32_t q, r;
+  idivmodFast32(&q, &r, x, y, yrcp);
+  return q;
+}
+NCCL_HOST_DEVICE_INLINE uint64_t idivFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
+  uint64_t q, r;
+  idivmodFast64(&q, &r, x, y, yrcp);
+  return q;
+}
+
+NCCL_HOST_DEVICE_INLINE uint32_t imodFast32(uint32_t x, uint32_t y, uint32_t yrcp) {
+  uint32_t q, r;
+  idivmodFast32(&q, &r, x, y, yrcp);
+  return r;
+}
+NCCL_HOST_DEVICE_INLINE uint64_t imodFast64(uint64_t x, uint64_t y, uint64_t yrcp) {
+  uint64_t q, r;
+  idivmodFast64(&q, &r, x, y, yrcp);
+  return r;
+}
+
+#if __CUDACC__
+// Precomputed integer reciprocals for denominator values 1..64 inclusive.
+// Pass these to idivFast64() for fast division on the GPU.
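// Hedged usage sketch (editorial illustration, not part of this header): with the
// reciprocal computed once, a divide-and-modulo on the GPU becomes two multiplies
// plus a correction step. For example, splitting a flat index across nChannels:
//
//   uint64_t rcp = idivRcp64(nChannels);          // once, on host or device
//   uint64_t chunk, chan;
//   idivmodFast64(&chunk, &chan, index, nChannels, rcp);
//   // chunk == index / nChannels, chan == index % nChannels
//
// The table below serves the same purpose for small divisors (<= 64) without
// evaluating the 64-bit division in idivRcp64() at run time.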
+NCCL_DEVICE_INLINE uint64_t idivRcp64_upto64(int x) { + static constexpr uint64_t table[65] = { + idivRcp64(0x01), idivRcp64(0x01), idivRcp64(0x02), idivRcp64(0x03), + idivRcp64(0x04), idivRcp64(0x05), idivRcp64(0x06), idivRcp64(0x07), + idivRcp64(0x08), idivRcp64(0x09), idivRcp64(0x0a), idivRcp64(0x0b), + idivRcp64(0x0c), idivRcp64(0x0d), idivRcp64(0x0e), idivRcp64(0x0f), + idivRcp64(0x10), idivRcp64(0x11), idivRcp64(0x12), idivRcp64(0x13), + idivRcp64(0x14), idivRcp64(0x15), idivRcp64(0x16), idivRcp64(0x17), + idivRcp64(0x18), idivRcp64(0x19), idivRcp64(0x1a), idivRcp64(0x1b), + idivRcp64(0x1c), idivRcp64(0x1d), idivRcp64(0x1e), idivRcp64(0x1f), + idivRcp64(0x20), idivRcp64(0x21), idivRcp64(0x22), idivRcp64(0x23), + idivRcp64(0x24), idivRcp64(0x25), idivRcp64(0x26), idivRcp64(0x27), + idivRcp64(0x28), idivRcp64(0x29), idivRcp64(0x2a), idivRcp64(0x2b), + idivRcp64(0x2c), idivRcp64(0x2d), idivRcp64(0x2e), idivRcp64(0x2f), + idivRcp64(0x30), idivRcp64(0x31), idivRcp64(0x32), idivRcp64(0x33), + idivRcp64(0x34), idivRcp64(0x35), idivRcp64(0x36), idivRcp64(0x37), + idivRcp64(0x38), idivRcp64(0x39), idivRcp64(0x3a), idivRcp64(0x3b), + idivRcp64(0x3c), idivRcp64(0x3d), idivRcp64(0x3e), idivRcp64(0x3f), + idivRcp64(0x40) + }; + return table[x]; +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE uint32_t idivRcp32_upto64(int x) { + return idivRcp64_upto64(x)>>32; +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE void fenceAcquireGpu() { + static __device__ int dummy; + int tmp; + asm volatile("ld.acquire.gpu.s32 %0,[%1];" : "=r"(tmp) : "l"(&dummy) : "memory"); + dummy = tmp; +} +NCCL_DEVICE_INLINE void fenceReleaseGpu() { + cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device); +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE cuda::memory_order acquireOrderOf(cuda::memory_order ord) { + return ord == cuda::memory_order_release ? cuda::memory_order_relaxed : + ord == cuda::memory_order_acq_rel ? cuda::memory_order_acquire : + ord; +} +NCCL_DEVICE_INLINE cuda::memory_order releaseOrderOf(cuda::memory_order ord) { + return ord == cuda::memory_order_acquire ? cuda::memory_order_relaxed : + ord == cuda::memory_order_acq_rel ? cuda::memory_order_release : + ord; +} +#endif + +#if __CUDACC__ +NCCL_DEVICE_INLINE int lane() { + int ret; + asm("mov.u32 %0, %%laneid;" : "=r"(ret)); + return ret; +} +NCCL_DEVICE_INLINE unsigned int lanemask_lt() { + unsigned int ret; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret)); + return ret; +} +#endif + +#if __CUDACC__ +// Load anything, but cache like its constant memory. 
+template +NCCL_DEVICE_INLINE T loadConst(T const *p) { + if (alignof(T) == 1) { + union { uint8_t part[sizeof(T)]; T ret; }; + for (int i=0; i < (int)sizeof(T); i++) part[i] = __ldg((uint8_t const*)p + i); + return ret; + } else if (alignof(T) == 2) { + union { uint16_t part[sizeof(T)/2]; T ret; }; + for (int i=0; i < (int)sizeof(T)/2; i++) part[i] = __ldg((uint16_t const*)p + i); + return ret; + } else if (alignof(T) == 4) { + union { uint32_t part[sizeof(T)/4]; T ret; }; + for (int i=0; i < (int)sizeof(T)/4; i++) part[i] = __ldg((uint32_t const*)p + i); + return ret; + } else if (alignof(T) == 8) { + union { uint64_t part[sizeof(T)/8]; T ret; }; + for (int i=0; i < (int)sizeof(T)/8; i++) part[i] = __ldg((uint64_t const*)p + i); + return ret; + } else { // alignof(T) >= 16 + union { ulonglong2 part[sizeof(T)/16]; T ret; }; + for (int i=0; i < (int)sizeof(T)/16; i++) part[i] = __ldg((ulonglong2 const*)p + i); + return ret; + } +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Optional: Holds a T that may or may not be constructed. An Optional +// constructed with a Present will have its T constructed via the +// T::T(Arg...) constructor. An Optional constructed with a Absent will not +// have its T constructed. + +template +struct IntSeq {}; + +template +struct IntSeqUpTo: IntSeqUpTo {}; +template +struct IntSeqUpTo { using Type = IntSeq; }; + +// Present: Packs a list of arguments together to be passed to Optional. +template +struct Present; +template<> +struct Present<> {}; +template +struct Present { + H h; + Present t; + + NCCL_HOST_DEVICE_INLINE H get(IntSeq<0>) { + return static_cast(h); + } + template + NCCL_HOST_DEVICE_INLINE decltype(auto) get(IntSeq) { + return t.get(IntSeq{}); + } +}; + +NCCL_HOST_DEVICE_INLINE Present<> present() { + return Present<>{}; +} +template +NCCL_HOST_DEVICE_INLINE Present present(H&& h, T&& ...t) { + return Present{static_cast(h), present(static_cast(t)...)}; +} + +struct Absent {}; + +template +struct Optional { + bool present; // Is `thing` constructed. + union { T thing; }; + + // Construct with absent thing: + NCCL_HOST_DEVICE_INLINE constexpr Optional(): present(false) {} + NCCL_HOST_DEVICE_INLINE constexpr Optional(Absent): present(false) {} + + // Helper constructor + template + NCCL_HOST_DEVICE_INLINE Optional(Present args, IntSeq): + present(true), + thing{args.get(IntSeq())...} { + } + // Construct with present thing: + template + NCCL_HOST_DEVICE_INLINE Optional(Present args): + Optional(args, IntSeqUpTo::Type()) { + } + + NCCL_HOST_DEVICE_INLINE ~Optional() { + if (present) thing.~T(); + } +}; + +}} +#endif // __cplusplus +#endif diff --git a/src/include/net.h b/src/include/net.h index 552e9bcb4..f13eebb06 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -12,10 +12,16 @@ #include "comm.h" #include "checks.h" +#define NCCL_UNDEF_DEV_COUNT -1 + typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; ncclResult_t ncclNetInit(struct ncclComm* comm); ncclResult_t ncclNetFinalize(struct ncclComm* comm); +ncclResult_t ncclNetGetDevCount(int netPluginIndex, int* nPhysDev, int* nVirtDev); +ncclResult_t ncclNetSetVirtDevCount(int netPluginIndex, int nVirtDev); +ncclResult_t ncclCollNetGetDevCount(int netPluginIndex, int* nPhysDev, int* nVirtDev); +ncclResult_t ncclCollNetSetVirtDevCount(int netPluginIndex, int nVirtDev); // Test whether the current GPU support GPU Direct RDMA. 
ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); diff --git a/src/include/net_device.h b/src/include/net_device.h index c3a79e35c..99ae9c38b 100644 --- a/src/include/net_device.h +++ b/src/include/net_device.h @@ -12,7 +12,7 @@ // Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. -#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 +#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; @@ -27,6 +27,7 @@ typedef struct { typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t; typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t; typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t; -typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_t; +typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t; +typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t; #endif diff --git a/src/include/nvmlwrap.h b/src/include/nvmlwrap.h index 72fbf9ce2..ce8925ef9 100644 --- a/src/include/nvmlwrap.h +++ b/src/include/nvmlwrap.h @@ -253,6 +253,24 @@ typedef nvmlGpuFabricInfo_v2_t nvmlGpuFabricInfoV_t; */ #define nvmlGpuFabricInfo_v2 NVML_STRUCT_VERSION(GpuFabricInfo, 2) +/** + * Structure to store platform information (v2) + */ +typedef struct +{ + unsigned int version; //!< the API version number + unsigned char ibGuid[16]; //!< Infiniband GUID reported by platform (for Blackwell, ibGuid is 8 bytes so indices 8-15 are zero) + unsigned char chassisSerialNumber[16]; //!< Serial number of the chassis containing this GPU (for Blackwell it is 13 bytes so indices 13-15 are zero) + unsigned char slotNumber; //!< The slot number in the chassis containing this GPU (includes switches) + unsigned char trayIndex; //!< The tray index within the compute slots in the chassis containing this GPU (does not include switches) + unsigned char hostId; //!< Index of the node within the slot containing this GPU + unsigned char peerType; //!< Platform indicated NVLink-peer type (e.g. switch present or not) + unsigned char moduleId; //!< ID of this GPU within the node +} nvmlPlatformInfo_v2_t; + +typedef nvmlPlatformInfo_v2_t nvmlPlatformInfo_t; +#define nvmlPlatformInfo_v2 NVML_STRUCT_VERSION(PlatformInfo, 2) + /** * Confidential Compute Feature Status values */ @@ -270,6 +288,7 @@ typedef struct nvmlConfComputeSystemState_st { */ #define NVML_CC_SYSTEM_MULTIGPU_NONE 0 #define NVML_CC_SYSTEM_MULTIGPU_PROTECTED_PCIE 1 +#define NVML_CC_SYSTEM_MULTIGPU_NVLE 2 /** * Confidential Compute System settings @@ -303,6 +322,7 @@ extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMa struct ncclNvmlCCStatus { bool CCEnabled; bool multiGpuProtectedPCIE; + bool multiGpuNVLE; }; // All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. 
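The new nvmlPlatformInfo_v2_t fields describe where a GPU physically sits (chassis serial number, slot, tray, host index, module). A minimal sketch of how a caller might fold a few of these fields into a coarse locality key, e.g. for grouping ranks by chassis and tray, follows; the helper name and key format are illustrative assumptions, not an API defined by this patch.

#include <cstdio>
#include <string>

// Illustrative only: render the raw chassis serial bytes as hex (the field is a
// byte array, not a C string) and append slot/tray to form a grouping key.
static std::string platformLocalityKey(const nvmlPlatformInfo_t& info) {
  char key[96];
  int n = 0;
  for (int i = 0; i < 16 && n < (int)sizeof(key); i++)
    n += snprintf(key + n, sizeof(key) - n, "%02x", info.chassisSerialNumber[i]);
  snprintf(key + n, sizeof(key) - n, "/slot%u/tray%u",
           (unsigned)info.slotNumber, (unsigned)info.trayIndex);
  return std::string(key);
}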
@@ -320,6 +340,7 @@ ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* ma ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo); +ncclResult_t ncclNvmlDeviceGetPlatformInfo(nvmlDevice_t device, nvmlPlatformInfo_t *plaformInfo); ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status); #endif // End include guard diff --git a/src/include/nvtx.h b/src/include/nvtx.h index de50dfe2e..8f20be43d 100644 --- a/src/include/nvtx.h +++ b/src/include/nvtx.h @@ -9,6 +9,8 @@ #include "nvtx3/nvtx3.hpp" +#include "param.h" + #if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) #define NVTX3_CONSTEXPR_IF_CPP14 constexpr #else @@ -32,15 +34,20 @@ #define NVTX_SID_CommSplit 13 #define NVTX_SID_CommFinalize 14 #define NVTX_SID_CommShrink 15 +#define NVTX_SID_AlltoAll 16 +#define NVTX_SID_Gather 17 +#define NVTX_SID_Scatter 18 // When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below! // Define static schema ID for the reduction operation. -#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 16 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START +#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 19 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START extern const nvtxDomainHandle_t ncclNvtxDomainHandle; struct nccl_domain{static constexpr char const* name{"NCCL"};}; +extern int64_t ncclParamNvtxDisable(); + /// @brief Register an NVTX payload schema for static-size payloads. class payload_schema { public: @@ -74,6 +81,32 @@ class payload_schema { nullptr, 0, 0, 0, 0, nullptr}; }; +class ncclOptionalNvtxScopedRange +{ + public: + void push(const nvtx3::event_attributes& attr) noexcept { + // pushed must not be true already, but it's too expensive to check + pushed = true; + nvtxDomainRangePushEx(nvtx3::domain::get(), attr.get()); + } + + ~ncclOptionalNvtxScopedRange() noexcept { + if (!pushed) { + return; + } + nvtxDomainRangePop(nvtx3::domain::get()); + } + + ncclOptionalNvtxScopedRange() = default; + ncclOptionalNvtxScopedRange(ncclOptionalNvtxScopedRange const&) = delete; + ncclOptionalNvtxScopedRange& operator=(ncclOptionalNvtxScopedRange const&) = delete; + ncclOptionalNvtxScopedRange(ncclOptionalNvtxScopedRange&&) = delete; + ncclOptionalNvtxScopedRange& operator=(ncclOptionalNvtxScopedRange&&) = delete; + + private: + bool pushed = false; +}; + // Convenience macro to give the payload parameters a scope. #define NVTX3_PAYLOAD(...) __VA_ARGS__ @@ -81,26 +114,43 @@ class payload_schema { // @param N NCCL API name without the `nccl` prefix. // @param T name of the used NVTX payload schema without "Schema" suffix. 
// @param P payload parameters/entries -#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \ - constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ - static const payload_schema schema{T##Schema, std::extent::value - 1, \ - schemaId, sizeof(T)}; \ - static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - const T _payload = {P}; \ - nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \ - ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ - ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; +#define NVTX3_FUNC_WITH_PARAMS(N, T, P) \ + ncclOptionalNvtxScopedRange nvtx3_range__; \ + if (!ncclParamNvtxDisable()) \ + { \ + constexpr uint64_t schemaId = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ + static const payload_schema \ + schema{T##Schema, std::extent::value - 1, schemaId, sizeof(T)}; \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + const T _payload = {P}; \ + nvtxPayloadData_t nvtx3_bpl__[] = {{schemaId, sizeof(_payload), &_payload}}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ + nvtx3_range__.push(nvtx3_func_attr__); \ + } + +#define NCCL_NVTX3_FUNC_RANGE \ + ncclOptionalNvtxScopedRange nvtx3_range__; \ + if (!ncclParamNvtxDisable()) { \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + nvtx3_range__.push(nvtx3_func_attr__); \ + } /// @brief Creates an NVTX range with extended payload using the RAII pattern. /// @tparam PayloadType Data type of the payload. template -class ncclNvtxRange { +class ncclOptionalNvtxPayloadRange { public: - explicit ncclNvtxRange(const nvtxEventAttributes_t* evtAttr) noexcept { - nvtxDomainRangePushEx(nvtx3::domain::get(), evtAttr); + void push(const nvtx3::event_attributes& attr) noexcept { + // pushed must not be true already, but it's too expensive to check + pushed = true; + nvtxDomainRangePushEx(nvtx3::domain::get(), attr.get()); } - ~ncclNvtxRange() noexcept { + ~ncclOptionalNvtxPayloadRange() noexcept { + if (!pushed) { + return; + } if (payloadData.payload) { nvtxRangePopPayload(nvtx3::domain::get(), &payloadData, 1); } else { @@ -113,25 +163,34 @@ class ncclNvtxRange { payloadData = {schemaId, sizeof(PayloadType), &payload}; } - ncclNvtxRange() = delete; - ncclNvtxRange(ncclNvtxRange const&) = default; - ncclNvtxRange& operator=(ncclNvtxRange const&) = default; - ncclNvtxRange(ncclNvtxRange&&) = default; - ncclNvtxRange& operator=(ncclNvtxRange&&) = default; + ncclOptionalNvtxPayloadRange() = default; + ncclOptionalNvtxPayloadRange(ncclOptionalNvtxPayloadRange const&) = delete; + ncclOptionalNvtxPayloadRange& operator=(ncclOptionalNvtxPayloadRange const&) = delete; + ncclOptionalNvtxPayloadRange(ncclOptionalNvtxPayloadRange&&) = delete; + ncclOptionalNvtxPayloadRange& operator=(ncclOptionalNvtxPayloadRange&&) = delete; // Holds the payload data. PayloadType payload{}; + bool isPushed() const noexcept { + return pushed; + } + private: + bool pushed = false; nvtxPayloadData_t payloadData = {NVTX_PAYLOAD_ENTRY_TYPE_INVALID, 0, NULL}; }; // Create an NVTX range with the function name as the range name. Use RAII pattern. // @param T Type ID of the NVTX payload (pointer for variable-size payloads). 
-#define NVTX3_RANGE(T) \ - static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ - ncclNvtxRange nvtx3_range__{nvtx3_func_attr__.get()}; +#define NVTX3_RANGE(T) \ + ncclOptionalNvtxPayloadRange nvtx3_range__; \ + if (!ncclParamNvtxDisable()) \ + { \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ + nvtx3_range__.push(nvtx3_func_attr__); \ + } // Add static-size payload to the NVTX range created with `NVTX3_RANGE()`, // which must be in this or an outer scope. @@ -139,6 +198,9 @@ class ncclNvtxRange { // @param S name of the used NVTX payload schema. // @param P payload parameters/entries #define NVTX3_RANGE_ADD_PAYLOAD(N, S, P) do { \ + if (!nvtx3_range__.isPushed()) { \ + break; \ + } \ constexpr uint64_t schema_id = NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##N; \ static const payload_schema schema{S, std::extent::value - 1, schema_id, \ sizeof(nvtx3_range__.payload)}; \ diff --git a/src/include/nvtx3/nvToolsExtCounters.h b/src/include/nvtx3/nvToolsExtCounters.h index 00e2b7f8f..e24ab0e04 100644 --- a/src/include/nvtx3/nvToolsExtCounters.h +++ b/src/include/nvtx3/nvToolsExtCounters.h @@ -332,4 +332,4 @@ NVTX_DECLSPEC void NVTX_API nvtxCountersSubmitBatchEx( } #endif /* __cplusplus */ -#endif /* NVTOOLSEXT_COUNTERS_H */ \ No newline at end of file +#endif /* NVTOOLSEXT_COUNTERS_H */ diff --git a/src/include/nvtx3/nvToolsExtSemanticsCounters.h b/src/include/nvtx3/nvToolsExtSemanticsCounters.h index f97624a07..6334bfc5d 100644 --- a/src/include/nvtx3/nvToolsExtSemanticsCounters.h +++ b/src/include/nvtx3/nvToolsExtSemanticsCounters.h @@ -85,4 +85,4 @@ typedef struct nvtxSemanticsCounter_v1 { } limits; } nvtxSemanticsCounter_t; -#endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */ \ No newline at end of file +#endif /* NVTX_SEMANTIC_ID_COUNTERS_V1 */ diff --git a/src/include/nvtx3/nvToolsExtSemanticsScope.h b/src/include/nvtx3/nvToolsExtSemanticsScope.h index eed6f3095..e6d1c5f26 100644 --- a/src/include/nvtx3/nvToolsExtSemanticsScope.h +++ b/src/include/nvtx3/nvToolsExtSemanticsScope.h @@ -27,4 +27,4 @@ typedef struct nvtxSemanticsScope_v1 uint64_t scopeId; } nvtxSemanticsScope_t; -#endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */ \ No newline at end of file +#endif /* NVTX_SEMANTIC_ID_SCOPE_V1 */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h b/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h index 00fc81768..6fca4801b 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h @@ -28,4 +28,4 @@ #define NVTX_EXT_HELPER_UNUSED_ARGS(...) 
\ NVTX_EXT_CONCAT(_NVTX_EXT_VOIDIFY, NVTX_EXT_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) -#endif /* NVTX_EXT_HELPER_MACROS_H */ \ No newline at end of file +#endif /* NVTX_EXT_HELPER_MACROS_H */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h b/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h index 79bb0c1c5..56dcab692 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImpl.h @@ -96,4 +96,4 @@ NVTX_LINKONCE_DEFINE_GLOBAL nvtxExtGlobals1_t NVTX_VERSIONED_IDENTIFIER(nvtxExtG } /* extern "C" */ #endif /* __cplusplus */ -#endif /* NVTX_EXT_IMPL_H */ \ No newline at end of file +#endif /* NVTX_EXT_IMPL_H */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h b/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h index 0f6ff9667..c34fa8392 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h @@ -145,4 +145,4 @@ NVTX_EXT_COUNTERS_IMPL_FN_V1(void, nvtxCountersSubmitBatchEx, } /* extern "C" */ #endif /* __cplusplus */ -#endif /* NVTX_EXT_IMPL_COUNTERS_V1 */ \ No newline at end of file +#endif /* NVTX_EXT_IMPL_COUNTERS_V1 */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h index 71e30bc37..9d07f5b1f 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h @@ -269,4 +269,4 @@ /*** END: Helper for `NVTX_PAYLOAD_STATIC_SCHEMA_{INIT,CREATE}` */ -#endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */ \ No newline at end of file +#endif /* NVTX_EXT_PAYLOAD_HELPER_INTERNAL_H */ diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h index 6a30e6633..eeb227a5a 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h @@ -148,4 +148,4 @@ NVTX_EXT_PAYLOAD_VERSIONED_ID(nvtxExtPayloadTypeInfo)[NVTX_PAYLOAD_ENTRY_TYPE_IN }; #undef nvtx_alignof -#undef nvtx_alignof2 \ No newline at end of file +#undef nvtx_alignof2 diff --git a/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h b/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h index bcad095a0..6be0ac796 100644 --- a/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h +++ b/src/include/nvtx3/nvtxDetail/nvtxExtTypes.h @@ -41,4 +41,4 @@ typedef struct nvtxExtModuleInfo_t typedef int (NVTX_API * NvtxExtInitializeInjectionFunc_t)(nvtxExtModuleInfo_t* moduleInfo); -#endif /* NVTXEXTTYPES_H */ \ No newline at end of file +#endif /* NVTXEXTTYPES_H */ diff --git a/src/include/nvtx_payload_schemas.h b/src/include/nvtx_payload_schemas.h index 89a41d4b5..587c1a2a4 100644 --- a/src/include/nvtx_payload_schemas.h +++ b/src/include/nvtx_payload_schemas.h @@ -90,6 +90,13 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllGather, static cons ) ) +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAlltoAll, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr) + ) +) + NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsAllReduce, static constexpr, NCCL_NVTX_PAYLOAD_ENTRIES( (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), @@ -106,6 +113,14 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsBroadcast, static cons ) ) +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsGather, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, 
TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root") + ) +) + NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduce, static constexpr, NCCL_NVTX_PAYLOAD_ENTRIES( (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), @@ -123,6 +138,14 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsReduceScatter, static ) ) +NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsScatter, static constexpr, + NCCL_NVTX_PAYLOAD_ENTRIES( + (uint64_t, comm, TYPE_UINT64, nccl_nvtxCommStr), + (size_t, bytes, TYPE_SIZE, nccl_nvtxMsgSizeStr), + (int, root, TYPE_INT, "Root") + ) +) + // Used in NCCL APIs `ncclSend` and `ncclRecv`. NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsSendRecv, static constexpr, NCCL_NVTX_PAYLOAD_ENTRIES( diff --git a/src/include/plugin/nccl_net.h b/src/include/plugin/nccl_net.h index 18d1486d7..d92a21b4e 100644 --- a/src/include/plugin/nccl_net.h +++ b/src/include/plugin/nccl_net.h @@ -16,6 +16,7 @@ //Maximum value NCCL can accept for maxP2pBytes and maxCollBytes net properties #define NCCL_MAX_NET_SIZE_BYTES (1*1024*1024*1024*1024L) #define NCCL_NET_OPTIONAL_RECV_COMPLETION 0x1 +#define NCCL_NET_MULTI_REQUEST 0x2 #define MAX_NET_SIZE (1024*1024*1024L) // Rather than send INT_MAX which is 2G-1, send a power of two. #define MAX_COLLNET_SIZE (512*1024*1024L) //Set for initial collent plugins when size was not dynamically queried @@ -32,22 +33,24 @@ #define NCCL_NET_MAX_PLUGINS 16 #endif +#define NCCL_NET_MAX_DEVS_PER_NIC 4 + +#include "net/net_v11.h" #include "net/net_v10.h" #include "net/net_v9.h" #include "net/net_v8.h" #include "net/net_v7.h" #include "net/net_v6.h" -typedef ncclNet_v10_t ncclNet_t; -typedef ncclCollNet_v10_t ncclCollNet_t; -typedef ncclNetSGE_v10_t ncclNetSGE_t; -typedef ncclNetProperties_v10_t ncclNetProperties_t; -typedef ncclNetVDeviceProps_v10_t ncclNetVDeviceProps_t; -typedef ncclNetCommConfig_v10_t ncclNetCommConfig_t; - -#define NCCL_NET_MAX_DEVS_PER_NIC NCCL_NET_MAX_DEVS_PER_NIC_V10 +typedef ncclNet_v11_t ncclNet_t; +typedef ncclCollNet_v11_t ncclCollNet_t; +typedef ncclNetSGE_v11_t ncclNetSGE_t; +typedef ncclNetProperties_v11_t ncclNetProperties_t; +typedef ncclNetAttr_v11_t ncclNetAttr_t; +typedef ncclNetVDeviceProps_v11_t ncclNetVDeviceProps_t; +typedef ncclNetCommConfig_v11_t ncclNetCommConfig_t; -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v10 -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v10 +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v11 +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v11 #endif // end include guard diff --git a/src/include/plugin/nccl_profiler.h b/src/include/plugin/nccl_profiler.h index 710aac4d5..5ce77c3ee 100644 --- a/src/include/plugin/nccl_profiler.h +++ b/src/include/plugin/nccl_profiler.h @@ -8,14 +8,18 @@ #define NCCL_PROFILER_H_ enum { - ncclProfileGroup = (1 << 0), // group event type - ncclProfileColl = (1 << 1), // host collective call event type - ncclProfileP2p = (1 << 2), // host point-to-point call event type - ncclProfileProxyOp = (1 << 3), // proxy operation event type - ncclProfileProxyStep = (1 << 4), // proxy step event type - ncclProfileProxyCtrl = (1 << 5), // proxy control event type - ncclProfileKernelCh = (1 << 6), // kernel channel event type - ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroup = (1 << 0), // group event type + ncclProfileColl = (1 << 1), // host collective call event type + ncclProfileP2p = (1 << 2), // host point-to-point call event type + 
ncclProfileProxyOp = (1 << 3), // proxy operation event type + ncclProfileProxyStep = (1 << 4), // proxy step event type + ncclProfileProxyCtrl = (1 << 5), // proxy control event type + ncclProfileKernelCh = (1 << 6), // kernel channel event type + ncclProfileNetPlugin = (1 << 7), // network plugin-defined, events + ncclProfileGroupApi = (1 << 8), // Group API events + ncclProfileCollApi = (1 << 9), // Collective API events + ncclProfileP2pApi = (1 << 10), // Point-to-Point API events + ncclProfileKernelLaunch = (1 << 11), // Kernel launch events }; typedef enum { @@ -50,22 +54,28 @@ typedef enum { /* Kernel event states */ ncclProfilerKernelChStop = 22, + + /* Group API States */ + ncclProfilerGroupStartApiStop = 23, + ncclProfilerGroupEndApiStart = 24 } ncclProfilerEventState_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v1_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v2_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v3_t; typedef ncclProfilerEventState_t ncclProfilerEventState_v4_t; +typedef ncclProfilerEventState_t ncclProfilerEventState_v5_t; #include +#include "profiler/profiler_v5.h" #include "profiler/profiler_v4.h" #include "profiler/profiler_v3.h" #include "profiler/profiler_v2.h" #include "profiler/profiler_v1.h" -typedef ncclProfiler_v4_t ncclProfiler_t; -typedef ncclProfilerEventDescr_v4_t ncclProfilerEventDescr_t; -typedef ncclProfilerEventStateArgs_v4_t ncclProfilerEventStateArgs_t; +typedef ncclProfiler_v5_t ncclProfiler_t; +typedef ncclProfilerEventDescr_v5_t ncclProfilerEventDescr_t; +typedef ncclProfilerEventStateArgs_v5_t ncclProfilerEventStateArgs_t; #define NCCL_PROFILER_NET_VER_BITS (16) #define NCCL_PROFILER_NET_VER_MASK (~0U >> NCCL_PROFILER_NET_VER_BITS) diff --git a/src/include/plugin/nccl_tuner.h b/src/include/plugin/nccl_tuner.h index f2401890d..fbd87b58f 100644 --- a/src/include/plugin/nccl_tuner.h +++ b/src/include/plugin/nccl_tuner.h @@ -11,12 +11,49 @@ #include "nccl.h" #include "nccl_common.h" +#include "tuner/tuner_v5.h" #include "tuner/tuner_v4.h" #include "tuner/tuner_v3.h" #include "tuner/tuner_v2.h" -typedef ncclTuner_v4_t ncclTuner_t; +typedef ncclTuner_v5_t ncclTuner_t; +typedef ncclTunerConstants_v5_t ncclTunerConstants_t; +typedef ncclNvlDomainInfo_v5_t ncclNvlDomainInfo_t; -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v4" +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v5" + +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 +#define NCCL_ALGO_PAT 6 +#define NCCL_NUM_ALGORITHMS NCCL_NUM_ALGORITHMS_V5 // Tree/Ring/CollNet*/PAT + +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 +#define NCCL_NUM_PROTOCOLS NCCL_NUM_PROTOCOLS_V5 // Simple/LL/LL128 + +#define NCCL_ALGO_PROTO_IGNORE -1.0 + +#define NCCL_HW_NVLINK 0 +#define NCCL_HW_PCI 1 +#define NCCL_HW_NET 2 +#define NCCL_NUM_HW_LINKS NCCL_NUM_HW_LINKS_V5 + +#define NCCL_VOLTA_COMPCAP_IDX 0 +#define NCCL_AMPERE_COMPCAP_IDX 1 +#define NCCL_HOPPER_COMPCAP_IDX 2 +#define NCCL_BLACKWELL_COMPCAP_IDX 3 +#define NCCL_NUM_COMPCAPS NCCL_NUM_COMPCAPS_V5 + +#define NCCL_TUNING_SCALE_1NODE 0 +#define NCCL_TUNING_SCALE_2NODES 1 +#define NCCL_TUNING_SCALE_4NODES 2 +#define NCCL_NUM_TUNING_SCALES NCCL_NUM_TUNING_SCALES_V5 #endif diff --git a/src/include/plugin/net/net_v10.h b/src/include/plugin/net/net_v10.h index ada6d482e..2e9187b0a 100644 --- 
a/src/include/plugin/net/net_v10.h +++ b/src/include/plugin/net/net_v10.h @@ -5,11 +5,9 @@ #ifndef NET_V10_H_ #define NET_V10_H_ -#define NCCL_NET_MAX_DEVS_PER_NIC_V10 4 - typedef struct { int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V10]; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; } ncclNetVDeviceProps_v10_t; #define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 diff --git a/src/include/plugin/net/net_v11.h b/src/include/plugin/net/net_v11.h new file mode 100644 index 000000000..68e100637 --- /dev/null +++ b/src/include/plugin/net/net_v11.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + */ + +#ifndef NET_V11_H_ +#define NET_V11_H_ + +typedef struct { + int ndevs; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; +} ncclNetVDeviceProps_v11_t; + +#define NCCL_NET_TRAFFIC_CLASS_UNDEF -1 + +typedef struct { + // Plugin-specific TC value + int trafficClass; +} ncclNetCommConfig_v11_t; + + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int regIsGlobal; // regMr is not tied to a particular comm + int forceFlush; // Force a flush on receives + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + ncclNetVDeviceProps_v11_t vProps; + size_t maxP2pBytes; // Max transfer size for point-to-point operations + size_t maxCollBytes; // Max transfer size for collective operations + int maxMultiRequestSize; // Maximum number of requests supported in a single multi-request. +} ncclNetProperties_v11_t; + +#define NCCL_NET_ATTR_UNDEF -1 + +#define NCCL_NET_ATTR_INIT { \ + { NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF }, /* sendCommAttr */ \ + { NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF, NCCL_NET_ATTR_UNDEF }, /* recvCommAttr */ \ + (uint32_t)NCCL_NET_ATTR_UNDEF, /* op */ \ + (uint32_t)NCCL_NET_ATTR_UNDEF, /* algo */ \ + (uint32_t)NCCL_NET_ATTR_UNDEF, /* proto */ \ +} + +typedef struct { + int32_t maxConcurrentPeers; + int32_t minConcurrentPeers; + int32_t maxFlowsPerPeer; + int32_t minFlowsPerPeer; +} ncclNetCommAttr_v11_t; + +typedef struct { + ncclNetCommAttr_v11_t sendCommAttr; + ncclNetCommAttr_v11_t recvCommAttr; + uint32_t op; + uint32_t algo; + uint32_t proto; +} ncclNetAttr_v11_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(void** ctx, uint64_t commId, ncclNetCommConfig_v11_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v11_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v11_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* phandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** phandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); + + // Virtual NIC APIs. makeVDevice will create a virtual NIC given the specified properties, and tell the caller + // what index this new vNIC exists at + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props); + // Finalize the network. + ncclResult_t (*finalize)(void* ctx); + + ncclResult_t (*setNetAttr)(void* ctx, ncclNetAttr_v11_t* netAttr); +} ncclNet_v11_t; + +typedef struct { + void* mhandle; + void* address; + size_t size; +} ncclNetSGE_v11_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(void** ctx, uint64_t commId, ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. 
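/*
 * Editor's note -- an illustrative sketch, not part of this patch: the calling pattern
 * that the ncclNet_v11_t comments above describe for the non-blocking connect() and
 * isend()/test() paths. `net`, `ctx`, `dev`, `handle`, `buffer`, `bytes` and NCCLCHECK
 * are assumed to exist in the caller; error handling and cleanup are omitted.
 *
 *   void* sendComm = NULL;
 *   ncclNetDeviceHandle_v11_t* sendDevComm = NULL;
 *   // connect() must not block; it is called again until a comm object is returned.
 *   while (sendComm == NULL)
 *     NCCLCHECK(net->connect(ctx, dev, handle, &sendComm, &sendDevComm));
 *
 *   void* mhandle = NULL;
 *   NCCLCHECK(net->regMr(sendComm, buffer, bytes, NCCL_PTR_HOST, &mhandle));
 *
 *   void* request = NULL;
 *   int tag = 0;
 *   // isend() may return request == NULL if the send cannot be issued yet.
 *   while (request == NULL)
 *     NCCLCHECK(net->isend(sendComm, buffer, bytes, tag, mhandle, NULL, &request));
 *
 *   int done = 0;
 *   while (!done) NCCLCHECK(net->test(request, &done, NULL));
 */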
+ ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, size_t count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v11_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request); + ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v11_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Create a virtual NIC given the specified properties, which can be accessed at device index d + ncclResult_t (*makeVDevice)(int* d, ncclNetVDeviceProps_v11_t* props); + // Finalize the collective network. 
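/*
 * Editor's note -- an illustrative sketch, not part of this patch: how the collective
 * path of ncclCollNet_v11_t above is meant to be driven. `collnet`, `collComm`, the
 * send/recv buffers and the registered mhandles are assumed to come from the caller.
 *
 *   int supported = 0;
 *   collnet->reduceSupport(ncclFloat32, ncclSum, &supported);
 *   if (supported) {
 *     void* request = NULL;
 *     // iallreduce() may return request == NULL if it cannot be issued yet.
 *     while (request == NULL)
 *       collnet->iallreduce(collComm, sendBuf, recvBuf, count, ncclFloat32, ncclSum,
 *                           sendMhandle, recvMhandle, &request);
 *     int done = 0, size = 0;
 *     while (!done) collnet->test(request, &done, &size);
 *   }
 */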
+ ncclResult_t (*finalize)(void* ctx); +} ncclCollNet_v11_t; + +#endif // end include guard diff --git a/src/include/plugin/net/net_v9.h b/src/include/plugin/net/net_v9.h index ce9d91748..ef054bbe6 100644 --- a/src/include/plugin/net/net_v9.h +++ b/src/include/plugin/net/net_v9.h @@ -7,11 +7,9 @@ #ifndef NET_V9_H_ #define NET_V9_H_ -#define NCCL_NET_MAX_DEVS_PER_NIC_V9 4 - typedef struct { int ndevs; - int devs[NCCL_NET_MAX_DEVS_PER_NIC_V9]; + int devs[NCCL_NET_MAX_DEVS_PER_NIC]; } ncclNetVDeviceProps_v9_t; typedef struct { diff --git a/src/include/plugin/plugin.h b/src/include/plugin/plugin.h index 300e436a0..83b58e985 100644 --- a/src/include/plugin/plugin.h +++ b/src/include/plugin/plugin.h @@ -21,4 +21,6 @@ void* ncclOpenProfilerPluginLib(const char* name); void* ncclGetNetPluginLib(enum ncclPluginType type); ncclResult_t ncclClosePluginLib(void* handle, enum ncclPluginType type); +extern char* ncclPluginLibPaths[]; + #endif diff --git a/src/include/plugin/profiler/profiler_v5.h b/src/include/plugin/profiler/profiler_v5.h new file mode 100644 index 000000000..dab1db9e1 --- /dev/null +++ b/src/include/plugin/profiler/profiler_v5.h @@ -0,0 +1,151 @@ +/************************************************************************* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef PROFILER_V5_H_ +#define PROFILER_V5_H_ + +typedef struct { + uint64_t type; // event type descriptor: ncclProfileColl, ... + void* parentObj; // pointer to the profiler parent object (for coll is the group) + int rank; // originating rank + union { + struct { + bool graphCaptured; + int groupDepth; + } groupApi; + + struct { + const char* func; + size_t count; + const char* datatype; + int root; + void* stream; + bool graphCaptured; + } collApi; + + struct { + const char* func; + size_t count; + const char* datatype; + void* stream; + bool graphCaptured; + } p2pApi; + + struct { + void* stream; + } kernelLaunch; + + struct { + uint64_t seqNumber; + const char* func; + void const* sendBuff; + void* recvBuff; + size_t count; + int root; + const char* datatype; + uint8_t nChannels; + uint8_t nWarps; + const char* algo; + const char* proto; + void* parentGroup; // for backward compatibility with v4 + } coll; + + struct { + const char* func; + void* buff; + const char* datatype; + size_t count; + int peer; + uint8_t nChannels; + void* parentGroup; // for backward compatibility with v4 + } p2p; + + struct { + pid_t pid; // pid of the originating process + uint8_t channelId; // channel id for this proxy operation + int peer; // remote rank for send/recv + int nSteps; // number of steps for this proxy operation + int chunkSize; // amount of data transferred by this proxy operation + int isSend; + } proxyOp; + + struct { + int step; + } proxyStep; + + struct { + uint8_t channelId; + uint64_t pTimer; // start timestamp from GPU globaltimer + } kernelCh; + + struct { + int64_t id; + void* data; + } netPlugin; + }; +} ncclProfilerEventDescr_v5_t; + +typedef union { + struct { + size_t transSize; + } proxyStep; + + struct { + int appendedProxyOps; + } proxyCtrl; + + struct { + void* data; + } netPlugin; + + struct { + uint64_t pTimer; + } kernelCh; +} ncclProfilerEventStateArgs_v5_t; + +typedef struct { + const char* name; + + // init - initialize the profiler plugin + // Input + // - context : opaque profiler context object for separating profiler behavior across comms + // - commId 
: communicator id + // - commName : user assigned communicator name + // - nNodes : number of nodes in communicator + // - nranks : number of ranks in communicator + // - rank : rank identifier in communicator + // - logfn : logger function + // Output + // - eActivationMask: bitmask of active events set by the plugin + ncclResult_t (*init)(void** context, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn); + + // startEvent - initialize and start a new event for the supplied event descriptor inside the eventset + // Input + // - context: opaque profiler context object + // - eDescr : pointer to ncclProfilerEventDescr_t object + // Output + // - eHandle: return event handle for supplied event descriptor object + ncclResult_t (*startEvent)(void* context, void** eHandle, ncclProfilerEventDescr_v5_t* eDescr); + + // stopEvent - stop/finalize an event inside and event set + // Input + // - eHandle: handle to event object + ncclResult_t (*stopEvent)(void* eHandle); + + // recordEventState - record event state transitions and event attribute updates + // Input + // - eHandle : handle to event object created through startEvent + // - eStateArgs: optional argument used to capture event attribute updates associated with the state transition + // - eState : event state transition + ncclResult_t (*recordEventState)(void* eHandle, ncclProfilerEventState_v5_t eState, ncclProfilerEventStateArgs_v5_t* eStateArgs); + + // finalize - finalize the profiler plugin + // Input + // - context: opaque profiler context object + ncclResult_t (*finalize)(void* context); +} ncclProfiler_v5_t; + +#endif diff --git a/src/include/plugin/tuner/tuner_v5.h b/src/include/plugin/tuner/tuner_v5.h new file mode 100644 index 000000000..9e621f842 --- /dev/null +++ b/src/include/plugin/tuner/tuner_v5.h @@ -0,0 +1,87 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef TUNER_V5_H_ +#define TUNER_V5_H_ + +// NVL domain information struct +typedef struct { + int nNvlDomains; // number of NVLink domains + int minRanksPerNvlDomain; // minimum ranks across all NVLink domains + int maxRanksPerNvlDomain; // maximum ranks across all NVLink domains +} ncclNvlDomainInfo_v5_t; + +#define NCCL_NUM_ALGORITHMS_V5 7 // Tree/Ring/CollNet*/PAT +#define NCCL_NUM_PROTOCOLS_V5 3 // Simple/LL/LL128 +#define NCCL_NUM_HW_LINKS_V5 3 +#define NCCL_NUM_COMPCAPS_V5 4 +#define NCCL_NUM_TUNING_SCALES_V5 3 + +typedef struct { + double baseLatencies [NCCL_NUM_ALGORITHMS_V5][NCCL_NUM_PROTOCOLS_V5]; + double hwLatencies [NCCL_NUM_HW_LINKS_V5][NCCL_NUM_ALGORITHMS_V5][NCCL_NUM_PROTOCOLS_V5]; + + double llMaxBws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + double perChMaxRingLL128Bws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + double perChMaxTreeLL128Bws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + double perChMaxTreeBws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + double perChMaxNVLSTreeBws [NCCL_NUM_COMPCAPS_V5][NCCL_NUM_TUNING_SCALES_V5]; + + +} ncclTunerConstants_v5_t; + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. 
+ // Inputs: + // - commId: communicator identifier + // - nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // - nNodes: number of nodes in current communicator. + // - logFunction: a logFunction can be useful to integrate logging together with NCCL core. + // - nvlDomainInfo: NVL domain information struct + // Outputs: + // - context: tuner context object + // Input/Output: + // - constants: tuner constants + ncclResult_t (*init)(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. + // Inputs: + // - context: tuner context object + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - numPipeOps: number of operations in the group + // - numAlgo: number of algorithms in collCostTable + // - numProto: number of protocols in collCostTable + // - regBuff: can register user buffer + // + // Outputs: + // - nChannels: number of channels (hence SMs) to be used. + // + // InOut: + // - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType. + // NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE). + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes, + int numPipeOps, float** collCostTable, int numAlgo, int numProto, + int regBuff, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. 
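/*
 * Editor's note -- an illustrative sketch, not part of this patch: a minimal v5
 * getCollInfo() that biases large allreduces toward Ring/Simple by lowering that
 * entry of the cost table. The 2D cast mirrors the collCostTable layout described
 * above; the size threshold and the zero cost are arbitrary choices.
 *
 *   static ncclResult_t exampleGetCollInfo(void* context, ncclFunc_t collType, size_t nBytes,
 *       int numPipeOps, float** collCostTable, int numAlgo, int numProto,
 *       int regBuff, int* nChannels) {
 *     float (*table)[NCCL_NUM_PROTOCOLS] = (float (*)[NCCL_NUM_PROTOCOLS])collCostTable;
 *     if (collType == ncclFuncAllReduce && nBytes >= (1 << 20) &&
 *         table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] != NCCL_ALGO_PROTO_IGNORE) {
 *       table[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 0.0; // cheapest remaining option
 *     }
 *     return ncclSuccess; // unset outputs are filled in by NCCL core
 *   }
 */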
+ // context: tuner context object + ncclResult_t (*finalize)(void* context); +} ncclTuner_v5_t; + +#endif diff --git a/src/include/profiler.h b/src/include/profiler.h index 2fb6a7d38..f7f9980b5 100644 --- a/src/include/profiler.h +++ b/src/include/profiler.h @@ -28,12 +28,48 @@ struct ncclProfilerProxy { struct ncclProxyConnector recvProxyConn[MAXCHANNELS]; }; +enum groupApiState { + ncclProfilerGroupApiStartStateReset = 0, + ncclProfilerGroupApiStartStateStarted = 1, + ncclProfilerGroupApiStartStateStopped = 2, +}; + +// Used by the profiler to track state for API events +typedef struct ncclProfilerApiState { + int profilerGroupDepth; + int eActivationMask; + groupApiState state; + void *groupApiEventHandle; + // Tracks the latest API event handles for p2p/collectives + void* p2pApiEventHandle; + void *collApiEventHandle; +} ncclProfilerApiState_t; + +extern __thread ncclProfilerApiState_t ncclProfilerApiState; + extern int ncclProfilerEventMask; // Plugin Init/Finalize Wrappers ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm); ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm); +// Profiler Start/Stop/Record wrappers for ncclGroupStart and ncclGroupEnd API calls +ncclResult_t ncclProfilerStartGroupApiEvent(struct ncclInfo *info, bool isGraphCaptured); +ncclResult_t ncclProfilerStopGroupApiEvent(); +ncclResult_t ncclProfilerRecordGroupApiEventState(ncclProfilerEventState_t eState); + +//Profiler Start/Stop wrappers for P2p API calls +ncclResult_t ncclProfilerStartP2pApiEvent(struct ncclInfo *info, bool isGraphCaptured); +ncclResult_t ncclProfilerStopP2pApiEvent(); + +//Profiler Start/Stop wrappers for Collective API calls +ncclResult_t ncclProfilerStartCollApiEvent(struct ncclInfo *info, bool isGraphCaptured); +ncclResult_t ncclProfilerStopCollApiEvent(); + +// Kernel Launch Start/Stop Event Wrappers +ncclResult_t ncclProfilerStartKernelLaunchEvent(struct ncclKernelPlan* plan, cudaStream_t stream); +ncclResult_t ncclProfilerStopKernelLaunchEvent(struct ncclKernelPlan* plan); + // Profiler Start/Stop Group Wrappers ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan); ncclResult_t ncclProfilerStopGroupEvent(struct ncclKernelPlan* plan); diff --git a/src/include/proxy.h b/src/include/proxy.h index 772aa206c..4613ada49 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -69,6 +69,7 @@ struct ncclProxyOp { uint8_t /*ncclDataType_t*/ dtype; uint8_t /*ncclDevRedOp_t*/ redOp; uint8_t /*ncclFunc_t*/ coll; + uint8_t /*ncclFunc_t*/ collAPI; uint8_t /*ncclPattern_t*/ pattern; uint8_t protocol; uint8_t algorithm; @@ -81,6 +82,8 @@ struct ncclProxyOp { int isOneRPN; RingAlgorithm *ringAlgo; union ncclProxyOpSpecifics specifics; + int nChannels; + int nPeers; // Profiler plugin union { @@ -175,11 +178,14 @@ struct ncclProxyArgs { uint8_t /*ncclDevRedOp_t*/ redOp; uint8_t /*ncclPattern_t*/ pattern; uint8_t /*ncclFunc_t*/ coll; + uint8_t /*ncclFunc_t*/ collAPI; uint8_t protocol; uint8_t algorithm; int state; char* sharedBuff[NCCL_STEPS]; int sharedSize[NCCL_STEPS]; + int nChannels; + int nPeers; int idle; @@ -338,6 +344,11 @@ struct ncclProxyState { // Progress thread struct ncclProxyProgressState progressState; + // Network plugin + void* netContext; + ncclNetAttr_t netAttr; + void* collNetContext; + // Profiler plugin void* profilerContext; diff --git a/src/include/register.h b/src/include/register.h index 231cbfc34..edfc722de 100644 --- a/src/include/register.h +++ b/src/include/register.h @@ -1,3 +1,9 @@ 
+/************************************************************************* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #ifndef NCCL_REGISTER_H_ #define NCCL_REGISTER_H_ @@ -29,15 +35,6 @@ struct ncclRegNetHandles { struct ncclRegNetHandles* next; }; -struct ncclSymRegTask { - struct ncclSymRegTask *next; - void* buff; - size_t baseSize; - CUmemGenericAllocationHandle memHandle; - struct ncclReg* regHandle; - size_t alignment; -}; - struct ncclReg { // common attributes uintptr_t begAddr, endAddr; // page aligned @@ -58,10 +55,6 @@ struct ncclReg { // general ipc reg struct ncclPeerRegIpcAddr regIpcAddrs; struct ncclIpcRegInfo* ipcInfos[NCCL_MAX_LOCAL_RANKS]; - // symmetric reg - void* baseSymPtr; - size_t symSize; - int winFlags; }; struct ncclRegCache { @@ -70,14 +63,9 @@ struct ncclRegCache { uintptr_t pageSize; }; -struct ncclWindow { - struct ncclReg* handle; -}; - ncclResult_t ncclRegCleanup(struct ncclComm* comm); ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle); ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *handle); ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid); -ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle); #endif diff --git a/src/include/register_inline.h b/src/include/register_inline.h index fb7641b13..76181c4ac 100644 --- a/src/include/register_inline.h +++ b/src/include/register_inline.h @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #ifndef NCCL_REGISTER_INLINE_H_ #define NCCL_REGISTER_INLINE_H_ @@ -18,16 +24,5 @@ static inline ncclResult_t ncclRegFind(struct ncclComm* comm, const void* data, } } -static inline ncclResult_t ncclRegFindSymmetric(struct ncclComm* comm, const void* data, size_t size, void** symPtr, struct ncclReg** outReg) { - struct ncclReg* regRecord = NULL; - *symPtr = NULL; - *outReg = NULL; - NCCLCHECK(ncclRegFind(comm, data, size, ®Record)); - if (regRecord && regRecord->baseSymPtr) { - *symPtr = (void*)((uintptr_t)regRecord->baseSymPtr + (uintptr_t)data - (uintptr_t)regRecord->begAddr); - *outReg = regRecord; - } - return ncclSuccess; -} #endif diff --git a/src/include/scheduler.h b/src/include/scheduler.h new file mode 100644 index 000000000..9ee9bb232 --- /dev/null +++ b/src/include/scheduler.h @@ -0,0 +1,17 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SCHEDULER_H_ +#define NCCL_SCHEDULER_H_ + +#include "nccl.h" +#include "comm.h" +#include "sym_kernels.h" + +ncclResult_t ncclMakeSymmetricTaskList(struct ncclComm* comm, struct ncclTaskColl* task, struct ncclIntruQueue* symTaskQueue, struct ncclTaskColl** remainTasksHead); +ncclResult_t ncclSymmetricTaskScheduler(struct ncclComm* comm, struct ncclIntruQueue* symTaskQueue, struct ncclKernelPlan* plan); + +#endif // NCCL_SCHEDULER_H_ diff --git a/src/include/shm.h b/src/include/shm.h index 223d87346..b944241a4 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #ifndef NCCL_SHM_H_ #define NCCL_SHM_H_ diff --git a/src/include/shmutils.h b/src/include/shmutils.h index 097b4c657..199b9f717 100644 --- a/src/include/shmutils.h +++ b/src/include/shmutils.h @@ -15,8 +15,8 @@ ncclResult_t ncclShmClose(ncclShmHandle_t handle); ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); struct ncclShmemCollBuff { - volatile size_t *cnt[2]; - volatile void *ptr[2]; + size_t *cnt[2]; + void *ptr[2]; int round; size_t maxTypeSize; }; diff --git a/src/include/sym_kernels.h b/src/include/sym_kernels.h new file mode 100644 index 000000000..4e742eff7 --- /dev/null +++ b/src/include/sym_kernels.h @@ -0,0 +1,112 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SYM_KERNELS_H_ +#define NCCL_SYM_KERNELS_H_ +#include "nccl.h" +#include "nccl_device.h" +#include "nccl_common.h" +#include "device.h" + +//////////////////////////////////////////////////////////////////////////////// +// ncclSymk[Foo]: Kernels built on the device API + +#define NCCL_SYM_KERNEL_CELL_SIZE 1024 // no less than 16 bytes minimal cell size + +constexpr int ncclSymkMaxBlocks = 64; +constexpr int ncclSymkMaxThreads = 512; +constexpr int ncclSymkLLMaxEltSize = 8; + +constexpr __host__ __device__ int ncclSymkLLMaxSlots(int eltSize = ncclSymkLLMaxEltSize) { + return ncclSymkMaxThreads*ncclSymkLLMaxEltSize/eltSize; +} + +enum ncclSymkKernelId { + ncclSymkKernelId_AllReduce_AGxLL_R, + ncclSymkKernelId_AllReduce_AGxLLMC_R, + ncclSymkKernelId_AllReduce_RSxLD_AGxST, + ncclSymkKernelId_AllReduce_RSxLDMC_AGxSTMC, + ncclSymkKernelId_AllReduce_RSxNet_ARxMC_AGxNet, + + ncclSymkKernelId_AllGather_LL, + ncclSymkKernelId_AllGather_LLMC, + ncclSymkKernelId_AllGather_ST, + ncclSymkKernelId_AllGather_STMC, + + ncclSymkKernelId_ReduceScatter_LL, + ncclSymkKernelId_ReduceScatter_LD, + ncclSymkKernelId_ReduceScatter_LDMC, + + ncclSymkKernelId_Count +}; + +struct ncclSymkDevComm { + struct ncclDevComm devComm; + struct ncclLLA2AHandle lsaLLA2A; +}; + +struct ncclSymkState { + bool initialized; + struct ncclSymkDevComm kcomm; +}; + +struct ncclSymkChannelWorkRange { + uint16_t workHi; // inclusive index of my ending work + uint16_t fracHi; // 16-bit fraction in (0.0, 1.0] indicating where my part ends +}; + +// 16 bytes aligned +struct alignas(16) ncclSymkDevWork { + uint64_t redOpArg; // must be collectively uniform + size_t nElts; + struct ncclWindow_vidmem* inputWin, *outputWin; + size_t inputOff, outputOff; // these = origUserOffset + cbdPartOffset + uint64_t rootRank; + uint64_t sChannelId:16, nChannels:16, padding:32; +}; + +struct alignas(16) ncclSymkDevWorkArgs { + struct ncclSymkDevComm kcomm; + int nMaxChannels; + // starting of channelWorkRange will be aligned to 16 bytes + // channelWorkRange[nChannels]; + // ncclSymDevWork[nWorks]; + // aux functions + __host__ static constexpr size_t calcArgsSize(int nChannels, int nWorks) { + return alignUp(sizeof(struct ncclSymkDevWorkArgs), 16) + alignUp(nChannels * sizeof(struct ncclSymkChannelWorkRange), 16) + nWorks * sizeof(struct ncclSymkDevWork); + } + __host__ __device__ struct ncclSymkChannelWorkRange* getWorkRange() const { + return (struct ncclSymkChannelWorkRange*)((uint8_t*)this + alignUp(sizeof(struct ncclSymkDevWorkArgs), 16)); + } + __host__ __device__ struct ncclSymkDevWork* getWorks(int nChannels) const { + return (struct ncclSymkDevWork*)((uint8_t*)this->getWorkRange() + alignUp(nChannels * sizeof(struct ncclSymkChannelWorkRange), 16)); + } +}; + +union ncclSymkDevWorkArgs4K { + struct ncclSymkDevWorkArgs args; + char buf4K[4096]; +}; + +// We assume ncclComm contains a field: `ncclSymkState symkState` +ncclResult_t ncclSymkInitOnce(struct ncclComm* comm); +ncclResult_t ncclSymkFinalize(struct ncclComm* comm); + +bool ncclSymkAvailable(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, + ncclDataType_t ty, size_t nElts); +ncclResult_t ncclSymkPickKernel(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, + size_t nEltsTotal, size_t nEltsMax, int nWorks, + float* estTimeUs, ncclSymkKernelId* kernelId, int* nBlocks, int* nWarps); + +ncclResult_t 
ncclSymkMakeDevWork(struct ncclComm* comm, struct ncclTaskColl* task, struct ncclSymkDevWork* outDevWork); + +// Generated by src/device/symmetric/generate.py +extern int const ncclSymkKernelCount; +extern void* const ncclSymkKernelList[]; +void* ncclSymkGetKernelPtr(ncclSymkKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); +const char* ncclSymkKernelIdToString(int kernelId); + +#endif diff --git a/src/include/symmetric.h b/src/include/symmetric.h deleted file mode 100644 index 7a189bcca..000000000 --- a/src/include/symmetric.h +++ /dev/null @@ -1,90 +0,0 @@ -#ifndef NCCL_DEVICE_SYMMETRIC_H_ -#define NCCL_DEVICE_SYMMETRIC_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "bitops.h" - -constexpr int ncclSymMaxBlocks = 64; -constexpr int ncclSymMaxThreads = 512; -constexpr int ncclSymLLMaxEltSize = 64; - -constexpr __host__ __device__ int ncclSymLLMaxSlots(int eltSize = ncclSymLLMaxEltSize) { - return ncclSymMaxThreads*ncclSymLLMaxEltSize/eltSize; -} - -constexpr __host__ __device__ int ncclSymLLEpochSize(int nRanks) { - return /*LL Overhead*/2 * maxval(ncclSymMaxThreads*nRanks*8, ncclSymLLMaxSlots(ncclSymLLMaxEltSize)*ncclSymLLMaxEltSize); -} - -struct alignas(16) ncclSymDevBase { - uint32_t llEpoch[ncclSymMaxBlocks]; - uint32_t barEpochMc[ncclSymMaxBlocks], barEpochUc[ncclSymMaxBlocks]; - uint32_t barInboxMc[ncclSymMaxBlocks]; - uint32_t barInboxPerPeer[]; - - static constexpr size_t size(int nRanks) { - return sizeof(ncclSymDevBase) + - alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16) + - ncclSymMaxBlocks * /*epochs=*/2 * ncclSymLLEpochSize(nRanks); - } -}; - -static __device__ uint4* ncclSymDevBase_getLLBuf(struct ncclSymDevBase* base, int nRanks, int block, uint32_t epoch) { - // Get pointer to buffer trailing the header struct. 
- char* ans = (char*)(base + 1); - // Skip over barInboxPerPeer[] - ans += alignUp(ncclSymMaxBlocks*nRanks*sizeof(uint32_t), 16); - // Skip to our block - int epochSize = ncclSymLLEpochSize(nRanks); - ans += block * /*epochs=*/2 * epochSize; - ans += (epoch & 1)*epochSize; - return (uint4*)ans; -} - -struct ncclSymDevComm { - ncclSymDevBase* base; - ncclSymDevBase* baseMc; - uint32_t stride4G; - int nRanks, rank; - uint32_t nRanks_rcp32; // idivRcp32(nRanks) -}; - -struct alignas(16) ncclSymDevArgs { - struct ncclSymDevComm comm; - int rootRank; - uint64_t redOpArg; // must be collectively uniform - size_t nElts; - char* input; - char* output; -}; - -enum ncclSymKernelId { - ncclSymKernelId_AllReduce_AGxLL_R, - ncclSymKernelId_AllReduce_AGxLLMC_R, - ncclSymKernelId_AllReduce_RSxLD_AGxST, - ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC, - - ncclSymKernelId_AllGather_LL, - ncclSymKernelId_AllGather_LLMC, - ncclSymKernelId_AllGather_ST, - ncclSymKernelId_AllGather_STMC, - - ncclSymKernelId_ReduceScatter_LL, - ncclSymKernelId_ReduceScatter_LD, - ncclSymKernelId_ReduceScatter_LDMC, - - ncclSymKernelId_Count -}; - -bool ncclSymImplemented(ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); - -ncclResult_t ncclSymPickKernel(struct ncclComm* comm, ncclFunc_t fn, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps); - -// Generated by src/device/symmetric/generate.py -extern int const ncclSymKernelCount; -extern void* const ncclSymKernelList[]; -void* ncclSymGetKernelPtr(ncclSymKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty); -const char* ncclSymKernelIdToString(int kernelId); - -#endif diff --git a/src/include/transport.h b/src/include/transport.h index a9971a74f..39e479e24 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -162,13 +162,9 @@ ncclResult_t ncclRegisterCollBuffers(struct ncclComm* comm, struct ncclTaskColl* ncclResult_t ncclRegisterCollNvlsBuffers(struct ncclComm* comm, struct ncclTaskColl* info, void* outRegBufSend[NCCL_MAX_LOCAL_RANKS], void* outRegBufRecv[NCCL_MAX_LOCAL_RANKS], struct ncclIntruQueue* cleanupQueue, bool* regNeedConnect); ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels); -ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm); -ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr); -ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr); -ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm); -ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm); -ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr); -ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr); -ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm); +#if CUDART_VERSION >= 12010 +ncclResult_t ncclNvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle); +ncclResult_t ncclNvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle); +#endif #endif diff --git a/src/include/utils.h b/src/include/utils.h index bfed2722c..46389985f 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -551,4 +551,6 @@ T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { 
return head; } } + +ncclResult_t ncclBitsToString(uint32_t bits, uint32_t mask, const char* (*toStr)(int), char *buf, size_t bufLen, const char *wildcard); #endif diff --git a/src/init.cc b/src/init.cc index af784c02d..ebf942c02 100644 --- a/src/init.cc +++ b/src/init.cc @@ -27,10 +27,14 @@ #include #include #include +#include #include #include "param.h" #include "nvtx_payload_schemas.h" #include "utils.h" +#include +#include "ce_coll.h" +#include "nvtx.h" #define STR2(v) #v #define STR(v) STR2(v) @@ -54,6 +58,9 @@ NCCL_PARAM(WinEnable, "WIN_ENABLE", 1); NCCL_PARAM(CollnetEnable, "COLLNET_ENABLE", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(CtaPolicy, "CTA_POLICY", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(SetCpuStackSize, "SET_CPU_STACK_SIZE", 1); + +extern int64_t ncclParamSingleProcMemRegEnable(); static ncclResult_t commReclaim(ncclComm_t comm); @@ -70,11 +77,50 @@ ncclResult_t initGdrCopy() { return ncclSuccess; } +// The default Linux stack size (8MB) is safe. +#define SAFE_STACK_SIZE (8192*1024) + +static ncclResult_t setCpuStackSize() { + if (ncclParamSetCpuStackSize() != 0) { + // Query the stack size used for newly launched threads. + pthread_attr_t attr; + size_t stackSize; + PTHREADCHECK(pthread_attr_init(&attr), "pthread_attr_init"); + PTHREADCHECK(pthread_attr_getstacksize(&attr, &stackSize), "pthread_attr_getstacksize"); + + if (stackSize < SAFE_STACK_SIZE) { + // GNU libc normally uses RLIMIT_STACK as the default pthread stack size, unless it's set to "unlimited" -- + // in that case a fallback value of 2MB (!) is used. + + // Query the actual resource limit so that we can distinguish between the settings of 2MB and unlimited. + struct rlimit stackLimit; + char buf[30]; + SYSCHECK(getrlimit(RLIMIT_STACK, &stackLimit), "getrlimit"); + if (stackLimit.rlim_cur == RLIM_INFINITY) + strcpy(buf, "unlimited"); + else + snprintf(buf, sizeof(buf), "%ldKB", stackLimit.rlim_cur/1024); + INFO(NCCL_INIT|NCCL_ENV, "Stack size limit (%s) is unsafe; will use %dKB for newly launched threads", + buf, SAFE_STACK_SIZE/1024); + + // Change the default pthread stack size (via a nonportable API, which will become necessary if we switch + // to C++ threads). 
+ PTHREADCHECK(pthread_attr_setstacksize(&attr, SAFE_STACK_SIZE), "pthread_attr_setstacksize"); + PTHREADCHECK(pthread_setattr_default_np(&attr), "pthread_setattr_default_np"); + } + + PTHREADCHECK(pthread_attr_destroy(&attr), "pthread_attr_destroy"); + } + + return ncclSuccess; +} + static ncclResult_t initResult = ncclSuccess; -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static void initOnceFunc() { initEnv(); + setCpuStackSize(); initGdrCopy(); // Always initialize bootstrap network NCCLCHECKGOTO(bootstrapNetInit(), initResult, exit); @@ -84,7 +130,7 @@ exit:; } static ncclResult_t ncclInit() { - pthread_once(&initOnceControl, initOnceFunc); + std::call_once(initOnceFlag, initOnceFunc); return initResult; } @@ -180,10 +226,12 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; - if (comm->symmetricSupport && comm->symDevComm.base) { - NCCLCHECK(ncclCommSymmetricFreeInternal(comm, comm->baseUCSymPtr + comm->rank * comm->baseStride)); - } + NCCLCHECK(ncclCeFinalize(comm)); + if (comm->symmetricSupport) { + NCCLCHECK(ncclSymkFinalize(comm)); + NCCLCHECK(ncclDevrFinalize(comm)); + } NCCLCHECK(ncclRasCommFini(comm)); /* in commReclaim, we have guaranteed only last rank which calls ncclCommDestroy() will @@ -263,10 +311,6 @@ static ncclResult_t commFree(ncclComm_t comm) { NCCLCHECK(ncclRegCleanup(comm)); - if (comm->symmetricSupport) { - NCCLCHECK(ncclNvlsSymmetricFinalize(comm)); - NCCLCHECK(ncclIpcSymmetricFinalize(comm)); - } INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - %s COMPLETE", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId, abort ? "Abort" : "Destroy"); commPoison(comm); // poison comm before free to avoid comm reuse. @@ -414,6 +458,7 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in ncclIntruQueueMpscConstruct(&comm->callbackQueue); ncclIntruQueueConstruct(&comm->legacyRegCleanupQueue); + ncclIntruQueueConstruct(&comm->ceInitTaskQueue); comm->regCache.pageSize = sysconf(_SC_PAGESIZE); @@ -436,8 +481,8 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in static ncclResult_t devCommSetup(ncclComm_t comm) { ncclResult_t ret = ncclSuccess; int nRanks = comm->nRanks; - struct ncclDevCommAndChannels tmpCommAndChans; - struct ncclDevCommAndChannels *devCommAndChans = NULL; + struct ncclKernelCommAndChannels tmpCommAndChans; + struct ncclKernelCommAndChannels *devCommAndChans = NULL; struct ncclNvmlCCStatus ccStatus; bool ccEnable; cudaStream_t deviceStream; @@ -465,7 +510,7 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { comm->workArgsBytes = std::min(ncclParamWorkArgsBytes(), ncclMaxKernelArgsSize(comm->cudaArch)); memset(&ccStatus, 0, sizeof(ccStatus)); - ccEnable = (ncclSuccess == ncclNvmlGetCCStatus(&ccStatus)) && (ccStatus.CCEnabled || ccStatus.multiGpuProtectedPCIE); + ccEnable = (ncclSuccess == ncclNvmlGetCCStatus(&ccStatus)) && (ccStatus.CCEnabled || ccStatus.multiGpuProtectedPCIE || ccStatus.multiGpuNVLE); if (ccEnable) { comm->workFifoBytes = 0; } else { @@ -582,14 +627,28 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED; (void) ncclNvmlDeviceGetGpuFabricInfoV(nvmlDev, &info->fabricInfo); if (info->fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) { + unsigned long uuid0 = 0; + unsigned long uuid1 = 0; if (ncclParamMNNVLUUID() != -1) { - ((long*)&info->fabricInfo.clusterUuid)[0] = 
ncclParamMNNVLUUID(); - ((long*)&info->fabricInfo.clusterUuid)[1] = ncclParamMNNVLUUID(); + unsigned long temp_uuid0 = (unsigned long)ncclParamMNNVLUUID(); + unsigned long temp_uuid1 = (unsigned long)ncclParamMNNVLUUID(); + memcpy(info->fabricInfo.clusterUuid, &temp_uuid0, sizeof(temp_uuid0)); + memcpy(info->fabricInfo.clusterUuid + sizeof(temp_uuid0), &temp_uuid1, sizeof(temp_uuid1)); } - if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); + memcpy(&uuid0, info->fabricInfo.clusterUuid, sizeof(uuid0)); + memcpy(&uuid1, info->fabricInfo.clusterUuid + sizeof(uuid0), sizeof(uuid1)); + if (ncclParamMNNVLCliqueId() == -2) { + nvmlPlatformInfo_t platformInfo = { 0 }; + NCCLCHECK(ncclNvmlDeviceGetPlatformInfo(nvmlDev, &platformInfo)); + INFO(NCCL_INIT, "MNNVL rack serial %s slot %d tray %d hostId %d peerType %d moduleId %d", + platformInfo.chassisSerialNumber, platformInfo.slotNumber, platformInfo.trayIndex, + platformInfo.hostId, platformInfo.peerType, platformInfo.moduleId); + // Use a hash of the Rack serial number to partition the NVLD clique + info->fabricInfo.cliqueId = getHash(platformInfo.chassisSerialNumber, sizeof(platformInfo.chassisSerialNumber)); + } else if (ncclParamMNNVLCliqueId() != -1) info->fabricInfo.cliqueId = ncclParamMNNVLCliqueId(); INFO(NCCL_INIT, "MNNVL busId 0x%lx fabric UUID %lx.%lx cliqueId 0x%x state %d healthMask 0x%x", info->busId, - ((long *)&info->fabricInfo.clusterUuid)[0], ((long *)&info->fabricInfo.clusterUuid)[1], + uuid0, uuid1, info->fabricInfo.cliqueId, info->fabricInfo.state, info->fabricInfo.healthMask); } } @@ -670,6 +729,18 @@ NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2); #define TIMER_INIT_ALLOC 7 #define TIMERS_INIT_COUNT 8 +static ncclResult_t initNvlDomainInfo(struct ncclComm* comm) { + // Initialize NVLink domain info + comm->nvlDomainInfo.nNvlDomains = comm->nNodes; + comm->nvlDomainInfo.minRanksPerNvlDomain = comm->minLocalRanks; + comm->nvlDomainInfo.maxRanksPerNvlDomain = comm->maxLocalRanks; + + TRACE(NCCL_INIT, "NVLink domains: %d domains, min ranks per domain: %d, max ranks per domain: %d", + comm->nNodes, comm->nvlDomainInfo.minRanksPerNvlDomain, comm->nvlDomainInfo.maxRanksPerNvlDomain); + + return ncclSuccess; +} + static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) { // We use 2 AllGathers // 1. 
{ peerInfo, comm, compCap} @@ -781,6 +852,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p // Buffer Registration is not supported with MNNVL if (comm->MNNVL) comm->nvlsRegSupport = 0; + else if (ncclParamSingleProcMemRegEnable()) comm->nvlsRegSupport = 1; TRACE(NCCL_INIT,"pidHash[%d] %lx intraProcRank %d intraProcRanks %d intraProcRank0 %d", rank, comm->peerInfo[rank].pidHash, intraProcRank, intraProcRanks, intraProcRank0); @@ -969,10 +1041,12 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->rankToLocalRank[r] = comm->nodeRanks[node].localRanks; comm->nodeRanks[node].localRanks++; } + comm->minLocalRanks = INT_MAX; // Allocate ranks arrays for each node for (int n=0; nnNodes; n++) { NCCLCHECKGOTO(ncclCalloc(&comm->nodeRanks[n].localRankToRank, comm->nodeRanks[n].localRanks), ret, fail); comm->maxLocalRanks = std::max(comm->maxLocalRanks, comm->nodeRanks[n].localRanks); + comm->minLocalRanks = std::min(comm->minLocalRanks, comm->nodeRanks[n].localRanks); comm->nodeRanks[n].localRanks = 0; } // And fill the ranks arrays @@ -985,6 +1059,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p comm->localRank = comm->rankToLocalRank[rank]; comm->localRanks = comm->nodeRanks[comm->node].localRanks; + NCCLCHECKGOTO(initNvlDomainInfo(comm), ret, fail); + TRACE(NCCL_INIT,"hostHash[%d] %lx localRank %d localRanks %d localRank0 %d", rank, comm->peerInfo[rank].hostHash, comm->localRank, comm->localRanks, comm->localRankToRank[0]); if (comm->localRank == -1 || comm->localRankToRank[0] == -1 || comm->localRanks == 0) { @@ -1227,6 +1303,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); // Compute time models for algorithm and protocol combinations + NCCLCHECKGOTO(ncclTopoInitTunerConstants(comm), ret, fail); + NCCLCHECKGOTO(ncclTunerPluginLoad(comm), ret, fail); + if (comm->tuner) { + NCCLCHECK(comm->tuner->init(&comm->tunerContext, comm->commHash, comm->nRanks, comm->nNodes, ncclDebugLog, &comm->nvlDomainInfo, &comm->tunerConstants)); + } NCCLCHECKGOTO(ncclTopoTuneModel(comm, comm->minCompCap, comm->maxCompCap, graphs), ret, fail); INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d nvls channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->nChannels, comm->nvlsChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); @@ -1248,7 +1329,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p } comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable(); - comm->baseStride = 0; + comm->devrState.bigSize = 0; + + comm->ceColl.baseUCSymReadyPtr = NULL; + comm->ceColl.baseUCSymComplPtr = NULL; // Call devCommSetup before the last barrier, making sure we don't have a thread running in front and starting to // launch NCCL kernels before all cuda mem allocation is complete. That could cause a deadlock. 
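/*
 * Editor's note -- an illustrative sketch, not part of this patch: a plugin-side init()
 * matching the v5 call made above (comm->tuner->init(..., &comm->nvlDomainInfo,
 * &comm->tunerConstants)). The struct fields come from tuner_v5.h; the log message and
 * the latency tweak are assumptions made for illustration only.
 *
 *   static ncclResult_t exampleTunerInit(void** ctx, uint64_t commId, size_t nRanks,
 *       size_t nNodes, ncclDebugLogger_t logFunction,
 *       ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_v5_t* constants) {
 *     logFunction(NCCL_LOG_INFO, NCCL_TUNING, __FILE__, __LINE__,
 *                 "tuner init: %zu ranks, %zu nodes, %d NVL domains (%d-%d ranks each)",
 *                 nRanks, nNodes, nvlDomainInfo->nNvlDomains,
 *                 nvlDomainInfo->minRanksPerNvlDomain, nvlDomainInfo->maxRanksPerNvlDomain);
 *     // Constants may be adjusted in place before NCCL computes its tuning tables.
 *     constants->baseLatencies[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] *= 1.1;
 *     *ctx = NULL; // this sketch keeps no per-communicator state
 *     return ncclSuccess;
 *   }
 */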
@@ -1287,6 +1371,10 @@ NCCL_PARAM(MaxCTAs, "MAX_CTAS", NCCL_CONFIG_UNDEF_INT); NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT); #define NCCL_MAX_CGA_CLUSTER_SIZE 8 +NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", NCCL_CONFIG_UNDEF_INT); +NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0); + + #define NCCL_COMMINIT_FUNCNAME_LEN 128 struct ncclCommInitRankAsyncJob { struct ncclAsyncJob base; @@ -1416,15 +1504,15 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { // Negative color does not create a new comm object. We needed to take part in the allgather, but we're done now. if (job->color == NCCL_SPLIT_NOCOLOR) goto exit; } - timers[TIMER_INIT_ALLOC] = clockNano(); - NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); - timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; // child hash obtained from (parent hash, split count, color) uint64_t hacc[2] = {1, 1}; eatHash(hacc, &job->parent->commHash); eatHash(hacc, &job->splitCount); eatHash(hacc, &job->color); comm->commHash = digestHash(hacc); + timers[TIMER_INIT_ALLOC] = clockNano(); + NCCLCHECKGOTO(commAlloc(comm, job->parent, job->nranks, job->myrank), res, fail); + timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx parent %p splitCount %d color %d key %d- Init START", job->funcName, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, job->parent, job->splitCount, job->color, job->key); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1433,11 +1521,11 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { // debug info, no commId was used commIdHash = 0; } else { + // obtain a unique hash using the first commId + comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); timers[TIMER_INIT_ALLOC] = clockNano(); NCCLCHECKGOTO(commAlloc(comm, NULL, job->nranks, job->myrank), res, fail); timers[TIMER_INIT_ALLOC] = clockNano() - timers[TIMER_INIT_ALLOC]; - // obtain a unique hash using the first commId - comm->commHash = commIdHash = getHash(job->commId->internal, NCCL_UNIQUE_ID_BYTES); INFO(NCCL_INIT, "%s comm %p rank %d nranks %d cudaDev %d nvmlDev %d busId %lx commId 0x%llx - Init START", job->funcName, comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev, comm->busId, commIdHash); timers[TIMER_INIT_BOOTSTRAP] = clockNano(); @@ -1447,10 +1535,6 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { comm->cudaArch = cudaArch; NCCLCHECKGOTO(initTransportsRank(comm, job->parent, timers), res, fail); - NCCLCHECKGOTO(ncclTunerPluginLoad(comm), res, fail); - if (comm->tuner) { - NCCLCHECK(comm->tuner->init(comm->nRanks, comm->nNodes, ncclDebugLog, &comm->tunerContext)); - } // update communicator state comm->initState = ncclSuccess; @@ -1511,8 +1595,10 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { int ctaPolicyEnv; int shrinkShareEnv; int nvlsCTAsEnv; + int nChannelsPerNetPeerEnv; + int nvlinkUtilCentricSchedEnableEnv; - /* override configuration from env variable. */ + /* override configuration with env variable. */ blockingEnv = ncclParamCommBlocking(); if (blockingEnv == 0 || blockingEnv == 1) comm->config.blocking = blockingEnv; @@ -1541,6 +1627,23 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { comm->config.maxCTAs = maxCTAsEnv; } + /* override configuration with env variable. 
*/ + nChannelsPerNetPeerEnv = ncclParamNChannelsPerNetPeer(); + if (nChannelsPerNetPeerEnv != NCCL_CONFIG_UNDEF_INT) { + if (nChannelsPerNetPeerEnv <= 0) + INFO(NCCL_ENV, "NCCL_NCHANNELS_PER_NET_PEER %d is too low, leaving it set at %d", nChannelsPerNetPeerEnv, comm->config.nChannelsPerNetPeer); + else + comm->config.nChannelsPerNetPeer = nChannelsPerNetPeerEnv; + } + + nvlinkUtilCentricSchedEnableEnv = ncclParamNvlinkUtilCentricSchedEnable(); + if (nvlinkUtilCentricSchedEnableEnv != NCCL_CONFIG_UNDEF_INT) { + if (nvlinkUtilCentricSchedEnableEnv != 0 && nvlinkUtilCentricSchedEnableEnv != 1) + INFO(NCCL_ENV, "NCCL_NVLINK_UTIL_CENTRIC_SCHED_ENABLE %d is not valid, leaving it set at %d", nvlinkUtilCentricSchedEnableEnv, comm->config.nvlinkCentricSched); + else + comm->config.nvlinkCentricSched = nvlinkUtilCentricSchedEnableEnv; + } + envNetName = ncclGetEnv("NCCL_NET"); if (envNetName) tmpNetName = envNetName; @@ -1608,7 +1711,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { comm->config.collnetEnable = 0; } - if (comm->config.CTAPolicy < NCCL_CTA_POLICY_DEFAULT || comm->config.CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY) { + if (comm->config.CTAPolicy < NCCL_CTA_POLICY_DEFAULT || comm->config.CTAPolicy > NCCL_CTA_POLICY_ZERO) { INFO(NCCL_ENV, "CTAPolicy %d is not a valid value, set it to %d", comm->config.CTAPolicy, NCCL_CTA_POLICY_DEFAULT); comm->config.CTAPolicy = NCCL_CTA_POLICY_DEFAULT; } @@ -1617,6 +1720,7 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { INFO(NCCL_ENV, "nvlsCTAs %d is not a valid value, NCCL will decide the default value automatically", comm->config.nvlsCTAs); comm->config.nvlsCTAs = NCCL_CONFIG_UNDEF_INT; } + return ret; } @@ -1668,6 +1772,10 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { internalConfigPtr->shrinkShare = defaultConfig.shrinkShare; internalConfigPtr->nvlsCTAs = defaultConfig.nvlsCTAs; } + if (internalConfigPtr->version < NCCL_VERSION(2, 28, 0)) { + internalConfigPtr->nChannelsPerNetPeer = defaultConfig.nChannelsPerNetPeer; + internalConfigPtr->nvlinkCentricSched = defaultConfig.nvlinkCentricSched; + } } /* check input config attributes, -1 means user-undefined and we should use default value from NCCL. */ @@ -1706,7 +1814,7 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { } if (internalConfigPtr->CTAPolicy != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->CTAPolicy < NCCL_CTA_POLICY_DEFAULT || - internalConfigPtr->CTAPolicy > NCCL_CTA_POLICY_EFFICIENCY)) { + internalConfigPtr->CTAPolicy > NCCL_CTA_POLICY_ZERO)) { WARN("Invalid config policy attribute value %d", internalConfigPtr->CTAPolicy); ret = ncclInvalidArgument; goto fail; @@ -1724,6 +1832,18 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { goto fail; } + if (internalConfigPtr->nChannelsPerNetPeer != NCCL_CONFIG_UNDEF_INT && (internalConfigPtr->nChannelsPerNetPeer <= 0 || internalConfigPtr->nChannelsPerNetPeer > MAXCHANNELS)) { + WARN("Invalid config nChannelsPerNetPeer attribute value %d", internalConfigPtr->nChannelsPerNetPeer); + ret = ncclInvalidArgument; + goto fail; + } + + if (internalConfigPtr->nvlinkCentricSched != NCCL_CONFIG_UNDEF_INT && internalConfigPtr->nvlinkCentricSched != 0 && internalConfigPtr->nvlinkCentricSched != 1) { + WARN("Invalid config nvlinkCentricSched attribute value %d", internalConfigPtr->nvlinkCentricSched); + ret = ncclInvalidArgument; + goto fail; + } + /* default config value can be tuned on different platform. 
*/ NCCL_CONFIG_DEFAULT(internalConfigPtr, blocking, NCCL_CONFIG_UNDEF_INT, 1, "Blocking", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, cgaClusterSize, NCCL_CONFIG_UNDEF_INT, 4, "CGA cluster size", "%d"); @@ -1737,6 +1857,9 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { NCCL_CONFIG_DEFAULT(internalConfigPtr, CTAPolicy, NCCL_CONFIG_UNDEF_INT, NCCL_CTA_POLICY_DEFAULT, "CTA policy flags", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, shrinkShare, NCCL_CONFIG_UNDEF_INT, 0, "shrinkShare", "%d"); NCCL_CONFIG_DEFAULT(internalConfigPtr, nvlsCTAs, NCCL_CONFIG_UNDEF_INT, NCCL_CONFIG_UNDEF_INT, "nvlsCTAs", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, nChannelsPerNetPeer, NCCL_CONFIG_UNDEF_INT, + NCCL_CONFIG_UNDEF_INT, "nChannelsPerNetPeer", "%d"); + NCCL_CONFIG_DEFAULT(internalConfigPtr, nvlinkCentricSched, NCCL_CONFIG_UNDEF_INT, 0, "nvlinkCentricSched", "%d"); /* assign config to communicator */ comm->config.blocking = internalConfigPtr->blocking; @@ -1751,6 +1874,8 @@ static ncclResult_t parseCommConfig(ncclComm_t comm, ncclConfig_t *config) { comm->config.CTAPolicy = internalConfigPtr->CTAPolicy; comm->config.shrinkShare = internalConfigPtr->shrinkShare; comm->config.nvlsCTAs = internalConfigPtr->nvlsCTAs; + comm->config.nChannelsPerNetPeer = internalConfigPtr->nChannelsPerNetPeer; + comm->config.nvlinkCentricSched = internalConfigPtr->nvlinkCentricSched; NCCLCHECKGOTO(envConfigOverride(comm), ret, fail); exit: @@ -1779,8 +1904,8 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId NCCLCHECKGOTO(ncclInit(), res, fail); if (ncclDebugLevel > NCCL_LOG_WARN || (ncclDebugLevel != NCCL_LOG_NONE && myrank == 0)) { - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once, showVersion); + static std::once_flag once; + std::call_once(once, showVersion); } // Make sure the CUDA runtime is initialized. 
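/*
 * Editor's note -- an illustrative sketch, not part of this patch: how an application
 * could opt into the two config fields validated above. NCCL_CONFIG_INITIALIZER and
 * ncclCommInitRankConfig() are existing public API; `comm`, `nRanks`, `id` and `rank`
 * are assumed to exist in the caller and the values chosen are arbitrary examples
 * within the accepted ranges.
 *
 *   ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
 *   config.nChannelsPerNetPeer = 2;  // must be > 0 and <= MAXCHANNELS
 *   config.nvlinkCentricSched = 1;   // 0 or 1; enables NVLink-utilization-centric scheduling
 *   ncclCommInitRankConfig(&comm, nRanks, id, rank, &config);
 */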
CUDACHECKGOTO(cudaFree(NULL), res, fail); @@ -2054,7 +2179,7 @@ static ncclResult_t commDestroySync(struct ncclAsyncJob* job_) { static ncclResult_t commCleanup(ncclComm_t comm) { CUDACHECK(cudaSetDevice(comm->cudaDev)); if (comm->tuner != NULL) { - NCCLCHECK(comm->tuner->destroy(comm->tunerContext)); + NCCLCHECK(comm->tuner->finalize(comm->tunerContext)); NCCLCHECK(ncclTunerPluginUnload(comm)); } NCCLCHECK(commFree(comm)); @@ -2158,7 +2283,7 @@ static ncclResult_t commReclaim(struct ncclAsyncJob* job_) { NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; return ncclSuccess; } @@ -2210,6 +2335,10 @@ ncclResult_t ncclCommAbort(ncclComm_t comm) { if (comm == NULL) { return ncclSuccess; } + + INFO(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx - Abort START", + comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); + NCCLCHECK(ncclGroupStartInternal()); // Ask anything that might still be running on the device to quit NCCLCHECK(setCommAbortFlags(comm,1)); @@ -2418,7 +2547,7 @@ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECK(CommCheck(comm, "CommCount", "comm")); NCCLCHECK(PtrCheck(count, "CommCount", "count")); @@ -2432,7 +2561,7 @@ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { NCCL_API(ncclResult_t, ncclCommCuDevice, const ncclComm_t comm, int* devid); ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECK(CommCheck(comm, "CommCuDevice", "comm")); NCCLCHECK(PtrCheck(devid, "CommCuDevice", "devid")); @@ -2445,7 +2574,7 @@ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* devid) { NCCL_API(ncclResult_t, ncclCommUserRank, const ncclComm_t comm, int* rank); ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { - NVTX3_FUNC_RANGE_IN(nccl_domain); + NCCL_NVTX3_FUNC_RANGE; NCCLCHECK(CommCheck(comm, "CommUserRank", "comm")); NCCLCHECK(PtrCheck(rank, "CommUserRank", "rank")); diff --git a/src/init_nvtx.cc b/src/init_nvtx.cc index 1cb1277d2..b7005123b 100644 --- a/src/init_nvtx.cc +++ b/src/init_nvtx.cc @@ -1,5 +1,12 @@ +/************************************************************************* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #include "nccl.h" #include "nvtx.h" +#include "param.h" static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { {"Sum", ncclSum, 0}, @@ -9,9 +16,15 @@ static constexpr const nvtxPayloadEnum_t NvtxEnumRedSchema[] = { {"Avg", ncclAvg, 0} }; +NCCL_PARAM(NvtxDisable, "NVTX_DISABLE", 0); + // Must be called before the first call to any reduction operation. 
void initNvtxRegisteredEnums() { // Register schemas and strings + if (ncclParamNvtxDisable()) { + return; + } + constexpr const nvtxPayloadEnumAttr_t eAttr { .fieldMask = NVTX_PAYLOAD_ENUM_ATTR_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_NUM_ENTRIES | NVTX_PAYLOAD_ENUM_ATTR_SIZE | NVTX_PAYLOAD_ENUM_ATTR_SCHEMA_ID, diff --git a/src/misc/CMakeLists.txt b/src/misc/CMakeLists.txt new file mode 100644 index 000000000..984becc5f --- /dev/null +++ b/src/misc/CMakeLists.txt @@ -0,0 +1,20 @@ +# Misc sources +set(MISC_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/strongstream.cc + ${CMAKE_CURRENT_SOURCE_DIR}/socket.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ibvwrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/mlx5dvsymbols.cc + ${CMAKE_CURRENT_SOURCE_DIR}/mlx5dvwrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/cudawrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/param.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ipcsocket.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/shmutils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/nvmlwrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/argcheck.cc + ${CMAKE_CURRENT_SOURCE_DIR}/gdrwrap.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ibvsymbols.cc +) + +# Add misc sources to parent scope +set(MISC_SOURCES ${MISC_SOURCES} PARENT_SCOPE) diff --git a/src/misc/cudawrap.cc b/src/misc/cudawrap.cc index 5b66fea92..1ecb35fb2 100644 --- a/src/misc/cudawrap.cc +++ b/src/misc/cudawrap.cc @@ -9,6 +9,7 @@ #include "debug.h" #include "param.h" #include "cudawrap.h" +#include // This env var (NCCL_CUMEM_ENABLE) toggles cuMem API usage NCCL_PARAM(CuMemEnable, "CUMEM_ENABLE", -2); @@ -153,6 +154,12 @@ DECLARE_CUDA_PFN(cuMulticastCreate, 12010); DECLARE_CUDA_PFN(cuMulticastGetGranularity, 12010); DECLARE_CUDA_PFN(cuMulticastUnbind, 12010); #endif +/* Stream MemOp support */ +DECLARE_CUDA_PFN(cuStreamBatchMemOp, 11070); +DECLARE_CUDA_PFN(cuStreamWaitValue32, 11070); +DECLARE_CUDA_PFN(cuStreamWaitValue64, 11070); +DECLARE_CUDA_PFN(cuStreamWriteValue32, 11070); +DECLARE_CUDA_PFN(cuStreamWriteValue64, 11070); #endif #define CUDA_DRIVER_MIN_VERSION 11030 @@ -238,11 +245,17 @@ static ncclResult_t cudaPfnFuncLoader(void) { LOAD_SYM(cuMulticastGetGranularity, 12010, 1); LOAD_SYM(cuMulticastUnbind, 12010, 1); #endif +/* Stream MemOp support */ + LOAD_SYM(cuStreamBatchMemOp, 11070, 1); + LOAD_SYM(cuStreamWaitValue32, 11070, 1); + LOAD_SYM(cuStreamWaitValue64, 11070, 1); + LOAD_SYM(cuStreamWriteValue32, 11070, 1); + LOAD_SYM(cuStreamWriteValue64, 11070, 1); return ncclSuccess; } #endif -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static ncclResult_t initResult; static void initOnceFunc() { @@ -295,6 +308,6 @@ static void initOnceFunc() { } ncclResult_t ncclCudaLibraryInit() { - pthread_once(&initOnceControl, initOnceFunc); + std::call_once(initOnceFlag, initOnceFunc); return initResult; } diff --git a/src/misc/gdrwrap.cc b/src/misc/gdrwrap.cc index 3b46759c6..cef254cf2 100644 --- a/src/misc/gdrwrap.cc +++ b/src/misc/gdrwrap.cc @@ -5,6 +5,7 @@ ************************************************************************/ #include "gdrwrap.h" +#include #ifndef GDR_DIRECT #include "core.h" @@ -47,7 +48,7 @@ pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER; *cast = tmp; \ } while (0) -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static ncclResult_t initResult; static void initOnceFunc(void) { @@ -97,7 +98,7 @@ static void initOnceFunc(void) { ncclResult_t wrap_gdr_symbols(void) { - pthread_once(&initOnceControl, initOnceFunc); + std::call_once(initOnceFlag, initOnceFunc); return 
initResult; } diff --git a/src/misc/ibvwrap.cc b/src/misc/ibvwrap.cc index 59f52e320..6d6586e78 100644 --- a/src/misc/ibvwrap.cc +++ b/src/misc/ibvwrap.cc @@ -7,6 +7,7 @@ #include "ibvwrap.h" #include #include +#include #ifdef NCCL_BUILD_RDMA_CORE #include @@ -15,12 +16,12 @@ #endif #include "ibvsymbols.h" -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static ncclResult_t initResult; struct ncclIbvSymbols ibvSymbols; ncclResult_t wrap_ibv_symbols(void) { - pthread_once(&initOnceControl, + std::call_once(initOnceFlag, [](){ initResult = buildIbvSymbols(&ibvSymbols); }); return initResult; } diff --git a/src/misc/mlx5dvwrap.cc b/src/misc/mlx5dvwrap.cc index 930ed5d2e..af4f41dff 100644 --- a/src/misc/mlx5dvwrap.cc +++ b/src/misc/mlx5dvwrap.cc @@ -7,6 +7,7 @@ #include "mlx5/mlx5dvwrap.h" #include #include +#include #ifdef NCCL_BUILD_MLX5DV #include @@ -15,12 +16,12 @@ #endif #include "mlx5/mlx5dvsymbols.h" -static pthread_once_t initOnceControl = PTHREAD_ONCE_INIT; +static std::once_flag initOnceFlag; static ncclResult_t initResult; struct ncclMlx5dvSymbols mlx5dvSymbols; ncclResult_t wrap_mlx5dv_symbols(void) { - pthread_once(&initOnceControl, + std::call_once(initOnceFlag, [](){ initResult = buildMlx5dvSymbols(&mlx5dvSymbols); }); return initResult; } @@ -28,7 +29,7 @@ ncclResult_t wrap_mlx5dv_symbols(void) { /* CHECK_NOT_NULL: helper macro to check for NULL symbol */ #define CHECK_NOT_NULL(container, internal_name) \ if (container.internal_name == NULL) { \ - WARN("lib wrapper not initialized."); \ + WARN("NET/MLX5: lib wrapper not initialized."); \ return ncclInternalError; \ } @@ -36,16 +37,7 @@ ncclResult_t wrap_mlx5dv_symbols(void) { CHECK_NOT_NULL(container, internal_name); \ retval = container.call; \ if (retval == error_retval) { \ - WARN("Call to " name " failed with error %s", strerror(errno)); \ - return ncclSystemError; \ - } \ - return ncclSuccess; - -#define MLX5DV_INT_CHECK_RET_ERRNO(container, internal_name, call, success_retval, name) \ - CHECK_NOT_NULL(container, internal_name); \ - int ret = container.call; \ - if (ret != success_retval) { \ - INFO(NCCL_NET, "Call to " name " failed with error %s errno %d", strerror(ret), ret); \ + WARN("NET/MLX5: Call to " name " failed with error %s", strerror(errno)); \ return ncclSystemError; \ } \ return ncclSuccess; @@ -57,8 +49,14 @@ bool wrap_mlx5dv_is_supported(struct ibv_device *device) { return mlx5dvSymbols.mlx5dv_internal_is_supported(device); } -ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context *context, char *buf, size_t buf_len) { - MLX5DV_INT_CHECK_RET_ERRNO(mlx5dvSymbols, mlx5dv_internal_get_data_direct_sysfs_path, mlx5dv_internal_get_data_direct_sysfs_path(context, buf, buf_len), 0, "mlx5dv_get_data_direct_sysfs_path"); +ncclResult_t wrap_mlx5dv_get_data_direct_sysfs_path(struct ibv_context* context, char* buf, size_t buf_len) { + CHECK_NOT_NULL(mlx5dvSymbols, mlx5dv_internal_get_data_direct_sysfs_path); + int ret = mlx5dvSymbols.mlx5dv_internal_get_data_direct_sysfs_path(context, buf, buf_len); + if (ret == 0) return ncclSuccess; + /* ENODEV can happen if the devices is not data-direct but mlx5 is used. 
It's not an error*/ + if (ret == ENODEV) return ncclInvalidArgument; + INFO(NCCL_NET, "NET/MLX5: Call to mlx5dv_internal_get_data_direct_sysfs_path failed with error %s errno %d", strerror(ret), ret); + return ncclSystemError; } /* DMA-BUF support */ @@ -72,4 +70,4 @@ struct ibv_mr * wrap_direct_mlx5dv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t off return NULL; } return mlx5dvSymbols.mlx5dv_internal_reg_dmabuf_mr(pd, offset, length, iova, fd, access, mlx5_access); -} \ No newline at end of file +} diff --git a/src/misc/nvmlwrap.cc b/src/misc/nvmlwrap.cc index 66ba2d4c8..d26a6facf 100644 --- a/src/misc/nvmlwrap.cc +++ b/src/misc/nvmlwrap.cc @@ -41,6 +41,7 @@ namespace { NCCL_NVML_FN(nvmlDeviceGetFieldValues, nvmlReturn_t, (nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values)) // MNNVL support NCCL_NVML_FN(nvmlDeviceGetGpuFabricInfoV, nvmlReturn_t, (nvmlDevice_t device, nvmlGpuFabricInfoV_t *gpuFabricInfo)) + NCCL_NVML_FN(nvmlDeviceGetPlatformInfo, nvmlReturn_t, (nvmlDevice_t device, nvmlPlatformInfo_t *platfromInfo)) // CC support NCCL_NVML_FN(nvmlSystemGetConfComputeState, nvmlReturn_t, (nvmlConfComputeSystemState_t *state)); NCCL_NVML_FN(nvmlSystemGetConfComputeSettings, nvmlReturn_t, (nvmlSystemConfComputeSettings_t *setting)); @@ -95,6 +96,7 @@ ncclResult_t ncclNvmlEnsureInitialized() { {(void**)&pfn_nvmlDeviceGetFieldValues, "nvmlDeviceGetFieldValues"}, // MNNVL support {(void**)&pfn_nvmlDeviceGetGpuFabricInfoV, "nvmlDeviceGetGpuFabricInfoV"}, + {(void**)&pfn_nvmlDeviceGetPlatformInfo, "nvmlDeviceGetPlatformInfo"}, // CC support {(void**)&pfn_nvmlSystemGetConfComputeState, "nvmlSystemGetConfComputeState"}, {(void**)&pfn_nvmlSystemGetConfComputeSettings, "nvmlSystemGetConfComputeSettings"} @@ -298,6 +300,15 @@ ncclResult_t ncclNvmlDeviceGetGpuFabricInfoV(nvmlDevice_t device, nvmlGpuFabricI return ncclSuccess; } +ncclResult_t ncclNvmlDeviceGetPlatformInfo(nvmlDevice_t device, nvmlPlatformInfo_t *platformInfo) { + NCCLCHECK(ncclNvmlEnsureInitialized()); + std::lock_guard locked(lock); + platformInfo->version = nvmlPlatformInfo_v2; + NVMLTRY(nvmlDeviceGetPlatformInfo, device, platformInfo); + return ncclSuccess; +} + + ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { NCCLCHECK(ncclNvmlEnsureInitialized()); std::lock_guard locked(lock); @@ -314,6 +325,10 @@ ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { status->multiGpuProtectedPCIE = true; else status->multiGpuProtectedPCIE = false; + if (ccInfo.settingV12040.multiGpuMode == NVML_CC_SYSTEM_MULTIGPU_NVLE) + status->multiGpuNVLE = true; + else + status->multiGpuNVLE = false; } else if (pfn_nvmlSystemGetConfComputeState != NULL) { NVMLTRY(nvmlSystemGetConfComputeState, &ccInfo.settingV12020); if (ccInfo.settingV12020.ccFeature == NVML_CC_SYSTEM_FEATURE_ENABLED) @@ -321,9 +336,11 @@ ncclResult_t ncclNvmlGetCCStatus(struct ncclNvmlCCStatus *status) { else status->CCEnabled = false; status->multiGpuProtectedPCIE = false; + status->multiGpuNVLE = false; } else { status->CCEnabled = false; status->multiGpuProtectedPCIE = false; + status->multiGpuNVLE = false; } return ncclSuccess; } diff --git a/src/misc/param.cc b/src/misc/param.cc index d7c324fe9..9060b0066 100644 --- a/src/misc/param.cc +++ b/src/misc/param.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include const char* userHomeDir() { @@ -67,13 +68,13 @@ static void initEnvFunc() { } void initEnv() { - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once, initEnvFunc); + static std::once_flag once; + 
std::call_once(once, initEnvFunc); } void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache) { - static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&mutex); + static std::mutex mutex; + std::lock_guard lock(mutex); if (__atomic_load_n(cache, __ATOMIC_RELAXED) == uninitialized) { const char* str = ncclGetEnv(env); int64_t value = deftVal; @@ -89,7 +90,6 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6 } __atomic_store_n(cache, value, __ATOMIC_RELAXED); } - pthread_mutex_unlock(&mutex); } const char* ncclGetEnv(const char* name) { diff --git a/src/misc/shmutils.cc b/src/misc/shmutils.cc index eb9cd1015..59adedf24 100644 --- a/src/misc/shmutils.cc +++ b/src/misc/shmutils.cc @@ -114,8 +114,11 @@ ncclResult_t ncclShmOpen(char* shmPath, size_t shmPathSize, size_t shmSize, void } if (devShmPtr) { + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), ret, fail); CUDACHECKGOTO(cudaHostRegister((void*)hptr, realShmSize, cudaHostRegisterPortable | cudaHostRegisterMapped), ret, fail); CUDACHECKGOTO(cudaHostGetDevicePointer(&dptr, (void*)hptr, 0), ret, fail); + CUDACHECKGOTO(cudaThreadExchangeStreamCaptureMode(&mode), ret, fail); } shmHandleInit(fd, shmPath, shmSize, realShmSize, hptr, dptr, create, tmphandle); @@ -182,34 +185,36 @@ ncclResult_t ncclShmUnlink(ncclShmHandle_t handle) { ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize) { ncclResult_t ret = ncclSuccess; - int curRound; - size_t mycnt; + int nextRound = shmem->round + 1; + int curIndex = shmem->round % 2; + bool done; + int index = 0; + size_t maxTypeSize = shmem->maxTypeSize; - if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || shmem->maxTypeSize < typeSize) { + if (comm == NULL || shmem == NULL || sendbuff == NULL || recvbuff == NULL || maxTypeSize < typeSize) { ret = ncclInvalidArgument; goto exit; } - curRound = shmem->round; - memcpy((char*)shmem->ptr[curRound] + comm->localRank * typeSize, sendbuff, typeSize); - /* sync among local ranks */ - mycnt = __atomic_add_fetch(shmem->cnt[curRound], 1, __ATOMIC_ACQ_REL); - if (mycnt == comm->localRanks) { - *shmem->cnt[curRound ^ 1] = 0; /* prepare next round */ - __atomic_store_n(shmem->cnt[curRound], comm->localRanks + 1, __ATOMIC_RELEASE); /* release everyone */ - } else { - uint64_t t0 = clockNano(); - while(__atomic_load_n(shmem->cnt[curRound], __ATOMIC_ACQUIRE) != comm->localRanks + 1) { - if (clockNano() - t0 >= 5 * 1000) sched_yield(); - if (__atomic_load_n(comm->abortFlag, __ATOMIC_ACQUIRE) == 1) { - ret = ncclInternalError; - goto exit; + memcpy((char*)shmem->ptr[curIndex] + comm->localRank * maxTypeSize, sendbuff, typeSize); + /* reset the previous round and notify I arrive this round */ + __atomic_store_n((int*)((char*)shmem->cnt[curIndex] + CACHE_LINE_SIZE * comm->localRank), nextRound, __ATOMIC_RELEASE); + + do { + done = true; + for (int i = index; i < comm->localRanks; ++i) { + if (i != comm->localRank && __atomic_load_n((int*)((char*)shmem->cnt[curIndex] + CACHE_LINE_SIZE * i), __ATOMIC_ACQUIRE) < nextRound) { + done = false; + index = i; + break; } } - } + } while (!done); - memcpy(recvbuff, (const void*)shmem->ptr[curRound], comm->localRanks * typeSize); - shmem->round ^= 1; + for (int i = 0; i < comm->localRanks; ++i) { + memcpy((uint8_t*)recvbuff + i * typeSize, (uint8_t*)shmem->ptr[curIndex] + i * 
maxTypeSize, typeSize); + } + shmem->round = nextRound; exit: return ret; diff --git a/src/misc/socket.cc b/src/misc/socket.cc index d066d2829..5633fef3e 100644 --- a/src/misc/socket.cc +++ b/src/misc/socket.cc @@ -149,6 +149,9 @@ static ncclResult_t findInterfaces(const char* prefixList, char* names, union nc if (family != AF_INET && family != AF_INET6) continue; + /* Only consider running interfaces, i.e. UP and physically attached. */ + if (!(interface->ifa_flags & IFF_RUNNING)) continue; + TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, ncclSocketToString((union ncclSocketAddress *) interface->ifa_addr, line)); /* Allow the caller to force the socket family type */ @@ -377,11 +380,12 @@ ncclResult_t ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, NCCLCHECK(ncclFindInterfaceMatchSubnet(ifNames, ifAddrs, &idAddr, ifNameMaxSize, nIfs)); } } - // Then look for anything else (but not docker or lo) - if (*nIfs == 0) NCCLCHECK(findInterfaces("^docker,lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); + // Then look for anything else (but not docker,lo, or virtual) + if (*nIfs == 0) NCCLCHECK(findInterfaces("^docker,lo,virbr", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); // Finally look for docker, then lo. if (*nIfs == 0) NCCLCHECK(findInterfaces("docker", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); if (*nIfs == 0) NCCLCHECK(findInterfaces("lo", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); + if (*nIfs == 0) NCCLCHECK(findInterfaces("virbr", ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs, nIfs)); } return ncclSuccess; } diff --git a/src/misc/strongstream.cc b/src/misc/strongstream.cc index 1766f4167..d92b506cb 100644 --- a/src/misc/strongstream.cc +++ b/src/misc/strongstream.cc @@ -8,6 +8,7 @@ #include "cudawrap.h" #include "checks.h" #include "param.h" +#include #if CUDART_VERSION >= 13000 #define cudaStreamGetCaptureInfo_v3 cudaStreamGetCaptureInfo @@ -27,14 +28,14 @@ struct ncclStrongStreamCapture { //////////////////////////////////////////////////////////////////////////////// static ncclCudaContext* cxtListHead = nullptr; -static pthread_mutex_t cxtListLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex cxtListMutex; ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { ncclResult_t result = ncclSuccess; CUcontext hcontext; CUCHECK(cuCtxGetCurrent(&hcontext)); - pthread_mutex_lock(&cxtListLock); + std::lock_guard lock(cxtListMutex); struct ncclCudaContext* p = cxtListHead; while (1) { if (p == nullptr) { @@ -53,13 +54,12 @@ ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { p = p->next; } leave: - pthread_mutex_unlock(&cxtListLock); *out = p; return ncclSuccess; } void ncclCudaContextDrop(struct ncclCudaContext* cxt) { - pthread_mutex_lock(&cxtListLock); + std::lock_guard lock(cxtListMutex); if (0 == --cxt->refCount) { struct ncclCudaContext** pp = &cxtListHead; while (*pp != cxt) pp = &(*pp)->next; @@ -68,7 +68,6 @@ void ncclCudaContextDrop(struct ncclCudaContext* cxt) { ncclStrongStreamDestruct(&cxt->launchOrder); free(cxt); } - pthread_mutex_unlock(&cxtListLock); } //////////////////////////////////////////////////////////////////////////////// diff --git a/src/misc/utils.cc b/src/misc/utils.cc index bb59947e4..7e7179411 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -10,6 +10,7 @@ #include "nvmlwrap.h" #include +#include // Get current Compute Capability int ncclCudaCompCap() { @@ -107,8 +108,8 @@ static void getHostHashOnce() { 
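The findInterfaces() change above now skips interfaces that are not RUNNING and demotes virbr* bridges to the same last-resort tier as docker* and lo. A simplified, standalone sketch of that selection rule (not the actual NCCL code, which also honors prefix lists and a forced address family):

#include <ifaddrs.h>
#include <net/if.h>
#include <sys/socket.h>
#include <cstdio>
#include <cstring>

// Print interfaces that would survive the first-pass filter: an IPv4/IPv6
// address, RUNNING (up and physically attached), and not a docker/lo/virbr name.
int listPreferredInterfaces() {
  struct ifaddrs* all = nullptr;
  if (getifaddrs(&all) != 0) return -1;
  for (struct ifaddrs* it = all; it != nullptr; it = it->ifa_next) {
    if (it->ifa_addr == nullptr) continue;
    int family = it->ifa_addr->sa_family;
    if (family != AF_INET && family != AF_INET6) continue;
    if (!(it->ifa_flags & IFF_RUNNING)) continue;            // skip down/detached NICs
    if (strncmp(it->ifa_name, "docker", 6) == 0) continue;   // container bridge
    if (strncmp(it->ifa_name, "virbr", 5) == 0) continue;    // libvirt bridge
    if (strcmp(it->ifa_name, "lo") == 0) continue;           // loopback
    printf("candidate interface: %s\n", it->ifa_name);
  }
  freeifaddrs(all);
  return 0;
}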
hostHashValue = getHash(hostHash, strlen(hostHash)); } uint64_t getHostHash(void) { - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once, getHostHashOnce); + static std::once_flag once; + std::call_once(once, getHostHashOnce); return hostHashValue; } @@ -289,3 +290,28 @@ void ncclMemoryStackDestruct(struct ncclMemoryStack* me) { h = h1; } } + +/* return concatenated string representing each set bit */ +ncclResult_t ncclBitsToString(uint32_t bits, uint32_t mask, const char* (*toStr)(int), char *buf, size_t bufLen, const char *wildcard) { + if (!buf || !bufLen) + return ncclInvalidArgument; + + bits &= mask; + + // print wildcard value if all bits set + if (wildcard && bits == mask) { + snprintf(buf, bufLen, "%s", wildcard); + return ncclSuccess; + } + + // Add each set bit to string + int pos = 0; + for (int i = 0; bits; i++, bits >>= 1) { + if (bits & 1) { + if (pos > 0) pos += snprintf(buf + pos, bufLen - pos, "|"); + pos += snprintf(buf + pos, bufLen - pos, "%s", toStr(i)); + } + } + + return ncclSuccess; +} diff --git a/src/mnnvl.cc b/src/mnnvl.cc index 34a18b80a..fb41106ab 100644 --- a/src/mnnvl.cc +++ b/src/mnnvl.cc @@ -36,7 +36,11 @@ ncclResult_t ncclMnnvlCheck(struct ncclComm* comm) { nvmlGpuFabricInfoV_t *fabricInfo2 = &comm->peerInfo[i].fabricInfo; // Check if the cluster UUID and cliqueId match // A zero UUID means we don't have MNNVL fabric info - disable MNNVL - if ((((long *)&fabricInfo2->clusterUuid)[0]|((long *)fabricInfo2->clusterUuid)[1]) == 0) return ncclSuccess; + unsigned long uuid0 = 0; + unsigned long uuid1 = 0; + memcpy(&uuid0, fabricInfo2->clusterUuid, sizeof(uuid0)); + memcpy(&uuid1, fabricInfo2->clusterUuid + sizeof(uuid0), sizeof(uuid1)); + if ((uuid0 | uuid1) == 0) return ncclSuccess; if ((memcmp(fabricInfo1->clusterUuid, fabricInfo2->clusterUuid, NVML_GPU_FABRIC_UUID_LEN) == 0) && (fabricInfo1->cliqueId == fabricInfo2->cliqueId)) { if (i == comm->rank) { diff --git a/src/nccl.h.in b/src/nccl.h.in index 292a83914..0c53c826e 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -12,7 +12,7 @@ #if CUDART_VERSION >= 11000 #include #endif -#if CUDART_VERSION >= 11080 +#if __cplusplus && CUDART_VERSION >= 11080 #include #endif @@ -29,9 +29,10 @@ extern "C" { #endif #include + /* Opaque handle to communicator */ typedef struct ncclComm* ncclComm_t; -typedef struct ncclWindow* ncclWindow_t; +typedef struct ncclWindow_vidmem* ncclWindow_t; #define NCCL_COMM_NULL NULL #define NCCL_UNIQUE_ID_BYTES 128 @@ -57,9 +58,12 @@ typedef enum { ncclSuccess = 0, #define NCCL_WIN_DEFAULT 0x00 #define NCCL_WIN_COLL_SYMMETRIC 0x01 +#define NCCL_WIN_REQUIRED_ALIGNMENT 4096 + /* NCCL performance policy */ #define NCCL_CTA_POLICY_DEFAULT 0x00 #define NCCL_CTA_POLICY_EFFICIENCY 0x01 +#define NCCL_CTA_POLICY_ZERO 0x02 /* ncclCommShrink flags*/ #define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */ @@ -67,7 +71,7 @@ typedef enum { ncclSuccess = 0, /* Communicator configuration. Users can assign value to attributes to specify the * behavior of a communicator. */ -typedef struct ncclConfig_v22700 { +typedef struct ncclConfig_v22800 { /* attributes that users should never touch. */ size_t size; unsigned int magic; @@ -85,6 +89,8 @@ typedef struct ncclConfig_v22700 { int CTAPolicy; int shrinkShare; int nvlsCTAs; + int nChannelsPerNetPeer; + int nvlinkCentricSched; } ncclConfig_t; /* Config initializer must be assigned to initialize config structure when it is created. 
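The ncclMnnvlCheck() change above replaces a long*-cast read of the 16-byte cluster UUID with two memcpy calls into local integers; the cast form risks a misaligned, aliasing-unsafe access, while memcpy compiles to ordinary loads. A generic sketch of the safe form:

#include <cstdint>
#include <cstring>

// Returns true when all 16 UUID bytes are zero, reading them through memcpy
// so the check is valid regardless of the field's alignment.
static bool uuidIsZero(const unsigned char uuid[16]) {
  uint64_t lo = 0, hi = 0;
  std::memcpy(&lo, uuid, sizeof(lo));
  std::memcpy(&hi, uuid + sizeof(lo), sizeof(hi));
  return (lo | hi) == 0;
}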
@@ -105,6 +111,8 @@ typedef struct ncclConfig_v22700 { NCCL_CONFIG_UNDEF_INT, /* CTAPolicy */ \ NCCL_CONFIG_UNDEF_INT, /* shrinkShare */ \ NCCL_CONFIG_UNDEF_INT, /* nvlsCTAs */ \ + NCCL_CONFIG_UNDEF_INT, /* nChannelsPerNetPeer */ \ + NCCL_CONFIG_UNDEF_INT, /* nvlinkCentricSched */ \ } /* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */ @@ -220,7 +228,9 @@ const char* ncclGetLastError(ncclComm_t comm); const char* pncclGetLastError(ncclComm_t comm); /* Reload environment variables that determine logging. */ +__attribute__ ((deprecated("ncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future"))) void ncclResetDebugInit(); +__attribute__ ((deprecated("pncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future"))) void pncclResetDebugInit(); /* Checks whether the comm has encountered any asynchronous errors */ @@ -427,6 +437,49 @@ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +/* + * All-to-All + * + * Each device sends count values to all other devices and receives count values + * from all other devices. Data to send to destination rank j is taken from + * sendbuff+j*count and data received from source rank i is placed at + * recvbuff+i*count. + */ +ncclResult_t ncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); + +/* + * Gather + * + * Each rank sends count elements from sendbuff to the root rank. + * On the root rank, data from rank i is placed at recvbuff + i*count. + * On non-root ranks, recvbuff is not used. + * root is the rank where data will be gathered. + * + * In-place operations will happen if sendbuff == recvbuff + root * count. + */ +ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); + +/* + * Scatter + * + * On the root rank, count elements from sendbuff+i*count are sent to rank i. + * On non-root ranks, sendbuff is not used. + * Each rank receives count elements into recvbuff. + * root is the rank that will distribute the data. + * + * In-place operations will happen if recvbuff == sendbuff + root * count. 
+ */ +ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); +ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t count, + ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); + /* * Send * diff --git a/src/nccl_device/CMakeLists.txt b/src/nccl_device/CMakeLists.txt new file mode 100644 index 000000000..9d0c3d100 --- /dev/null +++ b/src/nccl_device/CMakeLists.txt @@ -0,0 +1,9 @@ +# Register sources +set(SYM_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/core.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ll_a2a.cc + ${CMAKE_CURRENT_SOURCE_DIR}/mem_barrier.cc +) + +# Add register sources to parent scope +set(SYM_SOURCES ${SYM_SOURCES} PARENT_SCOPE) diff --git a/src/nccl_device/core.cc b/src/nccl_device/core.cc new file mode 100644 index 000000000..bae6b39bf --- /dev/null +++ b/src/nccl_device/core.cc @@ -0,0 +1,57 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "comm.h" +#include "nccl_device/impl/core__funcs.h" + +NCCL_API(ncclTeam_t, ncclTeamWorld, ncclComm_t comm); +ncclTeam_t ncclTeamWorld(ncclComm_t comm) { + ncclTeam_t ans; + ans.nRanks = comm->nRanks; + ans.rank = comm->rank; + ans.stride = 1; + return ans; +} + +NCCL_API(ncclTeam_t, ncclTeamLsa, ncclComm_t comm); +ncclTeam_t ncclTeamLsa(ncclComm_t comm) { + // Ignoring errors since if it fails ncclDevrInitOnce will try again. + // The returned team will be junk and the next "interesting" API call that + // needs ncclDevrInitOnce will report the error. + if (ncclSuccess != ncclDevrInitOnce(comm)) return ncclTeam_t{}; + + ncclTeam_t ans; + ans.nRanks = comm->devrState.lsaSize; + ans.rank = comm->devrState.lsaSelf; + ans.stride = 1; + return ans; +} + +NCCL_API(ncclTeam_t, ncclTeamRail, ncclComm_t comm); +ncclTeam_t ncclTeamRail(ncclComm_t comm) { + // Ignoring errors as above. + if (ncclSuccess != ncclDevrInitOnce(comm)) return ncclTeam_t{}; + + ncclTeam_t ans; + ans.nRanks = comm->nRanks/comm->devrState.lsaSize; + ans.rank = comm->rank/comm->devrState.lsaSize; + ans.stride = comm->devrState.lsaSize; + return ans; +} + +NCCL_API(int, ncclTeamRankToWorld, ncclComm_t comm, ncclTeam_t team, int rank); +int ncclTeamRankToWorld(ncclComm_t comm, ncclTeam_t team, int rank) { + return comm->rank + (rank - team.rank)*team.stride; +} + +NCCL_API(int, ncclTeamRankToLsa, ncclComm_t comm, ncclTeam_t team, int rank); +int ncclTeamRankToLsa(ncclComm_t comm, ncclTeam_t team, int rank) { + // Ignoring errors as above. + if (ncclSuccess != ncclDevrInitOnce(comm)) return -1; + + return comm->devrState.lsaSelf + (rank - team.rank)*team.stride; +} diff --git a/src/nccl_device/ll_a2a.cc b/src/nccl_device/ll_a2a.cc new file mode 100644 index 000000000..6a51d0f2b --- /dev/null +++ b/src/nccl_device/ll_a2a.cc @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
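Below is a minimal usage sketch for the AlltoAll/Gather/Scatter entry points declared above, assuming an initialized communicator, a CUDA stream, and device buffers sized count elements per peer (nRanks*count in total where a full vector is exchanged); error handling is reduced to returning the first failure:

#include <cuda_runtime.h>
#include "nccl.h"

ncclResult_t runNewCollectives(ncclComm_t comm, cudaStream_t stream,
                               const float* send, float* recv,
                               size_t count, int root) {
  ncclResult_t res;
  // All-to-all: slot j of send goes to rank j; slot i of recv comes from rank i.
  res = ncclAlltoAll(send, recv, count, ncclFloat, comm, stream);
  if (res != ncclSuccess) return res;
  // Gather: every rank contributes count elements; only root's recv is written.
  res = ncclGather(send, recv, count, ncclFloat, root, comm, stream);
  if (res != ncclSuccess) return res;
  // Scatter: root's send holds nRanks*count elements; each rank receives count.
  res = ncclScatter(send, recv, count, ncclFloat, root, comm, stream);
  return res;
}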
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "nccl_device/impl/ll_a2a__funcs.h" + +NCCL_API(int, ncclLLA2ACalcSlots, int maxElts, int maxEltSize); +int ncclLLA2ACalcSlots(int maxElts, int maxEltSize) { + return maxElts*divUp(maxEltSize, 8); +} + +NCCL_API(ncclResult_t, ncclLLA2ACreateRequirement, int nBlocks, int nSlots, ncclLLA2AHandle_t* outHandle, ncclDevResourceRequirements_t* outReq); +ncclResult_t ncclLLA2ACreateRequirement( + int nBlocks, int nSlots, ncclLLA2AHandle_t* outHandle, + ncclDevResourceRequirements_t* outReq + ) { + outHandle->nSlots = nSlots; + memset(outReq, 0, sizeof(*outReq)); + outReq->bufferSize = nBlocks*(1 + 2*nSlots)*16; + outReq->bufferAlign = 16; + outReq->outBufferHandle = &outHandle->bufHandle; + return ncclSuccess; +} diff --git a/src/nccl_device/mem_barrier.cc b/src/nccl_device/mem_barrier.cc new file mode 100644 index 000000000..b6c400fa4 --- /dev/null +++ b/src/nccl_device/mem_barrier.cc @@ -0,0 +1,21 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" +#include "nccl_device/impl/mem_barrier__funcs.h" + +NCCL_API(ncclResult_t, ncclLsaBarrierCreateRequirement, ncclTeam_t team, int nBarriers, ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq); +ncclResult_t ncclLsaBarrierCreateRequirement( + ncclTeam_t team, int nBarriers, ncclLsaBarrierHandle_t* outHandle, + ncclDevResourceRequirements_t* outReq + ) { + memset(outReq, 0, sizeof(*outReq)); + outHandle->nBarriers = nBarriers; + outReq->bufferSize = (3*nBarriers + nBarriers*team.nRanks)*sizeof(uint32_t); + outReq->bufferAlign = alignof(uint32_t); + outReq->outBufferHandle = &outHandle->bufHandle; + return ncclSuccess; +} diff --git a/src/plugin/CMakeLists.txt b/src/plugin/CMakeLists.txt new file mode 100644 index 000000000..2ef9282f6 --- /dev/null +++ b/src/plugin/CMakeLists.txt @@ -0,0 +1,18 @@ +# Add plugin subdirectories +add_subdirectory(profiler) +add_subdirectory(net) +add_subdirectory(tuner) + +# Plugin sources +set(PLUGIN_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/net.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler.cc + ${CMAKE_CURRENT_SOURCE_DIR}/plugin_open.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuner.cc + ${PLUGIN_NET_SOURCES} + ${PLUGIN_PROFILER_SOURCES} + ${PLUGIN_TUNER_SOURCES} +) + +# Add plugin sources to parent scope +set(PLUGIN_SOURCES ${PLUGIN_SOURCES} PARENT_SCOPE) diff --git a/src/plugin/net.cc b/src/plugin/net.cc index aa80c12ab..6abd0804d 100644 --- a/src/plugin/net.cc +++ b/src/plugin/net.cc @@ -12,6 +12,7 @@ #include #include +#include //#include //#include //#include @@ -24,17 +25,19 @@ extern getNcclNet_t getNcclNet_v7; extern getNcclNet_t getNcclNet_v8; extern getNcclNet_t getNcclNet_v9; extern getNcclNet_t getNcclNet_v10; +extern getNcclNet_t getNcclNet_v11; extern getNcclCollNet_t getNcclCollNet_v6; extern getNcclCollNet_t getNcclCollNet_v7; extern getNcclCollNet_t getNcclCollNet_v8; extern getNcclCollNet_t getNcclCollNet_v9; extern getNcclCollNet_t getNcclCollNet_v10; +extern getNcclCollNet_t getNcclCollNet_v11; -NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1); -#define NCCL_NET_VERSION_COUNT 5 -int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {10, 9, 8, 7, 6}; -getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = 
{getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6}; -getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6}; +NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 0); +#define NCCL_NET_VERSION_COUNT 6 +int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {11, 10, 9, 8, 7, 6}; +getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = {getNcclNet_v11, getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6}; +getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v11, getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6}; #define NCCL_NET_NUM_INTERNAL_PLUGINS 2 @@ -56,19 +59,27 @@ typedef struct netPluginLib { ncclNetPluginState_t ncclNetPluginState; // State of the nccl net plugin ncclNetPluginState_t ncclCollNetPluginState; // State of the nccl coll net plugin int ncclNetPluginRefCount; // Reference count for the nccl net plugin + int netPhysDevs; // ncclNet - number of physical devices + int netVirtDevs; // ncclNet - number of virtual devices + int collNetPhysDevs; // ncclCollNet - number of physical devices + int collNetVirtDevs; // ncclCollNet - number of virtual devices } netPluginLib_t; int pluginCount = 0; bool netPluginLibsInitialized = false; netPluginLib_t netPluginLibs[NCCL_NET_MAX_PLUGINS] = { 0 }; -static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER; -static pthread_once_t initPluginLibsOnceControl = PTHREAD_ONCE_INIT; +static std::mutex netPluginMutex; +static std::once_flag initPluginLibsOnceFlag; static ncclResult_t ncclNetPluginUnload(netPluginLib_t* pluginLib) { if ((pluginLib->dlHandle) && ((pluginLib->ncclNetPluginRefCount) == 0)) { INFO(NCCL_INIT|NCCL_NET, "Unloading plugin %s", pluginLib->name); NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle, ncclPluginTypeNet)); + // memset will reset the status to ncllNetPluginStateLoadReady memset(pluginLib, 0, sizeof(netPluginLib_t)); + // reset the count of devices to UNDEF_DEV_COUNT + pluginLib->netPhysDevs = pluginLib->netVirtDevs = NCCL_UNDEF_DEV_COUNT; + pluginLib->collNetPhysDevs = pluginLib->collNetVirtDevs = NCCL_UNDEF_DEV_COUNT; } return ncclSuccess; } @@ -85,11 +96,15 @@ static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) { } // if we fail to find a net, exit - if (pluginLib->ncclNet == nullptr) goto fail; + if (pluginLib->ncclNet == nullptr) { + INFO(NCCL_INIT|NCCL_NET, "External network plugin %s is unsupported", + (ncclPluginLibPaths[ncclPluginTypeNet] ? ncclPluginLibPaths[ncclPluginTypeNet] : pluginLib->name)); + goto fail; + } pluginLib->ncclNetPluginState = ncclNetPluginStateInitReady; - // load ncclColNet + // load ncclCollNet for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) { pluginLib->ncclCollNet = getNcclCollNet[i](pluginLib->dlHandle); if (pluginLib->ncclCollNet) break; @@ -100,7 +115,8 @@ static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) { else pluginLib->ncclCollNetPluginState = ncclNetPluginStateInitReady; - INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external plugin %s", pluginLib->name); + INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external network plugin %s", + (ncclPluginLibPaths[ncclPluginTypeNet] ? 
ncclPluginLibPaths[ncclPluginTypeNet] : pluginLib->name)); exit: return ncclSuccess; fail: @@ -137,25 +153,35 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in return ncclSuccess; } -static ncclResult_t ncclNetPluginInit(netPluginLib_t* pluginLib) { +static ncclResult_t ncclNetPluginInit(struct ncclComm* comm, netPluginLib_t* pluginLib) { int ndev; - if (pluginLib->ncclNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclNet) { - if (pluginLib->ncclNet->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) goto fail; + if (pluginLib->ncclNetPluginState >= ncclNetPluginStateInitReady && pluginLib->ncclNet) { + ncclNetCommConfig_t commConfig = {}; + commConfig.trafficClass = comm->config.trafficClass == NCCL_CONFIG_UNDEF_INT ? NCCL_NET_TRAFFIC_CLASS_UNDEF : comm->config.trafficClass; + if (pluginLib->ncclNet->init(&comm->netContext, comm->commHash, &commConfig, ncclDebugLog, ncclProfilerCallback) != ncclSuccess) goto fail; if (pluginLib->ncclNet->devices(&ndev) != ncclSuccess || ndev <= 0) goto fail; + pluginLib->netPhysDevs = ndev; + pluginLib->netVirtDevs = NCCL_UNDEF_DEV_COUNT; } pluginLib->ncclNetPluginState = ncclNetPluginStateEnabled; INFO(NCCL_INIT|NCCL_NET, "Initialized NET plugin %s", pluginLib->ncclNet->name); - if (pluginLib->ncclCollNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclCollNet) { - if (pluginLib->ncclCollNet->init(ncclDebugLog) != ncclSuccess) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; + if (pluginLib->ncclCollNetPluginState >= ncclNetPluginStateInitReady && pluginLib->ncclCollNet) { + if (pluginLib->ncclCollNet->init(&comm->collNetContext, comm->commHash, ncclDebugLog) != ncclSuccess) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; else if (pluginLib->ncclCollNet->devices(&ndev) != ncclSuccess || ndev <= 0) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; else { + pluginLib->collNetPhysDevs = ndev; + pluginLib->collNetVirtDevs = NCCL_UNDEF_DEV_COUNT; pluginLib->ncclCollNetPluginState = ncclNetPluginStateEnabled; } } exit: return ncclSuccess; fail: + INFO(NCCL_INIT|NCCL_NET, "Failed to initialize NET plugin %s", pluginLib->ncclNet->name); + pluginLib->ncclNet->finalize(comm->netContext); + pluginLib->netPhysDevs = pluginLib->netVirtDevs = NCCL_UNDEF_DEV_COUNT; + pluginLib->collNetPhysDevs = pluginLib->collNetVirtDevs = NCCL_UNDEF_DEV_COUNT; pluginLib->ncclNetPluginState = ncclNetPluginStateDisabled; pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled; goto exit; @@ -214,6 +240,9 @@ static void initPluginLibsOnceFunc() { memset(netPluginLibs, 0, NCCL_NET_MAX_PLUGINS * sizeof(netPluginLib_t)); envNetPlugin = ncclGetEnv("NCCL_NET_PLUGIN"); if (envNetPlugin) { + INFO(NCCL_ENV|NCCL_NET, "NCCL_NET_PLUGIN set by environment to %s", envNetPlugin); + if (strcasecmp(envNetPlugin, "none") == 0) + envNetPlugin = ""; envNetPluginList = strdup(envNetPlugin); // Iterate over list until the list is empty netPluginName = strtok_r(envNetPluginList, ",", &savePtr); @@ -221,7 +250,7 @@ static void initPluginLibsOnceFunc() { // We have 2 internal plugins (ib and socket) // So, we can have at most( NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS)) in the NCCL_NET_PLUGIN list if (pluginCounter >= (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS))) { - INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains more than %d plugins, ignoring the rest", (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS + 1))); + INFO(NCCL_NET|NCCL_ENV,"NCCL_NET_PLUGIN list 
contains more than %d plugins, ignoring the rest", (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS + 1))); break; } // need to leave space for the name + "\n" @@ -231,7 +260,7 @@ static void initPluginLibsOnceFunc() { strcpy(netPluginLibs[pluginCounter].name, netPluginName); pluginCounter++; } else { - INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains a plugin name %s longer than %d characters, ignoring it.", netPluginName, MAX_STR_LEN); + INFO(NCCL_NET|NCCL_ENV,"NCCL_NET_PLUGIN list contains a plugin name %s longer than %d characters, ignoring it.", netPluginName, MAX_STR_LEN); } netPluginName = strtok_r(nullptr, ",", &savePtr); } @@ -253,14 +282,14 @@ static void initPluginLibsOnceFunc() { ncclResult_t ncclNetInit(struct ncclComm* comm) { bool ncclNetPluginInitialized = false; - pthread_once(&initPluginLibsOnceControl, initPluginLibsOnceFunc); - pthread_mutex_lock(&netPluginLock); + std::call_once(initPluginLibsOnceFlag, initPluginLibsOnceFunc); + std::lock_guard lock(netPluginMutex); for (int pluginIndex = 0; pluginIndex < pluginCount; pluginIndex++) { if ((pluginIndex < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) && (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateLoadReady)) { NCCLCHECK(ncclNetPluginLoad(&netPluginLibs[pluginIndex])); } - if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateInitReady) { - NCCLCHECK(ncclNetPluginInit(&netPluginLibs[pluginIndex])); + if (netPluginLibs[pluginIndex].ncclNetPluginState >= ncclNetPluginStateInitReady) { + NCCLCHECK(ncclNetPluginInit(comm, &netPluginLibs[pluginIndex])); } if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateEnabled) { bool isAssigned = false; @@ -273,7 +302,6 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) { } } } - pthread_mutex_unlock(&netPluginLock); if (ncclNetPluginInitialized) return ncclSuccess; WARN("Failed to initialize any NET plugin"); return ncclInvalidUsage; @@ -281,15 +309,60 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) { ncclResult_t ncclNetFinalize(struct ncclComm* comm) { int pluginIndex = comm->netPluginIndex; - pthread_mutex_lock(&netPluginLock); + std::lock_guard lock(netPluginMutex); + NCCLCHECK(comm->ncclNet->finalize(comm->netContext)); + if (comm->collNetContext) NCCLCHECK(comm->ncclCollNet->finalize(comm->collNetContext)); netPluginLibs[pluginIndex].ncclNetPluginRefCount--; for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) { NCCLCHECK(ncclNetPluginUnload(&netPluginLibs[i])); } - pthread_mutex_unlock(&netPluginLock); return ncclSuccess; } +ncclResult_t ncclNetGetDevCount(int netPluginIndex, int* nPhysDevs, int* nVirtDevs) { + if (netPluginLibs[netPluginIndex].ncclNetPluginState != ncclNetPluginStateEnabled || + netPluginLibs[netPluginIndex].netPhysDevs == NCCL_UNDEF_DEV_COUNT) goto fail; + // lock not needed as it's called within a lock already in ncclTopoGetSystem + *nPhysDevs = netPluginLibs[netPluginIndex].netPhysDevs; + *nVirtDevs = netPluginLibs[netPluginIndex].netVirtDevs; + return ncclSuccess; +fail: + WARN("%s: trying to access the number of devices of an uninitialized netPlugin[%d]", __func__, netPluginIndex); + return ncclInternalError; +} + +ncclResult_t ncclCollNetGetDevCount(int netPluginIndex, int* nPhysDevs, int* nVirtDevs) { + if (netPluginLibs[netPluginIndex].ncclCollNetPluginState != ncclNetPluginStateEnabled || + netPluginLibs[netPluginIndex].collNetPhysDevs == NCCL_UNDEF_DEV_COUNT) goto fail; + // lock not needed as it's called within a lock already in ncclTopoGetSystem + *nPhysDevs 
= netPluginLibs[netPluginIndex].collNetPhysDevs; + *nVirtDevs = netPluginLibs[netPluginIndex].collNetVirtDevs; + return ncclSuccess; +fail: + WARN("%s: trying to access the number of devices of an uninitialized netPlugin[%d]", __func__, netPluginIndex); + return ncclInternalError; +} + +ncclResult_t ncclNetSetVirtDevCount(int netPluginIndex, int nVirtDevs) { + if (netPluginLibs[netPluginIndex].ncclNetPluginState != ncclNetPluginStateEnabled || nVirtDevs < 0) goto fail; + // lock not needed as it's called within a lock already in ncclTopoGetSystem + netPluginLibs[netPluginIndex].netVirtDevs = nVirtDevs; + return ncclSuccess; +fail: + WARN("%s: failed to set the number of devices for netPlugin[%d] to %d", __func__, netPluginIndex,nVirtDevs); + return ncclInternalError; +} + +ncclResult_t ncclCollNetSetVirtDevCount(int netPluginIndex, int nVirtDevs) { + if (netPluginLibs[netPluginIndex].ncclCollNetPluginState != ncclNetPluginStateEnabled || nVirtDevs < 0) goto fail; + // lock not needed as it's called within a lock already in ncclTopoGetSystem + netPluginLibs[netPluginIndex].collNetVirtDevs = nVirtDevs; + return ncclSuccess; +fail: + WARN("%s: failed to set the number of devices for netPlugin[%d] to %d", __func__, netPluginIndex,nVirtDevs); + return ncclInternalError; +} + ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { constexpr int GPU_BUF_SIZE = 2*1024*1024; #if CUDART_VERSION >= 11030 @@ -324,7 +397,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { void* mHandle = NULL; ncclResult_t ret; ncclDebugNoWarn = NCCL_NET; - NCCLCHECKGOTO(comm->ncclNet->listen(dev, &handle, &lComm), ret, cleanup1); + NCCLCHECKGOTO(comm->ncclNet->listen(comm->netContext, dev, &handle, &lComm), ret, cleanup1); bool connected; connected = false; @@ -336,7 +409,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { } if (sComm == NULL) - NCCLCHECKGOTO(comm->ncclNet->connect(dev, NULL, &handle, &sComm, NULL), ret, cleanup2); + NCCLCHECKGOTO(comm->ncclNet->connect(comm->netContext, dev, &handle, &sComm, NULL), ret, cleanup2); if (rComm == NULL) NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2); diff --git a/src/plugin/net/CMakeLists.txt b/src/plugin/net/CMakeLists.txt new file mode 100644 index 000000000..0a6fcb237 --- /dev/null +++ b/src/plugin/net/CMakeLists.txt @@ -0,0 +1,12 @@ +# Net plugin sources +set(PLUGIN_NET_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/net_v9.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v6.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v7.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v8.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v10.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_v11.cc +) + +# Add net plugin sources to parent scope +set(PLUGIN_NET_SOURCES ${PLUGIN_NET_SOURCES} PARENT_SCOPE) diff --git a/src/plugin/net/net_v10.cc b/src/plugin/net/net_v10.cc index 682f239f7..591a57ac0 100644 --- a/src/plugin/net/net_v10.cc +++ b/src/plugin/net/net_v10.cc @@ -7,26 +7,203 @@ #include "nccl_net.h" #include "net_device.h" #include "proxy.h" +#include "checks.h" +#include +static ncclNet_t ncclNet; +static ncclCollNet_t ncclCollNet; static ncclNet_v10_t* ncclNet_v10; static ncclCollNet_v10_t* ncclCollNet_v10; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + +static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v10_t props_v10; + NCCLCHECK(ncclNet_v10->getProperties(dev, &props_v10)); + props->name = props_v10.name; + props->pciPath = 
props_v10.pciPath; + props->guid = props_v10.guid; + props->ptrSupport = props_v10.ptrSupport; + props->regIsGlobal = props_v10.regIsGlobal; + props->forceFlush = props_v10.forceFlush; + props->speed = props_v10.speed; + props->port = props_v10.port; + props->latency = props_v10.latency; + props->maxComms = props_v10.maxComms; + props->maxRecvs = props_v10.maxRecvs; + props->netDeviceType = props_v10.netDeviceType; + props->netDeviceVersion = props_v10.netDeviceVersion; + props->vProps.ndevs = props_v10.vProps.ndevs; + for (int i = 0; i < props->vProps.ndevs; i++) { + props->vProps.devs[i] = props_v10.vProps.devs[i]; + } + props->maxP2pBytes = props_v10.maxP2pBytes; + props->maxCollBytes = props_v10.maxCollBytes; + props->maxMultiRequestSize = 1; + return ncclSuccess; +} + +static ncclResult_t ncclNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclNet_v10->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx, int dev, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { + return ncclNet_v10->connect(dev, (ncclNetCommConfig_v10_t *)ctx, handle, sendComm, sendDevComm); +} + +static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_v11_t* props) { + return ncclNet_v10->makeVDevice(d, (ncclNetVDeviceProps_v10_t *)props); +} + +static ncclResult_t ncclNet_finalize(void* ctx) { + refCount[NET_INDEX]--; + free(ctx); + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx, uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config, ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + // since ncclNet_v11, the ncclNetCommConfig_t has been moved from connect to init. Since the config is per comm, + // this allows the config to be passed only once, instead of multiple times (once per connect). To preserve the + // ncclNet_v10 behavior, in the compat layer, we store the config in the context pointer and pass it to the connect + // function. + ncclNetCommConfig_v10_t* config_v10 = nullptr; + NCCLCHECK(ncclCalloc(&config_v10, 1)); + config_v10->trafficClass = config->trafficClass; + *ctx = config_v10; + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v10 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[NET_INDEX]++) return ncclSuccess; + NCCLCHECK(ncclNet_v10->init(logfn, proffn)); + ncclNet.devices = ncclNet_v10->devices; + ncclNet.getProperties = ncclNet_getProperties; + ncclNet.listen = ncclNet_listen; + ncclNet.connect = ncclNet_connect; + ncclNet.accept = ncclNet_v10->accept; + ncclNet.regMr = ncclNet_v10->regMr; + ncclNet.regMrDmaBuf = ncclNet_v10->regMrDmaBuf; + ncclNet.deregMr = ncclNet_v10->deregMr; + ncclNet.isend = ncclNet_v10->isend; + ncclNet.irecv = ncclNet_v10->irecv; + ncclNet.iflush = ncclNet_v10->iflush; + ncclNet.test = ncclNet_v10->test; + ncclNet.closeSend = ncclNet_v10->closeSend; + ncclNet.closeRecv = ncclNet_v10->closeRecv; + ncclNet.closeListen = ncclNet_v10->closeListen; + ncclNet.getDeviceMr = ncclNet_v10->getDeviceMr; + ncclNet.irecvConsumed = ncclNet_v10->irecvConsumed; + ncclNet.makeVDevice = (ncclNet_v10->makeVDevice) ? 
ncclNet_makeVDevice : nullptr; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; + return ncclSuccess; +} + ncclNet_t* getNcclNet_v10(void* lib) { ncclNet_v10 = (ncclNet_v10_t*)dlsym(lib, "ncclNetPlugin_v10"); if (ncclNet_v10) { + ncclNet.name = ncclNet_v10->name; + ncclNet.init = ncclNet_init; INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v10)", ncclNet_v10->name); - return ncclNet_v10; + return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v10 symbol."); return nullptr; } +static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { + ncclNetProperties_v10_t props_v10; + NCCLCHECK(ncclCollNet_v10->getProperties(dev, &props_v10)); + props->name = props_v10.name; + props->pciPath = props_v10.pciPath; + props->guid = props_v10.guid; + props->ptrSupport = props_v10.ptrSupport; + props->regIsGlobal = props_v10.regIsGlobal; + props->forceFlush = props_v10.forceFlush; + props->speed = props_v10.speed; + props->port = props_v10.port; + props->latency = props_v10.latency; + props->maxComms = props_v10.maxComms; + props->maxRecvs = props_v10.maxRecvs; + props->netDeviceType = props_v10.netDeviceType; + props->netDeviceVersion = props_v10.netDeviceVersion; + props->vProps.ndevs = props_v10.vProps.ndevs; + for (int i = 0; i < props->vProps.ndevs; i++) { + props->vProps.devs[i] = props_v10.vProps.devs[i]; + } + props->maxP2pBytes = props_v10.maxP2pBytes; + props->maxCollBytes = props_v10.maxCollBytes; + props->maxMultiRequestSize = 1; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle , void** listenComm) { + return ncclCollNet_v10->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + void* sendMhandle, void** request) { + return ncclCollNet_v10->iallgather(collComm, sendData, nRecvParts, (ncclNetSGE_v10_t*)recvParts, bytesPerRank, + windowOffset, windowBytes, sendMhandle, request); +} + +static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, ncclNetSGE_t* sendParts, void* recvData, + size_t bytesPerRank, size_t windowOffset, size_t windowBytes, + ncclDataType_t dataType, ncclRedOp_t redOp, + void* recvMhandle, void** request) { + return ncclCollNet_v10->ireducescatter(collComm, nSendParts, (ncclNetSGE_v10_t*)sendParts, recvData, bytesPerRank, + windowOffset, windowBytes, dataType, redOp, recvMhandle, request); +} + +static ncclResult_t ncclCollNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { + return ncclCollNet_v10->makeVDevice(d, (ncclNetVDeviceProps_v10_t *)props); +} + +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v10 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
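The v10 shim above bridges two plugin generations: the per-communicator config now handed to init is stashed in the opaque context and replayed on each connect, while a reference count keeps the underlying v10 plugin from being initialized or torn down more than once. A generic sketch of that bridging pattern (names are illustrative, not NCCL's):

#include <cstdlib>

// Illustrative compat-shim skeleton: new-style init(ctx, config) wrapping an
// old-style plugin that supports only one global init and takes its config at
// connect time instead.
struct ShimCtx { int trafficClass; };
static int gNetRefCount = 0;   // callers serialize init/finalize with a lock

static int shimInit(void** ctx, int trafficClass) {
  ShimCtx* c = (ShimCtx*)calloc(1, sizeof(ShimCtx));
  if (c == nullptr) return -1;
  c->trafficClass = trafficClass;     // kept for later connect() calls
  *ctx = c;
  if (gNetRefCount++ != 0) return 0;  // legacy plugin already initialized
  // ... call the legacy plugin's one-and-only global init() here ...
  return 0;
}

static int shimConnect(void* ctx, int dev) {
  ShimCtx* c = (ShimCtx*)ctx;
  (void)dev;
  // ... forward to the legacy connect(), passing c->trafficClass along ...
  return c ? 0 : -1;
}

static int shimFinalize(void* ctx) {
  free(ctx);
  --gNetRefCount;   // the legacy API has no global teardown to call
  return 0;
}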
+ if (refCount[COLLNET_INDEX]++) return ncclSuccess; + NCCLCHECK(ncclCollNet_v10->init(logfn)); + ncclCollNet.devices = ncclCollNet_v10->devices; + ncclCollNet.getProperties = ncclCollNet_getProperties; + ncclCollNet.listen = ncclCollNet_listen; + ncclCollNet.connect = ncclCollNet_v10->connect; + ncclCollNet.reduceSupport = ncclCollNet_v10->reduceSupport; + ncclCollNet.regMr = ncclCollNet_v10->regMr; + ncclCollNet.regMrDmaBuf = ncclCollNet_v10->regMrDmaBuf; + ncclCollNet.deregMr = ncclCollNet_v10->deregMr; + ncclCollNet.iallreduce = ncclCollNet_v10->iallreduce; + ncclCollNet.iallgather = ncclCollNet_iallgather; + ncclCollNet.ireducescatter = ncclCollNet_ireducescatter; + ncclCollNet.iflush = ncclCollNet_v10->iflush; + ncclCollNet.test = ncclCollNet_v10->test; + ncclCollNet.closeColl = ncclCollNet_v10->closeColl; + ncclCollNet.closeListen = ncclCollNet_v10->closeListen; + ncclCollNet.makeVDevice = ncclCollNet_makeVDevice; + ncclCollNet.finalize = ncclCollNet_finalize; + return ncclSuccess; +} + ncclCollNet_t* getNcclCollNet_v10(void* lib) { ncclCollNet_v10 = (ncclCollNet_v10_t*)dlsym(lib, "ncclCollNetPlugin_v10"); if (ncclCollNet_v10) { - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclNet_v10->name); - return ncclCollNet_v10; + ncclCollNet.name = ncclCollNet_v10->name; + ncclCollNet.init = ncclCollNet_init; + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v10)", ncclCollNet_v10->name); + return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v10 symbol."); return nullptr; } diff --git a/src/plugin/net/net_v11.cc b/src/plugin/net/net_v11.cc new file mode 100644 index 000000000..b13a0efb9 --- /dev/null +++ b/src/plugin/net/net_v11.cc @@ -0,0 +1,31 @@ +/************************************************************************* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "nccl_net.h" +#include "net_device.h" +#include "proxy.h" +#include + +static ncclNet_v11_t* ncclNet_v11; +static ncclCollNet_v11_t* ncclCollNet_v11; + +ncclNet_t* getNcclNet_v11(void* lib) { + ncclNet_v11 = (ncclNet_v11_t*)dlsym(lib, "ncclNetPlugin_v11"); + if (ncclNet_v11) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v11)", ncclNet_v11->name); + return ncclNet_v11; + } + return nullptr; +} + +ncclCollNet_t* getNcclCollNet_v11(void* lib) { + ncclCollNet_v11 = (ncclCollNet_v11_t*)dlsym(lib, "ncclCollNetPlugin_v11"); + if (ncclCollNet_v11) { + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v11)", ncclCollNet_v11->name); + return ncclCollNet_v11; + } + return nullptr; +} diff --git a/src/plugin/net/net_v6.cc b/src/plugin/net/net_v6.cc index baff67935..73eb8614d 100644 --- a/src/plugin/net/net_v6.cc +++ b/src/plugin/net/net_v6.cc @@ -8,12 +8,18 @@ #include "net_device.h" #include "proxy.h" #include "checks.h" +#include static ncclNet_t ncclNet; static ncclCollNet_t ncclCollNet; static ncclNet_v6_t* ncclNet_v6; static ncclCollNet_v6_t* ncclCollNet_v6; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclNet_v6->getProperties(dev, &p6); @@ -35,6 +41,7 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } @@ -43,7 +50,14 @@ static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, return ncclNet_v6->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +static ncclResult_t ncclNet_listen(void* ctx __attribute__((unused)), + int d, void* handle, void** listenComm) { + return ncclNet_v6->listen(d, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx __attribute__((unused)), + int dev, + void* handle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { return ncclNet_v6->connect(dev, handle, sendComm); } @@ -51,7 +65,9 @@ static ncclResult_t ncclNet_accept(void* listenComm, void** recvComm, ncclNetDev return ncclNet_v6->accept(listenComm, recvComm); } -static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, + void* pHandle __attribute__((unused)), + void** request) { int sizeInt; if (size > MAX_NET_SIZE) return ncclInternalError; sizeInt = (int)size; @@ -59,7 +75,9 @@ static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int t return ans; } -static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, + void** pHandles __attribute__((unused)), + void** request) { int sizesInt[NCCL_PROXY_MAX_SUBS]; //reset to nullptr if optional receive completion is set if (*request == (void 
*)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr; @@ -71,6 +89,11 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si return ans; } +static ncclResult_t ncclNet_finalize(void* ctx __attribute__((unused))) { + refCount[NET_INDEX]--; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v6_t p6; ncclResult_t ans = ncclCollNet_v6->getProperties(dev, &p6); @@ -92,9 +115,15 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int d, void* handle, void** listenComm) { + return ncclCollNet_v6->listen(d, handle, listenComm); +} + static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v6->regMr(comm, data, (int) size, type, mhandle); @@ -110,11 +139,24 @@ static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* return ans; } -static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config __attribute__((unused)), + ncclDebugLogger_t logfn, + ncclProfilerCallback_t proffn __attribute__((unused))) { + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v6 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[NET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclNet_v6->init(logfn)); ncclNet.devices = ncclNet_v6->devices; ncclNet.getProperties = ncclNet_getProperties; - ncclNet.listen = ncclNet_v6->listen; + ncclNet.listen = ncclNet_listen; ncclNet.connect = ncclNet_connect; ncclNet.accept = ncclNet_accept; ncclNet.regMr = ncclNet_regMr; @@ -130,6 +172,8 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = NULL; ncclNet.irecvConsumed = NULL; ncclNet.makeVDevice = NULL; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; return ncclSuccess; } @@ -141,15 +185,20 @@ ncclNet_t* getNcclNet_v6(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v6)", ncclNet_v6->name); return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v6 symbol."); return nullptr; } -static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v6 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
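Each legacy shim also widens the plugin's property struct to the current layout, copying the fields the old version reports and filling new fields with conservative defaults (a single-entry virtual-device list, maxMultiRequestSize of 1, legacy size limits). A condensed sketch of that translation, using hypothetical struct names in place of the versioned NCCL types:

#include <cstddef>

// Hypothetical stand-ins for an old and a new property layout.
struct OldProps { const char* name; int speed; int port; };
struct NewProps {
  const char* name; int speed; int port;
  int vDevCount; int vDevs[4];      // virtual-device list, absent in old versions
  size_t maxP2pBytes;               // explicit size limit, absent in old versions
  int maxMultiRequestSize;          // batching hint, absent in old versions
};

// Upgrade: copy what the old plugin reports, default everything it predates.
static void upgradeProps(int dev, const OldProps& oldp, NewProps* newp) {
  newp->name = oldp.name;
  newp->speed = oldp.speed;
  newp->port = oldp.port;
  newp->vDevCount = 1;                  // old plugins expose one physical device
  newp->vDevs[0] = dev;
  newp->maxP2pBytes = (size_t)1 << 31;  // legacy int-sized transfer ceiling
  newp->maxMultiRequestSize = 1;        // no multi-request support
}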
+ if (refCount[COLLNET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclCollNet_v6->init(logfn)); ncclCollNet.devices = ncclCollNet_v6->devices; ncclCollNet.getProperties = ncclCollNet_getProperties; - ncclCollNet.listen = ncclCollNet_v6->listen; + ncclCollNet.listen = ncclCollNet_listen; ncclCollNet.connect = ncclCollNet_v6->connect; ncclCollNet.reduceSupport = ncclCollNet_v6->reduceSupport; ncclCollNet.regMr = ncclCollNet_regMr; @@ -162,6 +211,8 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v6->test; ncclCollNet.closeColl = ncclCollNet_v6->closeColl; ncclCollNet.closeListen = ncclCollNet_v6->closeListen; + ncclCollNet.makeVDevice = NULL; + ncclCollNet.finalize = ncclCollNet_finalize; return ncclSuccess; } @@ -173,6 +224,5 @@ ncclCollNet_t* getNcclCollNet_v6(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v6)", ncclCollNet_v6->name); return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol."); return nullptr; } diff --git a/src/plugin/net/net_v7.cc b/src/plugin/net/net_v7.cc index 4bad5ec26..a13717294 100644 --- a/src/plugin/net/net_v7.cc +++ b/src/plugin/net/net_v7.cc @@ -8,12 +8,18 @@ #include "net_device.h" #include "proxy.h" #include "checks.h" +#include static ncclNet_t ncclNet; static ncclCollNet_t ncclCollNet; static ncclNet_v7_t* ncclNet_v7; static ncclCollNet_v7_t* ncclCollNet_v7; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclNet_v7->getProperties(dev, &p7); @@ -35,10 +41,18 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } -static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { +static ncclResult_t ncclNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclNet_v7->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx __attribute__((unused)), + int dev, + void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclNet_v7->connect(dev, handle, sendComm, sendDevComm); } @@ -47,7 +61,9 @@ static ncclResult_t ncclNet_regMr(void* comm, void* data, size_t size, int type, return ncclNet_v7->regMr(comm, data, (int) size, type, mhandle); } -static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, + void* pHandle __attribute__((unused)), + void** request) { int sizeInt; if (size > MAX_NET_SIZE) return ncclInternalError; sizeInt = (int)size; @@ -55,7 +71,9 @@ static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int t return ans; } -static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, + void** pHandles __attribute__((unused)), + void** request) { int sizesInt[NCCL_PROXY_MAX_SUBS]; //reset to nullptr if 
optional receive completion is set if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr; @@ -67,6 +85,11 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* si return ans; } +static ncclResult_t ncclNet_finalize(void* ctx __attribute__((unused))) { + refCount[NET_INDEX]--; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v7_t p7; ncclResult_t ans = ncclCollNet_v7->getProperties(dev, &p7); @@ -88,9 +111,15 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int d, void* handle, void** listenComm) { + return ncclCollNet_v7->listen(d, handle, listenComm); +} + static ncclResult_t ncclCollNet_regMr(void* comm, void* data, size_t size, int type, void** mhandle) { if (size >= 1UL<<31) return ncclInternalError; return ncclCollNet_v7->regMr(comm, data, (int) size, type, mhandle); @@ -106,11 +135,24 @@ static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* return ans; } -static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config __attribute__((unused)), + ncclDebugLogger_t logfn, + ncclProfilerCallback_t proffn __attribute__((unused))) { + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v7 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[NET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclNet_v7->init(logfn)); ncclNet.devices = ncclNet_v7->devices; ncclNet.getProperties = ncclNet_getProperties; // ncclNet_v5->getProperties; - ncclNet.listen = ncclNet_v7->listen; + ncclNet.listen = ncclNet_listen; ncclNet.connect = ncclNet_connect; ncclNet.accept = ncclNet_v7->accept; ncclNet.regMr = ncclNet_regMr; @@ -126,6 +168,8 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = ncclNet_v7->getDeviceMr; ncclNet.irecvConsumed = ncclNet_v7->irecvConsumed; ncclNet.makeVDevice = NULL; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; return ncclSuccess; } @@ -137,15 +181,20 @@ ncclNet_t* getNcclNet_v7(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v7)", ncclNet_v7->name); return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v7 symbol."); return nullptr; } -static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v7 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
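A note on the wrapper functions above (ncclNet_listen, ncclNet_connect, ncclNet_isend, ...): the newer API threads a context pointer and profiler handles through every call, which the v6/v7 plugins do not understand, so the shims drop those arguments and forward the rest. A standalone sketch of that adapter pattern, with made-up struct and function names:

// Hypothetical legacy vtable; only the adapter shape mirrors the patch.
struct LegacyNet {
  int (*listen)(int dev, void* handle, void** listenComm);
};

static LegacyNet* legacy;   // assumed to be filled in after dlsym()

// New-style entry point: the extra ctx parameter is ignored for legacy plugins.
static int net_listen(void* /*ctx*/, int dev, void* handle, void** listenComm) {
  return legacy->listen(dev, handle, listenComm);
}

// Tiny demo stub so the sketch is runnable on its own.
static int fakeListen(int, void*, void** comm) { *comm = nullptr; return 0; }

int main() {
  static LegacyNet impl = { fakeListen };
  legacy = &impl;
  void* lc;
  return net_listen(nullptr, 0, nullptr, &lc);
}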
+ if (refCount[COLLNET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclCollNet_v7->init(logfn)); ncclCollNet.devices = ncclCollNet_v7->devices; ncclCollNet.getProperties = ncclCollNet_getProperties; - ncclCollNet.listen = ncclCollNet_v7->listen; + ncclCollNet.listen = ncclCollNet_listen; ncclCollNet.connect = ncclCollNet_v7->connect; ncclCollNet.reduceSupport = ncclCollNet_v7->reduceSupport; ncclCollNet.regMr = ncclCollNet_regMr; @@ -158,6 +207,7 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v7->test; ncclCollNet.closeColl = ncclCollNet_v7->closeColl; ncclCollNet.closeListen = ncclCollNet_v7->closeListen; + ncclCollNet.finalize = ncclCollNet_finalize; return ncclSuccess; } @@ -169,6 +219,5 @@ ncclCollNet_t* getNcclCollNet_v7(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v7)", ncclCollNet_v7->name); return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v7 symbol."); return nullptr; } diff --git a/src/plugin/net/net_v8.cc b/src/plugin/net/net_v8.cc index b43bb895e..d241d5dc5 100644 --- a/src/plugin/net/net_v8.cc +++ b/src/plugin/net/net_v8.cc @@ -8,12 +8,18 @@ #include "net_device.h" #include "proxy.h" #include "checks.h" +#include static ncclNet_t ncclNet; static ncclCollNet_t ncclCollNet; static ncclNet_v8_t* ncclNet_v8; static ncclCollNet_v8_t* ncclCollNet_v8; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v8_t p8; ncclResult_t ans = ncclNet_v8->getProperties(dev, &p8); @@ -35,14 +41,24 @@ static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } -static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { +static ncclResult_t ncclNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclNet_v8->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx __attribute__((unused)), + int dev, + void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclNet_v8->connect(dev, handle, sendComm, sendDevComm); } -static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, + void* pHandle __attribute__((unused)), + void** request) { int sizeInt; if (size > MAX_NET_SIZE) return ncclInternalError; sizeInt = (int)size; @@ -50,7 +66,9 @@ static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int t return ans; } -static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, + void** pHandles __attribute__((unused)), + void** request) { int sizesInt[NCCL_PROXY_MAX_SUBS]; //reset to nullptr if optional receive completion is set if (*request == (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION) *request = nullptr; @@ -62,6 +80,11 @@ static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, 
size_t* si return ans; } +static ncclResult_t ncclNet_finalize(void* ctx __attribute__((unused))) { + refCount[NET_INDEX]--; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { ncclNetProperties_v8_t p8; ncclResult_t ans = ncclCollNet_v8->getProperties(dev, &p8); @@ -83,9 +106,15 @@ static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* prop props->vProps.devs[0] = dev; props->maxP2pBytes = MAX_NET_SIZE; props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclCollNet_v8->listen(dev, handle, listenComm); +} + static ncclResult_t ncclCollNet_iallreduce(void* collComm, void* sendData, void* recvData, size_t count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { int countInt; @@ -128,11 +157,23 @@ static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, n return ans; } -static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config __attribute__((unused)), + ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v8 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[NET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclNet_v8->init(logfn)); ncclNet.devices = ncclNet_v8->devices; ncclNet.getProperties = ncclNet_getProperties; - ncclNet.listen = ncclNet_v8->listen; + ncclNet.listen = ncclNet_listen; ncclNet.connect = ncclNet_connect; ncclNet.accept = ncclNet_v8->accept; ncclNet.regMr = ncclNet_v8->regMr; @@ -148,6 +189,8 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = ncclNet_v8->getDeviceMr; ncclNet.irecvConsumed = ncclNet_v8->irecvConsumed; ncclNet.makeVDevice = NULL; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; return ncclSuccess; } @@ -159,15 +202,20 @@ ncclNet_t* getNcclNet_v8(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v8)", ncclNet_v8->name); return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v8 symbol."); return nullptr; } -static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v8 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
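The regMr/isend shims above have to narrow 64-bit sizes to the int the older plugin ABI expects; oversized requests are rejected rather than silently truncated. A standalone sketch of that guard (the exact MAX_NET_SIZE value below is an assumption, not taken from the patch):

#include <cstddef>

// Assumed limit: the largest size that still fits the legacy int parameter.
constexpr std::size_t kMaxNetSize = (1UL << 31) - 1;

// Returns false if the request cannot be expressed on the old ABI;
// the caller maps that to ncclInternalError.
bool narrowSize(std::size_t size, int* sizeInt) {
  if (size > kMaxNetSize) return false;
  *sizeInt = static_cast<int>(size);
  return true;
}

int main() {
  int s;
  bool ok = narrowSize(1 << 20, &s);             // fits: ok == true, s == 1048576
  bool tooBig = narrowSize(kMaxNetSize + 1, &s); // rejected
  return (ok && !tooBig) ? 0 : 1;
}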
+ if (refCount[COLLNET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclCollNet_v8->init(logfn)); ncclCollNet.devices = ncclCollNet_v8->devices; ncclCollNet.getProperties = ncclCollNet_getProperties; - ncclCollNet.listen = ncclCollNet_v8->listen; + ncclCollNet.listen = ncclCollNet_listen; ncclCollNet.connect = ncclCollNet_v8->connect; ncclCollNet.reduceSupport = ncclCollNet_v8->reduceSupport; ncclCollNet.regMr = ncclCollNet_v8->regMr; @@ -180,6 +228,8 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v8->test; ncclCollNet.closeColl = ncclCollNet_v8->closeColl; ncclCollNet.closeListen = ncclCollNet_v8->closeListen; + ncclCollNet.makeVDevice = nullptr; + ncclCollNet.finalize = ncclCollNet_finalize; return ncclSuccess; } @@ -191,6 +241,5 @@ ncclCollNet_t* getNcclCollNet_v8(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v8)", ncclCollNet_v8->name); return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v8 symbol."); return nullptr; } diff --git a/src/plugin/net/net_v9.cc b/src/plugin/net/net_v9.cc index 34e039332..12011aa8c 100644 --- a/src/plugin/net/net_v9.cc +++ b/src/plugin/net/net_v9.cc @@ -8,25 +8,64 @@ #include "net_device.h" #include "proxy.h" #include "checks.h" +#include static ncclNet_t ncclNet; static ncclCollNet_t ncclCollNet; static ncclNet_v9_t* ncclNet_v9; static ncclCollNet_v9_t* ncclCollNet_v9; +#define NET_INDEX 0 +#define COLLNET_INDEX 1 +#define INDEX_NUMS 2 +static int refCount[INDEX_NUMS]; + static ncclResult_t ncclNet_getProperties(int dev, ncclNetProperties_t* props) { - return ncclNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); + ncclNetProperties_v9_t props_v9; + NCCLCHECK(ncclNet_v9->getProperties(dev, &props_v9)); + props->name = props_v9.name; + props->pciPath = props_v9.pciPath; + props->guid = props_v9.guid; + props->ptrSupport = props_v9.ptrSupport; + props->regIsGlobal = props_v9.regIsGlobal; + props->forceFlush = props_v9.forceFlush; + props->speed = props_v9.speed; + props->port = props_v9.port; + props->latency = props_v9.latency; + props->maxComms = props_v9.maxComms; + props->maxRecvs = props_v9.maxRecvs; + props->netDeviceType = props_v9.netDeviceType; + props->netDeviceVersion = props_v9.netDeviceVersion; + props->vProps.ndevs = props_v9.vProps.ndevs; + for (int i = 0; i < props->vProps.ndevs; i++) { + props->vProps.devs[i] = props_v9.vProps.devs[i]; + } + props->maxP2pBytes = props_v9.maxP2pBytes; + props->maxCollBytes = props_v9.maxCollBytes; + props->maxMultiRequestSize = 1; + return ncclSuccess; } -static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, void* pHandle, void** request) { +static ncclResult_t ncclNet_isend(void* sendComm, void* data, size_t size, int tag, void* mhandle, + void* pHandle __attribute__((unused)), + void** request) { return ncclNet_v9->isend(sendComm, data, size, tag, mhandle, request); } -static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, void** pHandles, void** request) { +static ncclResult_t ncclNet_irecv(void* recvComm, int n, void** data, size_t* sizes, int* tags, void** mhandles, + void** pHandles __attribute__((unused)), + void** request) { return ncclNet_v9->irecv(recvComm, n, data, sizes, tags, mhandles, request); } -static ncclResult_t ncclNet_connect(int dev, ncclNetCommConfig_t* config, void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { +static ncclResult_t 
ncclNet_listen(void* ctx __attribute__((unused)), + int dev, void* handle, void** listenComm) { + return ncclNet_v9->listen(dev, handle, listenComm); +} + +static ncclResult_t ncclNet_connect(void* ctx __attribute__((unused)), + int dev, + void* handle, void** sendComm, ncclNetDeviceHandle_t** sendDevComm) { return ncclNet_v9->connect(dev, handle, sendComm, sendDevComm); } @@ -34,8 +73,40 @@ static ncclResult_t ncclNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { return ncclNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t*)props); } +static ncclResult_t ncclNet_finalize(void* ctx __attribute__((unused))) { + refCount[NET_INDEX]--; + return ncclSuccess; +} + static ncclResult_t ncclCollNet_getProperties(int dev, ncclNetProperties_t* props) { - return ncclCollNet_v9->getProperties(dev, (ncclNetProperties_v9_t *)props); + ncclNetProperties_v9_t props_v9; + NCCLCHECK(ncclCollNet_v9->getProperties(dev, &props_v9)); + props->name = props_v9.name; + props->pciPath = props_v9.pciPath; + props->guid = props_v9.guid; + props->ptrSupport = props_v9.ptrSupport; + props->regIsGlobal = props_v9.regIsGlobal; + props->forceFlush = props_v9.forceFlush; + props->speed = props_v9.speed; + props->port = props_v9.port; + props->latency = props_v9.latency; + props->maxComms = props_v9.maxComms; + props->maxRecvs = props_v9.maxRecvs; + props->netDeviceType = props_v9.netDeviceType; + props->netDeviceVersion = props_v9.netDeviceVersion; + props->vProps.ndevs = props_v9.vProps.ndevs; + for (int i = 0; i < props->vProps.ndevs; i++) { + props->vProps.devs[i] = props_v9.vProps.devs[i]; + } + props->maxP2pBytes = props_v9.maxP2pBytes; + props->maxCollBytes = props_v9.maxCollBytes; + props->maxMultiRequestSize = 1; + return ncclSuccess; +} + +static ncclResult_t ncclCollNet_listen(void* ctx __attribute__((unused)), + int d, void* handle, void** listenComm) { + return ncclCollNet_v9->listen(d, handle, listenComm); } static ncclResult_t ncclCollNet_iallgather(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_t* recvParts, @@ -53,11 +124,27 @@ static ncclResult_t ncclCollNet_ireducescatter(void* collComm, int nSendParts, n windowOffset, windowBytes, dataType, redOp, recvMhandle, request); } -static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { +static ncclResult_t ncclCollNet_makeVDevice(int* d, ncclNetVDeviceProps_t* props) { + return ncclCollNet_v9->makeVDevice(d, (ncclNetVDeviceProps_v9_t *)props); +} + +static ncclResult_t ncclCollNet_finalize(void* ctx __attribute__((unused))) { + refCount[COLLNET_INDEX]--; + return ncclSuccess; +} + +static ncclResult_t ncclNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclNetCommConfig_t* config __attribute__((unused)), + ncclDebugLogger_t logfn, ncclProfilerCallback_t proffn) { + // before ncclNet_v11 the net plugin was initialized only once. With ncclNet_v11 this is no longer the case. + // The compat layer preserves the ncclNet_v9 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. 
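The v9 getProperties shims above copy the legacy structure field by field and give the one field old plugins cannot report, maxMultiRequestSize, a conservative default of 1. A reduced sketch with hypothetical struct layouts (the real ncclNetProperties structs carry many more fields):

#include <cstdint>

struct PropsV9 {                       // illustrative subset of the v9 struct
  int speed;
  int port;
  uint64_t maxP2pBytes;
};

struct Props {                         // illustrative subset of the current struct
  int speed;
  int port;
  uint64_t maxP2pBytes;
  int maxMultiRequestSize;             // new field, absent from v9
};

static void convertProps(const PropsV9& in, Props* out) {
  out->speed = in.speed;
  out->port = in.port;
  out->maxP2pBytes = in.maxP2pBytes;
  out->maxMultiRequestSize = 1;        // old plugins post one request at a time
}

int main() {
  PropsV9 p9{100000, 1, 1ULL << 30};
  Props p{};
  convertProps(p9, &p);
  return p.maxMultiRequestSize == 1 ? 0 : 1;
}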
+ if (refCount[NET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclNet_v9->init(logfn)); ncclNet.devices = ncclNet_v9->devices; ncclNet.getProperties = ncclNet_getProperties; - ncclNet.listen = ncclNet_v9->listen; + ncclNet.listen = ncclNet_listen; ncclNet.connect = ncclNet_connect; ncclNet.accept = ncclNet_v9->accept; ncclNet.regMr = ncclNet_v9->regMr; @@ -73,6 +160,8 @@ static ncclResult_t ncclNet_init(ncclDebugLogger_t logfn, ncclProfilerCallback_t ncclNet.getDeviceMr = ncclNet_v9->getDeviceMr; ncclNet.irecvConsumed = ncclNet_v9->irecvConsumed; ncclNet.makeVDevice = (ncclNet_v9->makeVDevice) ? ncclNet_makeVDevice : nullptr; + ncclNet.finalize = ncclNet_finalize; + ncclNet.setNetAttr = nullptr; return ncclSuccess; } @@ -84,15 +173,20 @@ ncclNet_t* getNcclNet_v9(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded net plugin %s (v9)", ncclNet_v9->name); return &ncclNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclNetPlugin_v9 symbol."); return nullptr; } -static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { +static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)), + uint64_t commId __attribute__((unused)), + ncclDebugLogger_t logfn) { + // before ncclCollNet_v11 the collnet plugin was initialized only once. With ncclCollNet_v11 this is no longer the case. + // The compat layer preserves the ncclCollNet_v9 behavior using a refCount to track the number of times the plugin + // is initialized, and avoid initializing it multiple times. + if (refCount[COLLNET_INDEX]++) return ncclSuccess; NCCLCHECK(ncclCollNet_v9->init(logfn)); ncclCollNet.devices = ncclCollNet_v9->devices; ncclCollNet.getProperties = ncclCollNet_getProperties; - ncclCollNet.listen = ncclCollNet_v9->listen; + ncclCollNet.listen = ncclCollNet_listen; ncclCollNet.connect = ncclCollNet_v9->connect; ncclCollNet.reduceSupport = ncclCollNet_v9->reduceSupport; ncclCollNet.regMr = ncclCollNet_v9->regMr; @@ -105,6 +199,8 @@ static ncclResult_t ncclCollNet_init(ncclDebugLogger_t logfn) { ncclCollNet.test = ncclCollNet_v9->test; ncclCollNet.closeColl = ncclCollNet_v9->closeColl; ncclCollNet.closeListen = ncclCollNet_v9->closeListen; + ncclCollNet.makeVDevice = (ncclCollNet_v9->makeVDevice) ? 
ncclCollNet_makeVDevice : nullptr; + ncclCollNet.finalize = ncclCollNet_finalize; return ncclSuccess; } @@ -116,6 +212,5 @@ ncclCollNet_t* getNcclCollNet_v9(void* lib) { INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded collnet plugin %s (v9)", ncclCollNet_v9->name); return &ncclCollNet; } - INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find ncclCollNetPlugin_v9 symbol."); return nullptr; } diff --git a/src/plugin/plugin_open.cc b/src/plugin/plugin_open.cc index f80321c81..740f22065 100644 --- a/src/plugin/plugin_open.cc +++ b/src/plugin/plugin_open.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "debug.h" @@ -16,6 +17,7 @@ #define NUM_LIBS 3 static char* libNames[NUM_LIBS]; +char* ncclPluginLibPaths[NUM_LIBS]; static void *libHandles[NUM_LIBS]; static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" }; static const char *pluginPrefix[NUM_LIBS] = { "libnccl-net", "libnccl-tuner", "libnccl-profiler" }; @@ -50,6 +52,14 @@ static void appendNameToList(char* nameList, int *leftChars, char* name) { *leftChars -= strlen(name) + 1; } +static char* getLibPath(void* handle) { + struct link_map* lm; + if (dlinfo(handle, RTLD_DI_LINKMAP, &lm) != 0) + return nullptr; + else + return strdup(lm->l_name); +} + static void* openPluginLib(enum ncclPluginType type, const char* libName) { int openErr, len = PATH_MAX; char libName_[MAX_STR_LEN] = { 0 }; @@ -58,49 +68,44 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) { if (libName && strlen(libName)) { snprintf(libName_, MAX_STR_LEN, "%s", libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - libNames[type] = strdup(libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); - } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); - } - - // libName can't be a relative or absolute path (start with '.' or contain any '/'). It can't be a library name either (start with 'lib' or end with '.so') - if (strchr(libName, '/') == nullptr && (strncmp(libName, "lib", strlen("lib")) || strlen(libName) < strlen(".so") || strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")))) { - snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); - libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); - if (libHandles[type]) { - INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_); - libNames[type] = strdup(libName_); - return libHandles[type]; - } - if (openErr == ENOENT) { - appendNameToList(eNoEntNameList, &len, libName_); - } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); - } - } } else { snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]); + } + + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); + if (libHandles[type]) { + libNames[type] = strdup(libName_); + ncclPluginLibPaths[type] = getLibPath(libHandles[type]); + return libHandles[type]; + } + if (openErr == ENOENT) { + appendNameToList(eNoEntNameList, &len, libName_); + } else { + INFO(subsys[type], "%s/Plugin: %s: %s", pluginNames[type], libName_, openErrStr); + } + + // libName can't be a relative or absolute path (start with '.' or contain any '/'). 
It can't be a library name either (start with 'lib' or end with '.so') + if (libName && strlen(libName) && strchr(libName, '/') == nullptr && + (strncmp(libName, "lib", strlen("lib")) || strlen(libName) < strlen(".so") || + strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")))) { + snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName); + libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr); if (libHandles[type]) { libNames[type] = strdup(libName_); + ncclPluginLibPaths[type] = getLibPath(libHandles[type]); return libHandles[type]; } if (openErr == ENOENT) { appendNameToList(eNoEntNameList, &len, libName_); } else { - INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr); + INFO(subsys[type], "%s/Plugin: %s: %s", pluginNames[type], libName_, openErrStr); } } if (strlen(eNoEntNameList)) { - INFO(subsys[type], "%s/Plugin: Could not find:%s. %s", pluginNames[type], eNoEntNameList, pluginFallback[type]); + INFO(subsys[type], "%s/Plugin: Could not find:%s%s%s", pluginNames[type], eNoEntNameList, + (strlen(pluginFallback[type]) > 0 ? ". " : ""), pluginFallback[type]); } else if (strlen(pluginFallback[type])) { INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], pluginFallback[type]); } @@ -123,6 +128,7 @@ void* ncclGetNetPluginLib(enum ncclPluginType type) { if (libNames[ncclPluginTypeNet]) { // increment the reference counter of the net library libNames[type] = strdup(libNames[ncclPluginTypeNet]); + ncclPluginLibPaths[type] = strdup(ncclPluginLibPaths[ncclPluginTypeNet]); libHandles[type] = dlopen(libNames[ncclPluginTypeNet], RTLD_NOW | RTLD_LOCAL); } return libHandles[type]; @@ -132,6 +138,8 @@ ncclResult_t ncclClosePluginLib(void* handle, enum ncclPluginType type) { if (handle && libHandles[type] == handle) { dlclose(handle); libHandles[type] = nullptr; + free(ncclPluginLibPaths[type]); + ncclPluginLibPaths[type] = nullptr; free(libNames[type]); libNames[type] = nullptr; } diff --git a/src/plugin/profiler.cc b/src/plugin/profiler.cc index 15c3f2bc2..8514db17c 100644 --- a/src/plugin/profiler.cc +++ b/src/plugin/profiler.cc @@ -13,17 +13,22 @@ #include "profiler.h" #include "transport.h" #include "plugin.h" +#include extern ncclProfiler_t* getNcclProfiler_v1(void* lib); extern ncclProfiler_t* getNcclProfiler_v2(void* lib); extern ncclProfiler_t* getNcclProfiler_v3(void* lib); extern ncclProfiler_t* getNcclProfiler_v4(void* lib); +extern ncclProfiler_t* getNcclProfiler_v5(void* lib); -static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex profilerMutex; static int profilerPluginRefCount; static void* profilerPluginLib; static ncclProfiler_t* ncclProfiler; +extern __thread int ncclGroupDepth; +__thread ncclProfilerApiState_t ncclProfilerApiState; + #define MAX_STR_LEN 256 enum { @@ -35,22 +40,37 @@ static int profilerPluginStatus = profilerPluginLoadReady; static pid_t pid; static ncclResult_t ncclProfilerPluginLoad(void) { + const char* profilerName; if (profilerPluginLoadFailed == profilerPluginStatus) { return ncclSuccess; } - pthread_mutex_lock(&profilerLock); + std::lock_guard lock(profilerMutex); if (profilerPluginLoadSuccess == profilerPluginStatus) { ++profilerPluginRefCount; goto exit; } - profilerPluginLib = ncclOpenProfilerPluginLib(ncclGetEnv("NCCL_PROFILER_PLUGIN")); + if ((profilerName = ncclGetEnv("NCCL_PROFILER_PLUGIN")) != nullptr) { + INFO(NCCL_ENV, "NCCL_PROFILER_PLUGIN set by environment to %s", profilerName); + if (strcasecmp(profilerName, "none") == 0) + goto fail; + } + 
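For context on getLibPath() in plugin_open.cc above: dlinfo(handle, RTLD_DI_LINKMAP, ...) exposes the link_map of an already-opened library, whose l_name member holds the path the dynamic loader actually resolved; that is what gets stored in ncclPluginLibPaths and later reported in logs. A standalone sketch (glibc-specific; g++ defines _GNU_SOURCE by default, and older systems may need -ldl):

#include <dlfcn.h>
#include <link.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static char* loadedPath(void* handle) {
  struct link_map* lm = nullptr;
  if (dlinfo(handle, RTLD_DI_LINKMAP, &lm) != 0) return nullptr;
  return strdup(lm->l_name);            // caller frees, as in the patch
}

int main() {
  void* h = dlopen("libm.so.6", RTLD_NOW | RTLD_LOCAL);  // any shared library works
  if (!h) return 1;
  char* path = loadedPath(h);
  if (path) { std::printf("resolved to %s\n", path); std::free(path); }
  dlclose(h);
  return 0;
}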
profilerPluginLib = ncclOpenProfilerPluginLib(profilerName); if (profilerPluginLib == nullptr) { - goto fail; + profilerPluginLib = ncclGetNetPluginLib(ncclPluginTypeProfiler); + if (nullptr == profilerPluginLib) { + goto fail; + } + profilerName = nullptr; + } else if (ncclPluginLibPaths[ncclPluginTypeProfiler]) { + profilerName = ncclPluginLibPaths[ncclPluginTypeProfiler]; } - ncclProfiler = getNcclProfiler_v4(profilerPluginLib); + ncclProfiler = getNcclProfiler_v5(profilerPluginLib); + if (ncclProfiler == nullptr) { + ncclProfiler = getNcclProfiler_v4(profilerPluginLib); + } if (ncclProfiler == nullptr) { ncclProfiler = getNcclProfiler_v3(profilerPluginLib); } @@ -61,8 +81,10 @@ static ncclResult_t ncclProfilerPluginLoad(void) { ncclProfiler = getNcclProfiler_v1(profilerPluginLib); } if (ncclProfiler == NULL) { + if (profilerName) INFO(NCCL_INIT, "External profiler plugin %s is unsupported", profilerName); goto fail; } + if (profilerName) INFO(NCCL_INIT, "Successfully loaded external profiler plugin %s", profilerName); ++profilerPluginRefCount; profilerPluginStatus = profilerPluginLoadSuccess; @@ -74,7 +96,6 @@ static ncclResult_t ncclProfilerPluginLoad(void) { pid = getpid(); exit: - pthread_mutex_unlock(&profilerLock); return ncclSuccess; fail: if (profilerPluginLib) NCCLCHECK(ncclClosePluginLib(profilerPluginLib, ncclPluginTypeProfiler)); @@ -84,15 +105,16 @@ static ncclResult_t ncclProfilerPluginLoad(void) { } static ncclResult_t ncclProfilerPluginUnload(void) { - pthread_mutex_lock(&profilerLock); + std::lock_guard lock(profilerMutex); if (0 == (--profilerPluginRefCount)) { - INFO(NCCL_ENV, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name); + if (__builtin_expect(ncclProfiler != NULL, 0)) { + INFO(NCCL_INIT, "PROFILER/Plugin: Closing profiler plugin %s", ncclProfiler->name); + } NCCLCHECK(ncclClosePluginLib(profilerPluginLib, ncclPluginTypeProfiler)); profilerPluginLib = nullptr; ncclProfiler = nullptr; profilerPluginStatus = profilerPluginLoadReady; } - pthread_mutex_unlock(&profilerLock); return ncclSuccess; } @@ -167,10 +189,9 @@ ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) { TIME_START_EVENT(init); ncclProfilerPluginLoad(); if (__builtin_expect(ncclProfiler != NULL, 0)) { - int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask, comm->config.commName, comm->commHash, comm->nNodes, comm->nRanks, comm->rank, ncclDebugLog); + int err = ncclProfiler->init(&comm->profilerContext, comm->commHash, &ncclProfilerEventMask, comm->config.commName, comm->nNodes, comm->nRanks, comm->rank, ncclDebugLog); if (err) { - WARN("Profiler init failed with error (%d). Continue without profiler.", err); - ncclProfiler = NULL; + INFO(NCCL_INIT, "Profiler init failed with error '%d': %s. 
Continue without profiler.", err, strerror(errno)); } } TIME_STOP_EVENT(init); @@ -179,7 +200,7 @@ ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) { ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) { TIME_START_EVENT(finalize); - if (__builtin_expect(ncclProfiler != NULL, 0)) { + if (__builtin_expect(ncclProfiler != NULL, 0) && comm->profilerContext) { ncclProfiler->finalize(comm->profilerContext); } ncclProfilerPluginUnload(); @@ -189,6 +210,143 @@ ncclResult_t ncclProfilerPluginFinalize(struct ncclComm* comm) { return ncclSuccess; } +ncclResult_t ncclProfilerStartGroupApiEvent(struct ncclInfo* info, bool isGraphCaptured) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileGroupApi; + eDescr.groupApi.graphCaptured = isGraphCaptured; + + ncclProfilerApiState.eActivationMask = __atomic_load_n(&ncclProfilerEventMask, __ATOMIC_RELAXED); + int groupApiMask = ncclProfileGroupApi | ncclProfileP2pApi | ncclProfileCollApi | ncclProfileKernelLaunch | ncclProfileGroup | ncclProfileColl | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin; + // Only count outermost groups when emitting group API events + if (__builtin_expect(ncclProfiler != NULL, 0) && (ncclProfilerApiState.eActivationMask & groupApiMask)) { + if (ncclProfilerApiState.profilerGroupDepth == 0) { + eDescr.groupApi.groupDepth = ncclGroupDepth; + ncclProfiler->startEvent(info->comm->profilerContext, &ncclProfilerApiState.groupApiEventHandle, &eDescr); + ncclProfilerApiState.profilerGroupDepth = ncclGroupDepth; + ncclProfilerApiState.state = ncclProfilerGroupApiStartStateStarted; + } + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopGroupApiEvent() { + void* groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle; + if (__builtin_expect(ncclProfiler != NULL, 0) && groupApiEventHandle && ncclProfilerApiState.profilerGroupDepth == 0) { + ncclProfiler->stopEvent(groupApiEventHandle); + ncclProfilerApiState.groupApiEventHandle = nullptr; + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerRecordGroupApiEventState(ncclProfilerEventState_t eState) { + void* groupApiEventHandle = ncclProfilerApiState.groupApiEventHandle; + bool shouldRecord = false; + if (eState == ncclProfilerGroupStartApiStop && ncclProfilerApiState.state == ncclProfilerGroupApiStartStateStarted) { + ncclProfilerApiState.state = ncclProfilerGroupApiStartStateStopped; + shouldRecord = true; + } else if (eState == ncclProfilerGroupEndApiStart && ncclProfilerApiState.state == ncclProfilerGroupApiStartStateStopped) { + ncclProfilerApiState.state = ncclProfilerGroupApiStartStateReset; + shouldRecord = true; + } + + if (__builtin_expect(ncclProfiler != NULL, 0) && groupApiEventHandle && shouldRecord) { + ncclProfiler->recordEventState(groupApiEventHandle, eState, NULL); + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartP2pApiEvent(struct ncclInfo *info, bool isGraphCaptured) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileP2pApi; + eDescr.parentObj = ncclProfilerApiState.groupApiEventHandle; + eDescr.p2pApi.func = ncclFuncToString(info->coll); + eDescr.p2pApi.count = info->count; + eDescr.p2pApi.datatype = ncclDatatypeToString(info->datatype); + eDescr.p2pApi.stream = (void *) info->stream; + eDescr.p2pApi.graphCaptured = isGraphCaptured; + int p2pApiMask = ncclProfileP2pApi | ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin; + if (__builtin_expect(ncclProfiler != NULL, 0) 
&& (ncclProfilerApiState.eActivationMask & p2pApiMask)) { + ncclProfiler->startEvent(info->comm->profilerContext, &ncclProfilerApiState.p2pApiEventHandle, &eDescr); + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopP2pApiEvent() { + if (__builtin_expect(ncclProfiler != NULL, 0) && ncclProfilerApiState.p2pApiEventHandle) { + ncclProfiler->stopEvent(ncclProfilerApiState.p2pApiEventHandle); + ncclProfilerApiState.p2pApiEventHandle = nullptr; + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartCollApiEvent(struct ncclInfo *info, bool isGraphCaptured) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileCollApi; + eDescr.parentObj = ncclProfilerApiState.groupApiEventHandle; + eDescr.collApi.func = ncclFuncToString(info->coll); + eDescr.collApi.count = info->count; + eDescr.collApi.datatype = ncclDatatypeToString(info->datatype); + eDescr.collApi.stream = (void *) info->stream; + eDescr.collApi.root = info->root; + eDescr.collApi.graphCaptured = isGraphCaptured; + int collApiMask = ncclProfileCollApi | ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin; + if (__builtin_expect(ncclProfiler != NULL, 0) && (ncclProfilerApiState.eActivationMask & collApiMask)) { + ncclProfiler->startEvent(info->comm->profilerContext, &ncclProfilerApiState.collApiEventHandle, &eDescr); + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopCollApiEvent() { + if (__builtin_expect(ncclProfiler != NULL, 0) && ncclProfilerApiState.collApiEventHandle) { + ncclProfiler->stopEvent(ncclProfilerApiState.collApiEventHandle); + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStartKernelLaunchEvent(struct ncclKernelPlan* plan, cudaStream_t stream) { + ncclProfilerEventDescr_t eDescr = { 0 }; + if (__builtin_expect(ncclProfiler != NULL, 0)) { + void* groupApiEventHandle = NULL; + // Check if any collective in the plan has a set event activation mask + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + int eActivationMask_ = 0; + while (ct) { + if (ct->eActivationMask) { + eActivationMask_ = ct->eActivationMask; + groupApiEventHandle = ct->groupApiEventHandle; + goto startKernelLaunchEvent; + } + ct = ct->next; + } + // Check if any pt2pt in the plan has a set event activation mask + while (pt) { + if (pt->eActivationMask) { + eActivationMask_ = pt->eActivationMask; + groupApiEventHandle = pt->groupApiEventHandle; + goto startKernelLaunchEvent; + } + pt = pt->next; + } + + startKernelLaunchEvent: + if (eActivationMask_ & ncclProfileKernelLaunch) { + eDescr.type = ncclProfileKernelLaunch; + eDescr.parentObj = groupApiEventHandle; + eDescr.kernelLaunch.stream = (void *) stream; + ncclProfiler->startEvent(plan->comm->profilerContext, &plan->kernelLaunchEventHandle, &eDescr); + } + } + return ncclSuccess; +} + +ncclResult_t ncclProfilerStopKernelLaunchEvent(struct ncclKernelPlan* plan) { + if (__builtin_expect(ncclProfiler != NULL, 0) && plan->kernelLaunchEventHandle) { + ncclProfiler->stopEvent(plan->kernelLaunchEventHandle); + } + return ncclSuccess; +} + ncclResult_t ncclProfilerStartGroupEvent(struct ncclKernelPlan* plan) { TIME_START_EVENT(groupStart); if (__builtin_expect(ncclProfiler != NULL, 0)) { @@ -237,26 +395,25 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); while (ct) { if (__builtin_expect(ncclProfiler != NULL, 0)) { - if 
(plan->groupEventHandle) { - int enable = ct->eActivationMask & (ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin); - if (enable) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileColl; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]; - eDescr.coll.func = ncclFuncToString(ct->func); - eDescr.coll.sendBuff = ct->sendbuff; - eDescr.coll.recvBuff = ct->recvbuff; - eDescr.coll.count = ct->count; - eDescr.coll.root = ct->root; - eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); - eDescr.coll.nChannels = ct->nChannels; - eDescr.coll.nWarps = ct->nWarps; - eDescr.coll.algo = ncclAlgoToString(ct->algorithm); - eDescr.coll.proto = ncclProtoToString(ct->protocol); - ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); - } + int enable = ct->eActivationMask & (ncclProfileColl | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileColl; + eDescr.coll.parentGroup = plan->groupEventHandle; + eDescr.parentObj = ct->collApiEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func]; + eDescr.coll.func = ncclFuncToString(ct->func); + eDescr.coll.sendBuff = ct->sendbuff; + eDescr.coll.recvBuff = ct->recvbuff; + eDescr.coll.count = ct->count; + eDescr.coll.root = ct->root; + eDescr.coll.datatype = ncclDatatypeToString(ct->datatype); + eDescr.coll.nChannels = ct->nChannels; + eDescr.coll.nWarps = ct->nWarps; + eDescr.coll.algo = ncclAlgoToString(ct->algorithm); + eDescr.coll.proto = ncclProtoToString(ct->protocol); + ncclProfiler->startEvent(plan->comm->profilerContext, &ct->eventHandle, &eDescr); } } // comm->seqNumber values are updated even if the plugin is not active, since they are used by RAS as well. @@ -265,31 +422,30 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { // reports from RAS. Instead, we choose not to include graph-captured collectives in our counts. An exception is // made if ncclProfileKernelCh profiler events are active, as they result in proxy events always being added, which // gives the consistency. 
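On the seqNumber update described in the comment above: the counters are monotonic sequence numbers consumed by RAS, not synchronization points, so the patch bumps them with a relaxed atomic add. An equivalent standalone sketch using the same GCC builtin:

#include <cstdint>

// Hypothetical per-collective counters; the real ones live in ncclComm.
static uint64_t seqNumber[8];

static void bumpSeq(int func) {
  // Relaxed ordering: only the increment itself must be atomic; no other
  // memory accesses need to be ordered around it.
  __atomic_fetch_add(&seqNumber[func], 1, __ATOMIC_RELAXED);
}

int main() {
  bumpSeq(0);
  bumpSeq(0);
  return seqNumber[0] == 2 ? 0 : 1;
}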
- if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle && + if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && (plan->groupEventHandle || ct->collApiEventHandle) && (ct->eActivationMask & ncclProfileKernelCh))) __atomic_fetch_add(&plan->comm->seqNumber[ct->func], 1, __ATOMIC_RELAXED); ct = ct->next; } if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (plan->groupEventHandle) { - struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); - while (pt) { - int enable = pt->eActivationMask & (ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh); - if (enable) { - ncclProfilerEventDescr_t eDescr = { 0 }; - eDescr.type = ncclProfileP2p; - eDescr.parentObj = plan->groupEventHandle; - eDescr.rank = plan->comm->rank; - eDescr.p2p.func = ncclFuncToString(pt->func); - eDescr.p2p.buff = pt->buff; - eDescr.p2p.count = pt->count; - eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); - eDescr.p2p.peer = pt->root; - eDescr.p2p.nChannels = pt->nChannels; - ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); - } - pt = pt->next; + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + while (pt) { + int enable = pt->eActivationMask & (ncclProfileP2p | ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileKernelCh | ncclProfileNetPlugin); + if (enable) { + ncclProfilerEventDescr_t eDescr = { 0 }; + eDescr.type = ncclProfileP2p; + eDescr.p2p.parentGroup = plan->groupEventHandle; + eDescr.parentObj = pt->p2pApiEventHandle; + eDescr.rank = plan->comm->rank; + eDescr.p2p.func = ncclFuncToString(pt->func); + eDescr.p2p.buff = pt->buff; + eDescr.p2p.count = pt->count; + eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype); + eDescr.p2p.peer = pt->root; + eDescr.p2p.nChannels = pt->nChannels; + ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr); } + pt = pt->next; } } TIME_STOP_EVENT(taskStart); @@ -299,17 +455,15 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) { ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) { TIME_START_EVENT(taskStop); if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (plan->groupEventHandle) { - struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); - while (ct) { - if (ct->eventHandle) ncclProfiler->stopEvent(ct->eventHandle); - ct = ct->next; - } - struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); - while (pt) { - if (pt->eventHandle) ncclProfiler->stopEvent(pt->eventHandle); - pt = pt->next; - } + struct ncclTaskColl* ct = ncclIntruQueueHead(&plan->collTaskQueue); + while (ct) { + if (ct->eventHandle) ncclProfiler->stopEvent(ct->eventHandle); + ct = ct->next; + } + struct ncclTaskP2p* pt = ncclIntruQueueHead(&plan->p2pTaskQueue); + while (pt) { + if (pt->eventHandle) ncclProfiler->stopEvent(pt->eventHandle); + pt = pt->next; } } TIME_STOP_EVENT(taskStop); @@ -357,18 +511,18 @@ ncclResult_t ncclProfilerStopProxyOpEvent(int s, struct ncclProxyArgs* args) { ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; + int step_ = DIVUP(stepId, args->sliceSteps); if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { - int step_ = DIVUP(stepId, args->sliceSteps); + if (sub->eActivationMask & (ncclProfileProxyStep | 
ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; eDescr.parentObj = sub->opEventHandle; eDescr.rank = sub->rank; eDescr.proxyStep.step = step_; ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr); - sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; } } + sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; TIME_STOP_EVENT(proxyStepStart); return ncclSuccess; } @@ -376,18 +530,18 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* args, int stepId) { TIME_START_EVENT(proxyStepStart); struct ncclProxySubArgs* sub = &args->subs[s]; + int step_ = DIVUP(stepId, args->sliceSteps); if (__builtin_expect(ncclProfiler != NULL, 0)) { - if (sub->opEventHandle && (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin))) { - int step_ = DIVUP(stepId, args->sliceSteps); + if (sub->eActivationMask & (ncclProfileProxyStep | ncclProfileNetPlugin)) { ncclProfilerEventDescr_t eDescr = { 0 }; eDescr.type = ncclProfileProxyStep; eDescr.parentObj = sub->opEventHandle; eDescr.rank = sub->rank; eDescr.proxyStep.step = step_; ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr); - sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; } } + sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub; TIME_STOP_EVENT(proxyStepStart); return ncclSuccess; } @@ -503,11 +657,11 @@ ncclResult_t ncclProfilerAddPidToProxyOp(struct ncclProxyOp* op) { return ncclSuccess; } -static pthread_mutex_t proxyProfilerConnectLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex proxyProfilerConnectMutex; static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxyOp* op) { ncclResult_t ret = ncclSuccess; - pthread_mutex_lock(&proxyProfilerConnectLock); + std::lock_guard lock(proxyProfilerConnectMutex); if (comm->profiler.initialized) goto exit; for (int c = 0; c < MAXCHANNELS; c++) { NCCLCHECKGOTO(ncclProxyConnect(comm, TRANSPORT_PROFILER, 0, comm->rank, &comm->profiler.sendProxyConn[c]), ret, exit); @@ -517,7 +671,6 @@ static ncclResult_t proxyProfilerConnect(struct ncclComm* comm, struct ncclProxy } comm->profiler.initialized = true; exit: - pthread_mutex_unlock(&proxyProfilerConnectLock); return ret; } diff --git a/src/plugin/profiler/CMakeLists.txt b/src/plugin/profiler/CMakeLists.txt new file mode 100644 index 000000000..1a5cc9a30 --- /dev/null +++ b/src/plugin/profiler/CMakeLists.txt @@ -0,0 +1,11 @@ +# Profiler plugin sources +set(PLUGIN_PROFILER_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v3.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v4.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v1.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v2.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler_v5.cc +) + +# Add profiler plugin sources to parent scope +set(PLUGIN_PROFILER_SOURCES ${PLUGIN_PROFILER_SOURCES} PARENT_SCOPE) diff --git a/src/plugin/profiler/profiler_v1.cc b/src/plugin/profiler/profiler_v1.cc index 2126afc68..ef8ef6b5d 100644 --- a/src/plugin/profiler/profiler_v1.cc +++ b/src/plugin/profiler/profiler_v1.cc @@ -7,6 +7,7 @@ #include "comm.h" #include "nccl_profiler.h" #include "checks.h" +#include static ncclProfiler_t ncclProfiler; static ncclProfiler_v1_t* ncclProfiler_v1; @@ -63,6 +64,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileColl: { eDescr_v1.coll.name = nullptr; // removed in v4 eDescr_v1.coll.commHash = 
0; // removed in v4 + eDescr_v1.parentObj = eDescr->coll.parentGroup; // Hierarchy changed in v5 eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber; eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func); eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff; @@ -80,6 +82,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileP2p: { eDescr_v1.p2p.name = nullptr; // removed in v4 eDescr_v1.p2p.commHash = 0; // removed in v4 + eDescr_v1.parentObj = eDescr->p2p.parentGroup; // Hierarchy changed in v5 eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func); eDescr_v1.p2p.buff = eDescr->p2p.buff; eDescr_v1.p2p.count = eDescr->p2p.count; @@ -125,7 +128,15 @@ static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEve return ncclProfiler_v1->recordEventState(eHandle, eState, &args); } -static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { +static ncclResult_t ncclProfiler_init(void** context, + uint64_t commId __attribute__((unused)), + int* eActivationMask __attribute__((unused)), + const char* commName __attribute__((unused)), + int nNodes __attribute__((unused)), + int nranks __attribute__((unused)), + int rank __attribute__((unused)), + ncclDebugLogger_t logfn __attribute__((unused)) + ) { NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent; @@ -139,9 +150,8 @@ ncclProfiler_t* getNcclProfiler_v1(void* lib) { if (ncclProfiler_v1) { ncclProfiler.name = ncclProfiler_v1->name; ncclProfiler.init = ncclProfiler_init; - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v1->name); + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v1)", ncclProfiler_v1->name); return &ncclProfiler; } - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v1."); return NULL; } diff --git a/src/plugin/profiler/profiler_v2.cc b/src/plugin/profiler/profiler_v2.cc index 11e521e90..d1c83cf1a 100644 --- a/src/plugin/profiler/profiler_v2.cc +++ b/src/plugin/profiler/profiler_v2.cc @@ -7,6 +7,7 @@ #include "comm.h" #include "nccl_profiler.h" #include "checks.h" +#include static ncclProfiler_t ncclProfiler; static ncclProfiler_v2_t* ncclProfiler_v2; @@ -20,6 +21,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP switch(eDescr->type) { case ncclProfileGroup: break; case ncclProfileColl: { + eDescr_v2.parentObj = eDescr->coll.parentGroup; // Hierarchy changed in v5 eDescr_v2.coll.name = nullptr; // removed in v4 eDescr_v2.coll.commHash = 0; // removed in v4 eDescr_v2.coll.seqNumber = eDescr->coll.seqNumber; @@ -38,6 +40,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileP2p: { eDescr_v2.p2p.name = nullptr; // removed in v4 eDescr_v2.p2p.commHash = 0; // removed in v4 + eDescr_v2.parentObj = eDescr->p2p.parentGroup; // Hierarchy changed in v5 eDescr_v2.p2p.func = eDescr->p2p.func; eDescr_v2.p2p.buff = eDescr->p2p.buff; eDescr_v2.p2p.count = eDescr->p2p.count; @@ -83,7 +86,15 @@ static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEve return ncclProfiler_v2->recordEventState(eHandle, eState, &args); } -static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { +static ncclResult_t 
ncclProfiler_init(void** context, + uint64_t commId __attribute__((unused)), + int* eActivationMask __attribute__((unused)), + const char* commName __attribute__((unused)), + int nNodes __attribute__((unused)), + int nranks __attribute__((unused)), + int rank __attribute__((unused)), + ncclDebugLogger_t logfn __attribute__((unused)) + ) { NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent; @@ -97,9 +108,8 @@ ncclProfiler_t* getNcclProfiler_v2(void* lib) { if (ncclProfiler_v2) { ncclProfiler.name = ncclProfiler_v2->name; ncclProfiler.init = ncclProfiler_init; - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v2->name); + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v2)", ncclProfiler_v2->name); return &ncclProfiler; } - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v2"); return NULL; } diff --git a/src/plugin/profiler/profiler_v3.cc b/src/plugin/profiler/profiler_v3.cc index 3dba3231a..84ec1468e 100644 --- a/src/plugin/profiler/profiler_v3.cc +++ b/src/plugin/profiler/profiler_v3.cc @@ -7,6 +7,7 @@ #include "comm.h" #include "nccl_profiler.h" #include "checks.h" +#include static ncclProfiler_t ncclProfiler; static ncclProfiler_v3_t* ncclProfiler_v3; @@ -22,6 +23,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileColl: { eDescr_v3.coll.name = nullptr; // removed in v4 eDescr_v3.coll.commHash = 0; // removed in v4 + eDescr_v3.parentObj = eDescr->coll.parentGroup; // Hierarchy changed in v5 eDescr_v3.coll.seqNumber = eDescr->coll.seqNumber; eDescr_v3.coll.func = eDescr->coll.func; eDescr_v3.coll.sendBuff = eDescr->coll.sendBuff; @@ -37,6 +39,7 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP case ncclProfileP2p: { eDescr_v3.p2p.name = nullptr; // removed in v4 eDescr_v3.p2p.commHash = 0; // removed in v4 + eDescr_v3.parentObj = eDescr->p2p.parentGroup; // Hierarchy changed in v5 eDescr_v3.p2p.func = eDescr->p2p.func; eDescr_v3.p2p.buff = eDescr->p2p.buff; eDescr_v3.p2p.count = eDescr->p2p.count; @@ -89,7 +92,15 @@ static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEve return ncclProfiler_v3->recordEventState(eHandle, eState, &args); } -static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) { +static ncclResult_t ncclProfiler_init(void** context, + uint64_t commId __attribute__((unused)), + int* eActivationMask __attribute__((unused)), + const char* commName __attribute__((unused)), + int nNodes __attribute__((unused)), + int nranks __attribute__((unused)), + int rank __attribute__((unused)), + ncclDebugLogger_t logfn __attribute__((unused)) + ) { NCCLCHECK(ncclProfiler_v3->init(context, eActivationMask)); ncclProfiler.startEvent = ncclProfiler_startEvent; ncclProfiler.stopEvent = ncclProfiler_v3->stopEvent; @@ -103,9 +114,8 @@ ncclProfiler_t* getNcclProfiler_v3(void* lib) { if (ncclProfiler_v3) { ncclProfiler.name = ncclProfiler_v3->name; ncclProfiler.init = ncclProfiler_init; - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name); + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v3)", ncclProfiler_v3->name); return &ncclProfiler; } - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3"); return NULL; } diff --git a/src/plugin/profiler/profiler_v4.cc 
b/src/plugin/profiler/profiler_v4.cc index 11bed891a..53b57ce80 100644 --- a/src/plugin/profiler/profiler_v4.cc +++ b/src/plugin/profiler/profiler_v4.cc @@ -7,15 +7,113 @@ #include "comm.h" #include "nccl_profiler.h" #include "checks.h" +#include static ncclProfiler_v4_t* ncclProfiler_v4; +static ncclProfiler_t ncclProfiler; + +static ncclResult_t ncclProfiler_startEvent(void* ctx, void** eHandle, ncclProfilerEventDescr_t* eDescr) { + ncclProfilerEventDescr_v4_t eDescr_v4; + eDescr_v4.type = eDescr->type; + eDescr_v4.parentObj = eDescr->parentObj; + eDescr_v4.rank = eDescr->rank; + switch(eDescr->type) { + case ncclProfileGroup: break; + case ncclProfileColl: { + eDescr_v4.coll.seqNumber = eDescr->coll.seqNumber; + eDescr_v4.coll.func = eDescr->coll.func; + eDescr_v4.coll.sendBuff = eDescr->coll.sendBuff; + eDescr_v4.coll.recvBuff = eDescr->coll.recvBuff; + eDescr_v4.coll.count = eDescr->coll.count; + eDescr_v4.coll.root = eDescr->coll.root; + eDescr_v4.coll.datatype = eDescr->coll.datatype; + eDescr_v4.coll.nChannels = eDescr->coll.nChannels; + eDescr_v4.coll.nWarps = eDescr->coll.nWarps; + eDescr_v4.coll.algo = eDescr->coll.algo; + eDescr_v4.coll.proto = eDescr->coll.proto; + eDescr_v4.parentObj = eDescr->coll.parentGroup; + } break; + case ncclProfileP2p: { + eDescr_v4.p2p.func = eDescr->p2p.func; + eDescr_v4.p2p.buff = eDescr->p2p.buff; + eDescr_v4.p2p.count = eDescr->p2p.count; + eDescr_v4.p2p.datatype = eDescr->p2p.datatype; + eDescr_v4.p2p.peer = eDescr->p2p.peer; + eDescr_v4.parentObj = eDescr->p2p.parentGroup; + } break; + case ncclProfileProxyOp: { + eDescr_v4.proxyOp.pid = eDescr->proxyOp.pid; + eDescr_v4.proxyOp.channelId = eDescr->proxyOp.channelId; + eDescr_v4.proxyOp.peer = eDescr->proxyOp.peer; + eDescr_v4.proxyOp.nSteps = eDescr->proxyOp.nSteps; + eDescr_v4.proxyOp.chunkSize = eDescr->proxyOp.chunkSize; + eDescr_v4.proxyOp.isSend = eDescr->proxyOp.isSend; + } break; + case ncclProfileProxyStep: { + eDescr_v4.proxyStep.step = eDescr->proxyStep.step; + } break; + case ncclProfileProxyCtrl: break; + case ncclProfileKernelCh: { + eDescr_v4.kernelCh.channelId = eDescr->kernelCh.channelId; + eDescr_v4.kernelCh.pTimer = eDescr->kernelCh.pTimer; + } break; + case ncclProfileNetPlugin: { + eDescr_v4.netPlugin.id = eDescr->netPlugin.id; + eDescr_v4.netPlugin.data = eDescr->netPlugin.data; + } break; + default: return ncclSuccess; + } + return ncclProfiler_v4->startEvent(ctx, eHandle, &eDescr_v4); +} + +static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) { + ncclProfilerEventStateArgs_v4_t eStateArgs_v4; + switch(eState) { + case ncclProfilerProxyOpInProgress_v4: + break; + case ncclProfilerProxyStepSendGPUWait: + case ncclProfilerProxyStepSendPeerWait_v4: + case ncclProfilerProxyStepSendWait: + case ncclProfilerProxyStepRecvWait: + case ncclProfilerProxyStepRecvFlushWait: + case ncclProfilerProxyStepRecvGPUWait: + eStateArgs_v4.proxyStep.transSize = eStateArgs->proxyStep.transSize; + break; + case ncclProfilerNetPluginUpdate: + eStateArgs_v4.netPlugin.data = eStateArgs->netPlugin.data; + break; + case ncclProfilerKernelChStop: + eStateArgs_v4.kernelCh.pTimer = eStateArgs->kernelCh.pTimer; + break; + case ncclProfilerProxyCtrlIdle: + case ncclProfilerProxyCtrlActive: + case ncclProfilerProxyCtrlSleep: + case ncclProfilerProxyCtrlWakeup: + case ncclProfilerProxyCtrlAppend: + case ncclProfilerProxyCtrlAppendEnd: + eStateArgs_v4.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps; + 
break; + default: return ncclSuccess; + } + return ncclProfiler_v4->recordEventState(eHandle, (ncclProfilerEventState_v4_t)eState, &eStateArgs_v4); +} + +static ncclResult_t ncclProfiler_init(void** ctx, uint64_t commId, int* eActivationMask, const char* commName, int nNodes, int nRanks, int rank, ncclDebugLogger_t logfn) { + NCCLCHECK(ncclProfiler_v4->init(ctx, eActivationMask, commName, commId, nNodes, nRanks, rank, logfn)); + ncclProfiler.startEvent = ncclProfiler_startEvent; + ncclProfiler.recordEventState = ncclProfiler_recordEventState; + ncclProfiler.stopEvent = ncclProfiler_v4->stopEvent; + ncclProfiler.finalize = ncclProfiler_v4->finalize; + return ncclSuccess; +} ncclProfiler_t* getNcclProfiler_v4(void* lib) { ncclProfiler_v4 = (ncclProfiler_v4_t*)dlsym(lib, "ncclProfiler_v4"); if (ncclProfiler_v4) { - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v4->name); - return ncclProfiler_v4; + ncclProfiler.name = ncclProfiler_v4->name; + ncclProfiler.init = ncclProfiler_init; + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v4)", ncclProfiler_v4->name); + return &ncclProfiler; } - INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v4"); return NULL; } diff --git a/src/plugin/profiler/profiler_v5.cc b/src/plugin/profiler/profiler_v5.cc new file mode 100644 index 000000000..01d73db05 --- /dev/null +++ b/src/plugin/profiler/profiler_v5.cc @@ -0,0 +1,21 @@ +/************************************************************************* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "nccl_profiler.h" +#include "checks.h" +#include + +static ncclProfiler_v5_t* ncclProfiler_v5; + +ncclProfiler_t* getNcclProfiler_v5(void* lib) { + ncclProfiler_v5 = (ncclProfiler_v5_t*)dlsym(lib, "ncclProfiler_v5"); + if (ncclProfiler_v5) { + INFO(NCCL_INIT, "PROFILER/Plugin: Loaded %s (v5)", ncclProfiler_v5->name); + return ncclProfiler_v5; + } + return NULL; +} diff --git a/src/plugin/tuner.cc b/src/plugin/tuner.cc index 24a59de2e..dfa21ae7e 100644 --- a/src/plugin/tuner.cc +++ b/src/plugin/tuner.cc @@ -7,6 +7,7 @@ #include #include +#include #include "checks.h" #include "debug.h" @@ -16,8 +17,9 @@ extern ncclTuner_t* getNcclTuner_v2(void* lib); extern ncclTuner_t* getNcclTuner_v3(void* lib); extern ncclTuner_t* getNcclTuner_v4(void* lib); +extern ncclTuner_t* getNcclTuner_v5(void* lib); -pthread_mutex_t tunerPluginLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex tunerPluginMutex; static int tunerPluginRefCount; static void* tunerPluginLib = nullptr; static ncclTuner_t* tunerSymbol = nullptr; @@ -33,13 +35,14 @@ enum { static int status = tunerPluginLoadReady; ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { + const char* tunerName; // Initialize to nullptr by default if plugin tuner cannot be loaded. 
comm->tuner = nullptr; if (tunerPluginLoadFailed == status) { return ncclSuccess; } - pthread_mutex_lock(&tunerPluginLock); + std::lock_guard lock(tunerPluginMutex); if (tunerPluginLoadFailed == status) { goto exit; } @@ -50,15 +53,26 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { goto exit; } - tunerPluginLib = ncclOpenTunerPluginLib(ncclGetEnv("NCCL_TUNER_PLUGIN")); + if ((tunerName = ncclGetEnv("NCCL_TUNER_PLUGIN")) != nullptr) { + INFO(NCCL_ENV|NCCL_TUNING, "NCCL_TUNER_PLUGIN set by environment to %s", tunerName); + if (strcasecmp(tunerName, "none") == 0) + goto fail; + } + tunerPluginLib = ncclOpenTunerPluginLib(tunerName); if (nullptr == tunerPluginLib) { tunerPluginLib = ncclGetNetPluginLib(ncclPluginTypeTuner); if (nullptr == tunerPluginLib) { goto fail; } + tunerName = nullptr; + } else if (ncclPluginLibPaths[ncclPluginTypeTuner]) { + tunerName = ncclPluginLibPaths[ncclPluginTypeTuner]; } - tunerSymbol = getNcclTuner_v4(tunerPluginLib); + tunerSymbol = getNcclTuner_v5(tunerPluginLib); + if (tunerSymbol == NULL) { + tunerSymbol = getNcclTuner_v4(tunerPluginLib); + } if (tunerSymbol == NULL) { tunerSymbol = getNcclTuner_v3(tunerPluginLib); } @@ -66,8 +80,10 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { tunerSymbol = getNcclTuner_v2(tunerPluginLib); } if (tunerSymbol == NULL) { + if (tunerName) INFO(NCCL_INIT|NCCL_TUNING, "External tuner plugin %s is unsupported", tunerName); goto fail; } + if (tunerName) INFO(NCCL_INIT|NCCL_TUNING, "Successfully loaded external tuner plugin %s", tunerName); comm->tuner = tunerSymbol; ++tunerPluginRefCount; @@ -75,7 +91,6 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { comm->tunerPluginLoaded = 1; exit: - pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; fail: if (tunerPluginLib) NCCLCHECK(ncclClosePluginLib(tunerPluginLib, ncclPluginTypeTuner)); @@ -85,9 +100,9 @@ ncclResult_t ncclTunerPluginLoad(struct ncclComm* comm) { } ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { - pthread_mutex_lock(&tunerPluginLock); + std::lock_guard lock(tunerPluginMutex); if (comm->tunerPluginLoaded && 0 == (--tunerPluginRefCount)) { - INFO(NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Closing tuner: '%s'", tunerSymbol->name); NCCLCHECK(ncclClosePluginLib(tunerPluginLib, ncclPluginTypeTuner)); tunerPluginLib = nullptr; tunerSymbol = nullptr; @@ -95,6 +110,5 @@ ncclResult_t ncclTunerPluginUnload(struct ncclComm* comm) { status = tunerPluginLoadReady; comm->tunerPluginLoaded = 0; } - pthread_mutex_unlock(&tunerPluginLock); return ncclSuccess; } diff --git a/src/plugin/tuner/CMakeLists.txt b/src/plugin/tuner/CMakeLists.txt new file mode 100644 index 000000000..71f4498ad --- /dev/null +++ b/src/plugin/tuner/CMakeLists.txt @@ -0,0 +1,10 @@ +# Tuner plugin sources +set(PLUGIN_TUNER_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/tuner_v2.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuner_v3.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuner_v4.cc + ${CMAKE_CURRENT_SOURCE_DIR}/tuner_v5.cc +) + +# Add tuner plugin sources to parent scope +set(PLUGIN_TUNER_SOURCES ${PLUGIN_TUNER_SOURCES} PARENT_SCOPE) diff --git a/src/plugin/tuner/tuner_v2.cc b/src/plugin/tuner/tuner_v2.cc index 005638f01..9deefc1fd 100644 --- a/src/plugin/tuner/tuner_v2.cc +++ b/src/plugin/tuner/tuner_v2.cc @@ -46,10 +46,15 @@ static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, si return ncclSuccess; } -static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t 
logfn, void** context) { - NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, context)); +static ncclResult_t ncclTuner_finalize(void* ctx) { + return ncclTuner_v2->destroy(ctx); +} + +static ncclResult_t ncclTuner_init(void** ctx, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_t* /*constants*/) { + NCCLCHECK(ncclTuner_v2->init(nRanks, nNodes, logfn, ctx)); ncclTuner.getCollInfo = ncclTuner_getCollInfo; - ncclTuner.destroy = ncclTuner_v2->destroy; + ncclTuner.finalize = ncclTuner_finalize; return ncclSuccess; } @@ -58,9 +63,8 @@ ncclTuner_t* getNcclTuner_v2(void* lib) { if (ncclTuner_v2) { ncclTuner.name = ncclTuner_v2->name; ncclTuner.init = ncclTuner_init; - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v2->name); + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Using %s (v2)", ncclTuner_v2->name); return &ncclTuner; } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v2 symbol, using internal tuner instead."); return NULL; } diff --git a/src/plugin/tuner/tuner_v3.cc b/src/plugin/tuner/tuner_v3.cc index 3898243bc..3f896a644 100644 --- a/src/plugin/tuner/tuner_v3.cc +++ b/src/plugin/tuner/tuner_v3.cc @@ -18,10 +18,15 @@ static ncclResult_t ncclTuner_getCollInfo(void* context, ncclFunc_t collType, si return ncclSuccess; } -static ncclResult_t ncclTuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, void** context) { +static ncclResult_t ncclTuner_finalize(void* ctx) { + return ncclTuner_v3->destroy(ctx); +} + +static ncclResult_t ncclTuner_init(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_t* /*constants*/) { NCCLCHECK(ncclTuner_v3->init(nRanks, nNodes, logfn, context)); ncclTuner.getCollInfo = ncclTuner_getCollInfo; - ncclTuner.destroy = ncclTuner_v3->destroy; + ncclTuner.finalize = ncclTuner_finalize; return ncclSuccess; } @@ -30,9 +35,8 @@ ncclTuner_t* getNcclTuner_v3(void* lib) { if (ncclTuner_v3) { ncclTuner.name = ncclTuner_v3->name; ncclTuner.init = ncclTuner_init; - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v3->name); + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Using %s (v3)", ncclTuner_v3->name); return &ncclTuner; } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v3 symbol."); return NULL; } diff --git a/src/plugin/tuner/tuner_v4.cc b/src/plugin/tuner/tuner_v4.cc index 4bfd116bb..077ed0aea 100644 --- a/src/plugin/tuner/tuner_v4.cc +++ b/src/plugin/tuner/tuner_v4.cc @@ -7,16 +7,32 @@ #include #include "debug.h" +#include "checks.h" #include "nccl_tuner.h" static ncclTuner_v4_t* ncclTuner_v4; +static ncclTuner_t ncclTuner; + +static ncclResult_t ncclTuner_finalize(void* ctx) { + return ncclTuner_v4->destroy(ctx); +} + +static ncclResult_t ncclTuner_init(void** context, uint64_t commId, size_t nRanks, size_t nNodes, ncclDebugLogger_t logfn, + ncclNvlDomainInfo_v5_t* nvlDomainInfo, ncclTunerConstants_t* /*constants*/) { + NCCLCHECK(ncclTuner_v4->init(nRanks, nNodes, logfn, context)); + ncclTuner.getCollInfo = ncclTuner_v4->getCollInfo; + ncclTuner.finalize = ncclTuner_finalize; + return ncclSuccess; +} ncclTuner_t* getNcclTuner_v4(void* lib) { ncclTuner_v4 = (ncclTuner_v4_t*)dlsym(lib, "ncclTunerPlugin_v4"); if (ncclTuner_v4) { - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Using tuner plugin %s", ncclTuner_v4->name); - return ncclTuner_v4; + ncclTuner.name = ncclTuner_v4->name; 
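+    // Return the wrapper table instead of the raw v4 struct: init and finalize
+    // go through the shims above, while getCollInfo is forwarded unchanged when
+    // ncclTuner_init runs.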
+ ncclTuner.init = ncclTuner_init; + + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Using %s (v4)", ncclTuner_v4->name); + return &ncclTuner; } - INFO(NCCL_ENV|NCCL_TUNING, "TUNER/Plugin: Failed to find ncclTunerPlugin_v4 symbol."); return NULL; } diff --git a/src/plugin/tuner/tuner_v5.cc b/src/plugin/tuner/tuner_v5.cc new file mode 100644 index 000000000..22c3d4b42 --- /dev/null +++ b/src/plugin/tuner/tuner_v5.cc @@ -0,0 +1,21 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "debug.h" +#include "nccl_tuner.h" + +static ncclTuner_v5_t* ncclTuner_v5; + +ncclTuner_t* getNcclTuner_v5(void* lib) { + ncclTuner_v5 = (ncclTuner_v5_t*)dlsym(lib, "ncclTunerPlugin_v5"); + if (ncclTuner_v5) { + INFO(NCCL_INIT|NCCL_TUNING, "TUNER/Plugin: Using %s (v5)", ncclTuner_v5->name); + return ncclTuner_v5; + } + return NULL; +} diff --git a/src/proxy.cc b/src/proxy.cc index 74ec70f0e..25a14cd64 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -13,12 +13,14 @@ #include "timer.h" #include "profiler.h" #include "transport.h" +#include "cpuset.h" #include #include #include #include #include +#include #define NCCL_MAX_PROXY_CONNECTIONS (NCCL_MAX_LOCAL_RANKS+1) @@ -385,6 +387,8 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr sub->workCounter = op->workCounter; args->nsubs = subIndex+1; if (subIndex) { + args->nChannels = std::min(args->nChannels, op->nChannels); + args->nPeers = std::min(args->nPeers, op->nPeers); if ((args->sliceSteps != op->sliceSteps) || (args->chunkSteps != op->chunkSteps) || (args->protocol != op->protocol) || @@ -398,7 +402,7 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr WARN("Proxy append on running operation"); return ncclInternalError; } - return ncclSuccess; + goto exit; } //memset(&args->progress, 0, sizeof(struct ncclProxyArgs)-offsetof(struct ncclProxyArgs, progress)); args->done = 0; @@ -411,11 +415,15 @@ static ncclResult_t ncclProxyOpToArgs(struct ncclProxyOp* op, struct ncclProxyAr args->pattern = op->pattern; args->protocol = op->protocol; args->coll = op->coll; + args->collAPI = op->collAPI; args->algorithm = op->algorithm; + args->nChannels = op->nChannels; + args->nPeers = op->nPeers; args->specifics = op->specifics; args->state = ncclProxyOpReady; args->progress = op->connection->tcomm->proxyProgress; args->proxyAppendPtr = op->connection->proxyAppendPtr; +exit: if (args->pattern != ncclPatternProfiler) ncclProfilerStartProxyOpEvent(subIndex, args); return ncclSuccess; } @@ -744,6 +752,7 @@ static ncclResult_t removeOp(struct ncclProxyProgressState* state, struct ncclPr static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclProxyProgressState* state, struct ncclProxyArgs* opStart, int* idle) { struct ncclProxyArgs* prevOp = NULL; struct ncclProxyArgs* op = opStart; + ncclResult_t status = ncclSuccess; while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; TIME_START(0); TIME_START(1); @@ -751,6 +760,8 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr if (op->idle) { TIME_STOP(1); TIME_CANCEL(0); } else { TIME_CANCEL(1); TIME_STOP(0); } *idle &= op->idle; if (op->state == ncclProxyOpNone || ret != ncclSuccess) { + //track first error that 
occured + if (ret != ncclSuccess && status == ncclSuccess) status = ret; TIME_START(2); NCCLCHECK(removeOp(state, &op, &prevOp)); TIME_STOP(2); @@ -759,7 +770,7 @@ static ncclResult_t progressOps(struct ncclProxyState* proxyState, struct ncclPr op = op->next; } } - return ncclSuccess; + return status; } NCCL_PARAM(ProxyAppendBatchSize, "PROXY_APPEND_BATCH_SIZE", 16); @@ -899,16 +910,43 @@ static int setProxyThreadContext(struct ncclProxyState* proxyState) { NCCL_PARAM(ProxyDumpSignal, "PROXY_DUMP_SIGNAL", -1); NCCL_PARAM(ProgressAppendOpFreq, "PROGRESS_APPENDOP_FREQ", 8); +static cpu_set_t proxyCpuset; +static pthread_once_t proxyCpusetOnce = PTHREAD_ONCE_INIT; +void proxyCpusetOnceFunc() { + const char* setEnv = ncclGetEnv("NCCL_PROXY_CPUSET"); + if (setEnv) { + ncclResult_t res = ncclStrListToCpuset(setEnv, &proxyCpuset); + if (res != ncclSuccess) { + INFO(NCCL_ENV, "failed to decode NCCL_PROXY_CPUSET=%s. Ignoring", setEnv); + goto fail; + } + // debug info + char msg[1024] = {0}; + cpu_set_t currSet; + sched_getaffinity(0, sizeof(cpu_set_t), &currSet); + (void)ncclCpusetToStrList(&currSet, msg, sizeof(msg)); + snprintf(msg + strlen(msg), sizeof(msg) - strlen(msg), " changed to "); + (void)ncclCpusetToStrList(&proxyCpuset, msg + strlen(msg), sizeof(msg) - strlen(msg)); + INFO(NCCL_ENV, "NCCL_PROXY_CPUSET = %s: %s", setEnv, msg); + return; + } + // if we arrive here we have either no env or we have failed to decode it +fail: + CPU_ZERO(&proxyCpuset); + return; +} + void* ncclProxyProgress(void *proxyState_) { struct ncclProxyState* proxyState = (struct ncclProxyState*)proxyState_; + + // This thread is created by proxyService, therefore setting the affinity is not needed. + INFO(NCCL_INIT, "[Proxy Progress] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); + if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Progress] Set CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Progress] Failed to set CUDA device %d", proxyState->cudaDev); } - // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - - INFO(NCCL_INIT, "[Proxy Progress] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); struct ncclProxyProgressState* state = &proxyState->progressState; state->nextOps = -1; @@ -1567,15 +1605,17 @@ enum { void* ncclProxyService(void* _args) { struct ncclProxyState* proxyState = (struct ncclProxyState*) _args; - // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); + + // set the thread affinity before setting the cuda context + pthread_once(&proxyCpusetOnce,proxyCpusetOnceFunc); + if (CPU_COUNT(&proxyCpuset)) sched_setaffinity(0, sizeof(cpu_set_t), &proxyCpuset); + INFO(NCCL_INIT, "[Proxy Service] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); + if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Service] Created CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Service] Failed to set CUDA device %d", proxyState->cudaDev); } - // if (CPU_COUNT(&comm->cpuAffinity)) sched_setaffinity(0, sizeof(cpu_set_t), &comm->cpuAffinity); - - INFO(NCCL_INIT, "[Proxy Service] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); // Prepare poll descriptor struct ncclProxyConnectionPool connectionPool; @@ -1760,14 +1800,17 @@ void* ncclProxyServiceUDS(void* _args) { struct ncclProxyState* proxyState = 
(struct ncclProxyState*) _args; struct pollfd pollfds[1]; + // set the thread affinity before setting the cuda context + pthread_once(&proxyCpusetOnce,proxyCpusetOnceFunc); + if (CPU_COUNT(&proxyCpuset)) sched_setaffinity(0, sizeof(cpu_set_t), &proxyCpuset); + INFO(NCCL_INIT, "[Proxy Service UDS] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); + if (setProxyThreadContext(proxyState)) { INFO(NCCL_INIT, "[Proxy Service UDS] Set CUDA context on device %d", proxyState->cudaDev); } else if (cudaSetDevice(proxyState->cudaDev) != cudaSuccess) { WARN("[Proxy Service UDS] Failed to set CUDA device %d", proxyState->cudaDev); } - INFO(NCCL_INIT, "[Proxy Service UDS] Device %d CPU core %d", proxyState->cudaDev, sched_getcpu()); - if (ncclIpcSocketGetFd(&proxyState->ipcSock, &pollfds[0].fd) != ncclSuccess) { WARN("[Proxy Service UDS] Get listenSock fd fails"); return NULL; @@ -1807,6 +1850,7 @@ ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union comm->proxyState->listenSock = sock; comm->proxyState->peerAddresses = peerAddresses; comm->proxyState->peerAddressesUDS = peerAddressesUDS; + comm->proxyState->netAttr = NCCL_NET_ATTR_INIT; // UDS support NCCLCHECK(ncclIpcSocketInit(&comm->proxyState->ipcSock, comm->rank, peerAddressesUDS[comm->rank], comm->abortFlag)); @@ -1831,6 +1875,8 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) { proxyState->dmaBufSupport = comm->dmaBufSupport; proxyState->ncclNet = comm->ncclNet; proxyState->ncclCollNet = comm->ncclCollNet; + proxyState->netContext = comm->netContext; + proxyState->collNetContext = comm->collNetContext; proxyState->profilerContext = comm->profilerContext; proxyState->directMode = comm->directMode; memcpy(proxyState->buffSizes, comm->buffSizes, sizeof(comm->buffSizes)); diff --git a/src/ras/CMakeLists.txt b/src/ras/CMakeLists.txt new file mode 100644 index 000000000..2c08b8f99 --- /dev/null +++ b/src/ras/CMakeLists.txt @@ -0,0 +1,11 @@ +# RAS sources +set(RAS_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/collectives.cc + ${CMAKE_CURRENT_SOURCE_DIR}/rasnet.cc + ${CMAKE_CURRENT_SOURCE_DIR}/peers.cc + ${CMAKE_CURRENT_SOURCE_DIR}/ras.cc + ${CMAKE_CURRENT_SOURCE_DIR}/client_support.cc +) + +# Add RAS sources to parent scope +set(RAS_SOURCES ${RAS_SOURCES} PARENT_SCOPE) diff --git a/src/ras/ras.cc b/src/ras/ras.cc index 8ef551c64..948e26446 100644 --- a/src/ras/ras.cc +++ b/src/ras/ras.cc @@ -4,10 +4,6 @@ * See LICENSE.txt for license information ************************************************************************/ -// Workaround for libstdc++ trying to force public visibility of std:: symbols. We don't want to do that in libnccl.so. 
-#include -#undef _GLIBCXX_VISIBILITY -#define _GLIBCXX_VISIBILITY(V) #include #include #include @@ -76,7 +72,7 @@ static ncclResult_t rasNetSendNack(struct rasSocket* sock); static void* rasThreadMain(void*); -static void rasTerminate() __attribute__((destructor)); +static void rasTerminate(); NCCL_PARAM(RasTimeoutFactor, "RAS_TIMEOUT_FACTOR", 1); @@ -111,6 +107,8 @@ ncclResult_t ncclRasCommInit(struct ncclComm* comm, struct rasRankInit* myRank) ncclSetThreadName(rasThread, "NCCL RAS"); rasInitialized = true; + + atexit(rasTerminate); } } ncclAtomicRefCountIncrement(&rasInitRefCount); diff --git a/src/register/CMakeLists.txt b/src/register/CMakeLists.txt new file mode 100644 index 000000000..b3b35bfe6 --- /dev/null +++ b/src/register/CMakeLists.txt @@ -0,0 +1,9 @@ +# Register sources +set(REGISTER_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/register.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_reg.cc + ${CMAKE_CURRENT_SOURCE_DIR}/sendrecv_reg.cc +) + +# Add register sources to parent scope +set(REGISTER_SOURCES ${REGISTER_SOURCES} PARENT_SCOPE) diff --git a/src/register/coll_reg.cc b/src/register/coll_reg.cc index d9d9fb436..6cbb0c75f 100644 --- a/src/register/coll_reg.cc +++ b/src/register/coll_reg.cc @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #include "register.h" #include "transport.h" #include "enqueue.h" @@ -176,7 +182,8 @@ ncclResult_t ncclRegisterCollBuffers( // IPC buffer registration if (info->func == ncclFuncReduceScatter && info->algorithm != NCCL_ALGO_COLLNET_DIRECT) goto exit; if (info->algorithm == NCCL_ALGO_RING && ((info->func == ncclFuncAllReduce && info->sendbuff == info->recvbuff) || info->func == ncclFuncReduce)) goto exit; - if ((info->algorithm == NCCL_ALGO_TREE || info->algorithm == NCCL_ALGO_COLLNET_CHAIN) && info->sendbuff == info->recvbuff) goto exit; + if (info->algorithm == NCCL_ALGO_TREE && info->sendbuff == info->recvbuff) goto exit; + if (info->algorithm == NCCL_ALGO_COLLNET_CHAIN && info->sendbuff == info->recvbuff && comm->maxLocalRanks > 1) goto exit; if (info->func == ncclFuncAllGather && info->algorithm == NCCL_ALGO_PAT) goto exit; int peerRanks[NCCL_MAX_LOCAL_RANKS]; diff --git a/src/register/register.cc b/src/register/register.cc index 59928f57e..b118a4cc4 100644 --- a/src/register/register.cc +++ b/src/register/register.cc @@ -14,18 +14,6 @@ NCCL_PARAM(LocalRegister, "LOCAL_REGISTER", 1); -static ncclResult_t regFindHandleFromSymAddr(struct ncclComm* comm, void* baseSymPtr, struct ncclReg** handle) { - struct ncclRegCache* cache = &comm->regCache; - *handle = NULL; - for (int slot = 0; slot < cache->population; slot++) { - if (baseSymPtr == cache->slots[slot]->baseSymPtr) { - *handle = cache->slots[slot]; - break; - } - } - return ncclSuccess; -} - ncclResult_t ncclRegLocalIsValid(struct ncclReg *reg, bool *isValid) { if (reg && isValid) { if (reg->localRefs) @@ -174,104 +162,3 @@ ncclResult_t ncclCommGraphDeregister(const ncclComm_t comm, struct ncclReg *hand NCCLCHECK(commDeregister(comm, true, handle)); return ncclSuccess; } - -ncclResult_t ncclCommSymmetricRegisterInternal(struct ncclComm* comm, void* buff, size_t baseSize, size_t alignment, CUmemGenericAllocationHandle memHandle, struct ncclReg* regHandle) { - ncclResult_t ret = ncclSuccess; - void* regSymAddr = NULL; - ALIGN_SIZE(comm->symAllocHead, alignment); - 
NCCLCHECKGOTO(ncclIpcSymmetricMap(comm, comm->symAllocHead, baseSize, memHandle, ®SymAddr), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricMap(comm, comm->symAllocHead, baseSize, regSymAddr), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail); - comm->symAllocHead += baseSize; - regHandle->baseSymPtr = regSymAddr; - regHandle->symSize = baseSize; -exit: - return ret; -fail: - regHandle->baseSymPtr = NULL; - regHandle->symSize = 0; - goto exit; -} - -NCCL_API(ncclResult_t, ncclCommWindowRegister, ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags); -ncclResult_t ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags) { - ncclResult_t ret = ncclSuccess; - CUmemGenericAllocationHandle memHandle; - size_t baseSize; - void* baseAddr = NULL; - struct ncclReg* regHandle = NULL; - int saveDev; - - *win = NULL; - - CUDACHECK(cudaGetDevice(&saveDev)); - NCCLCHECK(ncclGroupStartInternal()); - if (!ncclParamLocalRegister() || !ncclCuMemEnable()) { - goto exit; - } - - NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail); - - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - if (comm && buff && size && win) { - size_t alignment = 0; - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)buff), ret, fail); - // size and alignment check - if (!((uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0 && baseSize % NCCL_REC_PAGE_SIZE == 0 && (uintptr_t)buff + size <= (uintptr_t)baseAddr + baseSize)) { - WARN("buffer %p (baseAddr %p align %d) size %zu (baseSize %ld align %d) does not satisfy symmetric registration requirements", buff, baseAddr, (uintptr_t)baseAddr % NCCL_REC_PAGE_SIZE == 0, size, baseSize, baseSize % NCCL_REC_PAGE_SIZE == 0); - goto fail; - } - NCCLCHECKGOTO(ncclRegister(comm, baseAddr, baseSize, false, (void**)®Handle), ret, fail); - NCCLCHECKGOTO(ncclCalloc(win, 1), ret, fail); - (*win)->handle = regHandle; - regHandle->winFlags = winFlags; - if (regHandle->baseSymPtr == NULL && comm->symmetricSupport) { - struct ncclSymRegTask* task; - CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, baseAddr), ret, fail); - CUCHECKGOTO(cuMemRelease(memHandle), ret, fail); - alignment = baseSize >= NCCL_REC_PAGE_SIZE * 72L ? 
NCCL_MAX_PAGE_SIZE : NCCL_REC_PAGE_SIZE; - NCCLCHECKGOTO(ncclCalloc(&task, 1), ret, fail); - task->buff = buff; - task->baseSize = baseSize; - task->memHandle = memHandle; - task->regHandle = regHandle; - task->alignment = alignment; - ncclIntruQueueEnqueue(&comm->symRegTaskQueue, task); - ncclGroupCommJoin(comm, ncclGroupTaskTypeSymRegister); - } - } - -exit: - ncclGroupErrCheck(ret); - NCCLCHECK(ret = ncclGroupEndInternal()); - cudaSetDevice(saveDev); - return ret; -fail: - free(*win); - *win = NULL; - goto exit; -} - -NCCL_API(ncclResult_t, ncclCommWindowDeregister, ncclComm_t comm, ncclWindow_t win); -ncclResult_t ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win) { - ncclResult_t ret = ncclSuccess; - int saveDev; - struct ncclReg* regHandle; - CUDACHECK(cudaGetDevice(&saveDev)); - if (win == NULL) goto exit; - regHandle = win->handle; - if (regHandle && ncclParamLocalRegister() && ncclCuMemEnable()) { - if (regHandle->baseSymPtr) { - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, fail); - NCCLCHECKGOTO(ncclNvlsSymmetricFree(comm, regHandle->symSize, regHandle->baseSymPtr), ret, fail); - NCCLCHECKGOTO(ncclIpcSymmetricFree(comm, regHandle->symSize, regHandle->baseSymPtr), ret, fail); - } - NCCLCHECKGOTO(commDeregister(comm, false, regHandle), ret, fail); - } - free(win); -exit: - CUDACHECK(cudaSetDevice(saveDev)); - return ret; -fail: - goto exit; -} diff --git a/src/register/sendrecv_reg.cc b/src/register/sendrecv_reg.cc index f82fbd714..9114fab01 100644 --- a/src/register/sendrecv_reg.cc +++ b/src/register/sendrecv_reg.cc @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + #include "register.h" #include "transport.h" diff --git a/src/scheduler/CMakeLists.txt b/src/scheduler/CMakeLists.txt new file mode 100644 index 000000000..f6583bd4d --- /dev/null +++ b/src/scheduler/CMakeLists.txt @@ -0,0 +1,7 @@ +# Scheduler sources +set(SCHEDULER_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/symmetric_sched.cc +) + +# Add scheduler sources to parent scope +set(SCHEDULER_SOURCES ${SCHEDULER_SOURCES} PARENT_SCOPE) diff --git a/src/scheduler/symmetric_sched.cc b/src/scheduler/symmetric_sched.cc new file mode 100644 index 000000000..440b6061b --- /dev/null +++ b/src/scheduler/symmetric_sched.cc @@ -0,0 +1,235 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SYMMETRIC_SCHED_H_ +#define NCCL_SYMMETRIC_SCHED_H_ + +#include "scheduler.h" + +ncclResult_t ncclMakeSymmetricTaskList(struct ncclComm* comm, struct ncclTaskColl* task, struct ncclIntruQueue* symTaskQueue, struct ncclTaskColl** remainTasksHead) { + ncclResult_t ret = ncclSuccess; + int fnOpTySymCount = 0; + struct ncclTaskColl* tasksSymByFnOpTy[ncclNumFuncs * ncclNumDevRedOps * ncclNumTypes]; + int fnOpTySymIndices[ncclNumFuncs * ncclNumDevRedOps * ncclNumTypes]; + struct ncclKernelPlanner* planner = &comm->planner; + struct ncclTaskColl* remainTasksTail = nullptr; + + memset(tasksSymByFnOpTy, 0, sizeof(tasksSymByFnOpTy)); + *remainTasksHead = nullptr; + while (task != nullptr) { + int index = ((int)task->func*ncclNumDevRedOps + (int)task->opDev.op)*ncclNumTypes + (int)task->datatype; + struct ncclTaskColl* next = task->next; + NCCLCHECK(ncclDevrFindWindow(comm, task->sendbuff, &task->sendWin)); + NCCLCHECK(ncclDevrFindWindow(comm, task->recvbuff, &task->recvWin)); + bool symAvailable = ncclSymkAvailable(comm, task->func, task->opDev.op, task->datatype, task->count); + + if (task->sendWin && task->recvWin && (task->sendWin->winFlags & task->recvWin->winFlags & NCCL_WIN_COLL_SYMMETRIC) && symAvailable) { + if (tasksSymByFnOpTy[index] == nullptr) fnOpTySymIndices[fnOpTySymCount++] = index; + task->next = tasksSymByFnOpTy[index]; + tasksSymByFnOpTy[index] = task; + planner->nTasksColl--; + } else { + if (*remainTasksHead) { + remainTasksTail->next = task; + remainTasksTail = task; + } else { + *remainTasksHead = remainTasksTail = task; + } + } + task = next; + } + if (remainTasksTail) remainTasksTail->next = nullptr; + + // make sure kernel args space can hold at least a single work + assert(comm->workArgsBytes >= ncclSymkDevWorkArgs::calcArgsSize(MAXCHANNELS, 1)); + + // Determine symmetric tasks kernels + for (int cursor = 0; cursor < fnOpTySymCount; cursor++) { + struct ncclTaskColl* task = tasksSymByFnOpTy[fnOpTySymIndices[cursor]]; + while (task != NULL) { + ncclSymkKernelId kernelId = ncclSymkKernelId_Count; + int nChannels = MAXCHANNELS; + int nWarps = 0; + int nWorks = 0; + float estTimeUs = 1.e18; + size_t countTotal = 0, countMax = 0; + struct ncclTaskColl* headTask = task; + size_t cellCount = NCCL_SYM_KERNEL_CELL_SIZE / ncclTypeSize(headTask->datatype); + // For now we assume higher kernel id means a kernel for larger data size + while (task != nullptr) { + size_t count; + nWorks++; + count = alignUp(task->count, cellCount); + countTotal += count; + if (count > countMax) countMax = count; + if (ncclSymkDevWorkArgs::calcArgsSize(MAXCHANNELS, nWorks + 1) > comm->workArgsBytes || task->next == nullptr) { + task->isSymLast = 1; + break; + } + task = task->next; + } + NCCLCHECK(ncclSymkPickKernel(comm, headTask->func, headTask->opDev.op, headTask->datatype, + countTotal, countMax, nWorks, + &estTimeUs, &kernelId, &nChannels, &nWarps)); + if (kernelId == ncclSymkKernelId_Count) { + char const* name = ncclGetEnv("NCCL_SYM_KERNEL"); + WARN("Error: no symmetric kernel available for function %s.%s%s", + ncclFuncToString(headTask->func), (name ? " NCCL_SYM_KERNEL was set to " : ""), (name ? name: "")); + ret = (name ? 
ncclInvalidUsage : ncclInternalError); + goto fail; + } + // set all symmetric tasks to the same kernel + task = headTask; + while (task != nullptr) { + struct ncclTaskColl* next = task->next; + int isSymLast = task->isSymLast; + task->devFuncId = (uint32_t)kernelId; + task->nMaxChannels = nChannels; + task->nWarps = nWarps; + ncclIntruQueueEnqueue(&planner->collSymTaskQueue, task); + task = next; + if (isSymLast) break; + } + } + } + +exit: + return ret; +fail: + goto exit; +} + +ncclResult_t ncclSymmetricTaskScheduler(struct ncclComm* comm, struct ncclIntruQueue* symTaskQueue, struct ncclKernelPlan* plan) { + struct ncclTaskColl* headTask = ncclIntruQueueHead(symTaskQueue); + int devFuncId = headTask->devFuncId; + struct ncclTaskColl* task = NULL; + ssize_t totalCount = 0; // aligned bytes + ssize_t logCount = 0; + ssize_t remainCell = 0; + ssize_t cellPerChannel = 0; + int workCount = 0, workIndex = 0; + size_t cellCount = NCCL_SYM_KERNEL_CELL_SIZE / ncclTypeSize(headTask->datatype); // minimal cell size + ncclResult_t ret = ncclSuccess; + int curChannel = 0; + int curChannelWork = 0; + int nMaxChannels = headTask->nMaxChannels; + struct ncclSymkDevWork* workBufPtr = NULL; + struct ncclSymkChannelWorkRange* workRangePtr = NULL; + const char* funcName = ncclFuncToString(headTask->func); + const char* kernelName = ncclSymkKernelIdToString(headTask->devFuncId); + struct ncclSymkDevWorkArgs* argsBuf = NULL; + + plan->isSymColl = true; + plan->threadPerBlock = headTask->nWarps * WARP_SIZE; + plan->hasProxyOps = false; + plan->kernelFn = ncclSymkGetKernelPtr((ncclSymkKernelId)headTask->devFuncId, headTask->opDev.op, headTask->datatype); + task = headTask; + while (task != nullptr && task->devFuncId == devFuncId) { + workCount++; + totalCount += alignUp(task->count, cellCount); + logCount += task->count; + if (task->isSymLast == 1) break; + task = task->next; + } + + plan->kernelArgsSize = ncclSymkDevWorkArgs::calcArgsSize(nMaxChannels, workCount); + argsBuf = (struct ncclSymkDevWorkArgs*)calloc(1, plan->kernelArgsSize); + + remainCell = cellPerChannel = DIVUP(DIVUP(totalCount, nMaxChannels), cellCount); + workRangePtr = argsBuf->getWorkRange(); + workBufPtr = argsBuf->getWorks(nMaxChannels); + argsBuf->nMaxChannels = nMaxChannels; + + while (!ncclIntruQueueEmpty(symTaskQueue)) { + struct ncclSymkDevWork devWork = {}; + size_t cellLeft = 0, taskCell = 0; + uint8_t isSymLast = 0; + + if (ncclIntruQueueHead(symTaskQueue)->devFuncId != devFuncId) break; // scheduling is done + + task = ncclIntruQueueDequeue(symTaskQueue); + isSymLast = task->isSymLast; + + NCCLCHECKGOTO(ncclSymkMakeDevWork(comm, task, &devWork), ret, fail); + + cellLeft = taskCell = DIVUP(task->count, cellCount); + for (;curChannel < nMaxChannels;) { + workRangePtr[curChannel].workHi = workIndex; + if (curChannelWork == 0) { + if (devWork.nChannels == 0) { + devWork.sChannelId = curChannel; + devWork.nChannels = 1; + } else if (cellLeft <= remainCell) { + // the last segment of the task + assert(devWork.nChannels > 0); + // if the remaining cell is less than 1024 bytes, we can fuse the last channel + if ((remainCell - cellLeft) * NCCL_SYM_KERNEL_CELL_SIZE <= (1 << 10) || ncclIntruQueueEmpty(symTaskQueue)) devWork.nChannels++; + } else { + // middle segment of the task + devWork.nChannels++; + } + } else { + assert(cellLeft == taskCell); + if (taskCell <= remainCell) { + // the first segment of the task is fully scheduled onto the channel + devWork.sChannelId = curChannel; + devWork.nChannels = 1; + } + } + if (cellLeft < 
remainCell) { + workRangePtr[curChannel].fracHi = uint16_t(0x10000UL - 1); + remainCell -= cellLeft; + curChannelWork++; + break; + } else if (cellLeft == remainCell) { + workRangePtr[curChannel].fracHi = uint16_t(0x10000UL - 1); + remainCell = cellPerChannel; + curChannel++; + curChannelWork = 0; + break; + } else { + // cellLeft > remainCell; the task is partially scheduled onto the channel + cellLeft -= remainCell; + workRangePtr[curChannel].fracHi = uint16_t(DIVUP(0x10000L * (taskCell - cellLeft), taskCell) - 1); + remainCell = cellPerChannel; + curChannel++; + curChannelWork = 0; + } + } + memcpy(workBufPtr + workIndex, &devWork, sizeof(struct ncclSymkDevWork)); + workIndex++; + + // Profiler + plan->groupApiEventHandle = task->groupApiEventHandle; + + ncclMemoryPoolFree(&comm->memPool_ncclTaskColl, task); + if (isSymLast == 1) break; + if (curChannel == nMaxChannels) { + WARN("ncclSymmetricTaskScheduler ran out of channel space (nMaxChannels=%d, workCount=%d, workIndex=%d)", + nMaxChannels, workCount, workIndex); + goto fail; + } + } + if (remainCell < cellPerChannel) curChannel++; + + memcpy(&argsBuf->kcomm, &comm->symkState.kcomm, sizeof(comm->symkState.kcomm)); + plan->workBytes = totalCount * ncclTypeSize(headTask->datatype); + plan->channelMask = uint64_t(-1) >> (64 - curChannel); + plan->kernelSymArgs = (void*)argsBuf; + plan->workStorageType = ncclDevWorkStorageTypeArgs; + + if (comm->rank == 0) { + INFO(NCCL_TUNING, "%s [Symmetric]: %ld Bytes -> Kernel %s nchannels %d nthreads %d nWorks %d", funcName, + logCount * ncclTypeSize(headTask->datatype), kernelName, curChannel, plan->threadPerBlock, workCount); + } + +exit: + return ret; +fail: + goto exit; +} + +#endif // NCCL_SYMMETRIC_SCHED_H_ diff --git a/src/symmetric.cc b/src/sym_kernels.cc similarity index 52% rename from src/symmetric.cc rename to src/sym_kernels.cc index f5b1e6c22..df4965d56 100644 --- a/src/symmetric.cc +++ b/src/sym_kernels.cc @@ -1,14 +1,22 @@ -#include "symmetric.h" +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "sym_kernels.h" #include "comm.h" #include "device.h" +#include "transport.h" #include constexpr char const* kernelName[] = { - // Must align with enum ncclSymKernelId definition in src/include/symmetric.h + // Must align with enum ncclSymkKernelId definition in src/include/sym_kernels.h "AllReduce_AGxLL_R", "AllReduce_AGxLLMC_R", "AllReduce_RSxLD_AGxST", "AllReduce_RSxLDMC_AGxSTMC", + "AllReduce_RSxNet_ARxMC_AGxNet", "AllGather_LL", "AllGather_LLMC", "AllGather_ST", @@ -18,34 +26,34 @@ constexpr char const* kernelName[] = { "ReduceScatter_LDMC" }; -constexpr uint32_t kernelMask_STMC = 1<nRanks; - int nMaxBlocks = ncclSymMaxBlocks; + int nMaxBlocks = ncclSymkMaxBlocks; int nMaxBlocksNvls = divUp((comm->cudaArch < 1000 ? 
16 : 32), nRanks); size_t busBytes; // max(bytes sent, bytes received) double busMultiplier = 1; @@ -116,45 +124,45 @@ static void queryModel(struct ncclComm* comm, ncclSymKernelId k, size_t nBytes, busBytes = size_t(1)<<50; break; - case ncclSymKernelId_AllReduce_AGxLL_R: + case ncclSymkKernelId_AllReduce_AGxLL_R: busBytes = nRanks*nBytes*LL_BusFactor; break; - case ncclSymKernelId_AllReduce_AGxLLMC_R: + case ncclSymkKernelId_AllReduce_AGxLLMC_R: busBytes = nRanks*nBytes*LL_BusFactor; busMultiplier = 1.1; // To beat non-MC LL break; - case ncclSymKernelId_AllReduce_RSxLD_AGxST: + case ncclSymkKernelId_AllReduce_RSxLD_AGxST: busBytes = 2*nBytes*(nRanks-1)/nRanks; break; - case ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC: + case ncclSymkKernelId_AllReduce_RSxLDMC_AGxSTMC: busBytes = nBytes/nRanks + nBytes; busMultiplier = nRanks; nMaxBlocks = nMaxBlocksNvls; break; - case ncclSymKernelId_AllGather_LL: + case ncclSymkKernelId_AllGather_LL: busBytes = nRanks*nBytes*LL_BusFactor; break; - case ncclSymKernelId_AllGather_LLMC: + case ncclSymkKernelId_AllGather_LLMC: busBytes = nRanks*nBytes*LL_BusFactor; busMultiplier = 1.1; // To beat non-MC LL break; - case ncclSymKernelId_AllGather_ST: + case ncclSymkKernelId_AllGather_ST: busBytes = (nRanks-1)*nBytes; break; - case ncclSymKernelId_AllGather_STMC: + case ncclSymkKernelId_AllGather_STMC: busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC. busMultiplier = 0.55*nRanks; nMaxBlocks = nMaxBlocksNvls; break; - case ncclSymKernelId_ReduceScatter_LL: + case ncclSymkKernelId_ReduceScatter_LL: busBytes = nRanks*nBytes*LL_BusFactor; break; - case ncclSymKernelId_ReduceScatter_LD: + case ncclSymkKernelId_ReduceScatter_LD: busBytes = (nRanks-1)*nBytes; break; - case ncclSymKernelId_ReduceScatter_LDMC: + case ncclSymkKernelId_ReduceScatter_LDMC: busBytes = (nRanks-1)*nBytes; // Wrong. Should be nRanks*nBytes but we want to beat non-MC. busMultiplier = 0.55*nRanks; nMaxBlocks = nMaxBlocksNvls; @@ -164,7 +172,7 @@ static void queryModel(struct ncclComm* comm, ncclSymKernelId k, size_t nBytes, nMaxBlocks = std::min(nMaxBlocks, comm->config.maxCTAs); int nMinBlocks = comm->config.minCTAs; - int nUserCTAs = std::min(ncclSymMaxBlocks, ncclParamSymCTAs()); + int nUserCTAs = std::min(ncclSymkMaxBlocks, ncclParamSymCTAs()); if (nUserCTAs > 0) nMinBlocks = nMaxBlocks = nUserCTAs; bool isLL = kernelMask_LL>>k & 1; @@ -175,11 +183,11 @@ static void queryModel(struct ncclComm* comm, ncclSymKernelId k, size_t nBytes, if (comm->cudaArch < 1000) { baseLat = isLL ? 4.5 : 7.8; smBw = isAR ? 65*GBps : 44*GBps; - peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 480*GBps : 320*GBps; + peakBw = k == ncclSymkKernelId_AllReduce_RSxLDMC_AGxSTMC ? 480*GBps : 320*GBps; } else { baseLat = isLL ? (isAG ? 8.5 : 11) : (isAR ? 19.5 : 13.0); smBw = 55*GBps; - peakBw = k == ncclSymKernelId_AllReduce_RSxLDMC_AGxSTMC ? 1000*GBps : 600*GBps; + peakBw = k == ncclSymkKernelId_AllReduce_RSxLDMC_AGxSTMC ? 
1000*GBps : 600*GBps; } *nBlocks = nMaxBlocks; *timeUs = model(busBytes, baseLat, nMaxBlocks, smBw, busMultiplier, peakBw); @@ -194,7 +202,36 @@ static void queryModel(struct ncclComm* comm, ncclSymKernelId k, size_t nBytes, } } -bool ncclSymImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) { +ncclResult_t ncclSymkInitOnce(struct ncclComm* comm) { + struct ncclSymkState* symk = &comm->symkState; + if (!symk->initialized) { + symk->initialized = true; + struct ncclDevCommRequirements reqs = {}; + reqs.lsaMultimem = comm->nvlsSupport; + reqs.lsaBarrierCount = ncclSymkMaxBlocks; + + struct ncclDevResourceRequirements lla2aReq; + ncclLLA2ACreateRequirement( + ncclSymkMaxBlocks, ncclLLA2ACalcSlots(ncclTeamLsa(comm).nRanks*ncclSymkMaxThreads, ncclSymkLLMaxEltSize), + &symk->kcomm.lsaLLA2A, &lla2aReq + ); + lla2aReq.next = reqs.resourceRequirementsList; + reqs.resourceRequirementsList = &lla2aReq; + + NCCLCHECK(ncclDevrCommCreateInternal(comm, &reqs, &symk->kcomm.devComm)); + } + return ncclSuccess; +} + +ncclResult_t ncclSymkFinalize(struct ncclComm* comm) { + struct ncclSymkState* symk = &comm->symkState; + if (symk->initialized) { + NCCLCHECK(ncclDevCommDestroy(comm, &symk->kcomm.devComm)); + } + return ncclSuccess; +} + +static bool ncclSymkImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty) { bool isFloat; switch (ty) { case ncclFloat64: @@ -221,10 +258,7 @@ bool ncclSymImplemented(ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType } } -ncclResult_t ncclSymPickKernel( - struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts, - float* estTimeUs, ncclSymKernelId* kernelId, int* nBlocks, int* nWarps - ) { +static uint32_t ncclSymkMask(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, size_t nElts) { uint32_t kmask = kernelMask_coll(coll); kmask &= kernelMask_user(); @@ -263,14 +297,37 @@ ncclResult_t ncclSymPickKernel( // to be at least 32 bytes per chunk) if (nBusBytes >= 32*(size_t(2)<<30)) kmask = 0; - ncclSymKernelId bestKernel = ncclSymKernelId_Count; + return kmask; +} + +bool ncclSymkAvailable(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, + ncclDataType_t ty, size_t nElts) { + if (!ncclSymkImplemented(coll, red, ty)) + return false; + + return (ncclSymkMask(comm, coll, red, ty, nElts) != 0); +} + +ncclResult_t ncclSymkPickKernel( + struct ncclComm* comm, ncclFunc_t coll, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty, + size_t nEltsTotal, size_t nEltsMax, int nWorks, + float* estTimeUs, ncclSymkKernelId* kernelId, int* nBlocks, int* nWarps + ) { + uint32_t kmask = ncclSymkMask(comm, coll, red, ty, nEltsMax); + + // We currently don't support grouping for LL kernels. 
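+  // Dropping kernelMask_LL from the candidate mask below means fused
+  // (nWorks > 1) launches can only select non-LL symmetric kernels.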
+ if (nWorks > 1) + kmask &= ~kernelMask_LL; + + ncclSymkKernelId bestKernel = ncclSymkKernelId_Count; float bestTime = 1.e30f; int bestBlocks = 999; + size_t nBytes = nEltsTotal*ncclTypeSize(ty); constexpr float smPenalty = .025f; // 2.5% percent increase in time per SM uint32_t kmaskRemain = kmask; while (kmaskRemain != 0) { - ncclSymKernelId k = (ncclSymKernelId)popFirstOneBit(&kmaskRemain); + ncclSymkKernelId k = (ncclSymkKernelId)popFirstOneBit(&kmaskRemain); float kTime; int kBlocks; queryModel(comm, k, nBytes, &kTime, &kBlocks); @@ -282,15 +339,29 @@ ncclResult_t ncclSymPickKernel( } *kernelId = bestKernel; - *estTimeUs = kmask==0 || kernelMask_user() == (1<= ncclSymKernelId_Count) { +const char* ncclSymkKernelIdToString(int kernelId) { + if (kernelId < 0 || kernelId >= ncclSymkKernelId_Count) { return "Unknown"; } return kernelName[kernelId]; } + +/* this function fills in the devWork except nextWorkOffset */ +ncclResult_t ncclSymkMakeDevWork(struct ncclComm* comm, struct ncclTaskColl* task, struct ncclSymkDevWork* outDevWork) { + outDevWork->rootRank = task->root; + outDevWork->redOpArg = task->opDev.scalarArg; + outDevWork->nElts = task->count; + outDevWork->inputWin = task->sendWin->vidmem; + outDevWork->inputOff = (uint8_t*)task->sendbuff - (uint8_t*)task->sendWin->userPtr; + outDevWork->outputWin = task->recvWin->vidmem; + outDevWork->outputOff = (uint8_t*)task->recvbuff - (uint8_t*)task->recvWin->userPtr; + outDevWork->sChannelId = 0xffff; + outDevWork->nChannels = 0; + return ncclSuccess; +} diff --git a/src/transport/CMakeLists.txt b/src/transport/CMakeLists.txt new file mode 100644 index 000000000..0485008c0 --- /dev/null +++ b/src/transport/CMakeLists.txt @@ -0,0 +1,15 @@ +# Transport sources +set(TRANSPORT_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/nvls.cc + ${CMAKE_CURRENT_SOURCE_DIR}/profiler.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_socket.cc + ${CMAKE_CURRENT_SOURCE_DIR}/p2p.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net.cc + ${CMAKE_CURRENT_SOURCE_DIR}/net_ib.cc + ${CMAKE_CURRENT_SOURCE_DIR}/coll_net.cc + ${CMAKE_CURRENT_SOURCE_DIR}/shm.cc + ${CMAKE_CURRENT_SOURCE_DIR}/generic.cc +) + +# Add transport sources to parent scope +set(TRANSPORT_SOURCES ${TRANSPORT_SOURCES} PARENT_SCOPE) diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 386865e21..6cf6b18c7 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -355,7 +355,7 @@ static ncclResult_t sharedListen(struct ncclProxyState* proxyState, int netDev, collNet->resources = resources; } if (resources->collNetComms[netDev] == NULL) - NCCLCHECK(proxyState->ncclCollNet->listen(netDev, collNetHandle, resources->collNetListenComms + netDev)); + NCCLCHECK(proxyState->ncclCollNet->listen(proxyState->collNetContext, netDev, collNetHandle, resources->collNetListenComms + netDev)); return ncclSuccess; } @@ -1223,14 +1223,19 @@ ncclResult_t ncclCollnetLocalRegisterBuffer(struct ncclComm* comm, const void* u ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; bool isValid = false; + void *base = NULL; + size_t baseSize = 0; *outRegBufFlag = 0; *outHandle = NULL; if (comm && userbuff && buffSize > 0) { NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); - if (isValid) - NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); + if (isValid) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&base, &baseSize, (CUdeviceptr)userbuff), ret, fail); + 
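+      // If the requested range spills past the allocation reported by
+      // cuMemGetAddressRange, skip registration (goto exit leaves
+      // *outRegBufFlag == 0) rather than registering a partial region.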
if ((uint64_t)base + baseSize < (uint64_t)userbuff + buffSize) goto exit; + } + NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); } exit: return ret; @@ -1256,13 +1261,14 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u ncclResult_t ret = ncclSuccess; struct ncclCollnetCleanupCallback* record = NULL; struct ncclReg *regRecord = NULL; - void *baseSend = NULL; - size_t baseSendSize = 0; + void *base = NULL; + size_t baseSize = 0; *outRegBufFlag = 0; if (comm && userbuff && buffSize > 0) { - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); - NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)®Record), ret, fail); + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&base, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)base + baseSize < (uint64_t)userbuff + buffSize) goto exit; + NCCLCHECKGOTO(ncclCommGraphRegister(comm, base, baseSize, (void**)®Record), ret, fail); NCCLCHECKGOTO(collnetRegisterBuffer(comm, userbuff, buffSize, type, regRecord, outRegBufFlag, outHandle), ret, fail); if (*outRegBufFlag) { @@ -1473,11 +1479,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop ncclResult_t ret = ncclSuccess; int rank = comm->rank; int collNetSetupFail = 0; - // Find all head ranks - int nHeadsUnique = 0; - int* headsUnique = NULL; bool share; - struct ncclTopoGraph* directGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; struct collnetShareInfo { int headPosition; @@ -1485,20 +1487,30 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop }; struct collnetShareInfo* infos = NULL; - NCCLCHECKGOTO(ncclCalloc(&headsUnique, directGraph->nChannels), ret, fail); - { uint64_t mask = 0; + struct ncclTopoGraph* collNetGraph; + + if (!comm->nvlsSupport) { + collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT]; + NCCLCHECKGOTO(ncclCalloc(&comm->collNetHeads, collNetGraph->nChannels), ret, fail); + uint64_t mask = 0; // Head GPU index is always 0 - for (int c = 0; c < directGraph->nChannels; c++) { - int head = directGraph->intra[c * comm->localRanks + 0]; + for (int c = 0; c < collNetGraph->nChannels; c++) { + int head = collNetGraph->intra[c * comm->localRanks + 0]; assert(comm->rankToNode[head] == comm->node); uint64_t mask0 = mask; mask |= 1ull<rankToLocalRank[head]; - if (mask != mask0) headsUnique[nHeadsUnique++] = head; + if (mask != mask0) comm->collNetHeads[comm->collNetHeadsNum++] = head; } + } else { + // Use the NVLS graph to get the head ranks for collnet setup. comm->nvlsHeads already has unique heads. + // nHeads is the same on all the channels, see connectNvls function + collNetGraph = graphs[NCCL_ALGO_NVLS]; + NCCLCHECKGOTO(ncclCalloc(&comm->collNetHeads, collNetGraph->nChannels), ret, fail); + comm->collNetHeadsNum = comm->channels[0].nvls.nHeads; + // Copy over comm->collNetHeads from comm->nvlsHeads since they are freed in different places. + memcpy(comm->collNetHeads, comm->nvlsHeads, comm->collNetHeadsNum * sizeof(int)); } - comm->collNetHeads = headsUnique; - comm->collNetHeadsNum = nHeadsUnique; if (parent && parent->config.collnetEnable && parent->nNodes == comm->nNodes) { if (!parent->shareResources) { collNetSetupFail = 1; @@ -1508,7 +1520,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop /* check whether child can share collnet resources of parent. 
Since parent builds each collnet communicator * based on heads with the same head position in each node, as long as the collnet heads of child comm * can match parent's heads, we can let child communicator share parent's collnet resources. */ - for (int h = 0; h < nHeadsUnique; ++h) { + for (int h = 0; h < comm->collNetHeadsNum; ++h) { int prev = INT_MIN; struct collnetShareInfo* myinfo; @@ -1516,7 +1528,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop myinfo = infos + comm->rank; memset(myinfo, 0, sizeof(struct collnetShareInfo)); /* find the child head position in parent collnet heads. */ - if (headsUnique[h] == comm->rank) { + if (comm->collNetHeads[h] == comm->rank) { myinfo->headPosition = -1; myinfo->isMaster = 1; for (int th = 0; th < parent->collNetHeadsNum; ++th) @@ -1567,11 +1579,11 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop for (int c = 0; c < comm->nChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(initCollnetChannel(comm, c, parent, false), ret, fail); - for (int h = 0; h < nHeadsUnique; h++) { - const int head = headsUnique[h]; + for (int h = 0; h < comm->collNetHeadsNum; h++) { + const int head = comm->collNetHeads[h]; ncclConnect connect; - collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetRecv, &connect); - if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, directGraph, channel, head, head, h, collNetSend, &connect); + collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetRecv, &connect); + if (!collNetSetupFail) collNetSetupFail |= ncclTransportCollNetSetup(comm, collNetGraph, channel, head, head, h, collNetSend, &connect); } // Verify CollNet setup across ranks after trying the first channel if (c == 0) { @@ -1592,7 +1604,7 @@ ncclResult_t ncclCollNetSetup(ncclComm_t comm, ncclComm_t parent, struct ncclTop bool isHead = false; matrix = nullptr; NCCLCHECKGOTO(ncclCalloc(&matrix, comm->nRanks), ret, matrix_end); - for (int h = 0; h < nHeadsUnique; h++) isHead |= (headsUnique[h] == comm->rank); + for (int h = 0; h < comm->collNetHeadsNum; h++) isHead |= (comm->collNetHeads[h] == comm->rank); if (isHead) { for (int ty=0; ty < ncclNumTypes; ty++) { for (int op=0; op < 4; op++) { diff --git a/src/transport/generic.cc b/src/transport/generic.cc index 47b023667..a42418773 100644 --- a/src/transport/generic.cc +++ b/src/transport/generic.cc @@ -1,3 +1,9 @@ +/************************************************************************* + * Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + #include "comm.h" #include "transport.h" #include "bootstrap.h" diff --git a/src/transport/net.cc b/src/transport/net.cc index c0cd20d6e..5d1a5601c 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -178,6 +178,101 @@ struct setupReq { NCCL_PARAM(NetOptionalRecvCompletion, "NET_OPTIONAL_RECV_COMPLETION", 1); static_assert(sizeof(ncclNetHandle_t) + sizeof(int) <= CONNECT_SIZE, "Not large enough ncclConnect to hold ncclNetHandle_t and useGdr flag"); + +// Common function to initialize network attributes from a ncclComm +static void populateCommNetAttrs(struct ncclComm* comm, struct ncclConnector* conn, ncclNetAttr_t* netAttr) { + *netAttr = NCCL_NET_ATTR_INIT; + netAttr->sendCommAttr.minConcurrentPeers = 1; + netAttr->sendCommAttr.minFlowsPerPeer = 1; + + netAttr->recvCommAttr.minConcurrentPeers = 1; + netAttr->recvCommAttr.minFlowsPerPeer = 1; + + if (conn->p2pOnly) { + size_t maxConcPeers = comm->p2pnChannels * NCCL_MAX_DEV_WORK_P2P_PER_BATCH; + if (comm->nRanks < maxConcPeers) maxConcPeers = comm->nRanks; + + netAttr->sendCommAttr.maxConcurrentPeers = maxConcPeers; + netAttr->sendCommAttr.maxFlowsPerPeer = comm->p2pnChannelsPerPeer; + netAttr->recvCommAttr.maxConcurrentPeers = maxConcPeers; + netAttr->recvCommAttr.maxFlowsPerPeer = comm->p2pnChannelsPerPeer; + netAttr->op = BIT(ncclFuncSend) | BIT(ncclFuncRecv) | + BIT(ncclFuncAlltoAll) | BIT(ncclFuncScatter) | BIT(ncclFuncGather); + } else { + size_t maxConcPeers = (NCCL_MAX_TREE_ARITY - 1) * 2; + if (comm->nRanks < maxConcPeers) maxConcPeers = comm->nRanks; + netAttr->sendCommAttr.maxConcurrentPeers = maxConcPeers; + netAttr->sendCommAttr.maxFlowsPerPeer = comm->nChannels; + netAttr->recvCommAttr.maxConcurrentPeers = maxConcPeers; + netAttr->recvCommAttr.maxFlowsPerPeer = comm->nChannels; + } +} + +// Apply the netAttr to the netComm +void setNetAttrs(struct ncclProxyState* proxyState, ncclNetAttr_t* netAttr) +{ + if (proxyState->ncclNet->setNetAttr) { + proxyState->ncclNet->setNetAttr(proxyState->netContext, netAttr); + proxyState->netAttr = *netAttr; + } +} + +void printNetAttrs(ncclNetAttr_t* netAttr, const char *task) +{ + const int opBufLen = ncclNumFuncs*32; + char opBuf[opBufLen] = ""; + const int algoBufLen = NCCL_NUM_ALGORITHMS*32; + char algoBuf[algoBufLen] = ""; + const int protoBufLen = NCCL_NUM_PROTOCOLS*32; + char protoBuf[protoBufLen] = ""; + + ncclBitsToString(netAttr->op, MASK(ncclNumFuncs), (const char* (*)(int))ncclFuncToString, opBuf, opBufLen, "*"); + ncclBitsToString(netAttr->algo, MASK(NCCL_NUM_ALGORITHMS), ncclAlgoToString, algoBuf, algoBufLen, "*"); + ncclBitsToString(netAttr->proto, MASK(NCCL_NUM_PROTOCOLS), ncclProtoToString, protoBuf, protoBufLen, "*"); + + TRACE(NCCL_NET, "%s hints, send peers/flows: [%d-%d][%d-%d] recv peers/flows: [%d-%d][%d-%d] op: %s algo: %s proto: %s", + task, netAttr->sendCommAttr.minConcurrentPeers, netAttr->sendCommAttr.maxConcurrentPeers, + netAttr->sendCommAttr.minFlowsPerPeer, netAttr->sendCommAttr.maxFlowsPerPeer, + netAttr->recvCommAttr.minConcurrentPeers, netAttr->recvCommAttr.maxConcurrentPeers, + netAttr->recvCommAttr.minFlowsPerPeer, netAttr->recvCommAttr.maxFlowsPerPeer, + opBuf, algoBuf, protoBuf); +} + +// Set the netAttr for a transfer operation +void setXferNetAttrs(struct ncclProxyState* proxyState, struct ncclProxyArgs* args, int send) +{ + ncclNetAttr_t netAttr; + + if (!proxyState->ncclNet->setNetAttr) + return; + + netAttr = 
proxyState->netAttr; + + if (send) { + netAttr.sendCommAttr.maxConcurrentPeers = args->nPeers; + netAttr.sendCommAttr.minConcurrentPeers = args->nPeers; + netAttr.sendCommAttr.maxFlowsPerPeer = args->nChannels; + netAttr.sendCommAttr.minFlowsPerPeer = args->nChannels; + } else { + netAttr.recvCommAttr.maxConcurrentPeers = args->nPeers; + netAttr.recvCommAttr.minConcurrentPeers = args->nPeers; + netAttr.recvCommAttr.maxFlowsPerPeer = args->nChannels; + netAttr.recvCommAttr.minFlowsPerPeer = args->nChannels; + } + + netAttr.op = BIT(args->collAPI); + // algo/proto are undefined for p2p + if (args->collAPI < NCCL_NUM_FUNCTIONS) { + netAttr.algo = BIT(args->algorithm); + netAttr.proto = BIT(args->protocol); + } + + if (memcmp(&proxyState->netAttr, &netAttr, sizeof(netAttr))) { + setNetAttrs(proxyState, &netAttr); + printNetAttrs(&netAttr, send ? "send" : "recv"); + } +} + // Forward declaration static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args); @@ -307,11 +402,12 @@ static ncclResult_t netDumpMap(struct connectMap* map) { struct netSendConnectArgs { ncclNetHandle_t handle; - int trafficClass; + ncclNetAttr_t netAttr; }; struct netRecvConnectArgs { int proxyRank; + ncclNetAttr_t netAttr; }; static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* connectInfo, int nranks, int rank, struct ncclConnector* send) { @@ -331,7 +427,9 @@ static ncclResult_t sendConnect(struct ncclComm* comm, struct ncclConnect* conne INFO(NCCL_PROXY, "sendConnect ncclProxyCallAsync opId=%p", opId); netSendConnectArgs args = {0}; memcpy(&args.handle, connectInfo, sizeof(ncclNetHandle_t)); - args.trafficClass = comm->config.trafficClass; + + populateCommNetAttrs(comm, send, &args.netAttr); + NCCLCHECK(ncclProxyCallAsync(comm, &send->proxyConn, ncclProxyMsgConnect, &args, sizeof(netSendConnectArgs), sizeof(struct connectMap), opId)); } else { opId = send; @@ -442,6 +540,9 @@ static ncclResult_t recvConnect(struct ncclComm* comm, struct ncclConnect* conne opId, &recv->proxyConn, connectInfo); netRecvConnectArgs args = {0}; args.proxyRank = *((int*)connectInfo); + + populateCommNetAttrs(comm, recv, &args.netAttr); + NCCLCHECK(ncclProxyCallAsync(comm, &recv->proxyConn, ncclProxyMsgConnect, &args, sizeof(netRecvConnectArgs), sizeof(struct connectMap), opId)); } else { opId = recv; @@ -677,7 +778,7 @@ static ncclResult_t recvProxySetup(struct ncclProxyConnection* connection, struc } if (respSize != sizeof(ncclNetHandle_t)) return ncclInternalError; - NCCLCHECK(proxyState->ncclNet->listen(req->netDev, respBuff, &resources->netListenComm)); + NCCLCHECK(proxyState->ncclNet->listen(proxyState->netContext, req->netDev, respBuff, &resources->netListenComm)); *done = 1; return ncclSuccess; @@ -707,11 +808,12 @@ static ncclResult_t ncclNetGetDeviceHandle(ncclNetDeviceType type, int version, static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { struct sendNetResources* resources = (struct sendNetResources*)(connection->transportResources); - ncclNetCommConfig_t commConfig = {0}; if (reqSize != sizeof(netSendConnectArgs)) return ncclInternalError; ncclResult_t ret = ncclSuccess; netSendConnectArgs* req = (netSendConnectArgs*) reqBuff; - commConfig.trafficClass = req->trafficClass == NCCL_CONFIG_UNDEF_INT ? 
NCCL_NET_TRAFFIC_CLASS_UNDEF : req->trafficClass; + + setNetAttrs(proxyState, &req->netAttr); + NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, false /*isRecv*/, &resources->netDeviceHandle)); if (resources->shared) { // Shared buffers @@ -736,25 +838,29 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str comms->activeConnect[resources->channelId] = (resources->tpLocalRank + 1); if (comms->sendComm[resources->channelId] == NULL && comms->activeConnect[resources->channelId] == (resources->tpLocalRank + 1)) { - ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, + ret = proxyState->ncclNet->connect(proxyState->netContext, resources->netDev, req->handle, comms->sendComm + resources->channelId, &resources->netDeviceHandle); } resources->netSendComm = comms->sendComm[resources->channelId]; if (comms->sendComm[resources->channelId]) comms->sendRefCount[resources->channelId]++; } else { - ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(proxyState->netContext, resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); } } else { // Connect to remote peer - ret = proxyState->ncclNet->connect(resources->netDev, &commConfig, req->handle, &resources->netSendComm, &resources->netDeviceHandle); + ret = proxyState->ncclNet->connect(proxyState->netContext, resources->netDev, req->handle, &resources->netSendComm, &resources->netDeviceHandle); connection->proxyAppendPtr = &connection->proxyAppend; } - NCCLCHECK(ret); + if (ret != ncclSuccess) { + if (resources->netSendComm) proxyState->ncclNet->closeSend(resources->netSendComm); + NCCLCHECK(ret); + } if (resources->netSendComm == NULL) { *done = 0; return ncclInProgress; } + printNetAttrs(&req->netAttr, "send connect"); *done = 1; if (resources->netDeviceHandle) { @@ -872,6 +978,8 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str resources->tpRemoteProxyRank = req->proxyRank; ncclResult_t ret = ncclSuccess; + setNetAttrs(proxyState, &req->netAttr); + NCCLCHECK(ncclNetGetDeviceHandle(resources->netDeviceType, resources->netDeviceVersion, true /*isRecv*/, &resources->netDeviceHandle)); // Finish connection establishment from remote peer if (resources->shared) { @@ -917,6 +1025,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str *done = 0; return ncclInProgress; } + printNetAttrs(&req->netAttr, "recv connect"); *done = 1; if (resources->netDeviceHandle) { @@ -1106,6 +1215,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct static_assert(NCCL_STEPS <= NCCL_NET_MAX_REQUESTS, "Not enough net requests to cover for steps"); static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { + int checkedNetAttr = 0; if (args->state == ncclProxyOpReady) { for (int s=0; snsubs; s++) { struct ncclProxySubArgs* sub = args->subs+s; @@ -1207,6 +1317,8 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct // since size is a plain integer. 
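// [Editorial sketch, not part of the patch] How the new netAttr hints flow fits together:
// populateCommNetAttrs() derives per-communicator bounds (min/max concurrent peers and flows per
// peer) at connect time, setXferNetAttrs() narrows them to the peers/channels of a single proxy
// operation, and setNetAttrs() forwards them to the plugin only when they differ from the cached
// proxyState->netAttr. A minimal stand-alone illustration of that "apply only on change" guard
// follows; the type and function names below are invented for the example.
#include <cstring>
struct XferHints { int minPeers, maxPeers, minFlows, maxFlows; };
struct HintCache {
  XferHints applied{};                        // last hints handed to the plugin
  void (*apply)(const XferHints*) = nullptr;  // optional plugin hook, akin to setNetAttr
};
static void maybeApplyHints(HintCache* cache, const XferHints* wanted) {
  if (cache->apply == nullptr) return;                                    // plugin takes no hints
  if (std::memcmp(&cache->applied, wanted, sizeof(*wanted)) == 0) return; // nothing changed
  cache->apply(wanted);                                                   // push the new hints
  cache->applied = *wanted;                                               // remember what was applied
}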
// coverity[use_invalid:FALSE] void* phandle = &sub->pHandles[DIVUP(transmittedStepId, args->sliceSteps)%NCCL_STEPS]; + if (!checkedNetAttr++) + setXferNetAttrs(proxyState, args, 1); NCCLCHECK(proxyState->ncclNet->isend(resources->netSendComm, buff, size, resources->tpRank, sub->sendMhandle, phandle, sub->requests+buffSlot)); if (sub->requests[buffSlot] != NULL) { TRACE(NCCL_NET, "sendProxy [%ld/%d/%d] Isend posted, req %p, buff %p, size %d, proto %d, myRank %d, channelId %d, mhandle %p", sub->transmitted, buffSlot, sub->nsteps, sub->requests[buffSlot], buff, size, p, proxyState->tpRank, sub->channelId, sub->sendMhandle); @@ -1258,6 +1370,7 @@ static ncclResult_t sendProxyProgress(struct ncclProxyState* proxyState, struct } static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct ncclProxyArgs* args) { + int checkedNetAttr = 0; if (args->state == ncclProxyOpReady) { // Initialize subs and group them by same recvComm. void* recvComm; @@ -1363,6 +1476,8 @@ static ncclResult_t recvProxyProgress(struct ncclProxyState* proxyState, struct struct recvNetResources* resources = (struct recvNetResources*) (subGroup->connection->transportResources); void** requestPtr = subGroup->requests+(step%NCCL_STEPS); bool ignoreCompletion = ncclParamNetOptionalRecvCompletion() && ((args->protocol == NCCL_PROTO_LL128) || (args->protocol == NCCL_PROTO_LL)) && (subCount == 1); + if (!checkedNetAttr++) + setXferNetAttrs(proxyState, args, 0); if (ignoreCompletion) *requestPtr = (void *)NCCL_NET_OPTIONAL_RECV_COMPLETION; NCCLCHECK(proxyState->ncclNet->irecv(resources->netRecvComm, subCount, ptrs, sizes, tags, mhandles, phandles, requestPtr)); if (*requestPtr) { @@ -1594,13 +1709,18 @@ ncclResult_t ncclNetLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; bool isValid = false; + void *base = NULL; + size_t baseSize = 0; *outRegBufFlag = 0; if (comm && userbuff && buffSize > 0 && nPeers > 0) { NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); - if (isValid) + if (isValid) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&base, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)base + baseSize < (uint64_t)userbuff + buffSize) goto exit; NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail); + } } exit: @@ -1627,13 +1747,14 @@ ncclResult_t ncclNetGraphRegisterBuffer(ncclComm* comm, const void* userbuff, si ncclResult_t ret = ncclSuccess; struct ncclNetCleanupCallback *record = NULL; struct ncclReg *regRecord = NULL; - void *baseSend; - size_t baseSendSize; + void *base = NULL; + size_t baseSize = 0; *outRegBufFlag = 0; if (comm && userbuff && buffSize > 0 && nPeers > 0) { - CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)userbuff), ret, fail); - NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)®Record), ret, fail); + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&base, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)base + baseSize < (uint64_t)userbuff + buffSize) goto exit; + NCCLCHECKGOTO(ncclCommGraphRegister(comm, base, baseSize, (void**)®Record), ret, fail); NCCLCHECKGOTO(netRegisterBuffer(comm, userbuff, buffSize, peerConns, nPeers, regRecord, outRegBufFlag, outHandle), ret, fail); if (*outRegBufFlag) { NCCLCHECKGOTO(ncclCalloc(&record, 1), ret, fail); diff --git 
a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 709e7ad40..3614dec61 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -21,6 +21,7 @@ #include #include #include +#include #define ENABLE_TIMER 0 #include "timer.h" @@ -32,6 +33,8 @@ static char ncclIbIfName[MAX_IF_NAME_SIZE+1]; static union ncclSocketAddress ncclIbIfAddr; +static ncclNetCommConfig_t ibContext; + struct ncclIbMr { uintptr_t addr; size_t pages; @@ -70,7 +73,7 @@ const char* ibProviderName[] = { static int ncclNIbDevs = -1; struct alignas(64) ncclIbDev { - pthread_mutex_t lock; + std::mutex mutex; int device; uint64_t guid; uint8_t portNum; @@ -102,9 +105,16 @@ struct alignas(64) ncclIbDev { #define MAX_IB_VDEVS MAX_IB_DEVS*8 struct ncclIbMergedDev ncclIbMergedDevs[MAX_IB_VDEVS]; struct ncclIbDev ncclIbDevs[MAX_IB_DEVS]; -pthread_mutex_t ncclIbLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex ncclIbMutex; static int ncclIbRelaxedOrderingEnabled = 0; +// With ncclNet_v11_t the NCCL core initializes the network plugin per-communicator +// rather than once for all communicators. However, the internal plugin implementation +// still assumes the plugin is initialized only once across all communicators. The ref +// counter makes sure the plugin internally initializes only once. When per communicator +// context support is added to the plugin the ref counter can be removed. +static int netRefCount; + #define NCCL_IB_LLSTR(ll) (((ll) == IBV_LINK_LAYER_INFINIBAND) ? "IB" : (((ll) == IBV_LINK_LAYER_ETHERNET) ? "RoCE" : "UNSPECIFIED")) #define NCCL_IB_SL_DEFAULT 0 @@ -184,6 +194,9 @@ static void* ncclIbAsyncThreadMain(void* args) { // SRQ are not used in NCCL WARN("NET/IB : %s:%d async fatal event on SRQ, unused for now (%p): %s", dev->devName, dev->portNum, srq, str); break; + case IBV_EVENT_GID_CHANGE: + WARN("NET/IB : %s:%d GID table changed", dev->devName, dev->portNum); + break; case IBV_EVENT_PATH_MIG_ERR: case IBV_EVENT_PORT_ERR: case IBV_EVENT_PATH_MIG: @@ -597,15 +610,22 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) { } ncclResult_t ncclIbMakeVDevice(int* d, ncclNetVDeviceProps_t* props) { - pthread_mutex_lock(&ncclIbLock); + std::lock_guard lock(ncclIbMutex); ncclResult_t res = ncclIbMakeVDeviceInternal(d, props); - pthread_mutex_unlock(&ncclIbLock); return res; + +} + +ncclResult_t ncclIbSetNetAttr(void *ctx, ncclNetAttr_t *netAttr) { + (void)ctx; + (void)netAttr; + return ncclSuccess; } static ncclProfilerCallback_t ncclProfilerFunction; -ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { +ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { + if (netRefCount++) return ncclSuccess; ncclResult_t ret = ncclSuccess; ncclProfilerFunction = profFunction; if (ncclParamIbDisable()) return ncclInternalError; @@ -614,7 +634,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr if(wrap_mlx5dv_symbols() != ncclSuccess) { INFO(NCCL_NET, "NET/IB : Failed to open mlx5dv symbols. 
Advance features like CX-8 Direct-NIC will be disabled."); } if (ncclNIbDevs == -1) { - pthread_mutex_lock(&ncclIbLock); + std::lock_guard lock(ncclIbMutex); wrap_ibv_fork_init(); if (ncclNIbDevs == -1) { int nIpIfs = 0; @@ -644,25 +664,15 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; } for (int d=0; dname); continue; } - enum ncclIbProvider ibProvider = IB_PROVIDER_NONE; - char dataDirectDevicePath[PATH_MAX]; - int dataDirectSupported = 0; - int skipNetDevForDataDirect = 0; - if (wrap_mlx5dv_is_supported(devices[d])) { - ibProvider = IB_PROVIDER_MLX5; - snprintf(dataDirectDevicePath, PATH_MAX, "/sys"); - if((ncclMlx5dvDmaBufCapable(context)) && (wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + 4, PATH_MAX - 4) == ncclSuccess)) { - INFO(NCCL_INIT|NCCL_NET, "NET/IB: Data Direct DMA Interface is detected for device:%s", devices[d]->name); - // Now check whether Data Direct has been disabled by the user - if(ncclParamIbDataDirect() == 1) { dataDirectSupported = 1; skipNetDevForDataDirect = 1; } - if(ncclParamIbDataDirect() == 2) { dataDirectSupported = 1; skipNetDevForDataDirect = 0; } - } - } + char dataDirectDevicePath[PATH_MAX] = "/sys"; + int devCount = /*undefined*/-1, devOffset = 0; + enum ncclIbProvider ibProvider = wrap_mlx5dv_is_supported(devices[d]) ? IB_PROVIDER_MLX5 : IB_PROVIDER_NONE; + int nPorts = 0; struct ibv_device_attr devAttr; memset(&devAttr, 0, sizeof(devAttr)); @@ -672,78 +682,99 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr continue; } for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) { - // dataDirect = 0 exposes the devices normally, dataDirect = 1 exposes the devices through direct NIC - for (int dataDirect = skipNetDevForDataDirect; dataDirect < 1 + dataDirectSupported; ++dataDirect) { struct ibv_port_attr portAttr; if (ncclSuccess != wrap_ibv_query_port(context, port_num, &portAttr)) { WARN("NET/IB : Unable to query port_num %d", port_num); continue; } if (portAttr.state != IBV_PORT_ACTIVE) continue; - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND - && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; // check against user specified HCAs/ports if (! 
(matchIfList(devices[d]->name, port_num, userIfs, nUserIfs, searchExact) ^ searchNot)) { continue; } - pthread_mutex_init(&ncclIbDevs[ncclNIbDevs].lock, NULL); - ncclIbDevs[ncclNIbDevs].device = d; - ncclIbDevs[ncclNIbDevs].ibProvider = ibProvider; - ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; - ncclIbDevs[ncclNIbDevs].portAttr = portAttr; - ncclIbDevs[ncclNIbDevs].portNum = port_num; - ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; - if (portAttr.active_speed_ex) - // A non-zero active_speed_ex indicates XDR rate (0x100) or higher - ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed_ex) * ncclIbWidth(portAttr.active_width); - else - ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); - ncclIbDevs[ncclNIbDevs].context = context; - ncclIbDevs[ncclNIbDevs].pdRefs = 0; - ncclIbDevs[ncclNIbDevs].pd = NULL; - if (!dataDirect) { - strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); - NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); - } else { - snprintf(ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE, "%s_dma", devices[d]->name); - NCCLCHECK(ncclCalloc(&ncclIbDevs[ncclNIbDevs].pciPath, PATH_MAX)); - strncpy(ncclIbDevs[ncclNIbDevs].pciPath, dataDirectDevicePath, PATH_MAX); - ncclIbDevs[ncclNIbDevs].capsProvider.mlx5.dataDirect = 1; + + // check for mlx5 data direct support only once for a each device + if (devCount == -1) { + devCount = 1; + devOffset = 0; + if (ncclParamIbDataDirect() > 0 && ibProvider == IB_PROVIDER_MLX5 && ncclMlx5dvDmaBufCapable(context)) { + int pathLen = strlen(dataDirectDevicePath); + ncclResult_t res = wrap_mlx5dv_get_data_direct_sysfs_path(context, dataDirectDevicePath + pathLen, sizeof(dataDirectDevicePath) - pathLen); + if (res == ncclSuccess) { + // data direct devices are exposed twice: with the C2C + PCIe link and with the data direct link + devCount = 2; + // by default only expose the data direct NIC (devOffset = 1), unless set to 2 by the user + devOffset = (ncclParamIbDataDirect() == 2) ? 
0 : 1; + INFO(NCCL_INIT | NCCL_NET, "NET/IB: Data Direct DMA Interface is detected for device %s", devices[d]->name); + } else if (res == ncclInvalidArgument) { + TRACE(NCCL_NET, "NET/IB: Device %s does not support Data Direct DMA.", devices[d]->name); + } else { + WARN("NET/IB: Error in mlx5dv_get_data_direct_sysfs_path with device %s", devices[d]->name); + return res; + } + } + } + for (int dev = devOffset; dev < devCount; ++dev) { + ncclIbDevs[ncclNIbDevs].device = d; + ncclIbDevs[ncclNIbDevs].ibProvider = ibProvider; + ncclIbDevs[ncclNIbDevs].guid = devAttr.sys_image_guid; + ncclIbDevs[ncclNIbDevs].portAttr = portAttr; + ncclIbDevs[ncclNIbDevs].portNum = port_num; + ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; + if (portAttr.active_speed_ex) { + // A non-zero active_speed_ex indicates XDR rate (0x100) or higher + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed_ex) * ncclIbWidth(portAttr.active_width); + } else { + ncclIbDevs[ncclNIbDevs].speed = ncclIbSpeed(portAttr.active_speed) * ncclIbWidth(portAttr.active_width); + } + ncclIbDevs[ncclNIbDevs].context = context; + ncclIbDevs[ncclNIbDevs].pdRefs = 0; + ncclIbDevs[ncclNIbDevs].pd = NULL; + if (dev == 0) { + strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); + NCCLCHECKGOTO(ncclIbGetPciPath(ncclIbDevs[ncclNIbDevs].devName, &ncclIbDevs[ncclNIbDevs].pciPath, &ncclIbDevs[ncclNIbDevs].realPort), ret, fail); + } else { + snprintf(ncclIbDevs[ncclNIbDevs].devName, MAXNAMESIZE, "%s_dma", devices[d]->name); + NCCLCHECK(ncclCalloc(&ncclIbDevs[ncclNIbDevs].pciPath, PATH_MAX)); + strncpy(ncclIbDevs[ncclNIbDevs].pciPath, dataDirectDevicePath, PATH_MAX); + ncclIbDevs[ncclNIbDevs].capsProvider.mlx5.dataDirect = 1; + } + ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; + ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; + ncclIbDevs[ncclNIbDevs].mrCache.population = 0; + ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; + NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats)); + + // Enable ADAPTIVE_ROUTING by default on IB networks + // But allow it to be overloaded by an env parameter + ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 
1 : 0; + if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); + + INFO(NCCL_NET, "NET/IB: [%d] %s:%s:%d/%s provider=%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, + ncclIbDevs[ncclNIbDevs].portNum, NCCL_IB_LLSTR(portAttr.link_layer), ibProviderName[ncclIbDevs[ncclNIbDevs].ibProvider], ncclIbDevs[ncclNIbDevs].speed, context, + ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); + + PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); + ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); + PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d + + // Add this plain physical device to the list of virtual devices + int vDev; + ncclNetVDeviceProps_t vProps = {0}; + vProps.ndevs = 1; + vProps.devs[0] = ncclNIbDevs; + NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps)); + + ncclNIbDevs++; + nPorts++; } - ncclIbDevs[ncclNIbDevs].maxQp = devAttr.max_qp; - ncclIbDevs[ncclNIbDevs].mrCache.capacity = 0; - ncclIbDevs[ncclNIbDevs].mrCache.population = 0; - ncclIbDevs[ncclNIbDevs].mrCache.slots = NULL; - NCCLCHECK(ncclIbStatsInit(&ncclIbDevs[ncclNIbDevs].stats)); - - // Enable ADAPTIVE_ROUTING by default on IB networks - // But allow it to be overloaded by an env parameter - ncclIbDevs[ncclNIbDevs].ar = (portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND) ? 1 : 0; - if (ncclParamIbAdaptiveRouting() != -2) ncclIbDevs[ncclNIbDevs].ar = ncclParamIbAdaptiveRouting(); - - INFO(NCCL_NET,"NET/IB: [%d] %s:%s:%d/%s provider=%s speed=%d context=%p pciPath=%s ar=%d", d, devices[d]->name, devices[d]->dev_name, ncclIbDevs[ncclNIbDevs].portNum, - NCCL_IB_LLSTR(portAttr.link_layer), ibProviderName[ncclIbDevs[ncclNIbDevs].ibProvider], ncclIbDevs[ncclNIbDevs].speed, context, ncclIbDevs[ncclNIbDevs].pciPath, ncclIbDevs[ncclNIbDevs].ar); - - PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail); - ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs); - PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d - - // Add this plain physical device to the list of virtual devices - int vDev; - ncclNetVDeviceProps_t vProps = {0}; - vProps.ndevs = 1; - vProps.devs[0] = ncclNIbDevs; - NCCLCHECK(ncclIbMakeVDeviceInternal(&vDev, &vProps)); - - ncclNIbDevs++; - nPorts++; - } } if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; } } - if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; }; + if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; } } if (ncclNIbDevs == 0) { INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found."); @@ -762,12 +793,12 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t pr INFO(NCCL_INIT|NCCL_NET, "NET/IB : Using%s %s; OOB %s:%s", line, ncclIbRelaxedOrderingEnabled ? 
"[RO]" : "", ncclIbIfName, ncclSocketToString(&ncclIbIfAddr, addrline)); - pthread_mutex_unlock(&ncclIbLock); } exit: + ibContext.trafficClass = config->trafficClass; + *ctx = &ibContext; return ret; fail: - pthread_mutex_unlock(&ncclIbLock); goto exit; } @@ -789,8 +820,8 @@ static void ibGdrSupportInitOnce() { KNL_MODULE_LOADED("/sys/module/nvidia_peermem/version"); } ncclResult_t ncclIbGdrSupport() { - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once, ibGdrSupportInitOnce); + static std::once_flag once; + std::call_once(once, ibGdrSupportInitOnce); if (!ncclIbGdrModuleLoaded) return ncclSystemError; return ncclSuccess; @@ -825,13 +856,10 @@ static void ibDmaBufSupportInitOnce(){ // ncclSuccess : DMA-BUF support is available // ncclSystemError : DMA-BUF is not supported by the kernel ncclResult_t ncclIbDmaBufSupport(int dev) { - struct oncewrap { - pthread_once_t once = PTHREAD_ONCE_INIT; - }; - static oncewrap onces[MAX_IB_DEVS]; + static std::once_flag onces[MAX_IB_DEVS]; // init the device only once ibDmaSupportInitDev = dev; - pthread_once(&onces[dev].once, ibDmaBufSupportInitOnce); + std::call_once(onces[dev], ibDmaBufSupportInitOnce); ncclIbMergedDev* mergedDev = ncclIbMergedDevs + ibDmaSupportInitDev; ncclIbDev* ibDev = ncclIbDevs + mergedDev->vProps.devs[0]; int dmaBufSupported = ibDev->dmaBufSupported; @@ -843,7 +871,7 @@ ncclResult_t ncclIbDmaBufSupport(int dev) { ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) { struct ncclIbDev* ibDev = ncclIbDevs + dev; - pthread_mutex_lock(&ibDev->lock); + std::lock_guard lock(ibDev->mutex); props->name = ibDev->devName; props->speed = ibDev->speed; props->pciPath = ibDev->pciPath; @@ -867,7 +895,8 @@ ncclResult_t ncclIbGetPhysProperties(int dev, ncclNetProperties_t* props) { props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; - pthread_mutex_unlock(&ibDev->lock); + props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } @@ -1132,18 +1161,13 @@ static void ncclIbAddEvent(struct ncclIbRequest* req, int devIndex, struct ncclI ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base, void* cq_context) { base->ibDevN = ibDevN; ncclIbDev* ibDev = ncclIbDevs + ibDevN; - pthread_mutex_lock(&ibDev->lock); - if (0 == ibDev->pdRefs++) { - ncclResult_t res; - NCCLCHECKGOTO(wrap_ibv_alloc_pd(&ibDev->pd, ibDev->context), res, failure); - if (0) { - failure: - pthread_mutex_unlock(&ibDev->lock); - return res; + { + std::lock_guard lock(ibDev->mutex); + if (0 == ibDev->pdRefs++) { + NCCLCHECK(wrap_ibv_alloc_pd(&ibDev->pd, ibDev->context)); } + base->pd = ibDev->pd; } - base->pd = ibDev->pd; - pthread_mutex_unlock(&ibDev->lock); // Recv requests can generate 2 completions (one for the post FIFO, one for the Recv). 
NCCLCHECK(wrap_ibv_create_cq(&base->cq, ibDev->context, 2*MAX_REQUESTS*ncclParamIbQpsPerConn(), cq_context, NULL, 0)); @@ -1152,17 +1176,13 @@ ncclResult_t ncclIbInitCommDevBase(int ibDevN, struct ncclIbNetCommDevBase* base } ncclResult_t ncclIbDestroyBase(struct ncclIbNetCommDevBase* base) { - ncclResult_t res; NCCLCHECK(wrap_ibv_destroy_cq(base->cq)); - pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); + std::lock_guard lock(ncclIbDevs[base->ibDevN].mutex); if (0 == --ncclIbDevs[base->ibDevN].pdRefs) { - NCCLCHECKGOTO(wrap_ibv_dealloc_pd(ncclIbDevs[base->ibDevN].pd), res, returning); + NCCLCHECK(wrap_ibv_dealloc_pd(ncclIbDevs[base->ibDevN].pd)); } - res = ncclSuccess; -returning: - pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); - return res; + return ncclSuccess; } ncclResult_t ncclIbCreateQp(uint8_t ib_port, struct ncclIbNetCommDevBase* base, int access_flags, void* qp_context, struct ncclIbQp* qp) { @@ -1250,7 +1270,7 @@ ncclResult_t ncclIbRtsQp(struct ibv_qp* qp) { return ncclSuccess; } -ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { +ncclResult_t ncclIbListen(void* ctx, int dev, void* opaqueHandle, void** listenComm) { ncclResult_t ret = ncclSuccess; struct ncclIbListenComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); @@ -1271,7 +1291,7 @@ ncclResult_t ncclIbListen(int dev, void* opaqueHandle, void** listenComm) { goto exit; } -ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { ncclResult_t ret = ncclSuccess; struct ncclIbHandle* handle = (struct ncclIbHandle*) opaqueHandle; struct ncclIbCommStage* stage = &handle->stage; @@ -1333,6 +1353,7 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan if (stage->offset != sizeof(ncclNetVDeviceProps_t)) return ncclSuccess; stage->offset = 0; ncclNetVDeviceProps_t remoteVProps; + ncclNetCommConfig_t* config; memcpy(&remoteVProps, stage->buffer, sizeof(ncclNetVDeviceProps_t)); mergedDev = ncclIbMergedDevs + dev; comm->base.vProps = mergedDev->vProps; @@ -1425,6 +1446,7 @@ ncclResult_t ncclIbConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHan return ncclInternalError; } } + config = (ncclNetCommConfig_t*)ctx; meta.fifoAddr = (uint64_t)comm->fifo; meta.sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_SL_DEFAULT; meta.tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : (config && config->trafficClass != NCCL_NET_TRAFFIC_CLASS_UNDEF) ? config->trafficClass : NCCL_IB_TC_DEFAULT; @@ -1856,13 +1878,12 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache; uintptr_t addr = (uintptr_t)data & -pageSize; size_t pages = ((uintptr_t)data + size - addr + pageSize-1)/pageSize; - ncclResult_t res; - pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); + std::lock_guard lock(ncclIbDevs[base->ibDevN].mutex); for (int slot=0; /*true*/; slot++) { if (slot == cache->population || addr < cache->slots[slot].addr) { // didn't find in cache if (cache->population == cache->capacity) { // must grow cache cache->capacity = cache->capacity < 32 ? 
32 : 2*cache->capacity; - NCCLCHECKGOTO(ncclRealloc(&cache->slots, cache->population, cache->capacity), res, returning); + NCCLCHECK(ncclRealloc(&cache->slots, cache->population, cache->capacity)); } // Deregister / register struct ibv_mr* mr; @@ -1871,17 +1892,17 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s if (fd != -1) { /* DMA-BUF support */ if (!ncclIbDevs[base->ibDevN].capsProvider.mlx5.dataDirect) { - NCCLCHECKGOTO(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags), res, returning); + NCCLCHECK(wrap_ibv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags)); } else { - NCCLCHECKGOTO(wrap_mlx5dv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT), res, returning); + NCCLCHECK(wrap_mlx5dv_reg_dmabuf_mr(&mr, base->pd, offset, pages*pageSize, addr, fd, flags, MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT)); } } else { if (ncclIbRelaxedOrderingEnabled) { // Use IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support - NCCLCHECKGOTO(wrap_ibv_reg_mr_iova2(&mr, base->pd, (void*)addr, pages*pageSize, addr, flags), res, returning); + NCCLCHECK(wrap_ibv_reg_mr_iova2(&mr, base->pd, (void*)addr, pages*pageSize, addr, flags)); } else { - NCCLCHECKGOTO(wrap_ibv_reg_mr(&mr, base->pd, (void*)addr, pages*pageSize, flags), res, returning); + NCCLCHECK(wrap_ibv_reg_mr(&mr, base->pd, (void*)addr, pages*pageSize, flags)); } } TRACE(NCCL_INIT|NCCL_NET,"regAddr=0x%lx size=%lld rkey=0x%x lkey=0x%x fd=%d", (unsigned long)addr, (long long)pages*pageSize, mr->rkey, mr->lkey, fd); @@ -1892,19 +1913,15 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s cache->slots[slot].mr = mr; cache->population += 1; *mhandle = mr; - res = ncclSuccess; - goto returning; + return ncclSuccess; } else if ((addr >= cache->slots[slot].addr) && ((addr-cache->slots[slot].addr)/pageSize+pages) <= cache->slots[slot].pages) { cache->slots[slot].refs += 1; *mhandle = cache->slots[slot].mr; - res = ncclSuccess; - goto returning; + return ncclSuccess; } } -returning: - pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); - return res; + return ncclSuccess; } struct ncclIbNetCommDevBase* ncclIbGetNetCommDevBase(ncclIbNetCommBase* base, int devIndex) { @@ -1942,8 +1959,7 @@ ncclResult_t ncclIbRegMr(void* comm, void* data, size_t size, int type, void** m ncclResult_t ncclIbDeregMrInternal(ncclIbNetCommDevBase* base, ibv_mr* mhandle) { struct ncclIbMrCache* cache = &ncclIbDevs[base->ibDevN].mrCache; - ncclResult_t res; - pthread_mutex_lock(&ncclIbDevs[base->ibDevN].lock); + std::lock_guard lock(ncclIbDevs[base->ibDevN].mutex); for (int i=0; i < cache->population; i++) { if (mhandle == cache->slots[i].mr) { if (0 == --cache->slots[i].refs) { @@ -1953,17 +1969,13 @@ ncclResult_t ncclIbDeregMrInternal(ncclIbNetCommDevBase* base, ibv_mr* mhandle) cache->slots = NULL; cache->capacity = 0; } - NCCLCHECKGOTO(wrap_ibv_dereg_mr(mhandle), res, returning); + NCCLCHECK(wrap_ibv_dereg_mr(mhandle)); } - res = ncclSuccess; - goto returning; + return ncclSuccess; } } WARN("NET/IB: could not find mr %p inside cache of %d entries", mhandle, cache->population); - res = ncclInternalError; -returning: - pthread_mutex_unlock(&ncclIbDevs[base->ibDevN].lock); - return res; + return ncclInternalError; } ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { @@ -2567,6 +2579,11 @@ ncclResult_t ncclIbCloseListen(void* listenComm) { return ncclSuccess; } +ncclResult_t ncclIbFinalize(void* ctx) { + 
netRefCount--; + return ncclSuccess; +} + ncclNet_t ncclNetIb = { "IB", ncclIbInit, @@ -2587,7 +2604,9 @@ ncclNet_t ncclNetIb = { ncclIbCloseListen, NULL /* getDeviceMr */, NULL /* irecvConsumed */, - ncclIbMakeVDevice + ncclIbMakeVDevice, + ncclIbFinalize, + ncclIbSetNetAttr, }; /* diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 985810c47..fa331aae2 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -16,6 +16,8 @@ #include #include #include +#include <mutex> +#include <condition_variable> /* Init functions */ static int ncclNetIfs = -1; @@ -26,7 +28,7 @@ struct ncclNetSocketDev { }; static struct ncclNetSocketDev ncclNetSocketDevs[MAX_IFS]; -pthread_mutex_t ncclNetSocketLock = PTHREAD_MUTEX_INITIALIZER; +static std::mutex ncclNetSocketMutex; static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) { char devicePath[PATH_MAX]; @@ -38,17 +40,24 @@ static ncclResult_t ncclNetSocketGetPciPath(char* devName, char** pciPath) { static ncclProfilerCallback_t ncclProfilerFunction; -ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { +// With ncclNet_v11_t the NCCL core initializes the network plugin per-communicator +// rather than once for all communicators. However, the internal plugin implementation +// still assumes the plugin is initialized only once across all communicators. The ref +// counter makes sure the plugin internally initializes only once. When per communicator +// context support is added to the plugin the ref counter can be removed. +static int netRefCount; + +ncclResult_t ncclNetSocketInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) { + if (netRefCount++) return ncclSuccess; ncclProfilerFunction = profFunction; if (ncclNetIfs == -1) { - pthread_mutex_lock(&ncclNetSocketLock); + std::lock_guard lock(ncclNetSocketMutex); if (ncclNetIfs == -1) { char names[MAX_IF_NAME_SIZE*MAX_IFS]; union ncclSocketAddress addrs[MAX_IFS]; NCCLCHECK(ncclFindInterfaces(names, addrs, MAX_IF_NAME_SIZE, MAX_IFS, &ncclNetIfs)); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); - pthread_mutex_unlock(&ncclNetSocketLock); return ncclInternalError; } else { #define MAX_LINE_LEN (2047) @@ -67,7 +76,6 @@ ncclResult_t ncclNetSocketInit(ncclDebugLogger_t logFunction, ncclProfilerCallba INFO(NCCL_INIT|NCCL_NET,"NET/Socket : Using%s", line); } } - pthread_mutex_unlock(&ncclNetSocketLock); } return ncclSuccess; } @@ -116,6 +124,8 @@ ncclResult_t ncclNetSocketGetProperties(int dev, ncclNetProperties_t* props) { props->netDeviceType = NCCL_NET_DEVICE_HOST; props->netDeviceVersion = NCCL_NET_DEVICE_INVALID_VERSION; props->maxP2pBytes = NCCL_MAX_NET_SIZE_BYTES; + props->maxCollBytes = MAX_COLLNET_SIZE; + props->maxMultiRequestSize = 1; return ncclSuccess; } @@ -193,8 +203,8 @@ struct ncclNetSocketThreadResources { int stop; struct ncclNetSocketComm* comm; struct ncclProfilerInfo* pInfo; - pthread_mutex_t threadLock; - pthread_cond_t threadCond; + std::mutex threadMutex; + std::condition_variable threadCond; }; struct ncclNetSocketListenComm { @@ -269,11 +279,8 @@ void* persistentSocketThread(void *args_) { } while (repeat); } if (idle) { - pthread_mutex_lock(&resource->threadLock); - while (mark == myQueue->next && resource->stop == 0) { // no new tasks, wait - pthread_cond_wait(&resource->threadCond, &resource->threadLock); - } - pthread_mutex_unlock(&resource->threadLock); + std::unique_lock lock(resource->threadMutex); +
resource->threadCond.wait(lock, [&] { return mark != myQueue->next || resource->stop; }); } if (resource->stop) return NULL; } @@ -335,7 +342,7 @@ ncclResult_t ncclNetSocketGetNsockNthread(int dev, int* ns, int* nt) { goto exit; } -ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) { +ncclResult_t ncclNetSocketListen(void* ctx, int dev, void* opaqueHandle, void** listenComm) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev WARN("NET/Socket : ncclNetSocketListen dev=%d ncclNetIfs=%d", dev, ncclNetIfs); return ncclInternalError; @@ -364,7 +371,7 @@ ncclResult_t ncclNetSocketListen(int dev, void* opaqueHandle, void** listenComm) } #define SOCKET_CTRL_SIZE (sizeof(int)) -ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { +ncclResult_t ncclNetSocketConnect(void* ctx, int dev, void* opaqueHandle, void** sendComm, ncclNetDeviceHandle_t** /*sendDevComm*/) { if (dev < 0 || dev >= ncclNetIfs) { // data transfer socket is based on specified dev return ncclInternalError; } @@ -380,7 +387,7 @@ ncclResult_t ncclNetSocketConnect(int dev, ncclNetCommConfig_t* config, void* op if (stage->state == ncclNetSocketCommStateConnect) goto socket_connect_check; if (stage->state == ncclNetSocketCommStateSend) goto socket_send; - NCCLCHECK(ncclCalloc(&comm, 1)); + comm = new ncclNetSocketComm(); stage->comm = comm; comm->nSocks = handle->nSocks; comm->nThreads = handle->nThreads; @@ -422,7 +429,7 @@ ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDevic if (stage->state == ncclNetSocketCommStateAccept) goto socket_accept_check; if (stage->state == ncclNetSocketCommStateRecv) goto socket_recv; - NCCLCHECK(ncclCalloc(&rComm, 1)); + rComm = new ncclNetSocketComm(); stage->comm = rComm; rComm->nSocks = lComm->nSocks; rComm->nThreads = lComm->nThreads; @@ -449,9 +456,9 @@ ncclResult_t ncclNetSocketAccept(void* listenComm, void** recvComm, ncclNetDevic if (done == 0) return ncclSuccess; if (sendSockIdx == rComm->nSocks) - memcpy(&rComm->ctrlSock, sock, sizeof(struct ncclSocket)); + rComm->ctrlSock = *sock; else - memcpy(rComm->socks+sendSockIdx, sock, sizeof(struct ncclSocket)); + rComm->socks[sendSockIdx] = *sock; free(sock); } NCCLCHECK(ncclCalloc(&rComm->inlineData, MAX_REQUESTS * (SOCKET_CTRL_SIZE + ncclParamSocketInlineSize()))); @@ -501,8 +508,6 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclPro #ifdef NCCL_ENABLE_NET_PROFILING res->pInfo = pInfo; #endif - pthread_mutex_init(&res->threadLock, NULL); - pthread_cond_init(&res->threadCond, NULL); PTHREADCHECK(pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res), "pthread_create"); ncclSetThreadName(comm->helperThread[tid], "NCCL Sock%c%1u%2u%2u", op == NCCL_SOCKET_SEND ? 
'S' : 'R', comm->dev, tid, comm->cudaDev); } @@ -517,10 +522,9 @@ ncclResult_t ncclNetSocketGetTask(struct ncclNetSocketComm* comm, struct ncclPro comm->nextSock = (comm->nextSock + 1) % comm->nSocks; r->used = 1; *req = r; - pthread_mutex_lock(&res->threadLock); + std::lock_guard lock(res->threadMutex); queue->next = (queue->next+1)%queue->len; - pthread_cond_signal(&res->threadCond); - pthread_mutex_unlock(&res->threadLock); + res->threadCond.notify_one(); return ncclSuccess; } WARN("NET/Socket : unable to allocate subtasks"); @@ -686,10 +690,11 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) { for (int i=0; inThreads; i++) { struct ncclNetSocketThreadResources* res = comm->threadResources+i; if (comm->helperThread[i]) { - pthread_mutex_lock(&res->threadLock); - res->stop = 1; - pthread_cond_signal(&res->threadCond); - pthread_mutex_unlock(&res->threadLock); + { + std::lock_guard lock(res->threadMutex); + res->stop = 1; + res->threadCond.notify_one(); + } PTHREADCHECK(pthread_join(comm->helperThread[i], NULL), "pthread_join"); } free(res->threadTaskQueue.tasks); @@ -702,11 +707,16 @@ ncclResult_t ncclNetSocketClose(void* opaqueComm) { if (ready) NCCLCHECK(ncclSocketClose(&comm->socks[i])); } if(comm->inlineData) free(comm->inlineData); - free(comm); + delete comm; } return ncclSuccess; } +ncclResult_t ncclNetSocketFinalize(void* ctx) { + netRefCount--; + return ncclSuccess; +} + ncclNet_t ncclNetSocket = { "Socket", ncclNetSocketInit, @@ -727,5 +737,7 @@ ncclNet_t ncclNetSocket = { ncclNetSocketCloseListen, NULL /* getDeviceMr */, NULL /* irecvConsumed */, - NULL /* mergeDevices */ + NULL /* mergeDevices */, + ncclNetSocketFinalize, + NULL /* setNetAttr */, }; diff --git a/src/transport/nvls.cc b/src/transport/nvls.cc index da8d263f1..1f13bb01b 100644 --- a/src/transport/nvls.cc +++ b/src/transport/nvls.cc @@ -48,7 +48,7 @@ struct ncclTransport nvlsTransport = { { NULL, NULL, nvlsRecvFree, NULL, NULL, NULL, NULL, NULL } }; -ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { +ncclResult_t ncclNvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, int rank, unsigned int nranks, CUmemGenericAllocationHandle *mcHandle, char *shareableHandle) { CUmemAllocationHandleType type = ncclCuMemHandleType; size_t size = prop->size; @@ -70,7 +70,7 @@ ncclResult_t nvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *prop, return ncclSuccess; } -ncclResult_t nvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) { +ncclResult_t ncclNvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle) { CUmemAllocationHandleType type = ncclCuMemHandleType; int fd = -1; ncclResult_t ret = ncclSuccess; @@ -205,7 +205,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) { ncclResult_t ncclNvlsTreeConnect(struct ncclComm* comm) { ncclResult_t ret = ncclSuccess; if (comm && comm->nvlsSupport && comm->nNodes > 1) { - for (int c = 0; c < comm->nChannels; c++) { + for (int c = 0; c < comm->nvlsChannels; c++) { struct ncclChannel* channel = comm->channels + c; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 1, &channel->nvls.treeUp, 0), ret, fail); NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->nvls.treeUp, NCCL_MAX_NVLS_TREE_ARITY, channel->nvls.treeDown, 0), ret, fail); @@ -242,12 +242,12 @@ static 
ncclResult_t nvlsAllocateMem(struct ncclComm* comm, const CUmemAccessDesc mcprop.size = mcsize; if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, mcHandle, shareableHandle), ret, fail); allocMcHandle = 1; NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], mcHandle), ret, fail); allocMcHandle = 1; } @@ -330,7 +330,7 @@ ncclResult_t ncclNvlsBufferSetup(struct ncclComm* comm) { nHeads = comm->channels[0].nvls.nHeads; headRank = comm->channels[0].nvls.headRank; resources = comm->nvlsResources; - nChannels = comm->nvlsResources->nChannels; + nChannels = comm->nvlsChannels; nvlsStepSize = comm->nvlsChunkSize; buffSize = nvlsStepSize * NCCL_STEPS; nvlsPerRankSize = nChannels * 2 * buffSize; @@ -391,7 +391,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { if (nvlsShare) { /* reuse NVLS resources */ comm->nvlsChannels = std::min(comm->nvlsChannels, parent->nvlsResources->nChannels); - for (int c = 0; c < comm->nChannels; c++) { + for (int c = 0; c < comm->nvlsChannels; c++) { NCCLCHECKGOTO(initNvlsChannel(comm, c, parent, true), res, fail); } @@ -400,7 +400,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { } else { struct ncclNvlsSharedRes* resources = NULL; int nHeads = comm->channels[0].nvls.nHeads; - int nChannels = comm->nChannels; + int nChannels = comm->nvlsChannels; size_t memSize = 64; size_t creditSize = nChannels * 2 * memSize * nHeads; int nvlsStepSize = comm->nvlsChunkSize; @@ -420,7 +420,7 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { } comm->nvlsResources->nChannels = comm->nvlsChannels; - for (int c = 0; c < comm->nChannels; c++) { + for (int c = 0; c < nChannels; c++) { NCCLCHECKGOTO(initNvlsChannel(comm, c, NULL, false), res, fail); } @@ -486,21 +486,21 @@ ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent) { // MNNVL does not support NVLS buffer registration if (!comm->MNNVL && comm->nvlsResources->nvlsShmemHandle == NULL) { /* create shared memory for fast NVLS buffer registration */ - typeSize = sizeof(struct localRegData) << 1; + typeSize = DIVUP(sizeof(struct localRegData) << 1, CACHE_LINE_SIZE) * CACHE_LINE_SIZE; if (comm->localRank == 0) { shmPath[0] = '\0'; - NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (CACHE_LINE_SIZE * comm->localRanks + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, comm->localRanks - 1, &comm->nvlsResources->nvlsShmemHandle), res, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); } else { 
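// [Editorial sketch, not part of the patch] The typeSize change above rounds each per-rank
// registration slot up to a CACHE_LINE_SIZE multiple, and the shared counter area now reserves
// CACHE_LINE_SIZE * localRanks bytes instead of a single size_t, presumably so concurrent ranks
// do not false-share a cache line. DIVUP(x, a) * a is the usual round-up-to-a-multiple idiom;
// a tiny stand-alone equivalent (names invented for the example):
#include <cstddef>
constexpr std::size_t roundUpTo(std::size_t x, std::size_t align) {
  return ((x + align - 1) / align) * align;  // e.g. roundUpTo(100, 128) == 128
}
static_assert(roundUpTo(100, 128) == 128, "pads a partial cache line up to a full one");
static_assert(roundUpTo(256, 128) == 256, "already-aligned sizes are unchanged");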
NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shmPath, sizeof(shmPath)), res, fail); - NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (sizeof(size_t) + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); + NCCLCHECKGOTO(ncclShmOpen(shmPath, sizeof(shmPath), (CACHE_LINE_SIZE * comm->localRanks + typeSize * comm->localRanks) * 2, (void**)&nvlsShmem, NULL, -1, &comm->nvlsResources->nvlsShmemHandle), res, fail); } /* need 2 pools and a shared counter for shmem-based collectives */ comm->nvlsResources->nvlsShmem.cnt[0] = (size_t*)nvlsShmem; - comm->nvlsResources->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[0] + sizeof(size_t)); + comm->nvlsResources->nvlsShmem.ptr[0] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[0] + CACHE_LINE_SIZE * comm->localRanks); comm->nvlsResources->nvlsShmem.cnt[1] = (size_t*)((char*)comm->nvlsResources->nvlsShmem.ptr[0] + typeSize * comm->localRanks); - comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + sizeof(size_t)); + comm->nvlsResources->nvlsShmem.ptr[1] = (void*)((char*)comm->nvlsResources->nvlsShmem.cnt[1] + CACHE_LINE_SIZE * comm->localRanks); comm->nvlsResources->nvlsShmem.round = 0; comm->nvlsResources->nvlsShmem.maxTypeSize = typeSize; } @@ -607,11 +607,11 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t mcprop.size = mcsize; if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); } else { NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); + NCCLCHECKGOTO(ncclNvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); } CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail); @@ -751,17 +751,36 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send struct ncclReg *recvRegRecord = NULL; bool sendIsValid = false; bool recvIsValid = false; + void *baseSend = NULL; + void *baseRecv = NULL; + size_t baseSendSize = 0; + size_t baseRecvSize = 0; *outRegBufUsed = 0; if (sendbuff) { NCCLCHECK(ncclRegFind(comm, sendbuff, sendbuffSize, &sendRegRecord)); NCCLCHECK(ncclRegLocalIsValid(sendRegRecord, &sendIsValid)); + if (sendIsValid) { + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff)); + if ((uint64_t)baseSend + baseSendSize < (uint64_t)sendbuff + sendbuffSize) { + // the virtual address is backed by multiple physical memory regions, just fall back to non-UB path + goto exit; + } + } } else { sendIsValid = true; } + if (recvbuff) { NCCLCHECK(ncclRegFind(comm, recvbuff, recvbuffSize, &recvRegRecord)); NCCLCHECK(ncclRegLocalIsValid(recvRegRecord, &recvIsValid)); + if (recvIsValid) { + CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff)); + if 
((uint64_t)baseRecv + baseRecvSize < (uint64_t)recvbuff + recvbuffSize) { + // the virtual address is backed by multiple physical memory regions, just fall back to non-UB path + goto exit; + } + } } else { recvIsValid = true; } @@ -769,6 +788,7 @@ ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *send if (sendIsValid && recvIsValid) NCCLCHECK(nvlsRegisterBuffer(comm, sendbuff, recvbuff, sendbuffSize, recvbuffSize, sendRegRecord, recvRegRecord, outRegBufUsed, outRegBufSend, outRegBufRecv)); +exit: return ncclSuccess; } @@ -802,11 +822,19 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( *outRegBufUsed = 0; if (sendbuff) { CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseSend, &baseSendSize, (CUdeviceptr)sendbuff)); + if ((uint64_t)baseSend + baseSendSize < (uint64_t)sendbuff + sendbuffSize) { + // the virtual address is backed by multiple physical memory regions, just fall back to non-UB path + goto exit; + } NCCLCHECK(ncclCommGraphRegister(comm, baseSend, baseSendSize, (void**)&sendRegRecord)); } if (recvbuff) { CUCHECK(cuMemGetAddressRange((CUdeviceptr *)&baseRecv, &baseRecvSize, (CUdeviceptr)recvbuff)); + if ((uint64_t)baseRecv + baseRecvSize < (uint64_t)recvbuff + recvbuffSize) { + // the virtual address is backed by multiple physical memory regions, just fall back to non-UB path + goto exit; + } NCCLCHECK(ncclCommGraphRegister(comm, baseRecv, baseRecvSize, (void**)&recvRegRecord)); } @@ -835,82 +863,8 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( if (recvbuff) NCCLCHECK(ncclCommGraphDeregister(comm, recvRegRecord)); } - return ncclSuccess; -} - -ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm) { - ncclResult_t ret = ncclSuccess; - if (comm && comm->nvlsSupport) { - CUmulticastObjectProp mcprop = {}; - CUmemGenericAllocationHandle mcHandle; - char shareableHandle[NVLS_HANDLE_SIZE]; - CUmemAccessDesc accessDesc = {}; - - mcprop.numDevices = comm->localRanks; - mcprop.handleTypes = ncclCuMemHandleType; - mcprop.flags = 0; - mcprop.size = comm->baseStride; - - if (comm->localRank == 0) { - NCCLCHECKGOTO(nvlsGroupCreate(comm, &mcprop, comm->localRank, comm->localRanks, &mcHandle, shareableHandle), ret, fail); - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - } else { - NCCLCHECKGOTO(bootstrapIntraNodeBroadcast(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, 0, shareableHandle, NVLS_HANDLE_SIZE), ret, fail); - NCCLCHECKGOTO(nvlsGroupConnect(comm, shareableHandle, comm->localRankToRank[0], &mcHandle), ret, fail); - } - - CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->cudaDev), ret, fail); - CUCHECKGOTO(cuMemAddressReserve((CUdeviceptr*)&comm->baseMCSymPtr, comm->baseStride, NCCL_MAX_PAGE_SIZE, 0, 0), ret, fail); - CUCHECKGOTO(cuMemMap((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride, 0, mcHandle, 0), ret, fail); - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = comm->cudaDev; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECKGOTO(cuMemSetAccess((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride, &accessDesc, 1), ret, fail); - comm->symMCHandle = mcHandle; - } exit: - return ret; -fail: - goto exit; -} - -ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm) { - ncclResult_t ret = ncclSuccess; - if (comm && comm->nvlsSupport && comm->baseMCSymPtr) { - CUCHECKGOTO(cuMemUnmap((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride), ret, fail); - 
CUCHECKGOTO(cuMemAddressFree((CUdeviceptr)comm->baseMCSymPtr, comm->baseStride), ret, fail); - CUCHECKGOTO(cuMemRelease(comm->symMCHandle), ret, fail); - } -exit: - return ret; -fail: - goto exit; -} - -ncclResult_t ncclNvlsSymmetricMap(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr) { - ncclResult_t ret = ncclSuccess; - assert((uintptr_t)ucaddr % NCCL_REC_PAGE_SIZE == 0 && ucsize % NCCL_REC_PAGE_SIZE == 0); - if (comm && comm->nvlsSupport && ucaddr && ucsize > 0) { - CUCHECKGOTO(cuMulticastBindAddr(comm->symMCHandle, offset, (CUdeviceptr)ucaddr, ucsize, 0), ret, fail); - INFO(NCCL_ALLOC, "NVLS symmetric alloc mc buffer ptr %p offset %ld UC addr %p UC size %ld symAllocHead %ld", comm->baseMCSymPtr + offset, offset, ucaddr, ucsize, comm->symAllocHead); - } - -exit: - return ret; -fail: - goto exit; -} - -ncclResult_t ncclNvlsSymmetricFree(struct ncclComm* comm, size_t ucsize, void* ucaddr) { - ncclResult_t ret = ncclSuccess; - if (comm && comm->nvlsSupport && ucaddr && ucsize > 0) { - size_t offset = (size_t)ucaddr - ((size_t)comm->baseUCSymPtr + comm->localRank * comm->baseStride); - CUCHECKGOTO(cuMulticastUnbind(comm->symMCHandle, comm->cudaDev, offset, ucsize), ret, fail); - } -exit: - return ret; -fail: - goto exit; + return ncclSuccess; } ncclResult_t ncclNvlsRegResourcesQuery(struct ncclComm* comm, struct ncclTaskColl* info, int* recChannels) { diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index d263dda3a..d9fd01da0 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -955,6 +955,8 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si ncclResult_t ret = ncclSuccess; struct ncclReg *regRecord = NULL; bool isValid = false; + void *baseAddr = NULL; + size_t baseSize = 0; *regBufFlag = 0; *offsetOut = 0; @@ -962,8 +964,11 @@ ncclResult_t ncclIpcLocalRegisterBuffer(ncclComm* comm, const void* userbuff, si if (comm && userbuff && buffSize > 0 && nPeers > 0) { NCCLCHECKGOTO(ncclRegFind(comm, userbuff, buffSize, ®Record), ret, fail); NCCLCHECKGOTO(ncclRegLocalIsValid(regRecord, &isValid), ret, fail); - if (isValid) + if (isValid) { + CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr *)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)baseAddr + baseSize < (uint64_t)userbuff + buffSize) goto exit; NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, NULL), ret, fail); + } } exit: @@ -1001,6 +1006,7 @@ ncclResult_t ncclIpcGraphRegisterBuffer(ncclComm* comm, const void* userbuff, si *peerRmtAddrsOut = NULL; if (comm && userbuff && buffSize > 0 && nPeers > 0) { CUCHECKGOTO(cuMemGetAddressRange((CUdeviceptr*)&baseAddr, &baseSize, (CUdeviceptr)userbuff), ret, fail); + if ((uint64_t)baseAddr + baseSize < (uint64_t)userbuff + buffSize) goto exit; NCCLCHECKGOTO(ncclCommGraphRegister(comm, baseAddr, baseSize, (void**)®Record), ret, fail); NCCLCHECKGOTO(ipcRegisterBuffer(comm, userbuff, buffSize, peerRanks, nPeers, type, regRecord, regBufFlag, offsetOut, peerRmtAddrsOut, &isLegacyIpc), ret, fail); if (*regBufFlag) { @@ -1118,88 +1124,6 @@ static ncclResult_t p2pProxyDeregister(struct ncclProxyConnection* connection, s goto exit; } -ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm) { - CUCHECK(cuMemAddressReserve((CUdeviceptr*)&comm->baseUCSymPtr, comm->baseStride * comm->localRanks, NCCL_MAX_PAGE_SIZE, 0, 0)); - return ncclSuccess; -} - -ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm) { - if (comm->baseUCSymPtr) { 
- CUCHECK(cuMemAddressFree((CUdeviceptr)comm->baseUCSymPtr, comm->baseStride * comm->localRanks)); - } - return ncclSuccess; -} - -ncclResult_t ncclIpcSymmetricMap(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr) { - ncclResult_t ret = ncclSuccess; - CUmemGenericAllocationHandle impHandle; - int impFd = -1; - ncclCuDesc* desc = NULL; - CUmemAccessDesc accessDesc = {}; - - assert(offset % NCCL_REC_PAGE_SIZE == 0 && size % NCCL_REC_PAGE_SIZE == 0); - NCCLCHECKGOTO(ncclCalloc(&desc, comm->localRanks), ret, fail); - if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - memcpy(&desc[comm->localRank].data, &memHandle, sizeof(CUmemGenericAllocationHandle)); - } else { - CUCHECKGOTO(cuMemExportToShareableHandle(&desc[comm->localRank].handle, memHandle, ncclCuMemHandleType, 0), ret, fail); - } - - NCCLCHECKGOTO(bootstrapIntraNodeAllGather(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, desc, sizeof(ncclCuDesc)), ret, fail); - - // start mapping - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = comm->cudaDev; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - for (int r = 0; r < comm->localRanks; ++r) { - CUdeviceptr maddr; - if (r == comm->localRank) { - impHandle = memHandle; - } else { - if (ncclCuMemHandleType == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR) { - impFd = -1; - NCCLCHECKGOTO(ncclProxyClientGetFdBlocking(comm, comm->localRankToRank[r], &desc[r].data, &impFd), ret, fail); - CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)(uintptr_t)impFd, ncclCuMemHandleType), ret, fail); - SYSCHECKGOTO(close(impFd), "close", ret, fail); - } else { - CUCHECKGOTO(cuMemImportFromShareableHandle(&impHandle, (void*)&desc[r].handle, ncclCuMemHandleType), ret, fail); - } - } - maddr = (CUdeviceptr)(comm->baseUCSymPtr + (size_t)r * comm->baseStride + offset); - CUCHECKGOTO(cuMemMap(maddr, size, 0, impHandle, 0), ret, fail); - CUCHECKGOTO(cuMemSetAccess(maddr, size, &accessDesc, 1), ret, fail); - - if (r == comm->localRank) { - *symPtr = (void*)maddr; - } else { - CUCHECKGOTO(cuMemRelease(impHandle), ret, fail); - } - } - - INFO(NCCL_ALLOC, "IPC symmetric alloc buffer %p offset %ld size %ld symAllocHead %ld", *symPtr, offset, size, comm->symAllocHead); - -exit: - free(desc); - return ret; -fail: - goto exit; -} - -ncclResult_t ncclIpcSymmetricFree(struct ncclComm* comm, size_t size, void* symPtr) { - ncclResult_t ret = ncclSuccess; - if (comm && symPtr && size > 0) { - size_t offset = (size_t)symPtr - ((size_t)comm->baseUCSymPtr + comm->localRank * comm->baseStride); - for (int r = 0; r < comm->localRanks; ++r) { - CUdeviceptr peerAddr = (CUdeviceptr)(comm->baseUCSymPtr + r * comm->baseStride + offset); - CUCHECKGOTO(cuMemUnmap(peerAddr, size), ret, fail); - } - } -exit: - return ret; -fail: - goto exit; -} - struct ncclTransport p2pTransport = { "P2P", p2pCanConnect, diff --git a/src/transport/profiler.cc b/src/transport/profiler.cc index 6e7b33c16..354fa57bb 100644 --- a/src/transport/profiler.cc +++ b/src/transport/profiler.cc @@ -10,7 +10,7 @@ static ncclResult_t profilerProxyConnect(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done) { connection->proxyAppendPtr = &connection->proxyAppend; - connection->shared = 1; + connection->shared = 0; return ncclSuccess; } From e11d7f77c126561e35909407a5bd1461a437322b Mon Sep 17 00:00:00 2001 From: Mark Santesson Date: 
Wed, 10 Sep 2025 15:38:04 -0700 Subject: [PATCH 18/21] Add root CMakeLists.txt file --- CMakeLists.txt | 166 +++++++++++++++++++++++++++++++++++++++++ Makefile | 9 +-- ext-net/CMakeLists.txt | 4 + 3 files changed, 174 insertions(+), 5 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 ext-net/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..1941cdafe --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,166 @@ +cmake_minimum_required(VERSION 3.25) # ipp6 is using 3.28 + +# Version information +# Read makefiles/version.mk file +file(READ ${CMAKE_SOURCE_DIR}/makefiles/version.mk VERSION_CONTENT) +string(REGEX REPLACE ".*NCCL_MAJOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MAJOR "${VERSION_CONTENT}") +string(REGEX REPLACE ".*NCCL_MINOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MINOR "${VERSION_CONTENT}") +string(REGEX REPLACE ".*NCCL_PATCH[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_PATCH "${VERSION_CONTENT}") +string(REGEX REPLACE ".*NCCL_SUFFIX[ ]*:=[ ]*([a-zA-Z0-9]*).*" "\\1" NCCL_SUFFIX "${VERSION_CONTENT}") +string(REGEX REPLACE ".*PKG_REVISION[ ]*:=[ ]*([0-9]+).*" "\\1" PKG_REVISION "${VERSION_CONTENT}") +math(EXPR NCCL_VERSION_CODE "(${NCCL_MAJOR} * 10000) + (${NCCL_MINOR} * 100) + ${NCCL_PATCH}") + +# Make version information available to C++ source files +add_compile_definitions( + NCCL_USE_CMAKE + NCCL_MAJOR=${NCCL_MAJOR} + NCCL_MINOR=${NCCL_MINOR} + NCCL_PATCH=${NCCL_PATCH} + NCCL_VERSION_CODE=${NCCL_VERSION_CODE} +) + +set(ENV{NCCL_USE_CMAKE} "1") + +project(NCCL VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH} + LANGUAGES CUDA CXX C) + +# Make CMAKE_BUILD_TYPE to release by default if not set +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") +endif() + +option(VERBOSE "Enable verbose output" OFF) +option(KEEP "Keep intermediate files" OFF) +option(DEBUG "Enable debug build" OFF) +option(ASAN "Enable Address Sanitizer" OFF) +option(UBSAN "Enable Undefined Behavior Sanitizer" OFF) +option(TRACE "Enable tracing" OFF) +option(WERROR "Treat warnings as errors" OFF) +option(PROFAPI "Enable profiling API" ON) +option(NVTX "Enable NVTX" ON) +option(RDMA_CORE "Enable RDMA core" OFF) +option(NET_PROFILER "Enable network profiler" OFF) +option(MLX5DV "Enable MLX5DV" OFF) +option(MAX_EXT_NET_PLUGINS "Maximum external network plugins" 0) + +find_package(CUDAToolkit REQUIRED) + +# CUDA version detection +string(REGEX MATCH "([0-9]+\\.[0-9]+)" CUDA_VERSION "${CUDAToolkit_VERSION}") + +# Extract major and minor version numbers +string(REGEX MATCH "([0-9]+)" CUDA_MAJOR "${CUDA_VERSION}") +string(REGEX MATCH "([0-9]+)$" CUDA_MINOR "${CUDA_VERSION}") +string(REGEX REPLACE ".*\\.([0-9]+)$" "\\1" CUDA_MINOR "${CUDA_VERSION}") + +# Add CUDA version definitions after find_package +add_compile_definitions( + CUDA_MAJOR=${CUDA_MAJOR} + CUDA_MINOR=${CUDA_MINOR} +) + +# CUDA architecture flags +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "") + message(STATUS "CMAKE_CUDA_ARCHITECTURES not defined or empty, setting default values based on CUDA version") + + if(${CUDA_MAJOR} LESS 9) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61") + elseif(${CUDA_MAJOR} EQUAL 9) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70") + elseif(${CUDA_MAJOR} EQUAL 10) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70") + elseif(${CUDA_MAJOR} EQUAL 11) + if(${CUDA_MINOR} LESS 8) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80") + else() + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80;90") + endif() + elseif(${CUDA_MAJOR} EQUAL 12) + if(${CUDA_MINOR} LESS 8) + 
set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90") + else() + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;120") + endif() + elseif(${CUDA_MAJOR} EQUAL 13) + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;110;120") + else() + # For future CUDA versions, include all architectures up to the latest known + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;110;120") + endif() +endif() +message(STATUS "Using CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -Wvla -g") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -fPIC") + +# Sanitizer options +if(ASAN) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address -static-libasan") +endif() + +if(UBSAN) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined -static-libubsan") +endif() + +# Additional options +if(TRACE) + add_definitions(-DENABLE_TRACE) +endif() + +if(NOT NVTX) + add_definitions(-DNVTX_DISABLE) +endif() + +if(WERROR) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") +endif() + +if(PROFAPI) + add_definitions(-DPROFAPI) +endif() + +set(EXTRA_LIBS) + +# RDMA and MLX5DV are Linux-specific features +if(RDMA_CORE) + add_definitions(-DNCCL_BUILD_RDMA_CORE=1) + find_library(VERBS_LIBRARY NAMES verbs) + if(VERBS_LIBRARY) + list(APPEND EXTRA_LIBS ${VERBS_LIBRARY}) + endif() +endif() + +if(MLX5DV) + add_definitions(-DNCCL_BUILD_MLX5DV=1) + find_library(MLX5_LIBRARY NAMES mlx5) + if(MLX5_LIBRARY) + list(APPEND EXTRA_LIBS ${MLX5_LIBRARY}) + endif() +endif() + +if(NET_PROFILER) + add_definitions(-DNCCL_ENABLE_NET_PROFILING=1) +endif() + +if(MAX_EXT_NET_PLUGINS GREATER 0) + add_definitions(-DNCCL_NET_MAX_PLUGINS=${MAX_EXT_NET_PLUGINS}) +endif() + +# Library dependencies +find_library(RT_LIBRARY NAMES rt) +if(RT_LIBRARY) + list(APPEND EXTRA_LIBS ${RT_LIBRARY}) +endif() + +# Debug/Release specific flags +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0") +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS} -O0 -G -g") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3") +set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS} -O3") + +add_subdirectory(ext-net) +add_subdirectory(ext-profiler/example) +add_subdirectory(ext-tuner/example) +add_subdirectory(src) diff --git a/Makefile b/Makefile index caed3d42a..458a50741 100644 --- a/Makefile +++ b/Makefile @@ -3,15 +3,14 @@ # # See LICENSE.txt for license information # -.PHONY : all clean +.PHONY: all clean -default : src.build -install : src.install +default: src.build +install: src.install BUILDDIR ?= $(abspath ./build) ABSBUILDDIR := $(abspath $(BUILDDIR)) TARGETS := src pkg clean: ${TARGETS:%=%.clean} -test.build: src.build LICENSE_FILES := LICENSE.txt LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) lic: $(LICENSE_TARGETS) @@ -19,7 +18,7 @@ lic: $(LICENSE_TARGETS) ${BUILDDIR}/%.txt: %.txt @printf "Copying %-35s > %s\n" $< $@ mkdir -p ${BUILDDIR} - cp $< $@ + install -m 644 $< $@ src.%: ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} diff --git a/ext-net/CMakeLists.txt b/ext-net/CMakeLists.txt new file mode 100644 index 000000000..a2cc38df2 --- /dev/null +++ b/ext-net/CMakeLists.txt @@ -0,0 +1,4 @@ +# Since all the plugins generate binary with the same name, build only one of them +add_subdirectory(example) +# add_subdirectory(ib_sharp) +# 
add_subdirectory(mock) From 8d26308e6aba7f1667b24a861b5dc73f0f2e1f40 Mon Sep 17 00:00:00 2001 From: Mark Santesson Date: Wed, 24 Sep 2025 12:31:42 -0700 Subject: [PATCH 19/21] Add examples directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The NCCL examples directory provides users and developers with practical code samples that highlight NCCL’s core features. It covers basic operations like communicator initialization, point-to-point communication, and collective operations, as well as advanced features such as User Buffer (UB), symmetric memory, and the device API. --- Makefile | 6 +- .../Makefile | 60 ++++ .../README.md | 177 ++++++++++ .../main.cc | 256 ++++++++++++++ .../02_one_device_per_pthread/Makefile | 68 ++++ .../02_one_device_per_pthread/README.md | 158 +++++++++ .../02_one_device_per_pthread/main.cc | 196 ++++++++++ .../03_one_device_per_process_mpi/Makefile | 66 ++++ .../03_one_device_per_process_mpi/README.md | 196 ++++++++++ .../03_one_device_per_process_mpi/main.cc | 249 +++++++++++++ examples/01_communicators/Makefile | 59 ++++ examples/01_communicators/README.md | 107 ++++++ .../01_ring_pattern/Makefile | 57 +++ .../01_ring_pattern/README.md | 149 ++++++++ .../02_point_to_point/01_ring_pattern/main.cc | 273 ++++++++++++++ examples/02_point_to_point/Makefile | 47 +++ examples/02_point_to_point/README.md | 65 ++++ examples/03_collectives/01_allreduce/Makefile | 57 +++ .../03_collectives/01_allreduce/README.md | 141 ++++++++ examples/03_collectives/01_allreduce/main.cc | 201 +++++++++++ examples/03_collectives/Makefile | 47 +++ examples/03_collectives/README.md | 68 ++++ .../01_allreduce/Makefile | 77 ++++ .../01_allreduce/README.md | 163 +++++++++ .../01_allreduce/main.cc | 214 +++++++++++ examples/04_user_buffer_registration/Makefile | 47 +++ .../04_user_buffer_registration/README.md | 73 ++++ .../05_symmetric_memory/01_allreduce/Makefile | 77 ++++ .../01_allreduce/README.md | 165 +++++++++ .../05_symmetric_memory/01_allreduce/main.cc | 220 ++++++++++++ examples/05_symmetric_memory/Makefile | 47 +++ examples/05_symmetric_memory/README.md | 72 ++++ examples/06_device_api/01_allreduce/Makefile | 81 +++++ examples/06_device_api/01_allreduce/README.md | 218 ++++++++++++ examples/06_device_api/01_allreduce/main.cu | 251 +++++++++++++ examples/06_device_api/Makefile | 47 +++ examples/06_device_api/README.md | 70 ++++ examples/Makefile | 54 +++ examples/README.md | 146 ++++++++ examples/common/README.md | 36 ++ examples/common/include/mpi_utils.h | 23 ++ examples/common/include/nccl_utils.h | 40 +++ examples/common/include/utils.h | 55 +++ examples/common/src/utils.cc | 334 ++++++++++++++++++ makefiles/examples.mk | 31 ++ 45 files changed, 5243 insertions(+), 1 deletion(-) create mode 100644 examples/01_communicators/01_multiple_devices_single_process/Makefile create mode 100644 examples/01_communicators/01_multiple_devices_single_process/README.md create mode 100644 examples/01_communicators/01_multiple_devices_single_process/main.cc create mode 100644 examples/01_communicators/02_one_device_per_pthread/Makefile create mode 100644 examples/01_communicators/02_one_device_per_pthread/README.md create mode 100644 examples/01_communicators/02_one_device_per_pthread/main.cc create mode 100644 examples/01_communicators/03_one_device_per_process_mpi/Makefile create mode 100644 examples/01_communicators/03_one_device_per_process_mpi/README.md create mode 100644 examples/01_communicators/03_one_device_per_process_mpi/main.cc create mode 
100644 examples/01_communicators/Makefile create mode 100644 examples/01_communicators/README.md create mode 100644 examples/02_point_to_point/01_ring_pattern/Makefile create mode 100644 examples/02_point_to_point/01_ring_pattern/README.md create mode 100644 examples/02_point_to_point/01_ring_pattern/main.cc create mode 100644 examples/02_point_to_point/Makefile create mode 100644 examples/02_point_to_point/README.md create mode 100644 examples/03_collectives/01_allreduce/Makefile create mode 100644 examples/03_collectives/01_allreduce/README.md create mode 100644 examples/03_collectives/01_allreduce/main.cc create mode 100644 examples/03_collectives/Makefile create mode 100644 examples/03_collectives/README.md create mode 100644 examples/04_user_buffer_registration/01_allreduce/Makefile create mode 100644 examples/04_user_buffer_registration/01_allreduce/README.md create mode 100644 examples/04_user_buffer_registration/01_allreduce/main.cc create mode 100644 examples/04_user_buffer_registration/Makefile create mode 100644 examples/04_user_buffer_registration/README.md create mode 100644 examples/05_symmetric_memory/01_allreduce/Makefile create mode 100644 examples/05_symmetric_memory/01_allreduce/README.md create mode 100644 examples/05_symmetric_memory/01_allreduce/main.cc create mode 100644 examples/05_symmetric_memory/Makefile create mode 100644 examples/05_symmetric_memory/README.md create mode 100644 examples/06_device_api/01_allreduce/Makefile create mode 100644 examples/06_device_api/01_allreduce/README.md create mode 100644 examples/06_device_api/01_allreduce/main.cu create mode 100644 examples/06_device_api/Makefile create mode 100644 examples/06_device_api/README.md create mode 100644 examples/Makefile create mode 100644 examples/README.md create mode 100644 examples/common/README.md create mode 100644 examples/common/include/mpi_utils.h create mode 100644 examples/common/include/nccl_utils.h create mode 100644 examples/common/include/utils.h create mode 100644 examples/common/src/utils.cc create mode 100644 makefiles/examples.mk diff --git a/Makefile b/Makefile index 458a50741..2b1a57c5a 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -11,6 +11,7 @@ BUILDDIR ?= $(abspath ./build) ABSBUILDDIR := $(abspath $(BUILDDIR)) TARGETS := src pkg clean: ${TARGETS:%=%.clean} +examples.build: src.build LICENSE_FILES := LICENSE.txt LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) lic: $(LICENSE_TARGETS) @@ -23,6 +24,9 @@ ${BUILDDIR}/%.txt: %.txt src.%: ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} +examples: src.build + ${MAKE} -C examples NCCL_HOME=${ABSBUILDDIR} + pkg.%: ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} diff --git a/examples/01_communicators/01_multiple_devices_single_process/Makefile b/examples/01_communicators/01_multiple_devices_single_process/Makefile new file mode 100644 index 000000000..edea518fc --- /dev/null +++ b/examples/01_communicators/01_multiple_devices_single_process/Makefile @@ -0,0 +1,60 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = multiple_devices_single_process + +# Source files +SOURCES = main.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ + @echo "Built target $@" + +# Compile source files +%.o: %.cc + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." + @echo "Running with all available GPUs" + ./$(TARGET) + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Multiple Devices Single Process" + @echo "==============================================" + @echo "" + @echo "This example shows how to use ncclCommInitAll to create" + @echo "communicators for multiple GPUs in a single process." + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/01_communicators/01_multiple_devices_single_process/README.md b/examples/01_communicators/01_multiple_devices_single_process/README.md new file mode 100644 index 000000000..9b4fe0f4a --- /dev/null +++ b/examples/01_communicators/01_multiple_devices_single_process/README.md @@ -0,0 +1,177 @@ + + +# NCCL Example: Multiple Devices Single Process + +This example demonstrates how to use `ncclCommInitAll` to create NCCL +communicators for multiple GPUs within a single process, without requiring MPI +or threading. + +## Overview + +The `ncclCommInitAll` function provides a simplified way to initialize NCCL +communicators when: +- All GPUs are managed by a single process +- Running on a single node +- No multi-process coordination is needed + +This approach is ideal for single-node multi-GPU applications where simplicity +is preferred over the flexibility of multi-process setups. + +## What This Example Does + +1. **Device Detection**: + - Queries available CUDA devices + - Lists device properties for each GPU + +2. **Communicator Creation**: + - Uses `ncclCommInitAll` to create all communicators in one call + - Automatically assigns NCCL ranks 0 through n-1 + - No NCCL unique ID distribution needed + +3. **Verification**: + - Displays communicator information for each GPU + - Shows rank assignments and device mappings + - Confirms successful initialization + +4. **Cleanup**: + - Properly destroys communicators and streams + - Demonstrates correct resource management + +## Building and Running + +### Build +```shell +make [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run with all available GPUs +```shell +./multiple_devices_single_process +``` + +### Run with specific GPUs +```shell +# Use only GPUs 0 and 1 +CUDA_VISIBLE_DEVICES=0,1 ./multiple_devices_single_process +``` + +### Run with NCCL debug output +```shell +NCCL_DEBUG=INFO ./multiple_devices_single_process +``` + +## Code Walk-through + +### Key Function: ncclCommInitAll +For single-node collective examples we use `ncclCommInitAll` as it creates a clique of communicators in one call. 
+```c
+int num_gpus;      // num_gpus is set by querying the CUDA devices
+ncclComm_t* comms;
+int* devices;      // devices needs to be populated with the CUDA devices used
+
+// Create communicators for all devices in one call
+NCCLCHECK(ncclCommInitAll(comms, num_gpus, devices));
+```
+
+This single function call:
+- Creates `num_gpus` communicators
+- Assigns ranks 0 to (num_gpus-1)
+- Sets up internal communication paths
+- No unique ID needed
+
+### Comparison with ncclCommInitRank
+`ncclCommInitAll` is a convenience function and has the same functionality as:
+```c
+ncclUniqueId id;
+
+ncclGetUniqueId(&id);
+
+ncclGroupStart();
+for(int i = 0; i < num_gpus; i++) {
+  cudaSetDevice(devices[i]);
+  ncclCommInitRank(&comms[i], num_gpus, id, i);
+}
+ncclGroupEnd();
+```
+
+## Expected Output
+
+```
+Found 4 CUDA device(s) available
+
+Available GPU devices:
+  GPU 0: NVIDIA A100-SXM4-40GB (CUDA Device 0)
+    Compute Capability: 8.0
+    Memory: 40.0 GB
+  GPU 1: NVIDIA A100-SXM4-40GB (CUDA Device 1)
+    Compute Capability: 8.0
+    Memory: 40.0 GB
+  GPU 2: NVIDIA A100-SXM4-40GB (CUDA Device 2)
+    Compute Capability: 8.0
+    Memory: 40.0 GB
+  GPU 3: NVIDIA A100-SXM4-40GB (CUDA Device 3)
+    Compute Capability: 8.0
+    Memory: 40.0 GB
+Using ncclCommInitAll() to create all communicators simultaneously
+All 4 NCCL communicators initialized successfully
+
+Communicator Details:
+  Communicator 0: Rank 0/4 on CUDA device 0
+  Communicator 1: Rank 1/4 on CUDA device 1
+  Communicator 2: Rank 2/4 on CUDA device 2
+  Communicator 3: Rank 3/4 on CUDA device 3
+All communicators have the expected size of 4
+
+Synchronizing all CUDA streams...
+All streams synchronized
+Destroying NCCL communicators...
+All NCCL communicators destroyed
+Destroying CUDA streams...
+All CUDA streams destroyed
+
+=============================================================
+SUCCESS: Multiple devices single process example completed!
+=============================================================
+```
+
+## When to Use ncclCommInitAll
+
+### Ideal Use Cases
+- **Single-node workloads**: All GPUs on one machine
+- **Simple applications**: No multi-process complexity needed
+- **Testing/Development**: Quick setup for experiments
+
+### When NOT to Use
+- **Multi-node clusters**: Need MPI for cross-node communication
+- **Process isolation**: When GPUs should be in separate processes
+
+## Performance Considerations
+
+- **Advantages**:
+  - Lower overhead (no inter-process communication)
+  - Simpler memory management
+  - Direct access to all GPUs
+
+- **Disadvantages**:
+  - Limited by single process resources
+  - Cannot scale beyond one node
+
+## Common Issues and Solutions
+
+1. **Not all GPUs visible**:
+   - Check `CUDA_VISIBLE_DEVICES`
+   - Ensure user has permissions for all GPUs
+   - Verify no other process is using GPUs exclusively
+
+2. **Out of memory**:
+   - Single process must handle memory for all GPUs
+   - Consider using multiple processes if memory is limited
+
+## Next Steps
+
+After understanding this example:
+1. Try the collective operation examples using `ncclCommInitAll`
+2. Compare performance with MPI-based multi-process approach
+3.
Experiment with different GPU combinations diff --git a/examples/01_communicators/01_multiple_devices_single_process/main.cc b/examples/01_communicators/01_multiple_devices_single_process/main.cc new file mode 100644 index 000000000..19b4edc7c --- /dev/null +++ b/examples/01_communicators/01_multiple_devices_single_process/main.cc @@ -0,0 +1,256 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include +#include +#include + +/* + * NCCL Example: Multiple Devices Single Process + * ============================================= + * + * PURPOSE: + * This example demonstrates how to initialize NCCL communicators for multiple + * GPUs within a single process. This is the simplest NCCL setup and is ideal + * for learning NCCL basics or for applications that want to use multiple GPUs + * without the complexity of multi-process coordination. + * + * LEARNING OBJECTIVES: + * - Learn how to use ncclCommInitAll() for simple multi-GPU setups + * - See proper NCCL communicator lifecycle management + * - Understand GPU device management in NCCL applications + * - Learn proper resource cleanup patterns + * + * HOW IT WORKS: + * 1. Detect all available CUDA devices + * 2. Create communicators for all devices using ncclCommInitAll() + * 3. Verify communicator properties (rank, size, device assignment) + * 4. Clean up all resources properly + * + * KEY CONCEPTS: + * - ncclCommInitAll(): Creates multiple communicators in a single call + * - Single-process topology: All GPUs managed by one process + * - Device management: Setting active CUDA device for operations + * - Stream management: Each GPU gets its own CUDA stream + * + * WHEN TO USE THIS PATTERN: + * - Learning NCCL fundamentals + * - Single-node, multi-GPU applications + * - Applications that don't need multi-node scaling + * - Prototyping and testing NCCL functionality + * + * USAGE EXAMPLES: + * ./multiple_devices_single_process # Use all available GPUs + * + * EXPECTED OUTPUT: + * - Detection of all available GPUs + * - Successful communicator initialization + * - Display of rank/size information for each GPU + * - Clean resource cleanup confirmation + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// ============================================================================= +// MAIN FUNCTION - NCCL Communicator Lifecycle Example +// ============================================================================= + +int main(int argc, char *argv[]) { + // Variables for managing multiple GPU communicators + int num_gpus; // Number of available CUDA devices + ncclComm_t *comms = NULL; // Array of NCCL 
communicators (one per GPU) + cudaStream_t *streams = NULL; // Array of CUDA streams (one per GPU) + int *devices = NULL; // Array of device IDs to use + + // Discover how many CUDA devices are available + // This determines how many NCCL communicators we'll create + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + + if (num_gpus == 0) { + fprintf(stderr, "ERROR: No CUDA devices found on this system\n"); + fprintf( + stderr, + "Please ensure CUDA is properly installed and GPUs are available\n"); + return 1; + } + + printf("Found %d CUDA device(s) available\n\n", num_gpus); + + // ========================================================================= + // STEP 1: Prepare Device Information and Memory Allocation + // ========================================================================= + + // Allocate arrays to hold our per-device resources + // We need one communicator, stream, and device ID per GPU + devices = (int *)malloc(num_gpus * sizeof(int)); + comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t)); + streams = (cudaStream_t *)malloc(num_gpus * sizeof(cudaStream_t)); + + if (!devices || !comms || !streams) { + fprintf(stderr, "ERROR: Failed to allocate memory for device arrays\n"); + return 1; + } + + // Create device list and display device information + // By default, we use all available devices (0, 1, 2, ...) + printf("Available GPU devices:\n"); + for (int i = 0; i < num_gpus; i++) { + devices[i] = i; // Use device i for communicator i + + // Query device properties for informational display + cudaDeviceProp prop; + CUDACHECK(cudaGetDeviceProperties(&prop, devices[i])); + printf(" GPU %d: %s (CUDA Device %d)\n", i, prop.name, devices[i]); + printf(" Compute Capability: %d.%d\n", prop.major, prop.minor); + printf(" Memory: %.1f GB\n", + prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0)); + } + + // Create a CUDA stream for each GPU + // Each GPU needs its own stream for optimal performance + for (int i = 0; i < num_gpus; i++) { + // Set the active CUDA device before creating resources on it + // This ensures the stream is created on the correct GPU + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamCreate(&streams[i])); + } + + // ========================================================================= + // STEP 2 : Initialize NCCL Communicators + // ========================================================================= + + printf("Using ncclCommInitAll() to create all communicators " + "simultaneously\n"); + + // ncclCommInitAll() creates all communicators at once and handles the + // coordination internally + // + // Parameters: + // - comms: Array to store the created communicators + // - num_gpus: Number of communicators to create + // - devices: Array of CUDA device IDs to use + // + // After this call: + // - comms[0] will be the communicator for devices[0] with rank 0 + // - comms[1] will be the communicator for devices[1] with rank 1 + // - ... 
and so on + // + // All communicators will have the same 'size' (total number of + // participants) + NCCLCHECK(ncclCommInitAll(comms, num_gpus, devices)); + printf("All %d NCCL communicators initialized successfully\n\n", num_gpus); + + // ========================================================================= + // STEP 3: Create CUDA Streams and Verify Communicator Properties + // ========================================================================= + + printf("Communicator Details:\n"); + + bool sizes_match = true; + for (int i = 0; i < num_gpus; i++) { + + // Query the communicator to verify it was set up correctly + // These calls validate that NCCL properly assigned ranks and devices + int rank, size, device; + // Get this communicator's rank + NCCLCHECK(ncclCommUserRank(comms[i], &rank)); + // Get total number of participants + NCCLCHECK(ncclCommCount(comms[i], &size)); + // Get assigned CUDA device + NCCLCHECK(ncclCommCuDevice(comms[i], &device)); + + printf(" Communicator %d: Rank %d/%d on CUDA device %d", i, rank, size, + device); + + // Verify the assignment is correct + if (rank != i) { + printf(" [WARNING: Expected rank %d]", i); + } + if (device != devices[i]) { + printf(" [WARNING: Expected device %d]", devices[i]); + } + printf("\n"); + + // Verify that all communicators have the expected size + if (size != num_gpus) { + printf("WARNING: Communicator %d has size %d, expected %d\n", i, size, num_gpus); + sizes_match = false; + } + } + if (sizes_match) + printf("All communicators have the expected size of %d\n", num_gpus); + + printf("\n"); + + // ========================================================================= + // STEP 4: Cleanup and Resource Management + // ========================================================================= + + // IMPORTANT: Proper cleanup is critical for NCCL applications + // Resources must be cleaned up in the correct order to avoid issues + + // First, synchronize all streams to ensure no operations are in flight + // This prevents destroying resources while they're still being used + printf("Synchronizing all CUDA streams...\n"); + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamSynchronize(streams[i])); + } + printf("All streams synchronized\n"); + + // Next, destroy NCCL communicators first + // This must be done before destroying CUDA resources they depend on + printf("Destroying NCCL communicators...\n"); + for (int i = 0; i < num_gpus; i++) { + NCCLCHECK(ncclCommDestroy(comms[i])); + } + printf("All NCCL communicators destroyed\n"); + + // Finally, destroy CUDA streams + // This is safe now that the communicators are gone + printf("Destroying CUDA streams...\n"); + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamDestroy(streams[i])); + } + printf("All CUDA streams destroyed\n"); + + // Free host memory allocations + free(devices); + free(comms); + free(streams); + + printf("\n=============================================================\n"); + printf("SUCCESS: Multiple devices single process example completed!\n"); + printf("=============================================================\n\n"); + + return 0; +} diff --git a/examples/01_communicators/02_one_device_per_pthread/Makefile b/examples/01_communicators/02_one_device_per_pthread/Makefile new file mode 100644 index 000000000..f7f825a0b --- /dev/null +++ b/examples/01_communicators/02_one_device_per_pthread/Makefile @@ -0,0 +1,68 @@ +# +# Copyright (c) 2025, NVIDIA 
CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Include common build rules
+include ../../../makefiles/common.mk
+include ../../../makefiles/examples.mk
+
+# Target executable
+TARGET = one_device_per_pthread
+
+# Add pthread support
+LDFLAGS += -lpthread
+
+# Source files
+SOURCES = main.cc
+OBJECTS = $(SOURCES:.cc=.o)
+
+# Default target
+all: $(TARGET)
+
+# Build executable
+$(TARGET): $(OBJECTS)
+	$(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@
+	@echo "Built target $@"
+
+# Compile source files
+%.o: %.cc
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+# Test target
+test: $(TARGET)
+	@echo "Testing $(TARGET)..."
+	@echo "Running with default thread count (number of GPUs)"
+	./$(TARGET)
+	@echo ""
+	@if [ "$$(nvidia-smi -L | wc -l)" -ge 2 ]; then \
+		echo "Running with 2 threads"; \
+		NTHREADS=2 ./$(TARGET); \
+	fi
+
+# Clean build artifacts
+clean:
+	rm -f $(OBJECTS) $(TARGET)
+
+# Install target
+install: $(TARGET)
+	@mkdir -p $(PREFIX)/bin
+	cp $(TARGET) $(PREFIX)/bin/
+
+# Help
+help:
+	@echo "NCCL Example: One Device per Thread (pthread)"
+	@echo "============================================"
+	@echo ""
+	@echo "This example shows how to use ncclCommInitRank to create"
+	@echo "communicators for multiple GPUs using pthreads."
+	@echo ""
+	@echo "Targets:"
+	@echo "  all     - Build the example (default)"
+	@echo "  test    - Build and run basic tests"
+	@echo "  clean   - Remove build artifacts"
+	@echo "  install - Install to PREFIX/bin (default: /usr/local/bin)"
+	@echo "  help    - Show this help"
+
+.PHONY: all test clean install help
diff --git a/examples/01_communicators/02_one_device_per_pthread/README.md b/examples/01_communicators/02_one_device_per_pthread/README.md
new file mode 100644
index 000000000..170f235b5
--- /dev/null
+++ b/examples/01_communicators/02_one_device_per_pthread/README.md
@@ -0,0 +1,158 @@
+
+
+# NCCL Example: One Device per Thread (pthread)
+
+This example demonstrates NCCL communicator lifecycle management using pthreads, with one GPU per
+thread.
+
+## Overview
+
+This example shows how to use NCCL in a multi-threaded environment where each pthread manages one
+GPU device. It demonstrates the proper initialization and cleanup sequence for NCCL communicators
+within threads.
+
+## What This Example Does
+
+1. **Thread Creation**:
+   - Creates one pthread per available GPU, or `NTHREADS` threads if set
+   - Each thread manages its own CUDA device context
+
+2. **Communicator Creation**:
+   - Uses `ncclCommInitRank` with a unique ID shared across threads
+   - Each thread initializes its own communicator
+   - Demonstrates thread-safe NCCL initialization
+
+3. **Verification**:
+   - Queries communicator properties (rank, size, device)
+   - Confirms successful initialization across all threads
+
+4. **Cleanup**:
+   - Proper resource cleanup order within each thread
+   - Demonstrates correct NCCL and CUDA resource management
+
+## Building and Running
+
+### Build
+```shell
+make [NCCL_HOME=] [CUDA_HOME=]
+```
+
+### Run with specific thread count (number of GPUs)
+```shell
+[NTHREADS=n] ./one_device_per_pthread
+```
+
+### Run with NCCL debug output
+```shell
+NCCL_DEBUG=INFO ./one_device_per_pthread
+```
+
+## Code Walk-through
+
+### Key Function: ncclCommInitRank in threads
+```c
+// Each thread creates its own copy of struct `threadData_t`.
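+// Note: main() copies the ncclUniqueId returned by ncclGetUniqueId() by value
+// into each thread's threadData_t before pthread_create, so every thread
+// passes the same ID to ncclCommInitRank without any extra synchronization.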
+typedef struct {
+  int thread_id;        // thread_id is set when the thread is created
+  int num_gpus;         // num_gpus is set by querying the CUDA devices
+  ncclUniqueId commId;  // commId is set by ncclGetUniqueId
+  ncclComm_t* comms;
+} threadData_t;
+threadData_t* data;
+
+// Each thread initializes its own communicator
+NCCLCHECK(ncclCommInitRank(&data->comms[thread_id], data->num_gpus, data->commId, data->thread_id));
+```
+
+In this approach:
+- Each thread gets its own NCCL rank (0, 1, 2...)
+- No explicit distribution of the unique ID is needed: all threads share the process address space and receive the ID through their `threadData_t`.
+
+## Expected Output
+
+```
+Using 4 devices with pthreads
+Creating 4 threads for NCCL communicators
+  Thread 0: Set device 0 and created stream
+  Thread 1: Set device 1 and created stream
+  Thread 2: Set device 2 and created stream
+  Thread 3: Set device 3 and created stream
+  Thread 0: NCCL communicator initialized
+  Thread 1: NCCL communicator initialized
+  Thread 2: NCCL communicator initialized
+  Thread 3: NCCL communicator initialized
+All threads synchronized - communicators ready
+  Thread 0: Communicator rank 0 of 4
+  Thread 1: Communicator rank 1 of 4
+  Thread 2: Communicator rank 2 of 4
+  Thread 3: Communicator rank 3 of 4
+  Thread 0: Destroyed NCCL communicator
+  Thread 1: Destroyed NCCL communicator
+  Thread 2: Destroyed NCCL communicator
+  Thread 3: Destroyed NCCL communicator
+  Thread 0: Resources cleaned up
+  Thread 1: Resources cleaned up
+  Thread 2: Resources cleaned up
+  Thread 3: Resources cleaned up
+All threads completed
+Success
+```
+
+## When to Use pthread Approach
+
+### Ideal Use Cases
+- **Thread-based applications**: When your application is already threaded
+- **Single-node workloads**: All GPUs on one machine
+- **Shared memory**: Need to share data structures between GPU contexts
+
+### When NOT to Use
+- **Multi-node clusters**: Cannot scale beyond one node
+- **Process isolation**: When GPU contexts should be isolated
+- **Complex applications**: Multi-process approach may be cleaner
+
+## Performance Considerations
+
+- **Advantages**:
+  - Shared address space between threads
+  - Easier data sharing between GPU contexts
+  - No MPI overhead
+
+- **Disadvantages**:
+  - Thread synchronization complexity
+  - Limited to single node
+
+## Common Issues and Solutions
+
+1. **Thread synchronization errors**:
+   - Ensure all threads use the same NCCL unique ID
+   - Use proper pthread synchronization (barriers, joins)
+
+2. **CUDA context conflicts**:
+   - Each thread must call `cudaSetDevice()` before CUDA operations
+   - Don't share CUDA streams between threads
+
+3. **Resource cleanup order**:
+   - Always destroy NCCL communicators before CUDA resources
+   - Synchronize streams before destroying communicators
+
+## Error Handling
+
+The example uses simplified error handling with CHECK macros:
+- **CUDACHECK**: Exits immediately on CUDA errors
+- **NCCLCHECK**: Exits immediately on NCCL errors
+- **No async error checking**: Simplified for clarity
+- **Thread safety**: Each thread handles its own errors
+
+## Highlighted Environment Variables
+
+- `NTHREADS`: Number of threads to create (defaults to the number of GPUs)
+
+See examples/README.md for the full list.
+
+## Next Steps
+
+After understanding this example:
+1. Try using the collective examples and add the pthread approach
+2.
Compare with MPI-based multi-process approach diff --git a/examples/01_communicators/02_one_device_per_pthread/main.cc b/examples/01_communicators/02_one_device_per_pthread/main.cc new file mode 100644 index 000000000..9dc6f7c8b --- /dev/null +++ b/examples/01_communicators/02_one_device_per_pthread/main.cc @@ -0,0 +1,196 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include +#include +#include + +/** + * NCCL Pthread Example - One Device Per Thread (Simple Version) + * + * This example demonstrates the basic lifecycle of NCCL communicators in a + * multi-threaded environment. Each pthread manages one GPU device and shows + * how to properly create and destroy NCCL communicators. + * + * Key Learning Points: + * - NCCL communicator creation and destruction within threads + * - CUDA stream management per thread + * - Proper resource cleanup order + * + * This is a minimal example focusing purely on communicator lifecycle + * management without performing actual collective operations. + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// Thread data structure to pass parameters +typedef struct { + int thread_id; + int num_gpus; + ncclUniqueId commId; + ncclComm_t *comms; +} threadData_t; + +void *thread_worker(void *arg) { + threadData_t *data = (threadData_t *)arg; + int thread_id = data->thread_id; + cudaStream_t stream; + + // ========================================================================= + // Set Device Context and Create Stream + // ========================================================================= + // Each thread must set its device context before any CUDA operations + CUDACHECK(cudaSetDevice(thread_id)); + CUDACHECK(cudaStreamCreate(&stream)); + + printf(" Thread %d: Set device %d and created stream\n", thread_id, + thread_id); + + // ========================================================================= + // Initialize NCCL Communicator + // ========================================================================= + // Each thread creates its own communicator using the shared unique ID + NCCLCHECK(ncclCommInitRank(&data->comms[thread_id], data->num_gpus, data->commId, + thread_id)); + + printf(" Thread %d: NCCL communicator initialized\n", thread_id); + + if (thread_id == 0) { + printf("All threads initialized - communicators ready\n"); + } + + // ========================================================================= + // Query Communicator Properties + // ========================================================================= + // Verify the communicator was created correctly + int comm_thread_id, comm_size; + 
NCCLCHECK(ncclCommUserRank(data->comms[thread_id], &comm_thread_id)); + NCCLCHECK(ncclCommCount(data->comms[thread_id], &comm_size)); + + printf(" Thread %d: Communicator thread_id %d of %d\n", thread_id, + comm_thread_id, comm_size); + + // Synchronize CUDA stream to ensure all GPU work is complete + if (stream != NULL) { + CUDACHECK(cudaStreamSynchronize(stream)); + } + + // ========================================================================= + // Cleanup Resources (Proper Order) + // ========================================================================= + // Destroy NCCL communicator FIRST (before CUDA resources) + // This is important - NCCL cleanup should happen before CUDA cleanup + if (data->comms[thread_id] != NULL) { + NCCLCHECK(ncclCommDestroy(data->comms[thread_id])); + printf(" Thread %d: Destroyed NCCL communicator\n", comm_thread_id); + } + + // Now destroy CUDA stream + if (stream != NULL) { + CUDACHECK(cudaStreamDestroy(stream)); + } + + printf(" Thread %d: Resources cleaned up\n", thread_id); + + return NULL; +} + +int main(int argc, char *argv[]) { + int num_gpus; + pthread_t *threads; + threadData_t *threadData; + ncclComm_t *comms; + ncclUniqueId commId; + + // ========================================================================= + // STEP 1: Initialize Variables and Check GPU Availability + // ========================================================================= + + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + const char *nThreadsEnv = getenv("NTHREADS"); + if (nThreadsEnv) { + num_gpus = atoi(nThreadsEnv); + } + + if (num_gpus < 1) { + printf("No CUDA devices found\n"); + return EXIT_FAILURE; + } + + printf("Using %d devices with pthreads\n", num_gpus); + + // ========================================================================= + // STEP 2: Allocate Memory and Prepare Data Structures + // ========================================================================= + + threads = (pthread_t *)malloc(num_gpus * sizeof(pthread_t)); + threadData = (threadData_t *)malloc(num_gpus * sizeof(threadData_t)); + comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t)); + + // Generate unique ID for NCCL communicator initialization + NCCLCHECK(ncclGetUniqueId(&commId)); + + // ========================================================================= + // STEP 3: Create and Launch Pthread Threads + // ========================================================================= + + printf("Creating %d threads for NCCL communicators\n", num_gpus); + + for (int i = 0; i < num_gpus; i++) { + threadData[i].thread_id = i; + threadData[i].num_gpus = num_gpus; + threadData[i].commId = commId; + threadData[i].comms = comms; + + pthread_create(&threads[i], NULL, thread_worker, &threadData[i]); + } + + // ========================================================================= + // STEP 4: Wait for Thread Completion + // ========================================================================= + + for (int i = 0; i < num_gpus; i++) { + pthread_join(threads[i], NULL); + } + + printf("All threads completed\n"); + + // ========================================================================= + // STEP 5: Cleanup Resources + // ========================================================================= + + free(threads); + free(threadData); + free(comms); + + printf("Success\n"); + return 0; +} diff --git a/examples/01_communicators/03_one_device_per_process_mpi/Makefile b/examples/01_communicators/03_one_device_per_process_mpi/Makefile new file mode 100644 index 
000000000..d12e2fc27
--- /dev/null
+++ b/examples/01_communicators/03_one_device_per_process_mpi/Makefile
@@ -0,0 +1,66 @@
+#
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# This example needs to be built with MPI support
+MPI = 1
+
+# Include common build rules
+include ../../../makefiles/common.mk
+include ../../../makefiles/examples.mk
+
+# Target executable
+TARGET = one_device_per_process_mpi
+
+# Source files
+SOURCES = main.cc
+OBJECTS = $(SOURCES:.cc=.o)
+
+# Default target
+all: $(TARGET)
+
+# Build executable
+$(TARGET): $(OBJECTS)
+	$(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@
+	@echo "Built target $@"
+
+# Compile source files
+%.o: %.cc
+	$(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+
+# Test target
+test: $(TARGET)
+	@echo "Testing $(TARGET)..."
+	@echo "Running with 1 process"
+	$(MPIRUN) -np 1 ./$(TARGET)
+	@echo ""
+	@echo "Running with 2 processes"
+	$(MPIRUN) -np 2 ./$(TARGET)
+
+# Clean build artifacts
+clean:
+	rm -f $(OBJECTS) $(TARGET)
+
+# Install target
+install: $(TARGET)
+	@mkdir -p $(PREFIX)/bin
+	cp $(TARGET) $(PREFIX)/bin/
+
+# Help
+help:
+	@echo "NCCL Example: One Device per Process (MPI)"
+	@echo "=========================================="
+	@echo ""
+	@echo "This example shows how to use ncclCommInitRank to create"
+	@echo "communicators with one GPU per MPI process."
+	@echo ""
+	@echo "Targets:"
+	@echo "  all     - Build the example (default)"
+	@echo "  test    - Build and run tests with different process counts"
+	@echo "  clean   - Remove build artifacts"
+	@echo "  install - Install to PREFIX/bin (default: /usr/local/bin)"
+	@echo "  help    - Show this help"
+
+.PHONY: all test clean install help
diff --git a/examples/01_communicators/03_one_device_per_process_mpi/README.md b/examples/01_communicators/03_one_device_per_process_mpi/README.md
new file mode 100644
index 000000000..5d3d2b40d
--- /dev/null
+++ b/examples/01_communicators/03_one_device_per_process_mpi/README.md
@@ -0,0 +1,196 @@
+
+
+# NCCL Example: One Device per Process (MPI)
+
+This example demonstrates NCCL communicator lifecycle management using MPI, with
+one GPU per MPI process.
+
+## Overview
+
+This example shows one of the most common NCCL deployment patterns: one GPU
+device per process. This approach is ideal for distributed training across
+multiple nodes and provides the foundation for scalable multi-GPU applications.
+MPI is used because it provides a parallel launcher and broadcast functions. It is,
+however, not a requirement for multi-node NCCL applications.
+
+Other approaches use server-client models or spawn parallel processes using
+sockets. NCCL only requires that the unique ID is distributed to each
+thread/process taking part in collective communication and that all threads/processes
+call an NCCL initialization function.
+
+## What This Example Does
+
+1. **Multi-node Support**:
+   - Determines local rank on each node automatically
+   - Maps MPI processes to GPUs on each node
+   - Uses `MPI_Comm_split_type` with `MPI_COMM_TYPE_SHARED` to assign each local
+     rank a GPU.
+
+2. **Communicator Creation**:
+   - Uses `ncclCommInitRank` with an MPI-coordinated unique ID
+   - Rank `0` generates and broadcasts the NCCL unique ID
+   - Each process joins the distributed communicator
+
+3. **Verification**:
+   - Displays MPI rank → NCCL rank → GPU device mapping
+   - Confirms successful initialization across all processes
+
+4. **Cleanup**:
+   - Proper resource cleanup order
+   - MPI synchronization for clean shutdown
+
+## Building and Running
+
+### Build
+```shell
+make MPI=1 [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=]
+```
+
+### Run example
+```shell
+mpirun -np ./one_device_per_process_mpi
+```
+
+### Run with NCCL debug output
+```shell
+NCCL_DEBUG=INFO mpirun -np ./one_device_per_process_mpi
+```
+
+## Code Walk-through
+
+This approach:
+- Automatically handles multi-node GPU assignment
+- Uses MPI for coordination and NCCL for GPU communication
+- Supports both single-node and multi-node deployments
+
+### Unique ID Distribution
+The NCCL unique ID must be shared with all processes that call `ncclCommInitRank`. We use MPI for that:
+```c
+// Rank 0 generates unique ID
+if (mpi_rank == 0) {
+  NCCLCHECK(ncclGetUniqueId(&nccl_id));
+}
+
+// Broadcast to all processes
+MPI_Bcast(&nccl_id, sizeof(ncclUniqueId), MPI_BYTE, 0, MPI_COMM_WORLD);
+```
+
+### Key Function: Multi-node GPU assignment
+```c
+// Separate function to determine the node-local rank via `MPI_Comm_split_type`
+int local_rank = getLocalRank(MPI_COMM_WORLD);
+
+// Use the local rank as the GPU device number. This assumes you start at most
+// as many processes per node as there are available GPUs.
+CUDACHECK(cudaSetDevice(local_rank));
+
+ncclComm_t comm;
+int mpi_rank, mpi_size;  // mpi_rank & mpi_size are set during MPI initialization
+ncclUniqueId nccl_id;    // nccl_id is generated and broadcast as above
+
+// Initialize NCCL communicator across all processes
+NCCLCHECK(ncclCommInitRank(&comm, mpi_size, nccl_id, mpi_rank));
+```
+
+## Expected Output
+
+### Single Node (4 processes)
+```
+Starting NCCL communicator lifecycle example with 4 processes
+  MPI initialized - Process 0 of 4 total processes
+  Found 4 CUDA devices on this node
+  MPI rank 0 assigned to CUDA device 0
+Rank 0 generated NCCL unique ID for all processes
+  Rank 0 received NCCL unique ID
+  Rank 0 created NCCL communicator
+  MPI rank 0 → NCCL rank 0/4 on GPU device 0
+
+[Similar output for ranks 1-3]
+
+All communicators initialized successfully! Beginning cleanup...
+  Rank 0 destroyed NCCL communicator
+
+All NCCL communicators created and cleaned up properly!
+This example demonstrated the complete NCCL communicator lifecycle.
+Next steps: Try running NCCL collective operations (AllReduce, etc.)
+```
+
+### Multi-node (8 processes, 2 nodes)
+```
+Starting NCCL communicator lifecycle example with 8 processes
+  MPI initialized - Process 0 of 8 total processes
+  MPI initialized - Process 1 of 8 total processes
+  MPI initialized - Process 2 of 8 total processes
+  MPI initialized - Process 3 of 8 total processes
+  MPI initialized - Process 4 of 8 total processes
+  MPI initialized - Process 5 of 8 total processes
+  MPI initialized - Process 6 of 8 total processes
+  MPI initialized - Process 7 of 8 total processes
+...
+
+  MPI rank 0 → NCCL rank 0/8 on GPU device 0
+  MPI rank 1 → NCCL rank 1/8 on GPU device 1
+  MPI rank 2 → NCCL rank 2/8 on GPU device 2
+  MPI rank 3 → NCCL rank 3/8 on GPU device 3
+  MPI rank 4 → NCCL rank 4/8 on GPU device 0
+  MPI rank 5 → NCCL rank 5/8 on GPU device 1
+  MPI rank 6 → NCCL rank 6/8 on GPU device 2
+  MPI rank 7 → NCCL rank 7/8 on GPU device 3
+
+All NCCL communicators created and cleaned up properly!
+``` + +## When to Use MPI Approach + +### Ideal Use Cases +- **Multi-node clusters**: Scales across multiple machines +- **Production deployments**: Industry standard for distributed training, + inference, and most HPC codes +- **Process isolation**: Each GPU in separate process for robustness +- **Large scale**: Supports thousands of GPUs + +### When NOT to Use +- **Single-node testing**: Simpler approaches available +- **No MPI available**: Some environments don't support MPI +- **Shared memory needs**: Single-process approaches may be simpler + +## Performance Considerations + +- **Advantages**: + - MPI has been optimized for large parallel startup. + - Industry standard deployment pattern + - Optimal for large-scale training + +- **Disadvantages**: + - MPI setup complexity + - Inter-process communication overhead + - Requires MPI runtime environment + +## Common Issues and Solutions + +1. **More MPI processes than GPUs on a node**: + - The example reports an error if local rank exceeds available devices + - Use fewer processes per node or more GPUs + +2. **MPI broadcast hangs**: + - Ensure all ranks participate in collective operations + - Check MPI installation and network connectivity + +3. **Multi-node communication fails**: + - Check firewall settings and network configuration + - Set `NCCL_SOCKET_IFNAME` to specify network interface + +## Error Handling + +The example uses simplified error handling with CHECK macros: +- **CUDACHECK**: Exits immediately on CUDA errors +- **NCCLCHECK**: Exits immediately on NCCL errors +- **No async error checking**: Simplified for clarity +- **No global error coordination**: Each process exits on its own errors + +## Next Steps + +After understanding this example: +1. Try running collective operations (AllReduce, AllGather, etc.) +2. Experiment with multi-node deployments diff --git a/examples/01_communicators/03_one_device_per_process_mpi/main.cc b/examples/01_communicators/03_one_device_per_process_mpi/main.cc new file mode 100644 index 000000000..44a2377bf --- /dev/null +++ b/examples/01_communicators/03_one_device_per_process_mpi/main.cc @@ -0,0 +1,249 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "mpi.h" +#include "nccl.h" +#include +#include +#include +#include + +/** + * NCCL Example: One Device per Process with MPI + * ============================================= + * + * LEARNING OBJECTIVE: + * This example teaches the fundamental NCCL pattern: one GPU device per MPI + * process. This is the most common deployment pattern for multi-GPU distributed + * training. + * + * WHAT THIS CODE DEMONSTRATES: + * - How to initialize NCCL communicators across multiple processes + * - Proper GPU assignment in both single-node and multi-node environments + * - Complete NCCL communicator lifecycle management + * - Error handling best practices for production code + * + * STEP-BY-STEP PROCESS: + * 1. MPI Setup: Initialize MPI and determine process layout + * 2. GPU Assignment: Map each process to a local GPU device + * 3. NCCL ID Sharing: Rank 0 creates unique ID, broadcasts to all processes + * 4. Communicator Creation: Each process joins the NCCL communicator + * 5. Verification: Query and verify communicator properties + * 6. 
Clean Shutdown: Properly destroy all resources in correct order + * + * MULTI-NODE INTELLIGENCE: + * - Automatically detects which processes are on the same physical node + * - Assigns local GPU indices (0, 1, 2, 3...) to processes on each node + * - Uses MPI_Comm_split_type with MPI_COMM_TYPE_SHARED for robust node + * identification + * - Leverages MPI's native shared memory detection for optimal performance + * + * USAGE EXAMPLES: + * Single node (4 GPUs): mpirun -np 4 ./one_device_per_process_mpi + * + * EXPECTED OUTPUT: + * Each process will report: MPI rank → NCCL rank → GPU device assignment + * Success message confirms all communicators were created properly + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// ============================================================================= +// LOCAL RANK UTILITY FUNCTION - For Multi-Node GPU Assignment +// ============================================================================= + +/** + * Determine the local rank of this process on its physical node + * + * Algorithm: + * 1. Split the communicator based on shared memory (i.e., nodes) + * 2. Get the rank within the node communicator + * 3. This rank becomes the local rank for GPU assignment + * + * @param comm The MPI communicator to use for determining local rank + * @return Local rank (0, 1, 2...) 
for GPU assignment, or -1 on error + */ +int getLocalRank(MPI_Comm comm) { + + int world_size; + MPI_Comm_size(comm, &world_size); + + int world_rank; + MPI_Comm_rank(comm, &world_rank); + + // Split the communicator based on shared memory (i.e., nodes) + MPI_Comm node_comm; + MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, world_rank, MPI_INFO_NULL, + &node_comm); + + // Get the rank and size within the node communicator + int node_rank, node_size; + MPI_Comm_rank(node_comm, &node_rank); + MPI_Comm_size(node_comm, &node_size); + + // Clean up the node communicator + MPI_Comm_free(&node_comm); + + return node_rank; +} + +// ============================================================================= +// MAIN FUNCTION - NCCL Communicator Lifecycle Example +// ============================================================================= + +int main(int argc, char *argv[]) { + // Variables for MPI, CUDA, and NCCL components + int mpi_rank, mpi_size, local_rank; + int num_gpus = 0; + ncclComm_t comm = NULL; + cudaStream_t stream = NULL; + ncclUniqueId nccl_id; + + // ========================================================================= + // STEP 1: Initialize MPI and determine process layout + // ========================================================================= + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + MPI_Comm_size(MPI_COMM_WORLD, &mpi_size); + + + if (mpi_rank == 0) { + printf("Starting NCCL communicator lifecycle example with %d processes\n", + mpi_size); + } + // Determine which local GPU this process should use + local_rank = getLocalRank(MPI_COMM_WORLD); + + printf(" MPI initialized - Process %d of %d total processes\n", mpi_rank, + mpi_size); + + // ========================================================================= + // STEP 2: Setup CUDA device for this process + // ========================================================================= + + // Check how many CUDA devices are available on this node + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + printf(" Found %d CUDA devices on this node\n", num_gpus); + + if (num_gpus == 0) { + fprintf(stderr, "ERROR: No CUDA devices found on this node!\n"); + exit(EXIT_FAILURE); + } + + if (local_rank >= num_gpus) { + fprintf(stderr, + "ERROR: Process %d needs GPU %d but only %d devices available\n", + mpi_rank, local_rank, num_gpus); + exit(EXIT_FAILURE); + } + + // Assign this process to its designated GPU device + CUDACHECK(cudaSetDevice(local_rank)); + + // Create CUDA stream for GPU operations + CUDACHECK(cudaStreamCreate(&stream)); + + printf(" MPI rank %d assigned to CUDA device %d\n", mpi_rank, + local_rank); + + // ========================================================================= + // STEP 3: Initialize NCCL communicator + // ========================================================================= + + // Generate NCCL unique ID (only rank 0 needs to do this) + if (mpi_rank == 0) { + NCCLCHECK(ncclGetUniqueId(&nccl_id)); + printf("Rank 0 generated NCCL unique ID for all processes\n"); + } + + // Share the unique ID with all processes using MPI broadcast + MPI_Bcast(&nccl_id, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, MPI_COMM_WORLD); + printf("INFO: Rank %d received NCCL unique ID\n", mpi_rank); + + // Create NCCL communicator for this process + // This is where each process joins the distributed NCCL communicator + NCCLCHECK(ncclCommInitRank(&comm, mpi_size, nccl_id, mpi_rank)); + printf(" Rank %d created NCCL communicator\n", mpi_rank); + + // 
========================================================================= + // STEP 4: Verify communicator setup + // ========================================================================= + + // Query communicator properties to verify everything is set up correctly + int comm_rank, comm_size, comm_device; + NCCLCHECK(ncclCommUserRank(comm, &comm_rank)); + NCCLCHECK(ncclCommCount(comm, &comm_size)); + NCCLCHECK(ncclCommCuDevice(comm, &comm_device)); + + printf(" MPI rank %d → NCCL rank %d/%d on GPU device %d\n", mpi_rank, + comm_rank, comm_size, comm_device); + + // Give all processes a chance to finish their printf + MPI_Barrier(MPI_COMM_WORLD); + + // ========================================================================= + // STEP 5: Clean shutdown and resource cleanup + // ========================================================================= + + if (mpi_rank == 0) { + printf( + "\nAll communicators initialized successfully! Beginning cleanup...\n"); + } + + // Synchronize CUDA stream to ensure all GPU work is complete + if (stream != NULL) { + CUDACHECK(cudaStreamSynchronize(stream)); + } + + // Destroy NCCL communicator FIRST (before CUDA resources) + // This is important - NCCL cleanup should happen before CUDA cleanup + if (comm != NULL) { + NCCLCHECK(ncclCommDestroy(comm)); + printf(" Rank %d destroyed NCCL communicator\n", mpi_rank); + } + + // Now destroy CUDA stream + if (stream != NULL) { + CUDACHECK(cudaStreamDestroy(stream)); + } + + if (mpi_rank == 0) { + printf( + "\nAll NCCL communicators created and cleaned up properly!\n"); + printf("This example demonstrated the complete NCCL communicator " + "lifecycle.\n"); + printf("Next steps: Try running NCCL collective operations (AllReduce, " + "etc.)\n"); + } + + MPI_Finalize(); + return 0; +} diff --git a/examples/01_communicators/Makefile b/examples/01_communicators/Makefile new file mode 100644 index 000000000..1b89e5904 --- /dev/null +++ b/examples/01_communicators/Makefile @@ -0,0 +1,59 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../makefiles/common.mk + +# NCCL Fundamental Examples +EXAMPLES = 01_multiple_devices_single_process 02_one_device_per_pthread + +ifeq ($(MPI), 1) +EXAMPLES += 03_one_device_per_process_mpi +endif + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done +ifneq ($(MPI),1) + $(MAKE) -C 03_one_device_per_process_mpi clean +endif + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test || exit 1; \ + done + +# Help +help: + @echo "NCCL Communicator Init Examples" + @echo "===============================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_multiple_devices_single_process - Create communicators using multiple GPUs in a single thread" + @echo " 02_one_device_per_pthread - Create communicators using one GPU per thread" + @echo " 03_one_device_per_process_mpi - Create communicators using one GPU per MPI process" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_multiple_devices_single_process" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/01_communicators/README.md b/examples/01_communicators/README.md new file mode 100644 index 000000000..1c2218c9e --- /dev/null +++ b/examples/01_communicators/README.md @@ -0,0 +1,107 @@ + + +# NCCL Communicator Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL communicator +lifecycle management (creation, query, and destruction) using different +initialization patterns. 
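+
+As a quick orientation, the sketch below shows the lifecycle every example in
+this directory follows — create the communicators, query them, destroy them —
+using the simplest pattern (`ncclCommInitAll`). It is a minimal illustration
+only: it assumes the `NCCLCHECK`/`CUDACHECK` macros defined in the example
+sources and skips stream and buffer setup; see the individual examples for
+complete, runnable code.
+
+```c
+// Create one communicator per visible GPU in a single call
+int num_gpus = 0;
+CUDACHECK(cudaGetDeviceCount(&num_gpus));
+ncclComm_t *comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t));
+NCCLCHECK(ncclCommInitAll(comms, num_gpus, NULL));
+
+// Query each communicator to confirm its rank, size, and device
+for (int i = 0; i < num_gpus; i++) {
+  int rank, size, device;
+  NCCLCHECK(ncclCommUserRank(comms[i], &rank));
+  NCCLCHECK(ncclCommCount(comms[i], &size));
+  NCCLCHECK(ncclCommCuDevice(comms[i], &device));
+}
+
+// Destroy the communicators before freeing the host-side array
+for (int i = 0; i < num_gpus; i++) {
+  NCCLCHECK(ncclCommDestroy(comms[i]));
+}
+free(comms);
+```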
+ +## Examples + +### [01_multiple_devices_single_process](01_multiple_devices_single_process/) +**Multiple Devices Single Process** +- **Pattern**: Single process manages all GPUs +- **API**: `ncclCommInitAll` (no external coordination) +- **Use case**: Simple single-node applications +- **Key features**: + - Simplest initialization method + - No MPI or threading required + - Automatic rank assignment (0 to n-1) + - Cannot span multiple nodes + +**Run command:** +```shell +./01_multiple_devices_single_process/multiple_devices_single_process +``` + +### [02_one_device_per_pthread](02_one_device_per_pthread/) +**One Device per Thread with pthreads** +- **Pattern**: One thread per GPU within single process +- **API**: `ncclCommInitRank` with pthread coordination +- **Use case**: Single-node multi-GPU, thread-based parallelism +- **Key features**: + - pthread barriers for synchronization + - Shared memory for unique ID + - Lower overhead than multi-process + - Cannot span multiple nodes + +**Run command:** +```shell +[NTHREADS=n] ./02_one_device_per_pthread/one_device_per_pthread +``` + +### [03_one_device_per_process_mpi](03_one_device_per_process_mpi/) +**One Device per Process with MPI** +- **Pattern**: One MPI process per GPU +- **API**: `ncclCommInitRank` with MPI coordination +- **Use case**: Multi-node clusters, distributed training +- **Key features**: + - MPI broadcast for unique ID distribution + - Process-to-GPU mapping by local MPI ranks + - Scalable to multiple nodes + +**Run command:** +```shell +mpirun -np ./03_one_device_per_process_mpi/one_device_per_process_mpi +``` + +## Choosing the Right Approach + +| Feature | ncclCommInitAll | pthread | MPI | +|------------------------|-----------------|------------------|----------| +| **Multi-node support** | ✗ | ✗ | ✓ | +| **Process isolation** | ✗ | ✗ | ✓ | +| **Setup complexity** | Low | Medium | High | +| **Memory overhead** | Low | Medium | High | +| **Best for** | Simple test | Single-node apps | Clusters | + +### When to use each: +- **ncclCommInitAll**: Development, testing, simple single-node apps +- **pthread**: Single-node with thread-based parallelism needs +- **MPI**: Production distributed training, multi-node setups + +## Building + +### **Quick Start** +```shell +# Build all examples [or single directory] +make [directory] + +# Test all examples +make test +``` + +### **Individual Examples** +```shell +# Build specific example +make 01_multiple_devices_single_process +make 02_one_device_per_pthread +make 03_one_device_per_process_mpi + +# Test individual example +cd 01_multiple_devices_single_process && make test +cd 02_one_device_per_pthread && make test +cd 03_one_device_per_process_mpi && make test +``` + +## References +- [NVIDIA NCCL User Guide + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) +- [MPI Standard](https://www.mpi-forum.org/docs/) diff --git a/examples/02_point_to_point/01_ring_pattern/Makefile b/examples/02_point_to_point/01_ring_pattern/Makefile new file mode 100644 index 000000000..52b51dbfa --- /dev/null +++ b/examples/02_point_to_point/01_ring_pattern/Makefile @@ -0,0 +1,57 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = ring_pattern + +# Source files +SOURCES = main.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ + @echo "Built target $@" + +# Compile source files +%.o: %.cc + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." + @echo "Running with all available GPUs" + ./$(TARGET) + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: P2P Ring Pattern" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/02_point_to_point/01_ring_pattern/README.md b/examples/02_point_to_point/01_ring_pattern/README.md new file mode 100644 index 000000000..7ed046e59 --- /dev/null +++ b/examples/02_point_to_point/01_ring_pattern/README.md @@ -0,0 +1,149 @@ + + +# NCCL Ring Communication Pattern Example + +This example demonstrates a ring communication pattern using NCCL P2P +operations. It runs on a single node where a single process manages all GPUs and +data flows in a circular pattern. + +## Overview + +The ring communication pattern creates a circular data flow where each GPU sends +data to its "next" neighbor and receives from its "previous" neighbor in the +ring. This example uses `ncclCommInitAll` for simplified single-threaded, +single-process multi-GPU setup. + +## What This Example Does + +1. **Detects and initializes all available GPUs** using `ncclCommInitAll` for + simplified single-process setup +2. **Creates ring topology** where each GPU calculates its next and previous + neighbors using modulo +3. **Executes simultaneous point-to-point communication** with each GPU sending + to next and receiving from previous +4. 
**Verifies data correctness** by checking that each GPU received the expected + data from its predecessor + +## Building and Running + +### Build the Example +```bash +cd examples/02_point_to_point/01_ring_pattern +make [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run with All Available GPUs +```bash +./ring_pattern +``` + +## Code Walk-through + +### Ring Topology Setup + +The example calculates ring neighbors using modulo arithmetic: + +```cpp +for (int i = 0; i < num_gpus; i++) { + int next = (i + 1) % num_gpus; // Next neighbor in ring + int prev = (i - 1 + num_gpus) % num_gpus; // Previous neighbor in ring +} +``` + +### Simultaneous Communication + +Uses `ncclGroupStart/End` to prevent deadlocks when scheduling all send and +receive operations: + +```cpp +float **d_sendbuff; // device side send and receive buffer are allocated through cudaMalloc +float **d_recvbuff; +size_t count; // count is set to the number of floats to be sent (usually the size of the buffers) +ncclComm_t *comms; // comms are set during ncclCommInitAll +cudaStream_t *streams; // streams are set in cudaStreamCreate + +// Each GPU simultaneously sends to next and receives from previous +NCCLCHECK(ncclGroupStart()); +for (int i = 0; i < num_gpus; i++) { + int next = (i + 1) % num_gpus; + int prev = (i - 1 + num_gpus) % num_gpus; + + NCCLCHECK(ncclSend(d_sendbuff[i], count, ncclFloat, next, comms[i], streams[i])); + NCCLCHECK(ncclRecv(d_recvbuff[i], count, ncclFloat, prev, comms[i], streams[i])); +} +NCCLCHECK(ncclGroupEnd()); +``` + +## Expected Output + +``` +Starting NCCL ring communication example +Using 4 GPUs for ring communication +Preparing data structures +Initializing NCCL communicators +All communicators initialized successfully +Creating CUDA streams and verifying setup + GPU 0 -> NCCL rank 0/4 on CUDA device 0 + GPU 1 -> NCCL rank 1/4 on CUDA device 1 + GPU 2 -> NCCL rank 2/4 on CUDA device 2 + GPU 3 -> NCCL rank 3/4 on CUDA device 3 +Setting up ring topology +Data flow -> GPU 0 -> GPU 1 -> ... -> GPU 3 -> GPU 0 +Ring transfer with 268435456 elements (1.00 GB per GPU) +Allocating and initializing buffers +Executing ring communication + GPU 0 sends to GPU 1, receives from GPU 3 + GPU 1 sends to GPU 2, receives from GPU 0 + GPU 2 sends to GPU 3, receives from GPU 1 + GPU 3 sends to GPU 0, receives from GPU 2 +Ring communication completed successfully +Verifying data correctness + GPU 0 received data from GPU 3: CORRECT + GPU 1 received data from GPU 0: CORRECT + GPU 2 received data from GPU 1: CORRECT + GPU 3 received data from GPU 2: CORRECT +SUCCESS - All GPUs received correct data +Cleaning up resources +Example completed successfully! +``` + +## When to Use + +- **Learning NCCL fundamentals**: Understanding point-to-point communication + patterns +- **Algorithm development**: Building custom collective operations based on + point to point communications +- **Single-node applications**: Pipeline parallelism or custom data distribution + patterns + +## Key Insights +- `ncclCommInitAll` simplifies single-node multi-GPU setup +- No MPI or pthreads needed for single-node patterns +- Ring pattern enables circular data flow among all GPUs +- `ncclGroupStart/End` prevents deadlock in simultaneous operations +- Each GPU both sends and receives in parallel + +## Common Issues and Solutions + +### Issue: Deadlock without group operations +**Solution:** Always use `ncclGroupStart()` and `ncclGroupEnd()` when performing +simultaneous send/recv operations. 
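+
+For contrast, a sketch of the ungrouped version (reusing the variables from the
+walk-through above) shows why grouping matters in a single-process, multi-GPU
+setup: the first blocking call can wait forever because this single thread
+never gets to issue the matching receive on the peer's communicator.
+
+```cpp
+// Ungrouped (do NOT do this when one thread drives several communicators):
+for (int i = 0; i < num_gpus; i++) {
+  int next = (i + 1) % num_gpus;
+  int prev = (i - 1 + num_gpus) % num_gpus;
+  // May block before the matching ncclRecv on comms[next] is ever posted
+  NCCLCHECK(ncclSend(d_sendbuff[i], count, ncclFloat, next, comms[i], streams[i]));
+  NCCLCHECK(ncclRecv(d_recvbuff[i], count, ncclFloat, prev, comms[i], streams[i]));
+}
+// Wrapping the same loop in ncclGroupStart()/ncclGroupEnd() defers execution
+// until all sends and receives are declared, which avoids the deadlock.
+```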
+ +### Issue: Verification failures +**Solution:** Check ring topology calculations and data initialization patterns. +Ensure correct neighbor calculations. + +## Error Handling + +This example uses comprehensive error checking with `NCCLCHECK` and `CUDACHECK` +macros that immediately exit on any failure. In production code, consider more +graceful error handling and recovery mechanisms. + +## Next Steps + +After this example, try: +- **Collective operations**: Examples in `03_collectives/` +- **Multi-node approach**: Use the MPI implementation from `01_communicators` to + send data across nodes. diff --git a/examples/02_point_to_point/01_ring_pattern/main.cc b/examples/02_point_to_point/01_ring_pattern/main.cc new file mode 100644 index 000000000..2f317b9b5 --- /dev/null +++ b/examples/02_point_to_point/01_ring_pattern/main.cc @@ -0,0 +1,273 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include +#include +#include +#include + +/* + * NCCL Ring Pattern Example - Educational Version + * + * This example demonstrates the fundamental ring communication pattern using + * NCCL's point-to-point operations. Understanding ring patterns is essential + * for NCCL programming as they form the basis of many collective algorithms. + * + * Learning Objectives: + * - Understand ring topology and neighbor communication + * - Learn NCCL point-to-point send/recv operations + * - See how data flows in a ring pattern + * - Practice deadlock avoidance with ncclGroup operations + * - Understand single-process multi-GPU patterns + * + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +int main(int argc, char *argv[]) { + // ======================================================================== + // STEP 1: Initialize Environment and Detect GPUs + // ======================================================================== + + int num_gpus = 0; + ncclComm_t *comms = NULL; + cudaStream_t *streams = NULL; + float **h_sendbuff = NULL; + float **h_recvbuff = NULL; + float **d_sendbuff = NULL; + float **d_recvbuff = NULL; + + printf("Starting NCCL ring communication example\n"); + + // Get number of available CUDA devices + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + + if (num_gpus == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return 1; + } + + if (num_gpus < 2) { + printf("At least 2 GPU are necessary to create inter-GPU traffic\n"); + printf("Found only %d GPU(s) - pattern will be limited\n", num_gpus); + } + + printf("Using %d GPUs for ring communication\n", num_gpus); + + // ======================================================================== + // STEP 2: Prepare 
Data Structures and Device List + // ======================================================================== + + printf("Preparing data structures\n"); + + // Create device list (use all available devices) + int *devices = (int *)malloc(num_gpus * sizeof(int)); + for (int i = 0; i < num_gpus; i++) { + devices[i] = i; + } + + // Allocate communicators, streams, and buffer pointers + comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t)); + streams = (cudaStream_t *)malloc(num_gpus * sizeof(cudaStream_t)); + h_sendbuff = (float **)malloc(num_gpus * sizeof(float *)); + h_recvbuff = (float **)malloc(num_gpus * sizeof(float *)); + d_sendbuff = (float **)malloc(num_gpus * sizeof(float *)); + d_recvbuff = (float **)malloc(num_gpus * sizeof(float *)); + + // ======================================================================== + // STEP 3: Initialize NCCL Communicators + // ======================================================================== + + /* + * ncclCommInitAll is the simplest way to initialize NCCL communicators + * for single-process, multi-GPU scenarios. It automatically: + * - Creates one communicator per GPU + * - Assigns ranks sequentially (GPU 0 = rank 0, GPU 1 = rank 1, etc.) + */ + printf("Initializing NCCL communicators\n"); + NCCLCHECK(ncclCommInitAll(comms, num_gpus, devices)); + printf("All communicators initialized successfully\n"); + + // ======================================================================== + // STEP 4: Create Streams and Verify Communicator Setup + // ======================================================================== + + printf("Creating CUDA streams and verifying setup\n"); + + // Create streams and verify communicator info + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamCreate(&streams[i])); + + // Query communicator information for verification + int rank, size, device; + NCCLCHECK(ncclCommUserRank(comms[i], &rank)); + NCCLCHECK(ncclCommCount(comms[i], &size)); + NCCLCHECK(ncclCommCuDevice(comms[i], &device)); + + printf(" GPU %d -> NCCL rank %d/%d on CUDA device %d\n", i, rank, size, + device); + } + + // ======================================================================== + // STEP 5: Set Up Ring Topology and Allocate Buffers + // ======================================================================== + + printf("Setting up ring topology\n"); + printf("Data flow -> GPU 0 -> ... 
-> GPU %d -> GPU 0\n", num_gpus - 1); + + // Test with 1GB of data + const size_t count = 256 * 1024 * 1024; // 256M floats = 1GB + const size_t size_bytes = count * sizeof(float); + + printf("Ring transfer with %zu elements (%.2f GB per GPU)\n", count, + size_bytes / (1024.0 * 1024.0 * 1024.0)); + + // Allocate buffers for each GPU + printf("Allocating and initializing buffers\n"); + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + + h_sendbuff[i] = (float *)malloc(size_bytes); + h_recvbuff[i] = (float *)malloc(size_bytes); + CUDACHECK(cudaMalloc((void **)&d_sendbuff[i], size_bytes)); + CUDACHECK(cudaMalloc((void **)&d_recvbuff[i], size_bytes)); + + // Initialize data with GPU-specific pattern for verification + for (size_t j = 0; j < count; j++) { + h_sendbuff[i][j] = (float)(i * 1000 + j % 1000); + } + CUDACHECK(cudaMemcpy(d_sendbuff[i], h_sendbuff[i], size_bytes, + cudaMemcpyHostToDevice)); + } + + // ======================================================================== + // STEP 6: Execute Ring Communication Pattern + // ======================================================================== + + /* + * The ring communication uses ncclGroup operations to avoid deadlock. + * Without grouping, if all GPUs tried to send first, they would deadlock + * waiting for receivers. Grouping allows NCCL to execute operations + * in the optimal order. + */ + printf("Executing ring communication\n"); + + // NOTE: ncclGroupStart and ncclGroupEnd are essential to avoid deadlock + // when using ncclCommInitAll! + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < num_gpus; i++) { + int next = (i + 1) % num_gpus; + int prev = (i - 1 + num_gpus) % num_gpus; + printf(" GPU %d sends to GPU %d, receives from GPU %d\n", i, next, prev); + + // Each GPU simultaneously sends to next and receives from previous + NCCLCHECK( + ncclSend(d_sendbuff[i], count, ncclFloat, next, comms[i], streams[i])); + NCCLCHECK( + ncclRecv(d_recvbuff[i], count, ncclFloat, prev, comms[i], streams[i])); + } + NCCLCHECK(ncclGroupEnd()); + + // Synchronize all streams to ensure communication completes + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamSynchronize(streams[i])); + } + + printf("Ring communication completed successfully\n"); + + // ======================================================================== + // STEP 7: Verify Data Correctness and Report Results + // ======================================================================== + + printf("Verifying data correctness\n"); + bool all_correct = true; + + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaMemcpy(h_recvbuff[i], d_recvbuff[i], size_bytes, + cudaMemcpyDeviceToHost)); + + int prev = (i - 1 + num_gpus) % num_gpus; + // Verify that GPU i received data from GPU prev + float expected = (float)(prev * 1000); + bool correct = (h_recvbuff[i][0] == expected); + + printf(" GPU %d received data from GPU %d: %s\n", i, prev, + correct ? 
"CORRECT" : "ERROR"); + + if (!correct) { + all_correct = false; + printf(" Expected %.0f, got %.0f\n", expected, h_recvbuff[i][0]); + } + } + + if (all_correct) { + printf("SUCCESS - All GPUs received correct data\n"); + } else { + printf("FAILURE - Data verification failed\n"); + } + + // ======================================================================== + // STEP 8: Cleanup Resources + // ======================================================================== + + printf("Cleaning up resources\n"); + + // Free buffers + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(devices[i])); + free(h_sendbuff[i]); + free(h_recvbuff[i]); + CUDACHECK(cudaFree(d_sendbuff[i])); + CUDACHECK(cudaFree(d_recvbuff[i])); + } + + // Destroy communicators and streams + for (int i = 0; i < num_gpus; i++) { + NCCLCHECK(ncclCommDestroy(comms[i])); + CUDACHECK(cudaSetDevice(devices[i])); + CUDACHECK(cudaStreamDestroy(streams[i])); + } + + // Free allocated memory + free(devices); + free(comms); + free(streams); + free(h_sendbuff); + free(h_recvbuff); + free(d_sendbuff); + free(d_recvbuff); + + printf("Example completed successfully!\n"); + + return 0; +} diff --git a/examples/02_point_to_point/Makefile b/examples/02_point_to_point/Makefile new file mode 100644 index 000000000..0310c0471 --- /dev/null +++ b/examples/02_point_to_point/Makefile @@ -0,0 +1,47 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# NCCL Fundamental Examples +EXAMPLES = 01_ring_pattern + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test || exit 1; \ + done + +# Help +help: + @echo "NCCL Point to Point Examples" + @echo "============================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_ring_pattern - Use send and receive operations to form a ring pattern" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_ring_pattern" + +.PHONY: all clean test $(EXAMPLES) diff --git a/examples/02_point_to_point/README.md b/examples/02_point_to_point/README.md new file mode 100644 index 000000000..26f1cc555 --- /dev/null +++ b/examples/02_point_to_point/README.md @@ -0,0 +1,65 @@ + + +# NCCL Point-to-Point Communication Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL point-to-point +(P2P) communication patterns on a single node. These examples focus on clarity +and correct communicator usage, resource management, and verification. 
+ +## Examples + +### [01_ring_pattern](01_ring_pattern/) +**Ring Communication Pattern** +- **Pattern**: Circular data flow among all GPUs +- **API**: `ncclCommInitAll` with P2P operations (`ncclSend`/`ncclRecv`) +- **Use case**: Learning P2P communication; pipeline/data movement patterns on a + single node +- **Key features**: + - Initializes all GPUs in a single process + - Computes ring neighbors with modulo arithmetic + - Uses `ncclGroupStart/End` to prevent deadlocks + - Verifies data correctness after transfers + +## Choosing the Right Pattern + +*Scenario* : Pipeline parallel training needs to send data from one GPU to +another +*Addresses* : Individual transfers between two ranks +*Dependencies* : A functional NCCL library and its dependencies + +### Why `ncclCommInitAll` here? +For single-node collective examples we use `ncclCommInitAll` as it creates a +clique of communicators in one call. + +```c +// Initialize all GPUs in one call +ncclComm_t* comms; +int num_gpus; +NCCLCHECK(ncclCommInitAll(comms, num_gpus, NULL)); +``` + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_ring_pattern +``` + +### **Individual Examples** +```shell +# Build and run the ring pattern +cd 01_ring_pattern && make +./ring_pattern +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/03_collectives/01_allreduce/Makefile b/examples/03_collectives/01_allreduce/Makefile new file mode 100644 index 000000000..9972b91cd --- /dev/null +++ b/examples/03_collectives/01_allreduce/Makefile @@ -0,0 +1,57 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = allreduce + +# Source files +SOURCES = main.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ + @echo "Built target $@" + +# Compile source files +%.o: %.cc + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." 
+ @echo "Running with all available GPUs" + ./$(TARGET) + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Allreduce" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/03_collectives/01_allreduce/README.md b/examples/03_collectives/01_allreduce/README.md new file mode 100644 index 000000000..42bf43aad --- /dev/null +++ b/examples/03_collectives/01_allreduce/README.md @@ -0,0 +1,141 @@ + + +# NCCL AllReduce Collective Operation Example + +This example demonstrates the fundamental AllReduce collective operation using +NCCL's single-process, multi-GPU approach in which a single process manages all +GPUs to perform a sum reduction. + +## Overview + +AllReduce combines data from all participants using a reduction operation (sum, +max, min, etc.) and distributes the result to all participants. This example +shows how each GPU contributes its rank value and all GPUs receive the combined +sum using `ncclCommInitAll` for simplified setup. + +## What This Example Does + +1. **Detects available GPUs** and initializes NCCL communicators for all devices + using `ncclCommInitAll` +2. **Initializes data** with each GPU contributing its rank value (GPU 0→0, GPU + 1→1, etc.) +3. **Performs AllReduce sum operation** where all GPU values are summed and + distributed to all participants +4. **Verifies correctness** by checking that all GPUs received the expected sum: + 0+1+2+...+(n-1) + +## Building and Running + +### Build the Example +```bash +cd examples/03_collectives/01_allreduce +make [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run with All Available GPUs +```bash +./allreduce +``` + +### Run with Specific GPUs +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 ./allreduce +``` + +## Code Walk-through + +### Data Initialization +Each GPU sets a send buffer allocated on the GPU to its rank value: +```cpp +float** sendbuff; +float rank_value = (float)i; +size_t size; // size is the number of float to be sent + +// Allocate device memory for send buffers +CUDACHECK(cudaMalloc((void **)&sendbuff[i], size * sizeof(float))); + +// Each GPU contributes its rank (GPU i contributes value i) +// Zero the entire buffer, then set first element to rank +CUDACHECK(cudaMemset(sendbuff[i], 0, size * sizeof(float))); +CUDACHECK(cudaMemcpy(sendbuff[i], &rank_value, sizeof(float), cudaMemcpyHostToDevice)); +``` + +### AllReduce Operation +All GPUs participate in the sum reduction. The operations are evaluated in parallel within a NCCL group to avoid any deadlocks. 
+```cpp +float** recvbuff; +ncclComm_t *comms; // comms are set during ncclCommInitAll +cudaStream_t *streams; // streams are set in cudaStreamCreate + +// Allocate device memory for receive buffers +CUDACHECK(cudaMalloc((void **)&recvbuff[i], size * sizeof(float))); + +NCCLCHECK(ncclGroupStart()); +for (int i = 0; i < num_gpus; i++) { + NCCLCHECK(ncclAllReduce(sendbuff[i], recvbuff[i], size, ncclFloat, + ncclSum, comms[i], streams[i])); +} +NCCLCHECK(ncclGroupEnd()); +``` + +## Expected Output + +``` +Using 4 devices for collective communication +Memory allocated for 4 communicators and streams +NCCL communicators initialized for all devices + Device 0 initialized with data value 0 + Device 1 initialized with data value 1 + Device 2 initialized with data value 2 + Device 3 initialized with data value 3 +Starting collective sum operation across all devices +Collective operation completed +Verifying results (expected sum: 6) + Device 0 correctly received sum: 6 + Device 1 correctly received sum: 6 + Device 2 correctly received sum: 6 + Device 3 correctly received sum: 6 +Example completed successfully! +``` + +## When to Use + +- **Deep learning**: Gradient averaging in data-parallel training +- **Scientific computing**: Global reductions in parallel algorithms +- **Statistics**: Computing global sums, averages, or other reductions +- **Distributed algorithms**: Any scenario requiring collective reduction + operations + +## Key Insights +- `ncclCommInitAll` simplifies single-node multi-GPU setup +- No MPI or pthreads needed for single-node patterns +- Allocate device buffer via ``cudaMalloc` and initialize via `cudaMemset`. +- Best practices to wrap all collective calls in ncclGroupStart/End +- All communication happens in parallel + +## Common Issues and Solutions + +### Issue: Verification failures +**Solution:** Ensure each GPU initializes its buffer correctly with its rank +value. + +### Issue: Out of memory errors +**Solution:** Reduce the buffer size in the code or use fewer GPUs. + +## Error Handling + +This example uses comprehensive error checking with `NCCLCHECK` and `CUDACHECK` +macros that immediately exit on any failure. In production code, consider more +graceful error handling and recovery mechanisms. + +## Next Steps + +After understanding AllReduce, explore: +- **Point-to-point communication**: Examples in `02_point_to_point/` +- **Other collectives**: Implement Broadcast, Reduce, AllGather operations using + this example +- **Multi-node approach**: Use the MPI implementation from `01_communicators` to + send data across nodes. + diff --git a/examples/03_collectives/01_allreduce/main.cc b/examples/03_collectives/01_allreduce/main.cc new file mode 100644 index 000000000..08fef3d20 --- /dev/null +++ b/examples/03_collectives/01_allreduce/main.cc @@ -0,0 +1,201 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include +#include +#include +#include + +/* + * NCCL AllReduce Example - Collective Communication + * + * This example demonstrates the fundamental AllReduce collective operation + * using NCCL's single-process, multi-GPU approach. AllReduce is one of the most + * important collective operations in distributed and parallel computing. 
+ * + * Learning Objectives: + * - Understand AllReduce collective communication pattern + * - Learn NCCL single-process multi-GPU programming model + * - See how data reduction works across multiple devices + * - Practice verification and validation of collective results + * + */ + +// Enhanced error checking macro for NCCL operations +// Provides detailed error information including the failed operation +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +int main(int argc, char *argv[]) { + // ======================================================================== + // STEP 1: Initialize Variables and Detect Available GPUs + // ======================================================================== + + int num_gpus = 0; + ncclComm_t *comms; + cudaStream_t *streams; + float **sendbuff; + float **recvbuff; + + // Get number of CUDA devices + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + if (num_gpus < 1) { + printf("No CUDA devices found\n"); + return EXIT_FAILURE; + } + + printf("Using %d devices for collective communication\n", num_gpus); + + // ======================================================================== + // STEP 2: Allocate Memory for Communicators, Streams, and Data Buffers + // ======================================================================== + + // Allocate arrays for per-device resources, and array of pointers for buffers + comms = (ncclComm_t *)malloc(num_gpus * sizeof(ncclComm_t)); + streams = (cudaStream_t *)malloc(num_gpus * sizeof(cudaStream_t)); + sendbuff = (float **)malloc(num_gpus * sizeof(float *)); + recvbuff = (float **)malloc(num_gpus * sizeof(float *)); + + printf("Memory allocated for %d communicators and streams\n", num_gpus); + + // ======================================================================== + // STEP 3: Initialize NCCL Communicators for All Devices + // ======================================================================== + + // ncclCommInitAll creates communicators for all devices in one call + // This is the simplest way to set up NCCL for single-process applications + NCCLCHECK(ncclCommInitAll(comms, num_gpus, NULL)); + printf("NCCL communicators initialized for all devices\n"); + + // ======================================================================== + // STEP 4: Create CUDA Streams and Allocate Device Memory + // ======================================================================== + + const size_t size = 32 * 1024 * 1024; // 32M floats for demonstration + + for (int i = 0; i < num_gpus; i++) { + // Set device context for each GPU + CUDACHECK(cudaSetDevice(i)); + + // Create stream for asynchronous operations + CUDACHECK(cudaStreamCreate(&streams[i])); + + // Allocate device memory for send and receive buffers + CUDACHECK(cudaMalloc((void **)&sendbuff[i], size * sizeof(float))); + CUDACHECK(cudaMalloc((void **)&recvbuff[i], size * sizeof(float))); + + // Initialize send buffer: zero the entire buffer, then set first element to + // rank + CUDACHECK(cudaMemset(sendbuff[i], 
0, size * sizeof(float))); + float rank_value = (float)i; + CUDACHECK(cudaMemcpy(sendbuff[i], &rank_value, sizeof(float), + cudaMemcpyHostToDevice)); + + printf(" Device %d initialized with data value %d\n", i, i); + } + + // ======================================================================== + // STEP 5: Perform AllReduce Sum Operation + // ======================================================================== + + printf("Starting collective sum operation across all devices\n"); + + // NOTE: ncclGroupStart and ncclGroupEnd are essential to avoid + // deadlock when using ncclCommInitAll and multiple communication calls. + NCCLCHECK(ncclGroupStart()); + for (int i = 0; i < num_gpus; i++) { + // Each device performs combines all contributions and distributes result + NCCLCHECK(ncclAllReduce(sendbuff[i], recvbuff[i], size, ncclFloat, ncclSum, + comms[i], streams[i])); + } + NCCLCHECK(ncclGroupEnd()); + + // Synchronize all streams to ensure completion + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(i)); + CUDACHECK(cudaStreamSynchronize(streams[i])); + } + + printf("Collective operation completed\n"); + + // ======================================================================== + // STEP 6: Verify Results and Validate Correctness + // ======================================================================== + + // Expected result: sum of all ranks = 0 + 1 + 2 + ... + (num_gpus-1) + // Note: We only check the first element since that's all we initialized + float expected = (float)(num_gpus * (num_gpus - 1) / 2); + printf("Verifying results (expected sum: %.0f)\n", expected); + + bool success = true; + for (int i = 0; i < num_gpus; i++) { + float result; + CUDACHECK(cudaSetDevice(i)); + CUDACHECK(cudaMemcpy(&result, recvbuff[i], sizeof(float), + cudaMemcpyDeviceToHost)); + + if (result != expected) { + printf(" Device %d received incorrect result: %.0f (expected %.0f)\n", i, + result, expected); + success = false; + } else { + printf(" Device %d correctly received sum: %.0f\n", i, result); + } + } + + // ======================================================================== + // STEP 7: Cleanup Resources and Report Results + // ======================================================================== + + // Destroy NCCL communicators + for (int i = 0; i < num_gpus; i++) { + ncclCommDestroy(comms[i]); + } + + // Free device memory and destroy streams + for (int i = 0; i < num_gpus; i++) { + CUDACHECK(cudaSetDevice(i)); + CUDACHECK(cudaFree(sendbuff[i])); + CUDACHECK(cudaFree(recvbuff[i])); + CUDACHECK(cudaStreamDestroy(streams[i])); + } + + // Free host memory + free(comms); + free(streams); + free(sendbuff); + free(recvbuff); + + if (success) { + printf("Example completed successfully!\n"); + } else { + printf("Example failed - incorrect results detected\n"); + return EXIT_FAILURE; + } + + return 0; +} diff --git a/examples/03_collectives/Makefile b/examples/03_collectives/Makefile new file mode 100644 index 000000000..c72163cba --- /dev/null +++ b/examples/03_collectives/Makefile @@ -0,0 +1,47 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# NCCL Collective Examples +EXAMPLES = 01_allreduce + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test; \ + done + +# Help +help: + @echo "NCCL Collective Communication Examples" + @echo "=====================================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_allreduce - AllReduce collective operation" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_allreduce" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/03_collectives/README.md b/examples/03_collectives/README.md new file mode 100644 index 000000000..468202380 --- /dev/null +++ b/examples/03_collectives/README.md @@ -0,0 +1,68 @@ + + +# NCCL Collective Communication Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL collective +communication operations on a single node using a single process managing all +GPUs. The focus is clarity, correct resource management, and result +verification. + +## Examples + +### [01_allreduce](01_allreduce/) +**AllReduce Collective Operation** +- **Pattern**: All participants reduce and distribute the result +- **API**: `ncclCommInitAll`, `ncclAllReduce` +- **Use case**: Global reductions in ML and HPC (e.g., gradient averaging) +- **Key features**: + - Initializes all GPUs in a single process + - Each GPU contributes its rank value + - Executes AllReduce sum across all GPUs + - Verifies the expected global sum + +## Choosing the Right Pattern + +*Scenario* : Parallel training needs efficient global communication +*Addresses* : Most commonly used collective algorithms +*Dependencies* : A functional NCCL library and its dependencies + +### Why `ncclCommInitAll` here? +For single-node collective examples we use `ncclCommInitAll` as it creates a +clique of communicators in one call. + +```c +// Initialize all GPUs in one call +ncclComm_t* comms; +int num_gpus; +NCCLCHECK(ncclCommInitAll(comms, num_gpus, NULL)); +``` + +A more advanced setup using MPI to initialize communicators across multiple +nodes is shown in +[01_communicators/03_one_device_per_process_mpi](../01_communicators/03_one_device_per_process_mpi) + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_allreduce +``` + +### **Individual Examples** +```shell +# Build and run AllReduce +cd 01_allreduce && make +./allreduce +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/04_user_buffer_registration/01_allreduce/Makefile b/examples/04_user_buffer_registration/01_allreduce/Makefile new file mode 100644 index 000000000..014d17583 --- /dev/null +++ b/examples/04_user_buffer_registration/01_allreduce/Makefile @@ -0,0 +1,77 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = allreduce_ub + +# Common utilities +COMMON_INC = ../../common/include +COMMON_SRC = ../../common/src + +# Build configuration +INCLUDES += -I$(COMMON_INC) + +# Source files +SOURCES = main.cc $(COMMON_SRC)/utils.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ +else + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@ +endif + @echo "Built target $@" + +# Compile source files +%.o: %.cc +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +else + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +endif + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." +ifeq ($(MPI),1) + @echo "Running with 2 processes" + $(MPIRUN) -np 2 ./$(TARGET) +else + @echo "Running with all available GPUs" + ./$(TARGET) +endif + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: User Buffer Registration Allreduce" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/04_user_buffer_registration/01_allreduce/README.md b/examples/04_user_buffer_registration/01_allreduce/README.md new file mode 100644 index 000000000..01e4b3467 --- /dev/null +++ b/examples/04_user_buffer_registration/01_allreduce/README.md @@ -0,0 +1,163 @@ + + +# NCCL User Buffer Registration AllReduce Example + +This example demonstrates how to use NCCL's user buffer registration feature to +optimize performance for repeated collective operations on the same buffers. +User Buffer Registration is a feature that allows NCCL to directly +send/receive/operate data through the user buffer without extra internal copy +(zero-copy). + +## Overview + +User buffer registration allows NCCL to pre-register memory buffers with +communicators, eliminating registration overhead on each operation. This is +particularly beneficial for applications that repeatedly perform collective +operations on the same memory regions, such as iterative training loops. + +## What This Example Does + +1. **Allocates memory using NCCL allocator** (`ncclMemAlloc`) which is provided + by NCCL as convenience function +2. **Registers buffers with communicator** using `ncclCommRegister` for + optimized performance +3. **Performs AllReduce sum operation** using the registered buffers for + efficient communication + +## Building and Running + +The advanced examples can be built using either pthread or MPI for +parallelization. pthread is the default choice. To use MPI the user needs to +provide a valid MPI installation under `MPI_HOME`. + +### Build +```shell +make [MPI=1] [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run when compiled for pthreads (default) +```shell +[NTHREADS=N] ./allreduce_ub +``` + +### Run when compiled for MPI +```shell +mpirun -np ./allreduce_ub +``` + +## Code Structure + +### Key Components + +1. 
**Buffer Allocation and Registration**: +```c +size_t size_bytes; // Is set to the size of the send/receive buffers +void *d_sendbuff; +void *d_recvbuff; + +// Allocate buffers using ncclMemAlloc (or another qualified allocator) on the device +NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); +NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + +ncclComm_t comm; // comms is set during ncclCommInitRank +void *send_handle; +void *recv_handle; + +// Register buffers with NCCL, handle is returned for De-registration +NCCLCHECK(ncclCommRegister(comm, d_sendbuff, size_bytes, &send_handle)); +NCCLCHECK(ncclCommRegister(comm, d_recvbuff, size_bytes, &recv_handle)); +``` + +2. **AllReduce with Group Operations**: +```c +size_t count; // set to number of floats to exchange +cudaStream_t stream; // stream is set in cudaStreamCreate + +NCCLCHECK(ncclAllReduce(d_sendbuff, d_recvbuff, count, ncclFloat, ncclSum, + comm, stream)); +``` + +3. **Buffer Deregistration and Cleanup**: +```c +// Deregister buffers using handle from ncclCommRegister +NCCLCHECK(ncclCommDeregister(comm, send_handle)); +NCCLCHECK(ncclCommDeregister(comm, recv_handle)); + +// Free buffers allocated with ncclMemAlloc +NCCLCHECK(ncclMemFree(d_sendbuff)); +NCCLCHECK(ncclMemFree(d_recvbuff)); + +``` + +## Expected Output + +### With 4 GPUs (using pthreads/MPI) +``` +Starting AllReduce example with 4 ranks + Rank 0 communicator initialized using device 0 + Rank 1 communicator initialized using device 1 + Rank 2 communicator initialized using device 2 + Rank 3 communicator initialized using device 3 +User Buffer allocation: + Rank 0 allocating 4.00 MB per buffer + Rank 1 allocating 4.00 MB per buffer + Rank 2 allocating 4.00 MB per buffer + Rank 3 allocating 4.00 MB per buffer + Rank 0 data initialized (value: 0) + Rank 1 data initialized (value: 1) + Rank 2 data initialized (value: 2) + Rank 3 data initialized (value: 3) +Starting AllReduce with 1048576 elements (4 MB) +AllReduce completed successfully +Verification - Expected: 6.0, Got: 6.0 +Results verified correctly + Rank 0 buffers deregistered + Rank 1 buffers deregistered + Rank 2 buffers deregistered + Rank 3 buffers deregistered +All resources cleaned up successfully +``` + +## Performance Benefits of User Buffer Registration + +User buffer registration provides several performance advantages: + +1. **Reduced Overhead**: Pre-registration eliminates the need to + register/deregister buffers for each operation +2. **Better Memory Pinning**: Registered buffers are pinned in memory, + preventing page faults +3. **Lower Latency**: Especially beneficial for repeated operations on the same + buffers + +**Important**: Buffers must be allocated with `ncclMemAlloc` or a compatible +allocator for registration to work. See The [General Buffer Registration +](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html#general-buffer-registration) +section of the user guide. + +**Important**: If any rank in a communicator passes registered buffers to a NCCL +communication function, all other ranks in the same communicator must pass their +registered buffers; otherwise, mixing registered and non-registered buffers can +result in undefined behavior. + +## Key Insights + +- **User Buffer Registration** is most beneficial for: + - Large data transfers + - Repeated operations on the same buffers + - Performance-critical applications +- **Memory management** is critical - always deregister buffers before freeing + +## Common Issues and Solutions + +1. 
**Registration Failure**: Buffers MUST be allocated with `ncclMemAlloc` or + another qualified allocator (not `cudaMalloc`) for registration. See [Buffer + Registration](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/bufferreg.html) + section for details. +2. **Allocation Error**: If `ncclMemAlloc` fails, check NCCL version (requires + 2.19.x+) and available memory +3. **Deregistration Order**: Always deregister before freeing memory +4. **Handle Management**: Keep track of registration handles for proper cleanup +5. **Memory Leaks**: Always use `ncclMemFree` for buffers allocated with + `ncclMemAlloc` diff --git a/examples/04_user_buffer_registration/01_allreduce/main.cc b/examples/04_user_buffer_registration/01_allreduce/main.cc new file mode 100644 index 000000000..105b4e68e --- /dev/null +++ b/examples/04_user_buffer_registration/01_allreduce/main.cc @@ -0,0 +1,214 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include "utils.h" +#include +#include +#include +#include +#include +#include + +/* + * NCCL User Buffer Registration AllReduce Example + * + * This example demonstrates how to use NCCL's user buffer registration feature + * to optimize performance for repeated collective operations on the same + * buffers. + * + * Learning Objectives: + * - Learn how to register and deregister buffers with NCCL communicators + * - See the proper lifecycle management of registered buffers + * + */ + +/* + * This function can be called inside an MPI rank or pthread thread. The + * initialization and broadcast are implemented in common/src/utils.cc for + * easier readability. For fully integrated examples using pthreads or MPI see + * examples in 01_communicators. + */ +void *allReduce(int my_rank, int total_ranks, int local_device, + int devices_per_rank) { + + // ======================================================================== + // STEP 1: Initialize NCCL Communicator and Setup + // ======================================================================== + + ncclUniqueId nccl_unique_id; + if (my_rank == 0) { + printf("Starting AllReduce example with %d ranks\n", total_ranks); + NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); + } + + // Distribute unique ID. 
+ // This step ensures all ranks have the same unique ID for communicator + // creation + util_broadcast(0, my_rank, &nccl_unique_id); + + // Set device context for this rank + // Each rank manages its assigned GPU device + CUDACHECK(cudaSetDevice(local_device)); + + // Initialize NCCL communicator + // This creates the communication context for collective operations + ncclComm_t comm; + NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank)); + printf(" Rank %d communicator initialized using device %d\n", my_rank, + local_device); + + // ======================================================================== + // STEP 2: Allocate Memory Using NCCL Allocator + // ======================================================================== + + if (my_rank == 0) { + printf("User Buffer allocation:\n"); + } + // Allocate memory - using larger buffers to demonstrate registration + // benefits + size_t count = 1024 * 1024; // 1M elements + size_t size_bytes = count * sizeof(float); + + printf(" Rank %d allocating %.2f MB per buffer\n", my_rank, + (float)size_bytes / (1024 * 1024)); + + // Allocate buffers using NCCL allocator + // NCCL's allocator can provide optimized memory for communication + void *d_sendbuff; + void *d_recvbuff; + NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); + NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + + // ======================================================================== + // STEP 3: Register Buffers with NCCL Communicator + // ======================================================================== + + // Register the buffers with NCCL + // This is the key optimization - buffers are pre-registered for efficiency + // The handles returned can be used to identify registered buffers + void *send_handle; + void *recv_handle; + NCCLCHECK(ncclCommRegister(comm, d_sendbuff, size_bytes, &send_handle)); + NCCLCHECK(ncclCommRegister(comm, d_recvbuff, size_bytes, &recv_handle)); + + // ======================================================================== + // STEP 4: Initialize Data and Prepare for Communication + // ======================================================================== + + // Initialize data - each rank contributes its rank value + // This creates a simple test pattern for verification + float *h_data = (float *)malloc(size_bytes); + for (size_t i = 0; i < count; i++) { + h_data[i] = (float)my_rank; + } + CUDACHECK(cudaMemcpy(d_sendbuff, h_data, size_bytes, cudaMemcpyHostToDevice)); + printf(" Rank %d data initialized (value: %d)\n", my_rank, my_rank); + + // Create stream for asynchronous operations + // Streams allow overlapping computation and communication + cudaStream_t stream; + CUDACHECK(cudaStreamCreate(&stream)); + + // ======================================================================== + // STEP 5: Perform AllReduce Operation + // ======================================================================== + + if (my_rank == 0) { + printf("Starting AllReduce with %zu elements (%zu MB)\n", count, + size_bytes / (1024 * 1024)); + } + + // Perform AllReduce operation + // Since buffers are registered, this should have optimized performance + NCCLCHECK(ncclAllReduce(d_sendbuff, d_recvbuff, count, ncclFloat, ncclSum, + comm, stream)); + + if (my_rank == 0) { + printf("AllReduce completed successfully\n"); + } + + // ======================================================================== + // STEP 6: Verify Results and Validate Correctness + // ======================================================================== + + 
// Synchronize to ensure completion + CUDACHECK(cudaStreamSynchronize(stream)); + + // Verify results (optional - copy back and check a few elements) + float *h_result = (float *)malloc(sizeof(float) * count); + CUDACHECK(cudaMemcpy(h_result, d_recvbuff, sizeof(float) * count, + cudaMemcpyDeviceToHost)); + + // Each element should be the sum of all ranks + float expected_sum = (float)(total_ranks * (total_ranks - 1)) / 2; + bool all_ok = true; + if (my_rank == 0) { + printf("Verification - Expected: %.1f, Got: %.1f\n", expected_sum, + h_result[0]); + + for (size_t i = 1; i < count; i++) { + if (fabsf(h_result[i] - expected_sum) > 0.001) { + printf(" Results verification failed at index %zu: Expected %.1f, Got " + "%.1f\n", + i, expected_sum, h_result[i]); + all_ok = false; + break; + } + } + + if (all_ok) { + printf("Results verified correctly\n"); + } else { + printf("Results verification failed\n"); + } + } + + // ======================================================================== + // STEP 7: Cleanup and Resource Management + // ======================================================================== + + // Important: Cleanup must happen in the correct order + // 1. Free host memory + // 2. Deregister buffers from communicator + // 3. Free device memory + // 4. Destroy CUDA resources + // 5. Finalize and destroy NCCL communicator + + free(h_data); + free(h_result); + + // Deregister buffers from communicator + // This must happen before freeing the buffers or destroying the + // communicator + NCCLCHECK(ncclCommDeregister(comm, send_handle)); + NCCLCHECK(ncclCommDeregister(comm, recv_handle)); + printf(" Rank %d buffers deregistered\n", my_rank); + + // Free device memory allocated by NCCL + NCCLCHECK(ncclMemFree(d_sendbuff)); + NCCLCHECK(ncclMemFree(d_recvbuff)); + + // Destroy CUDA stream + CUDACHECK(cudaStreamDestroy(stream)); + + // Finalize and destroy NCCL communicator + NCCLCHECK(ncclCommFinalize(comm)); + NCCLCHECK(ncclCommDestroy(comm)); + + if (my_rank == 0) { + printf("All resources cleaned up successfully\n"); + } + + return NULL; +} + +int main(int argc, char *argv[]) { + // Run example using the standard test framework + // This handles MPI/pthread initialization, device assignment, and cleanup + return run_example(argc, argv, allReduce); +} diff --git a/examples/04_user_buffer_registration/Makefile b/examples/04_user_buffer_registration/Makefile new file mode 100644 index 000000000..900074ab4 --- /dev/null +++ b/examples/04_user_buffer_registration/Makefile @@ -0,0 +1,47 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# NCCL User Buffer Examples +EXAMPLES = 01_allreduce + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test; \ + done + +# Help +help: + @echo "NCCL User Buffer Registration Examples" + @echo "======================================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_allreduce - AllReduce collective operation" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_allreduce" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/04_user_buffer_registration/README.md b/examples/04_user_buffer_registration/README.md new file mode 100644 index 000000000..f848cf140 --- /dev/null +++ b/examples/04_user_buffer_registration/README.md @@ -0,0 +1,73 @@ + + +# NCCL User Buffer Registration Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL user buffer +registration for improving performance by allowing NCCL to operate directly on +user-allocated buffers. + +## Examples + +### [01_allreduce](01_allreduce/) +**AllReduce with User Buffer Registration** +- **Pattern**: Register communication buffers once and reuse across operations +- **API**: `ncclCommRegister`, `ncclCommDeregister`, `ncclMemAlloc`, + `ncclAllReduce` +- **Use case**: Repeated collectives on the same buffers; performance-critical + workloads +- **Key features**: + - Buffers allocated via `ncclMemAlloc` for registration compatibility + - Registration handles managed explicitly (register → use → deregister) + - Collective operations executed on registered buffers + - Correct cleanup and verification + +## Choosing the Right Pattern + +*Scenario* : Optimize performance for repeated collectives on same buffers +*Addresses* : Throughput-sensitive training loops +*Dependencies* : pthread or MPI + +### Why Buffer Registration? +Pre-registering buffers eliminates per-call registration overhead and enables +direct access. It can accelerate collectives and greatly reduce the resource +usage (e.g. #channel usage). Also, this is a prerequisite for advanced features +such as symmetric memory or device API calls. 
+ +```c +// Allocate using NCCL convenience function and register buffers +NCCLCHECK(ncclMemAlloc((void**)&d_send, size_bytes)); +NCCLCHECK(ncclCommRegister(comm, d_send, size_bytes, &send_handle)); + +// Use in collectives +NCCLCHECK(ncclAllReduce(d_send, d_recv, count, ncclFloat, ncclSum, comm, stream)); + +// Deregister and free +NCCLCHECK(ncclCommDeregister(comm, send_handle)); +NCCLCHECK(ncclMemFree(d_send)); +``` + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_allreduce +``` + +### **Individual Examples** +```shell +# Build and run AllReduce with user buffer registration +cd 01_allreduce && make +./allreduce_ub +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/05_symmetric_memory/01_allreduce/Makefile b/examples/05_symmetric_memory/01_allreduce/Makefile new file mode 100644 index 000000000..20c8ad9ad --- /dev/null +++ b/examples/05_symmetric_memory/01_allreduce/Makefile @@ -0,0 +1,77 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = allreduce_sm + +# Common utilities +COMMON_INC = ../../common/include +COMMON_SRC = ../../common/src + +# Build configuration +INCLUDES += -I$(COMMON_INC) + +# Source files +SOURCES = main.cc $(COMMON_SRC)/utils.cc +OBJECTS = $(SOURCES:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ +else + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@ +endif + @echo "Built target $@" + +# Compile source files +%.o: %.cc +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +else + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +endif + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." +ifeq ($(MPI),1) + @echo "Running with 2 processes" + $(MPIRUN) -np 2 ./$(TARGET) +else + @echo "Running with all available GPUs" + ./$(TARGET) +endif + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Symmetric Memeory Allreduce" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/05_symmetric_memory/01_allreduce/README.md b/examples/05_symmetric_memory/01_allreduce/README.md new file mode 100644 index 000000000..b307b462a --- /dev/null +++ b/examples/05_symmetric_memory/01_allreduce/README.md @@ -0,0 +1,165 @@ + + +# NCCL Symmetric Memory AllReduce Example + +This example demonstrates how to use NCCL's symmetric memory feature for +optimized collective operations. 
Symmetric memory provides optimized performance +by leveraging consistent memory layouts across all participating ranks, enabling +advanced communication algorithms. + +## Overview + +Symmetric memory windows provide a way to register memory buffers that benefit +from optimized collective operations. When using `NCCL_WIN_COLL_SYMMETRIC`, all +ranks must provide symmetric buffers, enabling optimized communication patterns +and better performance for large-scale multi-GPU operations. + +## What This Example Does + +1. **Allocates memory using NCCL allocator** (`ncclMemAlloc`) which provides + memory compatible with symmetric windows +2. **Registers buffers as symmetric windows** using `ncclCommWindowRegister` + with `NCCL_WIN_COLL_SYMMETRIC` flag +3. **Performs AllReduce sum operation** using the symmetric memory for optimized + communication performance + +## Building and Running + +The advanced examples can be built using either pthread or MPI for +parallelization. pthread is the default choice. To use MPI the user needs to set +`MPI=1` at build time and can optionally provide a valid MPI installation under +`MPI_HOME`. + +### Build +```shell +make [MPI=1] [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run when compiled for pthreads (default) +```shell +[NTHREADS=N] ./allreduce_sm +``` + +### Run when compiled for MPI +```shell +mpirun -np ./allreduce_sm +``` + +## Code Structure + +### Key Components + +1. **Buffer Allocation and Window Registration**: +```c +size_t size_bytes; // Is set to the size of the send/receive buffers +void *d_sendbuff; +void *d_recvbuff; + +// Allocate buffers using ncclMemAlloc (compatible with symmetric memory) +NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); +NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + +ncclComm_t comm; +ncclWindow_t send_win; +ncclWindow_t recv_win; + +// Register buffers as symmetric windows +NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC)); +NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC)); +``` + +2. **AllReduce Operation**: +```c +size_t count; // set to number of floats to exchange +cudaStream_t stream; // stream is set in cudaStreamCreate + +// Perform AllReduce with symmetric memory optimization +NCCLCHECK(ncclAllReduce(d_sendbuff, d_recvbuff, count, ncclFloat, ncclSum, + comm, stream)); +``` + +3. 
**Window Deregistration and Cleanup**:
+```c
+// Deregister symmetric memory windows
+NCCLCHECK(ncclCommWindowDeregister(comm, send_win));
+NCCLCHECK(ncclCommWindowDeregister(comm, recv_win));
+
+// Free buffers allocated with ncclMemAlloc
+NCCLCHECK(ncclMemFree(d_sendbuff));
+NCCLCHECK(ncclMemFree(d_recvbuff));
+```
+
+## Expected Output
+
+### With 4 GPUs (using pthreads/MPI)
+```
+Starting AllReduce example with 4 ranks
+ Rank 0 communicator initialized using device 0
+ Rank 1 communicator initialized using device 1
+ Rank 2 communicator initialized using device 2
+ Rank 3 communicator initialized using device 3
+Symmetric Memory allocation
+ Rank 0 allocating 4.00 MB per buffer
+ Rank 1 allocating 4.00 MB per buffer
+ Rank 2 allocating 4.00 MB per buffer
+ Rank 3 allocating 4.00 MB per buffer
+ Rank 0 data initialized (value: 0)
+ Rank 1 data initialized (value: 1)
+ Rank 2 data initialized (value: 2)
+ Rank 3 data initialized (value: 3)
+Starting AllReduce with 1048576 elements (4 MB)
+AllReduce completed successfully
+Verification - Expected: 6.0, Got: 6.0
+Results verified correctly
+ Rank 0 symmetric memory windows deregistered
+ Rank 1 symmetric memory windows deregistered
+ Rank 2 symmetric memory windows deregistered
+ Rank 3 symmetric memory windows deregistered
+All resources cleaned up successfully
+Example completed - demonstrated symmetric memory lifecycle
+```
+
+## Performance Benefits of Symmetric Memory
+
+Symmetric memory registration provides several performance advantages:
+
+- **Optimized Communication Algorithms**: NCCL can apply advanced optimizations
+  when all ranks have symmetric layouts
+- **Better Memory Access Patterns**: Consistent layouts enable better caching
+  and memory access optimization
+
+For more information on the performance benefits see the [Enabling Fast
+Inference and Resilient Training with NCCL
+2.27](https://developer.nvidia.com/blog/enabling-fast-inference-and-resilient-training-with-nccl-2-27/)
+blog post.
+
+**Important**: Buffers must be allocated using the CUDA Virtual Memory
+Management (VMM) API. NCCL provides the `ncclMemAlloc` convenience function for
+symmetric memory registration. The `NCCL_WIN_COLL_SYMMETRIC` flag requires all
+ranks to provide symmetric buffers consistently.
+
+## Key Insights
+
+- **Symmetric Memory Windows** are most beneficial for:
+  - Large-scale collective operations with consistent memory patterns
+  - Latency-sensitive kernels
+  - Applications with predictable allocation patterns
+- **ncclCommInitRank** can be used in both the pthread and the MPI parallel case
+- **Window registration** must happen on all ranks for collective operations
+- **Memory management** is critical - always deregister windows before freeing
+  memory
+
+## Common Issues and Solutions
+
+1. **Window Registration Failure**: Buffers MUST be allocated with the CUDA
+   Virtual Memory Management (VMM) API, e.g. via `ncclMemAlloc` (not
+   `cudaMalloc`), for symmetric memory.
+2. **Allocation Error**: If `ncclMemAlloc` fails, check the NCCL version
+   (requires at least v2.27) and available memory
+3. **Deregistration Order**: Always deregister windows before freeing memory or
+   destroying communicators
+4. **Symmetric Requirement**: All ranks must use `NCCL_WIN_COLL_SYMMETRIC`
+   consistently in collective operations
+5.
**Memory Leaks**: Always use `ncclMemFree` for buffers allocated with + `ncclMemAlloc` diff --git a/examples/05_symmetric_memory/01_allreduce/main.cc b/examples/05_symmetric_memory/01_allreduce/main.cc new file mode 100644 index 000000000..f2cda155e --- /dev/null +++ b/examples/05_symmetric_memory/01_allreduce/main.cc @@ -0,0 +1,220 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include "utils.h" +#include +#include +#include +#include +#include +#include + +/* + * NCCL Symmetric Memory AllReduce Example + * + * This example demonstrates how to use NCCL's symmetric memory feature + * for collective operations. Symmetric memory provides optimized performance + * by leveraging consistent memory layouts across all participating ranks. + * + * Learning Objectives: + * - Learn how to register symmetric memory windows with NCCL communicators + * - See the proper lifecycle management of symmetric memory + * + */ + +/* + * This function can be called inside an MPI rank or pthread thread. The + * initialization and broadcast are implemented in common/src/utils.cc for + * easier readability. For fully integrated examples using pthreads or MPI see + * examples in 01_communicators. + */ +void *allReduce(int my_rank, int total_ranks, int local_device, + int devices_per_rank) { + + // ======================================================================== + // STEP 1: Initialize NCCL Communicator and Setup + // ======================================================================== + + ncclUniqueId nccl_unique_id; + if (my_rank == 0) { + printf("Starting AllReduce example with %d ranks\n", total_ranks); + NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); + } + + // Distribute unique ID. 
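+  // util_broadcast() is a small helper from examples/common: in MPI builds it
+  // wraps MPI_Bcast, in the pthread build it shares the ID through a pthread
+  // barrier (see common/src/utils.cc).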
+ // This step ensures all ranks have the same unique ID for communicator + // creation + util_broadcast(0, my_rank, &nccl_unique_id); + + // Set device context for this rank + // Each rank manages its assigned GPU device + CUDACHECK(cudaSetDevice(local_device)); + + // Initialize NCCL communicator + // This creates the communication context for collective operations + ncclComm_t comm; + NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank)); + printf(" Rank %d communicator initialized using device %d\n", my_rank, + local_device); + + // ======================================================================== + // STEP 2: Allocate Memory Using NCCL Allocator + // ======================================================================== + + if (my_rank == 0) { + printf("Symmetric Memory allocation\n"); + } + // Allocate memory - using larger buffers to demonstrate symmetric memory + // benefits + size_t count = 1024 * 1024; // 1M elements + size_t size_bytes = count * sizeof(float); + + printf(" Rank %d allocating %.2f MB per buffer\n", my_rank, + (float)size_bytes / (1024 * 1024)); + + float *h_data = (float *)malloc(size_bytes); + + // Allocate buffers using NCCL allocator + // NCCL's allocator is compatible with symmetric memory layouts + void *d_sendbuff; + void *d_recvbuff; + NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); + NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + + // ======================================================================== + // STEP 3: Register Symmetric Memory Windows + // ======================================================================== + + /* Passing NCCL_WIN_COLL_SYMMETRIC requires users to provide the symmetric + * buffers among all ranks in collectives. + * Every rank needs to call ncclCommWindowRegister to register its buffers. 
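+   * Registration is a collective operation: all ranks must register windows of
+   * the same size before the buffers are used in a collective call.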
+ */ + + // Register symmetric memory windows with NCCL + ncclWindow_t send_win; + ncclWindow_t recv_win; + NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, + NCCL_WIN_COLL_SYMMETRIC)); + NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, + NCCL_WIN_COLL_SYMMETRIC)); + + // ======================================================================== + // STEP 4: Initialize Data and Prepare for Communication + // ======================================================================== + + // Initialize data - each rank contributes its rank value + // This creates a simple test pattern for verification + for (size_t i = 0; i < count; i++) { + h_data[i] = (float)my_rank; + } + CUDACHECK(cudaMemcpy(d_sendbuff, h_data, size_bytes, cudaMemcpyHostToDevice)); + printf(" Rank %d data initialized (value: %d)\n", my_rank, my_rank); + + // Create stream for asynchronous operations + // Streams allow overlapping computation and communication + cudaStream_t stream; + CUDACHECK(cudaStreamCreate(&stream)); + + // ======================================================================== + // STEP 5: Perform AllReduce Operation + // ======================================================================== + + if (my_rank == 0) { + printf("Starting AllReduce with %zu elements (%zu MB)\n", count, + size_bytes / (1024 * 1024)); + } + + // Perform AllReduce operation + // Since symmetric memory is registered, NCCL can apply optimized algorithms + NCCLCHECK(ncclAllReduce(d_sendbuff, d_recvbuff, count, ncclFloat, ncclSum, + comm, stream)); + + if (my_rank == 0) { + printf("AllReduce completed successfully\n"); + } + + // ======================================================================== + // STEP 6: Verify Results and Validate Correctness + // ======================================================================== + + // Synchronize to ensure completion + CUDACHECK(cudaStreamSynchronize(stream)); + + // Verify results (optional - copy back and check) + float *h_result = (float *)malloc(size_bytes); + CUDACHECK(cudaMemcpy(h_result, d_recvbuff, size_bytes, + cudaMemcpyDeviceToHost)); + + // Each element should be the sum of all ranks + float expected_sum = (float)(total_ranks * (total_ranks - 1)) / 2; + bool all_ok = true; + if (my_rank == 0) { + printf("Verification - Expected: %.1f, Got: %.1f\n", expected_sum, + h_result[0]); + + for (size_t i = 1; i < count; i++) { + if (fabsf(h_result[i] - expected_sum) > 0.001) { + printf(" Results verification failed at index %zu: Expected %.1f, Got " + "%.1f\n", i, expected_sum, h_result[i]); + all_ok = false; + break; + } + } + + if (all_ok) { + printf("Results verified correctly\n"); + } else { + printf("Results verification failed\n"); + } + } + + // ======================================================================== + // STEP 7: Cleanup and Resource Management + // ======================================================================== + + // Important: Cleanup must happen in the correct order + // 1. Free host memory + // 2. Deregister symmetric memory windows + // 3. Free device memory + // 4. Destroy CUDA resources + // 5. 
Finalize and destroy NCCL communicator + + free(h_data); + free(h_result); + + // Deregister symmetric memory windows from communicator + // This must happen before freeing the buffers or destroying the + // communicator + NCCLCHECK(ncclCommWindowDeregister(comm, send_win)); + NCCLCHECK(ncclCommWindowDeregister(comm, recv_win)); + printf(" Rank %d symmetric memory windows deregistered\n", my_rank); + + // Free device memory allocated by NCCL + NCCLCHECK(ncclMemFree(d_sendbuff)); + NCCLCHECK(ncclMemFree(d_recvbuff)); + + // Destroy CUDA stream + CUDACHECK(cudaStreamDestroy(stream)); + + // Finalize and destroy NCCL communicator + NCCLCHECK(ncclCommFinalize(comm)); + NCCLCHECK(ncclCommDestroy(comm)); + + if (my_rank == 0) { + printf("All resources cleaned up successfully\n"); + printf("Example completed - demonstrated symmetric memory lifecycle\n"); + } + + return NULL; +} + +int main(int argc, char *argv[]) { + // Run example using the standard test framework + // This handles MPI/pthread initialization, device assignment, and cleanup + return run_example(argc, argv, allReduce); +} diff --git a/examples/05_symmetric_memory/Makefile b/examples/05_symmetric_memory/Makefile new file mode 100644 index 000000000..c2c5ce506 --- /dev/null +++ b/examples/05_symmetric_memory/Makefile @@ -0,0 +1,47 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# NCCL Shared Memory Examples +EXAMPLES = 01_allreduce + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test; \ + done + +# Help +help: + @echo "NCCL Symmetric Memeory Examples" + @echo "===============================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_allreduce - AllReduce collective operation" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_allreduce" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/05_symmetric_memory/README.md b/examples/05_symmetric_memory/README.md new file mode 100644 index 000000000..936ce4b78 --- /dev/null +++ b/examples/05_symmetric_memory/README.md @@ -0,0 +1,72 @@ + + +# NCCL Symmetric Memory Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL symmetric memory +windows for improving performance of collective operations when all ranks use +consistent memory layouts. 
+ +## Examples + +### [01_allreduce](01_allreduce/) +**AllReduce with Symmetric Memory Windows** +- **Pattern**: Register symmetric windows per rank and use them for collectives +- **API**: `ncclCommWindowRegister`, `ncclCommWindowDeregister`, `ncclMemAlloc`, + `ncclAllReduce` +- **Use case**: Large-scale collectives with consistent buffer layouts across + ranks +- **Key features**: + - Buffers allocated via `ncclMemAlloc` for symmetric compatibility + - Windows registered as `NCCL_WIN_COLL_SYMMETRIC` + - Collective operations executed on symmetric windows + - Correct deregistration and cleanup + +## Choosing the Right Pattern + +*Scenario* : Large-scale training with consistent memory patterns +*Addresses* : Low-latency, high-bandwidth collectives on supported systems +*Dependencies* : pthread or MPI + +### Why Symmetric Windows? +Symmetric windows enable NCCL to apply optimized collective protocols when all +ranks use consistent layouts. The memory needs to be allocated through the CUDA +Virtual Memory Management (VMM) API and registered with NCCL. + +```c +// Allocate using NCCL provided convenience function and register symmetric windows +NCCLCHECK(ncclMemAlloc(&buffer, size_bytes)); +NCCLCHECK(ncclCommWindowRegister(comm, buffer, size_bytes, &win, NCCL_WIN_COLL_SYMMETRIC)); + +// Collective using symmetric windows +NCCLCHECK(ncclAllReduce(buffer, buffer, count, ncclFloat, ncclSum, comm, stream)); + +// Deregister and free +NCCLCHECK(ncclCommWindowDeregister(comm, win)); +NCCLCHECK(ncclMemFree(buffer)); +``` + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_allreduce +``` + +### **Individual Examples** +```shell +# Build and run AllReduce with symmetric windows +cd 01_allreduce && make +./allreduce_sm +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/06_device_api/01_allreduce/Makefile b/examples/06_device_api/01_allreduce/Makefile new file mode 100644 index 000000000..60b21c8bc --- /dev/null +++ b/examples/06_device_api/01_allreduce/Makefile @@ -0,0 +1,81 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = allreduce_device_api + +# Common utilities +COMMON_INC = ../../common/include +COMMON_SRC = ../../common/src + +# Build configuration +INCLUDES += -I$(COMMON_INC) + +# Source files +SOURCES = main.cu $(COMMON_SRC)/utils.cc +OBJECTS = $(SOURCES:.cu=.o) +OBJECTS := $(OBJECTS:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ +else + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@ +endif + @echo "Built target $@" + +# Compile source files +%.o: %.cu + $(NVCC) $(NVCUFLAGS) $(INCLUDES) -c $< -o $@ + +%.o: %.cc +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +else + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +endif + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." 
+ifeq ($(MPI),1) + @echo "Running with 2 processes" + $(MPIRUN) -np 2 ./$(TARGET) +else + @echo "Running with all available GPUs" + ./$(TARGET) +endif + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Device API Allreduce" + @echo "==============================================" + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/06_device_api/01_allreduce/README.md b/examples/06_device_api/01_allreduce/README.md new file mode 100644 index 000000000..b403c6321 --- /dev/null +++ b/examples/06_device_api/01_allreduce/README.md @@ -0,0 +1,218 @@ + + +# NCCL Device API AllReduce Example + +This example shows how to implement AllReduce sum operation directly in a kernel +using the NCCL device API. We first create a device communicator with +`ncclDevCommCreate` to enable kernel-initiated communication. After that, +device-side synchronization is performed with barriers and symmetric memory +windows are used to enable Load Store Accessible (LSA) memory access of peers. + +## Overview + +This example shows how to implement AllReduce sum operation using a GPU kernel +that directly performs the collective operations. The device communicators are +created with `ncclDevCommCreate` and device-side synchronization is ensured with +Load Store Accessible (LSA) barriers. LSA windows are used for peer memory +access. + +## What This Example Does + +1. **Creates device communicators** using `ncclDevCommCreate` for GPU kernel + access to NCCL operations +2. **Registers symmetric memory windows** with `ncclCommWindowRegister` for + direct peer-to-peer access +3. **Launches GPU kernel** that performs AllReduce sum operation entirely on + device using LSA barriers + +## Building and Running + +The advanced examples can be built using either pthread or MPI for +parallelization. pthread is the default choice. To use MPI the user needs to set +`MPI=1` at build time and can optionally provide a valid MPI installation under +`MPI_HOME`. + +### Build +```shell +make [MPI=1] [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run when compiled for pthreads (default) +```shell +[NTHREADS=N] ./allreduce_device_api +``` + +### Run when compiled for MPI +```shell +mpirun -np ./allreduce_device_api +``` + +## Code Walk-through + +### Device Communicator Creation (Host-side) +The `ncclDevComm` is the core component of the device API, enabling GPU kernels +to perform inter-GPU communication and fuse computation with communication. The +`ncclDevCommRequirements` specifies what resources the device communicator +should allocate. In this example, we set `lsaBarrierCount` to match our thread +block count, giving each block its own barrier for independent cross-GPU +synchronization. + +```cpp +ncclDevComm devComm; +ncclDevCommRequirements reqs; +// Allocate one barrier per CTA we intend to launch +reqs.lsaBarrierCount = NCCL_DEVICE_CTA_COUNT; + +// Create device communicator with LSA barrier support +NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm)); +``` + +### Memory Window Registration (Host-side) +The device API requires symmetric memory windows registered using +`NCCL_WIN_COLL_SYMMETRIC`. 
See the [symmetric memory +example](../../05_symmetric_memory/) for allocation and requirements details. + +```cpp +ncclComm_t comm; +void* d_sendbuff; +void* d_recvbuff; +ncclWindow_t send_win; +ncclWindow_t recv_win; + +// Register symmetric windows for device-side peer access +NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC)); +NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC)); +``` + +### LSA Barriers (Device-side) +LSA barriers enable cross-GPU synchronization from device code. Each thread +block uses `blockIdx.x` to select its dedicated barrier, allowing blocks to +progress independently while coordinating with corresponding blocks on other +GPUs. + +```cpp +// LSA barriers enable coordination between GPU threads across different ranks +// This ensures all ranks reach the same synchronization point before proceeding +ncclLsaBarrierSession bar { + ncclCoopCta(), // Barrier scope: entire CTA (thread block) + devComm, ncclTeamLsa(devComm), devComm.lsaBarrier, + blockIdx.x // Barrier index: matches our CTA index (0 to lsaBarrierCount-1) +}; +bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + +// ... + +// Release barrier ensures that we received data from everyone before we unblock the stream and allow the next kernel(s) to process the data. +// Critical for correctness in device-side collective operations +bar.sync(ncclCoopCta(), cuda::memory_order_release); +``` +### Memory Access (Device-side) +`ncclGetLsaPointer` allows CUDA kernels to directly access other GPUs' memory +within the LSA team. + +```cpp +// Access peer memory directly using LSA (Load/Store Accessible) pointers +float* peerPtr = (float*)ncclGetLsaPointer(sendwin, sendoffset, peer); +``` + +## Expected Output + +``` +Starting Device API AllReduce initialization + Rank 0 using GPU device 0 + Rank 1 using GPU device 1 + Rank 2 using GPU device 2 + Rank 3 using GPU device 3 + Rank 0 initialized NCCL communicator for 4 total ranks + Rank 1 initialized NCCL communicator for 4 total ranks + Rank 2 initialized NCCL communicator for 4 total ranks + Rank 3 initialized NCCL communicator for 4 total ranks + Rank 0 initialized data with value 0 + Rank 1 initialized data with value 1 + Rank 2 initialized data with value 2 + Rank 3 initialized data with value 3 + Rank 0 created device communicator with 16 LSA barriers + Rank 1 created device communicator with 16 LSA barriers + Rank 2 created device communicator with 16 LSA barriers + Rank 3 created device communicator with 16 LSA barriers +Starting AllReduce with 1048576 elements (4 MB) using Device API +Expected result: sum of ranks 0 to 3 = 6 per element + Rank 0 completed AllReduce kernel execution + Rank 1 completed AllReduce kernel execution + Rank 2 completed AllReduce kernel execution + Rank 3 completed AllReduce kernel execution +AllReduce completed. 
Result verification: PASSED +All elements correctly sum to 6 (ranks 0-3) +``` + +## When to Use + +- **Kernel-level communication**: When compute kernels need immediate access to + communication results +- **Low-latency scenarios**: Reduced host-device synchronization overhead +- **Custom collectives**: Implementing specialized reduction or communication + patterns +- **Iterative algorithms**: Repeated communication with minimal CPU involvement + +## Performance Considerations + +**Advantages:** +- Lower latency for small to medium message sizes +- Eliminates host-device synchronization bottlenecks +- Enables computation-communication fusion within kernels +- Direct peer memory access without CPU copying + +**Disadvantages:** +- More complex programming model requiring LSA barriers +- Requires careful memory ordering and synchronization +- Higher development complexity compared to host API +- CUDA Compute Capability 7.0+ and GPUs with P2P support (e.g., NVLink or PCI) + required. + +## Common Issues and Solutions + +### Issue: NCCL warning communicator does not support symmetric memory +NCCL selects support for symmetric memory operations based on GPU connectivity. +If the GPUs on a node are only connected through e.g. the inter-CPU link, +symmetric memory will not be supported. **Solution:** Use `nvidia-smi` to +identify and select a subset of GPUs (e.g. via `CUDA_VISIBLE_DEVICES`) connected +through NVlink or PCIe. + +### Issue: LSA barrier synchronization failures +**Solution:** Ensure `lsaBarrierCount` matches the number of thread blocks in +kernel launch configuration. + +### Issue: Memory access violations in device kernel +**Solution:** Verify memory windows are registered as `NCCL_WIN_COLL_SYMMETRIC` +and all ranks use identical buffer sizes. + +### Issue: Incomplete results or race conditions +**Solution:** Use proper memory ordering in LSA barriers +(`cuda::memory_order_relaxed` vs `cuda::memory_order_release`). + +## Performance Notes + +- These are educational examples, not optimized for performance +- Real implementations should use vectorization, loop unrolling, and memory + coalescing +- Consider NCCL's optimized device kernels for best practices related to + performance + - NCCL library implementation of device kernels for collective operations + - NCCL perf tests implementations of optimized device kernels + +## Error Handling + +The example uses comprehensive error checking for both CUDA and NCCL operations. +Device kernels should implement proper error handling for LSA operations and +memory access patterns. + +## Next Steps + +After understanding this example, explore: +- **Custom reduction operations**: Implement non-standard reduction patterns +- **Mixed host-device patterns**: Combine host and device API for complex + workflows +- **Performance optimization**: Fine-tune LSA barrier usage and memory access + patterns diff --git a/examples/06_device_api/01_allreduce/main.cu b/examples/06_device_api/01_allreduce/main.cu new file mode 100644 index 000000000..4eafe6cc1 --- /dev/null +++ b/examples/06_device_api/01_allreduce/main.cu @@ -0,0 +1,251 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include "nccl_device.h" +#include "utils.h" +#include +#include +#include +#include +#include + +/* + * NCCL Device API AllReduce Example + * + * This example demonstrates NCCL's Device API, which enables GPU kernels to + * directly interact with NCCL without CPU intervention. This is particularly + * powerful for applications that need to perform communication + * from within CUDA kernels. + * + * Learning Objectives: + * - Understand NCCL Device API vs Host API differences + * - Learn how to register memory windows for device-side access + * - See how GPU kernels can perform collective operations directly + * - Practice LSA (Load Store Access) barrier synchronization + * + * Key Device API Concepts: + * - ncclDevComm: Device-side communicator for kernel use + * - ncclWindow_t: Memory windows that enable direct peer access + * - LSA barriers: Synchronization primitives for device-side coordination + * - ncclGetLsaPointer: Direct access to peer memory from device code + * + * When to Use Device API: + * - Compute kernels that need immediate communication results + * - Fusion of computation and communication in a single kernel + * - Reduced host-device synchronization overhead + * - Custom collective operations not available in standard NCCL + * + * Performance Considerations: + * - Lower latency than host API for small operations + * - Enables computation-communication overlap within kernels + * - Requires careful synchronization and memory ordering + * - LSA barriers add coordination overhead but enable correctness + */ + +// Device API kernel launch configuration +// CTA count must match lsaBarrierCount for proper barrier synchronization +#define NCCL_DEVICE_CTA_COUNT 16 +#define NCCL_DEVICE_THREADS_PER_CTA 512 + +// ========================================================================== +// Device Kernel Implementation +// ========================================================================== + +// Device kernel that performs AllReduce sum operation +// This kernel demonstrates direct NCCL communication from GPU threads +__global__ void simpleAllReduceKernel(ncclWindow_t sendwin, size_t sendoffset, + ncclWindow_t recvwin, size_t recvoffset, + size_t count, int root, struct ncclDevComm devComm) { + // LSA barriers enable coordination between GPU threads across different ranks + // Barrier scope: CTA (all threads in this block participate) + // Barrier index: blockIdx.x selects this CTA's dedicated barrier (one barrier per CTA) + ncclLsaBarrierSession bar { ncclCoopCta(), devComm, ncclTeamLsa(devComm), + devComm.lsaBarrier, blockIdx.x }; + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed); + + const int rank = devComm.rank, nRanks = devComm.nRanks; + + // We are going to spread the workload accross all GPU ranks. + // So calculate the global thread ID accross all ranks. 
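+  // For example, with 4 ranks and the launch configuration above (16 CTAs of
+  // 512 threads), rank 1 / blockIdx.x 0 / threadIdx.x 0 gets
+  // globalTid = 0 + 512*(1 + 0*4) = 512 and globalNthreads = 512*16*4 = 32768,
+  // so it handles elements 512, 33280, ...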
+  // This maps global threads to elements of the data to be reduced
+  const int globalTid = threadIdx.x + blockDim.x * (rank + blockIdx.x * nRanks);
+  const int globalNthreads = blockDim.x * gridDim.x * nRanks;
+
+  // Grid-stride loop over all elements using all global threads
+  for (size_t offset = globalTid; offset < count; offset += globalNthreads) {
+    float v = 0;
+    // Access remote (and local [peer==rank]) memory and reduce locally
+    for (int peer = 0; peer < nRanks; peer++) {
+      float* peerPtr = (float*)ncclGetLsaPointer(sendwin, sendoffset, peer);
+      v += peerPtr[offset];
+    }
+    // Write the reduced value into every rank's receive buffer so that all
+    // ranks end up with the complete result
+    for (int peer = 0; peer < nRanks; peer++) {
+      float* recvPtr = (float*)ncclGetLsaPointer(recvwin, recvoffset, peer);
+      recvPtr[offset] = v;
+    }
+  }
+
+  // Release barrier ensures that we received data from everyone before we
+  // unblock the stream and allow the next kernel(s) to process the data.
+  bar.sync(ncclCoopCta(), cuda::memory_order_release);
+}
+
+  // Launch the AllReduce kernel - the grid size must match lsaBarrierCount
+  simpleAllReduceKernel<<<NCCL_DEVICE_CTA_COUNT, NCCL_DEVICE_THREADS_PER_CTA,
+                          0, stream>>>(
+      send_win, 0, recv_win, 0, count, 0, devComm);
+
+  // Wait for completion - kernel performs AllReduce.
+  CUDACHECK(cudaStreamSynchronize(stream));
+  printf(" Rank %d completed AllReduce kernel execution\n", my_rank);
+
+  // ==========================================================================
+  // STEP 7: Verify Results and Cleanup Resources
+  // ==========================================================================
+
+  // Verify results by copying back and checking
+  CUDACHECK(cudaMemcpy(h_data, d_recvbuff, size_bytes, cudaMemcpyDeviceToHost));
+  float expected = (float)((total_ranks * (total_ranks - 1)) / 2);
+  bool success = true;
+  for (size_t i = 0; i < count; i++) {
+    if (h_data[i] != expected) {
+      success = false;
+      break;
+    }
+  }
+
+  if (my_rank == 0) {
+    printf("AllReduce completed. Result verification: %s\n",
+           success ? "PASSED" : "FAILED");
+    if (success) {
+      printf("All elements correctly sum to %.0f (ranks 0-%d)\n",
+             expected, total_ranks - 1);
+    }
+  }
+
+  // Cleanup resources in proper order
+  free(h_data);
+
+  // Device API specific cleanup
+  NCCLCHECK(ncclDevCommDestroy(comm, &devComm));
+  NCCLCHECK(ncclCommWindowDeregister(comm, send_win));
+  NCCLCHECK(ncclCommWindowDeregister(comm, recv_win));
+  NCCLCHECK(ncclMemFree(d_sendbuff));
+  NCCLCHECK(ncclMemFree(d_recvbuff));
+
+  // Standard NCCL cleanup
+  CUDACHECK(cudaStreamDestroy(stream));
+  NCCLCHECK(ncclCommFinalize(comm));
+  NCCLCHECK(ncclCommDestroy(comm));
+
+  return NULL;
+}
+
+int main(int argc, char* argv[]) {
+  // Run example using the provided utility framework
+  return run_example(argc, argv, allReduce);
+}
diff --git a/examples/06_device_api/Makefile b/examples/06_device_api/Makefile
new file mode 100644
index 000000000..c217592f8
--- /dev/null
+++ b/examples/06_device_api/Makefile
@@ -0,0 +1,47 @@
+#
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# +# See LICENSE.txt for license information +# + +# NCCL Shared Memory Examples +EXAMPLES = 01_allreduce + +# Default target +all: $(EXAMPLES) + +# Build individual examples +$(EXAMPLES): + $(MAKE) -C $@ + +# Clean all build artifacts +clean: + for example in $(EXAMPLES); do \ + $(MAKE) -C $$example clean; \ + done + +# Test all examples +test: all + for example in $(EXAMPLES); do \ + echo "Testing $$example..."; \ + $(MAKE) -C $$example test; \ + done + +# Help +help: + @echo "NCCL Device API Examples" + @echo "========================" + @echo "" + @echo "Targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Test all examples" + @echo " help - Show this help" + @echo "" + @echo "Examples:" + @echo " 01_allreduce - AllReduce collective operation" + @echo "" + @echo "To build/run individual examples:" + @echo " make -C 01_allreduce" + +.PHONY: all clean test help $(EXAMPLES) diff --git a/examples/06_device_api/README.md b/examples/06_device_api/README.md new file mode 100644 index 000000000..3a5e17ae5 --- /dev/null +++ b/examples/06_device_api/README.md @@ -0,0 +1,70 @@ + + +# NCCL Device API Examples + +## Overview +This directory contains minimal examples that demonstrate NCCL's device API, +enabling users to perform inter-GPU communication within their own kernels. + +## Examples + +### [01_allreduce](01_allreduce/) +**AllReduce with Device Kernel Implementation** +- **Pattern**: GPU kernel performs collectives using device communicators +- **API**: `ncclDevCommCreate`, `ncclCommWindowRegister`, device-side LSA + barriers, `ncclAllReduce` +- **Use case**: Allreduce operations with custom operations, fusing allreduce + operation with previous/next compute operation. +- **Key features**: + - Device communicator creation with LSA barrier support + - Symmetric memory windows for peer memory access + - Device kernels coordinating via LSA barriers + - Host launches kernel; kernel performs AllReduce on-device + +## Choosing the Right Pattern + +*Scenario* : Custom kernels fusing computation and communication. +*Addresses* : Schedule communication from inside a CUDA kernel. +*Dependencies* : pthread or MPI + +### Why the Device API? 
+The device API allows NCCL communication within CUDA kernels, fusing communication and computation steps: +```cpp +// Host: +// 1) Create device communicator + requirements +// 2) Register symmetric memory window for peer access +ncclDevComm devComm; ncclDevCommRequirements reqs{}; +reqs.lsaBarrierCount = NCCL_DEVICE_CTA_COUNT; +NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm)); +NCCLCHECK(ncclCommWindowRegister(comm, buffer, size, &win, NCCL_WIN_COLL_SYMMETRIC)); + +// Device: +// - Use barriers for cross-GPU synchronization +// - Access peers via symmetric window (LSA pointers) +myAllReduceKernel<<>>(win, devComm); +``` + +## Building + +### **Quick Start** +```shell +# Build example by directory name +make 01_allreduce +``` + +### **Individual Examples** +```shell +# Build and run the device API AllReduce +cd 01_allreduce && make +./allreduce_device_api +``` + +## References +- [NCCL User Guide: + Examples](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html) +- [NCCL API + Reference](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api.html) +- [CUDA Programming + Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/) diff --git a/examples/Makefile b/examples/Makefile new file mode 100644 index 000000000..9a9cd1b3d --- /dev/null +++ b/examples/Makefile @@ -0,0 +1,54 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# NCCL Examples Main Makefile + +# Define all category directories +CATEGORIES := 01_communicators 02_point_to_point 03_collectives 04_user_buffer_registration 05_symmetric_memory 06_device_api + +# Default target +all: $(CATEGORIES) + +# Build all categories +$(CATEGORIES): + @echo "Building $@..." + @$(MAKE) -C $@ + +# Clean all categories +clean: + @echo "Cleaning all examples..." + @for category in $(CATEGORIES); do \ + $(MAKE) -C $$category clean; \ + done + +# Test all examples +test: all + @echo "Testing all examples..." + @for category in $(CATEGORIES); do \ + $(MAKE) -C $$category test; \ + done + +# Install all examples +install: all + @echo "Installing all examples..." + @for category in $(CATEGORIES); do \ + $(MAKE) -C $$category install; \ + done + +# Help target +help: + @echo "NCCL Examples Main Makefile" + @echo "===========================" + @echo "" + @echo "Available targets:" + @echo " all - Build all examples" + @echo " clean - Clean all build artifacts" + @echo " test - Build and test all examples" + @echo " install - Install all examples" + @echo " help - Show this help message" + @echo "" + +.PHONY: all clean test install help $(CATEGORIES) diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..f491eab5a --- /dev/null +++ b/examples/README.md @@ -0,0 +1,146 @@ + + +# NCCL Library Examples + +Welcome to the NCCL examples directory. This collection of NCCL (NVIDIA +Collective Communications Library) examples is designed to teach developers how +to effectively use NCCL in their applications. The examples progress from basic +concepts to advanced usage patterns, with each example featuring a detailed +README file. The APIs and features covered here are far from the complete set of +what NCCL provides. The [NCCL +Documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html) +includes a detailed description of NCCL features and APIs. + +These examples showcase individual features but are not intended to maximize the +performance for an individual communication pattern. 
For a performance-oriented implementation please refer to the
+[nccl-tests](https://github.com/NVIDIA/nccl-tests/) GitHub repository.
+
+## Basic Examples
+We start with the most basic NCCL operations. All examples in this section are
+self-contained, meaning you can copy-paste one file and compile it on its own.
+The only dependencies are the NCCL library itself and, in the MPI case, an MPI
+implementation. These templates are aimed at new users coming up to speed with
+NCCL for GPU communication.
+
+### [Communicators](01_communicators/)
+
+This section teaches you how to create, test, and destroy a communicator. We
+have provided 3 examples using a single thread, multiple threads, and multiple
+processes. This section shows the different options for launching an NCCL
+application.
+
+### [Point to Point](02_point_to_point/)
+
+This sample send/recv implementation uses point-to-point communication to
+pass data around a simple ring.
+
+### [Collectives](03_collectives/)
+
+This sample implementation shows the most basic NCCL collective communication
+call.
+
+## Advanced Features
+
+These examples are intended for experienced users looking for best practices to
+use a specific feature. For complete end-to-end templates please use the basic
+examples.
+
+Since NCCL does not include its own launcher, we have provided two popular
+bootstrap mechanisms. By default these examples will be launched as separate
+threads, one thread per GPU. Users can set `MPI=1` to build an MPI parallel
+version which can run across multiple compute nodes. Users can optionally
+provide a valid MPI installation under `MPI_HOME`.
+
+Each example can be run individually. By default you can run each executable via
+```
+[NTHREADS=<num_threads>] ./<executable>
+```
+
+If `NTHREADS` is unset, the examples will use the number of visible GPUs as the
+number of threads. If the applications are built with `MPI` support, you can run
+each executable as
+
+```
+mpirun -np <num_processes> ./<executable>
+```
+
+To ease the readability of these examples we have moved the bootstrap and
+broadcast part to the [common](common/) directory. Completely self-contained
+examples are provided in the sections above.
+
+### [User Buffer Registration](04_user_buffer_registration/)
+
+User Buffer Registration eliminates the overhead of copying data between user
+buffers and internal NCCL buffers. This folder provides sample implementations
+using User Buffer Registration with common collectives.
+
+### [Symmetric Memory Registration](05_symmetric_memory/)
+
+Since version 2.27, NCCL supports window registration, which allows users to
+register local buffers into an NCCL window, enabling extremely low latency and
+high bandwidth communication in NCCL. This folder provides sample
+implementations using Symmetric Memory Registration with common collectives.
+
+### [Device APIs](06_device_api/)
+
+The device API enables GPU kernels to directly perform inter-GPU communication.
+This allows applications to perform communication from within CUDA kernels,
+fuse computation and communication, and gain fine-grained control over the
+collective implementation. This folder demonstrates how to implement
+collectives using device-side kernels.
+
+
+## Prerequisites
+
+- The same prerequisites as building NCCL from source.
+- Users can optionally add `MPI_HOME` for an MPI library in a non-standard
+  location.
+
+## Build Steps
+The examples can be built while building the NCCL library from source. Users can
+choose to build the examples with MPI support (`MPI=1`).
+
+```
+git clone https://github.com/NVIDIA/nccl.git
+cd nccl
+make -j examples [MPI=1]
+```
+
+or, if NCCL has already been built, the user can optionally add a non-standard
+NCCL installation location:
+
+```
+cd examples
+make NCCL_HOME=<path> [MPI=1]
+```
+
+## Environment Variables
+
+### Build Stage
+Users can use these optional variables to choose which libraries are used to
+build these examples:
+- `NCCL_HOME=<path>`: Local base directory of an NCCL installation.
+- `MPI` : [0,1] Build the examples with MPI support.
+- `MPI_HOME=<path>` : Local base directory of an MPI installation.
+- `CUDA_HOME=<path>` : Local base directory of a CUDA installation.
+
+### Run Stage
+- `NTHREADS=<num_threads>`: Number of threads to create for the threaded
+  examples. Defaults to the number of visible GPUs.
+- `CUDA_VISIBLE_DEVICES`: Comma-delimited list of GPUs visible to the
+  application.
+- All other NCCL [environment
+  variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html)
+  apply.
+
+## Supported OS
+Linux
+
+## Troubleshooting
+Each example includes a *Common Issues and Solutions* section for the individual
+tests. For general runtime issues use the debug output enabled by setting
+`NCCL_DEBUG=INFO` for detailed logging. The
+[Troubleshooting](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html)
+section of the NCCL documentation also includes many helpful tips.
diff --git a/examples/common/README.md b/examples/common/README.md
new file mode 100644
index 000000000..006c2b9a0
--- /dev/null
+++ b/examples/common/README.md
@@ -0,0 +1,36 @@
+# NCCL Common Utilities
+
+## Description
+This directory contains shared utilities and helper functions used across all NCCL examples. These utilities provide common functionality for error handling, device management, and MPI integration.
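+
+For instance, a minimal usage sketch (assuming the usual example variables such
+as `comm`, `total_ranks`, `nccl_unique_id`, `my_rank`, and `local_device` are
+already set up) wraps every CUDA and NCCL call in the corresponding macro so
+that a failure aborts with file and line information:
+
+```c
+// Abort with file/line details if the CUDA or NCCL call fails
+CUDACHECK(cudaSetDevice(local_device));
+NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank));
+```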
+ +## Components + +### Headers (`include/`) +- **utils.h**: General utility functions +- **nccl_utils.h**: NCCL error checking macros +- **mpi_utils.h**: MPI error checking macros + +### Source Files (`src/`) +- **utils.cc**: General utility functions + +## Key Features + +### Error Checking Macros +```c +#define NCCLCHECK(cmd) // NCCL error checking +#define CUDACHECK(cmd) // CUDA error checking +#define MPICHECK(cmd) // MPI error checking +``` + +## Usage in Examples +Include the headers in your example source files: +```c +#include "utils.h" +#include "mpi_utils.h" +``` + +## Notes +- All utilities include comprehensive error checking +- Functions are designed to be thread-safe +- Memory management functions handle null pointers safely +- MPI utilities are only needed for multi-process examples diff --git a/examples/common/include/mpi_utils.h b/examples/common/include/mpi_utils.h new file mode 100644 index 000000000..151b730d2 --- /dev/null +++ b/examples/common/include/mpi_utils.h @@ -0,0 +1,23 @@ +#ifndef MPI_UTILS_H_ +#define MPI_UTILS_H_ + +#include "mpi.h" +#include +#include + +// MPI error checking macro +#define MPICHECK(cmd) \ + do { \ + int err = cmd; \ + if (err != MPI_SUCCESS) { \ + char error_string[MPI_MAX_ERROR_STRING]; \ + int length; \ + MPI_Error_string(err, error_string, &length); \ + fprintf(stderr, "MPI error at %s:%d - %s\n", __FILE__, __LINE__, \ + error_string); \ + fprintf(stderr, "Failed MPI operation: %s\n", #cmd); \ + MPI_Abort(MPI_COMM_WORLD, err); \ + } \ + } while (0) + +#endif diff --git a/examples/common/include/nccl_utils.h b/examples/common/include/nccl_utils.h new file mode 100644 index 000000000..ff98de1de --- /dev/null +++ b/examples/common/include/nccl_utils.h @@ -0,0 +1,40 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_UTILS_H_ +#define NCCL_UTILS_H_ + +#include +#include +#include +#include + +#include "cuda_runtime.h" + +// Error checking +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t res = cmd; \ + if (res != ncclSuccess) { \ + fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + ncclGetErrorString(res)); \ + fprintf(stderr, "Failed NCCL operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + fprintf(stderr, "Failed CUDA operation: %s\n", #cmd); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#endif diff --git a/examples/common/include/utils.h b/examples/common/include/utils.h new file mode 100644 index 000000000..ec3dec334 --- /dev/null +++ b/examples/common/include/utils.h @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef UTILS_H_ +#define UTILS_H_ + +#include "cuda_runtime.h" +#include "nccl.h" +#include "nccl_utils.h" +#include +#include + +#ifdef MPI_SUPPORT +#include "mpi.h" +#include "mpi_utils.h" +#else +#include +#include +#endif + +/** + * Broadcast NCCL unique ID + * + * Broadcasts the NCCL unique ID from the root rank to all other ranks. + * Uses MPI_Bcast in MPI mode and pthread barrier in pthread mode. + * + * @param root Root rank that holds the NCCL unique ID + * @param my_rank Current rank or thread id + * @param arg Pointer to NCCL unique ID to broadcast + * + * @return 0 on success, non-zero on error + */ +int util_broadcast(int root, int my_rank, ncclUniqueId *arg); + +/** + * Run the given NCCL example in parallel + * + * This function performs the complete NCCL example lifecycle: + * 1. Initialize backend (MPI or pthread) + * 2. Execute NCCL communicator setup function + * 3. Cleanup of all resources + * + * @param argc Command line argument count + * @param argv Command line arguments + * @param ncclExample Function pointer to example-specific NCCL setup + * + * @return 0 on success, non-zero on error + */ +int run_example(int argc, char *argv[], + void *(*ncclExample)(int, int, int, int)); + +#endif // UTILS_H_ diff --git a/examples/common/src/utils.cc b/examples/common/src/utils.cc new file mode 100644 index 000000000..20a411cbc --- /dev/null +++ b/examples/common/src/utils.cc @@ -0,0 +1,334 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "utils.h" +#include + +#ifndef MPI_SUPPORT +pthread_barrier_t barrier; +ncclUniqueId nccl_unique_id; +#endif + +/** + * Common context structure for both MPI and pthread examples + * + * This structure provides a unified interface for NCCL examples that can + * run in either MPI mode (one process per device) or pthread mode + * (one thread per device). + */ +typedef struct { + // Common variables + int total_ranks; // Total number of MPI ranks or pthreads + int devices_per_rank; // Number of devices per rank or thread + int local_device; // Node local rank or thread id (0 to total_ranks-1) + int my_rank; // Rank or thread id for NCCL + ncclUniqueId nccl_id; // NCCL unique ID + ncclComm_t comm; // NCCL communicator (single for all modes) + ncclUniqueId *nccl_unique_id; // NCCL unique ID pointer + void *func; + + // pthread-specific variables +#ifndef MPI_SUPPORT + pthread_t *threads; // Thread array + int *thread_ranks; // Thread rank array +#endif +} context_t; + +/** + * Initialize MPI or pthread backend + * + * Sets up the backend and populates common context variables. 
+ * + * MPI Mode (compiled with MPI=1): + * - Initializes MPI (rank, size) + * - Calculates local rank based on splitting communicator by node multi-node + * support + * - Generates and broadcasts NCCL unique ID + * - Sets device assignment based on local rank + * + * pthread Mode (default): + * - Gets thread count from NTHREADS environment or GPU count + * - Validates thread count against available GPUs + * - Generates NCCL unique ID for sharing across threads + * - Allocates thread management resources + * + * @param argc Command line argument count + * @param argv Command line arguments + * @param ctx Output: Populated example context + * + * @return 0 on success, non-zero on error + */ +int initialize(int argc, char *argv[], context_t *ctx); + +/** + * Wrap function to call the example function in a thread + * + * Note: This function is needed since pthread only allows a single void* + * argument. + */ +void *thread_wrapper(void *arg); + +/** + * Run ncclExample in parallel using MPI or pthreads + * + * Starts the execution of the given NCCL example function in parallel + * + * MPI Mode: + * - Starts the function on each rank + * - Checks the output and calls MPI_Barrier to synchronize + * + * pthread Mode: + * - Creates threads (one per device) + * - Each thread runs the given example function + * - Waits for all threads to complete + * + * @param ctx Context with backend setup completed + * @param ncclExample Function pointer to example-specific NCCL setup + * + * @return 0 on success, non-zero on error + * + * Note: This function expects ncclExample() to be defined by the example. + * The ncclExample function should have signature: + * void* ncclExample(int, int, int, int) + */ +int run_parallel(context_t *ctx, void *(*ncclExample)(int, int, int, int)); + +/** + * Clean up resources + * + * Properly cleans up all resources allocated during initialization. + * Note: NCCL communicators are destroyed by ncclCommSetup function. 
+ * + * MPI Mode: + * - Finalizes MPI + * + * pthread Mode: + * - Frees thread arrays + * + * @param ctx Context to clean up + */ +void cleanup(context_t *ctx); + +/** + * Broadcast NCCL unique ID + */ +int util_broadcast(int root, int my_rank, ncclUniqueId *arg) { +#ifdef MPI_SUPPORT + MPICHECK( + MPI_Bcast(arg, sizeof(ncclUniqueId), MPI_BYTE, root, MPI_COMM_WORLD)); +#else + if (my_rank == root) { + nccl_unique_id = *arg; + } + int barrier_err = pthread_barrier_wait(&barrier); + if (barrier_err != 0 && barrier_err != PTHREAD_BARRIER_SERIAL_THREAD) { + fprintf(stderr, "pthread_barrier_wait failed at %s:%d with error code %d\n", + __FILE__, __LINE__, barrier_err); + abort(); + } + if (my_rank != root) { + *arg = nccl_unique_id; + } +#endif + return 0; +} + +/** + * Initialize MPI or pthread backend + */ +int initialize(int argc, char *argv[], context_t *ctx) { +#ifdef MPI_SUPPORT + // Initialize MPI + MPICHECK(MPI_Init(&argc, &argv)); + MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &ctx->my_rank)); + MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &ctx->total_ranks)); + + if (ctx->my_rank == 0) { + printf("Number of processes: %d\n", ctx->total_ranks); + } + // Only for printing the output in order + MPI_Barrier(MPI_COMM_WORLD); + printf("MPI initialized: rank %d of %d\n", ctx->my_rank, ctx->total_ranks); + + // Split the communicator based on shared memory (i.e., nodes) + MPI_Comm node_comm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, ctx->my_rank, + MPI_INFO_NULL, &node_comm); + + // Get the rank within the node communicator + MPI_Comm_rank(node_comm, &ctx->local_device); + + // Clean up the node communicator + MPI_Comm_free(&node_comm); + +#else + + // Get number of devices (threads) from environment or default to available + // GPUs + int num_gpus = 0; + CUDACHECK(cudaGetDeviceCount(&num_gpus)); + ctx->total_ranks = num_gpus; // Default to all available GPUs + const char *nThreadsEnv = getenv("NTHREADS"); + if (nThreadsEnv) { + ctx->total_ranks = atoi(nThreadsEnv); + } + + printf("Creating %d threads for %d devices\n", ctx->total_ranks, num_gpus); + + if (ctx->total_ranks < 1) { + printf("Invalid number of threads: %d\n", ctx->total_ranks); + return 1; + } + + // Check if we have enough GPUs + if (ctx->total_ranks > num_gpus) { + printf("Error: Requested %d threads but only %d GPUs available\n", + ctx->total_ranks, num_gpus); + printf("Please reduce NTHREADS to %d or fewer\n", num_gpus); + return 1; + } + + // Thread synchronization needed for unique ID sharing later on + pthread_barrier_init(&barrier, NULL, ctx->total_ranks); + + // Generate NCCL unique ID (shared across all threads) + NCCLCHECK(ncclGetUniqueId(&ctx->nccl_id)); + + // Allocate thread resources + ctx->threads = (pthread_t *)malloc(ctx->total_ranks * sizeof(pthread_t)); + ctx->thread_ranks = (int *)malloc(ctx->total_ranks * sizeof(int)); + if (ctx->threads == NULL || ctx->thread_ranks == NULL) { + printf("Failed to allocate memory for threads\n"); + return 1; + } +#endif + + return 0; +} + +/** + * Wrap function to call the example function in a thread + * + * Note: This function is needed since pthread only allows a single void* + * argument. 
+ */ +void *thread_wrapper(void *arg) { + context_t *ctx = (context_t *)arg; + void *(*example_func)(int, int, int, int) = + (void *(*)(int, int, int, int))ctx->func; + return example_func(ctx->my_rank, ctx->total_ranks, ctx->local_device, + ctx->devices_per_rank); +} + +/** + * Run ncclExample in parallel using MPI or pthreads + */ +int run_parallel(context_t *ctx, void *(*ncclExample)(int, int, int, int)) { +#ifdef MPI_SUPPORT + if (ctx->my_rank == 0) { + printf("NCCL Example: One Device per Process\n"); + printf("====================================\n"); + } + + if (ncclExample(ctx->my_rank, ctx->total_ranks, ctx->local_device, + ctx->devices_per_rank) != NULL) + return 1; + // Synchronize to ensure ordered output + MPICHECK(MPI_Barrier(MPI_COMM_WORLD)); +#else + printf("NCCL Example: One Device per Thread\n"); + printf("===================================\n"); + + // Create separate context for each thread + context_t *thread_contexts = + (context_t *)malloc(ctx->total_ranks * sizeof(context_t)); + if (thread_contexts == NULL) { + printf("Failed to allocate thread contexts\n"); + return 1; + } + ncclUniqueId *nccl_unique_id = + (ncclUniqueId *)calloc(1, sizeof(ncclUniqueId)); + + for (int i = 0; i < ctx->total_ranks; i++) { + // Copy main context to thread context + memcpy(&thread_contexts[i], ctx, sizeof(context_t)); + thread_contexts[i].threads = NULL; + thread_contexts[i].thread_ranks = NULL; + thread_contexts[i].my_rank = i; // Set NCCL rank to thread id + thread_contexts[i].local_device = i; + thread_contexts[i].total_ranks = ctx->total_ranks; + thread_contexts[i].devices_per_rank = 1; + thread_contexts[i].func = (void *)ncclExample; + thread_contexts[i].nccl_unique_id = nccl_unique_id; + ctx->thread_ranks[i] = i; + pthread_create(&ctx->threads[i], NULL, thread_wrapper, &thread_contexts[i]); + } + + // Wait for all threads to complete + for (int i = 0; i < ctx->total_ranks; i++) { + pthread_join(ctx->threads[i], NULL); + } + + free(thread_contexts); +#endif + + return 0; +} + +/** + * Run the given NCCL example in parallel + */ +int run_example(int argc, char *argv[], + void *(*ncclExample)(int, int, int, int)) { + + // 1. Allocate context + context_t *ctx = (context_t *)calloc(1, sizeof(context_t)); + if (ctx == NULL) { + printf("Failed to allocate memory for context\n"); + return 1; + } + + // 2. Initialize backend (MPI or pthread) + if (initialize(argc, argv, ctx) != 0) { + printf("Failed to initialize backend\n"); + return 1; + } + + // 3. Start the given example code in parallel + if (run_parallel(ctx, ncclExample) != 0) { + printf("Failed to execute NCCL operations\n"); + cleanup(ctx); // Cleanup on failure + return 1; + } + + // 3. Cleanup + cleanup(ctx); + + // 4. Print common success message +#ifdef MPI_SUPPORT + if (ctx->my_rank == 0) { +#endif + printf("\nAll NCCL communicators finalized successfully!\n"); +#ifdef MPI_SUPPORT + } +#endif + + return 0; +} + +/** + * Clean up resources + */ +void cleanup(context_t *ctx) { +#ifdef MPI_SUPPORT + // Free MPI resources + MPICHECK(MPI_Finalize()); +#else + free(ctx->threads); + free(ctx->thread_ranks); + pthread_barrier_destroy(&barrier); +#endif +} diff --git a/makefiles/examples.mk b/makefiles/examples.mk new file mode 100644 index 000000000..6f3a520f3 --- /dev/null +++ b/makefiles/examples.mk @@ -0,0 +1,31 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# See LICENSE.txt for license information +# + +# Make sure NCCL headers are found and libraries are linked +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +# Build configuration +INCLUDES = -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include +LIBRARIES = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib +LDFLAGS = -lcudart -lnccl -Wl,-rpath,$(NCCL_HOME)/lib + + +# MPI configuration +ifeq ($(MPI), 1) + +ifdef MPI_HOME +MPICXX ?= $(MPI_HOME)/bin/mpicxx +MPIRUN ?= $(MPI_HOME)/bin/mpirun +else +MPICXX ?= mpicxx +MPIRUN ?= mpirun +endif + +CXXFLAGS += -DMPI_SUPPORT +endif From 834ef7231913ecf22f5cad29d7e26a6596f36452 Mon Sep 17 00:00:00 2001 From: Stephen Sachs Date: Tue, 14 Oct 2025 15:02:05 +0200 Subject: [PATCH 20/21] Remove the github actions to auto-close older issues --- .github/workflows/close-old-issues.js | 79 ------------------------- .github/workflows/close_old_issues.yaml | 31 ---------- 2 files changed, 110 deletions(-) delete mode 100644 .github/workflows/close-old-issues.js delete mode 100644 .github/workflows/close_old_issues.yaml diff --git a/.github/workflows/close-old-issues.js b/.github/workflows/close-old-issues.js deleted file mode 100644 index 57e110339..000000000 --- a/.github/workflows/close-old-issues.js +++ /dev/null @@ -1,79 +0,0 @@ -const { Octokit } = require("@octokit/rest"); - -const octokit = new Octokit({ auth: process.env.GITHUB_TOKEN }); - -const owner = process.env.REPO_OWNER; -const repo = process.env.REPO_NAME.split('/').pop(); // Handles owner/repo format - -const now = new Date(); -const sixMonthsAgo = new Date(now); -sixMonthsAgo.setMonth(now.getMonth() - 6); -const oneMonthAgo = new Date(now); -oneMonthAgo.setMonth(now.getMonth() - 1); - -async function closeOldIssues() { - let page = 1; - let closedCount = 0; - - // write a multiline comment into a variable: - let body = `### Issue Cleanup: Helping Us Focus on Current Challenges - -We're [reviewing](https://github.com/NVIDIA/nccl/discussions/1761) older issues to ensure we prioritize the most relevant and active ones. Since this issue hasn't seen updates in over 6 months, we'll be closing it for now. - -*This change helps us focus our efforts on addressing any current issues our users are facing.* If this issue still affects you, please don't hesitate to reopen it with a quick update (e.g., \"Still relevant on [version=X]\"). 
-Thanks for your understanding and for contributing to NCCL.`; - - while (true) { - const { data: issues } = await octokit.issues.listForRepo({ - owner, - repo, - state: "open", - per_page: 100, - page, - }); - - if (issues.length === 0) break; - - for (const issue of issues) { - // Ignore PRs - if (issue.pull_request) continue; - - // Ignore issues with label "ongoing" - if (issue.labels.some(label => label.name === "ongoing")) continue; - - const createdAt = new Date(issue.created_at); - const updatedAt = new Date(issue.updated_at); - - if (createdAt < sixMonthsAgo && updatedAt < sixMonthsAgo) { - - // Add a comment before closing - await octokit.issues.createComment({ - owner, - repo, - issue_number: issue.number, - body: body, - }); - - await octokit.issues.update({ - owner, - repo, - issue_number: issue.number, - state: "closed", - state_reason: "not_planned", - }); - closedCount++; - console.log(`Closed issue #${issue.number}`); - - // Break out if we have closed 100 issues - if (closedCount >= 100) { - console.log("Closed 100 issues, stopping."); - return; - } - } - } - page++; - } - console.log(`Total closed: ${closedCount}`); -} - -closeOldIssues().catch(console.error); diff --git a/.github/workflows/close_old_issues.yaml b/.github/workflows/close_old_issues.yaml deleted file mode 100644 index 15d81cb54..000000000 --- a/.github/workflows/close_old_issues.yaml +++ /dev/null @@ -1,31 +0,0 @@ -name: Close Old Issues - -on: - schedule: - - cron: '30 2 * * *' # Runs daily at 02:30 UTC - workflow_dispatch: - -permissions: - issues: write - -jobs: - close-old-issues: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Setup Node.js - uses: actions/setup-node@v4 - with: - node-version: 20 - - - name: Install dependencies - run: npm install @octokit/rest@22.0.0 - - - name: Run close-old-issues script - run: node .github/workflows/close-old-issues.js - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPO_OWNER: ${{ github.repository_owner }} - REPO_NAME: ${{ github.event.repository.name || github.repository }} From ae7aed194dc63c65d1bf5c0385ba3d68d3b64c8c Mon Sep 17 00:00:00 2001 From: Mark Santesson Date: Fri, 17 Oct 2025 17:17:50 -0700 Subject: [PATCH 21/21] NCCL 2.28.7-1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GPU-Initiated Networking (GIN): * Provides device-side API for integrating GPU-Initiated Networking capability into application kernels. * New transport layer called DOCA GPUNetIO. * New ncclGin construct to create, destroy and manipulate GIN contexts. * New ncclGinBarrierSession to provide synchronization functionality. * New put, signal, counter operations for data movement and signaling. * GIN API signatures and functionalities are subject to change. * GIN Support Requirements * CUDA 12.2 or later when compiling the GPU code * NVIDIA GPUs: Volta or newer. NVIDIA GPU drivers >= 510.40.3 * NVIDIA NICs: CX4 or newer. rdma-core >= 44.0 * Requires nvidia-peermem or DMABUF support. When using DMABUF, linux kernel >= 6.1 is required. New ncclCommRevoke API for fault tolerance: * Introduces ncclCommRevoke to quiesce ongoing NCCL work on a communicator without freeing resources. * This answers the need for a lightweight way to cancel in-flight collectives and bring a communicator to a safe state before split/shrink/finalize/destroy. * Includes optional cross-rank coordination (global barrier) and supports blocking/non-blocking usage. 
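Since these notes only name the new API, here is a minimal, hypothetical sketch of how a revoke-then-teardown flow could look. The one-argument ncclCommRevoke(comm) call is an assumed signature (the real prototype and any coordination flags may differ); ncclCommGetAsyncError, ncclCommFinalize, and ncclCommDestroy are existing NCCL host APIs.

```c
/* Hypothetical fault-tolerance sketch: quiesce in-flight work after an
 * asynchronous error, then tear the communicator down. The one-argument
 * ncclCommRevoke(comm) form is an assumption for illustration only. */
#include <nccl.h>

ncclResult_t recoverComm(ncclComm_t comm) {
  ncclResult_t asyncErr;
  ncclResult_t res = ncclCommGetAsyncError(comm, &asyncErr);
  if (res != ncclSuccess) return res;
  if (asyncErr != ncclSuccess && asyncErr != ncclInProgress) {
    (void)ncclCommRevoke(comm);   // assumed: cancel in-flight collectives, keep resources
    (void)ncclCommFinalize(comm); // flush whatever work remains
    return ncclCommDestroy(comm); // free resources
  }
  return ncclSuccess;
}
```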
New NCCL Environment Plugin:
* The env plugin allows users to set NCCL environment variables, for example after loading them from a centralized database.
* The NCCL_ENV_PLUGIN variable can be used to let NCCL load an external environment plugin.
New NCCL Examples on GitHub:
* The NCCL examples directory provides users and developers with practical code samples that highlight NCCL’s core features.
* It covers basic operations like communicator initialization, point-to-point communication, and collective operations, as well as advanced features such as user buffer registration, symmetric memory, and the device API.
Device API improvements:
* Adds the ncclFindWindow API.
* Adds a new ncclBarrierSession to provide hybrid synchronization functionality.
* Makes multimem available with as few as two ranks.
* Removes distance (NCCL_P2P_LEVEL) considerations from determining the availability of symmetric memory.
Enhanced NCCL RAS output:
* Extends the RAS subsystem with a JSON output format to support machine-parsable metrics collection.
* Enables structured data export for monitoring tools, dashboards, and automated analysis systems.
GitHub Pull Requests resolved:
* Fast Init - CPU Optimizations for NCCL Initialization Large Scale. (PR #1789)
* Fast Init - Improve Bootstrap AllGather by 2x at large scale by sending bootstrap information bidirectionally. (PR #1791)
* Fixes spurious failures when PyTorch is statically linked with NCCL-2.28.3 because the error is not drained and instead gets propagated into the next CUDA kernel invocation. (PR #1864)
Other notable improvements:
* Fixes multicast object leaks in case of failed NVLS user buffer registrations, which could lead to crashes. Avoids such registration attempts when incompatible memory allocators are used.
* Fixes potential data corruption with built-in symmetric kernels for small messages with size granularity under 8 bytes, or when multiple symmetric operations were aggregated in a group.
* Generalizes the existing point-to-point scheduling to the case of an uneven GPU count per node.
* Fixes a crash when network plugin assignment fails.
* Fixes a large performance issue with NCCL_CROSS_NIC=0 and certain split mask settings, where NCCL cannot find a viable ring.
* Fixes a crash when NCCL is compiled with recent CUDA versions but runs on hosts with certain older CUDA drivers.
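The examples themselves are not part of this excerpt, but the common helpers added earlier in this series (examples/common) show the intended wiring: run_example() drives one MPI rank or one pthread per GPU and hands each a callback. Below is an illustrative caller, not code from the actual examples directory; only run_example, util_broadcast, NCCLCHECK, and CUDACHECK come from the helpers, the rest is standard NCCL/CUDA API, and the collective work itself is elided.

```c
/* Illustrative sketch of an example built on examples/common; not taken from
 * the actual examples directory. One process (MPI mode) or thread (pthread
 * mode) per GPU calls this function with its rank and local device index. */
#include "utils.h"

static void* helloNccl(int my_rank, int total_ranks, int local_device,
                       int devices_per_rank) {
  (void)devices_per_rank; // single device per rank/thread in this sketch
  CUDACHECK(cudaSetDevice(local_device));

  // Rank 0 creates the unique ID; util_broadcast shares it via MPI_Bcast
  // or the pthread barrier, depending on how the helpers were built.
  ncclUniqueId id;
  if (my_rank == 0) NCCLCHECK(ncclGetUniqueId(&id));
  util_broadcast(0, my_rank, &id);

  ncclComm_t comm;
  NCCLCHECK(ncclCommInitRank(&comm, total_ranks, id, my_rank));
  // ... enqueue collectives on a CUDA stream here ...
  NCCLCHECK(ncclCommDestroy(comm));
  return NULL; // NULL tells the helper framework the example succeeded
}

int main(int argc, char* argv[]) {
  return run_example(argc, argv, helloNccl);
}
```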
--- CMakeLists.txt | 4 + ext-tuner/example/plugin.c | 2 +- ext-tuner/example/test/test_plugin.c | 6 +- makefiles/common.mk | 9 +- makefiles/version.mk | 2 +- src/CMakeLists.txt | 28 +- src/Makefile | 63 +- src/bootstrap.cc | 51 +- src/ce_coll.cc | 26 +- src/debug.cc | 17 +- src/dev_runtime.cc | 198 +- src/device/CMakeLists.txt | 2 +- src/device/Makefile | 3 +- src/device/generate.py | 10 +- src/device/network/unpack/unpack.h | 2 +- src/device/reduce_kernel.h | 2 +- src/device/symmetric/all_gather.cuh | 2 +- src/device/symmetric/generate.py | 10 +- src/device/symmetric/primitives.cuh | 13 +- src/device/symmetric/reduce_scatter.cuh | 6 +- src/enqueue.cc | 40 +- src/gin/CMakeLists.txt | 8 + src/gin/gin_host.cc | 277 + src/gin/gin_host_proxy.cc | 501 ++ src/graph/paths.cc | 38 +- src/graph/rings.cc | 29 +- src/graph/search.cc | 80 +- src/graph/topo.cc | 92 +- src/graph/topo.h | 2 + src/graph/tuning.cc | 2 +- src/graph/xml.cc | 29 +- src/include/allocator.h | 4 + src/include/channel.h | 7 +- src/include/checks.h | 15 + src/include/comm.h | 13 +- src/include/debug.h | 8 + src/include/dev_runtime.h | 1 + src/include/device.h | 6 +- src/include/env.h | 23 + src/include/gin/gin_host.h | 54 + src/include/gin/gin_host_proxy.h | 28 + src/include/graph.h | 11 +- src/include/group.h | 4 + src/include/nccl_device.h | 10 +- src/include/nccl_device/barrier.h | 47 + src/include/nccl_device/coop.h | 73 +- src/include/nccl_device/core.h | 19 + src/include/nccl_device/gin.h | 207 + src/include/nccl_device/gin/gdaki/gin_gdaki.h | 214 + .../gin/gdaki/gin_gdaki_device_host_common.h | 36 + src/include/nccl_device/gin/gin_device_api.h | 18 + .../nccl_device/gin/gin_device_common.h | 120 + .../nccl_device/gin/gin_device_host_common.h | 24 + src/include/nccl_device/gin/proxy/gin_proxy.h | 235 + .../gin/proxy/gin_proxy_device_host_common.h | 125 + src/include/nccl_device/gin_barrier.h | 37 + src/include/nccl_device/impl/barrier__funcs.h | 94 + src/include/nccl_device/impl/barrier__types.h | 29 + src/include/nccl_device/impl/comm__types.h | 13 +- src/include/nccl_device/impl/core__funcs.h | 32 + src/include/nccl_device/impl/core__types.h | 4 +- src/include/nccl_device/impl/gin__funcs.h | 407 ++ src/include/nccl_device/impl/gin__types.h | 10 + .../nccl_device/impl/gin_barrier__funcs.h | 66 + .../nccl_device/impl/gin_barrier__types.h | 31 + ..._barrier__funcs.h => lsa_barrier__funcs.h} | 2 +- ..._barrier__types.h => lsa_barrier__types.h} | 2 +- src/include/nccl_device/ll_a2a.h | 4 +- .../{mem_barrier.h => lsa_barrier.h} | 0 src/include/{ => nccl_device}/net_device.h | 7 +- src/include/nccl_device/utility.h | 74 +- src/include/net.h | 3 + src/include/nvtx.h | 3 +- src/include/nvtx_payload_schemas.h | 3 +- src/include/plugin/env/env_v1.h | 33 + src/include/plugin/nccl_env.h | 16 + src/include/plugin/nccl_net.h | 7 +- src/include/plugin/net/net_v11.h | 50 +- src/include/plugin/plugin.h | 2 + src/include/proxy.h | 3 + src/include/register.h | 3 + src/include/socket.h | 9 +- src/include/sym_kernels.h | 3 +- src/include/transport.h | 13 +- src/include/utils.h | 10 + src/init.cc | 374 +- src/libnccl.map | 8 + src/misc/ibvsymbols.cc | 2 +- src/misc/ibvwrap.cc | 5 + src/misc/param.cc | 5 +- src/misc/socket.cc | 23 + src/nccl.h.in | 12 + src/nccl_device/CMakeLists.txt | 3 +- src/nccl_device/gin_barrier.cc | 22 + .../{mem_barrier.cc => lsa_barrier.cc} | 2 +- src/plugin/CMakeLists.txt | 3 + src/plugin/env.cc | 111 + src/plugin/env/CMakeLists.txt | 7 + src/plugin/env/env_v1.cc | 40 + src/plugin/net.cc | 147 +- 
src/plugin/net/net_v10.cc | 3 +- src/plugin/net/net_v11.cc | 11 +- src/plugin/net/net_v6.cc | 1 - src/plugin/net/net_v7.cc | 1 - src/plugin/net/net_v8.cc | 1 - src/plugin/net/net_v9.cc | 1 - src/plugin/plugin_open.cc | 14 +- src/proxy.cc | 1 + src/ras/client.cc | 56 +- src/ras/client_support.cc | 310 +- src/ras/ras_internal.h | 9 + src/register/register.cc | 15 +- src/scheduler/symmetric_sched.cc | 4 + src/sym_kernels.cc | 18 +- src/transport.cc | 56 +- src/transport/CMakeLists.txt | 11 + src/transport/gdaki/CMakeLists.txt | 65 + .../include/common/doca_gpunetio_verbs_def.h | 398 ++ .../include/common/doca_gpunetio_verbs_dev.h | 203 + .../device/doca_gpunetio_dev_verbs_common.cuh | 422 ++ .../doca_gpunetio_dev_verbs_counter.cuh | 421 ++ .../device/doca_gpunetio_dev_verbs_cq.cuh | 295 + .../doca_gpunetio_dev_verbs_onesided.cuh | 508 ++ .../device/doca_gpunetio_dev_verbs_qp.cuh | 824 +++ .../include/doca_gpunetio_config.h | 45 + .../include/doca_gpunetio_device.h | 47 + .../include/doca_gpunetio_host.h | 49 + .../doca-gpunetio/include/host/doca_error.h | 89 + .../include/host/doca_gpunetio.h | 387 ++ .../include/host/doca_gpunetio_high_level.h | 191 + .../doca-gpunetio/include/host/doca_verbs.h | 2467 +++++++ .../doca-gpunetio/include/host/mlx5_ifc.h | 5693 +++++++++++++++++ .../doca-gpunetio/include/host/mlx5_prm.h | 170 + .../gdaki/doca-gpunetio/src/doca_gpunetio.cpp | 942 +++ .../src/doca_gpunetio_gdrcopy.cpp | 261 + .../doca-gpunetio/src/doca_gpunetio_gdrcopy.h | 55 + .../src/doca_gpunetio_high_level.cpp | 903 +++ .../doca-gpunetio/src/doca_gpunetio_log.cpp | 77 + .../doca-gpunetio/src/doca_gpunetio_log.hpp | 43 + .../gdaki/doca-gpunetio/src/doca_internal.hpp | 118 + .../gdaki/doca-gpunetio/src/doca_verbs_cq.cpp | 472 ++ .../gdaki/doca-gpunetio/src/doca_verbs_cq.hpp | 151 + .../src/doca_verbs_cuda_wrapper.cpp | 129 + .../src/doca_verbs_cuda_wrapper.h | 96 + .../src/doca_verbs_device_attr.cpp | 266 + .../src/doca_verbs_device_attr.hpp | 96 + .../src/doca_verbs_ibv_wrapper.cpp | 374 ++ .../src/doca_verbs_ibv_wrapper.h | 452 ++ .../src/doca_verbs_mlx5dv_wrapper.cpp | 287 + .../src/doca_verbs_mlx5dv_wrapper.h | 431 ++ .../src/doca_verbs_net_wrapper.h | 62 + .../gdaki/doca-gpunetio/src/doca_verbs_qp.cpp | 2743 ++++++++ .../gdaki/doca-gpunetio/src/doca_verbs_qp.hpp | 211 + .../doca-gpunetio/src/doca_verbs_srq.cpp | 580 ++ .../doca-gpunetio/src/doca_verbs_srq.hpp | 109 + .../doca-gpunetio/src/doca_verbs_uar.cpp | 197 + .../doca-gpunetio/src/doca_verbs_uar.hpp | 109 + .../doca-gpunetio/src/doca_verbs_umem.cpp | 212 + .../doca-gpunetio/src/doca_verbs_umem.hpp | 118 + src/transport/gdaki/gin_host_gdaki.cc | 1065 +++ src/transport/gdaki/gin_host_gdaki.h | 36 + src/transport/net_ib.cc | 664 +- src/transport/net_ib_gin.h | 29 + src/transport/nvls.cc | 64 +- src/transport/p2p.cc | 10 +- 165 files changed, 28241 insertions(+), 497 deletions(-) create mode 100644 src/gin/CMakeLists.txt create mode 100644 src/gin/gin_host.cc create mode 100644 src/gin/gin_host_proxy.cc create mode 100644 src/include/env.h create mode 100644 src/include/gin/gin_host.h create mode 100644 src/include/gin/gin_host_proxy.h create mode 100644 src/include/nccl_device/barrier.h create mode 100644 src/include/nccl_device/gin.h create mode 100644 src/include/nccl_device/gin/gdaki/gin_gdaki.h create mode 100644 src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h create mode 100644 src/include/nccl_device/gin/gin_device_api.h create mode 100644 src/include/nccl_device/gin/gin_device_common.h create mode 100644 
src/include/nccl_device/gin/gin_device_host_common.h create mode 100644 src/include/nccl_device/gin/proxy/gin_proxy.h create mode 100644 src/include/nccl_device/gin/proxy/gin_proxy_device_host_common.h create mode 100644 src/include/nccl_device/gin_barrier.h create mode 100644 src/include/nccl_device/impl/barrier__funcs.h create mode 100644 src/include/nccl_device/impl/barrier__types.h create mode 100644 src/include/nccl_device/impl/gin__funcs.h create mode 100644 src/include/nccl_device/impl/gin__types.h create mode 100644 src/include/nccl_device/impl/gin_barrier__funcs.h create mode 100644 src/include/nccl_device/impl/gin_barrier__types.h rename src/include/nccl_device/impl/{mem_barrier__funcs.h => lsa_barrier__funcs.h} (99%) rename src/include/nccl_device/impl/{mem_barrier__types.h => lsa_barrier__types.h} (98%) rename src/include/nccl_device/{mem_barrier.h => lsa_barrier.h} (100%) rename src/include/{ => nccl_device}/net_device.h (89%) create mode 100644 src/include/plugin/env/env_v1.h create mode 100644 src/include/plugin/nccl_env.h create mode 100644 src/libnccl.map create mode 100644 src/nccl_device/gin_barrier.cc rename src/nccl_device/{mem_barrier.cc => lsa_barrier.cc} (94%) create mode 100644 src/plugin/env.cc create mode 100644 src/plugin/env/CMakeLists.txt create mode 100644 src/plugin/env/env_v1.cc create mode 100644 src/transport/gdaki/CMakeLists.txt create mode 100644 src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_def.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_dev.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_common.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_counter.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_cq.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_onesided.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_qp.cuh create mode 100644 src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_config.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_device.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_host.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/doca_error.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio_high_level.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/doca_verbs.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/mlx5_ifc.h create mode 100644 src/transport/gdaki/doca-gpunetio/include/host/mlx5_prm.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_high_level.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_internal.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.hpp create mode 100644 
src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_net_wrapper.h create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.hpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.cpp create mode 100644 src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.hpp create mode 100644 src/transport/gdaki/gin_host_gdaki.cc create mode 100644 src/transport/gdaki/gin_host_gdaki.h create mode 100644 src/transport/net_ib_gin.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1941cdafe..d1eb6cc8e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -148,6 +148,10 @@ if(MAX_EXT_NET_PLUGINS GREATER 0) add_definitions(-DNCCL_NET_MAX_PLUGINS=${MAX_EXT_NET_PLUGINS}) endif() +add_definitions(-DDOCA_VERBS_USE_CUDA_WRAPPER) +add_definitions(-DDOCA_VERBS_USE_NET_WRAPPER) +add_definitions(-DNCCL_GIN_PROXY_ENABLE=1) + # Library dependencies find_library(RT_LIBRARY NAMES rt) if(RT_LIBRARY) diff --git a/ext-tuner/example/plugin.c b/ext-tuner/example/plugin.c index af813495a..d9b84b884 100644 --- a/ext-tuner/example/plugin.c +++ b/ext-tuner/example/plugin.c @@ -308,7 +308,7 @@ __hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, // Set NVLSTree base network latency to 24us constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] = 24.0; } - + TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext)); if (!ctx) return ncclSystemError; diff --git a/ext-tuner/example/test/test_plugin.c b/ext-tuner/example/test/test_plugin.c index c0300d51c..328762519 100644 --- a/ext-tuner/example/test/test_plugin.c +++ b/ext-tuner/example/test/test_plugin.c @@ -767,16 +767,16 @@ int test_nvl_domain_info() { .minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck) .maxRanksPerNvlDomain = 5 // maximum ranks across all domains (capacity) }; - + void* context = NULL; ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed"); - + // Validate NVLD info structure TEST_ASSERT(nvl_domain.nNvlDomains == 2, "Should have 2 domains (nodes)"); TEST_ASSERT(nvl_domain.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain"); TEST_ASSERT(nvl_domain.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain"); - + // Clean up pluginFinalize(context); printf("NVLink domain info test passed!\n"); diff --git a/makefiles/common.mk b/makefiles/common.mk index f8f455dec..2b1d1c4b3 100644 --- 
a/makefiles/common.mk +++ b/makefiles/common.mk @@ -20,7 +20,7 @@ NET_PROFILER ?= 0 MLX5DV ?= 0 MAX_EXT_NET_PLUGINS ?= 0 -NVCC = $(CUDA_HOME)/bin/nvcc +NVCC ?= $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -85,6 +85,8 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) $(CXXSTD) --expt-extended-lambda -Xp # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVCUFLAGS_SYM := + ########## GCOV ########## GCOV ?= 0 # disable by default. GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1 @@ -158,3 +160,8 @@ endif ifneq ($(MAX_EXT_NET_PLUGINS), 0) CXXFLAGS += -DNCCL_NET_MAX_PLUGINS=$(MAX_EXT_NET_PLUGINS) endif + +CXXFLAGS += -DDOCA_VERBS_USE_CUDA_WRAPPER -DDOCA_VERBS_USE_NET_WRAPPER +NVCUFLAGS += -DDOCA_VERBS_USE_CUDA_WRAPPER -DDOCA_VERBS_USE_NET_WRAPPER + +CXXFLAGS += -DNCCL_GIN_PROXY_ENABLE=1 diff --git a/makefiles/version.mk b/makefiles/version.mk index d0e97c065..7bb671ddf 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 28 -NCCL_PATCH := 3 +NCCL_PATCH := 7 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5ab69dc92..b48ed1880 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -39,6 +39,7 @@ add_subdirectory(device) add_subdirectory(nccl_device) add_subdirectory(ras) add_subdirectory(scheduler) +add_subdirectory(gin) add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=) @@ -52,6 +53,8 @@ list(APPEND LIBSRCFILES ${RAS_SOURCES} ${SYM_SOURCES} ${SCHEDULER_SOURCES} + ${GIN_SOURCES} + ${DOCA_SOURCES} ) ###################### Create a shared NCCL library ############################ @@ -65,6 +68,7 @@ target_include_directories(nccl PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin ${CUDAToolkit_INCLUDE_DIRS} + ${DOCA_HOME}/include ${CUDAToolkit_INCLUDE_DIRS}/cccl ) @@ -80,9 +84,25 @@ add_custom_command( BYPRODUCTS ${CMAKE_BINARY_DIR}/include/nccl.h ) -add_custom_target(nccl_header DEPENDS ${CMAKE_BINARY_DIR}/include/nccl.h) +file(GLOB_RECURSE SRC_DEVICE_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include/nccl_device/*.h) + +# Copy all device header files to the destination +foreach(HEADER_FILE ${SRC_DEVICE_HEADERS}) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${HEADER_FILE} ${CMAKE_BINARY_DIR}/${HEADER_FILE} COPYONLY) + list(APPEND DEVICE_HEADERS ${CMAKE_BINARY_DIR}/${HEADER_FILE}) +endforeach() + +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/nccl_device.h ${CMAKE_BINARY_DIR}/include/nccl_device.h COPYONLY) + +add_custom_target(nccl_header DEPENDS + ${CMAKE_BINARY_DIR}/include/nccl.h + ${CMAKE_BINARY_DIR}/include/nccl_device.h + ${DEVICE_HEADERS} + ${DEVICE_DOCA_HEADERS} +) add_dependencies(nccl nccl_header) +add_dependencies(nccl_device nccl_header) # Set version and output name set_target_properties(nccl PROPERTIES @@ -111,6 +131,11 @@ target_link_libraries(nccl ${EXTRA_LIBS} ) +# Add version script for symbol visibility control +target_link_options(nccl PRIVATE + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libnccl.map" +) + # Set output directories for nccl shared library set_target_properties(nccl PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" @@ -149,6 +174,7 @@ target_include_directories(nccl_static PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin ${CUDAToolkit_INCLUDE_DIRS} + 
transport/gdaki/doca-gpunetio/include ${CUDAToolkit_INCLUDE_DIRS}/cccl ) diff --git a/src/Makefile b/src/Makefile index be026cc26..471a0335e 100644 --- a/src/Makefile +++ b/src/Makefile @@ -8,7 +8,7 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_device.h \ - $(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h)) + $(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/*/*.h include/nccl_device/*/*/*.h)) LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ @@ -16,13 +16,16 @@ LIBSRCFILES := \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ + $(wildcard transport/gdaki/*.cc) \ $(wildcard register/*.cc) \ $(wildcard plugin/*.cc) \ $(wildcard plugin/net/*.cc) \ $(wildcard plugin/tuner/*.cc) \ $(wildcard plugin/profiler/*.cc) \ + $(wildcard plugin/env/*.cc) \ $(wildcard nccl_device/*.cc) \ $(wildcard scheduler/*.cc) \ + $(wildcard gin/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -40,6 +43,7 @@ LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj PKGDIR := $(BUILDDIR)/lib/pkgconfig BINDIR := $(BUILDDIR)/bin + ##### target files CUDARTLIB ?= cudart_static @@ -61,6 +65,17 @@ INCPLUGIN := include/plugin DEVMANIFEST := $(BUILDDIR)/obj/device/manifest +# DOCA GPUNetIO definitions +DOCA_HOME ?= transport/gdaki/doca-gpunetio +DOCA_INC_INSTALL := $(INCDIR)/nccl_device/gin/gdaki/doca_gpunetio +DOCA_OBJDIR := $(OBJDIR)/transport/gdaki/doca-gpunetio +DOCA_INCLUDES := $(DOCA_HOME)/include/doca_gpunetio_device.h $(wildcard $(DOCA_HOME)/include/common/*.h) $(wildcard $(DOCA_HOME)/include/device/*.cuh) +DOCA_INCTARGETS := $(DOCA_INCLUDES:$(DOCA_HOME)/include/%=$(DOCA_INC_INSTALL)/%) +INCTARGETS += $(DOCA_INCTARGETS) +DOCA_LIBSRC := doca_verbs_qp.cpp doca_verbs_cq.cpp doca_verbs_device_attr.cpp doca_verbs_umem.cpp doca_verbs_srq.cpp doca_verbs_uar.cpp doca_gpunetio.cpp doca_gpunetio_log.cpp doca_gpunetio_high_level.cpp doca_verbs_cuda_wrapper.cpp doca_verbs_mlx5dv_wrapper.cpp doca_verbs_ibv_wrapper.cpp doca_gpunetio_gdrcopy.cpp +DOCA_LIBOBJ := $(DOCA_LIBSRC:%.cpp=$(DOCA_OBJDIR)/%.o) +LIBOBJ += $(DOCA_LIBOBJ) + ##### rules build : lib staticlib binary @@ -94,7 +109,7 @@ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVMANIFEST) @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ mkdir -p $(LIBDIR) - $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS) + $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS) -Wl,--version-script=libnccl.map ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) @@ -137,6 +152,36 @@ $(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h mkdir -p $(INCDIR)/nccl_device/impl install -m 644 $< $@ +$(INCDIR)/nccl_device/gin/%.h: include/nccl_device/gin/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/gin + install -m 644 $< $@ + +$(INCDIR)/nccl_device/gin/gdaki/%.h: include/nccl_device/gin/gdaki/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/gin/gdaki + install -m 644 $< $@ + +$(INCDIR)/nccl_device/gin/proxy/%.h: include/nccl_device/gin/proxy/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/gin/proxy + install -m 644 $< $@ + +$(DOCA_INC_INSTALL)/%.h: $(DOCA_HOME)/include/%.h + @printf "Grabbing %-35s > %s\n" 
$< $@ + mkdir -p $(DOCA_INC_INSTALL) + install -m 644 $< $@ + +$(DOCA_INC_INSTALL)/common/%.h: $(DOCA_HOME)/include/common/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(DOCA_INC_INSTALL)/common + install -m 644 $< $@ + +$(DOCA_INC_INSTALL)/device/%.cuh: $(DOCA_HOME)/include/device/%.cuh + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(DOCA_INC_INSTALL)/device + install -m 644 $< $@ + $(PKGDIR)/%.pc : %.pc @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(PKGDIR) @@ -145,8 +190,18 @@ $(PKGDIR)/%.pc : %.pc $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@ - @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -I$(DOCA_HOME)/include -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -I$(DOCA_HOME)/include -M $< > $(@:%.o=%.d.tmp) + @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) + @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ + sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) + @rm -f $(@:%.o=%.d.tmp) + +$(DOCA_OBJDIR)/%.o : $(DOCA_HOME)/src/%.cpp + @printf "Compiling %-35s > %s\n" $< $@ + mkdir -p `dirname $@` + $(CXX) -I$(DOCA_HOME)/src -I$(DOCA_HOME)/include $(CXXFLAGS) -c $< -o $@ + @$(CXX) -I$(DOCA_HOME)/src -I$(DOCA_HOME)/include $(CXXFLAGS) -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index 7615b9c52..ff4a8eb24 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -226,6 +226,21 @@ static ncclResult_t socketSendRecv(struct ncclSocket* sendSock, void* sendData, return ncclSuccess; } +static ncclResult_t socketDoubleSendRecv(struct ncclSocketOp ops[4]) { + // ops synchronously exchange size then asynchronously exchange data in send->recv->send->recv order + int senderRecvSize1, senderRecvSize2; + NCCLCHECK(ncclSocketSendRecv(ops[0].sock, &ops[0].size, sizeof(int), ops[1].sock, &senderRecvSize1, sizeof(int))); + NCCLCHECK(ncclSocketSendRecv(ops[2].sock, &ops[2].size, sizeof(int), ops[3].sock, &senderRecvSize2, sizeof(int))); + if (senderRecvSize1 > ops[1].size || senderRecvSize2 > ops[3].size) { + WARN("Message truncated : received %d,%d bytes instead of %d,%d", senderRecvSize1, senderRecvSize2, ops[1].size, ops[3].size); + return ncclInternalError; + } + ops[1].size = std::min(ops[1].size, senderRecvSize1); + ops[3].size = std::min(ops[3].size, senderRecvSize2); + NCCLCHECK(ncclSocketMultiOp(ops, 4)); + return ncclSuccess; +} + union ringConnectInfo { union ncclSocketAddress addr; char handle[NCCL_NET_HANDLE_MAXSIZE]; @@ -1007,22 +1022,40 @@ static ncclResult_t netRingAllGather(ncclNet_t* net, void* sendComm, void* recvC if (recvDataHandle) netDereg(net, recvComm, &recvDataHandle); return res; } -static ncclResult_t socketRingAllGather(struct ncclSocket* sendSock, struct ncclSocket* recvSock, int rank, int nranks, char* data, int size) { +static ncclResult_t socketRingAllGather(struct ncclSocket* nextSock, struct ncclSocket* prevSock, int rank, int nranks, char* data, int size) { ncclResult_t res = ncclSuccess; uint64_t tFirst = 0, tRest = 0; /* Simple ring based AllGather * At each step i receive data from (rank-i-1) from prev * and send previous step's data from (rank-i) to next */ - 
TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started"); + TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started: rank=%d nranks=%d", rank, nranks); + int totalSteps = nranks / 2; + TRACE(NCCL_BOOTSTRAP, "bidirectional bootstrap: totalSteps=%d", totalSteps); BOOTSTRAP_PROF_OPEN(tFirst); - for (int i = 0; i < nranks - 1; i++) { - size_t rslice = (rank - i - 1 + nranks) % nranks; - size_t sslice = (rank - i + nranks) % nranks; - void* recv_data = data + rslice * size; - void* send_data = data + sslice * size; - NCCLCHECKGOTO(socketSendRecv(sendSock, send_data, size, recvSock, recv_data, size), res, exit); - if (i == 0) { + for (int step = 0; step < totalSteps; step++) { + // N ranks requires (N-1)/2 steps for the double ring algorithm. If N is even, the last step is requires a single send/recv + bool isFinalUnidirectional = (step == totalSteps - 1) && (nranks % 2 == 0); + // Ring0: ring from previous to next + int sendSliceRing0 = (rank - step + nranks) % nranks; // Send this slice to next neighbor + int recvSliceRing0 = (rank - step - 1 + nranks) % nranks; // Receive this slice from prev neighbor + // Ring1: ring from next to previous + int sendSliceRing1 = (rank + step) % nranks; // Send this slice to prev neighbor + int recvSliceRing1 = (rank + step + 1) % nranks; // Receive this slice from next neighbor + if (isFinalUnidirectional) { + // Final unidirectional step, only Ring0 is used + NCCLCHECKGOTO(socketSendRecv(nextSock, data + sendSliceRing0 * size, size, prevSock, data + recvSliceRing0 * size, size), res, exit); + } else { + // Bidirectional step: Ring0 and Ring1 are used simultaneously + struct ncclSocketOp ops[4] = { + {NCCL_SOCKET_SEND, nextSock, data + sendSliceRing0 * size, size, 0}, // Ring0: send to next + {NCCL_SOCKET_RECV, prevSock, data + recvSliceRing0 * size, size, 0}, // Ring0: recv from prev + {NCCL_SOCKET_SEND, prevSock, data + sendSliceRing1 * size, size, 0}, // Ring1: send to prev + {NCCL_SOCKET_RECV, nextSock, data + recvSliceRing1 * size, size, 0} // Ring1: recv from next + }; + NCCLCHECKGOTO(socketDoubleSendRecv(ops), res, exit); + } + if (step == 0) { BOOTSTRAP_PROF_CLOSE(tFirst); BOOTSTRAP_PROF_OPEN(tRest); } diff --git a/src/ce_coll.cc b/src/ce_coll.cc index 3f3dcbd7f..b2bf32b8b 100644 --- a/src/ce_coll.cc +++ b/src/ce_coll.cc @@ -55,13 +55,13 @@ ncclResult_t ncclCeInit(struct ncclComm* comm) { ncclResult_t ncclCeFinalize(struct ncclComm* comm) { ncclResult_t ret = ncclSuccess; - + // Clean up ceInitTaskQueue while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue); free(task); } - + // Clean up CE resources if (comm->ceColl.baseUCSymReadyPtr != NULL) { if (comm->ceColl.ceSyncWin && comm->ceColl.ceSyncWin->vidmem) { @@ -117,7 +117,7 @@ ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, CUstreamBatc void* dstPtr = isComplete ? 
(void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank]; size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr; NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail); - + // Write our own ready/complete flag to the multi-cast address CUDACHECKGOTO(cudaMemcpyAsync( mcDstPtr, @@ -194,7 +194,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { // Get pointers to the ready and complete synchronization arrays uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr; uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr; - + // Allocate enough slots for all possible ops size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks; size_t opIdx = 0; @@ -220,7 +220,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { opIdx++; } } - + // Execute all memory operations in a single batch CUCHECKGOTO(cuStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail); @@ -236,7 +236,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) { ncclResult_t ret = ncclSuccess; - + params->srcs = nullptr; params->dsts = nullptr; params->sizes = nullptr; @@ -247,7 +247,7 @@ ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int n params->attrIdxs = nullptr; params->numAttrs = 0; #endif - + NCCLCHECKGOTO(ncclCalloc(¶ms->srcs, nRanks), ret, fail); NCCLCHECKGOTO(ncclCalloc(¶ms->dsts, nRanks), ret, fail); NCCLCHECKGOTO(ncclCalloc(¶ms->sizes, nRanks), ret, fail); @@ -284,7 +284,7 @@ ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsPa int driverVersion; NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail); - + //--------------Graph capture-------------- // cudaMemcpyBatchAsync is not supported during CUDA graph capture if (capturing) { @@ -372,7 +372,7 @@ ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsPa ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of each rank's data chunk const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; @@ -423,7 +423,7 @@ ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of data each rank sends to every other rank const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; @@ -442,7 +442,7 @@ ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, int dstRank = (comm->rank + r) % comm->nRanks; uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes; uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes; - + if (dstRank == comm->rank) { // Local copy for own data batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; @@ -478,7 +478,7 @@ ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of data root sends to each rank const size_t chunkBytes = args->nElts * args->eltSize; 
uint8_t* mySendBuff = (uint8_t*)args->sendBuff; @@ -538,7 +538,7 @@ ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, c ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of data each rank sends to root const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; diff --git a/src/debug.cc b/src/debug.cc index 0d6ed8400..9e2bfa459 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -16,6 +16,7 @@ #include #include "param.h" #include +#include "env.h" #define NCCL_DEBUG_RESET_TRIGGERED (-2) @@ -37,9 +38,12 @@ static bool ncclWarnSetDebugInfo = false; static __thread int tid = -1; +typedef const char* (*ncclGetEnvFunc_t)(const char*); + // This function must be called with ncclDebugLock locked! static void ncclDebugInit() { - const char* nccl_debug = ncclGetEnv("NCCL_DEBUG"); + ncclGetEnvFunc_t getEnvFunc = ncclEnvPluginInitialized() ? ncclGetEnv : (ncclGetEnvFunc_t)getenv; + const char* nccl_debug = getEnvFunc("NCCL_DEBUG"); int tempNcclDebugLevel = -1; uint64_t tempNcclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask if (ncclDebugLevel == NCCL_DEBUG_RESET_TRIGGERED && ncclDebugFile != stdout) { @@ -47,6 +51,7 @@ static void ncclDebugInit() { fclose(ncclDebugFile); ncclDebugFile = stdout; } + if (nccl_debug == NULL) { tempNcclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { @@ -65,7 +70,7 @@ static void ncclDebugInit() { * This can be a comma separated list such as INIT,COLL * or ^INIT,COLL etc */ - const char* ncclDebugSubsysEnv = ncclGetEnv("NCCL_DEBUG_SUBSYS"); + const char* ncclDebugSubsysEnv = getEnvFunc("NCCL_DEBUG_SUBSYS"); if (ncclDebugSubsysEnv != NULL) { int invert = 0; if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } @@ -117,7 +122,7 @@ static void ncclDebugInit() { free(ncclDebugSubsys); } - const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO"); + const char* ncclWarnSetDebugInfoEnv = getEnvFunc("NCCL_WARN_ENABLE_DEBUG_INFO"); if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) { int64_t value; errno = 0; @@ -127,7 +132,7 @@ static void ncclDebugInit() { } // Determine which debug levels will have timestamps. - const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS"); + const char* timestamps = getEnvFunc("NCCL_DEBUG_TIMESTAMP_LEVELS"); if (timestamps == nullptr) { ncclDebugTimestampLevels = (1< VERSION */ - const char* ncclDebugFileEnv = ncclGetEnv("NCCL_DEBUG_FILE"); + const char* ncclDebugFileEnv = getEnvFunc("NCCL_DEBUG_FILE"); if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { int c = 0; char debugFn[PATH_MAX+1] = ""; diff --git a/src/dev_runtime.cc b/src/dev_runtime.cc index 54e6e01bf..60cb200aa 100644 --- a/src/dev_runtime.cc +++ b/src/dev_runtime.cc @@ -18,8 +18,11 @@ struct ncclDevrMemory { int refCount; struct ncclDevrMemory* next; CUmemGenericAllocationHandle memHandle; + void* primaryAddr; // What we hope is the VA of this memory's first mapping. 
size_t size; size_t bigOffset; // offset in big VA space + void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]; + ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]; }; struct ncclDevrWindowSorted { @@ -56,12 +59,21 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) { struct ncclDevrState* devr = &comm->devrState; if (devr->bigSize != 0) return ncclSuccess; - bool lsaIsLocal = true; - for (int i=0; i < comm->localRanks; i++) { - lsaIsLocal &= comm->localRankToRank[i] == comm->localRankToRank[0] + i; + // LSA needs to be the same size for all ranks, and it needs to represent + // a consecutive set of ranks. + int lsaSize = 0; + int nodeSize = 1; + for (int r=1; r < comm->nRanks; r++) { + if (comm->rankToNode[r] == comm->rankToNode[r-1]) { + nodeSize += 1; + } else { + lsaSize = gcd(lsaSize, nodeSize); + nodeSize = 1; + } } - devr->lsaSelf = lsaIsLocal ? comm->localRank : 0; - devr->lsaSize = lsaIsLocal ? comm->localRanks : 1; + lsaSize = gcd(lsaSize, nodeSize); + devr->lsaSize = lsaSize; + devr->lsaSelf = comm->rank % lsaSize; devr->lsaRankList = (int*)malloc(devr->lsaSize*sizeof(int)); for (int i=0; i < devr->lsaSize; i++) { devr->lsaRankList[i] = comm->rank + (i - devr->lsaSelf); @@ -83,7 +95,7 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) { } devr->bigSize = alignUp(devr->bigSize, size_t(1)<<32); INFO(NCCL_INIT, "Symmetric VA size=%ldGB", (long)devr->bigSize>>30); - + ncclSpaceConstruct(&devr->bigSpace); ncclShadowPoolConstruct(&devr->shadows); return ncclSuccess; @@ -103,7 +115,7 @@ ncclResult_t ncclDevrFinalize(struct ncclComm* comm) { struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&devr->regTaskQueue); free(task); } - + symTeamDestroyAll(comm); { // delete windowTable cudaStream_t stream; @@ -336,11 +348,17 @@ static void symTeamDestroyAll(struct ncclComm* comm) { } } +static ncclResult_t symMemoryRegisterGin(struct ncclComm* comm, struct ncclDevrMemory* mem) { + NCCLCHECK(ncclGinConnectOnce(comm)); + NCCLCHECK(ncclGinRegister(comm, mem->primaryAddr, mem->size, mem->ginHostWins, mem->ginDevWins)); + return ncclSuccess; +} + // On success we take caller's reference on memHandle. // Due to multicast binds for each pre-exiting team, this function requires // caller do a world barrier before returning to user. static ncclResult_t symMemoryObtain( - struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, size_t size, + struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, void* memAddr, size_t size, struct ncclDevrMemory** outMem ) { ncclResult_t ret = ncclSuccess; @@ -355,12 +373,14 @@ static ncclResult_t symMemoryObtain( } mem = mem->next; } + // New memory. mem = (struct ncclDevrMemory*)malloc(sizeof(struct ncclDevrMemory)); mem->refCount = 0; mem->memHandle = memHandle; + mem->primaryAddr = memAddr; mem->size = size; - + // Grab offset in the big space. NCCLCHECKGOTO(ncclSpaceAlloc(&devr->bigSpace, devr->bigSize, size, devr->granularity, &bigOffset), ret, fail_mem); mem->bigOffset = bigOffset; @@ -368,10 +388,20 @@ static ncclResult_t symMemoryObtain( // Map unicast addresses into flat VA space for lsa team. NCCLCHECKGOTO(symMemoryMapLsaTeam(comm, memHandle, size, bigOffset), ret, fail_mem_space); + // If our caller doesn't have a VA then we'll use the LSA mapping. + if (mem->primaryAddr == nullptr) { + mem->primaryAddr = (char*)devr->lsaFlatBase + devr->lsaSelf*devr->bigSize + mem->bigOffset; + } + // Bind new memory with each existing team. 
for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { NCCLCHECKGOTO(symBindTeamMemory(comm, t, mem), ret, fail_mem_space_teams); } + + if (devr->ginEnabled) { + NCCLCHECKGOTO(symMemoryRegisterGin(comm, mem), ret, fail_mem_space_teams); + } + // Add to list of mems. mem->next = devr->memHead; devr->memHead = mem; @@ -398,6 +428,9 @@ static void symMemoryDropRef( ) { if (mem != nullptr && 0 == --mem->refCount) { struct ncclDevrState* devr = &comm->devrState; + if (devr->ginEnabled) { + ncclGinDeregister(comm, mem->ginHostWins); + } for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { symUnbindTeamMemory(comm, t, mem); } @@ -461,18 +494,22 @@ static ncclResult_t symWindowCreate( winDevHost->lsaRank = devr->lsaSelf; winDevHost->worldRank = comm->rank; winDevHost->winHost = (void*)win; + winDevHost->ginOffset4K = memOffset>>12; + for (int i=0; i < NCCL_GIN_MAX_CONTEXTS; i++) { + winDevHost->ginWins[i] = mem->ginDevWins[i]; + } CUDACHECK(cudaMemcpyAsync(winDev, winDevHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream)); NCCLCHECK(symWindowTableInitOnce(comm, stream)); // ensure devr->windowTable exists struct ncclDevCommWindowTable* tableDev = devr->windowTable; - struct ncclDevCommWindowTable* tableHost; - NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)); while (true) { + struct ncclDevCommWindowTable* tableHost; + NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)); int i = 0; while (i < 32 && tableHost->entries[i].window != nullptr) i += 1; if (i < 32) { tableHost->entries[i].base = userAddr; - tableHost->entries[i].size = userAddr + userSize; + tableHost->entries[i].size = userSize; tableHost->entries[i].window = winDev; CUDACHECK(cudaMemcpyAsync(&tableDev->entries[i], &tableHost->entries[i], sizeof(tableHost->entries[i]), cudaMemcpyHostToDevice, stream)); break; @@ -482,7 +519,6 @@ static ncclResult_t symWindowCreate( CUDACHECK(cudaMemcpyAsync(&tableDev->next, &tableHost->next, sizeof(tableHost->next), cudaMemcpyHostToDevice, stream)); } tableDev = tableHost->next; - NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost)); } { // insert into winSorted[] @@ -511,9 +547,9 @@ static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vi symMemoryDropRef(comm, winHost->memory); { struct ncclDevCommWindowTable* tableDev = devr->windowTable; - struct ncclDevCommWindowTable* tableHost; - NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted); while (true) { + struct ncclDevCommWindowTable* tableHost; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted); int i = 0; while (i < 32 && tableHost->entries[i].window != winDev) i += 1; if (i < 32) { @@ -523,7 +559,6 @@ static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vi } if (tableHost->next == nullptr) break; // Error didn't find window in table tableDev = tableHost->next; - NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost), ret, remove_winSorted); } } NCCLCHECKGOTO(ncclShadowPoolFree(&devr->shadows, winDev, stream), ret, remove_winSorted); @@ -578,7 +613,7 @@ ncclResult_t ncclDevrWindowRegisterInGroup( CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, reinterpret_cast(memAddr)), ret, fail_locReg); // Trade cumem handle for ncclDevrMemory* - NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, memSize, &mem), ret, fail_locReg_memHandle); + NCCLCHECKGOTO(symMemoryObtain(comm, 
memHandle, (void*)memAddr, memSize, &mem), ret, fail_locReg_memHandle); memHandle = 0x0; // symMemoryObtain took our reference CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); @@ -587,7 +622,7 @@ ncclResult_t ncclDevrWindowRegisterInGroup( comm, mem, memOffset, userPtr, userSize, winFlags, localRegHandle, outWinDev, nullptr, stream ), ret, fail_locReg_memHandle_mem_stream); mem = nullptr; // symWindowCreate took our reference - + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_locReg_memHandle_mem_stream_win); // symWindowCreate needs barrier. @@ -679,15 +714,35 @@ ncclResult_t ncclDevrCommCreateInternal( struct ncclDevrState* devr = &comm->devrState; struct ncclTeam world = ncclTeamWorld(comm); struct ncclTeam lsa = ncclTeamInnerFactor(world, devr->lsaSize); + bool ginActivated = false; struct ncclDevrTeam* tmLsa; size_t bufSizeTotal; + int nGinContexts = 0; + int ginSignalTotal = 0, ginCounterTotal = 0; struct ncclDevResourceRequirements* resReqsHead; struct ncclDevResourceRequirements lsaBarReq; cudaStream_t stream = nullptr; + struct ncclDevResourceRequirements railGinBarrierReq; CUmemGenericAllocationHandle memHandle = 0x0; struct ncclDevrMemory* mem = nullptr; struct ncclDevrWindow* win = nullptr; struct ncclWindow_vidmem* winHost = nullptr; + size_t ginSignalShadowsOffset = 0; + + if (comm->nNodes > 1 || reqs->ginForceEnable || reqs->ginCounterCount != 0 || reqs->ginSignalCount != 0) { + ginActivated = !devr->ginEnabled; + devr->ginEnabled = true; + } + + if (ginActivated) { + NCCLCHECKGOTO(ncclGinConnectOnce(comm), ret, fail); + // Register all preexisting memories with GIN. Update the windows later when + // we have a stream. + for (struct ncclDevrMemory* mem = devr->memHead; mem != nullptr; mem = mem->next) { + NCCLCHECKGOTO(symMemoryRegisterGin(comm, mem), ret, fail); + } + } + if (devr->ginEnabled) nGinContexts = comm->sharedRes->ginState.ginCommCount; memset(outDevComm, 0, sizeof(*outDevComm)); outDevComm->rank = comm->rank; @@ -713,25 +768,52 @@ ncclResult_t ncclDevrCommCreateInternal( resReqsHead = reqs->resourceRequirementsList; - ncclLsaBarrierCreateRequirement(lsa, reqs->lsaBarrierCount, &outDevComm->lsaBarrier, &lsaBarReq); + ncclLsaBarrierCreateRequirement(lsa, std::max(reqs->barrierCount, reqs->lsaBarrierCount), &outDevComm->lsaBarrier, &lsaBarReq); lsaBarReq.next = resReqsHead; resReqsHead = &lsaBarReq; + ncclGinBarrierCreateRequirement(comm, ncclTeamRail(comm), std::max(reqs->barrierCount, reqs->railGinBarrierCount), &outDevComm->railGinBarrier, &railGinBarrierReq); + railGinBarrierReq.next = resReqsHead; + resReqsHead = &railGinBarrierReq; + { struct ncclDevResourceRequirements* rr = resReqsHead; bufSizeTotal = 0; + ginSignalTotal = reqs->ginSignalCount; + ginCounterTotal = reqs->ginCounterCount; while (rr != nullptr) { bufSizeTotal = alignUp(bufSizeTotal, std::max(128, rr->bufferAlign)); if (rr->outBufferHandle != nullptr) *rr->outBufferHandle = bufSizeTotal/128; + if (rr->outGinSignalStart != nullptr) *rr->outGinSignalStart = ginSignalTotal; + if (rr->outGinCounterStart != nullptr) *rr->outGinCounterStart = ginCounterTotal; bufSizeTotal += rr->bufferSize; + ginSignalTotal += rr->ginSignalCount; + ginCounterTotal += rr->ginCounterCount; rr = rr->next; } + bufSizeTotal= alignUp(bufSizeTotal, 128); + ginSignalShadowsOffset = bufSizeTotal; + bufSizeTotal += nGinContexts*ginSignalTotal*sizeof(uint64_t); // include signal shadows bufSizeTotal = alignUp(bufSizeTotal, devr->granularity); } 
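// --- Illustrative aside (not part of the patch): a hedged sketch of the
// resource-requirement walk above. Each requirement receives a slice of one
// large device buffer, aligned to max(128, bufferAlign), and its "handle" is
// simply the slice offset expressed in 128-byte units. The names Req,
// alignUpTo and accumulateResources are hypothetical and exist only here.
#include <algorithm>
#include <cstddef>

struct Req {
  size_t bufferSize;
  size_t bufferAlign;
  size_t* outBufferHandle;  // receives offset/128 for this slice
  Req* next;
};

static size_t alignUpTo(size_t x, size_t a) { return (x + a - 1) / a * a; }

static size_t accumulateResources(Req* head, size_t granularity) {
  size_t total = 0;
  for (Req* r = head; r != nullptr; r = r->next) {
    total = alignUpTo(total, std::max<size_t>(128, r->bufferAlign));
    if (r->outBufferHandle != nullptr) *r->outBufferHandle = total / 128;
    total += r->bufferSize;
  }
  // Round the whole pool up to the allocation granularity, as the code above
  // does with devr->granularity before cuMemCreate.
  return alignUpTo(total, granularity);
}
// --- End of illustrative aside.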
CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); - NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail); // ensure devr->windowTable exists - outDevComm->windowTable = comm->devrState.windowTable; + if (ginActivated) { + // Now update the GIN handles in all existing windows. Registration of memories happened above. + for (int i=0; i < devr->winSortedCount; i++) { + struct ncclDevrWindow* win = devr->winSorted[i].win; + struct ncclWindow_vidmem* winHost; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, win->vidmem, &winHost), ret, fail_stream); + winHost->ginOffset4K = (win->bigOffset - win->memory->bigOffset)>>12; + for (int i=0; i < NCCL_GIN_MAX_CONTEXTS; i++) { + winHost->ginWins[i] = win->memory->ginDevWins[i]; + } + CUDACHECKGOTO(cudaMemcpyAsync(win->vidmem, winHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream), ret, fail_stream); + } + } + + NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail_stream); // ensure devr->windowTable exists + outDevComm->windowTable = devr->windowTable; if (bufSizeTotal == 0) { outDevComm->resourceWindow = nullptr; @@ -741,45 +823,65 @@ ncclResult_t ncclDevrCommCreateInternal( memProp.type = CU_MEM_ALLOCATION_TYPE_PINNED; memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; memProp.requestedHandleTypes = ncclCuMemHandleType; + // We have to assume that if GIN is possible it might be requested in the future, + // even on single node. + memProp.allocFlags.gpuDirectRDMACapable = comm->sharedRes->ginState.ncclGin != nullptr ? 1 : 0; memProp.location.id = comm->cudaDev; - CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail); + CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail_stream); - NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, bufSizeTotal, &mem), ret, fail); + NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, NULL, bufSizeTotal, &mem), ret, fail_stream_mem); memHandle = 0x0; // Reference given to symMemoryObtain NCCLCHECKGOTO(symWindowCreate( // Requires world barrier afterward. 
comm, mem, /*memOffset=*/0, nullptr, bufSizeTotal, /*winFlags=*/0, /*localReg=*/nullptr, &outDevComm->resourceWindow, &win, - stream), ret, fail); + stream), ret, fail_stream_mem); mem = nullptr; // Reference given to symWindowCreate - NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, win->vidmem, &winHost), ret, fail); + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, win->vidmem, &winHost), ret, fail_stream_mem_win); outDevComm->resourceWindow_inlined = *winHost; + outDevComm->ginSignalShadows = (uint64_t*)add4G((char*)winHost->lsaFlatBase + ginSignalShadowsOffset, winHost->lsaRank*winHost->stride4G); - CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail); + CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail_stream_mem_win); } - CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail); + if (devr->ginEnabled) { + outDevComm->ginContextCount = nGinContexts; + outDevComm->ginSignalCount = ginSignalTotal; + outDevComm->ginCounterCount = ginCounterTotal; + NCCLCHECKGOTO(ncclGinAllocSignalsCounters(comm, + ginSignalTotal, &outDevComm->ginSignalBase, + ginCounterTotal, &outDevComm->ginCounterBase + ), ret, fail_stream_mem_win); + + for (int ctx=0; ctx < nGinContexts; ctx++) { + outDevComm->ginTypes[ctx] = (int)comm->sharedRes->ginState.ginDevHandles[ctx]->netDeviceType; + outDevComm->ginHandles[ctx] = comm->sharedRes->ginState.ginDevHandles[ctx]->handle; + } + } - NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail); + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_stream_mem_win_signals); - cudaStreamDestroy(stream); + NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail_stream_mem_win_signals); + CUDACHECKGOTO(cudaStreamDestroy(stream), ret, fail_stream_mem_win_signals); return ret; -fail: - if (win != nullptr) { - symWindowDestroy(comm, win->vidmem, stream); - cudaStreamSynchronize(stream); - } - if (mem != nullptr) { - symMemoryDropRef(comm, mem); - } - if (memHandle != 0x0) { - CUCHECKIGNORE(cuMemRelease(memHandle)); - } - if (stream != nullptr) { - cudaStreamDestroy(stream); +fail_stream_mem_win_signals: + if (devr->ginEnabled) { + ncclGinFreeSignalsCounters(comm, + outDevComm->ginSignalBase, outDevComm->ginSignalCount, + outDevComm->ginCounterBase, outDevComm->ginCounterCount + ); } +fail_stream_mem_win: + symWindowDestroy(comm, win->vidmem, stream); + cudaStreamSynchronize(stream); +fail_stream_mem: + if (memHandle != 0x0) { CUCHECKIGNORE(cuMemRelease(memHandle)); } + symMemoryDropRef(comm, mem); +fail_stream: + cudaStreamDestroy(stream); +fail: return ret; } @@ -905,7 +1007,13 @@ NCCL_API(ncclResult_t, ncclDevCommDestroy, ncclComm_t comm, ncclDevComm_t const* ncclResult_t ncclDevCommDestroy( struct ncclComm* comm, struct ncclDevComm const* devComm ) { - //struct ncclDevrState* devr = &comm->devrState; + struct ncclDevrState* devr = &comm->devrState; + if (devr->ginEnabled) { + ncclGinFreeSignalsCounters(comm, + devComm->ginSignalBase, devComm->ginSignalCount, + devComm->ginCounterBase, devComm->ginCounterCount + ); + } if (devComm->resourceWindow != nullptr) { NCCLCHECK(ncclCommWindowDeregister(comm, devComm->resourceWindow)); } @@ -920,7 +1028,7 @@ ncclResult_t ncclDevrGetLsaRankPtr(struct ncclComm* comm, struct ncclDevrWindow* } struct ncclDevrState* devr = &comm->devrState; - + // Validate lsaRank is within bounds if (lsaRank < 0 || lsaRank >= devr->lsaSize) { return ncclInvalidArgument; @@ -949,7 +1057,7 @@ ncclResult_t 
ncclDevrGetLsaTeamPtrMC(struct ncclComm* comm, struct ncclDevrWindo bool multimem = true; struct ncclDevrTeam* tm; NCCLCHECK(symTeamObtain(comm, lsaTeam, multimem, &tm)); - + // Return the base multicast address for this team with offset *outPtr = (void*)((uintptr_t)tm->mcBasePtr + winHost->bigOffset + offset); return ncclSuccess; diff --git a/src/device/CMakeLists.txt b/src/device/CMakeLists.txt index 98447428d..acaa9b65d 100644 --- a/src/device/CMakeLists.txt +++ b/src/device/CMakeLists.txt @@ -50,9 +50,9 @@ set_target_properties(nccl_device PROPERTIES # Set include directories for the target target_include_directories(nccl_device PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_BINARY_DIR}/include ${CMAKE_SOURCE_DIR}/src/include ${CMAKE_SOURCE_DIR}/src/include/plugin - ${CMAKE_BINARY_DIR}/include ${CUDAToolkit_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIRS}/cccl ) diff --git a/src/device/Makefile b/src/device/Makefile index fd8f2759d..cf0fa0637 100644 --- a/src/device/Makefile +++ b/src/device/Makefile @@ -23,12 +23,13 @@ INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include -I../include/plugin NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" CXXFLAGS += $(INCFLAGS) -NVCUFLAGS_SYM := -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all +NVCUFLAGS_SYM += -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all NVCUFLAGS_SYM += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1 +COMPILE.kernel = $(NVCC) $(NVCUFLAGS) -dw $2 -o $1 COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1 define COMPILE @$(SAY) "Compiling" $2;\ diff --git a/src/device/generate.py b/src/device/generate.py index aefba9422..4b081924e 100755 --- a/src/device/generate.py +++ b/src/device/generate.py @@ -269,7 +269,7 @@ def validate(coll, redop, ty, algo, proto): # List of all kernel function pointers. out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs)) - out("extern void* const ncclDevKernelList[] = {\n") + out("void* ncclDevKernelList[] = {\n") index = 0 for kfn in kernel_funcs: cudart, _ = required_cuda(*kfn) @@ -281,6 +281,14 @@ def validate(coll, redop, ty, algo, proto): out("nullptr};\n") out("\n") + out("int ncclDevKernelRequirements[] = {\n") + for index,kfn in enumerate(kernel_funcs): + cudart,_ = required_cuda(*kfn) + sym = paste("_", "ncclDevKernel", *kfn) + out(" %7d, /*%4d %s*/\n" % (cudart or 0, index, sym)); + out("};\n") + out("\n") + # Maps primary id to kernel function pointer. 
out("extern void* const ncclDevKernelForFunc[] = {\n") index = 0 diff --git a/src/device/network/unpack/unpack.h b/src/device/network/unpack/unpack.h index 941b4328d..7ecad5cee 100644 --- a/src/device/network/unpack/unpack.h +++ b/src/device/network/unpack/unpack.h @@ -248,7 +248,7 @@ inline __device__ void ncclNetDeviceUnpackInner( for (int x = 0; x < iter_meta_cnt; x++) { int meta_idx = x + w * PPW; - + // load page offs loadShmem128(shmemCvtPtr((uint64_t*) (s_meta + meta_idx)), meta.r64[0], meta.r64[1]); diff --git a/src/device/reduce_kernel.h b/src/device/reduce_kernel.h index d36dfe5a7..16a9ed5d9 100644 --- a/src/device/reduce_kernel.h +++ b/src/device/reduce_kernel.h @@ -778,7 +778,7 @@ struct FuncSumPostDiv { using UintType = typename std::conditional::type; uint32_t divisor:31, isSigned:1; UintType recip; - + __device__ __forceinline__ FuncSumPostDiv(uint64_t opArg=0) { isSigned = opArg & 1; divisor = opArg >> 1; diff --git a/src/device/symmetric/all_gather.cuh b/src/device/symmetric/all_gather.cuh index 9f050836c..98a755127 100644 --- a/src/device/symmetric/all_gather.cuh +++ b/src/device/symmetric/all_gather.cuh @@ -352,7 +352,7 @@ static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* char* blockInput = input.localPtr(); char* blockOutput = output.localPtr(); - uint32_t lowBits = nElts; + uint32_t lowBits = nAllElts; lowBits |= (uintptr_t)blockInput; lowBits |= (uintptr_t)blockOutput; if (__builtin_expect(lowBits%8 == 0, true)) { diff --git a/src/device/symmetric/generate.py b/src/device/symmetric/generate.py index 8e62bda5b..972e5ca93 100755 --- a/src/device/symmetric/generate.py +++ b/src/device/symmetric/generate.py @@ -249,12 +249,20 @@ def partition(vals, keyfn): emitln(f, '') emitln(f, 'extern int const ncclSymkKernelCount = %d;' % len(list(enumerate_kernels()))) - emitln(f, 'extern void* const ncclSymkKernelList[] = {') + emitln(f, 'void* ncclSymkKernelList[] = {') for k in enumerate_kernels(): emitln(f, '(void*){cname},'.format(cname=kernel_cname(k))) emitln(f, 'nullptr};') emitln(f, '') + emitln(f, 'int ncclSymkKernelRequirements[] = {') + for index,k in enumerate(enumerate_kernels()): + cudart, _, _ = required_cuda(k) + sym = kernel_cname(k) + emitln(f, ' %7d, /*%4d %s*/' % (cudart or 0, index, sym)); + emitln(f, '};') + emitln(f, '') + emitln(f, 'void* ncclSymkGetKernelPtr(ncclSymkKernelId id, int red, ncclDataType_t ty) {') indents += 1 emitln(f, 'switch (id) {') diff --git a/src/device/symmetric/primitives.cuh b/src/device/symmetric/primitives.cuh index 73305d54c..dfdde0e50 100644 --- a/src/device/symmetric/primitives.cuh +++ b/src/device/symmetric/primitives.cuh @@ -56,13 +56,14 @@ struct ncclSymkArgsHandler { workLo++; fracLo = 0; } - struct ncclSymkDevWork const& dw = devWork[workLo]; - indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; + struct ncclSymkDevWork const& dwLo = devWork[workLo]; + indexLo = ((fracLo * divUp(dwLo.nElts, EltPerCell)) >> 16) * EltPerCell; // Where the work ends workHi = channelWorkRange[block].workHi; fracHi = channelWorkRange[block].fracHi + 1; - indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); + struct ncclSymkDevWork const& dwHi = devWork[workHi]; + indexHi = min(((fracHi * divUp(dwHi.nElts, EltPerCell)) >> 16) * EltPerCell, dwHi.nElts); } template @@ -78,7 +79,7 @@ struct ncclSymkArgsHandler { lastBlock = dw.sChannelId+dw.nChannels-1; // Where the work begins - fracLo = (dw.sChannelId==0) ? 
0 : ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF); + fracLo = (dw.sChannelId>0 && channelWorkRange[dw.sChannelId-1].workHi == w) ? ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF) : 0; indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; fracHi = (channelWorkRange[lastBlock].workHi == w) ? channelWorkRange[lastBlock].fracHi + 1 : 0x10000; indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); @@ -91,16 +92,16 @@ struct ncclSymkArgsHandler { getWorkRange(blockIdx.x, workLo, indexLo, workHi, indexHi); - size_t currentIndexLo = indexLo; #pragma unroll 1 for (int w = workLo; w <= workHi; w++) { struct ncclSymkDevWork const& dw = devWork[w]; size_t const& nAllElts = dw.nElts; - size_t currentIndexHi; + size_t currentIndexLo, currentIndexHi; int block, nBlocks; if (blockIdx.x >= dw.sChannelId && blockIdx.x < dw.sChannelId + dw.nChannels) { getWorkRangeFused(blockIdx.x, w, block, nBlocks, currentIndexLo, currentIndexHi); } else { + currentIndexLo = (w > workLo) ? 0 : indexLo; currentIndexHi = (w < workHi) ? nAllElts : indexHi; block = 0; nBlocks = 1; diff --git a/src/device/symmetric/reduce_scatter.cuh b/src/device/symmetric/reduce_scatter.cuh index 8f79b3990..850960845 100644 --- a/src/device/symmetric/reduce_scatter.cuh +++ b/src/device/symmetric/reduce_scatter.cuh @@ -241,7 +241,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(ncclSymkDevWorkArgs threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); int tn = nBlocks*blockDim.x; - reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nElts, output, nElts); + reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nAllElts, output, nElts); waitNeeded = false; } @@ -323,7 +323,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(ncclSymkDevWorkAr threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); int tn = nBlocks*blockDim.x; - reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nElts, output.localPtr(), nElts); + reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nAllElts, output.localPtr(), nElts); } ); @@ -402,7 +402,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(ncclSymkDevWorkArgs T* input = (T*)inputPtr.localPtr(); T* output = (T*)outputPtr.localPtr(); - uint32_t lowBits = nElts*sizeof(T); + uint32_t lowBits = nAllElts*sizeof(T); lowBits |= (uintptr_t)input; lowBits |= (uintptr_t)output; if (__builtin_expect(lowBits%8 == 0, true)) { diff --git a/src/enqueue.cc b/src/enqueue.cc index 00a0ef8da..da45abe6f 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -32,16 +32,28 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma int carveout = ncclParamL1SharedMemoryCarveout(); int ncclMaxSharedMem = ncclShmemDynamicSize(cudaArch); + int driverVersion; + NCCLCHECK(ncclCudaDriverVersion(&driverVersion)); + for (int sym=0; sym <= 1; sym++) { int kcount = sym==0 ? ncclDevKernelCount : ncclSymkKernelCount; - void* const* kptrs = sym==0 ? ncclDevKernelList : ncclSymkKernelList; + void** kptrs = sym==0 ? ncclDevKernelList : ncclSymkKernelList; + int* krequires = sym==0 ? ncclDevKernelRequirements : ncclSymkKernelRequirements; for (int k=0; k < kcount; k++) { + if (kptrs[k] != nullptr && driverVersion < krequires[k]) { + INFO(NCCL_INIT, "Skipping %skernel %d which requires driver %d", + sym ? 
"symmetric " : "", k, krequires[k]); + kptrs[k] = nullptr; + } void* fn = kptrs[k]; cudaFuncAttributes attr = {0}; if (fn == nullptr) continue; cudaError_t errcode = cudaFuncGetAttributes(&attr, fn); - if (errcode != cudaSuccess) continue; // Silently ignore failures + if (errcode != cudaSuccess) { + cudaGetLastError(); // Drain error code + continue; // Silently ignore failures + } if (maxStackSize) { if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; } @@ -116,9 +128,14 @@ static void addWorkBatchToPlan( // batch further down. newBatch |= NCCL_MAX_DEV_WORK_BATCH_BYTES < chan->wipBatch.workBytes + workSize; if (workType == ncclDevWorkTypeP2p) { + // We only allow NCCL_MAX_DEV_WORK_P2P_PER_BATCH ops per batch. newBatch |= chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH; - for (int i=0; i < chan->wipBatch.nP2ps; i++) { + for (int i = 0; i < chan->wipBatch.nP2ps; i++) { + // Do not allow the same round twice in the same batch. newBatch |= p2pRound == chan->wipBatch.p2pRounds[i]; + // Make sure we only aggregate p2p operations within the same p2p round epoch (one epoch is NCCL_MAX_DEV_WORK_P2P_PER_BATCH ops). + // This enforces uniform batching accross ranks in the communicator and prevents hangs. + newBatch |= (p2pRound / NCCL_MAX_DEV_WORK_P2P_PER_BATCH) != (chan->wipBatch.p2pRounds[i] / NCCL_MAX_DEV_WORK_P2P_PER_BATCH); } } } @@ -2447,7 +2464,7 @@ static ncclResult_t collTaskAppend( NCCLCHECK(ncclProfilerStartGroupApiEvent(info, isGraphCaptured)); NCCLCHECK(ncclProfilerRecordGroupApiEventState(ncclProfilerGroupStartApiStop)); NCCLCHECK(ncclProfilerStartCollApiEvent(info, isGraphCaptured)); - + struct ncclTaskColl* t = ncclMemoryPoolAlloc(&comm->memPool_ncclTaskColl, &comm->memPermanent); t->func = info->coll; t->sendbuff = info->sendbuff; @@ -2484,7 +2501,7 @@ static ncclResult_t ceCollTaskAppend( struct ncclDevrWindow* recvWin, struct ncclDevRedOpFull opDev) { struct ncclKernelPlanner *planner = &comm->planner; - + // Check if CE needs initialization if (comm->ceColl.baseUCSymReadyPtr == NULL && ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { struct ncclCeInitTask* ceTask; @@ -2558,7 +2575,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { ncclDevrFindWindow(comm, info->sendbuff, &sendWin); ncclDevrFindWindow(comm, info->recvbuff, &recvWin); bool ceImplemented = ncclCeImplemented(info->coll, info->op, info->datatype); - + // Append CE collective task if CE is supported and requested by user if (comm->symmetricSupport && comm->nNodes == 1 && sendWin && recvWin && (sendWin->winFlags & recvWin->winFlags & NCCL_WIN_COLL_SYMMETRIC) && comm->config.CTAPolicy == NCCL_CTA_POLICY_ZERO && ceImplemented) { NCCLCHECK(ceCollTaskAppend(comm, info, sendWin, recvWin, opDev)); @@ -2601,16 +2618,21 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { } ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + // Early-out on invalid or revoked communicator + ncclResult_t ret = CommCheck(info->comm, info->opName, "comm"); + if (ret != ncclSuccess) return ncclGroupErrCheck(ret); + if (info->comm->revokedFlag) { + WARN("%s: communicator was revoked", info->opName); + return ncclGroupErrCheck(ncclInvalidUsage); + } // Profiler - If a group API event has already started, update the profilerGroupDepth so that the depth // updates correctly for implicit ncclGroupStartInternal and ncclGroupEndInternal calls if (ncclProfilerApiState.profilerGroupDepth > 0) { ncclProfilerApiState.profilerGroupDepth++; } 
NCCLCHECK(ncclGroupStartInternal()); - ncclResult_t ret = ncclSuccess; + ret = ncclSuccess; int devOld = -1; - - NCCLCHECKGOTO(CommCheck(info->comm, info->opName, "comm"), ret, fail); // Check whether communicator is ready to communicate NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail); diff --git a/src/gin/CMakeLists.txt b/src/gin/CMakeLists.txt new file mode 100644 index 000000000..e20d7ddf3 --- /dev/null +++ b/src/gin/CMakeLists.txt @@ -0,0 +1,8 @@ +# Gin sources +set(GIN_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/gin_host.cc + ${CMAKE_CURRENT_SOURCE_DIR}/gin_host_proxy.cc +) + +# Add gin sources to parent scope +set(GIN_SOURCES ${GIN_SOURCES} PARENT_SCOPE) diff --git a/src/gin/gin_host.cc b/src/gin/gin_host.cc new file mode 100644 index 000000000..b42f88fde --- /dev/null +++ b/src/gin/gin_host.cc @@ -0,0 +1,277 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "param.h" +#include "graph.h" +#include "transport.h" +#include "register_inline.h" +#include "gin/gin_host.h" +#include "gin/gin_host_proxy.h" + +NCCL_PARAM(GinEnable, "GIN_ENABLE", 1); +NCCL_PARAM(GinType, "GIN_TYPE", -1); +NCCL_PARAM(GinSignalPoolSize, "GIN_SIGNAL_POOL_SIZE", 64 << 10); +NCCL_PARAM(GinCounterPoolSize, "GIN_COUNTER_POOL_SIZE", 64 << 10); + +void* ncclGinProgress(void* ginState_) { + struct ncclGinState* ginState = (struct ncclGinState*)ginState_; + while (1) { + pthread_mutex_lock(&ginState->threadLock); + if (ginState->ginProgress == 1) { + pthread_mutex_unlock(&ginState->threadLock); + for (int n=0; nginCommCount; n++) { + ncclResult_t ret; + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + ret = ncclGinProxyProgress(ginState->ncclGin, ginState->ginCtx[n]); + } else { + ret = ginState->ncclGin->ginProgress(ginState->ginComms[n]); + } + if (ret != ncclSuccess) { + __atomic_store_n(&ginState->asyncResult, ret, __ATOMIC_RELEASE); + INFO(NCCL_ALL,"%s:%d -> %d [GIN Progress Thread]", __FILE__, __LINE__, ret); + ginState->ginProgress = -2; + return NULL; + } + } + sched_yield(); + } else if (ginState->ginProgress == -1) { + pthread_mutex_unlock(&ginState->threadLock); + return NULL; + } else if (ginState->ginProgress == 0) { + pthread_cond_wait(&ginState->threadCond, &ginState->threadLock); + pthread_mutex_unlock(&ginState->threadLock); + } else { + pthread_mutex_unlock(&ginState->threadLock); + INFO(NCCL_ALL,"%s:%d -> [GIN Progress Thread] state unknown %d", __FILE__, __LINE__, ginState->ginProgress); + ginState->ginProgress = -2; + return NULL; + } + } +} + +NCCL_PARAM(GinNcontexts, "GIN_NCONTEXTS", NCCL_GIN_MAX_CONTEXTS); + +ncclResult_t ncclGinConnectOnce(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + struct ncclGinState* ginState = &comm->sharedRes->ginState; + if (ginState->ncclGin == NULL) { + WARN("GIN not supported."); + return ncclInvalidUsage; + } + if (ncclParamGinEnable() == 0) { + WARN("GIN is disabled."); + return ncclInternalError; + } + if (ginState->connected) return ncclSuccess; + + NCCLCHECK(ginState->ncclGin->init(&ginState->ginInstance, comm->commHash, ncclDebugLog)); + + int ndev = 0; + NCCLCHECK(ginState->ncclGin->devices(&ndev)); + if (ndev <= 0) { + WARN("No GIN-capable devices found."); + return ncclInternalError; + } + + ncclNetProperties_t props; + NCCLCHECK(ginState->ncclGin->getProperties(0, &props)); + 
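// --- Illustrative aside (not part of the patch): a hedged sketch of the
// progress-thread handshake used by ncclGinProgress above. The shared state
// means 1 = poll the network, 0 = sleep on the condition variable until woken,
// -1 = exit; the main thread flips the state under the same mutex and signals
// the condition variable. ProgressCtl and progressLoop are hypothetical names.
#include <pthread.h>
#include <sched.h>

struct ProgressCtl {
  pthread_mutex_t lock;
  pthread_cond_t cond;
  int state;               // 1 = poll, 0 = sleep, -1 = exit
};

static void progressCtlInit(ProgressCtl* ctl, int initialState) {
  pthread_mutex_init(&ctl->lock, nullptr);
  pthread_cond_init(&ctl->cond, nullptr);
  ctl->state = initialState;
}

static void* progressLoop(void* arg) {
  ProgressCtl* ctl = (ProgressCtl*)arg;
  while (true) {
    pthread_mutex_lock(&ctl->lock);
    if (ctl->state == -1) { pthread_mutex_unlock(&ctl->lock); return nullptr; }
    if (ctl->state == 0) {
      pthread_cond_wait(&ctl->cond, &ctl->lock);  // re-acquires the lock on wakeup
      pthread_mutex_unlock(&ctl->lock);
      continue;                                   // re-check the state before polling
    }
    pthread_mutex_unlock(&ctl->lock);
    // ... poll every GIN context here, as ncclGinProgress does above ...
    sched_yield();  // yield between polling passes so other host threads can run
  }
}
// Shutdown mirrors ncclGinFinalize: take the lock, set the state to -1, signal
// the condition variable, unlock, then pthread_join the progress thread.
// --- End of illustrative aside.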
ginState->ginType = props.netDeviceType; + if ((ncclParamGinType() != -1) && (ginState->ginType != ncclParamGinType())) { + WARN("GIN-capable device type mismatch."); + return ncclInternalError; + } + + int nLocalNets; + int64_t localNets[NCCL_TOPO_MAX_NODES]; + NCCLCHECK(ncclTopoGetLocalNets(comm->topo, comm->rank, localNets, &nLocalNets)); + + void** handles = NULL; + char* allHandles = NULL; + + ginState->ginCommCount = std::min(NCCL_GIN_MAX_CONTEXTS, ncclParamGinNcontexts()); + + NCCLCHECKGOTO(ncclCalloc(&allHandles, (size_t)comm->nRanks * NCCL_NET_HANDLE_MAXSIZE), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&handles, comm->nRanks), ret, fail); + for (int r = 0; r < comm->nRanks; r++) handles[r] = allHandles + r * NCCL_NET_HANDLE_MAXSIZE; + + ginState->signalSpaceSize = ncclParamGinSignalPoolSize(); + if (ginState->signalSpaceSize < 0 || (1 << 30) <= ginState->signalSpaceSize) { + WARN("NCCL_GIN_SIGNAL_POOL_SIZE has invalid value."); + ginState->signalSpaceSize = 64 << 10; + } + ginState->counterSpaceSize = ncclParamGinCounterPoolSize(); + if (ginState->counterSpaceSize < 0 || (1 << 30) <= ginState->counterSpaceSize) { + WARN("NCCL_GIN_COUNTER_POOL_SIZE has invalid value."); + ginState->counterSpaceSize = 64 << 10; + } + + for (int n = 0; n < ginState->ginCommCount; n++) { + void* listenComm; + NCCLCHECKGOTO( + ginState->ncclGin->listen(ginState->ginInstance, localNets[n%nLocalNets], + allHandles + NCCL_NET_HANDLE_MAXSIZE * comm->rank, &listenComm), + ret, fail); + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allHandles, NCCL_NET_HANDLE_MAXSIZE), ret, + fail); + NCCLCHECKGOTO(ginState->ncclGin->connect(comm->ginContext, handles, comm->nRanks, comm->rank, + listenComm, ginState->ginComms + n), + ret, fail); + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + NCCLCHECKGOTO(ncclGinProxyCreateContext(comm, ginState->ginComms[n], localNets[n%nLocalNets], + ginState->signalSpaceSize, ginState->counterSpaceSize, + &ginState->ginCtx[n], &ginState->ginDevHandles[n]), + ret, fail); + } else { + NCCLCHECKGOTO(ginState->ncclGin->createContext( + ginState->ginComms[n], ginState->signalSpaceSize, ginState->counterSpaceSize, + &ginState->ginCtx[n], &ginState->ginDevHandles[n]), + ret, fail); + } + NCCLCHECKGOTO(ginState->ncclGin->closeListen(listenComm), ret, fail); + } + free(handles); + handles = NULL; + free(allHandles); + allHandles = NULL; + + // Check whether we need proxy progress and if so, start / wake up the progress thread. 
+ ginState->needsProxyProgress = 0; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginDevHandles[n]->needsProxyProgress) ginState->needsProxyProgress = 1; + } + if (ginState->needsProxyProgress) { + ginState->ginProgress = 1; + pthread_mutex_init(&ginState->threadLock, NULL); + pthread_cond_init(&ginState->threadCond, NULL); + PTHREADCHECK(pthread_create(&ginState->thread, NULL, ncclGinProgress, ginState), "pthread_create"); + ncclSetThreadName(ginState->thread, "NCCL GIN Progress%2d", comm->cudaDev); + } + + ncclSpaceConstruct(&ginState->counterSpace); + ncclSpaceConstruct(&ginState->signalSpace); + +exit: + if (ret == ncclSuccess) ginState->connected = true; + return ret; +fail: + free(allHandles); + free(handles); + goto exit; +} + +ncclResult_t ncclGinFinalize(struct ncclComm* comm) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + if (!ginState->connected) return ncclSuccess; + + if (ginState->needsProxyProgress) { + pthread_mutex_lock(&ginState->threadLock); + comm->sharedRes->ginState.ginProgress = -1; + pthread_cond_signal(&ginState->threadCond); + pthread_mutex_unlock(&ginState->threadLock); + PTHREADCHECK(pthread_join(ginState->thread, NULL), "pthread_join"); + } + + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginCtx[n] != NULL) { + NCCLCHECK(ncclGinProxyDestroyContext(ginState->ncclGin, ginState->ginCtx[n])); + ginState->ginCtx[n] = NULL; + } + } + } + + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginCtx[n] != NULL) { + NCCLCHECK(ginState->ncclGin->destroyContext(ginState->ginCtx[n])); + ginState->ginCtx[n] = NULL; + } + if (ginState->ginComms[n] != NULL) { + NCCLCHECK(ginState->ncclGin->closeColl(ginState->ginComms[n])); + ginState->ginComms[n] = NULL; + } + } + NCCLCHECK(ginState->ncclGin->finalize(ginState->ginInstance)); + memset(ginState, 0, sizeof(*ginState)); + return ncclSuccess; +} + +ncclResult_t ncclGinRegister(struct ncclComm* comm, void* address, size_t size, + void* ginHostWins[NCCL_GIN_MAX_CONTEXTS], + ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + NCCLCHECK(ncclGinProxyRegister(ginState->ncclGin, ginState->ginCtx[n], address, size, + NCCL_PTR_CUDA, 0, &ginHostWins[n], &ginDevWins[n])); + } else { + NCCLCHECK(ginState->ncclGin->regMrSym(ginState->ginComms[n], address, size, NCCL_PTR_CUDA, 0, + &ginHostWins[n], &ginDevWins[n])); + } + if (ginHostWins[n] == NULL) { + WARN("rank %d - GIN Symmetric register failed: buff %p, size %ld", comm->rank, address, size); + return ncclSystemError; + } + } + return ncclSuccess; +} + +ncclResult_t ncclGinDeregister(struct ncclComm* comm, void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + NCCLCHECK(ncclGinProxyDeregister(ginState->ncclGin, ginState->ginCtx[n], ginHostWins[n])); + } else { + NCCLCHECK(ginState->ncclGin->deregMrSym(ginState->ginComms[n], ginHostWins[n])); + } + } + return ncclSuccess; +} + +ncclResult_t ncclGinAllocSignalsCounters(struct ncclComm* comm, int nSignals, uint32_t* outSignal0, + int nCounters, uint32_t* outCounter0) { + ncclResult_t ret = ncclSuccess; + struct ncclGinState* ginState = &comm->sharedRes->ginState; + int64_t start; 
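  // Note: the two ranges below come from independent pools. Signals are
  // allocated first, then counters; if the counter allocation fails, the
  // signal range is released again via fail_signals so that repeated devcomm
  // create/destroy cycles do not leak entries from the signal pool.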
+ if (nSignals != 0) { + NCCLCHECKGOTO( + ncclSpaceAlloc(&ginState->signalSpace, ginState->signalSpaceSize, nSignals, 1, &start), ret, + fail); + *outSignal0 = (uint32_t)start; + } + if (nCounters != 0) { + NCCLCHECKGOTO( + ncclSpaceAlloc(&ginState->counterSpace, ginState->counterSpaceSize, nCounters, 1, &start), + ret, fail_signals); + *outCounter0 = (uint32_t)start; + } + return ncclSuccess; +fail_signals: + if (nSignals != 0) ncclSpaceFree(&ginState->signalSpace, *outSignal0, nSignals); +fail: + return ret; +} + +ncclResult_t ncclGinFreeSignalsCounters(struct ncclComm* comm, uint32_t signal0, int nSignals, + uint32_t counter0, int nCounters) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + if (nSignals != 0) ncclSpaceFree(&ginState->signalSpace, signal0, nSignals); + if (nCounters != 0) ncclSpaceFree(&ginState->counterSpace, counter0, nCounters); + return ncclSuccess; +} + +ncclResult_t ncclGinQueryLastError(struct ncclGinState* ginState, bool* hasError) { + bool hasError_ = false; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) + NCCLCHECK(ncclGinProxyQueryLastError(ginState->ncclGin, ginState->ginCtx[n], &hasError_)); + else + NCCLCHECK(ginState->ncclGin->queryLastError(ginState->ginCtx[n], &hasError_)); + if (hasError_) break; + } + *hasError = hasError_; + return ncclSuccess; +} diff --git a/src/gin/gin_host_proxy.cc b/src/gin/gin_host_proxy.cc new file mode 100644 index 000000000..511e38b40 --- /dev/null +++ b/src/gin/gin_host_proxy.cc @@ -0,0 +1,501 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "nccl.h" +#include "comm.h" +#include "gin/gin_host.h" +#include "alloc.h" +#include "checks.h" +#include "gdrwrap.h" +#include "plugin/nccl_net.h" +#include "nccl_device/gin/proxy/gin_proxy_device_host_common.h" + +NCCL_PARAM(GinProxyQueueSize, "GIN_PROXY_QUEUE_SIZE", -1); +extern int64_t ncclParamIbDataDirect(); +extern int64_t ncclParamDmaBufEnable(); + +struct ginProxyGfdState { + ncclGinProxyOp_t op; + uint16_t counterId; + int done; + void *request; +}; + +// a member might be on the GPU, if it has a *GdrHandle counterpart +struct ginProxyHostGpuCtx { + size_t queueSize; + + // size = nRanks * queueSize + ncclGinProxyGfd_t *queues; + void *cisGdrHandle; + // Consumed Indices, one per rank + uint32_t *cis; + // to decrease the number of reads/writes to cis which might be on the GPU + uint32_t *cisShadow; + // Seen Indices one per rank + uint32_t *sis; + + // same size as queues + struct ginProxyGfdState *states; + // same size as queues + uint64_t *inlines; + // inlines is registered as a memory region with the GIN plugin + void *inlinesMhandle; + void *inlinesGinHandle; +}; + +struct ginProxyCtx { + struct ncclComm *comm; + void *collComm; + ncclNetDeviceHandle_v11_t *devHandle; + ncclNetProperties_t props; + + // GPU queues, if GDR on the GPU, else on the CPU + // Queue size, must be a power of 2 + struct ginProxyHostGpuCtx *hostGpuCtx; + + void *countersGdrHandle; + uint64_t *counters; + uint64_t *countersDev; + CUmemGenericAllocationHandle signalsCumemhandle; + void *signalsMhandle; + void *signalsGinHandle; + uint64_t *signalsDev; + int hasError; +}; + +// Depending on GDR, allocate memory on the CPU or GPU. 
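// (GDRCopy maps device memory into the host address space through the GPU BAR,
// so when it is available these control structures can live in GPU memory while
// the proxy thread still reads and writes them directly from the CPU; without
// GDRCopy they fall back to pinned host memory that both CPU and GPU access.)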
+// host_flags is not used for now, but it is here for future use. +template +static ncclResult_t allocMemCPUAccessible(T **ptr, T **devPtr, size_t nelem, int host_flags, + void **gdrHandle, bool forceHost = false) { + if (ncclGdrCopy && !forceHost) { + NCCLCHECK(ncclGdrCudaCalloc(ptr, devPtr, nelem, gdrHandle)); + } else { + NCCLCHECK(ncclCuMemHostAlloc((void **)ptr, NULL, nelem * sizeof(T))); + memset((void *)*ptr, 0, nelem * sizeof(T)); + *devPtr = *ptr; + if (gdrHandle) *gdrHandle = NULL; // Mark as host allocated by nulling GDR handle + } + return ncclSuccess; +} + +// Depending on GDR, free memory on the CPU or GPU. +template +static ncclResult_t freeMemCPUAccessible(T *ptr, void *gdrHandle) { + if (gdrHandle != NULL) { // If a GDR handle exists, it was GDR memory + NCCLCHECK(ncclGdrCudaFree(gdrHandle)); + } else { // Otherwise, it was host memory (or GDR was off) + NCCLCHECK(ncclCuMemHostFree(ptr)); + } + return ncclSuccess; +} + +static ncclResult_t getDmaBufFd(void *addr, size_t length, int *fd, + bool forceNonDataDirect = false) { + if (ncclParamDmaBufEnable() == 0) return ncclInvalidUsage; + +#if CUDA_VERSION >= 11070 + static size_t hostPageSize = sysconf(_SC_PAGESIZE); + size_t alignedSize = length; + ALIGN_SIZE(alignedSize, hostPageSize); + +#if CUDA_VERSION >= 12080 + if (ncclParamIbDataDirect() && !forceNonDataDirect) { + CUresult status = pfn_cuMemGetHandleForAddressRange( + (void *)fd, (CUdeviceptr)addr, alignedSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, + CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE); + if (status == CUDA_SUCCESS) return ncclSuccess; + } +#endif + CUresult status = pfn_cuMemGetHandleForAddressRange((void *)fd, (CUdeviceptr)addr, alignedSize, + CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0); + if (status == CUDA_SUCCESS) return ncclSuccess; +#endif + + return ncclInvalidUsage; +} + +static ncclResult_t proxyGinPollCompletions(ncclGin_t *ginComm, void *collComm, + struct ginProxyCtx *ctx, + struct ginProxyHostGpuCtx *hostGpuCtx) { + for (int targetRank = 0; targetRank < ctx->comm->nRanks; targetRank++) { + // loop on all seen but unconsumed GFDs + for (uint32_t i = hostGpuCtx->cisShadow[targetRank]; i < hostGpuCtx->sis[targetRank]; i++) { + uint32_t idx = i & (hostGpuCtx->queueSize - 1); + struct ginProxyGfdState *state = + &hostGpuCtx->states[targetRank * hostGpuCtx->queueSize + idx]; + // no need to poll if already done + if (!state->done) { + ginComm->test(collComm, state->request, &state->done); + if (state->done) { + TRACE(NCCL_NET, "GFD completed - stateIdx: %lu, request: %p", state - hostGpuCtx->states, + state->request); + // update the counter specified in the GFD + if (state->op & ncclGinProxyOpWithCounter) { + __atomic_store_n(&ctx->counters[state->counterId], ctx->counters[state->counterId] + 1, + __ATOMIC_RELAXED); + TRACE(NCCL_NET, "Updated counter %d to %ld", state->counterId, + ctx->counters[state->counterId]); + } + } + } + // allow holes in the CI space to get resolved + if (state->done && i == hostGpuCtx->cisShadow[targetRank]) { + // tell the GPU that we have consumed the GFD + __atomic_store_n(&hostGpuCtx->cis[targetRank], ++hostGpuCtx->cisShadow[targetRank], + __ATOMIC_RELAXED); + TRACE(NCCL_NET, "Updated cis[%u] to %u", targetRank, hostGpuCtx->cisShadow[targetRank]); + } + } + } + + return ncclSuccess; +} + +static int proxyGinPollGfd(struct ginProxyCtx *ctx, ginProxyHostGpuCtx *hostGpuCtx, int targetRank, + ncclGinProxyGfd_t *gfd, struct ginProxyGfdState **state) { + ncclGinProxyGfd_t *q = hostGpuCtx->queues + targetRank * 
hostGpuCtx->queueSize; + uint32_t idx = hostGpuCtx->sis[targetRank] & (hostGpuCtx->queueSize - 1); + ncclGinProxyQword_t qword; + __atomic_load(&q[idx].qword[ncclGinProxyGfdHeader].raw, &qword.raw, __ATOMIC_RELAXED); + if (qword.flag.v == 0) { + return 0; + } + + // We know for sure that the first qword is there, copy it. + gfd->qword[ncclGinProxyGfdHeader] = q[idx].qword[ncclGinProxyGfdHeader]; + // Wait for and copy the other qwords. + for (int k = 1; k < ncclGinProxyGfdQwords; k++) { + do { + __atomic_load(&q[idx].qword[k].raw, &qword.raw, __ATOMIC_RELAXED); + } while (qword.flag.v == 0); + gfd->qword[k] = qword; + } + // Now we have the full GFD in the local struct. + + // Reset the GFD in the queue. This lets the producer know that the GFD is consumed. + for (int k = 0; k < ncclGinProxyGfdQwords; k++) { + __atomic_store_n(&q[idx].qword[k].raw, 0, __ATOMIC_RELAXED); + } + + // set the counter_id into the state + uint32_t stateIdx = targetRank * hostGpuCtx->queueSize + idx; + *state = &hostGpuCtx->states[stateIdx]; + (*state)->op = (ncclGinProxyOp_t)(gfd->qword[ncclGinProxyGfdHeader].header.op); + (*state)->counterId = gfd->qword[ncclGinProxyGfdCompletion].completion.counterId; + (*state)->done = 0; + (*state)->request = NULL; + + TRACE(NCCL_NET, + "GFD to target PE %d raw idx: %u, idx: %u - op: %#lx, size: %lu, srcOff: %lu, dstOff: %lu, " + "srcHandle: %lu, dstHandle: %lu, counterId: %u, signalId: %u, stateIdx: %u", + targetRank, hostGpuCtx->sis[targetRank], idx, gfd->qword[ncclGinProxyGfdHeader].header.op, + gfd->qword[ncclGinProxyGfdHeader].header.size, + gfd->qword[ncclGinProxyGfdSrcOff].srcOff.srcOff, + gfd->qword[ncclGinProxyGfdDstOff].dstOff.dstOff, + gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.srcHandle, + gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.dstHandle, + gfd->qword[ncclGinProxyGfdCompletion].completion.counterId, + gfd->qword[ncclGinProxyGfdCompletion].completion.signalId, stateIdx); + + hostGpuCtx->sis[targetRank]++; + + return 1; +} + +static int mapGfdOpToCollNetOp(ncclGinProxyGfd_t *gfd) { + switch (gfd->qword[ncclGinProxyGfdHeader].header.op & + (ncclGinProxyOpComplMask & ~ncclGinProxyOpWithCounter)) { + case ncclGinProxyOpWithSignalInc: + return NCCL_NET_SIGNAL_OP_INC; + case ncclGinProxyOpWithSignalAdd: + return NCCL_NET_SIGNAL_OP_ADD; + default: + return -1; + } +} + +static ncclResult_t proxyGinProcessGfd(ncclGin_t *ginComm, void *collComm, struct ginProxyCtx *ctx, + struct ginProxyHostGpuCtx *hostGpuCtx, int targetRank, + ncclGinProxyGfd_t *gfd, struct ginProxyGfdState *state) { + int signalOp; + uint64_t signalVal; + + uint64_t size = gfd->qword[ncclGinProxyGfdHeader].header.size; + uint64_t srcOff; + void *srcHandle; + if (gfd->qword[ncclGinProxyGfdHeader].header.op & ncclGinProxyOpWithInline) { + uint64_t *inlineVal = &hostGpuCtx->inlines[gfd - hostGpuCtx->queues]; + srcOff = (uint64_t)&inlineVal[0] - (uint64_t)hostGpuCtx->inlines; + // reconstruct the inline value from the two qwords + *inlineVal = gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow; + if (size == 8) { + *inlineVal |= (uint64_t)gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow2 << 32; + *inlineVal |= (uint64_t)gfd->qword[ncclGinProxyGfdInlineHigh].inlineHigh.inlineValHigh << 48; + } + srcHandle = hostGpuCtx->inlinesMhandle; + } else { + srcOff = gfd->qword[ncclGinProxyGfdSrcOff].srcOff.srcOff; + srcHandle = (void *)(uint64_t)gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.srcHandle; + } + uint64_t dstOff = gfd->qword[ncclGinProxyGfdDstOff].dstOff.dstOff; + void 
*dstHandle = (void *)(uint64_t)gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.dstHandle; + + switch (gfd->qword[ncclGinProxyGfdHeader].header.op & ncclGinProxyOpBaseMask) { + case ncclGinProxyOpPut: + signalOp = mapGfdOpToCollNetOp(gfd); + if (signalOp == -1) { + // First cast from 63 bits to 64 bits and then to void * to avoid warnings + NCCLCHECK(ginComm->iput(collComm, srcOff, srcHandle, size, dstOff, dstHandle, + targetRank, &state->request)); + } else { + // reconstruct the signal value from the two qwords + signalVal = gfd->qword[ncclGinProxyGfdCompletion].completion.signalValLow; + signalVal |= (uint64_t)gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValLow2 << 16; + signalVal |= (uint64_t)gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValHigh << 32; + uint64_t signalOff = + gfd->qword[ncclGinProxyGfdCompletion].completion.signalId * sizeof(uint64_t); + NCCLCHECK(ginComm->iputSignal(collComm, srcOff, srcHandle, size, dstOff, dstHandle, + targetRank, signalOff, ctx->signalsGinHandle, signalVal, + signalOp, &state->request)); + } + break; + default: + // this error should already have been checked in pollGfd + assert(0); + } + TRACE(NCCL_NET, "GFD submitted into GIN plugin - stateIdx: %lu, request: %p", + state - hostGpuCtx->states, state->request); + return ncclSuccess; +} + +static uint64_t isPowerOfTwo(uint64_t n) { return (n > 0) && ((n & (n - 1)) == 0); } + +// Check if the GIN plugin supports DMA-BUF, if so we can try to get the DMA-BUF handle from CUDA, +// if that fails we fallback to non-DMA-BUF +static ncclResult_t ncclGinProxyRegMrSym(ncclGin_t *ginComm, struct ginProxyCtx *ctx, void *addr, + size_t size, int type, int mr_flags, void **mhandle, + void **ginHandle) { + if (type == NCCL_PTR_HOST) { + NCCLCHECK(ginComm->regMrSym(ctx->collComm, addr, size, type, mr_flags, mhandle, ginHandle)); + } else if (type == NCCL_PTR_CUDA) { + ncclResult_t dmabufResult = ncclInvalidUsage; + if (ncclParamDmaBufEnable() && (ctx->props.ptrSupport & NCCL_PTR_DMABUF)) { + ncclResult_t registrationResult = ncclSuccess; + int dmabufFd = -1; + dmabufResult = getDmaBufFd(addr, size, &dmabufFd); + if (dmabufResult == ncclSuccess) { + registrationResult = ginComm->regMrSymDmaBuf(ctx->collComm, addr, size, type, 0, dmabufFd, + mr_flags, mhandle, ginHandle); + close(dmabufFd); + } + if (registrationResult != ncclSuccess) { + dmabufFd = -1; + dmabufResult = getDmaBufFd(addr, size, &dmabufFd, true); + if (dmabufResult == ncclSuccess) { + NCCLCHECK(ginComm->regMrSymDmaBuf(ctx->collComm, addr, size, type, 0, dmabufFd, + mr_flags, mhandle, ginHandle)); + close(dmabufFd); + } + } + } + // Fallback to non-DMA-BUF if the DMA-BUF handle is not supported + if (dmabufResult != ncclSuccess) { + NCCLCHECK(ginComm->regMrSym(ctx->collComm, addr, size, type, mr_flags, mhandle, ginHandle)); + } + } else { + return ncclInvalidUsage; + } + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyCreateContext(struct ncclComm *comm, void *collComm, int devId, + int nSignals, int nCounters, void **outGinCtx, + ncclNetDeviceHandle_v11_t **outDevHandle) { + ncclGin_t *ginComm = (ncclGin_t *)comm->sharedRes->ginState.ncclGin; + + if (!ncclGdrCopy) + INFO(NCCL_NET, "GIN Proxy will not be using GDRCopy"); + + struct ginProxyCtx *proxyCtx = NULL; + NCCLCHECK(ncclCalloc(&proxyCtx, 1)); + + proxyCtx->comm = comm; + proxyCtx->collComm = collComm; + + // Sanitize the queue size + NCCLCHECK(ginComm->getProperties(devId, &proxyCtx->props)); + uint64_t queueSize = ncclParamGinProxyQueueSize(); + uint32_t maxRequests = 
NCCL_NET_MAX_REQUESTS * proxyCtx->props.maxRecvs; + if (queueSize == -1) { + queueSize = maxRequests; + } + if (queueSize > maxRequests) { + INFO(NCCL_NET, + "NCCL_GIN_PROXY_QUEUE_SIZE is greater than the maximum outstanding requests in the GIN " + "plugin (%d), using the default/maximum value instead", + maxRequests); + queueSize = maxRequests; + } + if (queueSize < 1) { + INFO(NCCL_NET, + "NCCL_GIN_PROXY_QUEUE_SIZE is less than 1, using the default/maximum value instead"); + queueSize = maxRequests; + } + if (!isPowerOfTwo(queueSize)) { + INFO( + NCCL_NET, + "NCCL_GIN_PROXY_QUEUE_SIZE is not a power of two, using the default/maximum value instead"); + queueSize = maxRequests; + } + + // Allocate the counters on the GPU or CPU depending on GDR + NCCLCHECK(allocMemCPUAccessible(&proxyCtx->counters, &proxyCtx->countersDev, nCounters, + CU_MEMHOSTALLOC_WRITECOMBINED, + &proxyCtx->countersGdrHandle)); + + // Allocate the signals on the GPU and then register the memory region with the GIN plugin. + // Enforcing strong ordering on the signals mr is vital to ensure ordering between puts and + // signals. + size_t signalsBufSize = nSignals * sizeof(uint64_t); + NCCLCHECK(ncclCuMemAlloc((void **)&proxyCtx->signalsDev, &proxyCtx->signalsCumemhandle, + CU_MEM_HANDLE_TYPE_NONE, signalsBufSize)); + CUDACHECK(cudaMemset(proxyCtx->signalsDev, 0, signalsBufSize)); + NCCLCHECK(ncclGinProxyRegMrSym(ginComm, proxyCtx, proxyCtx->signalsDev, signalsBufSize, + NCCL_PTR_CUDA, NCCL_NET_MR_FLAG_FORCE_SO, + &proxyCtx->signalsMhandle, &proxyCtx->signalsGinHandle)); + + NCCLCHECK(ncclCalloc(&proxyCtx->hostGpuCtx, 1)); + struct ginProxyHostGpuCtx *hostGpuCtx = proxyCtx->hostGpuCtx; + hostGpuCtx->queueSize = queueSize; + size_t queuesLength = hostGpuCtx->queueSize * comm->nRanks; + NCCLCHECK(ncclCalloc(&hostGpuCtx->states, queuesLength)); + NCCLCHECK(ncclCalloc(&hostGpuCtx->cisShadow, comm->nRanks)); + NCCLCHECK(ncclCalloc(&hostGpuCtx->sis, comm->nRanks)); + NCCLCHECK(ncclCalloc(&hostGpuCtx->inlines, queuesLength)); + NCCLCHECK(ncclGinProxyRegMrSym(ginComm, proxyCtx, hostGpuCtx->inlines, + queuesLength * sizeof(uint64_t), NCCL_PTR_HOST, 0, + &hostGpuCtx->inlinesMhandle, &hostGpuCtx->inlinesGinHandle)); + + ncclGinProxyGpuCtx_t devGpuCtx_h; + devGpuCtx_h.nranks = comm->nRanks; + devGpuCtx_h.queueSize = hostGpuCtx->queueSize; + devGpuCtx_h.counters = proxyCtx->countersDev; + devGpuCtx_h.signals = proxyCtx->signalsDev; + NCCLCHECK(ncclCudaCalloc(&devGpuCtx_h.pis, comm->nRanks)); + + // Allocate the GFD queues, CIs, counters, signals and test/wait variables on the either the CPU + // or GPU. 
+ NCCLCHECK(allocMemCPUAccessible(&hostGpuCtx->queues, &devGpuCtx_h.queues, queuesLength, 0, + NULL, true /*forceHost*/)); + NCCLCHECK(allocMemCPUAccessible(&hostGpuCtx->cis, &devGpuCtx_h.cis, comm->nRanks, + CU_MEMHOSTALLOC_WRITECOMBINED, &hostGpuCtx->cisGdrHandle)); + + ncclGinProxyGpuCtx_t *devGpuCtx_d = NULL; + NCCLCHECK(ncclCudaCalloc(&devGpuCtx_d, 1)); + // Copy the proxy's devGpuCtx to the GPU + NCCLCHECK(ncclCudaMemcpy(devGpuCtx_d, &devGpuCtx_h, 1)); + + ncclNetDeviceHandle_v11_t *devHandle = NULL; + NCCLCHECK(ncclCalloc(&devHandle, 1)); + devHandle->netDeviceType = NCCL_NET_DEVICE_GIN_PROXY; + devHandle->netDeviceVersion = NCCL_GIN_PROXY_VERSION; + devHandle->handle = (void *)devGpuCtx_d; + devHandle->size = 0; + devHandle->needsProxyProgress = 1; + + proxyCtx->devHandle = devHandle; + + *outDevHandle = devHandle; + *outGinCtx = proxyCtx; + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyRegister(ncclGin_t *ginComm, void *ginCtx, void *addr, size_t size, + int type, int mr_flags, void **mhandle, void **ginHandle) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + // Register the memory region with the GIN plugin + NCCLCHECK(ncclGinProxyRegMrSym(ginComm, ctx, addr, size, type, mr_flags, mhandle, ginHandle)); + return ncclSuccess; +} + +ncclResult_t ncclGinProxyDeregister(ncclGin_t *ginComm, void *ginCtx, void *mhandle) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + // Deregister the memory region with the GIN plugin + NCCLCHECK(ginComm->deregMrSym(ctx->collComm, mhandle)); + return ncclSuccess; +} + +ncclResult_t ncclGinProxyDestroyContext(ncclGin_t *ginComm, void *ginCtx) { + if (!ginCtx) return ncclSuccess; + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + + // Free counters + if (ctx) { + if (ctx->counters || ctx->countersGdrHandle) + freeMemCPUAccessible(ctx->counters, ctx->countersGdrHandle); + + // Free signals + if (ginComm && ctx->collComm && ctx->signalsMhandle) + ginComm->deregMrSym(ctx->collComm, ctx->signalsMhandle); + if (ctx->signalsDev) ncclCudaFree(ctx->signalsDev); + + // Free hostGpuCtx and its allocations + struct ginProxyHostGpuCtx *hostGpuCtx = ctx->hostGpuCtx; + if (hostGpuCtx) { + if (hostGpuCtx->cisShadow) free(hostGpuCtx->cisShadow); + if (hostGpuCtx->sis) free(hostGpuCtx->sis); + if (hostGpuCtx->states) free(hostGpuCtx->states); + if (hostGpuCtx->inlines) free(hostGpuCtx->inlines); + if (ginComm && ctx->collComm && hostGpuCtx->inlinesMhandle) + ginComm->deregMrSym(ctx->collComm, hostGpuCtx->inlinesMhandle); + if (hostGpuCtx->queues) freeMemCPUAccessible(hostGpuCtx->queues, NULL); + if (hostGpuCtx->cis || hostGpuCtx->cisGdrHandle) + freeMemCPUAccessible(hostGpuCtx->cis, hostGpuCtx->cisGdrHandle); + free(hostGpuCtx); + } + + ncclNetDeviceHandle_v11_t *devHandle = (ncclNetDeviceHandle_v11_t *)ctx->devHandle; + if (devHandle) { + if (devHandle->handle) ncclCudaFree((void *)devHandle->handle); + free(devHandle); + } + + free(ctx); + } + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyProgress(ncclGin_t *ginComm, void *ginCtx) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + + NCCLCHECK(proxyGinPollCompletions(ginComm, ctx->collComm, ctx, ctx->hostGpuCtx)); + for (int targetRank = 0; targetRank < ctx->comm->nRanks; targetRank++) { + // Poll on the GFD queue + ncclGinProxyGfd_t gfd; + struct ginProxyGfdState *state = NULL; + if (proxyGinPollGfd(ctx, ctx->hostGpuCtx, targetRank, &gfd, &state)) { + ncclResult_t ret = + proxyGinProcessGfd(ginComm, ctx->collComm, ctx, ctx->hostGpuCtx, targetRank, &gfd, 
state); + if (ret) ctx->hasError = ret; + NCCLCHECK(ret); + } + if (ginComm->ginProgress) ginComm->ginProgress(ctx->collComm); + } + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyQueryLastError(ncclGin_t *ginComm, void *ginCtx, bool *hasError) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + *hasError = ctx->hasError; + return ncclSuccess; +} diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 86d185bc0..253c57489 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -266,14 +266,18 @@ ncclResult_t ncclGetUserP2pLevel(int* level) { return ncclSuccess; } +// Tests two ranks for CUDA P2P connectivity. +// *cudaP2p returns 1 if CUDA P2P between the ranks is supported. +// *p2p returns 1 only if the distance between the ranks is no greater than NCCL_P2P_LEVEL. The connection may go through an intermediate rank. ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, - int* p2p, int *read, int* intermediateRank) { + int* p2p, int *read, int* intermediateRank, int* cudaP2p) { int mnnvl = 0; struct ncclPeerInfo* info1 = NULL; struct ncclPeerInfo* info2 = NULL; *p2p = 0; if (read) *read = 0; if (intermediateRank) *intermediateRank = -1; + if (cudaP2p) *cudaP2p = 0; // Rule out different nodes / isolated containers if (comm) { @@ -325,11 +329,13 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; + // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to + // validate against NVML at all since they are pretending to be on other hw. + bool checkNvml = (ncclParamIgnoreDisabledP2p() != 2 && g1 != g2 && + (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash && + info1->hostHash == info2->hostHash))); if (*p2p == 1) { - // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to - // validate against NVML at all since they are pretending to be on other hw. - if (g1 != g2 && (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash && - info1->hostHash == info2->hostHash)) && ncclParamIgnoreDisabledP2p() != 2) { + if (checkNvml) { int indexes[3] = {-1,-1,-1}; int verticeN = 0; NCCLCHECK(ncclNvmlEnsureInitialized()); @@ -365,6 +371,19 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1; } + if (cudaP2p) { + if (checkNvml) { + int n1, n2; + n1 = system->nodes[GPU].nodes[g1].gpu.dev; + n2 = system->nodes[GPU].nodes[g2].gpu.dev; + *cudaP2p = (ncclNvmlDevicePairs[n1][n2].p2pStatusRead == NVML_P2P_STATUS_OK && + ncclNvmlDevicePairs[n1][n2].p2pStatusWrite == NVML_P2P_STATUS_OK); + } else { + // We assume P2P connectivity in case the ranks are connected using MNNVL or are on the same host. 
+ *cudaP2p = (mnnvl || comm == NULL || info1->hostHash == info2->hostHash); + } + } + return ncclSuccess; } @@ -591,7 +610,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, struct ncclTopoSystem* system = comm->topo; *nranks = 0; *intermediateRanks = NULL; - if (system->nodes[NET].count == 0) return ncclSuccess; + if (system->inter == 0) return ncclSuccess; int nr = 0; int* ranks = NULL; @@ -650,7 +669,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm for (int p=0; pnodes[GPU].count; p++) { int p2p; NCCLCHECK(ncclTopoCheckP2p(comm, system, system->nodes[GPU].nodes[p].gpu.rank, - system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL)); + system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL, NULL)); if (p2p == 0) { // Divert all traffic through the CPU int cpu; @@ -780,10 +799,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* NCCLCHECKGOTO(ncclTopoRemoveNode(system, GPU, g), ret, fail); } - if (system->nodes[GPU].count == comm->nRanks) { - for (int n=system->nodes[NET].count-1; n>=0; n--) - NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail); - } + system->inter = system->nodes[GPU].count == comm->nRanks ? 0 : 1; exit: free(domains); if (ids) free(ids); diff --git a/src/graph/rings.cc b/src/graph/rings.cc index 5d967abb9..70fac75b1 100644 --- a/src/graph/rings.cc +++ b/src/graph/rings.cc @@ -26,6 +26,11 @@ void dumpLine(int* values, int nranks, const char* prefix) { } ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { + ncclResult_t ret = ncclSuccess; + uint64_t* rankFound; + int rankFoundSize = DIVUP(nranks, 64); + NCCLCHECK(ncclCalloc(&rankFound, rankFoundSize)); + for (int r=0; rmaxBw = 0.0; system->totalBw = 0.0; - int inter = system->nodes[NET].count; + int inter = system->inter; if (inter == 0 && system->nodes[GPU].count == 1) { system->maxBw = LOC_BW; system->totalBw = LOC_BW; @@ -496,14 +496,14 @@ static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* system, return ncclSuccess; } -// Build a sorted list of the NETs to try. +// Build a sorted list of the NETs to try, the list will follow the NETDEVS_POLICY set by the user. // -// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu -// index when trying to get back to the NIC. +// The value of "gpu" can be set to -1 to build a list suitable for all GPUs (for example for the search start). +// The value of "gpu" can be set to the desired index when trying to get back to the NIC. // // The list is built the following way: -// 1. Select NETs starting with those close to GPU(s), based on paths[n].type. -// 2. add other NETs satisfying typeInter but not already in the list. +// 1. First gather the preferred NETs for each of the GPU(s), based on the NETDEVS_POLICY and the connection. +// 2. If the NETDEV_policy allows it, add all the other NETs satisfying typeInter but not already in the list of preferred NETs. NCCL_PARAM(ScatterEnable, "MNNVL_SCATTER_NETS_ENABLE", 1); ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; @@ -518,9 +518,19 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in NCCLCHECK(ncclTopoPrefNetsChannelFirst(system, gpu, nets, &netCount)); } + // Get the maximum of network devices allowed, depending on the policy. 
+ // If the policy is not MAX, then allow all devices. + int maxDevCount = 0; + enum netDevsPolicy netDevsPolicy; + NCCLCHECK(ncclTopoGetNetDevsPolicy(&netDevsPolicy, &maxDevCount)); + if (gpu == -1) maxDevCount *= system->nodes[GPU].count; + if (netDevsPolicy != NETDEVS_POLICY_MAX) maxDevCount = NCCL_TOPO_MAX_NODES; + if (netCount >= maxDevCount) goto exit; + // Then add others satisfying typeInter for (int t=0; t <= typeInter; t++) { for (int g = 0; g < system->nodes[GPU].count; g++) { + // do not consider this GPU if it's not the GPU we asked for if (gpu != -1 && gpu != g) continue; int localNetCount = 0, localNets[MAXCHANNELS]; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; @@ -532,16 +542,37 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in for (int i=0; i= maxDevCount) goto exit; } } } +exit: *netCountRet = netCount; return ret; } +NCCL_PARAM(MnnvlRailPerHost, "MNNVL_RAIL_PER_HOST", 0); + +static bool ncclTopoSearchCheckNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* startNet, int n, int step) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) return false; // Trees are symmetric + if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) { + if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels - 1) * 2]) return false; + } else if (graph->crossNic == 0) { + if (ncclParamMnnvlRailPerHost() && NCCL_TOPO_ID_SYSTEM_ID(net->id) != NCCL_TOPO_ID_SYSTEM_ID(startNet->id)) { + // Different hosts in an MNNVL system: rails are per host and identified with the PCI id. + if (net->net.pciId != startNet->net.pciId || net->net.port != startNet->net.port) return false; + } else { + if (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port) return false; + } + } + if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE && step != 0 && net->id != graph->inter[graph->nChannels*2+1]) return false; + return true; +} + ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) { if ((*time) <= 0) return ncclSuccess; (*time)--; @@ -567,7 +598,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo int nets[NCCL_TOPO_MAX_NODES]; if (step == backToNet) { // first get back to NIC - if (system->nodes[NET].count) { + if (system->inter) { int startNetIndex; NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; @@ -575,24 +606,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount)); for (int i=0; inodes[NET].nodes+n; - if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric - if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) { - if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue; - } else { - if (graph->crossNic == 0 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; - } - + if (!ncclTopoSearchCheckNet(system, graph, startNet, n, step)) continue; // Balanced Tree : count half of the bandwidth on first two GPUs int nextBackToNet = -1; float bwInterSave = graph->bwInter;
if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { // Count half of the bandwidth on each of the first two GPUs if (step == 0) nextBackToNet = 1; - else if (net->id != graph->inter[graph->nChannels*2+1]) continue; graph->bwInter /= 2; } + struct ncclTopoNode* net; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { @@ -744,7 +768,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo * `--> NET n (or m if crossNic) */ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, int* backToNet, int* backToFirstRank) { - if (system->nodes[NET].count) { + if (system->inter) { if (pattern == NCCL_TOPO_PATTERN_RING) *backToNet = system->nodes[GPU].count-1; else if (pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) *backToNet = 1; else *backToNet = 0; @@ -760,7 +784,7 @@ ncclResult_t ncclTopoSearchParams(struct ncclTopoSystem* system, int pattern, in ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int* time) { int backToNet, backToFirstRank; NCCLCHECK(ncclTopoSearchParams(system, graph->pattern, &backToNet, &backToFirstRank)); - if (system->nodes[NET].count) { + if (system->inter) { // Start from NET ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time); } else { @@ -876,7 +900,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc int* intra = graph->intra+ngpus*c; NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel)); struct ncclXmlNode* node; - if (system->nodes[NET].count) { + if (system->inter) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrLong(node, "dev", inter[0])); } @@ -896,7 +920,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc NCCLCHECK(xmlSetAttrLong(node, "dev", dev)); if (graph->id == 3) break; // NVLS graphs only use the first GPU } - if (system->nodes[NET].count) { + if (system->inter) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrLong(node, "dev", inter[1])); } @@ -979,7 +1003,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra)); NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra)); } - if (system->nodes[NET].count > 0) { + if (system->inter) { NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter)); NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter)); maxTypeIntra = maxTypeInter; @@ -1016,7 +1040,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; - if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { + if (system->inter == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // Force intra-node NVLS algorithm to pull evenly from all GPUs. graph->minChannels = graph->maxChannels; } @@ -1036,7 +1060,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph // First try crossnic, then decrease bw and finally increase bwIntra. int nspeeds = 0; float* speedArray = NULL; - if (system->nodes[NET].count == 0) { + if (system->inter == 0) { nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA); speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? 
sm90SpeedArrayIntra : speedArrayIntra); } else { @@ -1096,14 +1120,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } tmpGraph.pattern = graph->pattern; - int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra; + int maxIntra = system->inter ? tmpGraph.typeInter : maxTypeIntra; if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; if (tmpGraph.typeIntra < PATH_DIS) goto search; } tmpGraph.typeIntra = minTypeIntra; - if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { + if (system->inter && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; if (tmpGraph.typeInter < PATH_DIS) goto search; } @@ -1181,7 +1205,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr for (int c=0; cnChannels; c++) { sprintf(line, "%2d :", c); int offset = strlen(line); - if (system->nodes[NET].count > 0) { + if (system->inter) { sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c])); offset = strlen(line); } @@ -1193,7 +1217,7 @@ ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGr offset = strlen(line); if (graph->id == 3) break; // NVLS graphs only use the first GPU } - if (system->nodes[NET].count > 0) { + if (system->inter) { sprintf(line+offset, " %s/%lx-%lx", topoNodeTypeStr[NET], NCCL_TOPO_ID_SYSTEM_ID(graph->inter[2*c+1]), NCCL_TOPO_ID_LOCAL_ID(graph->inter[2*c+1])); offset = strlen(line); } diff --git a/src/graph/topo.cc b/src/graph/topo.cc index 3a87725f1..be533b57f 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -357,25 +357,38 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s int dev; NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev)); + int64_t netId = NCCL_TOPO_ID(systemId, dev); struct ncclTopoNode* net; - NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev))); + NCCLCHECK(ncclTopoCreateNode(system, &net, NET, netId)); net->net.dev = dev; const char* str; + // if not guid is present use the net->id unique id instead, which will be unique within the node/NVLD NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str)); - if (str) sscanf(str, "0x%lx", &net->net.asic); - else net->net.asic = dev; + net->net.asic = (str) ? 
strtoull(str, NULL, 16) : netId; + - ncclDebugNoWarn = NCCL_GRAPH; int mbps; - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0), NCCL_GRAPH); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.bw = mbps / 8000.0; - if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); - ncclDebugNoWarn = 0; + ncclResult_t ret; + NOWARN(ret = xmlGetAttrFloat(xmlNet, "latency", &net->net.latency), NCCL_GRAPH); + if (ret != ncclSuccess) net->net.latency = 0; + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0), NCCL_GRAPH); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0), NCCL_GRAPH); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS), NCCL_GRAPH); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0), NCCL_GRAPH); + + // build the PCI id using the parent PCI link + uint64_t hacc[2] = {1, 1}; + const char* busId = NULL; + struct ncclXmlNode* parent = xmlNet->parent; + while (parent != NULL && strcmp(parent->name, "pci") != 0) parent = parent->parent; + if (parent) NCCLCHECK(xmlGetAttr(parent, "busid", &busId)); + // If we fail to find the PCIe path, we use the GUID instead. + if (busId) eatHash(hacc, busId, strlen(busId)); + else eatHash(hacc, &net->net.asic); + net->net.pciId = digestHash(hacc); NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw)); NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw)); @@ -998,7 +1011,8 @@ ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, struct ncclTopoNetInfo* netIn // Trigger the merge, then get the new device's properties int vDevIndex = 0; - ncclResult_t ret = netInfo->makeVDevice(&vDevIndex, vProps); + ncclResult_t ret; + NOWARN(ret = netInfo->makeVDevice(&vDevIndex, vProps), NCCL_GRAPH|NCCL_INIT|NCCL_NET); if (ret != ncclSuccess) { INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. 
Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.", vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]); @@ -1582,16 +1596,8 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c return ncclSuccess; } -enum netDevsPolicy { - NETDEVS_POLICY_AUTO = 0x0, - NETDEVS_POLICY_ALL = 0x1, - NETDEVS_POLICY_MAX = 0x2, - NETDEVS_POLICY_UNDEF = 0xffffffff -}; - -static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF; static int netDevsPolicyNum = -1; - +static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF; static void getNetDevsPolicyOnce() { const char* envStr = ncclGetEnv("NCCL_NETDEVS_POLICY"); if (envStr) { @@ -1614,6 +1620,18 @@ static void getNetDevsPolicyOnce() { if (netDevsPolicy == NETDEVS_POLICY_UNDEF) netDevsPolicy = NETDEVS_POLICY_AUTO; } +ncclResult_t ncclTopoGetNetDevsPolicy(enum netDevsPolicy* policy, int* policyNum) { + static pthread_once_t onceNetDevsPolicy = PTHREAD_ONCE_INIT; + pthread_once(&onceNetDevsPolicy, getNetDevsPolicyOnce); + if (netDevsPolicy == NETDEVS_POLICY_MAX && netDevsPolicyNum <= 0) { + WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum); + return ncclInternalError; + } + if (policy) *policy = netDevsPolicy; + if (policyNum && netDevsPolicyNum >= 0) *policyNum = netDevsPolicyNum; + return ncclSuccess; +} + ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); @@ -1626,22 +1644,19 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch return ncclInternalError; } - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once,getNetDevsPolicyOnce); int netsPerGpu = 0; - if (netDevsPolicy == NETDEVS_POLICY_AUTO) { + int policyCount = 0; + enum netDevsPolicy policy; + NCCLCHECK(ncclTopoGetNetDevsPolicy(&policy, &policyCount)); + if (policy == NETDEVS_POLICY_AUTO) { int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); netsPerGpu = DIVUP(localNetCount, localGpuCount); - } else if (netDevsPolicy == NETDEVS_POLICY_ALL) { + } else if (policy == NETDEVS_POLICY_ALL) { netsPerGpu = localNetCount; - } else if (netDevsPolicy == NETDEVS_POLICY_MAX) { - if (netDevsPolicyNum <= 0) { - WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum); - return ncclInternalError; - } - netsPerGpu = std::min(netDevsPolicyNum, localNetCount); + } else if (policy == NETDEVS_POLICY_MAX) { + netsPerGpu = std::min(policyCount, localNetCount); } else { WARN("Unknown netDevs policy"); return ncclInternalError; @@ -1655,6 +1670,21 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch return ncclSuccess; } +ncclResult_t ncclTopoGetLocalNets(struct ncclTopoSystem* system, int rank, int64_t* localNets, int* localNetCount) { + int gpu; + NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); + int localNetIndexes[NCCL_TOPO_MAX_NODES]; + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNetIndexes, localNetCount, NULL)); + + if (*localNetCount == 0) { + WARN("Could not find any local path from gpu %d to net.", gpu); + return ncclInternalError; + } + // Convert index to ids + for (int n=0; n<*localNetCount; n++) localNets[n] = system->nodes[NET].nodes[localNetIndexes[n]].id; + return ncclSuccess; +} + ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* 
system, int64_t netId, int* gpuIndex) { ncclResult_t ret = ncclSuccess; int netIndex; diff --git a/src/graph/topo.h b/src/graph/topo.h index 49d408d95..f153620e7 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -138,6 +138,7 @@ struct ncclTopoNode { }gpu; struct { int dev; // Plugin dev number + uint64_t pciId; uint64_t asic; int port; float bw; @@ -177,6 +178,7 @@ struct ncclTopoSystem { struct ncclTopoNodeSet nodes[NCCL_TOPO_NODE_TYPES]; float maxBw; float totalBw; + int inter; }; ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id); diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index bfb279850..0520e7234 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -176,7 +176,7 @@ static const ncclTunerConstants_t ncclTunerConstantsDefaults = { {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */ {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */ {36.7, 36.7, 36.7}, /* Hopper (N1/N2/N4) */ - {2*36.7, 2*36.7, 2*36.7}, /* Blackwell (N1/N2/N4) */ + {2*36.7, 34.6, 2*36.7}, /* Blackwell (N1/N2/N4) */ }, .perChMaxTreeLL128Bws = { {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */ diff --git a/src/graph/xml.cc b/src/graph/xml.cc index 010120627..3ab8e20dd 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -575,32 +575,28 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* const char* busId; NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); char* path = NULL; - ncclDebugNoWarn = NCCL_GRAPH; - getPciPath(busId, &path); - ncclDebugNoWarn = 0; + NOWARN(getPciPath(busId, &path), NCCL_GRAPH); if (path) { NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); } int index; - ncclDebugNoWarn = NCCL_GRAPH; - NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "vendor", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"), NCCL_GRAPH); } - NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "device", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"), NCCL_GRAPH); } - NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"), NCCL_GRAPH); } - NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "subsystem_device", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"), NCCL_GRAPH); } - ncclDebugNoWarn = 0; NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); if (index == -1) { if (path) { @@ -635,7 +631,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor)); if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections int nlinks; - char* peers; + char* peers = NULL; NCCLCHECK(getBcmLinks(busId, &nlinks, &peers)); 
for (int l=0; lparent; @@ -868,9 +865,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm const char* busId; NCCLCHECK(xmlGetAttr(sub, "target", &busId)); char* path; - ncclDebugNoWarn = NCCL_GRAPH; - getPciPath(busId, &path); - ncclDebugNoWarn = 0; + NOWARN(getPciPath(busId, &path), NCCL_GRAPH); if (path == NULL || strcmp(busId, "fffffff:ffff:ff") == 0) { // Remote NVLink device is not visible inside this VM. Assume NVSwitch. NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000")); diff --git a/src/include/allocator.h b/src/include/allocator.h index 05da29a62..eccb5b5cd 100644 --- a/src/include/allocator.h +++ b/src/include/allocator.h @@ -7,6 +7,10 @@ #ifndef NCCL_ALLOCATOR_H_ #define NCCL_ALLOCATOR_H_ +#include "nccl.h" +#include +#include + //////////////////////////////////////////////////////////////////////////////// // ncclSpace: Allocates contiguous segments of non-negative integers. Useful // as a memory allocator when we can't put allocator state within the memory diff --git a/src/include/channel.h b/src/include/channel.h index bd34f54c1..d5058c4f3 100644 --- a/src/include/channel.h +++ b/src/include/channel.h @@ -19,9 +19,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRa inline uint8_t ncclP2pChannelBaseForRound(struct ncclComm* comm, int p2pRound) { int base; if (comm->nNodes > 1) { - int nodeDelta = p2pRound/comm->maxLocalRanks; - int localDelta = p2pRound%comm->maxLocalRanks; - base = nodeDelta*divUp(comm->maxLocalRanks, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); + int localSize = comm->p2pSchedGroupSize; + int groupDelta = p2pRound / localSize; + int localDelta = p2pRound % localSize; + base = groupDelta*divUp(localSize, NCCL_MAX_DEV_WORK_P2P_PER_BATCH); base += localDelta/NCCL_MAX_DEV_WORK_P2P_PER_BATCH; } else { base = p2pRound; diff --git a/src/include/checks.h b/src/include/checks.h index cbb5a2de4..16f515516 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -137,6 +137,21 @@ } \ } while (0) +#define NCCLCHECKNOWARN(call, FLAGS) do { \ + ncclResult_t RES; \ + NOWARN(RES = call, FLAGS); \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + return RES; \ + } \ +} while (0) + +#define NCCLCHECKGOTONOWARN(call, RES, label, FLAGS) do { \ + NOWARN(RES = call, FLAGS); \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + goto label; \ + } \ +} while (0) + #define NCCLWAIT(call, cond, abortFlagPtr) do { \ uint32_t* tmpAbortFlag = (abortFlagPtr); \ ncclResult_t RES = call; \ diff --git a/src/include/comm.h b/src/include/comm.h index 22faf3682..e1b37db16 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -140,6 +140,9 @@ struct ncclSharedResources { /* proxy related shared res */ struct ncclProxyState* proxyState; + + // GIN state + struct ncclGinState ginState; }; struct ncclChannel { @@ -455,6 +458,7 @@ struct ncclComm { ncclNet_t* ncclNet; void* netContext; + void* ginContext; int netPluginIndex; int ncclNetVer; ncclNetDeviceType netDeviceType; @@ -468,7 +472,7 @@ struct ncclComm { int maxTreePattern; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported - bool directMode; + bool directMode; // if any process manages more than one local rank int cuMemSupport; uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. 
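The NCCLCHECKNOWARN/NCCLCHECKGOTONOWARN macros added to src/include/checks.h above pair the new NOWARN helper (defined in src/include/debug.h below) with the usual error propagation, so callers no longer have to save and restore ncclDebugNoWarn by hand. A minimal sketch of the intended call pattern follows; probeNetSpeed is a hypothetical helper, not part of this patch, while xmlGetAttrIntDefault and the NCCL_GRAPH flag are the real call and flag used in src/graph/topo.cc:

static ncclResult_t probeNetSpeed(struct ncclXmlNode* xmlNet, int* mbps) {
  // WARN logs from the GRAPH subsystem are suppressed while probing the
  // optional "speed" attribute, but a genuine failure still returns an error.
  NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "speed", mbps, 0), NCCL_GRAPH);
  if (*mbps <= 0) *mbps = 10000; // Some NICs report speed = -1
  return ncclSuccess;
}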
@@ -523,6 +527,7 @@ struct ncclComm { // Channels (per peer) for p2p int p2pnChannels; int p2pnChannelsPerPeer; + int p2pSchedGroupSize; // Should this comm allocate LL buffers for network P2P connections? bool allocP2pNetLLBuffers; @@ -550,6 +555,7 @@ struct ncclComm { uint32_t* childAbortFlag; uint32_t* childAbortFlagDev; uint32_t destroyFlag; + uint32_t revokedFlag; // Device side of the communicator (for cudaFree's) struct ncclKernelComm* devComm; // actually = &ncclKernelCommAndChannels::comm @@ -651,11 +657,12 @@ struct ncclComm { // CE Collective struct ncclCeColl ceColl; struct ncclIntruQueue ceInitTaskQueue; - + // buffer registration cache struct ncclRegCache regCache; int isAllNvlink; - bool isAllDirectP2p; + bool isAllDirectP2p; // Subject to NCCL_P2P_LEVEL (for local ranks only). + bool isAllCudaP2p; // Raw CUDA capability (for local ranks only). int symmetricSupport; bool useNetPXN; bool useGdr; diff --git a/src/include/debug.h b/src/include/debug.h index 3822e8760..f332d749d 100644 --- a/src/include/debug.h +++ b/src/include/debug.h @@ -29,6 +29,14 @@ extern char ncclLastError[]; #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) +#define NOWARN(EXPR, FLAGS) \ + do { \ + int oldNoWarn = ncclDebugNoWarn; \ + ncclDebugNoWarn = FLAGS; \ + (EXPR); \ + ncclDebugNoWarn = oldNoWarn; \ + } while(0) + #define INFO(FLAGS, ...) \ do{ \ int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ diff --git a/src/include/dev_runtime.h b/src/include/dev_runtime.h index 5f6e66e33..70bf77496 100644 --- a/src/include/dev_runtime.h +++ b/src/include/dev_runtime.h @@ -52,6 +52,7 @@ struct ncclDevrState { int* lsaRankList; size_t granularity; // cuMemGetAllocationGranularity + bool ginEnabled; struct ncclDevrMemory* memHead; struct ncclDevrWindowSorted* winSorted; int winSortedCapacity, winSortedCount; diff --git a/src/include/device.h b/src/include/device.h index 9ffc26095..b1cef15b3 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -53,7 +53,7 @@ extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0 #endif -#include "net_device.h" +#include "nccl_device/net_device.h" enum ncclDevRedOp_t { ncclDevSum, ncclDevProd, ncclDevMinMax, @@ -153,6 +153,7 @@ struct ncclProxyConnector { int sameProcess; struct ncclProxyConnection* connection; ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary + ncclResult_t (*proxyGinProgress)(struct ncclProxyState* proxyState); }; struct ncclConnector { @@ -528,7 +529,8 @@ __host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ // Host-side table of kernel function pointers. extern int const ncclDevKernelCount; -extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; +extern void* ncclDevKernelList[/*ncclDevKernelCount*/]; +extern int ncclDevKernelRequirements[/*ncclDevKernelCount*/]; // Table of most specialized kernel function to run given func index. extern int const ncclDevFuncRowToId[]; diff --git a/src/include/env.h b/src/include/env.h new file mode 100644 index 000000000..0e00b3144 --- /dev/null +++ b/src/include/env.h @@ -0,0 +1,23 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_ENV_H_ +#define NCCL_INT_ENV_H_ + +#include "nccl_env.h" + +// Initialize Env Plugin +ncclResult_t ncclEnvPluginInit(void); +// Finalize Env Plugin +void ncclEnvPluginFinalize(void); +// Env plugin get function for NCCL params, called in ncclGetEnv() +const char* ncclEnvPluginGetEnv(const char* name); + +bool ncclEnvPluginInitialized(void); + +ncclResult_t ncclInitEnv(void); + +#endif diff --git a/src/include/gin/gin_host.h b/src/include/gin/gin_host.h new file mode 100644 index 000000000..d82a79505 --- /dev/null +++ b/src/include/gin/gin_host.h @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_GIN_HOST_H_ +#define _NCCL_GIN_HOST_H_ + +#include "allocator.h" +#include "nccl.h" +#include "nccl_net.h" +#include "nccl_device/gin/gin_device_host_common.h" +#include + +struct ncclGinState { + ncclGin_t* ncclGin; + void* ginInstance; + bool connected; + int ginType; + int ginCommCount; + void* ginComms[NCCL_GIN_MAX_CONTEXTS]; + void* ginCtx[NCCL_GIN_MAX_CONTEXTS]; + ncclNetDeviceHandle_t* ginDevHandles[NCCL_GIN_MAX_CONTEXTS]; + int needsProxyProgress; // Whether we need to progress GIN operations with the proxy + int ginProgress; // GIN progress is enabled + pthread_t thread; + pthread_mutex_t threadLock; + pthread_cond_t threadCond; + ncclResult_t asyncResult; + + int signalSpaceSize; + int counterSpaceSize; + ncclSpace signalSpace; + ncclSpace counterSpace; +}; + +extern int64_t ncclParamGinType(); + +// FIXME change to ncclGinState instead of ncclComm, no need to pass comm +ncclResult_t ncclGinConnectOnce(struct ncclComm* comm); +ncclResult_t ncclGinFinalize(struct ncclComm* comm); +ncclResult_t ncclGinProgress(struct ncclGinState* ginState); +ncclResult_t ncclGinRegister(struct ncclComm* comm, void* address, size_t size, + void* ginHostWins[NCCL_GIN_MAX_CONTEXTS], + ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]); +ncclResult_t ncclGinDeregister(struct ncclComm* comm, void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]); +ncclResult_t ncclGinAllocSignalsCounters(struct ncclComm* comm, int nSignals, uint32_t* outSignal0, + int nCounters, uint32_t* outCounter0); +ncclResult_t ncclGinFreeSignalsCounters(struct ncclComm* comm, uint32_t signal0, int nSignals, + uint32_t counter0, int nCounters); +ncclResult_t ncclGinQueryLastError(struct ncclGinState* ginState, bool* hasError); + +#endif diff --git a/src/include/gin/gin_host_proxy.h b/src/include/gin/gin_host_proxy.h new file mode 100644 index 000000000..14e8b93ca --- /dev/null +++ b/src/include/gin/gin_host_proxy.h @@ -0,0 +1,28 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef GIN_HOST_PROXY_H_ +#define GIN_HOST_PROXY_H_ + +#include +#include +#include +#include +#include "nccl.h" +#include "gin/gin_host.h" +#include "plugin/nccl_net.h" + +ncclResult_t ncclGinProxyCreateContext(struct ncclComm *comm, void *collComm, int devId, + int nSignals, int nCounters, void **outGinCtx, + ncclNetDeviceHandle_v11_t **outDevHandle); +ncclResult_t ncclGinProxyRegister(ncclGin_t *ginComm, void *ginCtx, void *addr, size_t size, + int type, int mr_flags, void **mhandle, void **ginHandle); +ncclResult_t ncclGinProxyDeregister(ncclGin_t *ginComm, void *ginCtx, void *mhandle); +ncclResult_t ncclGinProxyDestroyContext(ncclGin_t *ginComm, void *ginCtx); +ncclResult_t ncclGinProxyProgress(ncclGin_t *ginComm, void *ginCtx); +ncclResult_t ncclGinProxyQueryLastError(ncclGin_t *ginComm, void *ginCtx, bool *hasError); + +#endif diff --git a/src/include/graph.h b/src/include/graph.h index 6b926717e..203b6a1d1 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -34,7 +34,7 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); -ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); +ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank, int* cudaP2p); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); enum ncclTopoGdrMode { ncclTopoGdrModeDisable = 0, @@ -73,9 +73,18 @@ ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev); +ncclResult_t ncclTopoGetLocalNets(struct ncclTopoSystem* system, int rank, int64_t* localNets, int* localNetCount); ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex); ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count); +enum netDevsPolicy { + NETDEVS_POLICY_AUTO = 0x0, + NETDEVS_POLICY_ALL = 0x1, + NETDEVS_POLICY_MAX = 0x2, + NETDEVS_POLICY_UNDEF = 0xffffffff +}; +ncclResult_t ncclTopoGetNetDevsPolicy(enum netDevsPolicy* policy, int* policyNum); + // Allows for up to 32 NICs per node on GB200-NVL72 #define NCCL_TOPO_MAX_NODES 576 ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType); diff --git a/src/include/group.h b/src/include/group.h index 6e317c6c4..3b08d9f16 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -82,6 +82,10 @@ inline ncclResult_t ncclGroupStartInternal() { return ncclSuccess; } +inline bool ncclGroupEnabled() { + return ncclGroupDepth != 0; +} + inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { if (ncclGroupDepth > 0) { if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret; diff --git a/src/include/nccl_device.h b/src/include/nccl_device.h index 
88b2531d1..35e216c62 100644 --- a/src/include/nccl_device.h +++ b/src/include/nccl_device.h @@ -4,12 +4,12 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "nccl_device/impl/comm__funcs.h" #include "nccl_device/coop.h" +#include "nccl_device/impl/barrier__funcs.h" +#include "nccl_device/impl/comm__funcs.h" #include "nccl_device/impl/core__funcs.h" #include "nccl_device/impl/ll_a2a__funcs.h" -#include "nccl_device/impl/mem_barrier__funcs.h" -//#include "nccl_device/net_barrier__funcs.h" -//#include "nccl_device/net_scratch_a2a__funcs.h" -//#include "nccl_device/barrier__funcs.h" +#include "nccl_device/impl/lsa_barrier__funcs.h" +#include "nccl_device/impl/gin__funcs.h" +#include "nccl_device/impl/gin_barrier__funcs.h" #include "nccl_device/impl/ptr__funcs.h" diff --git a/src/include/nccl_device/barrier.h b/src/include/nccl_device/barrier.h new file mode 100644 index 000000000..0c11f6e5c --- /dev/null +++ b/src/include/nccl_device/barrier.h @@ -0,0 +1,47 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_BARRIER_H_ +#define _NCCL_DEVICE_BARRIER_H_ +#include "impl/core__types.h" +#include "impl/lsa_barrier__types.h" +#include "impl/gin_barrier__types.h" + +#if __CUDACC__ +template +struct ncclBarrierSession_internal; + +template +struct ncclBarrierSession: ncclBarrierSession_internal { + // Full featured constructor: + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeam innerTeam, ncclTeam outerTeam, ncclGin, + ncclLsaBarrierHandle innerBarHandle, + ncclGinBarrierHandle outerBarHandle, + uint32_t index, + bool multimem=false, ncclMultimemHandle innerMmHandle={} + ); + // Convenience constructors for baked in teams: + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeamTagWorld, ncclGin, uint32_t index, bool multimem=false + ); + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeamTagLsa, ncclDevComm const&, uint32_t index, bool multimem=false + ); + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeamTagRail, ncclGin, uint32_t index + ); + + ncclBarrierSession(ncclBarrierSession const&) = delete; // Sessions are not copyable + + NCCL_DEVICE_INLINE ncclLsaBarrierSession& lsaBarrier(); + NCCL_DEVICE_INLINE ncclGinBarrierSession& ginBarrier(); + + NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order, ncclGinFenceLevel); +}; +#endif + +#endif // _NCCL_DEVICE_BARRIER_H_ diff --git a/src/include/nccl_device/coop.h b/src/include/nccl_device/coop.h index 9a8d4b0a8..4af229dfb 100644 --- a/src/include/nccl_device/coop.h +++ b/src/include/nccl_device/coop.h @@ -30,7 +30,7 @@ struct ncclCoopTile { // An aligned pow2 set of threads within the warp. return (-1u>>(32-nThreadsPow2))<<(nccl::utility::lane() & -nThreadsPow2); } NCCL_DEVICE_INLINE void sync() { - __syncwarp(laneMask()); + if (nThreadsPow2 > 1) __syncwarp(laneMask()); } }; #endif @@ -43,7 +43,7 @@ typedef ncclCoopTile<32> ncclCoopWarp; #if __CUDACC__ struct ncclCoopLanes { // Some lanes of this warp. 
uint32_t lmask; - + NCCL_DEVICE_INLINE constexpr ncclCoopLanes(uint32_t lmask=-1u): lmask(lmask) {} NCCL_DEVICE_INLINE int thread_rank() const { @@ -71,7 +71,7 @@ struct ncclCoopWarpSpan { NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id): warp0(warp0), nWarps(nWarps), id(id) { } - + NCCL_DEVICE_INLINE int thread_rank() const { return threadIdx.x - 32*warp0; } @@ -100,16 +100,16 @@ struct ncclCoopCta { #if __CUDACC__ template -NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopTile coop) { +NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopTile coop) { return coop.laneMask(); } -NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopLanes coop) { +NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopLanes coop) { return coop.lmask; } -NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopWarpSpan coop) { +NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopWarpSpan coop) { return -1u; } -NCCL_DEVICE_INLINE uint32_t ncclCoopLaneMask(ncclCoopCta coop) { +NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopCta coop) { return -1u; } #endif @@ -126,6 +126,14 @@ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return fa NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; } #endif +#if __CUDACC__ +template +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopTile) { return true; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopLanes) { return true; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopWarpSpan) { return false; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopCta) { return false; } +#endif + #if __CUDACC__ // Pick threads of our warp that are safe to use collectively. NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() { @@ -149,4 +157,55 @@ NCCL_DEVICE_INLINE ncclCoopTile ncclCoopCoalesced(ncclCoopTile +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopTile, T value, int root, bool entrySync=true) { + constexpr int n = (sizeof(T)+4-1)/4; + union { uint32_t u[n]; T v; }; + v = value; + #pragma unroll + for (int i=0; i < n; i++) u[i] = __shfl_sync(-1u, u[i], root, nThreads); + return v; +} +template +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopLanes coop, T value, int root, bool entrySync=true) { + uint32_t m = coop.lmask; + uint32_t r = root == 0 ? 
__ffs(m)-1 : __fns(m, 0, 1+root); + constexpr int n = (sizeof(T)+4-1)/4; + union { uint32_t u[n]; T v; }; + v = value; + #pragma unroll + for (int i=0; i < n; i++) u[i] = __shfl_sync(m, u[i], r); + return v; +} + +NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_WarpSpan_stash() { + __shared__ ulong2 stash[15]; + return stash; +} + +template +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopWarpSpan coop, T value, int root, bool entrySync=true) { + static_assert(sizeof(T) <= sizeof(ncclCoopBcast_WarpSpan_stash()[0]), "Required"); + if (entrySync) coop.sync(); + if (coop.thread_rank() == root) *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id] = value; + coop.sync(); + return *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id]; +} + +NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_Cta_stash() { + __shared__ ulong2 stash; + return &stash; +} + +template +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopCta coop, T value, int root, bool entrySync=true) { + static_assert(sizeof(T) <= sizeof(*ncclCoopBcast_Cta_stash()), "Required"); + if (entrySync) coop.sync(); + if (coop.thread_rank() == root) *(T*)ncclCoopBcast_Cta_stash() = value; + coop.sync(); + return *(T*)ncclCoopBcast_Cta_stash(); +} +#endif + #endif diff --git a/src/include/nccl_device/core.h b/src/include/nccl_device/core.h index dd41d6925..9b0061a72 100644 --- a/src/include/nccl_device/core.h +++ b/src/include/nccl_device/core.h @@ -24,9 +24,15 @@ typedef struct ncclMultimemHandle ncclMultimemHandle_t; typedef uint32_t ncclDevResourceHandle; typedef ncclDevResourceHandle ncclDevResourceHandle_t; +typedef uint32_t ncclGinSignal_t; +typedef uint32_t ncclGinCounter_t; + struct ncclLsaBarrierHandle; typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t; +struct ncclGinBarrierHandle; +typedef struct ncclGinBarrierHandle ncclGinBarrierHandle_t; + struct ncclLLA2AHandle; typedef struct ncclLLA2AHandle ncclLLA2AHandle_t; @@ -59,13 +65,26 @@ struct ncclDevCommRequirements { bool lsaMultimem; // Enable multimem on lsa team + int barrierCount; int lsaBarrierCount; + int railGinBarrierCount; + + int lsaLLA2ABlockCount, lsaLLA2ASlotCount; + + bool ginForceEnable; + int ginContextCount; // This is a hint, the actual context count in the devcomm may not match. + int ginSignalCount; // Guaranteed to start at id=0 + int ginCounterCount; // Guaranteed to start at id=0 }; struct ncclDevResourceRequirements { ncclDevResourceRequirements_t* next; size_t bufferSize, bufferAlign; ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate. + int ginSignalCount; + int ginCounterCount; + ncclGinSignal_t* outGinSignalStart; + ncclGinCounter_t* outGinCounterStart; }; struct ncclTeamRequirements { diff --git a/src/include/nccl_device/gin.h b/src/include/nccl_device/gin.h new file mode 100644 index 000000000..45623d08c --- /dev/null +++ b/src/include/nccl_device/gin.h @@ -0,0 +1,207 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_GIN_SESSION_H_ +#define _NCCL_DEVICE_GIN_SESSION_H_ +#include "core.h" +#include "gin/gin_device_common.h" + +#if __CUDACC__ +struct ncclGinCtx; // Definition in nccl_device/gin/gin_device_host_common.h +template struct ncclGinCtx_M; // ... 
+ +struct ncclGinDescriptorSmem; // A type user allocates in __shared__ memory + +// Used as completion actions for ncclGinSession::put +struct ncclGin_None {}; + +struct ncclGin_SignalAdd { ncclGinSignal_t signal; uint64_t value; }; +// SignalInc: equivalent to SignalAdd{+1} except it may not be mixed with any +// other signal operator without intervening signal reset(). Formally: for a +// given signal, all operations between successive reset()'s of that signal must +// either all be SignalInc or all not SignalInc. +struct ncclGin_SignalInc { ncclGinSignal_t signal; }; +// Support deferred: +// struct ncclGin_SignalSet { ncclGinSignal_t signal; uint64_t value; }; +struct ncclGin_CounterInc { ncclGinCounter_t counter; }; + +struct ncclGin_DescriptorSmem { ncclGinDescriptorSmem* descriptor; }; + +template +struct ncclGin_BackendMask; + +template +using ncclGin_BackendOne = ncclGin_BackendMask<(1u<<(int)backend)>; + +using ncclGin = ncclGin_BackendMask; + +#endif + +#if __CUDACC__ +template +struct ncclGin_BackendMask { + ncclDevComm const& comm; + uint32_t nContexts:8, contextId:8, _ginBackend:8; + + // Loads GIN context into registers. Each context has one QP per peer. + NCCL_DEVICE_INLINE ncclGin_BackendMask(ncclDevComm const&, int contextIndex); + + template< + // Action to take on peer when put completes. If a signalling action is used + // then that signal will be visible only after the payload of this put as well as + // the payloads of preceding puts on this netContext to the same peer are settled. + typename RemoteAction = ncclGin_None, // one of ncclGin_{None|SignalInc|SignalAdd|SignalSet} + // Action to take locally when source has been consumed. + typename LocalAction = ncclGin_None, // one of ncclGin_{None|CounterInc} + // Set of threads participating in this put. Must be a subset of Coop. + typename Coop = ncclCoopThread, + // Optional smem descriptor space to use. Either ncclGin_{None|DescriptorSmem} + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void put( + ncclTeam, int peer, + ncclWindow_t dstWnd, size_t dstOffset, + ncclWindow_t srcWnd, size_t srcOffset, size_t bytes, + RemoteAction remoteAction = ncclGin_None{}, + LocalAction localAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template< + typename T, + // Action to take on peer when put completes. If a signalling action is used + // then that signal will be visible only after the payload of this put as well as + // the payloads of preceding puts on this context to the same peer are settled. + typename RemoteAction = ncclGin_None, // one of ncclGin_{None|SignalInc|SignalAdd|SignalSet} + // Action to take locally when source has been consumed. + typename LocalAction = ncclGin_None, // one of ncclGin_{None|CounterInc} + // Set of threads participating in this put. Must be a subset of Coop. + typename Coop = ncclCoopThread, + // Optional smem descriptor space to use. 
Either ncclGin_{None|DescriptorSmem} + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void put( + ncclTeam, int peer, + ncclSymPtr dstElts, ncclSymPtr srcElts, size_t nElts, + RemoteAction remoteAction = ncclGin_None{}, + LocalAction localAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template< + typename T, // requires sizeof(T) <= 8 + // See put() for all template arguments. + typename RemoteAction = ncclGin_None, + typename Coop = ncclCoopThread, + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void putValue( + ncclTeam, int peer, + ncclWindow_t dstWnd, size_t dstOffset, T value, + RemoteAction remoteAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template< + typename T, // requires sizeof(T) <= 8 + // See put() for all template arguments. + typename RemoteAction = ncclGin_None, + typename Coop = ncclCoopThread, + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void putValue( + ncclTeam, int peer, + ncclSymPtr dst, T value, + RemoteAction remoteAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template + NCCL_DEVICE_INLINE void signal( + ncclTeam, int peer, RemoteAction remoteAction, + Coop coop = ncclCoopThread(), + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + // All source buffers from put's from any thread in this coop will be safe to reuse. + // Flush does not guarantee that data has settled in remote memory. + template + NCCL_DEVICE_INLINE void flush(Coop, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Counter and signal wait use "rolling" comparison logic of a given bit-width + // such that unsigned overflow does not disturb the property that: x < x+1. + // + // bool rolling_less_equal(uint64_t a, uint64_t b, int bits) { + // uint64_t m = uint64_t(-1)>>(64-bits); + // return ((b-a) & m) <= (m>>1); + // } + // + // The condition waited for is that the supplied value is rolling_less_equal + // to the internal value. + // + // Counters are restricted to using a maximum of 56 bits despite that being fewer + // than a uint64_t can carry. + + NCCL_DEVICE_INLINE uint64_t readCounter(ncclGinCounter_t counter, int bits=56, cuda::memory_order ord = cuda::memory_order_acquire) const; + + template + NCCL_DEVICE_INLINE void waitCounter(Coop, ncclGinCounter_t counter, uint64_t least, int bits=56, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Each signal has a dedicated "shadow" which the user is free to manipulate for + // any reason. The only calls which manipulate the shadow are `increaseSignalShadow` + // and `resetSignal`. + NCCL_DEVICE_INLINE uint64_t* getSignalShadowPtr(ncclGinSignal_t signal) const; + NCCL_DEVICE_INLINE void increaseSignalShadow(ncclGinSignal_t signal, uint64_t delta) const; + + // Returns current value of signal with all but bottom bits set to zero. 
+ NCCL_DEVICE_INLINE uint64_t readSignal(ncclGinSignal_t signal, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Wait for signal to meet or exceed value. + template + NCCL_DEVICE_INLINE void waitSignal(Coop, ncclGinSignal_t signal, uint64_t least, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Wait for signal to meet or exceed shadow value. + template + NCCL_DEVICE_INLINE void waitSignalMeetShadow(Coop, ncclGinSignal_t signal, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Wait until signal exceeds shadow by `leastDelta` (typically 1), updates shadow + // with latest value, and returns with `before` equal to previous shadow value + // and `delta` equal to difference. + template + NCCL_DEVICE_INLINE void waitSignalFollowShadow(Coop, ncclGinSignal_t signal, Uint leastDelta, Uint* before, Uint* delta, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Sets to zero. May not race with concurrent modifications to counter. + NCCL_DEVICE_INLINE void resetCounter(ncclGinCounter_t counter) const; + // Sets signal and shadow to zero. May not race with concurrent modifcations to signal. + NCCL_DEVICE_INLINE void resetSignal(ncclGinSignal_t signal) const; + + ////////////////////////////////////////////////////////////////////////////// + // internal: + + void* _ginHandle; + uint64_t* _signalShadows; + + NCCL_DEVICE_INLINE ncclGinCtx_M _makeCtx() const; +}; +#endif + +#endif // _NCCL_DEVICE_GIN_SESSION_H_ diff --git a/src/include/nccl_device/gin/gdaki/gin_gdaki.h b/src/include/nccl_device/gin/gdaki/gin_gdaki.h new file mode 100644 index 000000000..c14a5e292 --- /dev/null +++ b/src/include/nccl_device/gin/gdaki/gin_gdaki.h @@ -0,0 +1,214 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_GIN_GDAKI_H_ +#define _NCCL_DEVICE_GIN_GDAKI_H_ + +#ifndef DOCA_VERBS_USE_CUDA_WRAPPER +#define DOCA_VERBS_USE_CUDA_WRAPPER +#endif + +#ifndef DOCA_VERBS_USE_NET_WRAPPER +#define DOCA_VERBS_USE_NET_WRAPPER +#endif + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG +#define DOCA_GPUNETIO_VERBS_ENABLE_DEBUG 1 +#endif + +#include "../gin_device_common.h" +#include "gin_gdaki_device_host_common.h" +#include "doca_gpunetio/doca_gpunetio_device.h" + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG +#include +#endif + +template <> +struct ncclGinApi_Put { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, bool hasWins, + ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin, + size_t srcOff, size_t bytes, bool hasSignal, + ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp, + uint64_t signalOpArg, bool hasCounter, + ncclGinCounter_t counterId, bool hasDescriptor, + ncclGinDescriptorSmem* descriptor, + cuda::thread_scope required, cuda::thread_scope given) { + using nccl::utility::loadConst; + + coop.sync(); + if (coop.thread_rank() == 0) { + ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle; + doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer; + doca_gpu_dev_verbs_qp* companion_qp; + ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin; + ncclGinGdakiMemHandle* srcMh = (ncclGinGdakiMemHandle*)srcWin; + + doca_gpu_dev_verbs_addr raddr, laddr; + if (hasWins) { + raddr.addr = dstOff; + raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer); + laddr.addr = srcOff, laddr.key = loadConst(&srcMh->lkey); + } + + doca_gpu_dev_verbs_addr sig_raddr, sig_laddr; + if (hasSignal) { + if (signalOp == ncclGinSignalInc) signalOpArg = 1; + sig_raddr.addr = sizeof(uint64_t) * signalId; + sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer); + sig_laddr.addr = 0; + sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey); + } + + doca_gpu_dev_verbs_addr counter_raddr, counter_laddr; + if (hasCounter) { + companion_qp = loadConst(&gdaki->companion_gdqp) + peer; + counter_raddr.addr = sizeof(uint64_t) * counterId; + counter_raddr.key = loadConst(loadConst(&gdaki->counters_table.rkeys) + ctx.rank); + counter_laddr.addr = 0; + counter_laddr.key = loadConst(&gdaki->sink_buffer_lkey); + } + + // cuda::thread_scope_system has the lowest value + if ((required == cuda::thread_scope_system) && (given > required)) { + doca_gpu_dev_verbs_fence_release(); + } + + if (hasWins) { + if (hasSignal && hasCounter) { + doca_gpu_dev_verbs_put_signal_counter( + qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr, + counter_laddr, 1); + } else if (hasSignal) { + doca_gpu_dev_verbs_put_signal( + qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg); + } else if (hasCounter) { + doca_gpu_dev_verbs_put_counter(qp, raddr, laddr, bytes, companion_qp, counter_raddr, + counter_laddr, 1); + } else { + doca_gpu_dev_verbs_put(qp, raddr, laddr, bytes); + } + } else { + if (hasCounter) { + doca_gpu_dev_verbs_signal_counter( + qp, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr, counter_laddr, 1); + } else { + doca_gpu_dev_verbs_signal( + qp, sig_raddr, sig_laddr, signalOpArg); + } + } + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG + doca_gpu_dev_verbs_wait(qp); + if (hasCounter) doca_gpu_dev_verbs_wait(companion_qp); +#endif + } + coop.sync(); + } +}; + 
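// Illustrative sketch (not part of this patch): how a device kernel is expected to
// drive the one-sided GIN API declared in src/include/nccl_device/gin.h earlier in
// this series. The kernel name and the devComm/dstWin/srcWin/sig parameters are
// hypothetical; ncclGin, put(), waitSignal(), flush(), ncclGin_SignalAdd and
// ncclCoopCta are the entry points declared above. It assumes the peer runs the
// same kernel towards us, so each side both signals and waits.
__global__ void ginExchangeSketch(ncclDevComm devComm, ncclTeam team, int peer,
                                  ncclWindow_t dstWin, ncclWindow_t srcWin,
                                  size_t bytes, ncclGinSignal_t sig) {
  ncclCoopCta cta;                           // all threads of this CTA cooperate
  ncclGin gin(devComm, /*contextIndex=*/0);  // load one GIN context into registers
  // One-sided put from our window into the peer's window; once the payload has
  // settled on the peer, its signal `sig` is incremented by 1.
  gin.put(team, peer, dstWin, /*dstOffset=*/0, srcWin, /*srcOffset=*/0, bytes,
          ncclGin_SignalAdd{sig, 1}, ncclGin_None{}, cta);
  // Wait for the peer's matching put to bump our signal, then make our own
  // source buffer safe to reuse.
  gin.waitSignal(cta, sig, /*least=*/1);
  gin.flush(cta);
}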
+template <> +struct ncclGinApi_PutValue { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, ncclGinWindow_t dstWin, + size_t dstOff, T srcVal, bool hasSignal, + ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp, + uint64_t signalOpArg, bool hasDescriptor, + ncclGinDescriptorSmem* descriptor, + cuda::thread_scope required, cuda::thread_scope given) { + using nccl::utility::loadConst; + + coop.sync(); + if (coop.thread_rank() == 0) { + ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle; + doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer; + ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin; + + doca_gpu_dev_verbs_addr raddr; + raddr.addr = dstOff; + raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer); + + doca_gpu_dev_verbs_addr sig_raddr, sig_laddr; + if (hasSignal) { + if (signalOp == ncclGinSignalInc) signalOpArg = 1; + sig_raddr.addr = sizeof(uint64_t) * signalId; + sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer); + sig_laddr.addr = 0; + sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey); + } + + // cuda::thread_scope_system has the lowest value + if ((required == cuda::thread_scope_system) && (given > required)) { + doca_gpu_dev_verbs_fence_release(); + } + + if (hasSignal) { + doca_gpu_dev_verbs_p_signal( + qp, raddr, srcVal, sig_raddr, sig_laddr, signalOpArg); + } else { + doca_gpu_dev_verbs_p(qp, raddr, srcVal); + } + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG + doca_gpu_dev_verbs_wait(qp); +#endif + } + coop.sync(); + } +}; + +template <> +struct ncclGinApi_ResetCounter { + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinCounter_t counterId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + loadConst(&gdaki->counters_table.buffer)[counterId] = 0; + } +}; + +template <> +struct ncclGinApi_ResetSignal { + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinSignal_t signalId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + loadConst(&gdaki->signals_table.buffer)[signalId] = 0; + } +}; + +template <> +struct ncclGinApi_GetCounterPtr { + NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinCounter_t counterId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + return loadConst(&gdaki->counters_table.buffer) + counterId; + } +}; + +template <> +struct ncclGinApi_GetSignalPtr { + NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinSignal_t signalId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + return loadConst(&gdaki->signals_table.buffer) + signalId; + } +}; + +template <> +struct ncclGinApi_Flush { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, cuda::memory_order ord) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + doca_gpu_dev_verbs_qp* qps = loadConst(&gdaki->gdqp); +#pragma unroll 1 + for (int peer = coop.thread_rank(); peer < ctx.nRanks; peer += coop.size()) { + doca_gpu_dev_verbs_wait(qps + peer); + } + } +}; + +#endif /* _NCCL_DEVICE_GIN_GDAKI_H_ */ diff --git a/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h b/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h new file mode 100644 index 000000000..20299346f --- /dev/null +++ 
+++ b/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h
@@ -0,0 +1,36 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+#define _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+
+#include
+
+#define NCCL_GIN_GDAKI_VERSION 100
+
+template <typename T>
+struct ncclGinGdakiGlobalGPUBufferTable {
+  T *buffer;
+  __be32 *rkeys;
+  __be32 lkey;
+};
+
+struct ncclGinGdakiGPUContext {
+  struct doca_gpu_dev_verbs_qp *gdqp;
+  struct doca_gpu_dev_verbs_qp *companion_gdqp;
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> counters_table;
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> signals_table;
+
+  // Local buffer we don't consume but is required for some operations.
+  __be32 sink_buffer_lkey;
+};
+
+struct ncclGinGdakiMemHandle {
+  __be32 *rkeys;
+  __be32 lkey;
+};
+
+#endif /* _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_ */
diff --git a/src/include/nccl_device/gin/gin_device_api.h b/src/include/nccl_device/gin/gin_device_api.h
new file mode 100644
index 000000000..20dde3af3
--- /dev/null
+++ b/src/include/nccl_device/gin/gin_device_api.h
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef _NCCL_GIN_DEVICE_API_H_
+#define _NCCL_GIN_DEVICE_API_H_
+
+#include "gin_device_common.h"
+
+#if NCCL_GIN_GDAKI_ENABLE
+#include "gdaki/gin_gdaki.h"
+#endif
+#if NCCL_GIN_PROXY_ENABLE
+#include "proxy/gin_proxy.h"
+#endif
+
+#endif
diff --git a/src/include/nccl_device/gin/gin_device_common.h b/src/include/nccl_device/gin/gin_device_common.h
new file mode 100644
index 000000000..d0d4c8fa3
--- /dev/null
+++ b/src/include/nccl_device/gin/gin_device_common.h
@@ -0,0 +1,120 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_GIN_DEVICE_COMMON_H_
+#define _NCCL_GIN_DEVICE_COMMON_H_
+
+#include "../net_device.h"
+#include "../utility.h"
+#include "gin_device_host_common.h"
+
+#if CUDA_VERSION >= 12080 && __CUDA_ARCH__ >= 900
+#define NCCL_GIN_HAS_FENCE_ACQUIRE_RELEASE_PTX 1
+#endif
+
+#ifndef NCCL_GIN_PROXY_ENABLE
+#define NCCL_GIN_PROXY_ENABLE 1
+#endif
+
+#ifndef NCCL_GIN_GDAKI_ENABLE
+#if CUDA_VERSION >= 12020 && __CUDA_ARCH__ >= 700
+#define NCCL_GIN_GDAKI_ENABLE 1
+#else
+#define NCCL_GIN_GDAKI_ENABLE 0
+#endif
+#endif
+
+#define NCCL_GIN_BACKEND_MASK_ALL \
+  (((NCCL_GIN_PROXY_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_PROXY | \
+   ((NCCL_GIN_GDAKI_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_GDAKI)
+
+struct ncclGinCtx {
+  ncclNetDeviceType backend;
+  int rank;
+  int nRanks;
+  void* handle;
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinCtx_M : ncclGinCtx {};
+
+struct ncclGinDescriptorSmem {
+  alignas(16) char space[64];
+};
+
+#if __CUDACC__
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Put {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, bool hasWins,
+                                      ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin,
+                                      size_t srcOff, size_t bytes, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasCounter,
+                                      ncclGinCounter_t counterId, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_PutValue {
+  template <typename T, typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, ncclGinWindow_t dstWin,
+                                      size_t dstOff, T srcData, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetSignalPtr {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinSignal_t signalId);
+};
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetCounterPtr {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinCounter_t counterId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetSignal {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinSignal_t signalId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetCounter {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinCounter_t counterId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Flush {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop, cuda::memory_order ord);
+};
+#endif
+
+#if __CUDACC__
+template