NVIDIA Collective Communications Library integration for multi-GPU operations. Initialize NCCL communicators, execute collective operations, configure communication topologies, profile collective performance, and support RCCL for AMD compatibility.
Manages multi-GPU collective operations using NVIDIA NCCL for high-performance distributed computing.
npx claudepluginhub a5c-ai/babysitter
This skill is limited to using the following tools:
README.md
You are nccl-communication - a specialized skill for NVIDIA Collective Communications Library (NCCL) integration. This skill provides expert capabilities for multi-GPU collective operations.
This skill enables AI-powered multi-GPU communication including:
Initialize communicators:
#include <nccl.h>
// --- Single-node, single-process multi-GPU initialization ---
// ncclCommInitAll creates one communicator per listed device from a
// single host thread.
int numGPUs = 4;
ncclComm_t comms[4];
int devs[4] = {0, 1, 2, 3};
ncclCommInitAll(comms, numGPUs, devs);
// ... run collectives on comms[0..numGPUs-1] ...
// Every communicator created by ncclCommInitAll must be destroyed;
// the original example leaked all four.
for (int i = 0; i < numGPUs; ++i) {
    ncclCommDestroy(comms[i]);
}
// --- Per-rank initialization for MPI integration ---
// Rank 0 mints a unique id and broadcasts it; every rank then joins
// the same NCCL communicator under its own rank number.
ncclUniqueId id;
ncclComm_t comm;
if (rank == 0) {
    ncclGetUniqueId(&id);
}
MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);
cudaSetDevice(localRank);  // bind this rank to its local GPU BEFORE init
ncclCommInitRank(&comm, worldSize, id, rank);
// Cleanup
ncclCommDestroy(comm);
Reduce across all GPUs:
// "Synchronous" all-reduce: the NCCL call itself only enqueues work on
// `stream`; the host blocks on the cudaStreamSynchronize below.
ncclAllReduce(sendbuff, recvbuff, count, ncclFloat,
ncclSum, comm, stream);
cudaStreamSynchronize(stream);
// In-place all-reduce: pass the same pointer as send and receive buffer.
ncclAllReduce(buff, buff, count, ncclFloat, ncclSum, comm, stream);
// Supported reduction operations:
// ncclSum, ncclProd, ncclMax, ncclMin, ncclAvg
// NOTE(review): ncclAvg is only available in newer NCCL releases —
// verify against your installed version before relying on it.
// Multiple data types:
// ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64
// ncclFloat16, ncclFloat32, ncclFloat64, ncclBfloat16
// NOTE(review): ncclBfloat16 likewise requires a recent NCCL — confirm.
Gather data from all GPUs:
// All-gather: each GPU contributes sendcount elements
// Result: recvbuff has numGPUs * sendcount elements per GPU
// (recvbuff must therefore be allocated for numGPUs * sendcount elements).
ncclAllGather(sendbuff, recvbuff, sendcount, ncclFloat, comm, stream);
// Verify output size
size_t totalElements = sendcount * numGPUs;
// Reduce-scatter: reduces and scatters to each GPU
// Each GPU gets 1/numGPUs of the reduced result
// (recvcount here is the PER-RANK element count; sendbuff holds
// numGPUs * recvcount elements — TODO confirm buffer sizing at call site).
ncclReduceScatter(sendbuff, recvbuff, recvcount, ncclFloat,
ncclSum, comm, stream);
// Useful for gradient reduction in data parallelism
// Broadcast from root to all ranks in the communicator.
int root = 0;
ncclBroadcast(sendbuff, recvbuff, count, ncclFloat, root, comm, stream);
// In-place broadcast: same pointer for send and receive.
ncclBroadcast(buff, buff, count, ncclFloat, root, comm, stream);
// Reduce to root: only the root rank receives the reduced result.
ncclReduce(sendbuff, recvbuff, count, ncclFloat, ncclSum, root, comm, stream);
Batch multiple operations:
// Start group: calls between Start/End are batched and submitted together.
ncclGroupStart();
// Queue multiple operations (nothing launches until ncclGroupEnd).
ncclAllReduce(buff1, buff1, count1, ncclFloat, ncclSum, comm, stream);
ncclAllReduce(buff2, buff2, count2, ncclFloat, ncclSum, comm, stream);
ncclBroadcast(buff3, buff3, count3, ncclFloat, 0, comm, stream);
// End group - operations execute efficiently
ncclGroupEnd();
// Useful for:
// - Multiple collectives in single launch
// - Send/Recv pairs for point-to-point
// Send from rank 0 to rank 1. Here each rank issues only ONE of the two
// calls, so no grouping is required.
if (rank == 0) {
ncclSend(sendbuff, count, ncclFloat, 1, comm, stream);
} else if (rank == 1) {
ncclRecv(recvbuff, count, ncclFloat, 0, comm, stream);
}
// Bidirectional exchange using groups: both ranks issue a send AND a
// recv, so they must be grouped — issuing them sequentially could
// deadlock with both peers blocked in the send.
ncclGroupStart();
ncclSend(sendbuff, count, ncclFloat, peerRank, comm, stream);
ncclRecv(recvbuff, count, ncclFloat, peerRank, comm, stream);
ncclGroupEnd();
Configure for hardware topology:
# Check GPU topology (shows NVLink/PCIe connectivity matrix)
nvidia-smi topo -m
# Environment variables for optimization
export NCCL_TOPO_FILE=/path/to/topo.xml
export NCCL_GRAPH_FILE=/path/to/graph.xml
# Algorithm selection — the lines below are mutually exclusive
# alternatives; set ONE (a later export overrides an earlier one)
export NCCL_ALGO=Tree # Tree reduction
export NCCL_ALGO=Ring # Ring reduction
export NCCL_ALGO=CollnetDirect # NVSwitch direct
# Protocol selection — likewise, pick ONE of the following
export NCCL_PROTO=Simple # Default
export NCCL_PROTO=LL # Low-latency
export NCCL_PROTO=LL128 # Low-latency 128-byte
# Network settings
export NCCL_IB_DISABLE=0 # Enable InfiniBand
export NCCL_NET_GDR_LEVEL=5 # GPU Direct RDMA level
// Multi-node NCCL initialization with MPI.
// One MPI rank per GPU; local rank within the node selects the device.
#include <mpi.h>
#include <nccl.h>
#include <stdio.h>

// Abort the whole job on any MPI/CUDA/NCCL failure instead of silently
// continuing with a broken communicator.
#define MPICHECK(cmd) do {                                              \
    int e_ = (cmd);                                                     \
    if (e_ != MPI_SUCCESS) {                                            \
        fprintf(stderr, "MPI error %s:%d: %d\n", __FILE__, __LINE__, e_); \
        MPI_Abort(MPI_COMM_WORLD, e_);                                  \
    }                                                                   \
} while (0)

#define CUDACHECK(cmd) do {                                             \
    cudaError_t e_ = (cmd);                                             \
    if (e_ != cudaSuccess) {                                            \
        fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                cudaGetErrorString(e_));                                \
        MPI_Abort(MPI_COMM_WORLD, 1);                                   \
    }                                                                   \
} while (0)

#define NCCLCHECK(cmd) do {                                             \
    ncclResult_t r_ = (cmd);                                            \
    if (r_ != ncclSuccess) {                                            \
        fprintf(stderr, "NCCL error %s:%d: %s\n", __FILE__, __LINE__,   \
                ncclGetErrorString(r_));                                \
        MPI_Abort(MPI_COMM_WORLD, 1);                                   \
    }                                                                   \
} while (0)

int main(int argc, char* argv[]) {
    MPICHECK(MPI_Init(&argc, &argv));
    int worldSize, rank;
    MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &worldSize));
    MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));

    // Get local rank (rank within this node) for GPU assignment.
    int localRank;
    MPI_Comm localComm;
    MPICHECK(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank,
                                 MPI_INFO_NULL, &localComm));
    MPICHECK(MPI_Comm_rank(localComm, &localRank));
    // The split communicator is only needed to compute localRank;
    // free it (the original example leaked it).
    MPICHECK(MPI_Comm_free(&localComm));

    // Initialize NCCL: rank 0 mints the unique id, everyone joins.
    ncclUniqueId id;
    if (rank == 0) NCCLCHECK(ncclGetUniqueId(&id));
    MPICHECK(MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));
    CUDACHECK(cudaSetDevice(localRank));  // must precede ncclCommInitRank
    ncclComm_t comm;
    NCCLCHECK(ncclCommInitRank(&comm, worldSize, id, rank));

    // Use comm for collectives...

    NCCLCHECK(ncclCommDestroy(comm));
    MPICHECK(MPI_Finalize());
    return 0;
}
// NCCL timing with CUDA events.
// Events are recorded on the same stream as the collective, so the
// elapsed time covers exactly the enqueued NCCL work.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, stream);
ncclAllReduce(buff, buff, count, ncclFloat, ncclSum, comm, stream);
cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);  // block host until `stop` has been reached
float milliseconds = 0.0f;   // initialized in case ElapsedTime fails
cudaEventElapsedTime(&milliseconds, start, stop);
// Calculate bandwidth: bytes / ms / 1e6 == GB/s
size_t bytes = count * sizeof(float);
float algoBW = bytes / milliseconds / 1e6f; // GB/s
// All-reduce moves 2*(n-1)/n times the data per GPU across the bus.
float busBW = algoBW * 2 * (numGPUs - 1) / numGPUs; // Bus bandwidth
printf("AllReduce: %.2f ms, %.2f GB/s (bus: %.2f GB/s)\n",
milliseconds, algoBW, busBW);
// Destroy the events — the original example leaked them.
cudaEventDestroy(start);
cudaEventDestroy(stop);
# Enable NCCL debug output (INFO prints topology/algorithm choices at init)
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
# NCCL tests for benchmarking
# -b/-e: min/max message size, -f: size multiplier, -g: GPUs per process
./build/all_reduce_perf -b 8 -e 256M -f 2 -g 4
This skill integrates with the following processes:
multi-gpu-programming.js - Multi-GPU development
gpu-cluster-computing.js - Cluster computing
Example output:
{
"operation": "all-reduce",
"status": "success",
"configuration": {
"num_gpus": 4,
"data_size_bytes": 268435456,
"data_type": "float32",
"reduction": "sum"
},
"performance": {
"time_ms": 2.34,
"algorithm_bandwidth_gbps": 114.5,
"bus_bandwidth_gbps": 171.8
},
"topology": {
"interconnect": "NVLink",
"algorithm": "Tree",
"protocol": "LL128"
}
}
Activates when the user asks about AI prompts, needs prompt templates, wants to search for prompts, or mentions prompts.chat. Use for discovering, retrieving, and improving prompts.
Search, retrieve, and install Agent Skills from the prompts.chat registry using MCP tools. Use when the user asks to find skills, browse skill catalogs, install a skill for Claude, or extend Claude's capabilities with reusable AI agent components.
This skill should be used when the user asks to "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "agent tools", "agent colors", "autonomous agent", or needs guidance on agent structure, system prompts, triggering conditions, or agent development best practices for Claude Code plugins.