Deep integration with NVIDIA CUDA toolkit for kernel development, compilation, and debugging. Execute nvcc compilation with optimization flags analysis, generate and validate CUDA kernel code, analyze PTX/SASS assembly output, and configure execution parameters.
Develops and compiles CUDA kernels with optimization analysis and PTX/SASS assembly inspection.
npx claudepluginhub a5c-ai/babysitter

This skill is limited to using the following tools:
README.md

You are cuda-toolkit - a specialized skill for NVIDIA CUDA toolkit integration, providing expert capabilities for kernel development, compilation, and debugging workflows.
This skill enables AI-powered CUDA development operations including:
Compile CUDA programs with various optimization flags:
# Basic compilation
nvcc -o program program.cu
# Optimized release build
nvcc -O3 -use_fast_math -o program program.cu
# Debug build with line info
nvcc -G -lineinfo -o program_debug program.cu
# Specify compute capability
nvcc -arch=sm_80 -o program program.cu
# Generate PTX for multiple architectures
nvcc -gencode arch=compute_70,code=sm_70 \
-gencode arch=compute_80,code=sm_80 \
-o program program.cu
# Verbose compilation
nvcc -v --ptxas-options=-v -o program program.cu
Generate properly structured CUDA kernels:
// Thread indexing patterns
// Doubles each of the n elements of `data` in place.
// Expects a 1D launch; extra threads past the tail exit immediately.
__global__ void kernel1D(float* data, int n) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= n) return;  // guard: grid rarely divides n evenly
    data[tid] *= 2.0f;
}
// Doubles every element of a row-major width x height buffer in place.
// Expects a 2D launch covering at least width x height threads.
__global__ void kernel2D(float* data, int width, int height) {
    int col = threadIdx.x + blockIdx.x * blockDim.x;
    int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (col >= width || row >= height) return;  // guard both grid tails
    data[row * width + col] *= 2.0f;
}
// Doubles every element of a dimX x dimY x dimZ volume (x varies fastest).
// Expects a 3D launch covering at least dimX x dimY x dimZ threads.
__global__ void kernel3D(float* data, int dimX, int dimY, int dimZ) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;
    if (x < dimX && y < dimY && z < dimZ) {
        // Widen to size_t before multiplying: z*dimX*dimY computed in int
        // silently overflows once the volume exceeds 2^31 elements.
        size_t idx = ((size_t)z * dimY + y) * (size_t)dimX + x;
        data[idx] = data[idx] * 2.0f;
    }
}
Calculate optimal launch parameters:
// Launch configuration helper
// Launches kernel1D over n elements with a 256-thread block size,
// clamping the grid to the device's maximum x-dimension.
// d_data must be a device pointer to at least n floats.
// NOTE(review): kernel1D is not grid-stride, so if the clamp ever fires the
// tail elements are NOT processed — this now warns instead of failing silently.
void launchKernel(float* d_data, int n) {
    if (n <= 0) return;  // a zero-block launch is a runtime error; nothing to do

    const int blockSize = 256;                        // multiple of 32; common sweet spot
    int numBlocks = (n + blockSize - 1) / blockSize;  // ceil-div so the tail is covered

    int deviceId = 0;
    cudaDeviceProp props;
    if (cudaGetDevice(&deviceId) != cudaSuccess ||
        cudaGetDeviceProperties(&props, deviceId) != cudaSuccess) {
        fprintf(stderr, "launchKernel: failed to query device properties\n");
        return;
    }
    if (numBlocks > props.maxGridSize[0]) {
        fprintf(stderr,
                "launchKernel: grid clamped from %d to %d blocks; "
                "tail elements will not be processed\n",
                numBlocks, props.maxGridSize[0]);
        numBlocks = props.maxGridSize[0];
    }

    kernel1D<<<numBlocks, blockSize>>>(d_data, n);

    // Launches return no status directly; fetch configuration errors here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "launchKernel: launch failed: %s\n", cudaGetErrorString(err));
    }
}
// Ask the runtime for the block size that maximizes occupancy for kernel1D.
// minGridSize is the smallest grid that can reach full device occupancy;
// the trailing (0, 0) args are dynamic shared-memory bytes and block-size
// limit (0 = no limit). Check the returned cudaError_t in production code.
int minGridSize, blockSize;
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernel1D, 0, 0);
Analyze generated assembly:
# Generate PTX
nvcc -ptx -o program.ptx program.cu
# View PTX
cat program.ptx
# Generate SASS (device assembly)
cuobjdump -sass program > program.sass
# Analyze register usage
nvcc --ptxas-options=-v program.cu 2>&1 | grep -E "registers|memory"
# Dump detailed resource usage
cuobjdump --dump-resource-usage program
Generate proper memory management code:
// Host-device memory transfer pattern
// Copies n floats to the device, runs processKernel, and copies the result
// back. h_input and h_output are host pointers to at least n floats each.
// Every runtime call is checked; device buffers are released on all paths
// (the original leaked d_input if the second cudaMalloc failed).
void processData(float* h_input, float* h_output, int n) {
    if (n <= 0) return;  // empty input: nothing to transfer or launch

    float* d_input = NULL;
    float* d_output = NULL;
    size_t size = (size_t)n * sizeof(float);  // widen before multiply to avoid int overflow
    cudaError_t err;

    // Allocate device memory. cudaFree(NULL) is a no-op, so the failure
    // path below is safe whichever allocation failed.
    err = cudaMalloc(&d_input, size);
    if (err != cudaSuccess) goto fail;
    err = cudaMalloc(&d_output, size);
    if (err != cudaSuccess) goto fail;

    // Copy input to device (blocking for pageable host memory).
    err = cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) goto fail;

    // Launch kernel: ceil-div grid so the tail is covered.
    {
        int blockSize = 256;
        int numBlocks = (n + blockSize - 1) / blockSize;
        processKernel<<<numBlocks, blockSize>>>(d_input, d_output, n);
    }
    err = cudaGetLastError();  // launches return no status; catch config errors
    if (err != cudaSuccess) goto fail;

    // Copy output to host; this blocks until the kernel has finished,
    // so it also surfaces asynchronous execution errors.
    err = cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) goto fail;

    cudaFree(d_input);
    cudaFree(d_output);
    return;

fail:
    fprintf(stderr, "processData: %s\n", cudaGetErrorString(err));
    cudaFree(d_input);
    cudaFree(d_output);
}
// Pinned (page-locked) host memory: required for truly async cudaMemcpyAsync
// and gives higher H2D/D2H bandwidth than pageable memory. Avoid pinning a
// large fraction of system RAM.
float* h_pinned;
cudaMallocHost(&h_pinned, size);  // NOTE(review): `size` assumed defined by the surrounding example
// ... use h_pinned ...
cudaFreeHost(h_pinned);  // must pair with cudaMallocHost, never free()
Comprehensive error checking:
// Wraps a CUDA runtime call; on failure prints file/line context and aborts.
// The argument is evaluated exactly once. `(call)` is parenthesized so comma
// expressions or casts in the argument cannot break the expansion, and the
// local is named err_ so it cannot shadow a caller's own `err` variable.
#define CUDA_CHECK(call) \
    do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "CUDA Error at %s:%d: %s\n", \
                    __FILE__, __LINE__, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while(0)
// Usage: wrap every runtime API call so failures surface immediately.
CUDA_CHECK(cudaMalloc(&d_data, size));
CUDA_CHECK(cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice));
// Check kernel errors: launches return no status directly.
// cudaGetLastError() catches bad launch configurations; the synchronize
// surfaces asynchronous in-kernel faults (drop it in hot paths, keep the
// sync only at coarse boundaries).
myKernel<<<blocks, threads>>>(d_data, n);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaDeviceSynchronize());
Target specific GPU architectures:
# SM versions and features
# sm_50 - Maxwell (dynamic parallelism)
# sm_60 - Pascal (unified memory, FP16)
# sm_70 - Volta (tensor cores, independent thread scheduling)
# sm_75 - Turing (RT cores, INT8 tensor cores)
# sm_80 - Ampere (TF32, sparse tensor cores)
# sm_86 - Ampere consumer
# sm_89 - Ada Lovelace
# sm_90 - Hopper (transformer engine, TMA)
# Compile for specific capability
nvcc -arch=sm_80 -code=sm_80 program.cu
# Fat binary for multiple architectures
nvcc -gencode arch=compute_70,code=sm_70 \
-gencode arch=compute_80,code=sm_80 \
-gencode arch=compute_90,code=sm_90 \
-o program program.cu
Validate resource constraints:
// Specify launch bounds for occupancy
// Doubles each of the n elements of `data` in place.
// __launch_bounds__: at most 256 threads per block; the compiler budgets
// registers so at least 4 blocks can be resident per SM.
__global__ void __launch_bounds__(256, 4)
boundedKernel(float* data, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) {
        data[i] = data[i] * 2.0f;
    }
}
// Query and validate resources
void validateLaunch() {
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, boundedKernel);
printf("Registers: %d\n", attr.numRegs);
printf("Shared memory: %zu bytes\n", attr.sharedSizeBytes);
printf("Max threads per block: %d\n", attr.maxThreadsPerBlock);
}
This skill integrates with the following processes:
- cuda-kernel-development.js - Kernel development workflow
- cuda-stream-concurrency.js - Stream management
- custom-cuda-operator-development.js - Custom operator creation
- dynamic-parallelism-implementation.js - Dynamic parallelism

When executing operations, provide structured output:
{
"operation": "compile",
"status": "success",
"compiler": "nvcc",
"flags": ["-O3", "-arch=sm_80"],
"output": {
"binary": "program",
"ptx": "program.ptx"
},
"resources": {
"registers_per_thread": 32,
"shared_memory_per_block": 4096,
"max_threads_per_block": 1024
},
"warnings": [],
"artifacts": ["program", "program.ptx"]
}
Activates when the user asks about AI prompts, needs prompt templates, wants to search for prompts, or mentions prompts.chat. Use for discovering, retrieving, and improving prompts.
Search, retrieve, and install Agent Skills from the prompts.chat registry using MCP tools. Use when the user asks to find skills, browse skill catalogs, install a skill for Claude, or extend Claude's capabilities with reusable AI agent components.
This skill should be used when the user asks to "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "agent tools", "agent colors", "autonomous agent", or needs guidance on agent structure, system prompts, triggering conditions, or agent development best practices for Claude Code plugins.