Expert integration with NVIDIA GPU-accelerated math libraries. Configure cuBLAS tensor core operations, generate cuBLAS GEMM calls, integrate cuDNN layers, handle algorithm selection, and support mixed-precision operations.
Integrates NVIDIA GPU math libraries to generate cuBLAS operations and cuDNN layers for accelerated computing.
npx claudepluginhub a5c-ai/babysitter

This skill is limited to using the following tools:
README.md

You are cublas-cudnn - a specialized skill for NVIDIA GPU-accelerated math library integration. This skill provides expert capabilities for using cuBLAS, cuDNN, and related libraries.
This skill enables AI-powered GPU library operations including:
Matrix multiplication with cuBLAS:
#include <cublas_v2.h>
// Initialize cuBLAS
// NOTE: every cuBLAS call returns cublasStatus_t; production code should
// check each return against CUBLAS_STATUS_SUCCESS (omitted here for brevity).
cublasHandle_t handle;
cublasCreate(&handle);
// Standard SGEMM: C = alpha * A * B + beta * C
// cuBLAS assumes COLUMN-major storage: for an M x K matrix A stored
// column-major, the leading dimension is the row count (lda = M).
float alpha = 1.0f, beta = 0.0f;
cublasSgemm(handle,
CUBLAS_OP_N, CUBLAS_OP_N, // No transpose
M, N, K, // Dimensions
&alpha,
d_A, M, // A matrix and leading dimension
d_B, K, // B matrix and leading dimension
&beta,
d_C, M); // C matrix and leading dimension
// Batched GEMM for multiple matrices
// d_Aarray / d_Barray / d_Carray are DEVICE-resident arrays of device
// pointers (one pointer per batch element), not flat data buffers.
cublasSgemmBatched(handle,
CUBLAS_OP_N, CUBLAS_OP_N,
M, N, K,
&alpha,
d_Aarray, M,
d_Barray, K,
&beta,
d_Carray, M,
batchCount);
// Strided batched GEMM (contiguous memory)
// Matrices are laid out back-to-back in single buffers; strideA/B/C are
// ELEMENT offsets between consecutive matrices. Preferred over the
// pointer-array variant when the layout allows, since no pointer array
// has to be built and copied to the device.
cublasSgemmStridedBatched(handle,
CUBLAS_OP_N, CUBLAS_OP_N,
M, N, K,
&alpha,
d_A, M, strideA,
d_B, K, strideB,
&beta,
d_C, M, strideC,
batchCount);
Enable tensor cores for maximum performance:
// Enable tensor cores (requires Volta+)
// NOTE: CUBLAS_TENSOR_OP_MATH is deprecated since cuBLAS 11, where tensor
// cores are used by default for eligible types; keep this call only when
// targeting cuBLAS 10.x.
cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
// For Ampere+, opt in to TF32 acceleration of FP32 GEMMs
cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH);
// Half precision GEMM with tensor cores.
// BUG FIX: with computeType == CUDA_R_16F, alpha and beta must point to
// __half values — the scalar type must match the compute type. The original
// passed float* here, which is undefined for an FP16 compute type.
#include <cuda_fp16.h> // __half, __float2half
__half alpha_h = __float2half(1.0f);
__half beta_h  = __float2half(0.0f);
cublasGemmEx(handle,
CUBLAS_OP_N, CUBLAS_OP_N,
M, N, K,
&alpha_h,
d_A, CUDA_R_16F, M, // FP16 input
d_B, CUDA_R_16F, K, // FP16 input
&beta_h,
d_C, CUDA_R_16F, M, // FP16 output
CUDA_R_16F, // Compute type: FP16 accumulate (fastest, least accurate)
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
// Mixed precision: FP16 inputs, FP32 accumulate (preferred for accuracy).
// Here computeType == CUDA_R_32F, so float alpha/beta are correct.
float alpha = 1.0f, beta = 0.0f;
cublasGemmEx(handle,
CUBLAS_OP_N, CUBLAS_OP_N,
M, N, K,
&alpha,
d_A, CUDA_R_16F, M,
d_B, CUDA_R_16F, K,
&beta,
d_C, CUDA_R_32F, M, // FP32 output
CUDA_R_32F, // FP32 compute
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
Deep learning convolutions:
#include <cudnn.h>
// cuDNN convolution forward pass.
// NOTE: every cuDNN call returns cudnnStatus_t; production code should
// check each return value (omitted here for brevity).
cudnnHandle_t cudnn;
cudnnCreate(&cudnn);
// Create tensor descriptors
cudnnTensorDescriptor_t inputDesc, outputDesc;
cudnnCreateTensorDescriptor(&inputDesc);
cudnnCreateTensorDescriptor(&outputDesc);
cudnnSetTensor4dDescriptor(inputDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT, N, C, H, W);
// Create filter descriptor: K output channels, C input channels, R x S kernel
cudnnFilterDescriptor_t filterDesc;
cudnnCreateFilterDescriptor(&filterDesc);
cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT,
CUDNN_TENSOR_NCHW, K, C, R, S);
// Create convolution descriptor
cudnnConvolutionDescriptor_t convDesc;
cudnnCreateConvolutionDescriptor(&convDesc);
cudnnSetConvolution2dDescriptor(convDesc,
pad_h, pad_w, // Padding
stride_h, stride_w, // Stride
1, 1, // Dilation
CUDNN_CROSS_CORRELATION,
CUDNN_DATA_FLOAT);
// Enable tensor cores
cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH);
// BUG FIX: outputDesc was created but never configured before being used
// below. Query the output shape implied by input/filter/conv descriptors
// and set the output descriptor accordingly.
int outN, outC, outH, outW;
cudnnGetConvolution2dForwardOutputDim(convDesc, inputDesc, filterDesc,
&outN, &outC, &outH, &outW);
cudnnSetTensor4dDescriptor(outputDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT, outN, outC, outH, outW);
// Find best algorithm (runs candidates; results sorted fastest first)
cudnnConvolutionFwdAlgoPerf_t perfResults[8];
int returnedAlgoCount;
cudnnFindConvolutionForwardAlgorithm(cudnn,
inputDesc, filterDesc, convDesc, outputDesc,
8, &returnedAlgoCount, perfResults);
cudnnConvolutionFwdAlgo_t algo = perfResults[0].algo;
// Get workspace size required by the selected algorithm
size_t workspaceSize;
cudnnGetConvolutionForwardWorkspaceSize(cudnn,
inputDesc, filterDesc, convDesc, outputDesc,
algo, &workspaceSize);
void* workspace;
cudaMalloc(&workspace, workspaceSize);
// Execute convolution: output = alpha * conv(input, filter) + beta * output
float alpha = 1.0f, beta = 0.0f;
cudnnConvolutionForward(cudnn,
&alpha,
inputDesc, d_input,
filterDesc, d_filter,
convDesc, algo, workspace, workspaceSize,
&beta,
outputDesc, d_output);
// Batch normalization: derive the scale/bias/mean/variance descriptor
// from the input tensor layout (SPATIAL mode => one value per channel).
cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc;
cudnnCreateTensorDescriptor(&bnScaleBiasMeanVarDesc);
cudnnDeriveBNTensorDescriptor(bnScaleBiasMeanVarDesc, inputDesc,
CUDNN_BATCHNORM_SPATIAL);
// Forward training
// Updates the running mean/variance and stores saved mean / inverse
// variance for reuse in the backward pass. alpha/beta are assumed to be
// in scope from the preceding example.
cudnnBatchNormalizationForwardTraining(cudnn,
CUDNN_BATCHNORM_SPATIAL,
&alpha, &beta,
inputDesc, d_input,
outputDesc, d_output,
bnScaleBiasMeanVarDesc,
d_scale, d_bias,
0.1, // Exponential average factor (double per the cuDNN API)
d_runningMean, d_runningVariance,
1e-5, // Epsilon — cuDNN enforces a minimum (CUDNN_BN_MIN_EPSILON); verify
d_savedMean, d_savedInvVariance);
// Forward inference
// Uses the running statistics accumulated during training; updates nothing.
cudnnBatchNormalizationForwardInference(cudnn,
CUDNN_BATCHNORM_SPATIAL,
&alpha, &beta,
inputDesc, d_input,
outputDesc, d_output,
bnScaleBiasMeanVarDesc,
d_scale, d_bias,
d_runningMean, d_runningVariance,
1e-5);
// Benchmark all algorithms with real buffers (the Ex variant actually
// executes each candidate using the supplied data and workspace).
cudnnConvolutionFwdAlgoPerf_t perfResults[CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
int returnedCount;
cudnnFindConvolutionForwardAlgorithmEx(cudnn,
inputDesc, d_input,
filterDesc, d_filter,
convDesc,
outputDesc, d_output,
CUDNN_CONVOLUTION_FWD_ALGO_COUNT,
&returnedCount,
perfResults,
workspace, workspaceSize);
// Print benchmark results (sorted by execution time, fastest first)
for (int i = 0; i < returnedCount; i++) {
printf("Algorithm %d: %.3f ms, workspace: %zu bytes\n",
(int)perfResults[i].algo, // explicit cast: enum through varargs
perfResults[i].time,
perfResults[i].memory);
}
// Select algorithm by heuristics (no measurement — cheaper than Find*)
cudnnConvolutionFwdAlgo_t algo;
cudnnGetConvolutionForwardAlgorithm_v7(cudnn,
inputDesc, filterDesc, convDesc, outputDesc,
8, &returnedCount, perfResults);
// BUG FIX: algo was declared but never assigned; take the top-ranked
// heuristic recommendation.
algo = perfResults[0].algo;
// Query workspace for all operations
// NOTE: the "..." argument lists are elided pseudo-code — pass the real
// descriptors as shown in the full examples above. bnWorkspace and
// poolWorkspace are presumably filled by analogous queries; confirm against
// the cuDNN API before use.
size_t convWorkspace, bnWorkspace, poolWorkspace;
cudnnGetConvolutionForwardWorkspaceSize(cudnn, ...);
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnn, ...);
// Allocate maximum needed so a single buffer serves every operation
size_t maxWorkspace = max(convWorkspace, max(bnWorkspace, poolWorkspace));
void* workspace;
cudaMalloc(&workspace, maxWorkspace);
// Reuse workspace across operations
// FP16 convolution
// NHWC layout is generally recommended for tensor-core FP16 convolutions;
// descriptor data type and layout must match the actual buffer contents.
cudnnSetTensor4dDescriptor(inputDesc, CUDNN_TENSOR_NHWC,
CUDNN_DATA_HALF, N, C, H, W);
cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_HALF,
CUDNN_TENSOR_NHWC, K, C, R, S);
// INT8 convolution for inference
// NOTE(review): cuDNN INT8 convolutions require NHWC and impose channel
// alignment constraints (e.g. C a multiple of 4) — confirm against the
// cuDNN documentation for the targeted version.
cudnnSetTensor4dDescriptor(inputDesc, CUDNN_TENSOR_NHWC,
CUDNN_DATA_INT8, N, C, H, W);
cudnnSetConvolution2dDescriptor(convDesc,
pad_h, pad_w, stride_h, stride_w, 1, 1,
CUDNN_CROSS_CORRELATION,
CUDNN_DATA_INT32); // INT32 accumulation
#include <cusparse.h>
// cuSPARSE generic API (cusparseStatus_t returns omitted for brevity)
cusparseHandle_t sparse;
cusparseCreate(&sparse);
// Create sparse matrix in CSR format:
// d_rowPtr holds M+1 row offsets; d_colIdx/d_values hold nnz entries each.
cusparseSpMatDescr_t matA;
cusparseCreateCsr(&matA, M, N, nnz,
d_rowPtr, d_colIdx, d_values,
CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
// Create dense vectors: x has N elements, y has M elements (y = A * x)
cusparseDnVecDescr_t vecX, vecY;
cusparseCreateDnVec(&vecX, N, d_x, CUDA_R_32F);
cusparseCreateDnVec(&vecY, M, d_y, CUDA_R_32F);
// SpMV: y = alpha * A * x + beta * y
// Two-phase pattern: query the required scratch-buffer size, allocate it,
// then execute with the same arguments plus the buffer.
size_t bufferSize;
cusparseSpMV_bufferSize(sparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
void* buffer;
cudaMalloc(&buffer, bufferSize);
cusparseSpMV(sparse, CUSPARSE_OPERATION_NON_TRANSPOSE,
&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
CUSPARSE_SPMV_ALG_DEFAULT, buffer);
// NOTE(review): buffer, matA, vecX/vecY and the handle should be released
// (cudaFree / cusparseDestroy* / cusparseDestroy) once done — cleanup is
// omitted in this example.
This skill integrates with the following processes:
tensor-core-programming.js - Tensor core workflows
ml-inference-optimization.js - ML inference
custom-cuda-operator-development.js - Custom operators

{
"operation": "gemm-benchmark",
"library": "cuBLAS",
"configuration": {
"M": 4096, "N": 4096, "K": 4096,
"datatype": "FP16",
"math_mode": "TENSOR_OP_MATH"
},
"performance": {
"time_ms": 0.85,
"tflops": 16.2,
"efficiency_pct": 81.0
},
"recommendations": [
"Use CUBLAS_GEMM_DEFAULT_TENSOR_OP for tensor core path",
"Ensure dimensions are multiples of 8 for optimal tensor core usage"
]
}
Activates when the user asks about AI prompts, needs prompt templates, wants to search for prompts, or mentions prompts.chat. Use for discovering, retrieving, and improving prompts.
Search, retrieve, and install Agent Skills from the prompts.chat registry using MCP tools. Use when the user asks to find skills, browse skill catalogs, install a skill for Claude, or extend Claude's capabilities with reusable AI agent components.
This skill should be used when the user asks to "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "agent tools", "agent colors", "autonomous agent", or needs guidance on agent structure, system prompts, triggering conditions, or agent development best practices for Claude Code plugins.