Warp-level programming and SIMD optimization. Use warp shuffle instructions, voting functions, cooperative groups, warp-synchronous algorithms, and minimize warp divergence for optimal GPU performance.
Implements warp-level GPU optimizations using shuffle instructions, voting functions, and cooperative groups to minimize divergence.
npx claudepluginhub a5c-ai/babysitterThis skill is limited to using the following tools:
README.mdYou are warp-primitives - a specialized skill for warp-level programming and SIMD optimization on GPUs. This skill provides expert capabilities for low-level GPU performance optimization.
This skill enables AI-powered warp-level programming including:
Data exchange within a warp:
// __shfl_sync: Broadcast from any lane
__device__ float warpBroadcast(float val, int srcLane) {
return __shfl_sync(0xffffffff, val, srcLane);
}
// __shfl_up_sync: Shift up (for inclusive scan)
__device__ float shflUp(float val, int delta) {
return __shfl_up_sync(0xffffffff, val, delta);
}
// __shfl_down_sync: Shift down (for reduction)
__device__ float shflDown(float val, int delta) {
return __shfl_down_sync(0xffffffff, val, delta);
}
// __shfl_xor_sync: Butterfly pattern (for reduction)
__device__ float shflXor(float val, int laneMask) {
return __shfl_xor_sync(0xffffffff, val, laneMask);
}
// Warp-level reduction using shuffle
__device__ float warpReduceSum(float val) {
for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
val += __shfl_down_sync(0xffffffff, val, offset);
}
return val;
}
// Warp-level reduction using XOR (butterfly)
__device__ float warpReduceSumXor(float val) {
for (int mask = warpSize / 2; mask > 0; mask >>= 1) {
val += __shfl_xor_sync(0xffffffff, val, mask);
}
return val; // All lanes have result
}
// Warp-level inclusive scan
__device__ float warpInclusiveScan(float val) {
for (int offset = 1; offset < warpSize; offset <<= 1) {
float n = __shfl_up_sync(0xffffffff, val, offset);
if (threadIdx.x % warpSize >= offset) {
val += n;
}
}
return val;
}
Collective warp operations:
// __ballot_sync: Create bitmask of predicate
__device__ unsigned int warpBallot(bool predicate) {
return __ballot_sync(0xffffffff, predicate);
}
// __any_sync: Any thread has true predicate
__device__ bool warpAny(bool predicate) {
return __any_sync(0xffffffff, predicate);
}
// __all_sync: All threads have true predicate
__device__ bool warpAll(bool predicate) {
return __all_sync(0xffffffff, predicate);
}
// Count set bits in warp
__device__ int warpPopcount(bool predicate) {
return __popc(__ballot_sync(0xffffffff, predicate));
}
// Find position within active threads
__device__ int warpExclusiveCount(bool predicate) {
unsigned int mask = __ballot_sync(0xffffffff, predicate);
unsigned int laneMask = (1u << (threadIdx.x % warpSize)) - 1;
return __popc(mask & laneMask);
}
// Example: Stream compaction within warp
__device__ int warpCompact(int* output, int value, bool keep) {
unsigned int mask = __ballot_sync(0xffffffff, keep);
int total = __popc(mask);
if (keep) {
int pos = __popc(mask & ((1u << (threadIdx.x % warpSize)) - 1));
output[pos] = value;
}
return total;
}
Flexible synchronization:
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
// Warp-level cooperative group
__device__ void warpOperation(float* data) {
cg::thread_block_tile<32> warp = cg::tiled_partition<32>(cg::this_thread_block());
int lane = warp.thread_rank();
float val = data[lane];
// Warp-level reduction
for (int offset = warp.size() / 2; offset > 0; offset >>= 1) {
val += warp.shfl_down(val, offset);
}
if (lane == 0) data[0] = val;
}
// Flexible tile sizes
template<int TILE_SIZE>
__device__ void tiledOperation(float* data) {
cg::thread_block_tile<TILE_SIZE> tile =
cg::tiled_partition<TILE_SIZE>(cg::this_thread_block());
float val = data[tile.thread_rank()];
// Tile-level reduction
for (int offset = tile.size() / 2; offset > 0; offset >>= 1) {
val += tile.shfl_down(val, offset);
}
if (tile.thread_rank() == 0) {
data[tile.meta_group_rank()] = val;
}
}
// Grid-level synchronization (requires cooperative launch)
__global__ void gridSyncKernel(float* data, int n) {
cg::grid_group grid = cg::this_grid();
// Phase 1
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) data[idx] *= 2.0f;
grid.sync(); // Synchronize entire grid
// Phase 2 - all blocks see phase 1 results
if (idx < n) data[idx] += 1.0f;
}
Minimize divergence impact:
// Bad: Divergent branches
__global__ void divergentKernel(float* data, int n) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < n) {
if (data[idx] > 0) { // Divergent!
data[idx] = expf(data[idx]); // Some threads execute
} else {
data[idx] = 0.0f; // Other threads execute
}
}
}
// Better: Predicated execution
__global__ void predicatedKernel(float* data, int n) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < n) {
bool positive = data[idx] > 0;
// Both paths computed, result selected
float result = positive ? expf(data[idx]) : 0.0f;
data[idx] = result;
}
}
// Best: Reorganize data to reduce divergence
// Process positive and negative values separately
__global__ void reorganizedKernel(float* positive, float* negative,
int nPos, int nNeg) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// All threads in warp take same path
if (idx < nPos) {
positive[idx] = expf(positive[idx]);
}
}
// Warp-level early exit
__global__ void warpEarlyExit(float* data, int* flags, int n) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// Check if entire warp can skip
bool needsWork = (idx < n) && flags[idx];
if (!__any_sync(0xffffffff, needsWork)) {
return; // Entire warp exits
}
// Only warps with work continue
if (needsWork) {
data[idx] = expensiveComputation(data[idx]);
}
}
Implicit warp synchronization:
// Pre-Volta: Implicit warp sync (deprecated pattern)
// Post-Volta: Use explicit __syncwarp()
__device__ float warpSafeReduce(float val) {
// Always use explicit sync mask
val += __shfl_down_sync(0xffffffff, val, 16);
val += __shfl_down_sync(0xffffffff, val, 8);
val += __shfl_down_sync(0xffffffff, val, 4);
val += __shfl_down_sync(0xffffffff, val, 2);
val += __shfl_down_sync(0xffffffff, val, 1);
return val;
}
// Active mask handling
__device__ float activeWarpReduce(float val) {
unsigned int active = __activemask();
for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
val += __shfl_down_sync(active, val, offset);
}
return val;
}
// Match sync for convergent warps
__device__ void convergentOperation() {
// Ensure threads converge before warp operation
unsigned int mask = __match_any_sync(__activemask(), threadIdx.x / 8);
// mask contains threads with same value
}
Matrix fragments with warp cooperation:
// Warp-level matrix multiply (simplified WMMA concept)
__device__ void warpMatMul4x4(float* A, float* B, float* C) {
int lane = threadIdx.x % 32;
// Each lane owns one element of result
int row = lane / 4;
int col = lane % 4;
float sum = 0.0f;
for (int k = 0; k < 4; k++) {
// Broadcast A[row][k] and B[k][col]
float a = __shfl_sync(0xffffffff, A[row * 4 + k], row * 4 + k);
float b = __shfl_sync(0xffffffff, B[k * 4 + col], k * 4 + col);
sum += a * b;
}
C[lane] = sum;
}
Identify and fix stall causes:
// Common stall causes and solutions
// 1. Memory dependency stalls
__global__ void memoryStall(float* data) {
int idx = threadIdx.x;
float val = data[idx]; // Long latency load
// Stall here waiting for data
data[idx] = val * 2.0f;
}
// Solution: Increase occupancy or hide latency
__global__ void hiddenLatency(float* data, int n) {
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// Load multiple values
float v1 = data[idx];
float v2 = data[idx + n];
// Compute on v1 while v2 loads
v1 = v1 * 2.0f + 1.0f;
// Now v2 should be ready
v2 = v2 * 2.0f + 1.0f;
data[idx] = v1;
data[idx + n] = v2;
}
// 2. Synchronization stalls
__global__ void syncStall(float* shared_data) {
__shared__ float smem[256];
smem[threadIdx.x] = shared_data[threadIdx.x];
__syncthreads(); // All threads wait here
}
// Solution: Minimize sync points, use warp-level sync
This skill integrates with the following processes:
warp-efficiency-optimization.js - Warp efficiency workflowreduction-scan-implementation.js - Reduction/scan patternsparallel-algorithm-design.js - Algorithm optimization{
"operation": "generate-warp-reduction",
"configuration": {
"data_type": "float",
"reduction_op": "sum",
"use_xor_pattern": true
},
"generated_code": "warp_reduction.cu",
"analysis": {
"shuffle_instructions": 5,
"sync_masks": "0xffffffff",
"cooperative_groups_used": false
},
"performance": {
"instructions_per_element": 6,
"warp_efficiency": 1.0,
"divergence": "none"
}
}
Activates when the user asks about AI prompts, needs prompt templates, wants to search for prompts, or mentions prompts.chat. Use for discovering, retrieving, and improving prompts.
Search, retrieve, and install Agent Skills from the prompts.chat registry using MCP tools. Use when the user asks to find skills, browse skill catalogs, install a skill for Claude, or extend Claude's capabilities with reusable AI agent components.
This skill should be used when the user asks to "create an agent", "add an agent", "write a subagent", "agent frontmatter", "when to use description", "agent examples", "agent tools", "agent colors", "autonomous agent", or needs guidance on agent structure, system prompts, triggering conditions, or agent development best practices for Claude Code plugins.