Cross-vendor OpenCL runtime management and kernel development. Query platforms/devices, generate portable OpenCL C kernel code, handle vendor-specific extensions, manage contexts and command queues, compile and cache programs.
Manages OpenCL runtime across vendors and generates portable GPU kernel code.
You are opencl-runtime - a specialized skill for cross-vendor OpenCL runtime management and kernel development. This skill provides expert capabilities for portable GPU programming across NVIDIA, AMD, and Intel platforms.
This skill enables the following AI-powered OpenCL development operations:
Query available OpenCL resources:
// Query platforms
cl_uint numPlatforms;
clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id* platforms = malloc(numPlatforms * sizeof(cl_platform_id));
clGetPlatformIDs(numPlatforms, platforms, NULL);
// Get platform info
char platformName[128];
clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, 128, platformName, NULL);
// Query devices
cl_uint numDevices;
clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
cl_device_id* devices = malloc(numDevices * sizeof(cl_device_id));
clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
# Using clinfo utility
clinfo --list
# Detailed device info
clinfo -a
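The snippets in this skill omit per-call error checking for brevity; in practice every cl* call returns or reports a cl_int status that should be verified. A minimal sketch of a checking macro (CL_CHECK is an illustrative name, not part of the OpenCL API; requires <CL/cl.h>, or <OpenCL/cl.h> on macOS):
#include <stdio.h>
#include <stdlib.h>
#define CL_CHECK(call) \
    do { \
        cl_int _err = (call); \
        if (_err != CL_SUCCESS) { \
            fprintf(stderr, "OpenCL error %d at %s:%d\n", _err, __FILE__, __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
// Usage:
CL_CHECK(clGetPlatformIDs(numPlatforms, platforms, NULL));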
Generate portable kernels:
// Basic kernel pattern
__kernel void vectorAdd(
    __global const float* a,
    __global const float* b,
    __global float* c,
    const int n)
{
    int gid = get_global_id(0);
    if (gid < n) {
        c[gid] = a[gid] + b[gid];
    }
}
// 2D kernel pattern
__kernel void matrixMultiply(
    __global const float* A,
    __global const float* B,
    __global float* C,
    const int M, const int N, const int K)
{
    int row = get_global_id(0);
    int col = get_global_id(1);
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
// Shared memory (local memory) kernel
__kernel void reductionSum(
    __global const float* input,
    __global float* output,
    __local float* localData,
    const int n)
{
    int gid = get_global_id(0);
    int lid = get_local_id(0);
    int groupSize = get_local_size(0);
    localData[lid] = (gid < n) ? input[gid] : 0.0f;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int stride = groupSize / 2; stride > 0; stride >>= 1) {
        if (lid < stride) {
            localData[lid] += localData[lid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (lid == 0) {
        output[get_group_id(0)] = localData[0];
    }
}
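On the host, the reduction kernel's __local argument is sized with clSetKernelArg by passing NULL as the value, so the runtime allocates it per work-group. A minimal launch sketch; buffer and variable names are illustrative:
// Assumes kernel was created from reductionSum and queue/buffers already exist
size_t localSize = 256;
size_t globalSize = ((n + localSize - 1) / localSize) * localSize;
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer);
clSetKernelArg(kernel, 2, localSize * sizeof(float), NULL); // per-work-group local memory, no host pointer
clSetKernelArg(kernel, 3, sizeof(int), &n);
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
// Each work-group writes one partial sum to output; finish the reduction on the host or with a second pass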
Create and manage OpenCL contexts:
// Create context
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
// Create command queue (OpenCL 1.x)
cl_command_queue queue = clCreateCommandQueue(context, device,
    CL_QUEUE_PROFILING_ENABLE, &err);
// Create command queue (OpenCL 2.0+)
cl_queue_properties props[] = {
    CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
    0
};
cl_command_queue queue = clCreateCommandQueueWithProperties(context, device, props, &err);
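With CL_QUEUE_PROFILING_ENABLE set, enqueued commands can be timed through their events. A sketch of timing a kernel launch, assuming a kernel and work sizes have already been configured as in the later sections:
cl_event event;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, &event);
clWaitForEvents(1, &event);
cl_ulong start, end;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
printf("Kernel time: %.3f ms\n", (end - start) * 1e-6); // timestamps are in nanoseconds
clReleaseEvent(event);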
Compile and cache OpenCL programs:
// Create program from source
const char* source = loadKernelSource("kernel.cl");
cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &err);
// Build with options
const char* options = "-cl-fast-relaxed-math -cl-mad-enable";
err = clBuildProgram(program, 1, &device, options, NULL, NULL);
// Get build log on error
if (err != CL_SUCCESS) {
    size_t logSize;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* log = malloc(logSize);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
    printf("Build error:\n%s\n", log);
    free(log);
}
// Get compiled binary for caching
size_t binarySize;
clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, NULL);
unsigned char* binary = malloc(binarySize);
clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &binary, NULL);
saveBinaryToFile("kernel.bin", binary, binarySize);
// Load cached binary
cl_program programFromBinary = clCreateProgramWithBinary(
    context, 1, &device, &binarySize, (const unsigned char**)&binary, NULL, &err);
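Note that a program created from a cached binary must still be built (typically a fast finalization step) before kernels can be created from it. A sketch, assuming the cached program contains the vectorAdd kernel shown earlier:
err = clBuildProgram(programFromBinary, 1, &device, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(programFromBinary, "vectorAdd", &err);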
Configure work dimensions:
// 1D NDRange
size_t globalSize = ((n + 255) / 256) * 256; // Round up to multiple of work-group size
size_t localSize = 256;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
// 2D NDRange
size_t globalSize2D[2] = { ((width + 15) / 16) * 16, ((height + 15) / 16) * 16 }; // Round up to multiples of the local size
size_t localSize2D[2] = {16, 16};
clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize2D, localSize2D, 0, NULL, NULL);
// Query max work-group size
size_t maxWorkGroupSize;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL);
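The device-wide limit is only an upper bound; a particular kernel's usable work-group size can be smaller because of register or local-memory usage, and can be queried per kernel. A sketch, assuming a kernel object has already been created:
size_t kernelWGSize, preferredMultiple;
clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
    sizeof(size_t), &kernelWGSize, NULL);
// Preferred multiple (typically the warp/wavefront size) for choosing an efficient local size
clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
    sizeof(size_t), &preferredMultiple, NULL);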
Create and manage buffers:
// Create buffers
cl_mem bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, size, NULL, &err);
cl_mem bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    size, hostDataB, &err);
cl_mem bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, size, NULL, &err);
// Write to buffer
clEnqueueWriteBuffer(queue, bufferA, CL_TRUE, 0, size, hostDataA, 0, NULL, NULL);
// Read from buffer
clEnqueueReadBuffer(queue, bufferC, CL_TRUE, 0, size, hostResult, 0, NULL, NULL);
// Map buffer for direct access
float* mappedPtr = clEnqueueMapBuffer(queue, bufferA, CL_TRUE, CL_MAP_WRITE,
    0, size, 0, NULL, NULL, &err);
// ... modify data ...
clEnqueueUnmapMemObject(queue, bufferA, mappedPtr, 0, NULL, NULL);
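When the objects above are no longer needed they should be released; a typical teardown order, using the names from the preceding snippets:
clReleaseMemObject(bufferA);
clReleaseMemObject(bufferB);
clReleaseMemObject(bufferC);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);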
Handle vendor-specific features:
// Check for extension
char extensions[4096];
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(extensions), extensions, NULL);
if (strstr(extensions, "cl_khr_fp16")) {
    // Half precision available
}
if (strstr(extensions, "cl_nv_device_attribute_query")) {
    // NVIDIA-specific queries available (e.g., compute capability)
    cl_uint ccMajor;
    clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
        sizeof(cl_uint), &ccMajor, NULL);
}
// AMD-specific
if (strstr(extensions, "cl_amd_device_attribute_query")) {
    cl_uint simdPerCU;
    clGetDeviceInfo(device, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD,
        sizeof(cl_uint), &simdPerCU, NULL);
}
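When the host-side check finds cl_khr_fp16, the kernel source must still enable the extension before half can be used for arithmetic. A hypothetical kernel sketch showing the kernel-side counterpart:
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Illustrative kernel using half-precision storage and arithmetic
__kernel void scaleHalf(__global half* data, const float factor, const int n)
{
    int gid = get_global_id(0);
    if (gid < n) {
        data[gid] = data[gid] * (half)factor;
    }
}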
Support multiple OpenCL versions:
// Query OpenCL version
char version[128];
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL);
// OpenCL 2.0+ features
#ifdef CL_VERSION_2_0
// Shared Virtual Memory
cl_device_svm_capabilities svmCaps;
clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(svmCaps), &svmCaps, NULL);
if (svmCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) {
    void* svmPtr = clSVMAlloc(context, CL_MEM_READ_WRITE, size, 0);
    clEnqueueSVMMap(queue, CL_TRUE, CL_MAP_WRITE, svmPtr, size, 0, NULL, NULL);
}
#endif
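To complete the coarse-grain SVM flow above (still inside the CL_VERSION_2_0 guard), the mapping is released, the pointer is passed directly as a kernel argument, and the allocation is eventually freed; a sketch:
// ... host writes through svmPtr while mapped ...
clEnqueueSVMUnmap(queue, svmPtr, 0, NULL, NULL);
clSetKernelArgSVMPointer(kernel, 0, svmPtr); // pass the SVM pointer directly as a kernel argument
// ... enqueue the kernel, then wait for completion before freeing ...
clFinish(queue);
clSVMFree(context, svmPtr);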
// OpenCL 3.0 optional features
#ifdef CL_VERSION_3_0
cl_device_atomic_capabilities atomicCaps;
clGetDeviceInfo(device, CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES,
    sizeof(atomicCaps), &atomicCaps, NULL);
#endif
This skill integrates with the following processes:
opencl-application-development.js - OpenCL application development
hip-porting-cross-platform.js - Cross-platform porting
Example output for the enumerate-devices operation:
{
"operation": "enumerate-devices",
"status": "success",
"platforms": [
{
"name": "NVIDIA CUDA",
"version": "OpenCL 3.0 CUDA",
"devices": [
{
"name": "NVIDIA GeForce RTX 4090",
"type": "GPU",
"computeUnits": 128,
"maxWorkGroupSize": 1024,
"globalMemory": "24 GB",
"extensions": ["cl_khr_fp16", "cl_khr_fp64"]
}
]
}
]
}