From clickhouse-pack
Sets up ClickHouse monitoring with system table queries for query performance/merge health/resource usage, Prometheus metrics, Grafana dashboards, and alerting.
npx claudepluginhub jeremylongshore/claude-code-plugins-plus-skills --plugin clickhouse-pack

This skill is limited to using the following tools:
Set up comprehensive monitoring for ClickHouse using built-in system tables, Prometheus integration, Grafana dashboards, and alerting rules.
Triages and remediates ClickHouse production incidents—downtime, OOM, slow queries, errors—using system tables, SQL, curl pings, and kubectl. For on-call emergencies.
Sets up Grafana Cloud Database Observability for MySQL and PostgreSQL using Grafana Alloy. Provides query-level insights with RED metrics, query samples, explain plans via pg_stat_statements and Performance Schema, plus trace correlation. Use for slow query diagnosis and performance monitoring.
Provides ClickHouse table design patterns with MergeTree engines, query optimization techniques, and best practices for high-performance analytics and data engineering.
Share bugs, ideas, or general feedback.
Set up comprehensive monitoring for ClickHouse using built-in system tables, Prometheus integration, Grafana dashboards, and alerting rules.
system.* table access

-- Real-time server health snapshot
SELECT
(SELECT count() FROM system.processes) AS running_queries,
(SELECT value FROM system.metrics WHERE metric = 'MemoryTracking') AS memory_bytes,
(SELECT value FROM system.metrics WHERE metric = 'Query') AS concurrent_queries,
(SELECT count() FROM system.merges) AS active_merges,
(SELECT value FROM system.asynchronous_metrics WHERE metric = 'Uptime') AS uptime_sec;
-- Query throughput (last hour, per minute)
SELECT
toStartOfMinute(event_time) AS minute,
count() AS queries,
countIf(exception_code != 0) AS errors,
round(avg(query_duration_ms)) AS avg_ms,
round(quantile(0.95)(query_duration_ms)) AS p95_ms,
formatReadableSize(sum(read_bytes)) AS total_read
FROM system.query_log
WHERE type IN ('QueryFinish', 'ExceptionWhileProcessing')
AND event_time >= now() - INTERVAL 1 HOUR
GROUP BY minute ORDER BY minute;
-- Insert throughput (last hour)
SELECT
toStartOfMinute(event_time) AS minute,
count() AS inserts,
sum(written_rows) AS rows_written,
formatReadableSize(sum(written_bytes)) AS bytes_written
FROM system.query_log
WHERE type = 'QueryFinish' AND query_kind = 'Insert'
AND event_time >= now() - INTERVAL 1 HOUR
GROUP BY minute ORDER BY minute;
-- Part count per table (merge health indicator)
SELECT database, table, count() AS parts, sum(rows) AS rows,
formatReadableSize(sum(bytes_on_disk)) AS size
FROM system.parts WHERE active
GROUP BY database, table
HAVING parts > 50
ORDER BY parts DESC;
ClickHouse Cloud exposes a managed Prometheus endpoint:
# prometheus.yml
scrape_configs:
- job_name: clickhouse-cloud
metrics_path: /v1/organizations/<ORG_ID>/prometheus
basic_auth:
username: <API_KEY_ID>
password: <API_KEY_SECRET>
static_configs:
- targets: ['api.clickhouse.cloud']
params:
filtered_metrics: ['true'] # 125 critical metrics only
Self-hosted — use clickhouse-exporter or built-in metrics endpoint:
# prometheus.yml
scrape_configs:
- job_name: clickhouse
static_configs:
- targets: ['clickhouse-server:9363'] # Built-in Prometheus endpoint
metrics_path: /metrics
<!-- Enable Prometheus endpoint in config.xml -->
<prometheus>
<endpoint>/metrics</endpoint>
<port>9363</port>
<metrics>true</metrics>
<events>true</events>
<asynchronous_metrics>true</asynchronous_metrics>
</prometheus>
import { Registry, Counter, Histogram, Gauge } from 'prom-client';
// Dedicated registry so the /metrics endpoint serves only these ClickHouse
// metrics instead of prom-client's global default registry.
// (Gauge is imported but not used in this snippet — presumably reserved
// for extension; confirm against the full file.)
const registry = new Registry();
// Query latency histogram, labeled by logical query type and outcome
// ('success' / 'error', set by the instrumented wrapper below).
// Buckets span 10 ms .. 10 s, matching typical analytics-query latencies.
const queryDuration = new Histogram({
  name: 'clickhouse_query_duration_seconds',
  help: 'ClickHouse query duration',
  labelNames: ['query_type', 'status'],
  buckets: [0.01, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [registry],
});
// Failed-query counter, labeled by the driver-reported error code.
const queryErrors = new Counter({
  name: 'clickhouse_query_errors_total',
  help: 'ClickHouse query errors',
  labelNames: ['error_code'],
  registers: [registry],
});
// Insert-volume counter (rows), labeled by destination table.
const insertRows = new Counter({
  name: 'clickhouse_insert_rows_total',
  help: 'Total rows inserted into ClickHouse',
  labelNames: ['table'],
  registers: [registry],
});
// Instrumented query wrapper: times every call into the queryDuration
// histogram (labeled success/error) and, on failure, increments the
// error counter labeled with the driver-reported error code, then
// rethrows so callers still see the original failure.
async function instrumentedQuery<T>(
  queryType: string,
  fn: () => Promise<T>,
): Promise<T> {
  const timer = queryDuration.startTimer({ query_type: queryType });
  try {
    const result = await fn();
    timer({ status: 'success' });
    return result;
  } catch (err: unknown) {
    timer({ status: 'error' });
    // Narrow instead of `any`: ClickHouse client errors carry a `code`
    // property; anything else (or a nullish code) is labeled 'unknown'.
    const rawCode =
      typeof err === 'object' && err !== null && 'code' in err
        ? (err as { code?: unknown }).code
        : undefined;
    queryErrors.inc({ error_code: rawCode == null ? 'unknown' : String(rawCode) });
    throw err;
  }
}
// Expose /metrics endpoint
// Prometheus scrape target: serves everything registered on `registry`
// with the registry's own Content-Type. Assumes `app` is an Express-style
// app defined elsewhere in the file — not shown in this snippet.
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', registry.contentType);
  res.send(await registry.metrics());
});
{
"panels": [
{
"title": "Query Rate (QPS)",
"type": "timeseries",
"targets": [{ "expr": "rate(clickhouse_query_duration_seconds_count[5m])" }]
},
{
"title": "Query Latency P50 / P95 / P99",
"type": "timeseries",
"targets": [
{ "expr": "histogram_quantile(0.5, rate(clickhouse_query_duration_seconds_bucket[5m]))" },
{ "expr": "histogram_quantile(0.95, rate(clickhouse_query_duration_seconds_bucket[5m]))" },
{ "expr": "histogram_quantile(0.99, rate(clickhouse_query_duration_seconds_bucket[5m]))" }
]
},
{
"title": "Error Rate",
"type": "stat",
"targets": [{
"expr": "rate(clickhouse_query_errors_total[5m]) / rate(clickhouse_query_duration_seconds_count[5m])"
}]
},
{
"title": "Insert Throughput (rows/sec)",
"type": "timeseries",
"targets": [{ "expr": "rate(clickhouse_insert_rows_total[5m])" }]
}
]
}
Import the official ClickHouse Grafana dashboard: https://grafana.com/grafana/dashboards/23415
# clickhouse-alerts.yml
groups:
- name: clickhouse
rules:
- alert: ClickHouseHighErrorRate
expr: |
rate(clickhouse_query_errors_total[5m]) /
rate(clickhouse_query_duration_seconds_count[5m]) > 0.05
for: 5m
labels:
severity: warning
annotations:
summary: "ClickHouse error rate > 5%"
- alert: ClickHouseHighLatency
expr: |
histogram_quantile(0.95,
rate(clickhouse_query_duration_seconds_bucket[5m])) > 5
for: 5m
labels:
severity: warning
annotations:
summary: "ClickHouse P95 latency > 5 seconds"
- alert: ClickHouseTooManyParts
expr: clickhouse_table_parts > 300
for: 10m
labels:
severity: critical
annotations:
summary: "Table has > 300 active parts — merges falling behind"
- alert: ClickHouseMemoryHigh
expr: clickhouse_server_memory_usage / clickhouse_server_memory_limit > 0.9
for: 5m
labels:
severity: critical
annotations:
summary: "ClickHouse memory usage > 90%"
- alert: ClickHouseDiskLow
expr: clickhouse_disk_free_bytes / clickhouse_disk_total_bytes < 0.15
for: 10m
labels:
severity: critical
annotations:
summary: "ClickHouse disk space < 15% free"
import pino from 'pino';
const logger = pino({ name: 'clickhouse' });
// Log query performance for analysis / log-based alerting.
// Emits one structured record per query; queries slower than
// `slowThresholdMs` are tagged status: 'slow' for easy filtering.
//
// @param queryType - logical query category (e.g. 'select', 'insert')
// @param durationMs - wall-clock query duration in milliseconds
// @param rowsRead - rows scanned by the query
// @param slowThresholdMs - duration above which the query is flagged
//   'slow'; defaults to 5000 ms (the previously hard-coded cutoff)
function logQuery(
  queryType: string,
  durationMs: number,
  rowsRead: number,
  slowThresholdMs = 5000,
) {
  logger.info({
    service: 'clickhouse',
    query_type: queryType,
    duration_ms: durationMs,
    rows_read: rowsRead,
    status: durationMs > slowThresholdMs ? 'slow' : 'ok',
  });
}
| Table | What to Monitor | Frequency |
|---|---|---|
| system.processes | Running queries, memory usage | Every 10s |
| system.query_log | Query performance history | Every 1m |
| system.parts | Part count, merge health | Every 1m |
| system.merges | Active merge progress | Every 30s |
| system.metrics | Server-wide gauges (connections, memory) | Every 10s |
| system.events | Cumulative counters | Every 1m |
| system.replicas | Replication lag | Every 30s |
| system.disks | Disk space | Every 5m |
| Issue | Cause | Solution |
|---|---|---|
| Metrics endpoint empty | Prometheus not configured | Enable /metrics in config |
| High cardinality alerts | Too many label values | Reduce label cardinality |
| Missing query_log data | Logging disabled | Set log_queries = 1 in config |
| Dashboard gaps | Scrape interval too long | Use 10-15s scrape interval |
For incident response, see clickhouse-incident-runbook.