From the cohere-pack plugin.
Sets up Prometheus metrics, OpenTelemetry traces, and AlertManager alerts for Cohere API v2 in Node.js apps. Tracks latency, tokens, errors, costs for Chat/Embed/Rerank endpoints.
npx claudepluginhub jeremylongshore/claude-code-plugins-plus-skills --plugin cohere-pack

This skill is limited to using the following tools:
Set up production observability for Cohere API v2 with Prometheus metrics, OpenTelemetry tracing, and AlertManager rules. Tracks per-endpoint latency, token usage, error rates, and costs.
Executes Cohere API incident runbook: triages outages with curl endpoint tests and status checks, mitigates via kubectl, handles postmortems for integration failures.
Instruments Claude API calls with Python structured logging and Prometheus metrics to track latency, cost, errors, token usage, and rate limits.
Sets up LangSmith tracing, Prometheus metrics callbacks, OpenTelemetry, structured logging, and Grafana dashboards for LangChain apps.
Share bugs, ideas, or general feedback.
Set up production observability for Cohere API v2 with Prometheus metrics, OpenTelemetry tracing, and AlertManager rules. Tracks per-endpoint latency, token usage, error rates, and costs.
Requires the cohere-ai SDK v7+.

import { Registry, Counter, Histogram, Gauge } from 'prom-client';
// Dedicated registry so the Cohere metrics below can be scraped on their own
// endpoint, independent of prom-client's global default registry.
const registry = new Registry();
// Per-endpoint request counter, partitioned by model and success/error status.
const requestCounter = new Counter({
name: 'cohere_requests_total',
help: 'Total Cohere API requests',
labelNames: ['endpoint', 'model', 'status'],
registers: [registry],
});
// Latency histogram; buckets span 100ms–30s.
const requestDuration = new Histogram({
name: 'cohere_request_duration_seconds',
help: 'Cohere request duration',
labelNames: ['endpoint', 'model'],
buckets: [0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30],
registers: [registry],
});
// Token usage tracking, split by direction so input/output costs can be
// priced separately.
const tokenCounter = new Counter({
name: 'cohere_tokens_total',
help: 'Total tokens consumed',
labelNames: ['endpoint', 'model', 'direction'], // direction: input|output
registers: [registry],
});
// Error counter keyed by HTTP status code (or 'timeout').
const errorCounter = new Counter({
name: 'cohere_errors_total',
help: 'Cohere errors by status code',
labelNames: ['endpoint', 'status_code'],
registers: [registry],
});
// Rate limit headroom.
// NOTE(review): nothing in this file sets this gauge — presumably it should be
// fed from the API's rate-limit response headers; confirm before relying on it.
const rateLimitGauge = new Gauge({
name: 'cohere_rate_limit_remaining',
help: 'Remaining rate limit capacity',
labelNames: ['endpoint'],
registers: [registry],
});
import { CohereClientV2, CohereError, CohereTimeoutError } from 'cohere-ai';
// No explicit key passed — the SDK resolves credentials itself
// (NOTE(review): presumably from the CO_API_KEY environment variable; confirm).
const cohere = new CohereClientV2();
/**
 * Wraps a Cohere SDK call with Prometheus instrumentation.
 *
 * Records request count, duration, token usage (from
 * `response.usage.billedUnits`, when present) and error classification,
 * then returns the operation's result unchanged. Errors are rethrown.
 *
 * @param endpoint  Logical endpoint label, e.g. 'chat' | 'embed' | 'rerank'.
 * @param model     Model name used as a metric label.
 * @param operation Thunk performing the actual SDK call.
 */
async function instrumentedCall<T>(
endpoint: string,
model: string,
operation: () => Promise<T>
): Promise<T> {
const timer = requestDuration.startTimer({ endpoint, model });
try {
const result = await operation();
requestCounter.inc({ endpoint, model, status: 'success' });
// Token counts live under usage.billedUnits on v2 responses; they are not
// present for every endpoint, so guard each field.
const usage = (result as any)?.usage?.billedUnits;
if (usage?.inputTokens) {
tokenCounter.inc({ endpoint, model, direction: 'input' }, usage.inputTokens);
}
if (usage?.outputTokens) {
tokenCounter.inc({ endpoint, model, direction: 'output' }, usage.outputTokens);
}
return result;
} catch (err) {
requestCounter.inc({ endpoint, model, status: 'error' });
// Classify the failure. Check the timeout type first so it can never be
// shadowed by a broader class, and record unknown (non-Cohere) errors
// too — the original dropped them from cohere_errors_total entirely.
if (err instanceof CohereTimeoutError) {
errorCounter.inc({ endpoint, status_code: 'timeout' });
} else if (err instanceof CohereError) {
errorCounter.inc({ endpoint, status_code: String(err.statusCode) });
} else {
errorCounter.inc({ endpoint, status_code: 'unknown' });
}
throw err;
} finally {
// Observe duration exactly once, on both success and error paths
// (the original called timer() separately in each branch).
timer();
}
}
// Usage example: an instrumented chat call.
// `query` is assumed to be supplied by the surrounding application code.
const response = await instrumentedCall('chat', 'command-a-03-2025', () =>
cohere.chat({
model: 'command-a-03-2025',
messages: [{ role: 'user', content: query }],
})
);
import { trace, SpanStatusCode, SpanKind } from '@opentelemetry/api';
// Named tracer under which all Cohere client spans are created.
const tracer = trace.getTracer('cohere-client', '1.0.0');
async function tracedCohereCall<T>(
endpoint: string,
model: string,
operation: () => Promise<T>
): Promise<T> {
return tracer.startActiveSpan(
`cohere.${endpoint}`,
{ kind: SpanKind.CLIENT },
async (span) => {
span.setAttribute('cohere.model', model);
span.setAttribute('cohere.endpoint', endpoint);
try {
const result = await operation();
// Add token usage to span
const usage = (result as any)?.usage?.billedUnits;
if (usage) {
span.setAttribute('cohere.tokens.input', usage.inputTokens ?? 0);
span.setAttribute('cohere.tokens.output', usage.outputTokens ?? 0);
}
span.setStatus({ code: SpanStatusCode.OK });
return result;
} catch (err: any) {
span.setStatus({ code: SpanStatusCode.ERROR, message: err.message });
span.recordException(err);
if (err instanceof CohereError) {
span.setAttribute('cohere.error.status', err.statusCode ?? 0);
}
throw err;
} finally {
span.end();
}
}
);
}
import pino from 'pino';
// Structured JSON logger; level is configurable via LOG_LEVEL (default: info).
const logger = pino({ name: 'cohere', level: process.env.LOG_LEVEL ?? 'info' });
/**
 * Emits one structured log record per Cohere call.
 *
 * Errors are logged at `error` level, everything else at `info`. Fields in
 * `meta` are spread last and therefore may override the base fields.
 */
function logCohereCall(
endpoint: string,
model: string,
durationMs: number,
status: 'success' | 'error',
meta?: Record<string, unknown>
) {
const record = {
service: 'cohere',
endpoint,
model,
durationMs,
status,
...meta,
};
if (status === 'error') {
logger.error(record);
} else {
logger.info(record);
}
}
// Combined instrumentation
async function observedCall<T>(
endpoint: string,
model: string,
fn: () => Promise<T>
): Promise<T> {
return tracedCohereCall(endpoint, model, () =>
instrumentedCall(endpoint, model, async () => {
const start = Date.now();
try {
const result = await fn();
logCohereCall(endpoint, model, Date.now() - start, 'success', {
tokens: (result as any)?.usage?.billedUnits,
});
return result;
} catch (err) {
logCohereCall(endpoint, model, Date.now() - start, 'error', {
error: err instanceof CohereError ? err.statusCode : 'timeout',
});
throw err;
}
})
);
}
# prometheus/cohere-alerts.yml
groups:
  - name: cohere
    rules:
      # Error ratio per endpoint. Both sides must be aggregated with
      # `sum by (endpoint)`: cohere_errors_total carries status_code while
      # cohere_requests_total carries model/status, so a raw rate()/rate()
      # division has no matching labels and returns an empty vector.
      - alert: CohereHighErrorRate
        expr: |
          sum by (endpoint) (rate(cohere_errors_total[5m]))
            /
          sum by (endpoint) (rate(cohere_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Cohere error rate > 5%"
          description: "{{ $labels.endpoint }} error rate: {{ $value | humanizePercentage }}"
      - alert: CohereRateLimited
        expr: rate(cohere_errors_total{status_code="429"}[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Cohere rate limiting detected"
      - alert: CohereHighLatency
        expr: |
          histogram_quantile(0.95,
            rate(cohere_request_duration_seconds_bucket[5m])
          ) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Cohere P95 latency > 10s"
      # Use increase() over a window: a bare comparison against a monotonic
      # counter stays true forever after one historical 401, so the alert
      # would never resolve.
      - alert: CohereAuthFailure
        expr: increase(cohere_errors_total{status_code="401"}[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Cohere authentication failure — check API key"
      # increase() over 1h yields tokens-per-hour, matching the stated 100K/hour
      # threshold; rate() would have been tokens-per-second, off by 3600x.
      - alert: CohereHighTokenBurn
        expr: sum(increase(cohere_tokens_total[1h])) > 100000
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Cohere token burn rate > 100K/hour"
// GET /metrics — Prometheus scrape endpoint serving the Cohere registry.
import express from 'express';
const app = express();
app.get('/metrics', async (_req, res) => {
  res.set('Content-Type', registry.contentType);
  const body = await registry.metrics();
  res.send(body);
});
| Panel | Query | Type |
|---|---|---|
| Request Rate | rate(cohere_requests_total[5m]) | Time series |
| Error Rate | rate(cohere_errors_total[5m]) / rate(cohere_requests_total[5m]) | Stat |
| P50/P95 Latency | histogram_quantile(0.95, rate(cohere_request_duration_seconds_bucket[5m])) | Time series |
| Token Usage | rate(cohere_tokens_total[1h]) | Bar chart |
| Errors by Code | sum by (status_code)(rate(cohere_errors_total[5m])) | Pie chart |
| Issue | Cause | Solution |
|---|---|---|
| Missing token metrics | Usage not in response | Check response.usage.billedUnits |
| High cardinality | Too many model labels | Use model family, not exact version |
| Alert storm | Threshold too low | Tune thresholds for your traffic |
| Trace gaps | Missing context propagation | Ensure OTel context flows through async |
For incident response, see cohere-incident-runbook.