Expert distributed tracing, metrics, and logging with OpenTelemetry for production observability.
/plugin marketplace add DNYoussef/context-cascade
/plugin install dnyoussef-context-cascade@DNYoussef/context-cascade
This skill inherits all available tools. When active, it can use any tool Claude has access to.
enhancement-summary.md
examples/distributed-tracing-example.js
examples/metrics-monitoring-example.py
examples/slo-tracking-example.py
resources/grafana-dashboard.json
resources/jaeger-config.json
resources/log-aggregator.sh
resources/metrics-collector.py
resources/prometheus-config.yaml
resources/slo-monitor.py
resources/trace-analyzer.js
tests/test-metrics-collector.py
tests/test-slo-monitor.py
tests/test-trace-analyzer.js
Expert distributed tracing, metrics, and logging with OpenTelemetry for production observability.
Comprehensive OpenTelemetry expertise including auto-instrumentation, custom spans, metrics collection, log correlation, trace context propagation, and sampling. Ensures applications are fully observable with actionable telemetry data.
Required: Understanding of distributed systems, HTTP, basic observability concepts
Agents: cicd-engineer, perf-analyzer, backend-dev, system-architect
Step 1: Install OpenTelemetry Packages
# Every package that the snippets in this guide `require` directly is listed,
# so none of them is resolved only as a transitive dependency of sdk-node.
npm install @opentelemetry/api \
  @opentelemetry/sdk-node \
  @opentelemetry/sdk-metrics \
  @opentelemetry/resources \
  @opentelemetry/semantic-conventions \
  @opentelemetry/auto-instrumentations-node \
  @opentelemetry/exporter-trace-otlp-http \
  @opentelemetry/exporter-metrics-otlp-http
Step 2: Initialize OpenTelemetry
// instrumentation.js
// Configures and starts the OpenTelemetry Node SDK: resource identity,
// OTLP/HTTP trace and metric export, and auto-instrumentation. Preload it
// ahead of the application, e.g. `node --require ./instrumentation.js app.js`.
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-http');
const { OTLPMetricExporter } = require('@opentelemetry/exporter-metrics-otlp-http');
// The metric reader class comes from the metrics SDK package. Without this
// import, `new PeriodicExportingMetricReader(...)` below throws a ReferenceError.
const { PeriodicExportingMetricReader } = require('@opentelemetry/sdk-metrics');
const { Resource } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');

const sdk = new NodeSDK({
  // Attributes attached to every span and metric emitted by this process.
  resource: new Resource({
    [SemanticResourceAttributes.SERVICE_NAME]: 'my-service',
    [SemanticResourceAttributes.SERVICE_VERSION]: '1.0.0',
    [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: 'production',
  }),
  traceExporter: new OTLPTraceExporter({
    url: 'http://localhost:4318/v1/traces',
  }),
  metricReader: new PeriodicExportingMetricReader({
    exporter: new OTLPMetricExporter({
      url: 'http://localhost:4318/v1/metrics',
    }),
    // Push collected metrics once per minute.
    exportIntervalMillis: 60000,
  }),
  instrumentations: [
    getNodeAutoInstrumentations({
      '@opentelemetry/instrumentation-http': { enabled: true },
      '@opentelemetry/instrumentation-express': { enabled: true },
      '@opentelemetry/instrumentation-pg': { enabled: true },
      '@opentelemetry/instrumentation-redis': { enabled: true },
    }),
  ],
});

sdk.start();

// Flush pending telemetry on SIGTERM. Registering a SIGTERM listener disables
// Node's default "terminate on signal" behaviour, so the process must exit
// explicitly once shutdown has settled (successfully or not).
process.on('SIGTERM', () => {
  sdk
    .shutdown()
    .then(() => console.log('Tracing terminated'))
    .catch((err) => console.log('Error terminating tracing', err))
    .finally(() => process.exit(0));
});
Step 3: Start Application with Instrumentation
node --require ./instrumentation.js app.js
const { trace, SpanStatusCode } = require('@opentelemetry/api');
const tracer = trace.getTracer('my-service', '1.0.0');
/**
 * Processes an order inside a `processOrder` span and computes its total in a
 * nested `calculateTotal` child span.
 *
 * Both spans are opened with `startActiveSpan`, which makes each span the
 * active one for the duration of its callback; the child is therefore parented
 * through the active context rather than through a `parent` span option.
 * This relies on a context manager being registered (the Node SDK does so).
 *
 * @param {string} orderId - Identifier recorded on the span as `order.id`.
 * @returns {Promise<*>} Whatever `calculateTotal(orderId)` resolves to.
 * @throws Re-throws any error from `calculateTotal` after recording it on the span.
 */
async function processOrder(orderId) {
  const spanOptions = {
    attributes: {
      'order.id': orderId,
      'order.priority': 'high',
    },
  };

  return tracer.startActiveSpan('processOrder', spanOptions, async (span) => {
    try {
      // Add event to span
      span.addEvent('order_validated', {
        'validation.result': 'success',
      });

      // Child span: always ended, even when calculateTotal rejects.
      const total = await tracer.startActiveSpan('calculateTotal', async (childSpan) => {
        try {
          const value = await calculateTotal(orderId);
          childSpan.setAttribute('order.total', value);
          return value;
        } finally {
          childSpan.end();
        }
      });

      // Mark success only after the work has actually succeeded.
      span.setStatus({ code: SpanStatusCode.OK });
      return total;
    } catch (error) {
      // Record exception
      span.recordException(error);
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: error.message,
      });
      throw error;
    } finally {
      span.end();
    }
  });
}
const { metrics } = require('@opentelemetry/api');
const meter = metrics.getMeter('my-service', '1.0.0');

// Counter — a value that is only ever added to.
const orderCounter = meter.createCounter('orders.processed', {
  description: 'Total number of orders processed',
});
const completedOnlineOrder = {
  'order.type': 'online',
  'order.status': 'completed',
};
orderCounter.add(1, completedOnlineOrder);

// Histogram — records individual measurements for distribution statistics.
const requestDuration = meter.createHistogram('http.server.duration', {
  description: 'HTTP request duration in milliseconds',
  unit: 'ms',
});
const createOrderRequest = {
  'http.method': 'POST',
  'http.route': '/api/orders',
  'http.status_code': 200,
};
requestDuration.record(150, createOrderRequest);

// UpDownCounter — accepts both positive and negative increments.
const activeConnections = meter.createUpDownCounter('db.connections.active', {
  description: 'Number of active database connections',
});
activeConnections.add(1); // a connection was opened
activeConnections.add(-1); // a connection was closed

// ObservableGauge — the callback reports the current value when invoked.
const memoryUsage = meter.createObservableGauge('process.memory.usage', {
  description: 'Process memory usage in bytes',
  unit: 'bytes',
});
memoryUsage.addCallback((observer) => {
  const { heapUsed } = process.memoryUsage();
  observer.observe(heapUsed, { 'memory.type': 'heap' });
});
// Propagate context between services
const { propagation, context } = require('@opentelemetry/api');
// Client-side: Inject trace context into HTTP headers
/**
 * POSTs `data` as JSON to `url` inside an `external_api_call` span and
 * propagates the trace context to the callee via request headers.
 *
 * The span is opened with `startActiveSpan` so that `context.active()` inside
 * the callback contains the new span; the injected headers then identify this
 * span (not its parent) as the caller. This relies on a context manager being
 * registered (the Node SDK does so).
 *
 * @param {string} url - Endpoint to call.
 * @param {*} data - JSON-serialisable request payload.
 * @returns {Promise<*>} The parsed JSON response body.
 * @throws Re-throws fetch / JSON parsing errors after recording them on the span.
 */
async function callExternalService(url, data) {
  return tracer.startActiveSpan('external_api_call', async (span) => {
    try {
      const headers = {};
      // Inject trace context into headers (W3C Trace Context)
      propagation.inject(context.active(), headers);

      const response = await fetch(url, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          ...headers, // traceparent, tracestate headers
        },
        body: JSON.stringify(data),
      });

      // Await here so the span stays open until the body has been read.
      return await response.json();
    } catch (error) {
      span.recordException(error);
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: error.message,
      });
      throw error;
    } finally {
      span.end();
    }
  });
}
// Server-side: Extract trace context from HTTP headers
app.post('/api/process', (req, res) => {
  // Rebuild the caller's trace context from the incoming request headers.
  const upstreamContext = propagation.extract(context.active(), req.headers);

  // Work done inside this function runs with the extracted context active,
  // so the span started here continues the caller's trace.
  const handleInTrace = () => {
    const requestSpan = tracer.startSpan('process_request');
    // ...
    requestSpan.end();
  };

  context.with(upstreamContext, handleInTrace);
  res.json({ status: 'ok' });
});
const {
  ParentBasedSampler,
  AlwaysOnSampler,
  AlwaysOffSampler,
  TraceIdRatioBasedSampler,
} = require('@opentelemetry/sdk-trace-base');

// Ratio sampler on its own: keeps 10% of traces.
const sampler = new TraceIdRatioBasedSampler(0.1);

// Parent-based sampler: the decision depends on whether a parent span exists
// and whether that parent was sampled.
const samplingRules = {
  root: new TraceIdRatioBasedSampler(0.1), // no parent: keep 10%
  remoteParentSampled: new AlwaysOnSampler(), // remote parent sampled: keep
  remoteParentNotSampled: new AlwaysOffSampler(), // remote parent not sampled: drop
  localParentSampled: new AlwaysOnSampler(), // local parent sampled: keep
  localParentNotSampled: new AlwaysOffSampler(), // local parent not sampled: drop
};
const parentBasedSampler = new ParentBasedSampler(samplingRules);

const sdk = new NodeSDK({
  sampler: parentBasedSampler,
  // ... other config
});
1. Use Semantic Conventions
// ✅ GOOD: Standard semantic conventions
const { SemanticAttributes } = require('@opentelemetry/semantic-conventions');
span.setAttributes({
  [SemanticAttributes.HTTP_METHOD]: 'POST',
  // A bare path is a request target. HTTP_URL (`http.url`) is reserved for the
  // full URL including scheme and host, so HTTP_TARGET is the matching key here.
  [SemanticAttributes.HTTP_TARGET]: '/api/users',
  [SemanticAttributes.HTTP_STATUS_CODE]: 200,
  [SemanticAttributes.DB_SYSTEM]: 'postgresql',
  [SemanticAttributes.DB_NAME]: 'mydb',
});
// ❌ BAD: Custom attributes without namespace
span.setAttributes({
  method: 'POST',
  url: '/api/users',
});
2. Keep Span Names Concise
// ✅ GOOD: Generic operation name (use attributes for details)
// The name is the route template, identical for every user; the concrete id
// is carried in the `user.id` attribute instead of the span name.
const span = tracer.startSpan('GET /api/users/:id', {
attributes: { 'user.id': userId },
});
// ❌ BAD: High cardinality span names
// Interpolating userId into the name produces a distinct span name per user.
const span = tracer.startSpan(`GET /api/users/${userId}`);
3. Always End Spans
// ✅ GOOD: Use try/finally to ensure span ends
// `finally` runs whether doWork() resolves or rejects, so end() is always called.
const span = tracer.startSpan('operation');
try {
await doWork();
} finally {
span.end();
}
// ❌ BAD: Span might never end
// If doWork() rejects, execution leaves before reaching span.end().
const span = tracer.startSpan('operation');
await doWork();
span.end();
4. Use Baggage for Cross-Cutting Concerns
// `context` is needed for context.with()/context.active() below.
// `baggageUtils` is not an export of '@opentelemetry/api' and was unused.
const { context, propagation } = require('@opentelemetry/api');

// Set baggage (propagates across service boundaries)
const baggage = propagation.createBaggage({
  'user.id': { value: '12345' },
  'request.id': { value: 'req-abc-123' },
});

// Run the callback with a context that carries the baggage.
context.with(propagation.setBaggage(context.active(), baggage), () => {
  // Baggage is readable from the active context anywhere inside this callback
  const userId = propagation.getBaggage(context.active())?.getEntry('user.id')?.value;
});
5. Log Correlation
const { trace } = require('@opentelemetry/api');
const winston = require('winston');

// Winston format that copies the active span's trace and span ids onto each
// log record. Records are passed through untouched when no span is active.
const traceContextFormat = winston.format((info) => {
  const activeSpan = trace.getActiveSpan();
  if (!activeSpan) {
    return info;
  }
  const { traceId, spanId } = activeSpan.spanContext();
  info.trace_id = traceId;
  info.span_id = spanId;
  return info;
});

const logger = winston.createLogger({
  format: winston.format.combine(traceContextFormat(), winston.format.json()),
  transports: [new winston.transports.Console()],
});

logger.info('Order processed', { order_id: '123' });
// Output: { "message": "Order processed", "order_id": "123", "trace_id": "...", "span_id": "..." }
# Run Jaeger all-in-one (for development)
# Published ports: 16686 serves the Jaeger UI; 4318 is the port used by the
# OTLP exporter URLs in instrumentation.js (http://localhost:4318/v1/...).
docker run -d --name jaeger \
-e COLLECTOR_OTLP_ENABLED=true \
-p 16686:16686 \
-p 4318:4318 \
jaegertracing/all-in-one:latest
# Access Jaeger UI: http://localhost:16686
Issue: No traces appearing in Jaeger Solution: Check exporter URL, ensure OTLP collector is running, verify network connectivity
Issue: High memory usage Solution: Reduce sampling rate, use batch span processor with smaller queue size
Issue: Missing trace context between services Solution: Ensure W3C Trace Context headers (traceparent, tracestate) are propagated
kubernetes-specialist: Deploying OTel Collector in K8s
aws-specialist: AWS X-Ray integration
backend-dev: Application instrumentation
mcp__flow-nexus__execution_stream_subscribe for real-time trace monitoring
mcp__flow-nexus__realtime_subscribe for live metrics
mcp__memory-mcp__memory_store for OTel patterns
Skill Version: 1.0.0
Last Updated: 2025-11-02