🎯 Recommended Samples
A balanced collection of samples from various categories for you to explore
Prometheus Monitoring
Comprehensive Prometheus monitoring and alerting system examples including custom metrics exporters, alert rule configuration, PromQL queries, and Grafana dashboard integration
💻 Custom Prometheus Metrics Exporter javascript
Build a custom Prometheus metrics exporter with multiple metric types including counters, gauges, histograms, and summaries for comprehensive application monitoring
// Custom Prometheus Metrics Exporter
// Complete Node.js application with multiple metric types and Express integration
// Run with: node metrics-exporter.js
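// Dependencies: npm install express prom-client compression cors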
import express from 'express';
import client from 'prom-client';
import compression from 'compression';
import cors from 'cors';
import os from 'os';
// Configuration
const PORT = process.env.PORT || 9091;
const HOST = process.env.HOST || '0.0.0.0';
// Create Prometheus Registry
const register = new client.Registry();
// Add default metrics (CPU, memory, etc.)
client.collectDefaultMetrics({ register });
// Custom Metrics
// ==============
// 1. Counter - Only increases (e.g., total requests)
const httpRequestsTotal = new client.Counter({
name: 'http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'route', 'status_code', 'user_agent'],
registers: [register]
});
// 2. Gauge - Can go up or down (e.g., active connections)
const activeConnections = new client.Gauge({
name: 'active_connections',
help: 'Number of active connections',
registers: [register]
});
const queueSize = new client.Gauge({
name: 'queue_size',
help: 'Current size of processing queue',
registers: [register]
});
const systemLoadGauge = new client.Gauge({
name: 'system_load_average',
help: 'System load average over the last minute',
registers: [register]
});
// 3. Histogram - Observations and their distribution
const httpRequestDuration = new client.Histogram({
name: 'http_request_duration_seconds',
help: 'Duration of HTTP requests in seconds',
labelNames: ['method', 'route', 'status_code'],
buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1, 3, 5, 7, 10],
registers: [register]
});
const databaseQueryDuration = new client.Histogram({
name: 'database_query_duration_seconds',
help: 'Duration of database queries in seconds',
labelNames: ['query_type', 'table'],
buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
registers: [register]
});
// 4. Summary - Similar to histogram but calculates quantiles on the client side
const requestSizeBytes = new client.Summary({
name: 'request_size_bytes',
help: 'Size of HTTP requests in bytes',
percentiles: [0.5, 0.9, 0.95, 0.99],
registers: [register]
});
const responseSizeBytes = new client.Summary({
name: 'response_size_bytes',
help: 'Size of HTTP responses in bytes',
percentiles: [0.5, 0.9, 0.95, 0.99],
registers: [register]
});
// Express Application Setup
// =========================
const app = express();
app.use(compression());
app.use(cors());
app.use(express.json({ limit: '10mb' }));
app.use(express.urlencoded({ extended: true }));
// Metrics Collection Middleware
app.use((req, res, next) => {
const start = Date.now();
// Track active connections
activeConnections.inc();
// Track request size
if (req.headers['content-length']) {
requestSizeBytes.observe(parseInt(req.headers['content-length']));
}
// Process the request
res.on('finish', () => {
const duration = (Date.now() - start) / 1000;
const route = req.route?.path || req.path;
// Update request counter
httpRequestsTotal
.labels(
req.method,
route,
res.statusCode.toString(),
req.headers['user-agent'] || 'unknown'
)
.inc();
// Update request duration histogram
httpRequestDuration
.labels(req.method, route, res.statusCode.toString())
.observe(duration);
// Track response size
if (res.get('content-length')) {
responseSizeBytes.observe(parseInt(res.get('content-length') || '0'));
}
// Decrease active connections
activeConnections.dec();
});
next();
});
// Health Check Endpoint
app.get('/health', (req, res) => {
res.json({
status: 'healthy',
timestamp: new Date().toISOString(),
uptime: process.uptime(),
memory: process.memoryUsage(),
pid: process.pid
});
});
// Metrics Endpoint - This is what Prometheus scrapes
app.get('/metrics', async (req, res) => {
try {
// Update system metrics
systemLoadGauge.set(os.loadavg()[0]);
res.set('Content-Type', register.contentType);
res.end(await register.metrics());
} catch (error) {
console.error('Error generating metrics:', error);
res.status(500).end(error.message);
}
});
// API Routes Examples
// ===================
// Example API endpoints with metrics
app.get('/api/users', async (req, res) => {
const queryStart = Date.now();
try {
// Simulate database query
await new Promise(resolve => setTimeout(resolve, Math.random() * 100));
// Track queue processing
queueSize.inc();
// Process business logic
const users = [
{ id: 1, name: 'Alice Johnson', email: '[email protected]', role: 'admin' },
{ id: 2, name: 'Bob Smith', email: '[email protected]', role: 'user' },
{ id: 3, name: 'Charlie Davis', email: '[email protected]', role: 'user' }
];
// Track database query duration
databaseQueryDuration
.labels('SELECT', 'users')
.observe((Date.now() - queryStart) / 1000);
// Decrease queue size
queueSize.dec();
res.json({
success: true,
data: users,
total: users.length,
timestamp: new Date().toISOString()
});
} catch (error) {
queueSize.dec();
databaseQueryDuration
.labels('SELECT', 'users')
.observe((Date.now() - queryStart) / 1000);
console.error('Error fetching users:', error);
res.status(500).json({
success: false,
error: 'Internal server error',
timestamp: new Date().toISOString()
});
}
});
app.post('/api/orders', async (req, res) => {
const queryStart = Date.now();
try {
const { userId, items, total } = req.body;
// Validate input
if (!userId || !items || !total) {
return res.status(400).json({
success: false,
error: 'Missing required fields',
timestamp: new Date().toISOString()
});
}
// Simulate order processing
queueSize.inc();
await new Promise(resolve => setTimeout(resolve, Math.random() * 200));
const order = {
id: Math.floor(Math.random() * 10000),
userId,
items,
total,
status: 'created',
createdAt: new Date().toISOString()
};
// Track database operation
databaseQueryDuration
.labels('INSERT', 'orders')
.observe((Date.now() - queryStart) / 1000);
queueSize.dec();
res.status(201).json({
success: true,
data: order,
timestamp: new Date().toISOString()
});
} catch (error) {
queueSize.dec();
databaseQueryDuration
.labels('INSERT', 'orders')
.observe((Date.now() - queryStart) / 1000);
console.error('Error creating order:', error);
res.status(500).json({
success: false,
error: 'Internal server error',
timestamp: new Date().toISOString()
});
}
});
// Error handling middleware
app.use((err, req, res, next) => {
console.error('Unhandled error:', err);
res.status(500).json({
success: false,
error: 'Internal server error',
timestamp: new Date().toISOString()
});
});
// 404 handler
app.use('*', (req, res) => {
res.status(404).json({
success: false,
error: 'Route not found',
path: req.originalUrl,
timestamp: new Date().toISOString()
});
});
// Server Startup
// ==============
const server = app.listen(PORT, HOST, () => {
console.log(`🚀 Prometheus Metrics Exporter running on http://${HOST}:${PORT}`);
console.log(`📊 Metrics available at: http://${HOST}:${PORT}/metrics`);
console.log(`💚 Health check: http://${HOST}:${PORT}/health`);
console.log(`👥 Users API: http://${HOST}:${PORT}/api/users`);
console.log(`🛒 Orders API: http://${HOST}:${PORT}/api/orders`);
});
// Graceful Shutdown
// ==================
process.on('SIGTERM', () => {
console.log('\n🛑 Received SIGTERM, shutting down gracefully...');
server.close(() => {
console.log('✅ Server closed gracefully');
process.exit(0);
});
});
process.on('SIGINT', () => {
console.log('\n🛑 Received SIGINT, shutting down gracefully...');
server.close(() => {
console.log('✅ Server closed gracefully');
process.exit(0);
});
});
// Handle uncaught exceptions
process.on('uncaughtException', (error) => {
console.error('💥 Uncaught Exception:', error);
process.exit(1);
});
process.on('unhandledRejection', (reason, promise) => {
console.error('💥 Unhandled Rejection at:', promise, 'reason:', reason);
process.exit(1);
});
export default app;
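For Prometheus to collect these metrics, point a scrape job at the exporter's /metrics endpoint. A minimal prometheus.yml sketch, assuming the exporter runs locally on its default port 9091 (the job name and scrape interval are illustrative):
# prometheus.yml (scrape configuration snippet)
scrape_configs:
  - job_name: 'custom-metrics-exporter'   # illustrative job name
    scrape_interval: 15s
    metrics_path: /metrics
    static_configs:
      - targets: ['localhost:9091']       # default PORT of the exporter above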
💻 Prometheus Alerting and Alertmanager Configuration yaml
Complete alerting setup with Prometheus alert rules, Alertmanager configuration, multiple notification channels (email, Slack, PagerDuty), and advanced alert routing
# Prometheus Alerting Configuration
# ==================================
# Complete setup with alert rules, Alertmanager, and multiple notification channels
# Prometheus Alert Rules Configuration
# File: prometheus_alert_rules.yml
# ===================================
groups:
- name: system_infrastructure_alerts
interval: 30s
rules:
# High CPU Usage Alert
- alert: HighCPUUsage
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
for: 5m
labels:
severity: warning
service: infrastructure
team: devops
annotations:
summary: "High CPU usage detected on {{ $labels.instance }}"
description: |
CPU usage is above 85% for more than 5 minutes.
Current value: {{ $value | printf "%.2f" }}%
Instance: {{ $labels.instance }}
Runbook: https://runbooks.example.com/high-cpu
dashboard_url: "https://grafana.example.com/d/cpu-dashboard"
# Critical Memory Usage Alert
- alert: CriticalMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
for: 3m
labels:
severity: critical
service: infrastructure
team: devops
annotations:
summary: "Critical memory usage on {{ $labels.instance }}"
description: |
Memory usage is above 95% for more than 3 minutes.
Current value: {{ $value | printf "%.2f" }}%
Instance: {{ $labels.instance }}
Action: Immediate investigation required
dashboard_url: "https://grafana.example.com/d/memory-dashboard"
# Disk Space Alert
- alert: DiskSpaceLow
expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 90
for: 10m
labels:
severity: warning
service: infrastructure
team: devops
annotations:
summary: "Low disk space on {{ $labels.instance }}"
description: |
Disk usage is above 90% on {{ $labels.mountpoint }}.
Current value: {{ $value | printf "%.2f" }}%
Instance: {{ $labels.instance }}
Mount point: {{ $labels.mountpoint }}
dashboard_url: "https://grafana.example.com/d/disk-dashboard"
# System Load Alert
- alert: HighSystemLoad
expr: node_load1 / on(instance) count by(instance) (node_cpu_seconds_total{mode="idle"}) > 2
for: 5m
labels:
severity: warning
service: infrastructure
team: devops
annotations:
summary: "High system load on {{ $labels.instance }}"
description: |
The 1-minute load average per CPU core is {{ $value | printf "%.2f" }}, above the threshold of 2.
Instance: {{ $labels.instance }}
- name: application_performance_alerts
interval: 15s
rules:
# HTTP Error Rate Alert
- alert: HighHTTPErrorRate
expr: (sum by(job, instance) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by(job, instance) (rate(http_requests_total[5m]))) * 100 > 5
for: 3m
labels:
severity: critical
service: application
team: backend
annotations:
summary: "High HTTP 5xx error rate"
description: |
HTTP 5xx error rate is {{ $value | printf "%.2f" }}% over the last 5 minutes.
Service: {{ $labels.job }}
Instance: {{ $labels.instance }}
Investigation required immediately
# Application Downtime Alert
- alert: ApplicationDown
expr: up{job=~"myapp|webapp"} == 0
for: 1m
labels:
severity: critical
service: application
team: backend
annotations:
summary: "Application {{ $labels.job }} is down"
description: |
Application {{ $labels.job }} on instance {{ $labels.instance }} has been down for more than 1 minute.
The target is failing Prometheus scrapes (up == 0).
Immediate action required
# High Response Time Alert
- alert: HighResponseTime
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
for: 5m
labels:
severity: warning
service: application
team: backend
annotations:
summary: "High 95th percentile response time"
description: |
95th percentile response time is {{ $value | printf "%.2f" }}s over the last 5 minutes.
Service: {{ $labels.job }}
Threshold: 2s
# Database Connection Alert
- alert: DatabaseConnectionPoolExhausted
expr: hikaricp_connections_active / hikaricp_connections_max > 0.9
for: 2m
labels:
severity: warning
service: database
team: backend
annotations:
summary: "Database connection pool nearly exhausted"
description: |
Database connection pool usage is {{ $value | humanizePercentage }}.
Active connections: {{ query "hikaricp_connections_active" | first | value }}
Max connections: {{ query "hikaricp_connections_max" | first | value }}
- name: business_metrics_alerts
interval: 60s
rules:
# Order Processing Delay Alert
- alert: OrderProcessingDelay
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{route="/api/orders"}[5m])) > 5
for: 5m
labels:
severity: warning
service: business
team: business
annotations:
summary: "Order processing delay detected"
description: |
95th percentile order processing time is {{ $value | printf "%.2f" }}s.
Threshold: 5s
This may impact customer experience
# Low Order Volume Alert
- alert: LowOrderVolume
expr: rate(http_requests_total{route="/api/orders",method="POST"}[1h]) < 1
for: 30m
labels:
severity: warning
service: business
team: business
annotations:
summary: "Low order volume detected"
description: |
Order rate is {{ $value | printf "%.2f" }} orders per hour.
This is below the expected threshold of 1 order/hour.
# User Registration Drop Alert
- alert: UserRegistrationDrop
expr: rate(http_requests_total{route="/api/users",method="POST"}[2h]) < 0.5
for: 1h
labels:
severity: warning
service: business
team: business
annotations:
summary: "User registration rate dropped"
description: |
User registration rate is {{ $value | printf "%.2f" }} per hour.
Normal rate: >0.5 registrations/hour
Investigation recommended
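# Tip: before reloading, the rule file above and the Alertmanager configuration below
# can be validated with the CLI tools shipped with Prometheus and Alertmanager
# (file paths are illustrative):
#   promtool check rules prometheus_alert_rules.yml
#   amtool check-config alertmanager.yml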
# Alertmanager Configuration
# File: alertmanager.yml
# =======================
global:
smtp_smarthost: 'smtp.example.com:587'
smtp_from: '[email protected]'
smtp_auth_username: '[email protected]'
smtp_auth_password: 'your-smtp-password'
slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
# Route configuration - defines how alerts are grouped and sent
route:
# Group alerts by these labels
group_by: ['alertname', 'cluster', 'service', 'severity']
# How long to wait before sending the first notification for a group
group_wait: 10s
# How long to wait between sending notifications for the same group
group_interval: 10s
# How long to wait before re-sending a notification for alerts that are still firing
repeat_interval: 12h
# Default receiver
receiver: 'default-receiver'
# Routing rules for different alert types
routes:
# Critical alerts go to critical channels immediately
- match:
severity: critical
receiver: 'critical-alerts'
group_wait: 0s
repeat_interval: 5m
# Infrastructure alerts go to DevOps team
- match:
service: infrastructure
receiver: 'infrastructure-alerts'
group_by: ['alertname', 'service']
# Application alerts go to Backend team
- match:
service: application
receiver: 'application-alerts'
# Business alerts go to Business team
- match:
service: business
receiver: 'business-alerts'
routes:
# Only during business hours for non-critical
- match:
severity: warning
active_time_intervals:
- business-hours
- match:
severity: critical
receiver: 'business-critical-alerts'
# Time intervals for routing
time_intervals:
- name: business-hours
time_intervals:
- times:
- start_time: '09:00'
end_time: '17:00'
weekdays: ['monday:friday']
# Receivers define where alerts are sent
receivers:
# Default receiver for unmatched alerts
- name: 'default-receiver'
email_configs:
- to: '[email protected]'
subject: '[ALERT] {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Severity: {{ .Labels.severity }}
Service: {{ .Labels.service }}
Runbook: {{ .Annotations.runbook_url }}
Dashboard: {{ .Annotations.dashboard_url }}
Started: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
# Critical alerts - multiple channels
- name: 'critical-alerts'
email_configs:
- to: '[email protected],[email protected]'
subject: '[CRITICAL] {{ .GroupLabels.alertname }} - IMMEDIATE ACTION REQUIRED'
body: |
🚨 CRITICAL ALERT 🚨
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
📊 Status: {{ .Status }}
🏷️ Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
⏰ Started: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
🔗 Dashboard: {{ .Annotations.dashboard_url }}
📖 Runbook: {{ .Annotations.runbook_url }}
{{ end }}
IMMEDIATE ACTION REQUIRED!
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/CRITICAL/WEBHOOK'
channel: '#alerts-critical'
title: '🚨 CRITICAL ALERT: {{ .GroupLabels.alertname }}'
text: |
{{ range .Alerts }}
*{{ .Annotations.summary }}*
{{ .Annotations.description }}
{{ end }}
color: 'danger'
actions:
- type: button
text: 'View Dashboard'
url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
- type: button
text: 'Runbook'
url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
# Infrastructure alerts - DevOps team
- name: 'infrastructure-alerts'
email_configs:
- to: '[email protected]'
subject: '[INFRA] {{ .GroupLabels.alertname }}'
body: |
Infrastructure Alert: {{ .GroupLabels.alertname }}
{{ range .Alerts }}
{{ .Annotations.summary }}
{{ .Annotations.description }}
Instance: {{ .Labels.instance }}
{{ end }}
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/INFRA/WEBHOOK'
channel: '#infra-alerts'
title: 'Infrastructure Alert: {{ .GroupLabels.alertname }}'
color: 'warning'
# Application alerts - Backend team
- name: 'application-alerts'
email_configs:
- to: '[email protected]'
subject: '[APP] {{ .GroupLabels.alertname }}'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/APP/WEBHOOK'
channel: '#backend-alerts'
title: 'Application Alert: {{ .GroupLabels.alertname }}'
color: 'warning'
# Business alerts - Business team
- name: 'business-alerts'
email_configs:
- to: '[email protected]'
subject: '[BUSINESS] {{ .GroupLabels.alertname }}'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/BUSINESS/WEBHOOK'
channel: '#business-alerts'
title: 'Business Alert: {{ .GroupLabels.alertname }}'
color: 'good'
# Business critical alerts - PagerDuty integration
- name: 'business-critical-alerts'
pagerduty_configs:
- service_key: 'your-pagerduty-service-key'
description: '{{ .GroupLabels.alertname }}'
details:
firing: '{{ .Alerts.Firing | len }}'
summary: '{{ (index .Alerts 0).Annotations.summary }}'
description: '{{ (index .Alerts 0).Annotations.description }}'
# Inhibition rules - prevent alert spam
inhibit_rules:
# Don't send application alerts if the server is down
- source_match:
alertname: ApplicationDown
target_match_re:
service: application
equal: ['instance']
# Don't send high error rate alerts if server is down
- source_match:
alertname: ApplicationDown
target_match:
alertname: HighHTTPErrorRate
equal: ['instance']
# Templates for custom alert formatting
templates:
- '/etc/alertmanager/templates/*.tmpl'
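To tie the two files above together, Prometheus needs to load the rule file and know where Alertmanager listens. A minimal prometheus.yml sketch, assuming both files sit next to prometheus.yml and Alertmanager runs on its default port 9093:
# prometheus.yml (alerting integration snippet)
rule_files:
  - prometheus_alert_rules.yml
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['localhost:9093']   # default Alertmanager port; adjust as needed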
💻 Advanced PromQL Query Examples promql
Comprehensive collection of PromQL queries for system monitoring, application performance analysis, business metrics, and alerting scenarios with detailed explanations
# Advanced PromQL Query Examples
# =================================
# Complete collection of monitoring queries for production environments
# 1. System Infrastructure Monitoring
# ====================================
# CPU Utilization by Instance (percentage)
# Shows CPU usage across all instances
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# CPU Usage by Mode (idle, user, system, iowait)
# Breaks down CPU usage by different modes
avg by(instance, mode) (irate(node_cpu_seconds_total[5m])) * 100
# Memory Usage Percentage
# Calculates memory usage (total - available) / total
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
# Detailed Memory Breakdown
# Shows different memory types in bytes
node_memory_MemTotal_bytes -
node_memory_MemFree_bytes -
node_memory_Buffers_bytes -
node_memory_Cached_bytes -
node_memory_SwapCached_bytes
# Disk Usage by Mount Point
# Shows disk usage percentage for each filesystem
(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100
# Disk I/O Operations per Second
# Read and write operations
irate(node_disk_reads_completed_total[5m])
irate(node_disk_writes_completed_total[5m])
# Network I/O (bytes per second)
# Network throughput
irate(node_network_receive_bytes_total[5m]) * 8 # Convert to bits
irate(node_network_transmit_bytes_total[5m]) * 8
# System Load Average
# 1-minute, 5-minute, and 15-minute load averages
node_load1
node_load5
node_load15
# Process Count by State
# Number of processes in each state (requires the node_exporter processes collector)
sum by(instance, state) (node_processes_state)
# Uptime in Hours
# System uptime
(time() - node_boot_time_seconds) / 3600
# 2. Application Performance Monitoring
# ======================================
# HTTP Request Rate (requests per second)
# Overall request rate
rate(http_requests_total[5m])
# HTTP Request Rate by Method
# Breakdown by HTTP method
sum by(method) (rate(http_requests_total{method=~"GET|POST|PUT|DELETE"}[5m]))
# HTTP Request Rate by Status Code
# Shows 2xx, 3xx, 4xx, 5xx breakdown
sum by(status_code) (rate(http_requests_total[5m]))
# HTTP Error Rate (percentage)
# Percentage of 4xx and 5xx responses
(sum(rate(http_requests_total{status_code=~"4..|5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100
# 95th Percentile Response Time
# 95% of requests complete within this time
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# 99th Percentile Response Time by Route
# Slowest routes identification
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
# Average Response Time
# Mean response time across all requests
rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])
# Request Rate by Route
# Most frequently accessed endpoints
sum by(route) (rate(http_requests_total[5m]))
# Database Connection Pool Usage
# Database connection pool utilization
(hikaricp_connections_active / hikaricp_connections_max) * 100
# Database Query Performance
# Average query duration by type
avg by(query_type) (rate(database_query_duration_seconds_sum[5m]) / rate(database_query_duration_seconds_count[5m]))
# Cache Hit Rate
# Cache effectiveness
(rate(cache_hits_total[5m]) / (rate(cache_hits_total[5m]) + rate(cache_misses_total[5m]))) * 100
# 3. Business Metrics and KPIs
# ============================
# User Registration Rate (per hour)
# New user signups
rate(user_registrations_total[1h]) * 3600
# Order Processing Rate (per minute)
# Orders being processed
rate(orders_processed_total[5m]) * 60
# Revenue per Minute
# Real-time revenue tracking
rate(order_revenue_total[5m])
# Shopping Cart Abandonment Rate
# Percentage of carts not completed
(1 - (rate(checkouts_completed_total[1h]) / rate(cart_created_total[1h]))) * 100
# Active User Sessions
# Currently logged-in users
active_user_sessions
# Conversion Rate (percentage)
# From visit to purchase
(rate(purchases_total[1h]) / rate(page_views_total{page="landing"}[1h])) * 100
# Average Order Value
# Revenue per order
rate(order_revenue_total[1h]) / rate(orders_completed_total[1h])
# Customer Retention Rate
# Percentage of customers returning
rate(returning_customers_total[24h]) / rate(total_customers_total[24h]) * 100
# Feature Usage Rate
# How often features are used
rate(feature_usage_total{feature=~".*"}[1h])
# 4. Container and Kubernetes Monitoring
# =======================================
# Container CPU Usage
# CPU usage per container
rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[5m]) * 100
# Container Memory Usage
# Memory usage per container
(container_memory_usage_bytes{container!="",container!="POD"} / container_spec_memory_limit_bytes) * 100
# Container Network I/O
# Network traffic per container
rate(container_network_receive_bytes_total[5m])
rate(container_network_transmit_bytes_total[5m])
# Pod Status Distribution
# Count of pods by status
sum by(phase) (kube_pod_status_phase)
# Node Ready Status
# Kubernetes cluster node availability
sum by(node) (kube_node_status_condition{condition="Ready",status="true"})
# Pending Pods Count
# Pods waiting to be scheduled
sum(kube_pod_status_phase{phase="Pending"})
# HPA (Horizontal Pod Autoscaler) Current Replicas
# Autoscaling status
kube_hpa_status_current_replicas
# 5. Advanced Alerting Queries
# =============================
# High CPU Usage (>90% for 5 minutes)
# CPU utilization alert
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
# Memory Pressure (>95% for 3 minutes)
# Memory usage alert
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
# Disk Space Warning (>85% usage)
# Disk usage alert
(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
# Application Down
# Service availability alert
up{job="myapp"} == 0
# High Error Rate (>5% 5xx responses)
# Application error rate alert
(sum by(job, instance) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by(job, instance) (rate(http_requests_total[5m]))) * 100 > 5
# Slow Response Time (>2 seconds 95th percentile)
# Performance degradation alert
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
# Database Connection Exhaustion
# Database resource alert
(hikaricp_connections_active / hikaricp_connections_max) > 0.9
# 6. Capacity Planning and Forecasting
# ======================================
# Disk Usage Growth (per day)
# How much used space grew over the last day (negative change in free space)
-delta(node_filesystem_avail_bytes{fstype!="tmpfs"}[1d])
# Memory Growth Trend (per hour)
# Memory usage trend
delta((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[1h:1m])
# CPU Usage Prediction (next hour)
# Linear regression for CPU usage
predict_linear((100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100))[1h:1m], 3600)
# Request Growth Rate (per day)
# Traffic growth prediction
increase(http_requests_total[1d])
# Disk Space Prediction (will the disk fill within 24 hours?)
# Predicted free bytes 24 hours from now; negative means the disk is expected to fill
predict_linear(node_filesystem_avail_bytes{fstype!="tmpfs"}[6h], 24 * 3600) < 0
# 7. SLA and SLO Monitoring
# ==========================
# Service Availability (99.9% SLA)
# Uptime percentage calculation
(1 - sum(rate(http_requests_total{status_code=~"5.."}[1d])) / sum(rate(http_requests_total[1d]))) * 100
# Response Time SLO (95% < 500ms)
# Performance SLO compliance
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[1d])) by (le, service)) < 0.5
# Error Budget Remaining (percentage)
# How much error budget is left for the month
100 - (sum(rate(http_requests_total{status_code=~"5.."}[30d])) / sum(rate(http_requests_total[30d])) * 100)
# Request Rate SLO (>1000 req/s)
# Capacity SLO
sum(rate(http_requests_total[5m])) > 1000
# 8. Correlation and Complex Queries
# ====================================
# CPU vs Memory Correlation
# Find instances with high CPU AND memory usage
(100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100))
* on(instance) ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) > 5000
# Find Slow Queries with High Rate
# Performance bottleneck identification
rate(http_request_duration_seconds_count[5m]) > 10 and
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
# Error Rate with High Traffic
# Critical issues affecting many users
rate(http_requests_total{status_code=~"5.."}[5m]) > 1 and
rate(http_requests_total[5m]) > 100
# Instance Performance Score
# Combined performance metric
(
(100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) * 0.3
+ avg by(instance) ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) * 0.3
+ avg by(instance) ((1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100) * 0.4
)
# 9. Time Series Analysis
# ========================
# Moving Average (smooths fluctuations)
# 1-hour moving average of request rate
avg_over_time(rate(http_requests_total[1m])[1h:1m])
# Rate of Change (identifies anomalies)
# How quickly metrics are changing
deriv(rate(http_requests_total[5m])[5m:1m])
# Standard Deviation (detects volatility)
# Request rate variability
stddev_over_time(rate(http_requests_total[1m])[1h:1m])
# Year-over-Year Comparison
# Compare current period with previous year
rate(http_requests_total[1d]) / rate(http_requests_total[1d] offset 365d)
# 10. Custom Business Logic Examples
# ====================================
# Revenue per User
# Business efficiency metric
rate(order_revenue_total[1h]) / rate(active_users_total[1h])
# Customer Lifetime Value Prediction
# Using historical data to predict CLV
rate(order_revenue_total[30d]) *
(rate(returning_customers_total[30d]) / rate(total_customers_total[30d]))
# API Rate Limit Utilization
# How close to rate limits
(rate(api_requests_total[5m]) / api_rate_limit) * 100
# Geographical Performance Distribution
# Performance by region
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, region))
# Cost per Transaction
# Operational efficiency
(rate(infrastructure_cost_total[1h]) / rate(transactions_processed_total[1h]))
# Batch Job Success Rate
# Cron job reliability
(rate(batch_job_success_total[1h]) / (rate(batch_job_success_total[1h]) + rate(batch_job_failure_total[1h]))) * 100
# Cache Warming Effectiveness
# How well cache is being warmed
(rate(cache_hits_total{source="warm"}[5m]) / rate(cache_hits_total[5m])) * 100
# Database Query Performance by Table
# Identify slow tables
avg by(table) (rate(database_query_duration_seconds_sum[5m]) / rate(database_query_duration_seconds_count[5m]))
# External API Dependency Health
# Third-party service reliability
(rate(external_api_requests_total{status_code=~"2.."}[5m]) / rate(external_api_requests_total[5m])) * 100
# Real-time Concurrent Users
# User engagement metric
sum by(service) (concurrent_user_sessions)
# Conversion Funnel Analysis
# Track user journey through application
sum(rate(page_views_total{page=~"^(home|product|cart|checkout|success)$"}[5m])) by (page)
# These queries can be used in:
# - Grafana dashboards for visualization
# - Prometheus alerting rules
# - Performance analysis
# - Capacity planning
# - Business intelligence
# - SLA/SLO monitoring
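A common way to reuse the heavier queries above in dashboards and alert rules is to precompute them as recording rules. A minimal sketch, assuming the http_requests_total and http_request_duration_seconds metrics from the exporter sample (rule names are illustrative):
# recording_rules.yml (sketch)
groups:
  - name: http_recording_rules
    interval: 30s
    rules:
      - record: job:http_requests:rate5m
        expr: sum by(job) (rate(http_requests_total[5m]))
      - record: job:http_request_duration_seconds:p95_5m
        expr: histogram_quantile(0.95, sum by(le, job) (rate(http_request_duration_seconds_bucket[5m])))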