🎯 Recommended Collections
Balanced sample collections from various categories for you to explore
Prometheus Monitoring
Comprehensive Prometheus monitoring and alerting examples, including custom metric exporters, alert rule configuration, PromQL queries, and Grafana dashboard integration
💻 Custom Prometheus Metrics Exporter javascript
Build a custom Prometheus metrics exporter with multiple metric types, including counters, gauges, histograms, and summaries, for comprehensive application monitoring
// Custom Prometheus Metrics Exporter
// Complete Node.js application with multiple metric types and Express integration
// Run with: node metrics-exporter.js
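// Note: ES module imports are used below - requires "type": "module" in package.json (or rename to .mjs)
// Install dependencies first: npm install express prom-client compression cors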
import express from 'express';
import client from 'prom-client';
import compression from 'compression';
import cors from 'cors';
import os from 'os';
// Configuration
const PORT = process.env.PORT || 9091;
const HOST = process.env.HOST || '0.0.0.0';
// Create Prometheus Registry
const register = new client.Registry();
// Add default metrics (CPU, memory, etc.)
client.collectDefaultMetrics({ register });
// Custom Metrics
// ==============
// 1. Counter - Only increases (e.g., total requests)
const httpRequestsTotal = new client.Counter({
name: 'http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'route', 'status_code', 'user_agent'],
registers: [register]
});
// 2. Gauge - Can go up or down (e.g., active connections)
const activeConnections = new client.Gauge({
name: 'active_connections',
help: 'Number of active connections',
registers: [register]
});
const queueSize = new client.Gauge({
name: 'queue_size',
help: 'Current size of processing queue',
registers: [register]
});
const systemLoadGauge = new client.Gauge({
name: 'system_load_average',
help: 'System load average over the last minute',
registers: [register]
});
// 3. Histogram - Observations and their distribution
const httpRequestDuration = new client.Histogram({
name: 'http_request_duration_seconds',
help: 'Duration of HTTP requests in seconds',
labelNames: ['method', 'route', 'status_code'],
buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1, 3, 5, 7, 10],
registers: [register]
});
const databaseQueryDuration = new client.Histogram({
name: 'database_query_duration_seconds',
help: 'Duration of database queries in seconds',
labelNames: ['query_type', 'table'],
buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
registers: [register]
});
// 4. Summary - Similar to histogram but calculates quantiles on the client side
const requestSizeBytes = new client.Summary({
name: 'request_size_bytes',
help: 'Size of HTTP requests in bytes',
percentiles: [0.5, 0.9, 0.95, 0.99],
registers: [register]
});
const responseSizeBytes = new client.Summary({
name: 'response_size_bytes',
help: 'Size of HTTP responses in bytes',
percentiles: [0.5, 0.9, 0.95, 0.99],
registers: [register]
});
// Express Application Setup
// =========================
const app = express();
app.use(compression());
app.use(cors());
app.use(express.json({ limit: '10mb' }));
app.use(express.urlencoded({ extended: true }));
// Metrics Collection Middleware
app.use((req, res, next) => {
const start = Date.now();
// Track active connections
activeConnections.inc();
// Track request size
if (req.headers['content-length']) {
requestSizeBytes.observe(parseInt(req.headers['content-length']));
}
// Process the request
res.on('finish', () => {
const duration = (Date.now() - start) / 1000;
const route = req.route?.path || req.path;
// Update request counter
httpRequestsTotal
.labels(
req.method,
route,
res.statusCode.toString(),
req.headers['user-agent'] || 'unknown'
)
.inc();
// Update request duration histogram
httpRequestDuration
.labels(req.method, route, res.statusCode.toString())
.observe(duration);
// Track response size
if (res.get('content-length')) {
responseSizeBytes.observe(parseInt(res.get('content-length') || '0'));
}
// Decrease active connections
activeConnections.dec();
});
next();
});
// Health Check Endpoint
app.get('/health', (req, res) => {
res.json({
status: 'healthy',
timestamp: new Date().toISOString(),
uptime: process.uptime(),
memory: process.memoryUsage(),
pid: process.pid
});
});
// Metrics Endpoint - This is what Prometheus scrapes
app.get('/metrics', async (req, res) => {
try {
// Update system metrics
systemLoadGauge.set(os.loadavg()[0]);
res.set('Content-Type', register.contentType);
res.end(await register.metrics());
} catch (error) {
console.error('Error generating metrics:', error);
res.status(500).end(error.message);
}
});
// API Routes Examples
// ===================
// Example API endpoints with metrics
app.get('/api/users', async (req, res) => {
const queryStart = Date.now();
try {
// Simulate database query
await new Promise(resolve => setTimeout(resolve, Math.random() * 100));
// Track queue processing
queueSize.inc();
// Process business logic
const users = [
{ id: 1, name: 'Alice Johnson', email: '[email protected]', role: 'admin' },
{ id: 2, name: 'Bob Smith', email: '[email protected]', role: 'user' },
{ id: 3, name: 'Charlie Davis', email: '[email protected]', role: 'user' }
];
// Track database query duration
databaseQueryDuration
.labels('SELECT', 'users')
.observe((Date.now() - queryStart) / 1000);
// Decrease queue size
queueSize.dec();
res.json({
success: true,
data: users,
total: users.length,
timestamp: new Date().toISOString()
});
} catch (error) {
queueSize.dec();
databaseQueryDuration
.labels('SELECT', 'users')
.observe((Date.now() - queryStart) / 1000);
console.error('Error fetching users:', error);
res.status(500).json({
success: false,
error: 'Internal server error',
timestamp: new Date().toISOString()
});
}
});
app.post('/api/orders', async (req, res) => {
const queryStart = Date.now();
try {
const { userId, items, total } = req.body;
// Validate input
if (!userId || !items || !total) {
return res.status(400).json({
success: false,
error: 'Missing required fields',
timestamp: new Date().toISOString()
});
}
// Simulate order processing
queueSize.inc();
await new Promise(resolve => setTimeout(resolve, Math.random() * 200));
const order = {
id: Math.floor(Math.random() * 10000),
userId,
items,
total,
status: 'created',
createdAt: new Date().toISOString()
};
// Track database operation
databaseQueryDuration
.labels('INSERT', 'orders')
.observe((Date.now() - queryStart) / 1000);
queueSize.dec();
res.status(201).json({
success: true,
data: order,
timestamp: new Date().toISOString()
});
} catch (error) {
queueSize.dec();
databaseQueryDuration
.labels('INSERT', 'orders')
.observe((Date.now() - queryStart) / 1000);
console.error('Error creating order:', error);
res.status(500).json({
success: false,
error: 'Internal server error',
timestamp: new Date().toISOString()
});
}
});
// Error handling middleware
app.use((err, req, res, next) => {
console.error('Unhandled error:', err);
res.status(500).json({
success: false,
error: 'Internal server error',
timestamp: new Date().toISOString()
});
});
// 404 handler
app.use('*', (req, res) => {
res.status(404).json({
success: false,
error: 'Route not found',
path: req.originalUrl,
timestamp: new Date().toISOString()
});
});
// Server Startup
// ==============
const server = app.listen(PORT, HOST, () => {
console.log(`🚀 Prometheus Metrics Exporter running on http://${HOST}:${PORT}`);
console.log(`📊 Metrics available at: http://${HOST}:${PORT}/metrics`);
console.log(`💚 Health check: http://${HOST}:${PORT}/health`);
console.log(`👥 Users API: http://${HOST}:${PORT}/api/users`);
console.log(`🛒 Orders API: http://${HOST}:${PORT}/api/orders`);
});
// Graceful Shutdown
// ==================
process.on('SIGTERM', () => {
console.log('\n🛑 Received SIGTERM, shutting down gracefully...');
server.close(() => {
console.log('✅ Server closed gracefully');
process.exit(0);
});
});
process.on('SIGINT', () => {
console.log('\n🛑 Received SIGINT, shutting down gracefully...');
server.close(() => {
console.log('✅ Server closed gracefully');
process.exit(0);
});
});
// Handle uncaught exceptions
process.on('uncaughtException', (error) => {
console.error('💥 Uncaught Exception:', error);
process.exit(1);
});
process.on('unhandledRejection', (reason, promise) => {
console.error('💥 Unhandled Rejection at:', promise, 'reason:', reason);
process.exit(1);
});
export default app;
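For Prometheus to collect these metrics, a scrape job pointing at the exporter's /metrics endpoint is needed; a minimal sketch (the job name and interval are illustrative, the target matches the default PORT of 9091 above):
# prometheus.yml (excerpt) - scrape job for the exporter above
scrape_configs:
  - job_name: 'myapp'
    scrape_interval: 15s
    metrics_path: '/metrics'
    static_configs:
      - targets: ['localhost:9091']
The job name 'myapp' matches the up{job=~"myapp|webapp"} selector used in the alert rules below.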
💻 Prometheus Alerting and Alertmanager Configuration yaml
Complete alerting setup with Prometheus alert rules, Alertmanager configuration, multiple notification channels (email, Slack, PagerDuty), and advanced alert routing
# Prometheus Alerting Configuration
# ==================================
# Complete setup with alert rules, Alertmanager, and multiple notification channels
# Prometheus Alert Rules Configuration
# File: prometheus_alert_rules.yml
# ===================================
groups:
- name: system_infrastructure_alerts
interval: 30s
rules:
# High CPU Usage Alert
- alert: HighCPUUsage
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
for: 5m
labels:
severity: warning
service: infrastructure
team: devops
annotations:
summary: "High CPU usage detected on {{ $labels.instance }}"
description: |
CPU usage is above 85% for more than 5 minutes.
Current value: {{ $value | printf "%.2f" }}%
Instance: {{ $labels.instance }}
Runbook: https://runbooks.example.com/high-cpu
dashboard_url: "https://grafana.example.com/d/cpu-dashboard"
# Critical Memory Usage Alert
- alert: CriticalMemoryUsage
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
for: 3m
labels:
severity: critical
service: infrastructure
team: devops
annotations:
summary: "Critical memory usage on {{ $labels.instance }}"
description: |
Memory usage is above 95% for more than 3 minutes.
Current value: {{ $value | printf "%.2f" }}%
Instance: {{ $labels.instance }}
Action: Immediate investigation required
dashboard_url: "https://grafana.example.com/d/memory-dashboard"
# Disk Space Alert
- alert: DiskSpaceLow
expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 90
for: 10m
labels:
severity: warning
service: infrastructure
team: devops
annotations:
summary: "Low disk space on {{ $labels.instance }}"
description: |
Disk usage is above 90% on {{ $labels.mountpoint }}.
Current value: {{ $value | printf "%.2f" }}%
Instance: {{ $labels.instance }}
Mount point: {{ $labels.mountpoint }}
dashboard_url: "https://grafana.example.com/d/disk-dashboard"
# System Load Alert
- alert: HighSystemLoad
expr: node_load1 / on(instance) count by(instance) (node_cpu_seconds_total{mode="idle"}) > 2
for: 5m
labels:
severity: warning
service: infrastructure
team: devops
annotations:
summary: "High system load on {{ $labels.instance }}"
description: |
The 1-minute load average is more than 2x the number of CPU cores.
Instance: {{ $labels.instance }}
Load per core: {{ $value | printf "%.2f" }}
- name: application_performance_alerts
interval: 15s
rules:
# HTTP Error Rate Alert
- alert: HighHTTPErrorRate
expr: (sum by(job, instance) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by(job, instance) (rate(http_requests_total[5m]))) * 100 > 5
for: 3m
labels:
severity: critical
service: application
team: backend
annotations:
summary: "High HTTP 5xx error rate"
description: |
HTTP 5xx error rate is {{ $value | printf "%.2f" }}% over the last 5 minutes.
Service: {{ $labels.job }}
Instance: {{ $labels.instance }}
Investigation required immediately
# Application Downtime Alert
- alert: ApplicationDown
expr: up{job=~"myapp|webapp"} == 0
for: 1m
labels:
severity: critical
service: application
team: backend
annotations:
summary: "Application {{ $labels.job }} is down"
description: |
Application {{ $labels.job }} on instance {{ $labels.instance }} has been down for more than 1 minute.
Immediate action required
# High Response Time Alert
- alert: HighResponseTime
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
for: 5m
labels:
severity: warning
service: application
team: backend
annotations:
summary: "High 95th percentile response time"
description: |
95th percentile response time is {{ $value | printf "%.2f" }}s over the last 5 minutes.
Service: {{ $labels.job }}
Threshold: 2s
# Database Connection Alert
- alert: DatabaseConnectionPoolExhausted
expr: hikaricp_connections_active / hikaricp_connections_max > 0.9
for: 2m
labels:
severity: warning
service: database
team: backend
annotations:
summary: "Database connection pool nearly exhausted"
description: |
Database connection pool usage is {{ $value | humanizePercentage }}.
Active connections: {{ query "hikaricp_connections_active" | first | value }}
Max connections: {{ query "hikaricp_connections_max" | first | value }}
- name: business_metrics_alerts
interval: 60s
rules:
# Order Processing Delay Alert
- alert: OrderProcessingDelay
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{route="/api/orders"}[5m])) > 5
for: 5m
labels:
severity: warning
service: business
team: business
annotations:
summary: "Order processing delay detected"
description: |
95th percentile order processing time is {{ $value | printf "%.2f" }}s.
Threshold: 5s
This may impact customer experience
# Low Order Volume Alert
- alert: LowOrderVolume
expr: rate(http_requests_total{route="/api/orders",method="POST"}[1h]) < 1
for: 30m
labels:
severity: warning
service: business
team: business
annotations:
summary: "Low order volume detected"
description: |
Order rate is {{ $value | printf "%.2f" }} orders per hour.
This is below the expected threshold of 1 order/hour.
# User Registration Drop Alert
- alert: UserRegistrationDrop
expr: rate(http_requests_total{route="/api/users",method="POST"}[2h]) < 0.5
for: 1h
labels:
severity: warning
service: business
team: business
annotations:
summary: "User registration rate dropped"
description: |
User registration rate is {{ $value | printf "%.2f" }} per hour.
Normal rate: >0.5 registrations/hour
Investigation recommended
# Alertmanager Configuration
# File: alertmanager.yml
# =======================
global:
smtp_smarthost: 'smtp.example.com:587'
smtp_from: '[email protected]'
smtp_auth_username: '[email protected]'
smtp_auth_password: 'your-smtp-password'
slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
# Route configuration - defines how alerts are grouped and sent
route:
# Group alerts by these labels
group_by: ['alertname', 'cluster', 'service', 'severity']
# How long to wait before sending the first notification for a group
group_wait: 10s
# How long to wait between sending notifications for the same group
group_interval: 10s
# How long to wait before re-sending a notification after it's been resolved
repeat_interval: 12h
# Default receiver
receiver: 'default-receiver'
# Routing rules for different alert types
routes:
# Critical alerts go to critical channels immediately
- match:
severity: critical
receiver: 'critical-alerts'
group_wait: 0s
repeat_interval: 5m
# Infrastructure alerts go to DevOps team
- match:
service: infrastructure
receiver: 'infrastructure-alerts'
group_by: ['alertname', 'service']
# Application alerts go to Backend team
- match:
service: application
receiver: 'application-alerts'
# Business alerts go to Business team
- match:
service: business
receiver: 'business-alerts'
routes:
# Only during business hours for non-critical
- match:
severity: warning
active_time_intervals:
- business-hours
- match:
severity: critical
receiver: 'business-critical-alerts'
# Time intervals for routing
time_intervals:
- name: business-hours
time_intervals:
- times:
- start_time: '09:00'
end_time: '17:00'
weekdays: ['monday:friday']
# Receivers define where alerts are sent
receivers:
# Default receiver for unmatched alerts
- name: 'default-receiver'
email_configs:
- to: '[email protected]'
subject: '[ALERT] {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Severity: {{ .Labels.severity }}
Service: {{ .Labels.service }}
Runbook: {{ .Annotations.runbook_url }}
Dashboard: {{ .Annotations.dashboard_url }}
Started: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
# Critical alerts - multiple channels
- name: 'critical-alerts'
email_configs:
- to: '[email protected],[email protected]'
subject: '[CRITICAL] {{ .GroupLabels.alertname }} - IMMEDIATE ACTION REQUIRED'
body: |
🚨 CRITICAL ALERT 🚨
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
📊 Status: {{ .Status }}
🏷️ Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}
⏰ Started: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
🔗 Dashboard: {{ .Annotations.dashboard_url }}
📖 Runbook: {{ .Annotations.runbook_url }}
{{ end }}
IMMEDIATE ACTION REQUIRED!
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/CRITICAL/WEBHOOK'
channel: '#alerts-critical'
title: '🚨 CRITICAL ALERT: {{ .GroupLabels.alertname }}'
text: |
{{ range .Alerts }}
*{{ .Annotations.summary }}*
{{ .Annotations.description }}
{{ end }}
color: 'danger'
actions:
- type: button
text: 'View Dashboard'
url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
- type: button
text: 'Runbook'
url: '{{ (index .Alerts 0).Annotations.runbook_url }}'
# Infrastructure alerts - DevOps team
- name: 'infrastructure-alerts'
email_configs:
- to: '[email protected]'
subject: '[INFRA] {{ .GroupLabels.alertname }}'
body: |
Infrastructure Alert: {{ .GroupLabels.alertname }}
{{ range .Alerts }}
{{ .Annotations.summary }}
{{ .Annotations.description }}
Instance: {{ .Labels.instance }}
{{ end }}
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/INFRA/WEBHOOK'
channel: '#infra-alerts'
title: 'Infrastructure Alert: {{ .GroupLabels.alertname }}'
color: 'warning'
# Application alerts - Backend team
- name: 'application-alerts'
email_configs:
- to: '[email protected]'
subject: '[APP] {{ .GroupLabels.alertname }}'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/APP/WEBHOOK'
channel: '#backend-alerts'
title: 'Application Alert: {{ .GroupLabels.alertname }}'
color: 'warning'
# Business alerts - Business team
- name: 'business-alerts'
email_configs:
- to: '[email protected]'
subject: '[BUSINESS] {{ .GroupLabels.alertname }}'
slack_configs:
- api_url: 'https://hooks.slack.com/services/YOUR/BUSINESS/WEBHOOK'
channel: '#business-alerts'
title: 'Business Alert: {{ .GroupLabels.alertname }}'
color: 'good'
# Business critical alerts - PagerDuty integration
- name: 'business-critical-alerts'
pagerduty_configs:
- service_key: 'your-pagerduty-service-key'
description: '{{ .GroupLabels.alertname }}'
details:
firing: '{{ .Alerts.Firing | len }}'
summary: '{{ (index .Alerts 0).Annotations.summary }}'
description: '{{ (index .Alerts 0).Annotations.description }}'
# Inhibition rules - prevent alert spam
inhibit_rules:
# Don't send application alerts if the server is down
- source_match:
alertname: ApplicationDown
target_match_re:
service: application
equal: ['instance']
# Don't send high error rate alerts if server is down
- source_match:
alertname: ApplicationDown
target_match:
alertname: HighHTTPErrorRate
equal: ['instance']
# Templates for custom alert formatting
templates:
- '/etc/alertmanager/templates/*.tmpl'
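For these rules and routes to take effect, prometheus.yml must load the rule file and point at Alertmanager; a minimal sketch (the Alertmanager address is an assumption, the rule file name matches the one above):
# prometheus.yml (excerpt) - load alert rules and connect to Alertmanager
rule_files:
  - 'prometheus_alert_rules.yml'
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']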
💻 Advanced PromQL Query Examples yaml
A comprehensive collection of PromQL queries for system monitoring, application performance analysis, business metrics, and alerting scenarios, with detailed explanations
# Advanced PromQL Query Examples
# =================================
# Complete collection of monitoring queries for production environments
# 1. System Infrastructure Monitoring
# ====================================
# CPU Utilization by Instance (percentage)
# Shows CPU usage across all instances
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# CPU Usage by Mode (idle, user, system, iowait)
# Breaks down CPU usage by different modes
avg by(instance, mode) (irate(node_cpu_seconds_total[5m])) * 100
# Memory Usage Percentage
# Calculates memory usage (total - available) / total
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
# Detailed Memory Breakdown
# Shows different memory types in bytes
node_memory_MemTotal_bytes -
node_memory_MemFree_bytes -
node_memory_Buffers_bytes -
node_memory_Cached_bytes -
node_memory_SwapCached_bytes
# Disk Usage by Mount Point
# Shows disk usage percentage for each filesystem
(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100
# Disk I/O Operations per Second
# Read and write operations
irate(node_disk_reads_completed_total[5m])
irate(node_disk_writes_completed_total[5m])
# Network I/O (bytes per second)
# Network throughput
irate(node_network_receive_bytes_total[5m]) * 8 # Convert to bits
irate(node_network_transmit_bytes_total[5m]) * 8
# System Load Average
# 1-minute, 5-minute, and 15-minute load averages
node_load1
node_load5
node_load15
# Process Count by State
# Number of processes in different states
sum by(instance, state) (node_processes_state)
# Uptime in Hours
# System uptime
(time() - node_boot_time_seconds) / 3600
# 2. Application Performance Monitoring
# ======================================
# HTTP Request Rate (requests per second)
# Overall request rate
rate(http_requests_total[5m])
# HTTP Request Rate by Method
# Breakdown by HTTP method
sum by(method) (rate(http_requests_total{method=~"GET|POST|PUT|DELETE"}[5m]))
# HTTP Request Rate by Status Code
# Shows 2xx, 3xx, 4xx, 5xx breakdown
sum by(status_code) (rate(http_requests_total[5m]))
# HTTP Error Rate (percentage)
# Percentage of 4xx and 5xx responses
(sum(rate(http_requests_total{status_code=~"4..|5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100
# 95th Percentile Response Time
# 95% of requests complete within this time
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# 99th Percentile Response Time by Route
# Slowest routes identification
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))
# Average Response Time
# Mean response time across all requests
rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])
# Request Rate by Route
# Most frequently accessed endpoints
sum by(route) (rate(http_requests_total[5m]))
# Database Connection Pool Usage
# Database connection pool utilization
(hikaricp_connections_active / hikaricp_connections_max) * 100
# Database Query Performance
# Average query duration by type
avg by(query_type) (rate(database_query_duration_seconds_sum[5m]) / rate(database_query_duration_seconds_count[5m]))
# Cache Hit Rate
# Cache effectiveness
(rate(cache_hits_total[5m]) / (rate(cache_hits_total[5m]) + rate(cache_misses_total[5m]))) * 100
# 3. Business Metrics and KPIs
# ============================
# User Registration Rate (per hour)
# New user signups
rate(user_registrations_total[1h]) * 3600
# Order Processing Rate (per minute)
# Orders being processed
rate(orders_processed_total[5m]) * 60
# Revenue per Minute
# Real-time revenue tracking
rate(order_revenue_total[5m]) * 60
# Shopping Cart Abandonment Rate
# Percentage of carts not completed
(1 - (rate(checkouts_completed_total[1h]) / rate(cart_created_total[1h]))) * 100
# Active User Sessions
# Currently logged-in users
active_user_sessions
# Conversion Rate (percentage)
# From visit to purchase
(rate(purchases_total[1h]) / rate(page_views_total{page="landing"}[1h])) * 100
# Average Order Value
# Revenue per order
rate(order_revenue_total[1h]) / rate(orders_completed_total[1h])
# Customer Retention Rate
# Percentage of customers returning
rate(returning_customers_total[24h]) / rate(total_customers_total[24h]) * 100
# Feature Usage Rate
# How often features are used
rate(feature_usage_total{feature=~".*"}[1h])
# 4. Container and Kubernetes Monitoring
# =======================================
# Container CPU Usage
# CPU usage per container
rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[5m]) * 100
# Container Memory Usage
# Memory usage per container
(container_memory_usage_bytes{container!="",container!="POD"} / container_spec_memory_limit_bytes) * 100
# Container Network I/O
# Network traffic per container
rate(container_network_receive_bytes_total[5m])
rate(container_network_transmit_bytes_total[5m])
# Pod Status Distribution
# Count of pods by status
sum by(phase) (kube_pod_status_phase)
# Node Ready Status
# Kubernetes cluster node availability
sum by(node) (kube_node_status_condition{condition="Ready",status="true"})
# Pending Pods Count
# Pods waiting to be scheduled
sum(kube_pod_status_phase{phase="Pending"})
# HPA (Horizontal Pod Autoscaler) Current Replicas
# Autoscaling status
kube_hpa_status_current_replicas
# 5. Advanced Alerting Queries
# =============================
# High CPU Usage (>90% for 5 minutes)
# CPU utilization alert
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
# Memory Pressure (>95% for 3 minutes)
# Memory usage alert
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
# Disk Space Warning (>85% usage)
# Disk usage alert
(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85
# Application Down
# Service availability alert
up{job="myapp"} == 0
# High Error Rate (>5% 5xx responses)
# Application error rate alert
(sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 > 5
# Slow Response Time (>2 seconds 95th percentile)
# Performance degradation alert
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
# Database Connection Exhaustion
# Database resource alert
(hikaricp_connections_active / hikaricp_connections_max) > 0.9
# 6. Capacity Planning and Forecasting
# ======================================
# Disk Usage Growth (bytes per day)
# How much available space shrank over the last day (helps predict when disks will fill up)
-delta(node_filesystem_avail_bytes{fstype!="tmpfs"}[1d])
# Memory Usage Growth (bytes per hour)
# How much used memory grew over the last hour
-delta(node_memory_MemAvailable_bytes[1h])
# CPU Usage Prediction (next hour)
# Linear regression for CPU usage
predict_linear((100 - avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)[1h:1m], 3600)
# Request Growth Rate (per day)
# Traffic growth prediction
increase(http_requests_total[1d])
# Disk Space Prediction (filesystems about to fill up)
# Fires for filesystems predicted to run out of space within 4 hours, based on the last 6h trend
predict_linear(node_filesystem_avail_bytes{fstype!="tmpfs"}[6h], 4 * 3600) < 0
# 7. SLA and SLO Monitoring
# ==========================
# Service Availability (99.9% SLA)
# Uptime percentage calculation
(1 - sum(rate(http_requests_total{status_code=~"5.."}[1d])) / sum(rate(http_requests_total[1d]))) * 100
# Response Time SLO (95% < 500ms)
# Performance SLO compliance
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[1d])) by (le, service)) < 0.5
# Availability over the Last 30 Days (percentage)
# Compare against the SLO target to determine how much error budget remains
100 - (sum(rate(http_requests_total{status_code=~"5.."}[30d])) / sum(rate(http_requests_total[30d])) * 100)
# Request Rate SLO (>1000 req/s)
# Capacity SLO
sum(rate(http_requests_total[5m])) > 1000
# 8. Correlation and Complex Queries
# ====================================
# CPU vs Memory Correlation
# Find instances with high CPU AND memory usage
(100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) *
avg by(instance) ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) > 5000
# Find Slow Queries with High Rate
# Performance bottleneck identification
rate(http_request_duration_seconds_count[5m]) > 10 and
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
# Error Rate with High Traffic
# Critical issues affecting many users
rate(http_requests_total{status_code=~"5.."}[5m]) > 1 and
rate(http_requests_total[5m]) > 100
# Instance Performance Score
# Combined performance metric
(
(100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) * 0.3 +
avg by(instance) ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) * 0.3 +
max by(instance) ((1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100) * 0.4
)
# 9. Time Series Analysis
# ========================
# Moving Average (smooths fluctuations)
# 1-hour moving average of request rate
avg_over_time(rate(http_requests_total[1m])[1h:1m])
# Rate of Change (identifies anomalies)
# How quickly metrics are changing
deriv(rate(http_requests_total[5m])[5m:1m])
# Standard Deviation (detects volatility)
# Request rate variability
stddev_over_time(rate(http_requests_total[1m])[1h:1m])
# Year-over-Year Comparison
# Compare current period with previous year
rate(http_requests_total[1d]) / rate(http_requests_total[1d] offset 365d)
# 10. Custom Business Logic Examples
# ====================================
# Revenue per User
# Business efficiency metric
rate(order_revenue_total[1h]) / rate(active_users_total[1h])
# Customer Lifetime Value Prediction
# Using historical data to predict CLV
rate(order_revenue_total[30d]) *
(rate(returning_customers_total[30d]) / rate(total_customers_total[30d]))
# API Rate Limit Utilization
# How close to rate limits
(rate(api_requests_total[5m]) / api_rate_limit) * 100
# Geographical Performance Distribution
# Performance by region
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, region))
# Cost per Transaction
# Operational efficiency
(rate(infrastructure_cost_total[1h]) / rate(transactions_processed_total[1h]))
# Batch Job Success Rate
# Cron job reliability
(rate(batch_job_success_total[1h]) / (rate(batch_job_success_total[1h]) + rate(batch_job_failure_total[1h]))) * 100
# Cache Warming Effectiveness
# How well cache is being warmed
(rate(cache_hits_total{source="warm"}[5m]) / rate(cache_hits_total[5m])) * 100
# Database Query Performance by Table
# Identify slow tables
avg by(table) (rate(database_query_duration_seconds_sum[5m]) / rate(database_query_duration_seconds_count[5m]))
# External API Dependency Health
# Third-party service reliability
(rate(external_api_requests_total{status_code=~"2.."}[5m]) / rate(external_api_requests_total[5m])) * 100
# Real-time Concurrent Users
# User engagement metric
sum by(service) (concurrent_user_sessions)
# Conversion Funnel Analysis
# Track user journey through application
sum(rate(page_views_total{page=~"^(home|product|cart|checkout|success)$"}[5m])) by (page)
# These queries can be used in:
# - Grafana dashboards for visualization
# - Prometheus alerting rules
# - Performance analysis
# - Capacity planning
# - Business intelligence
# - SLA/SLO monitoring
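Expensive expressions from this collection are often precomputed as recording rules so dashboards and alerts read a cheap, pre-aggregated series instead; a minimal sketch reusing the error-rate and latency queries above (rule names follow the level:metric:operation convention but are otherwise illustrative):
# recording_rules.yml (illustrative) - precompute frequently used expressions
groups:
  - name: http_recording_rules
    interval: 30s
    rules:
      - record: job:http_requests:error_rate5m
        expr: sum by(job) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by(job) (rate(http_requests_total[5m]))
      - record: job:http_request_duration_seconds:p95_5m
        expr: histogram_quantile(0.95, sum by(le, job) (rate(http_request_duration_seconds_bucket[5m])))
For the Grafana dashboards referenced throughout, a datasource provisioning file is enough to make these queries available in panels; a minimal sketch (the Prometheus URL is an assumption):
# grafana/provisioning/datasources/prometheus.yml (illustrative)
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true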