Prometheus Monitoring

Comprehensive examples of Prometheus monitoring and alerting, covering a custom metrics exporter, alert rule configuration, PromQL queries, and Grafana dashboard integration

💻 Custom Prometheus Metrics Exporter javascript

🟡 intermediate

Build a custom Prometheus metrics exporter with counter, gauge, histogram, and summary metric types for comprehensive application monitoring

// Custom Prometheus Metrics Exporter
// Complete Node.js application with multiple metric types and Express integration
// Install dependencies: npm install express prom-client compression cors
// Run with: node metrics-exporter.js (uses ES modules, so set "type": "module" in package.json or rename to .mjs)

import express from 'express';
import client from 'prom-client';
import compression from 'compression';
import cors from 'cors';
import os from 'os';

// Configuration
const PORT = process.env.PORT || 9091;
const HOST = process.env.HOST || '0.0.0.0';

// Create Prometheus Registry
const register = new client.Registry();

// Add default metrics (CPU, memory, etc.)
client.collectDefaultMetrics({ register });

// Custom Metrics
// ==============

// 1. Counter - Only increases (e.g., total requests)
const httpRequestsTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status_code', 'user_agent'],
  registers: [register]
});

// 2. Gauge - Can go up or down (e.g., active connections)
const activeConnections = new client.Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
  registers: [register]
});

const queueSize = new client.Gauge({
  name: 'queue_size',
  help: 'Current size of processing queue',
  registers: [register]
});

const systemLoadGauge = new client.Gauge({
  name: 'system_load_average',
  help: 'System load average over the last minute',
  registers: [register]
});

// 3. Histogram - Observations and their distribution
const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1, 3, 5, 7, 10],
  registers: [register]
});

const databaseQueryDuration = new client.Histogram({
  name: 'database_query_duration_seconds',
  help: 'Duration of database queries in seconds',
  labelNames: ['query_type', 'table'],
  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [register]
});

// 4. Summary - Similar to histogram but calculates quantiles on the client side
const requestSizeBytes = new client.Summary({
  name: 'request_size_bytes',
  help: 'Size of HTTP requests in bytes',
  percentiles: [0.5, 0.9, 0.95, 0.99],
  registers: [register]
});

const responseSizeBytes = new client.Summary({
  name: 'response_size_bytes',
  help: 'Size of HTTP responses in bytes',
  percentiles: [0.5, 0.9, 0.95, 0.99],
  registers: [register]
});

// Express Application Setup
// =========================

const app = express();
app.use(compression());
app.use(cors());
app.use(express.json({ limit: '10mb' }));
app.use(express.urlencoded({ extended: true }));

// Metrics Collection Middleware
app.use((req, res, next) => {
  const start = Date.now();

  // Track active connections
  activeConnections.inc();

  // Track request size
  if (req.headers['content-length']) {
    requestSizeBytes.observe(parseInt(req.headers['content-length']));
  }

  // Process the request
  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    const route = req.route?.path || req.path;

    // Update request counter
    httpRequestsTotal
      .labels(
        req.method,
        route,
        res.statusCode.toString(),
        req.headers['user-agent'] || 'unknown'
      )
      .inc();

    // Update request duration histogram
    httpRequestDuration
      .labels(req.method, route, res.statusCode.toString())
      .observe(duration);

    // Track response size
    if (res.get('content-length')) {
      responseSizeBytes.observe(parseInt(res.get('content-length') || '0'));
    }

    // Decrease active connections
    activeConnections.dec();
  });

  next();
});

// Health Check Endpoint
app.get('/health', (req, res) => {
  res.json({
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    memory: process.memoryUsage(),
    pid: process.pid
  });
});

// Metrics Endpoint - This is what Prometheus scrapes
app.get('/metrics', async (req, res) => {
  try {
    // Update system metrics
    systemLoadGauge.set(os.loadavg()[0]);

    res.set('Content-Type', register.contentType);
    res.end(await register.metrics());
  } catch (error) {
    console.error('Error generating metrics:', error);
    res.status(500).end(error.message);
  }
});
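
// Example of what GET /metrics returns (Prometheus text exposition format).
// This is an illustrative excerpt, not captured output -- actual values and label
// sets depend on the traffic the application has served:
//
//   # HELP http_requests_total Total number of HTTP requests
//   # TYPE http_requests_total counter
//   http_requests_total{method="GET",route="/api/users",status_code="200",user_agent="curl/8.5.0"} 3
//
//   # HELP http_request_duration_seconds Duration of HTTP requests in seconds
//   # TYPE http_request_duration_seconds histogram
//   http_request_duration_seconds_bucket{method="GET",route="/api/users",status_code="200",le="0.1"} 3
//   http_request_duration_seconds_sum{method="GET",route="/api/users",status_code="200"} 0.142
//   http_request_duration_seconds_count{method="GET",route="/api/users",status_code="200"} 3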

// API Routes Examples
// ===================

// Example API endpoints with metrics
app.get('/api/users', async (req, res) => {
  const queryStart = Date.now();

  try {
    // Simulate database query
    await new Promise(resolve => setTimeout(resolve, Math.random() * 100));

    // Track queue processing
    queueSize.inc();

    // Process business logic
    const users = [
      { id: 1, name: 'Alice Johnson', email: '[email protected]', role: 'admin' },
      { id: 2, name: 'Bob Smith', email: '[email protected]', role: 'user' },
      { id: 3, name: 'Charlie Davis', email: '[email protected]', role: 'user' }
    ];

    // Track database query duration
    databaseQueryDuration
      .labels('SELECT', 'users')
      .observe((Date.now() - queryStart) / 1000);

    // Decrease queue size
    queueSize.dec();

    res.json({
      success: true,
      data: users,
      total: users.length,
      timestamp: new Date().toISOString()
    });

  } catch (error) {
    queueSize.dec();
    databaseQueryDuration
      .labels('SELECT', 'users')
      .observe((Date.now() - queryStart) / 1000);

    console.error('Error fetching users:', error);
    res.status(500).json({
      success: false,
      error: 'Internal server error',
      timestamp: new Date().toISOString()
    });
  }
});

app.post('/api/orders', async (req, res) => {
  const queryStart = Date.now();

  try {
    const { userId, items, total } = req.body;

    // Validate input
    if (!userId || !items || !total) {
      return res.status(400).json({
        success: false,
        error: 'Missing required fields',
        timestamp: new Date().toISOString()
      });
    }

    // Simulate order processing
    queueSize.inc();
    await new Promise(resolve => setTimeout(resolve, Math.random() * 200));

    const order = {
      id: Math.floor(Math.random() * 10000),
      userId,
      items,
      total,
      status: 'created',
      createdAt: new Date().toISOString()
    };

    // Track database operation
    databaseQueryDuration
      .labels('INSERT', 'orders')
      .observe((Date.now() - queryStart) / 1000);

    queueSize.dec();

    res.status(201).json({
      success: true,
      data: order,
      timestamp: new Date().toISOString()
    });

  } catch (error) {
    queueSize.dec();
    databaseQueryDuration
      .labels('INSERT', 'orders')
      .observe((Date.now() - queryStart) / 1000);

    console.error('Error creating order:', error);
    res.status(500).json({
      success: false,
      error: 'Internal server error',
      timestamp: new Date().toISOString()
    });
  }
});

// Error handling middleware
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);
  res.status(500).json({
    success: false,
    error: 'Internal server error',
    timestamp: new Date().toISOString()
  });
});

// 404 handler
app.use('*', (req, res) => {
  res.status(404).json({
    success: false,
    error: 'Route not found',
    path: req.originalUrl,
    timestamp: new Date().toISOString()
  });
});

// Server Startup
// ==============

const server = app.listen(PORT, HOST, () => {
  console.log(`🚀 Prometheus Metrics Exporter running on http://${HOST}:${PORT}`);
  console.log(`📊 Metrics available at: http://${HOST}:${PORT}/metrics`);
  console.log(`💚 Health check: http://${HOST}:${PORT}/health`);
  console.log(`👥 Users API: http://${HOST}:${PORT}/api/users`);
  console.log(`🛒 Orders API: http://${HOST}:${PORT}/api/orders`);
});

// Graceful Shutdown
// ==================

process.on('SIGTERM', () => {
  console.log('\n🛑 Received SIGTERM, shutting down gracefully...');
  server.close(() => {
    console.log('✅ Server closed gracefully');
    process.exit(0);
  });
});

process.on('SIGINT', () => {
  console.log('\n🛑 Received SIGINT, shutting down gracefully...');
  server.close(() => {
    console.log('✅ Server closed gracefully');
    process.exit(0);
  });
});

// Handle uncaught exceptions
process.on('uncaughtException', (error) => {
  console.error('💥 Uncaught Exception:', error);
  process.exit(1);
});

process.on('unhandledRejection', (reason, promise) => {
  console.error('💥 Unhandled Rejection at:', promise, 'reason:', reason);
  process.exit(1);
});

export default app;
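
To have Prometheus actually scrape this exporter, a scrape job must be added to prometheus.yml. The sketch below is a minimal, assumed wiring: the job name, target address, scrape interval, and Alertmanager target are placeholders to adapt to your deployment, and the rule file reference points at the alert rules defined in the next example.

# Minimal prometheus.yml sketch (assumed job names, ports, and file paths)
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - 'prometheus_alert_rules.yml'          # alert rules from the next example

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']  # assumed Alertmanager address

scrape_configs:
  - job_name: 'myapp'                     # matches up{job="myapp"} in the alert rules
    static_configs:
      - targets: ['localhost:9091']       # HOST:PORT of the exporter above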

💻 Prometheus Alerting and Alertmanager Configuration yaml

🟡 intermediate

Complete alerting setup with Prometheus alert rules, Alertmanager configuration, multiple notification channels (email, Slack, PagerDuty), and advanced alert routing

# Prometheus Alerting Configuration
# ==================================
# Complete setup with alert rules, Alertmanager, and multiple notification channels

# Prometheus Alert Rules Configuration
# File: prometheus_alert_rules.yml
# ===================================

groups:
  - name: system_infrastructure_alerts
    interval: 30s
    rules:
      # High CPU Usage Alert
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 5m
        labels:
          severity: warning
          service: infrastructure
          team: devops
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }}"
          description: |
            CPU usage is above 85% for more than 5 minutes.
            Current value: {{ $value | printf "%.2f" }}%
            Instance: {{ $labels.instance }}
            Runbook: https://runbooks.example.com/high-cpu
          dashboard_url: "https://grafana.example.com/d/cpu-dashboard"

      # Critical Memory Usage Alert
      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 3m
        labels:
          severity: critical
          service: infrastructure
          team: devops
        annotations:
          summary: "Critical memory usage on {{ $labels.instance }}"
          description: |
            Memory usage is above 95% for more than 3 minutes.
            Current value: {{ $value | printf "%.2f" }}%
            Instance: {{ $labels.instance }}
            Action: Immediate investigation required
          dashboard_url: "https://grafana.example.com/d/memory-dashboard"

      # Disk Space Alert
      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 90
        for: 10m
        labels:
          severity: warning
          service: infrastructure
          team: devops
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: |
            Disk usage is above 90% on {{ $labels.mountpoint }}.
            Current value: {{ $value | printf "%.2f" }}%
            Instance: {{ $labels.instance }}
            Mount point: {{ $labels.mountpoint }}
          dashboard_url: "https://grafana.example.com/d/disk-dashboard"

      # System Load Alert
      - alert: HighSystemLoad
        expr: node_load1 / on(instance) count by(instance) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 5m
        labels:
          severity: warning
          service: infrastructure
          team: devops
        annotations:
          summary: "High system load on {{ $labels.instance }}"
          description: |
            1-minute load average per CPU core is {{ $value | printf "%.2f" }}, above the threshold of 2.
            Instance: {{ $labels.instance }}

  - name: application_performance_alerts
    interval: 15s
    rules:
      # HTTP Error Rate Alert
      - alert: HighHTTPErrorRate
        expr: (sum by(job, instance) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by(job, instance) (rate(http_requests_total[5m]))) * 100 > 5
        for: 3m
        labels:
          severity: critical
          service: application
          team: backend
        annotations:
          summary: "High HTTP 5xx error rate"
          description: |
            HTTP 5xx error rate is {{ $value | printf "%.2f" }}% over the last 5 minutes.
            Service: {{ $labels.job }}
            Instance: {{ $labels.instance }}
            Investigation required immediately

      # Application Downtime Alert
      - alert: ApplicationDown
        expr: up{job=~"myapp|webapp"} == 0
        for: 1m
        labels:
          severity: critical
          service: application
          team: backend
        annotations:
          summary: "Application {{ $labels.job }} is down"
          description: |
            Application {{ $labels.job }} on instance {{ $labels.instance }} has been down for more than 1 minute.
            Immediate action required

      # High Response Time Alert
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: application
          team: backend
        annotations:
          summary: "High 95th percentile response time"
          description: |
            95th percentile response time is {{ $value | printf "%.2f" }}s over the last 5 minutes.
            Service: {{ $labels.job }}
            Threshold: 2s

      # Database Connection Alert
      - alert: DatabaseConnectionPoolExhausted
        expr: hikaricp_connections_active / hikaricp_connections_max > 0.9
        for: 2m
        labels:
          severity: warning
          service: database
          team: backend
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: |
            Database connection pool usage is {{ $value | humanizePercentage }}.
            Active connections: {{ query "hikaricp_connections_active" | first | value }}
            Max connections: {{ query "hikaricp_connections_max" | first | value }}

  - name: business_metrics_alerts
    interval: 60s
    rules:
      # Order Processing Delay Alert
      - alert: OrderProcessingDelay
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{route="/api/orders"}[5m])) > 5
        for: 5m
        labels:
          severity: warning
          service: business
          team: business
        annotations:
          summary: "Order processing delay detected"
          description: |
            95th percentile order processing time is {{ $value | printf "%.2f" }}s.
            Threshold: 5s
            This may impact customer experience

      # Low Order Volume Alert
      - alert: LowOrderVolume
        expr: sum(rate(http_requests_total{route="/api/orders",method="POST"}[1h])) * 3600 < 1
        for: 30m
        labels:
          severity: warning
          service: business
          team: business
        annotations:
          summary: "Low order volume detected"
          description: |
            Order rate is {{ $value | printf "%.2f" }} orders per hour.
            This is below the expected threshold of 1 order/hour.

      # User Registration Drop Alert
      - alert: UserRegistrationDrop
        expr: sum(rate(http_requests_total{route="/api/users",method="POST"}[2h])) * 3600 < 0.5
        for: 1h
        labels:
          severity: warning
          service: business
          team: business
        annotations:
          summary: "User registration rate dropped"
          description: |
            User registration rate is {{ $value | printf "%.2f" }} per hour.
            Normal rate: >0.5 registrations/hour
            Investigation recommended
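
# Optional: Recording Rules
# File: prometheus_recording_rules.yml
# ====================================
# Illustrative sketch: the heavier expressions above (error ratios, latency quantiles)
# can be precomputed with recording rules so that dashboards and alerts query cheap,
# pre-aggregated series. The rule names below are only a naming convention and do not
# exist until this file is loaded via rule_files in prometheus.yml.

groups:
  - name: http_recording_rules
    interval: 30s
    rules:
      - record: job:http_requests:rate5m
        expr: sum by(job) (rate(http_requests_total[5m]))

      - record: job:http_errors:ratio_rate5m
        expr: |
          sum by(job) (rate(http_requests_total{status_code=~"5.."}[5m]))
          /
          sum by(job) (rate(http_requests_total[5m]))

      - record: job:http_request_duration_seconds:p95_5m
        expr: histogram_quantile(0.95, sum by(job, le) (rate(http_request_duration_seconds_bucket[5m])))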

# Alertmanager Configuration
# File: alertmanager.yml
# =======================

global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: '[email protected]'
  smtp_auth_username: '[email protected]'
  smtp_auth_password: 'your-smtp-password'
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'

# Route configuration - defines how alerts are grouped and sent
route:
  # Group alerts by these labels
  group_by: ['alertname', 'cluster', 'service', 'severity']

  # How long to wait before sending the first notification for a group
  group_wait: 10s

  # How long to wait between sending notifications for the same group
  group_interval: 10s

  # How long to wait before re-sending a notification after it's been resolved
  repeat_interval: 12h

  # Default receiver
  receiver: 'default-receiver'

  # Routing rules for different alert types
  routes:
    # Critical alerts go to critical channels immediately
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 0s
      repeat_interval: 5m

    # Infrastructure alerts go to DevOps team
    - match:
        service: infrastructure
      receiver: 'infrastructure-alerts'
      group_by: ['alertname', 'service']

    # Application alerts go to Backend team
    - match:
        service: application
      receiver: 'application-alerts'

    # Business alerts go to Business team
    - match:
        service: business
      receiver: 'business-alerts'
      routes:
        # Only during business hours for non-critical
        - match:
            severity: warning
          active_time_intervals:
            - business-hours
        - match:
            severity: critical
          receiver: 'business-critical-alerts'

# Time intervals for routing
time_intervals:
  - name: business-hours
    time_intervals:
      - times:
          - start_time: '09:00'
            end_time: '17:00'
        weekdays: ['monday:friday']

# Receivers define where alerts are sent
receivers:
  # Default receiver for unmatched alerts
  - name: 'default-receiver'
    email_configs:
      - to: '[email protected]'
        subject: '[ALERT] {{ .GroupLabels.alertname }}'
        body: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Service: {{ .Labels.service }}
          Runbook: {{ .Annotations.runbook_url }}
          Dashboard: {{ .Annotations.dashboard_url }}
          Started: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          {{ end }}

  # Critical alerts - multiple channels
  - name: 'critical-alerts'
    email_configs:
      - to: '[email protected],[email protected]'
        subject: '[CRITICAL] {{ .GroupLabels.alertname }} - IMMEDIATE ACTION REQUIRED'
        body: |
          🚨 CRITICAL ALERT 🚨

          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}

          📊 Status: {{ .Status }}
          🏷️  Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}

          ⏰ Started: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          🔗 Dashboard: {{ .Annotations.dashboard_url }}
          📖 Runbook: {{ .Annotations.runbook_url }}
          {{ end }}

          IMMEDIATE ACTION REQUIRED!
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/CRITICAL/WEBHOOK'
        channel: '#alerts-critical'
        title: '🚨 CRITICAL ALERT: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *{{ .Annotations.summary }}*
          {{ .Annotations.description }}
          {{ end }}
        color: 'danger'
        actions:
          - type: button
            text: 'View Dashboard'
            url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
          - type: button
            text: 'Runbook'
            url: '{{ (index .Alerts 0).Annotations.runbook_url }}'

  # Infrastructure alerts - DevOps team
  - name: 'infrastructure-alerts'
    email_configs:
      - to: '[email protected]'
        subject: '[INFRA] {{ .GroupLabels.alertname }}'
        body: |
          Infrastructure Alert: {{ .GroupLabels.alertname }}
          {{ range .Alerts }}
          {{ .Annotations.summary }}
          {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          {{ end }}
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/INFRA/WEBHOOK'
        channel: '#infra-alerts'
        title: 'Infrastructure Alert: {{ .GroupLabels.alertname }}'
        color: 'warning'

  # Application alerts - Backend team
  - name: 'application-alerts'
    email_configs:
      - to: '[email protected]'
        subject: '[APP] {{ .GroupLabels.alertname }}'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/APP/WEBHOOK'
        channel: '#backend-alerts'
        title: 'Application Alert: {{ .GroupLabels.alertname }}'
        color: 'warning'

  # Business alerts - Business team
  - name: 'business-alerts'
    email_configs:
      - to: '[email protected]'
        subject: '[BUSINESS] {{ .GroupLabels.alertname }}'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/BUSINESS/WEBHOOK'
        channel: '#business-alerts'
        title: 'Business Alert: {{ .GroupLabels.alertname }}'
        color: 'good'

  # Business critical alerts - PagerDuty integration
  - name: 'business-critical-alerts'
    pagerduty_configs:
      - service_key: 'your-pagerduty-service-key'
        description: '{{ .GroupLabels.alertname }}'
        details:
          firing: '{{ .Alerts.Firing | len }}'
          summary: '{{ (index .Alerts 0).Annotations.summary }}'
          description: '{{ (index .Alerts 0).Annotations.description }}'

# Inhibition rules - prevent alert spam
inhibit_rules:
  # Don't send application alerts if the server is down
  - source_match:
      alertname: ApplicationDown
    target_match_re:
      service: application
    equal: ['instance']

  # Don't send high error rate alerts if server is down
  - source_match:
      alertname: ApplicationDown
    target_match:
      alertname: HighHTTPErrorRate
    equal: ['instance']

# Templates for custom alert formatting
templates:
  - '/etc/alertmanager/templates/*.tmpl'

💻 Advanced PromQL Query Examples promql

🟡 intermediate

A comprehensive collection of PromQL queries for system monitoring, application performance analysis, business metrics, and alerting scenarios, with detailed explanations

# Advanced PromQL Query Examples
# =================================
# Complete collection of monitoring queries for production environments

# 1. System Infrastructure Monitoring
# ====================================

# CPU Utilization by Instance (percentage)
# Shows CPU usage across all instances
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

# CPU Usage by Mode (idle, user, system, iowait)
# Breaks down CPU usage by different modes
avg by(instance, mode) (irate(node_cpu_seconds_total[5m])) * 100

# Memory Usage Percentage
# Calculates memory usage (total - available) / total
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

# Detailed Memory Breakdown
# Shows different memory types in bytes
node_memory_MemTotal_bytes -
node_memory_MemFree_bytes -
node_memory_Buffers_bytes -
node_memory_Cached_bytes -
node_memory_SwapCached_bytes

# Disk Usage by Mount Point
# Shows disk usage percentage for each filesystem
(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100

# Disk I/O Operations per Second
# Read and write operations
irate(node_disk_reads_completed_total[5m])
irate(node_disk_writes_completed_total[5m])

# Network I/O (bytes per second)
# Network throughput
irate(node_network_receive_bytes_total[5m]) * 8  # Convert to bits
irate(node_network_transmit_bytes_total[5m]) * 8

# System Load Average
# 1-minute, 5-minute, and 15-minute load averages
node_load1
node_load5
node_load15

# Process Count by State
# Number of processes in different states
sum by(instance, state) (node_processes_state)

# Uptime in Hours
# System uptime since the last boot (seconds converted to hours)
(time() - node_boot_time_seconds) / 3600

# 2. Application Performance Monitoring
# ======================================

# HTTP Request Rate (requests per second)
# Overall request rate
rate(http_requests_total[5m])

# HTTP Request Rate by Method
# Breakdown by HTTP method
sum by(method) (rate(http_requests_total{method=~"GET|POST|PUT|DELETE"}[5m]))

# HTTP Request Rate by Status Code
# Shows 2xx, 3xx, 4xx, 5xx breakdown
sum by(status_code) (rate(http_requests_total[5m]))

# HTTP Error Rate (percentage)
# Percentage of 4xx and 5xx responses
(sum(rate(http_requests_total{status_code=~"4..|5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100

# 95th Percentile Response Time
# 95% of requests complete within this time
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

# 99th Percentile Response Time by Route
# Slowest routes identification
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))

# Average Response Time
# Mean response time across all requests
rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])

# Request Rate by Route
# Most frequently accessed endpoints
sum by(route) (rate(http_requests_total[5m]))

# Database Connection Pool Usage
# Database connection pool utilization
(hikaricp_connections_active / hikaricp_connections_max) * 100

# Database Query Performance
# Average query duration by type
avg by(query_type) (rate(database_query_duration_seconds_sum[5m]) / rate(database_query_duration_seconds_count[5m]))

# Cache Hit Rate
# Cache effectiveness
(rate(cache_hits_total[5m]) / (rate(cache_hits_total[5m]) + rate(cache_misses_total[5m]))) * 100

# 3. Business Metrics and KPIs
# ============================

# User Registration Rate (per hour)
# New user signups
rate(user_registrations_total[1h]) * 3600

# Order Processing Rate (per minute)
# Orders being processed
rate(orders_processed_total[5m]) * 60

# Revenue per Minute
# Real-time revenue tracking
rate(order_revenue_total[5m])

# Shopping Cart Abandonment Rate
# Percentage of carts not completed
(1 - (rate(checkouts_completed_total[1h]) / rate(cart_created_total[1h]))) * 100

# Active User Sessions
# Currently logged-in users
active_user_sessions

# Conversion Rate (percentage)
# From visit to purchase
(sum(rate(purchases_total[1h])) / sum(rate(page_views_total{page="landing"}[1h]))) * 100

# Average Order Value
# Revenue per order
rate(order_revenue_total[1h]) / rate(orders_completed_total[1h])

# Customer Retention Rate
# Percentage of customers returning
rate(returning_customers_total[24h]) / rate(total_customers_total[24h]) * 100

# Feature Usage Rate
# How often features are used
rate(feature_usage_total{feature=~".*"}[1h])

# 4. Container and Kubernetes Monitoring
# =======================================

# Container CPU Usage
# CPU usage per container
rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[5m]) * 100

# Container Memory Usage
# Memory usage per container
(container_memory_usage_bytes{container!="",container!="POD"} / container_spec_memory_limit_bytes) * 100

# Container Network I/O
# Network traffic per container
rate(container_network_receive_bytes_total[5m])
rate(container_network_transmit_bytes_total[5m])

# Pod Status Distribution
# Count of pods by status
sum by(phase) (kube_pod_status_phase)

# Node Ready Status
# Kubernetes cluster node availability
sum by(node) (kube_node_status_condition{condition="Ready",status="true"})

# Pending Pods Count
# Pods waiting to be scheduled
sum(kube_pod_status_phase{phase="Pending"})

# HPA (Horizontal Pod Autoscaler) Current Replicas
# Autoscaling status
kube_hpa_status_current_replicas

# 5. Advanced Alerting Queries
# =============================

# High CPU Usage (>90% for 5 minutes)
# CPU utilization alert
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90

# Memory Pressure (>95% for 3 minutes)
# Memory usage alert
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95

# Disk Space Warning (>85% usage)
# Disk usage alert
(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85

# Application Down
# Service availability alert
up{job="myapp"} == 0

# High Error Rate (>5% 5xx responses)
# Application error rate alert
(sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 > 5

# Slow Response Time (>2 seconds 95th percentile)
# Performance degradation alert
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2

# Database Connection Exhaustion
# Database resource alert
(hikaricp_connections_active / hikaricp_connections_max) > 0.9

# 6. Capacity Planning and Forecasting
# ======================================

# Disk Growth Rate (per day)
# Growth in used disk space over the last day, in bytes (negative means space was freed)
-delta(node_filesystem_avail_bytes{fstype!="tmpfs"}[1d])

# Memory Growth Trend (per hour)
# Change in used memory over the last hour, in bytes
delta((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[1h:5m])

# CPU Usage Prediction (next hour)
# Linear regression of CPU usage, extrapolated one hour ahead
predict_linear((100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100))[1h:5m], 3600)

# Request Growth Rate (per day)
# Traffic growth prediction
increase(http_requests_total[1d])

# Disk Space Prediction (will the disk fill up within 24 hours?)
# Filesystems predicted to run out of space within the next day
predict_linear(node_filesystem_avail_bytes{fstype!="tmpfs"}[6h], 24 * 3600) < 0

# 7. SLA and SLO Monitoring
# ==========================

# Service Availability (99.9% SLA)
# Uptime percentage calculation
(1 - sum(rate(http_requests_total{status_code=~"5.."}[1d])) / sum(rate(http_requests_total[1d]))) * 100

# Response Time SLO (95% < 500ms)
# Performance SLO compliance
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[1d])) by (le, service)) < 0.5

# Error Budget (30-day availability, percentage)
# Compare this availability figure against the SLO target (e.g. 99.9%) to see how much error budget remains
100 - (sum(rate(http_requests_total{status_code=~"5.."}[30d])) / sum(rate(http_requests_total[30d])) * 100)

# Request Rate SLO (>1000 req/s)
# Capacity SLO
sum(rate(http_requests_total[5m])) > 1000

# 8. Correlation and Complex Queries
# ====================================

# CPU vs Memory Correlation
# Find instances with high CPU AND memory usage
(100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) *
avg by(instance) ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) > 5000

# Find Slow Queries with High Rate
# Performance bottleneck identification
rate(http_request_duration_seconds_count[5m]) > 10 and
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1

# Error Rate with High Traffic
# Critical issues affecting many users
rate(http_requests_total{status_code=~"5.."}[5m]) > 1 and
rate(http_requests_total[5m]) > 100

# Instance Performance Score
# Combined performance metric
(
  (100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) * 0.3 +
  avg by(instance) ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) * 0.3 +
  avg by(instance) ((1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100) * 0.4
)

# 9. Time Series Analysis
# ========================

# Moving Average (smooths fluctuations)
# 1-hour moving average of request rate
avg_over_time(rate(http_requests_total[1m])[1h:1m])

# Rate of Change (identifies anomalies)
# How quickly metrics are changing
deriv(rate(http_requests_total[5m])[5m:1m])

# Standard Deviation (detects volatility)
# Request rate variability
stddev_over_time(rate(http_requests_total[1m])[1h:1m])

# Year-over-Year Comparison
# Compare current period with previous year
rate(http_requests_total[1d]) / rate(http_requests_total[1d] offset 365d)

# 10. Custom Business Logic Examples
# ====================================

# Revenue per User
# Business efficiency metric
rate(order_revenue_total[1h]) / rate(active_users_total[1h])

# Customer Lifetime Value Prediction
# Using historical data to predict CLV
rate(order_revenue_total[30d]) *
(rate(returning_customers_total[30d]) / rate(total_customers_total[30d]))

# API Rate Limit Utilization
# How close to rate limits
(rate(api_requests_total[5m]) / api_rate_limit) * 100

# Geographical Performance Distribution
# Performance by region
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, region))

# Cost per Transaction
# Operational efficiency
(rate(infrastructure_cost_total[1h]) / rate(transactions_processed_total[1h]))

# Batch Job Success Rate
# Cron job reliability
(rate(batch_job_success_total[1h]) / (rate(batch_job_success_total[1h]) + rate(batch_job_failure_total[1h]))) * 100

# Cache Warming Effectiveness
# How well cache is being warmed
(sum(rate(cache_hits_total{source="warm"}[5m])) / sum(rate(cache_hits_total[5m]))) * 100

# Database Query Performance by Table
# Identify slow tables
avg by(table) (rate(database_query_duration_seconds_sum[5m]) / rate(database_query_duration_seconds_count[5m]))

# External API Dependency Health
# Third-party service reliability
(sum(rate(external_api_requests_total{status_code=~"2.."}[5m])) / sum(rate(external_api_requests_total[5m]))) * 100

# Real-time Concurrent Users
# User engagement metric
sum by(service) (concurrent_user_sessions)

# Conversion Funnel Analysis
# Track user journey through application
sum(rate(page_views_total{page=~"^(home|product|cart|checkout|success)$"}[5m])) by (page)

# These queries can be used in:
# - Grafana dashboards for visualization
# - Prometheus alerting rules
# - Performance analysis
# - Capacity planning
# - Business intelligence
# - SLA/SLO monitoring
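
The overview at the top of this section also mentions Grafana dashboard integration. The smallest piece of that wiring is a provisioned Prometheus data source; once it exists, any query from this collection can be pasted into a panel's query field. A minimal provisioning sketch follows; the file path, the Prometheus URL, and the interval are assumptions for a typical Docker-based setup and should be adapted to your environment.

# File: /etc/grafana/provisioning/datasources/prometheus.yml (assumed path)
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090      # assumed Prometheus address
    isDefault: true
    jsonData:
      timeInterval: 15s              # keep in sync with the Prometheus scrape_interval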