Prometheus Monitoring

Complete Prometheus monitoring and alerting examples, including custom metrics exporters, alert rule configuration, PromQL queries, and Grafana dashboard integration.

💻 Custom Prometheus Metrics Exporter javascript

🟡 intermediate

Build a custom Prometheus metrics exporter with multiple metric types, including counters, gauges, histograms, and summaries, for comprehensive application monitoring.

// Custom Prometheus Metrics Exporter
// Complete Node.js application with multiple metric types and Express integration
// Run with: node metrics-exporter.js (ES modules: requires "type": "module" in package.json, or rename to .mjs)

import express from 'express';
import client from 'prom-client';
import compression from 'compression';
import cors from 'cors';
import os from 'os';

// Configuration
const PORT = process.env.PORT || 9091;
const HOST = process.env.HOST || '0.0.0.0';

// Create Prometheus Registry
const register = new client.Registry();

// Add default metrics (CPU, memory, etc.)
client.collectDefaultMetrics({ register });

// Custom Metrics
// ==============

// 1. Counter - Only increases (e.g., total requests)
const httpRequestsTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  // Note: user_agent is potentially high-cardinality; consider normalizing or dropping this label in production
  labelNames: ['method', 'route', 'status_code', 'user_agent'],
  registers: [register]
});

// 2. Gauge - Can go up or down (e.g., active connections)
const activeConnections = new client.Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
  registers: [register]
});

const queueSize = new client.Gauge({
  name: 'queue_size',
  help: 'Current size of processing queue',
  registers: [register]
});

const systemLoadGauge = new client.Gauge({
  name: 'system_load_average',
  help: 'System load average over the last minute',
  registers: [register]
});

// 3. Histogram - Observations and their distribution
const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status_code'],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, 0.7, 1, 3, 5, 7, 10],
  registers: [register]
});

const databaseQueryDuration = new client.Histogram({
  name: 'database_query_duration_seconds',
  help: 'Duration of database queries in seconds',
  labelNames: ['query_type', 'table'],
  buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [register]
});

// 4. Summary - Similar to histogram but calculates quantiles on the client side
const requestSizeBytes = new client.Summary({
  name: 'request_size_bytes',
  help: 'Size of HTTP requests in bytes',
  percentiles: [0.5, 0.9, 0.95, 0.99],
  registers: [register]
});

const responseSizeBytes = new client.Summary({
  name: 'response_size_bytes',
  help: 'Size of HTTP responses in bytes',
  percentiles: [0.5, 0.9, 0.95, 0.99],
  registers: [register]
});

// Express Application Setup
// =========================

const app = express();
app.use(compression());
app.use(cors());
app.use(express.json({ limit: '10mb' }));
app.use(express.urlencoded({ extended: true }));

// Metrics Collection Middleware
app.use((req, res, next) => {
  const start = Date.now();

  // Track active connections
  activeConnections.inc();

  // Track request size
  if (req.headers['content-length']) {
    requestSizeBytes.observe(parseInt(req.headers['content-length']));
  }

  // Process the request
  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    const route = req.route?.path || req.path;

    // Update request counter
    httpRequestsTotal
      .labels(
        req.method,
        route,
        res.statusCode.toString(),
        req.headers['user-agent'] || 'unknown'
      )
      .inc();

    // Update request duration histogram
    httpRequestDuration
      .labels(req.method, route, res.statusCode.toString())
      .observe(duration);

    // Track response size
    if (res.get('content-length')) {
      responseSizeBytes.observe(parseInt(res.get('content-length') || '0'));
    }

    // Decrease active connections
    activeConnections.dec();
  });

  next();
});

// Health Check Endpoint
app.get('/health', (req, res) => {
  res.json({
    status: 'healthy',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    memory: process.memoryUsage(),
    pid: process.pid
  });
});

// Metrics Endpoint - This is what Prometheus scrapes
app.get('/metrics', async (req, res) => {
  try {
    // Update system metrics
    systemLoadGauge.set(os.loadavg()[0]);

    res.set('Content-Type', register.contentType);
    res.end(await register.metrics());
  } catch (error) {
    console.error('Error generating metrics:', error);
    res.status(500).end(error.message);
  }
});

// API Routes Examples
// ===================

// Example API endpoints with metrics
app.get('/api/users', async (req, res) => {
  const queryStart = Date.now();

  try {
    // Simulate database query
    await new Promise(resolve => setTimeout(resolve, Math.random() * 100));

    // Track queue processing
    queueSize.inc();

    // Process business logic
    const users = [
      { id: 1, name: 'Alice Johnson', email: 'alice@example.com', role: 'admin' },
      { id: 2, name: 'Bob Smith', email: 'bob@example.com', role: 'user' },
      { id: 3, name: 'Charlie Davis', email: 'charlie@example.com', role: 'user' }
    ];

    // Track database query duration
    databaseQueryDuration
      .labels('SELECT', 'users')
      .observe((Date.now() - queryStart) / 1000);

    // Decrease queue size
    queueSize.dec();

    res.json({
      success: true,
      data: users,
      total: users.length,
      timestamp: new Date().toISOString()
    });

  } catch (error) {
    queueSize.dec();
    databaseQueryDuration
      .labels('SELECT', 'users')
      .observe((Date.now() - queryStart) / 1000);

    console.error('Error fetching users:', error);
    res.status(500).json({
      success: false,
      error: 'Internal server error',
      timestamp: new Date().toISOString()
    });
  }
});

app.post('/api/orders', async (req, res) => {
  const queryStart = Date.now();

  try {
    const { userId, items, total } = req.body;

    // Validate input
    if (!userId || !items || !total) {
      return res.status(400).json({
        success: false,
        error: 'Missing required fields',
        timestamp: new Date().toISOString()
      });
    }

    // Simulate order processing
    queueSize.inc();
    await new Promise(resolve => setTimeout(resolve, Math.random() * 200));

    const order = {
      id: Math.floor(Math.random() * 10000),
      userId,
      items,
      total,
      status: 'created',
      createdAt: new Date().toISOString()
    };

    // Track database operation
    databaseQueryDuration
      .labels('INSERT', 'orders')
      .observe((Date.now() - queryStart) / 1000);

    queueSize.dec();

    res.status(201).json({
      success: true,
      data: order,
      timestamp: new Date().toISOString()
    });

  } catch (error) {
    queueSize.dec();
    databaseQueryDuration
      .labels('INSERT', 'orders')
      .observe((Date.now() - queryStart) / 1000);

    console.error('Error creating order:', error);
    res.status(500).json({
      success: false,
      error: 'Internal server error',
      timestamp: new Date().toISOString()
    });
  }
});

// Error handling middleware
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);
  res.status(500).json({
    success: false,
    error: 'Internal server error',
    timestamp: new Date().toISOString()
  });
});

// 404 handler
app.use((req, res) => {
  res.status(404).json({
    success: false,
    error: 'Route not found',
    path: req.originalUrl,
    timestamp: new Date().toISOString()
  });
});

// Server Startup
// ==============

const server = app.listen(PORT, HOST, () => {
  console.log(`🚀 Prometheus Metrics Exporter running on http://${HOST}:${PORT}`);
  console.log(`📊 Metrics available at: http://${HOST}:${PORT}/metrics`);
  console.log(`💚 Health check: http://${HOST}:${PORT}/health`);
  console.log(`👥 Users API: http://${HOST}:${PORT}/api/users`);
  console.log(`🛒 Orders API: http://${HOST}:${PORT}/api/orders`);
});

// Graceful Shutdown
// ==================

process.on('SIGTERM', () => {
  console.log('\n🛑 Received SIGTERM, shutting down gracefully...');
  server.close(() => {
    console.log('✅ Server closed gracefully');
    process.exit(0);
  });
});

process.on('SIGINT', () => {
  console.log('\n🛑 Received SIGINT, shutting down gracefully...');
  server.close(() => {
    console.log('✅ Server closed gracefully');
    process.exit(0);
  });
});

// Handle uncaught exceptions
process.on('uncaughtException', (error) => {
  console.error('💥 Uncaught Exception:', error);
  process.exit(1);
});

process.on('unhandledRejection', (reason, promise) => {
  console.error('💥 Unhandled Rejection at:', promise, 'reason:', reason);
  process.exit(1);
});

export default app;
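
To collect these metrics, Prometheus needs a scrape job pointing at the exporter's /metrics endpoint. A minimal sketch of that job follows; the localhost:9091 target matches the default PORT above, while the job name and 15s interval are assumptions to adapt to your environment.

# prometheus.yml (excerpt)
scrape_configs:
  - job_name: 'custom-metrics-exporter'
    scrape_interval: 15s
    metrics_path: /metrics
    static_configs:
      - targets: ['localhost:9091']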

💻 Prometheus Alerting and Alertmanager Configuration yaml

🟡 intermediate

Complete alerting setup with Prometheus alert rules, Alertmanager configuration, multiple notification channels (email, Slack, PagerDuty), and advanced alert routing.

# Prometheus Alerting Configuration
# ==================================
# Complete setup with alert rules, Alertmanager, and multiple notification channels

# Prometheus Alert Rules Configuration
# File: prometheus_alert_rules.yml
# ===================================

groups:
  - name: system_infrastructure_alerts
    interval: 30s
    rules:
      # High CPU Usage Alert
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 5m
        labels:
          severity: warning
          service: infrastructure
          team: devops
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }}"
          description: |
            CPU usage is above 85% for more than 5 minutes.
            Current value: {{ $value | printf "%.2f" }}%
            Instance: {{ $labels.instance }}
            Runbook: https://runbooks.example.com/high-cpu
          dashboard_url: "https://grafana.example.com/d/cpu-dashboard"

      # Critical Memory Usage Alert
      - alert: CriticalMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95
        for: 3m
        labels:
          severity: critical
          service: infrastructure
          team: devops
        annotations:
          summary: "Critical memory usage on {{ $labels.instance }}"
          description: |
            Memory usage is above 95% for more than 3 minutes.
            Current value: {{ $value | printf "%.2f" }}%
            Instance: {{ $labels.instance }}
            Action: Immediate investigation required
          dashboard_url: "https://grafana.example.com/d/memory-dashboard"

      # Disk Space Alert
      - alert: DiskSpaceLow
        expr: (1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 90
        for: 10m
        labels:
          severity: warning
          service: infrastructure
          team: devops
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: |
            Disk usage is above 90% on {{ $labels.mountpoint }}.
            Current value: {{ $value | printf "%.2f" }}%
            Instance: {{ $labels.instance }}
            Mount point: {{ $labels.mountpoint }}
          dashboard_url: "https://grafana.example.com/d/disk-dashboard"

      # System Load Alert
      - alert: HighSystemLoad
        expr: node_load1 / on(instance) count by(instance) (node_cpu_seconds_total{mode="idle"}) > 2
        for: 5m
        labels:
          severity: warning
          service: infrastructure
          team: devops
        annotations:
          summary: "High system load on {{ $labels.instance }}"
          description: |
            System 1-minute load average is {{ $value | printf "%.2f" }}, which is 2x the number of CPU cores.
            Instance: {{ $labels.instance }}
            Load average: {{ $value }}

  - name: application_performance_alerts
    interval: 15s
    rules:
      # HTTP Error Rate Alert
      - alert: HighHTTPErrorRate
        expr: (sum by(job, instance) (rate(http_requests_total{status_code=~"5.."}[5m])) / sum by(job, instance) (rate(http_requests_total[5m]))) * 100 > 5
        for: 3m
        labels:
          severity: critical
          service: application
          team: backend
        annotations:
          summary: "High HTTP 5xx error rate"
          description: |
            HTTP 5xx error rate is {{ $value | printf "%.2f" }}% over the last 5 minutes.
            Service: {{ $labels.job }}
            Instance: {{ $labels.instance }}
            Investigation required immediately

      # Application Downtime Alert
      - alert: ApplicationDown
        expr: up{job=~"myapp|webapp"} == 0
        for: 1m
        labels:
          severity: critical
          service: application
          team: backend
        annotations:
          summary: "Application {{ $labels.job }} is down"
          description: |
            Application {{ $labels.job }} on instance {{ $labels.instance }} has been down for more than 1 minute.
            Immediate action required

      # High Response Time Alert
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
          service: application
          team: backend
        annotations:
          summary: "High 95th percentile response time"
          description: |
            95th percentile response time is {{ $value | printf "%.2f" }}s over the last 5 minutes.
            Service: {{ $labels.job }}
            Threshold: 2s

      # Database Connection Alert
      - alert: DatabaseConnectionPoolExhausted
        expr: hikaricp_connections_active / hikaricp_connections_max > 0.9
        for: 2m
        labels:
          severity: warning
          service: database
          team: backend
        annotations:
          summary: "Database connection pool nearly exhausted"
          description: |
            Database connection pool usage is {{ $value | humanizePercentage }}.
            Active connections: {{ query "hikaricp_connections_active" | first | value }}
            Max connections: {{ query "hikaricp_connections_max" | first | value }}

  - name: business_metrics_alerts
    interval: 60s
    rules:
      # Order Processing Delay Alert
      - alert: OrderProcessingDelay
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{route="/api/orders"}[5m])) > 5
        for: 5m
        labels:
          severity: warning
          service: business
          team: business
        annotations:
          summary: "Order processing delay detected"
          description: |
            95th percentile order processing time is {{ $value | printf "%.2f" }}s.
            Threshold: 5s
            This may impact customer experience

      # Low Order Volume Alert
      - alert: LowOrderVolume
        expr: sum(rate(http_requests_total{route="/api/orders",method="POST"}[1h])) * 3600 < 1
        for: 30m
        labels:
          severity: warning
          service: business
          team: business
        annotations:
          summary: "Low order volume detected"
          description: |
            Order rate is {{ $value | printf "%.2f" }} orders per hour.
            This is below the expected threshold of 1 order/hour.

      # User Registration Drop Alert
      - alert: UserRegistrationDrop
        expr: sum(rate(http_requests_total{route="/api/users",method="POST"}[2h])) * 3600 < 0.5
        for: 1h
        labels:
          severity: warning
          service: business
          team: business
        annotations:
          summary: "User registration rate dropped"
          description: |
            User registration rate is {{ $value | printf "%.2f" }} per hour.
            Normal rate: >0.5 registrations/hour
            Investigation recommended

# Alertmanager Configuration
# File: alertmanager.yml
# =======================

global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alertmanager@example.com'
  smtp_auth_username: 'alertmanager@example.com'
  smtp_auth_password: 'your-smtp-password'
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'

# Route configuration - defines how alerts are grouped and sent
route:
  # Group alerts by these labels
  group_by: ['alertname', 'cluster', 'service', 'severity']

  # How long to wait before sending the first notification for a group
  group_wait: 10s

  # How long to wait before sending a notification about new alerts added to an existing group
  group_interval: 10s

  # How long to wait before re-sending a notification for an alert that is still firing
  repeat_interval: 12h

  # Default receiver
  receiver: 'default-receiver'

  # Routing rules for different alert types
  routes:
    # Critical alerts go to critical channels immediately
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 0s
      repeat_interval: 5m

    # Infrastructure alerts go to DevOps team
    - match:
        service: infrastructure
      receiver: 'infrastructure-alerts'
      group_by: ['alertname', 'service']

    # Application alerts go to Backend team
    - match:
        service: application
      receiver: 'application-alerts'

    # Business alerts go to Business team
    - match:
        service: business
      receiver: 'business-alerts'
      routes:
        # Only during business hours for non-critical
        - match:
            severity: warning
          active_time_intervals:
            - business-hours
        - match:
            severity: critical
          receiver: 'business-critical-alerts'

# Time intervals for routing
time_intervals:
  - name: business-hours
    time_intervals:
      - times:
          - start_time: '09:00'
            end_time: '17:00'
        weekdays: ['monday:friday']

# Receivers define where alerts are sent
receivers:
  # Default receiver for unmatched alerts
  - name: 'default-receiver'
    email_configs:
      - to: 'monitoring@example.com'
        subject: '[ALERT] {{ .GroupLabels.alertname }}'
        body: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Service: {{ .Labels.service }}
          Runbook: {{ .Annotations.runbook_url }}
          Dashboard: {{ .Annotations.dashboard_url }}
          Started: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          {{ end }}

  # Critical alerts - multiple channels
  - name: 'critical-alerts'
    email_configs:
      - to: 'oncall@example.com,sre@example.com'
        subject: '[CRITICAL] {{ .GroupLabels.alertname }} - IMMEDIATE ACTION REQUIRED'
        body: |
          🚨 CRITICAL ALERT 🚨

          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}

          🏷️  Labels: {{ range .Labels.SortedPairs }}{{ .Name }}={{ .Value }} {{ end }}

          ⏰ Started: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
          🔗 Dashboard: {{ .Annotations.dashboard_url }}
          📖 Runbook: {{ .Annotations.runbook_url }}
          {{ end }}

          IMMEDIATE ACTION REQUIRED!
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/CRITICAL/WEBHOOK'
        channel: '#alerts-critical'
        title: '🚨 CRITICAL ALERT: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *{{ .Annotations.summary }}*
          {{ .Annotations.description }}
          {{ end }}
        color: 'danger'
        actions:
          - type: button
            text: 'View Dashboard'
            url: '{{ (index .Alerts 0).Annotations.dashboard_url }}'
          - type: button
            text: 'Runbook'
            url: '{{ (index .Alerts 0).Annotations.runbook_url }}'

  # Infrastructure alerts - DevOps team
  - name: 'infrastructure-alerts'
    email_configs:
      - to: 'devops@example.com'
        subject: '[INFRA] {{ .GroupLabels.alertname }}'
        body: |
          Infrastructure Alert: {{ .GroupLabels.alertname }}
          {{ range .Alerts }}
          {{ .Annotations.summary }}
          {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          {{ end }}
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/INFRA/WEBHOOK'
        channel: '#infra-alerts'
        title: 'Infrastructure Alert: {{ .GroupLabels.alertname }}'
        color: 'warning'

  # Application alerts - Backend team
  - name: 'application-alerts'
    email_configs:
      - to: 'backend@example.com'
        subject: '[APP] {{ .GroupLabels.alertname }}'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/APP/WEBHOOK'
        channel: '#backend-alerts'
        title: 'Application Alert: {{ .GroupLabels.alertname }}'
        color: 'warning'

  # Business alerts - Business team
  - name: 'business-alerts'
    email_configs:
      - to: 'business@example.com'
        subject: '[BUSINESS] {{ .GroupLabels.alertname }}'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/BUSINESS/WEBHOOK'
        channel: '#business-alerts'
        title: 'Business Alert: {{ .GroupLabels.alertname }}'
        color: 'good'

  # Business critical alerts - PagerDuty integration
  - name: 'business-critical-alerts'
    pagerduty_configs:
      - service_key: 'your-pagerduty-service-key'
        description: '{{ .GroupLabels.alertname }}'
        details:
          firing: '{{ .Alerts.Firing | len }}'
          summary: '{{ (index .Alerts 0).Annotations.summary }}'
          description: '{{ (index .Alerts 0).Annotations.description }}'

# Inhibition rules - prevent alert spam
inhibit_rules:
  # Don't send application alerts if the server is down
  - source_match:
      alertname: ApplicationDown
    target_match_re:
      service: application
    equal: ['instance']

  # Don't send high error rate alerts if server is down
  - source_match:
      alertname: ApplicationDown
    target_match:
      alertname: HighHTTPErrorRate
    equal: ['instance']

# Templates for custom alert formatting
templates:
  - '/etc/alertmanager/templates/*.tmpl'
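
# Prometheus Configuration Wiring
# File: prometheus.yml (excerpt)
# ===============================
# For the rules and routes above to take effect, Prometheus must load the rule file
# and know where Alertmanager listens. A minimal sketch; the file path and the
# alertmanager:9093 target are assumptions to adapt to your deployment.

rule_files:
  - 'prometheus_alert_rules.yml'

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']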

💻 Advanced PromQL Query Examples promql

🟡 intermediate

A complete collection of PromQL queries for system monitoring, application performance analysis, business metrics, and alerting scenarios, with detailed explanations.

# Advanced PromQL Query Examples
# =================================
# Complete collection of monitoring queries for production environments

# 1. System Infrastructure Monitoring
# ====================================

# CPU Utilization by Instance (percentage)
# Shows CPU usage across all instances
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

# CPU Usage by Mode (idle, user, system, iowait)
# Breaks down CPU usage by different modes
avg by(instance, mode) (irate(node_cpu_seconds_total[5m])) * 100

# Memory Usage Percentage
# Calculates memory usage (total - available) / total
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

# Detailed Memory Breakdown
# Shows different memory types in bytes
node_memory_MemTotal_bytes -
node_memory_MemFree_bytes -
node_memory_Buffers_bytes -
node_memory_Cached_bytes -
node_memory_SwapCached_bytes

# Disk Usage by Mount Point
# Shows disk usage percentage for each filesystem
(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100

# Disk I/O Operations per Second
# Read and write operations
irate(node_disk_reads_completed_total[5m])
irate(node_disk_writes_completed_total[5m])

# Network I/O (bytes per second)
# Network throughput
irate(node_network_receive_bytes_total[5m]) * 8  # Convert to bits
irate(node_network_transmit_bytes_total[5m]) * 8

# System Load Average
# 1-minute, 5-minute, and 15-minute load averages
node_load1
node_load5
node_load15

# Process Count by State
# Number of processes in each state (running, sleeping, zombie, ...)
sum by(instance, state) (node_processes_state)

# Uptime in Hours
# Time since last boot, converted from seconds to hours
(time() - node_boot_time_seconds) / 3600

# 2. Application Performance Monitoring
# ======================================

# HTTP Request Rate (requests per second)
# Overall request rate
rate(http_requests_total[5m])

# HTTP Request Rate by Method
# Breakdown by HTTP method
sum by(method) (rate(http_requests_total{method=~"GET|POST|PUT|DELETE"}[5m]))

# HTTP Request Rate by Status Code
# Shows 2xx, 3xx, 4xx, 5xx breakdown
sum by(status_code) (rate(http_requests_total[5m]))

# HTTP Error Rate (percentage)
# Percentage of 4xx and 5xx responses
(sum(rate(http_requests_total{status_code=~"4..|5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100

# 95th Percentile Response Time
# 95% of requests complete within this time
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))

# 99th Percentile Response Time by Route
# Slowest routes identification
histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))

# Average Response Time
# Mean response time across all requests
rate(http_request_duration_seconds_sum[5m]) / rate(http_request_duration_seconds_count[5m])

# Request Rate by Route
# Most frequently accessed endpoints
sum by(route) (rate(http_requests_total[5m]))

# Database Connection Pool Usage
# Database connection pool utilization
(hikaricp_connections_active / hikaricp_connections_max) * 100

# Database Query Performance
# Average query duration by type
avg by(query_type) (rate(database_query_duration_seconds_sum[5m]) / rate(database_query_duration_seconds_count[5m]))

# Cache Hit Rate
# Cache effectiveness
(rate(cache_hits_total[5m]) / (rate(cache_hits_total[5m]) + rate(cache_misses_total[5m]))) * 100

# 3. Business Metrics and KPIs
# ============================

# User Registration Rate (per hour)
# New user signups
rate(user_registrations_total[1h]) * 3600

# Order Processing Rate (per minute)
# Orders being processed
rate(orders_processed_total[5m]) * 60

# Revenue per Minute
# Real-time revenue tracking
rate(order_revenue_total[5m])

# Shopping Cart Abandonment Rate
# Percentage of carts not completed
(1 - (rate(checkouts_completed_total[1h]) / rate(cart_created_total[1h]))) * 100

# Active User Sessions
# Currently logged-in users
active_user_sessions

# Conversion Rate (percentage)
# From visit to purchase
(sum(rate(purchases_total[1h])) / sum(rate(page_views_total{page="landing"}[1h]))) * 100

# Average Order Value
# Revenue per order
rate(order_revenue_total[1h]) / rate(orders_completed_total[1h])

# Customer Retention Rate
# Percentage of customers returning
rate(returning_customers_total[24h]) / rate(total_customers_total[24h]) * 100

# Feature Usage Rate
# How often features are used
rate(feature_usage_total{feature=~".*"}[1h])

# 4. Container and Kubernetes Monitoring
# =======================================

# Container CPU Usage
# CPU usage per container
rate(container_cpu_usage_seconds_total{container!="",container!="POD"}[5m]) * 100

# Container Memory Usage
# Memory usage per container
(container_memory_usage_bytes{container!="",container!="POD"} / container_spec_memory_limit_bytes) * 100

# Container Network I/O
# Network traffic per container
rate(container_network_receive_bytes_total[5m])
rate(container_network_transmit_bytes_total[5m])

# Pod Status Distribution
# Count of pods by status
sum by(phase) (kube_pod_status_phase)

# Node Ready Status
# Kubernetes cluster node availability
sum by(node) (kube_node_status_condition{condition="Ready",status="true"})

# Pending Pods Count
# Pods waiting to be scheduled
sum(kube_pod_status_phase{phase="Pending"})

# HPA (Horizontal Pod Autoscaler) Current Replicas
# Autoscaling status
kube_hpa_status_current_replicas

# 5. Advanced Alerting Queries
# =============================

# High CPU Usage (>90% for 5 minutes)
# CPU utilization alert
100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90

# Memory Pressure (>95% for 3 minutes)
# Memory usage alert
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 95

# Disk Space Warning (>85% usage)
# Disk usage alert
(1 - (node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"})) * 100 > 85

# Application Down
# Service availability alert
up{job="myapp"} == 0

# High Error Rate (>5% 5xx responses)
# Application error rate alert
(sum(rate(http_requests_total{status_code=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))) * 100 > 5

# Slow Response Time (>2 seconds 95th percentile)
# Performance degradation alert
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2

# Database Connection Exhaustion
# Database resource alert
(hikaricp_connections_active / hikaricp_connections_max) > 0.9

# 6. Capacity Planning and Forecasting
# ======================================

# Disk Usage Growth (per day)
# How much disk space is consumed per day (positive value = shrinking free space)
-delta(node_filesystem_avail_bytes{fstype!="tmpfs"}[1d])

# Memory Growth Trend (per hour)
# How much memory usage grows per hour (positive value = shrinking available memory)
-delta(node_memory_MemAvailable_bytes[1h])

# CPU Usage Prediction (next hour)
# Linear prediction of CPU usage one hour ahead, based on the last hour of samples
predict_linear((100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100))[1h:1m], 3600)

# Request Growth Rate (per day)
# Traffic growth prediction
increase(http_requests_total[1d])

# Disk Space Prediction (filesystems about to fill up)
# Filesystems predicted to run out of space within the next 24 hours
predict_linear(node_filesystem_avail_bytes{fstype!="tmpfs"}[6h], 24 * 3600) < 0

# 7. SLA and SLO Monitoring
# ==========================

# Service Availability (99.9% SLA)
# Uptime percentage calculation
(1 - sum(rate(http_requests_total{status_code=~"5.."}[1d])) / sum(rate(http_requests_total[1d]))) * 100

# Response Time SLO (95% < 500ms)
# Performance SLO compliance
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[1d])) by (le, service)) < 0.5

# Error Budget Remaining (percentage, assuming a 99.9% availability SLO)
# Share of the monthly 0.1% error budget that has not yet been consumed
100 * (1 - (sum(rate(http_requests_total{status_code=~"5.."}[30d])) / sum(rate(http_requests_total[30d]))) / 0.001)

# Request Rate SLO (>1000 req/s)
# Capacity SLO
sum(rate(http_requests_total[5m])) > 1000

# 8. Correlation and Complex Queries
# ====================================

# CPU vs Memory Correlation
# Find instances with high CPU AND memory usage
(100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) *
avg by(instance) ((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100) > 5000

# Find Slow Queries with High Rate
# Performance bottleneck identification
rate(http_request_duration_seconds_count[5m]) > 10 and
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1

# Error Rate with High Traffic
# Critical issues affecting many users
rate(http_requests_total{status_code=~"5.."}[5m]) > 1 and
rate(http_requests_total[5m]) > 100

# Instance Performance Score
# Combined performance metric
(
  (100 - avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) * 0.3 +
  avg by(instance) ((1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100) * 0.3 +
  avg by(instance) ((1 - node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100) * 0.4
)

# 9. Time Series Analysis
# ========================

# Moving Average (smooths fluctuations)
# 1-hour moving average of request rate
avg_over_time(rate(http_requests_total[1m])[1h:1m])

# Rate of Change (identifies anomalies)
# How quickly metrics are changing
deriv(rate(http_requests_total[5m])[5m:1m])

# Standard Deviation (detects volatility)
# Request rate variability
stddev_over_time(rate(http_requests_total[1m])[1h:1m])

# Year-over-Year Comparison
# Compare current period with previous year
rate(http_requests_total[1d]) / rate(http_requests_total[1d] offset 365d)

# 10. Custom Business Logic Examples
# ====================================

# Revenue per User
# Business efficiency metric
rate(order_revenue_total[1h]) / rate(active_users_total[1h])

# Customer Lifetime Value Prediction
# Using historical data to predict CLV
rate(order_revenue_total[30d]) *
(rate(returning_customers_total[30d]) / rate(total_customers_total[30d]))

# API Rate Limit Utilization
# How close to rate limits
(rate(api_requests_total[5m]) / api_rate_limit) * 100

# Geographical Performance Distribution
# Performance by region
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, region))

# Cost per Transaction
# Operational efficiency
(rate(infrastructure_cost_total[1h]) / rate(transactions_processed_total[1h]))

# Batch Job Success Rate
# Cron job reliability
(rate(batch_job_success_total[1h]) / (rate(batch_job_success_total[1h]) + rate(batch_job_failure_total[1h]))) * 100

# Cache Warming Effectiveness
# How well cache is being warmed
(rate(cache_hits_total{source="warm"}[5m]) / rate(cache_hits_total[5m])) * 100

# Database Query Performance by Table
# Identify slow tables
avg by(table) (rate(database_query_duration_seconds_sum[5m]) / rate(database_query_duration_seconds_count[5m]))

# External API Dependency Health
# Third-party service reliability
(sum(rate(external_api_requests_total{status_code=~"2.."}[5m])) / sum(rate(external_api_requests_total[5m]))) * 100

# Real-time Concurrent Users
# User engagement metric
sum by(service) (concurrent_user_sessions)

# Conversion Funnel Analysis
# Track user journey through application
sum(rate(page_views_total{page=~"^(home|product|cart|checkout|success)$"}[5m])) by (page)

# These queries can be used in:
# - Grafana dashboards for visualization
# - Prometheus alerting rules
# - Performance analysis
# - Capacity planning
# - Business intelligence
# - SLA/SLO monitoring
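
# 11. Recording Rules for Dashboards and Alerts
# ==============================================
# Heavier queries (error ratios, latency percentiles, SLO ratios) are often precomputed
# as recording rules so that Grafana panels and alert expressions stay cheap.
# A minimal sketch, assuming the file is listed under rule_files in prometheus.yml;
# the rule names follow the level:metric:operations convention but are otherwise arbitrary.

# recording_rules.yml (excerpt)
groups:
  - name: http_recording_rules
    interval: 30s
    rules:
      - record: job:http_requests:rate5m
        expr: sum by(job) (rate(http_requests_total[5m]))
      - record: job:http_errors:ratio5m
        expr: |
          sum by(job) (rate(http_requests_total{status_code=~"5.."}[5m]))
          /
          sum by(job) (rate(http_requests_total[5m]))
      - record: job:http_request_duration_seconds:p95_5m
        expr: histogram_quantile(0.95, sum by(job, le) (rate(http_request_duration_seconds_bucket[5m])))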