Datadog Monitoring Samples

Comprehensive Datadog monitoring and observability configuration examples covering APM, log management, metrics, dashboards, and alerting

💻 Datadog APM Configuration javascript

🟡 intermediate ⭐⭐⭐

Complete Datadog APM setup for application performance monitoring with custom spans and metrics

⏱️ 25 min 🏷️ datadog, apm, monitoring, javascript, node.js
Prerequisites: Node.js, npm, Datadog account
// Datadog APM Configuration Example
// Install: npm install dd-trace

// Important: initialize the tracer before requiring any instrumented modules
const tracer = require('dd-trace')

// Initialize tracer with configuration
tracer.init({
  service: 'my-web-app',
  env: process.env.NODE_ENV || 'development',
  version: process.env.APP_VERSION || '1.0.0',
  // Inject trace IDs into application logs
  logInjection: true,
  // Sample rate for traces (0 to 1)
  sampleRate: 1.0,
  // Enable runtime metrics (reported over DogStatsD)
  runtimeMetrics: true,
  // Traces are sent to the local Datadog Agent, not directly to Datadog;
  // override the default http://localhost:8126 if the Agent runs elsewhere
  url: process.env.DD_TRACE_AGENT_URL || 'http://localhost:8126'
})

// Custom span example
function processUserRequest(userId, action) {
  const startTime = Date.now()
  const span = tracer.startSpan('user.process_request', {
    tags: {
      'resource.name': action,
      'user.id': userId,
      'action.type': action
    }
  })

  try {
    // Simulate processing
    const result = performBusinessLogic(userId, action)

    // Add custom tags
    span.setTag('process.success', true)
    span.setTag('process.duration_ms', Date.now() - startTime)

    return result
  } catch (error) {
    span.setTag('process.success', false)
    span.setTag('error.message', error.message)
    throw error
  } finally {
    span.finish()
  }
}

// Distributed tracing example
async function processOrder(orderId) {
  const span = tracer.startSpan('order.process', {
    tags: {
      'resource.name': 'order_processing',
      'order.id': orderId
    }
  })

  try {
    // Step 1: Validate order
    const validateSpan = tracer.startSpan('order.validate', {
      childOf: span
    })
    await validateOrder(orderId)
    validateSpan.finish()

    // Step 2: Process payment
    const paymentSpan = tracer.startSpan('order.payment', {
      childOf: span,
      tags: { 'payment.provider': 'stripe' }
    })
    await processPayment(orderId)
    paymentSpan.finish()

    // Step 3: Update inventory
    const inventorySpan = tracer.startSpan('order.inventory', {
      childOf: span
    })
    await updateInventory(orderId)
    inventorySpan.finish()

    span.setTag('order.status', 'completed')
    return { success: true, orderId }
  } catch (error) {
    span.setTag('order.status', 'failed')
    span.setTag('error.message', error.message)
    throw error
  } finally {
    span.finish()
  }
}
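
// Note: the manual child spans above are never finished if an earlier step
// throws. dd-trace's tracer.trace() helper handles finishing and error
// tagging automatically (and finishes when a returned promise settles);
// a minimal sketch of the validate step using it:
async function validateOrderTraced(orderId) {
  return tracer.trace('order.validate', { resource: 'order_validation' }, () => {
    return validateOrder(orderId)
  })
}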

// Custom metrics example
// dd-trace does not ship a public metrics client; a common choice is the
// hot-shots DogStatsD client (npm install hot-shots)
const StatsD = require('hot-shots')
const dogstatsd = new StatsD({ host: 'localhost', port: 8125 })

// Increment counter
dogstatsd.increment('api.requests.total', 1, ['method:GET', 'endpoint:/api/users'])

// Gauge value
dogstatsd.gauge('database.connections.active', activeConnections)

// Histogram
dogstatsd.histogram('api.response.time', responseTime, ['endpoint:/api/users'])

// Timing: measure a query and report the elapsed milliseconds
async function timedQuery() {
  const start = Date.now()
  const rows = await database.query('SELECT * FROM users')
  dogstatsd.timing('database.query.time', Date.now() - start)
  return rows
}

// Express.js integration example
const express = require('express')
const app = express()

// Custom middleware for extra request tags (dd-trace also instruments
// Express automatically when initialized before express is required)
app.use((req, res, next) => {
  const startTime = Date.now()
  const span = tracer.startSpan('express.request', {
    tags: {
      'resource.name': `${req.method} ${req.path}`,
      'http.method': req.method,
      'http.url': req.url,
      'http.user_agent': req.get('User-Agent')
    }
  })

  res.on('finish', () => {
    span.setTag('http.status_code', res.statusCode)
    span.setTag('http.response_time_ms', Date.now() - startTime)
    span.finish()
  })

  next()
})

app.get('/api/users', async (req, res) => {
  // The active span may be null if no trace is in progress, so guard access
  const span = tracer.scope().active()

  try {
    span?.setTag('operation.name', 'get_users')

    const users = await database.getUsers()
    span?.setTag('users.count', users.length)

    res.json({ users })
  } catch (error) {
    span?.setTag('error.message', error.message)
    res.status(500).json({ error: 'Internal server error' })
  }
})

// Start server
const port = process.env.PORT || 3000
app.listen(port, () => {
  console.log(`Server running on port ${port}`)
})
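
// Alternative: in containerized deployments the same settings are usually
// supplied through dd-trace's standard environment variables
// (DD_SERVICE, DD_ENV, DD_VERSION, DD_TRACE_SAMPLE_RATE, DD_LOGS_INJECTION),
// leaving the code to a zero-argument init:
// require('dd-trace').init()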

💻 Datadog Log Management yaml

🟡 intermediate ⭐⭐⭐

Log collection, parsing, and management configuration for centralized logging

⏱️ 30 min 🏷️ datadog, logs, logging, monitoring
Prerequisites: Datadog account, Basic logging concepts
# Datadog Agent Configuration for Log Management
# Location: /etc/datadog-agent/datadog.yaml

# Enable log collection (top-level key in datadog.yaml)
logs_enabled: true

# Configure log processing
logs_config:
  # Collect logs from all discovered containers
  container_collect_all: true
  # Automatically detect and regroup multi-line logs such as stack traces
  auto_multi_line_detection: true
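  # Global processing rules (documented under logs_config) can scrub or drop
  # logs before upload; the pattern below is an illustrative assumption
  processing_rules:
    - type: mask_sequences
      name: mask_api_keys
      replace_placeholder: "[REDACTED]"
      pattern: "api_key=\\w+"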

# Python application logging with Datadog
"""
requirements.txt:
ddtrace>=1.0.0
datadog>=0.44.0
"""

import logging
import os

import ddtrace
from datadog import initialize, statsd

# Initialize the Datadog client (keys read from the environment, not hardcoded)
initialize(
    api_key=os.getenv('DD_API_KEY'),
    app_key=os.getenv('DD_APP_KEY'),
    statsd_host='localhost',
    statsd_port=8125,
)

# Configure logger with Datadog formatter
class DatadogFormatter(logging.Formatter):
    def format(self, record):
        # Add DD trace context
        span = ddtrace.tracer.current_span()
        if span:
            record.dd = {
                'trace_id': span.trace_id,
                'span_id': span.span_id,
                'service': span.service
            }

        # Add custom fields
        record.environment = 'production'
        record.version = '1.0.0'

        return super().format(record)

# Set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(DatadogFormatter(
    '%(asctime)s %(levelname)s %(name)s %(message)s'
))
logger.addHandler(handler)

# Example usage
def process_user_data(user_id):
    logger.info(f"Processing user data", extra={
        'user_id': user_id,
        'operation': 'process_user_data',
        'environment': 'production'
    })

    try:
        # Business logic here
        result = perform_operation(user_id)
        logger.info("User data processed successfully", extra={
            'user_id': user_id,
            'result_count': len(result)
        })
        return result
    except Exception as e:
        logger.error(f"Failed to process user data: {e}", extra={
            'user_id': user_id,
            'error_type': type(e).__name__
        })
        raise
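
# Datadog parses JSON logs automatically; a common setup (assuming the
# python-json-logger package: pip install python-json-logger) emits each
# record as a single JSON line:
from pythonjsonlogger import jsonlogger

json_handler = logging.StreamHandler()
json_handler.setFormatter(jsonlogger.JsonFormatter(
    '%(asctime)s %(levelname)s %(name)s %(message)s'
))
logging.getLogger('json').addHandler(json_handler)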

# Node.js structured logging with winston
"""
package.json dependencies:
{
  "winston": "^3.8.0",
  "dd-trace": "^1.0.0",
  "winston-datadog-logs-transport": "^2.0.0"
}
"""

const winston = require('winston')
const tracer = require('dd-trace')
const { DatadogLogsTransport } = require('winston-datadog-logs-transport')

// Configure winston logger with Datadog transport
const logger = winston.createLogger({
  level: 'info',
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.errors({ stack: true }),
    winston.format.json(),
    winston.format.metadata({
      fillExcept: ['message', 'level', 'timestamp', 'label']
    })
  ),
  transports: [
    // Console transport
    new winston.transports.Console({
      format: winston.format.simple()
    }),
    // Datadog log intake over HTTPS (per Datadog's winston setup docs)
    new winston.transports.Http({
      host: 'http-intake.logs.datadoghq.com',
      path: `/api/v2/logs?dd-api-key=${process.env.DD_API_KEY}&ddsource=nodejs&service=my-web-app&ddtags=env:production,version:1.0.0`,
      ssl: true
    })
  ]
})

// Add trace context to logs
function addTraceContext(metadata = {}) {
  const span = tracer.scope().active()
  if (span) {
    return {
      ...metadata,
      dd: {
        trace_id: span.context().toTraceId(),
        span_id: span.context().toSpanId()
      }
    }
  }
  return metadata
}

// Usage example
function handleRequest(req, res) {
  logger.info('Request received', addTraceContext({
    method: req.method,
    url: req.url,
    userAgent: req.get('User-Agent')
  }))

  try {
    // Process request
    const result = processRequest(req)
    logger.info('Request processed successfully', addTraceContext({
      result_count: result.length
    }))
    res.json(result)
  } catch (error) {
    logger.error('Request processing failed', addTraceContext({
      error: error.message,
      stack: error.stack
    }))
    res.status(500).json({ error: 'Internal server error' })
  }
}

# Docker configuration for log collection
# docker-compose.yml
version: '3.8'
services:
  app:
    image: my-app:latest
    environment:
      - DD_API_KEY=${DD_API_KEY}
      - DD_SERVICE=my-app
      - DD_ENV=production
      - DD_VERSION=1.0.0
      - DD_LOGS_INJECTION=true
    labels:
      # Autodiscovery label value must be a valid JSON string
      com.datadoghq.ad.logs: '[{"source": "app", "service": "my-app"}]'
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"

  datadog-agent:
    image: gcr.io/datadoghq/agent:latest
    environment:
      - DD_API_KEY=${DD_API_KEY}
      - DD_SITE=datadoghq.com
      - DD_LOGS_ENABLED=true
      - DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL=true
      - DD_CONTAINER_EXCLUDE=name:datadog-agent
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /proc/:/host/proc/:ro
      - /sys/fs/cgroup/:/host/sys/fs/cgroup:ro
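
# The same autodiscovery convention works on Kubernetes through pod
# annotations (ad.datadoghq.com/<container>.logs); a minimal sketch:
apiVersion: v1
kind: Pod
metadata:
  name: my-app
  annotations:
    ad.datadoghq.com/app.logs: '[{"source": "app", "service": "my-app"}]'
spec:
  containers:
    - name: app
      image: my-app:latest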

💻 Datadog Dashboards json

🟡 intermediate ⭐⭐⭐⭐

Create and configure comprehensive monitoring dashboards with widgets and graphs

⏱️ 35 min 🏷️ datadog, dashboards, monitoring, visualization
Prerequisites: Datadog account, Dashboard concepts
{
  "title": "Application Performance Dashboard",
  "description": "Real-time monitoring of application performance and health",
  "widgets": [
    {
      "id": 1,
      "title": "Request Rate",
      "definition": {
        "type": "timeseries",
        "requests": [
          {
            "q": "avg:flask.request.count{*}.rollup(sum, 60)",
            "display_type": "line",
            "style": {
              "palette": "dog_classic",
              "line_type": "solid",
              "line_width": "normal"
            }
          }
        ],
        "yaxis": {
          "scale": "linear",
          "label": "Requests per second",
          "min": "auto",
          "max": "auto"
        }
      }
    },
    {
      "id": 2,
      "title": "Response Time",
      "definition": {
        "type": "timeseries",
        "requests": [
          {
            "q": "avg:flask.request.duration{*}.rollup(avg, 60)",
            "display_type": "line",
            "style": {
              "palette": "warm"
            }
          }
        ],
        "yaxis": {
          "scale": "linear",
          "label": "Response time (ms)",
          "min": "auto"
        }
      }
    },
    {
      "id": 3,
      "title": "Error Rate",
      "definition": {
        "type": "timeseries",
        "requests": [
          {
            "q": "sum:flask.request.errors{*}.rollup(sum, 60) / sum:flask.request.count{*}.rollup(sum, 60) * 100",
            "display_type": "line",
            "style": {
              "palette": "critical"
            }
          }
        ],
        "yaxis": {
          "scale": "linear",
          "label": "Error rate (%)",
          "min": 0,
          "max": 100
        }
      }
    },
    {
      "id": 4,
      "title": "Active Users",
      "definition": {
        "type": "timeseries",
        "requests": [
          {
            "q": "avg:web.active_users{*}.rollup(avg, 300)",
            "display_type": "area",
            "style": {
              "palette": "orange"
            }
          }
        ]
      }
    },
    {
      "id": 5,
      "title": "Database Connections",
      "definition": {
        "type": "query_value",
        "requests": [
          {
            "q": "avg:postgres.connections{*}",
            "display_type": "scalar",
            "conditional_formats": [
              {
                "comparator": ">",
                "value": 80,
                "palette": "white_on_red"
              },
              {
                "comparator": ">",
                "value": 60,
                "palette": "white_on_yellow"
              }
            ]
          }
        ]
      }
    },
    {
      "id": 6,
      "title": "CPU Usage by Service",
      "definition": {
        "type": "timeseries",
        "requests": [
          {
            "q": "avg:system.cpu.total{*} by {service}",
            "display_type": "stacked",
            "style": {
              "palette": "purple"
            }
          }
        ]
      }
    },
    {
      "id": 7,
      "title": "Memory Usage",
      "definition": {
        "type": "timeseries",
        "requests": [
          {
            "q": "avg:system.mem.total{*} - avg:system.mem.usable{*}",
            "display_type": "area",
            "style": {
              "palette": "blue"
            }
          }
        ]
      }
    },
    {
      "id": 8,
      "title": "Top 10 Slowest Endpoints",
      "definition": {
        "type": "table",
        "requests": [
          {
            "q": "top(avg:flask.request.duration{*} by {resource_name}, 10, 'mean', 'desc')",
            "display_type": "table"
          }
        ],
        "style": {
          "palette": "blue"
        }
      }
    },
    {
      "id": 9,
      "title": "API Response Status Codes",
      "definition": {
        "type": "toplist",
        "requests": [
          {
            "q": "sum:flask.request.count{*} by {http.status_code}",
            "display_type": "categorical"
          }
        ]
      }
    },
    {
      "id": 10,
      "title": "Error Logs",
      "definition": {
        "type": "log_stream",
        "requests": [
          {
            "query": "status:error",
            "columns": ["host", "service", "error.stack"],
            "sort": {
              "order": "desc",
              "column": "timestamp"
            },
            "message_display": "expanded"
          }
        ],
        "log_query": "source:python status:error",
        "event_size": "l",
        "title": "Error Logs",
        "title_align": "left",
        "title_size": "16"
      }
    }
  ],
  "layout_type": "ordered",
  "description": "Comprehensive application monitoring dashboard",
  "notify_list": [],
  "template_variables": [
    {
      "name": "env",
      "prefix": null,
      "default": "production"
    },
    {
      "name": "service",
      "prefix": null,
      "default": "my-web-app"
    }
  ],
  "is_read_only": false,
  "created_at": "2025-12-11T00:00:00.000Z",
  "modified_at": "2025-12-11T00:00:00.000Z"
}
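
# A dashboard JSON like the one above can be pushed through the Dashboards
# API; a minimal sketch using Python requests (reads the JSON from a file):
import json
import os

import requests

with open('dashboard.json') as f:
    payload = json.load(f)

resp = requests.post(
    'https://api.datadoghq.com/api/v1/dashboard',
    headers={
        'DD-API-KEY': os.environ['DD_API_KEY'],
        'DD-APPLICATION-KEY': os.environ['DD_APP_KEY'],
    },
    json=payload,
)
resp.raise_for_status()
print(resp.json()['url'])  # URL of the created dashboard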

# Terraform configuration for dashboard management
# dashboard.tf
resource "datadog_dashboard" "application_dashboard" {
  title = "Application Performance Dashboard"
  description = "Real-time monitoring of application performance"

  template_variable {
    name = "env"
    default = "production"
  }

  template_variable {
    name = "service"
    default = "my-web-app"
  }

  widget {
    timeseries_definition {
      title = "Request Rate"
      request {
        q = "avg:flask.request.count{*}.rollup(sum, 60)"
        display_type = "line"
      }
    }
  }

  widget {
    timeseries_definition {
      title = "Response Time"
      request {
        q = "avg:flask.request.duration{*}.rollup(avg, 60)"
        display_type = "line"
      }
    }
  }

  widget {
    query_value_definition {
      title = "Error Rate"
      request {
        q = "sum:flask.request.errors{*}.rollup(sum, 60) / sum:flask.request.count{*}.rollup(sum, 60) * 100"
      }
      autoscale = true
      precision = 2
    }
  }
}
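
# The resource above assumes the Datadog Terraform provider is configured;
# a minimal provider block (keys may also come from DD_API_KEY / DD_APP_KEY
# environment variables):
terraform {
  required_providers {
    datadog = {
      source = "DataDog/datadog"
    }
  }
}

provider "datadog" {
  api_key = var.datadog_api_key
  app_key = var.datadog_app_key
}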

// Custom widget configuration
const dashboardConfig = {
  widgets: [
    // Service Map widget
    {
      id: 11,
      title: "Service Dependencies",
      definition: {
        "type": "servicemap",
        "service": "my-web-app",
        "filters": ["env:production"]
      }
    },
    // Heatmap widget
    {
      id: 12,
      title: "Response Time Distribution",
      definition: {
        "type": "heatmap",
        "requests": [
          {
            "q": "avg:flask.request.duration{*} by {resource_name}.rollup(avg, 60)",
            "display_type": "heatmap"
          }
        ],
        "yaxis": {
          "scale": "log"
        }
      }
    },
    // Change widget
    {
      id: 13,
      title: "Deployment Changes",
      definition: {
        "type": "change",
        "requests": [
          {
            "q": "avg:flask.request.duration{*} before(last:30m, last:24h)",
            "compare_to": "week_before"
          }
        ]
      }
    }
  ]
}

💻 Datadog Alerts and Monitors yaml

🔴 complex ⭐⭐⭐⭐

Configure intelligent alerts and monitors for proactive monitoring and incident response

⏱️ 45 min 🏷️ datadog, alerts, monitors, incident-response
Prerequisites: Datadog account, Monitoring concepts, Alerting best practices
# Datadog Monitor Configuration Examples
# (manifests below follow the Datadog Operator's DatadogMonitor CRD)

# 1. High Error Rate Monitor
apiVersion: datadoghq.com/v1alpha1
kind: DatadogMonitor
metadata:
  name: high-error-rate
spec:
  name: "[Production] High Error Rate Alert"
  type: "query alert"
  query: "avg(last_15m):avg:flask.request.errors{*}.rollup(sum, 60) / avg:flask.request.count{*}.rollup(sum, 60) * 100 > 5"
  message: |
    🚨 High error rate detected!

    Error rate: {{value}}%
    Threshold: 5%
    Duration: 15 minutes

    Service: {{service.name}}
    Environment: {{env.name}}

    @pagerduty-service-escalation
    @slack-alerts
  options:
    thresholds:
      critical: 5
      warning: 3
    notify_audit: false
    locked: false
    timeout_h: 0
    require_full_window: true
    new_host_delay: 300
    notify_no_data: false
    renotify_interval: 0
    escalation_message: ""
    include_tags: true
  tags: ["env:production", "team:backend", "severity:critical"]

# 2. High Response Time Monitor
apiVersion: datadoghq.com/v1alpha1
kind: DatadogMonitor
metadata:
  name: high-response-time
spec:
  name: "[Production] High Response Time Alert"
  type: "query alert"
  query: "avg(last_10m):avg:flask.request.duration{*}.rollup(avg, 60) > 1000"
  message: |
    ⚠️ High response time detected!

    Current average: {{value}}ms
    Threshold: 1000ms
    Duration: 10 minutes

    Service: {{service.name}}

    Monitor: {{#is_alert}}Response time is critically high{{/is_alert}}
             {{#is_warning}}Response time is elevated{{/is_warning}}
             {{#is_recovery}}Response time has recovered{{/is_recovery}}
  options:
    thresholds:
      critical: 1000
      warning: 500
    notify_audit: true
    require_full_window: false
  tags: ["env:production", "team:backend", "performance"]

# 3. Database Connection Monitor
apiVersion: datadoghq.com/v1alpha1
kind: DatadogMonitor
metadata:
  name: database-connections
spec:
  name: "[Production] Database Connections High"
  type: "query alert"
  query: "avg(last_5m):avg:postgres.connections{*} > 80"
  message: |
    🗄️ Database connection pool is nearly full!

    Connections: {{value}}
    Max connections: 100

    Server: {{host.name}}
    Database: postgres

    Action required: Check for connection leaks or consider scaling database.
    @database-team
  options:
    thresholds:
      critical: 80
      warning: 70
    require_full_window: false
    evaluation_delay: 300
  tags: ["env:production", "database", "postgres", "team:backend"]

# 4. Memory Usage Monitor
apiVersion: datadoghq.com/v1alpha1
kind: DatadogMonitor
metadata:
  name: memory-usage
spec:
  name: "[Production] High Memory Usage"
  type: "metric alert"
  query: "avg(last_10m):(avg:system.mem.total{*} - avg:system.mem.usable{*}) / avg:system.mem.total{*} * 100 > 85"
  message: |
    💾 High memory usage detected!

    Memory usage: {{value}}%
    Threshold: 85%
    Duration: 10 minutes

    Host: {{host.name}}
    Service: {{service.name}}

    @ops-team
  options:
    thresholds:
      critical: 85
      warning: 75
    notify_audit: true
    require_full_window: true
  tags: ["env:production", "infrastructure", "memory"]

# 5. Custom Service Health Monitor
apiVersion: datadoghq.com/v1alpha1
kind: DatadogMonitor
metadata:
  name: service-health
spec:
  name: "[Production] Service Health Check"
  type: "service check"
  query: ""my_app.health_check".over("env:production").by("service").last(2).count_by_status()"
  message: |
    🏥 Service health check failing!

    Service: {{service.name}}
    Status: {{#is_alert}}CRITICAL{{/is_alert}}{{#is_warning}}WARNING{{/is_warning}}{{#is_recovery}}RECOVERED{{/is_recovery}}

    Last 2 checks: {{check_message}}

    @oncall
  options:
    thresholds:
      ok: 1
      critical: 1
    notify_audit: true
    renotify_interval: 15
  tags: ["env:production", "health-check", "team:backend"]

# 6. Anomaly Detection Monitor
apiVersion: datadoghq.com/v1alpha1
kind: DatadogMonitor
metadata:
  name: traffic-anomaly
spec:
  name: "[Production] Traffic Anomaly Detection"
  type: "anomaly alert"
  query: "avg(last_1h):anomalies(avg:flask.request.count{*}.rollup(sum, 60), 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true')"
  message: |
    📊 Traffic anomaly detected!

    Current requests/sec: {{value}}
    Expected range: within 2 standard deviations of the historical baseline

    Time window: last 15 minutes
    Historical baseline: last 1 hour

    This might indicate:
    - Sudden traffic spike
    - Load testing
    - Potential attack

    @ops-team @slack-alerts
  options:
    threshold_windows:
      trigger_window: "last_15m"
      recovery_window: "last_15m"
    notify_no_data: false
  tags: ["env:production", "anomaly", "traffic", "team:backend"]

# 7. Composite Monitor (Multi-Condition)
apiVersion: datadoghq.com/v1alpha1
kind: DatadogMonitor
metadata:
  name: composite-health
spec:
  name: "[Production] Composite System Health"
  type: "composite"
  query: "avg(last_15m):avg:flask.request.duration{*} < 500 and avg(last_15m):(sum:flask.request.errors{*}.rollup(sum, 60) / sum:flask.request.count{*}.rollup(sum, 60) * 100) < 1"
  message: |
    🔍 Composite system health status:

    {{#is_alert}}⚠️ SYSTEM UNHEALTHY{{/is_alert}}
    {{#is_warning}}⚠️ SYSTEM DEGRADED{{/is_warning}}
    {{#is_recovery}}✅ SYSTEM HEALTHY{{/is_recovery}}

    Conditions:
    Conditions (from the underlying monitors):
    - Response time below 500ms
    - Error rate below 1%

    @oncall @ops-team
  options:
    require_full_window: false
  tags: ["env:production", "composite", "health", "team:backend"]

# Terraform configuration for monitors
# monitors.tf
resource "datadog_monitor" "high_error_rate" {
  name    = "[Production] High Error Rate Alert"
  type    = "query alert"
  query   = "avg(last_15m):avg:flask.request.errors{*}.rollup(sum, 60) / avg:flask.request.count{*}.rollup(sum, 60) * 100 > 5"

  message = <<EOF
🚨 High error rate detected!

Error rate: {{value}}%
Threshold: 5%
Duration: 15 minutes

Service: {{service.name}}
Environment: {{env.name}}

@pagerduty-service-escalation
@slack-alerts
EOF

  tags = ["env:production", "team:backend", "severity:critical"]

  monitor_thresholds {
    critical = 5
    warning  = 3
  }

  notify_no_data    = false
  renotify_interval = 60
}

resource "datadog_monitor" "database_connections" {
  name    = "[Production] Database Connections High"
  type    = "metric alert"
  query   = "avg(last_5m):avg:postgres.connections{*} > 80"

  message = <<EOF
🗄️ Database connection pool is nearly full!

Connections: {{value}}
Max connections: 100

Server: {{host.name}}
Database: postgres

Action required: Check for connection leaks or consider scaling database.
@database-team
EOF

  tags = ["env:production", "database", "postgres", "team:backend"]

  monitor_thresholds {
    critical = 80
    warning  = 70
  }
}

# Monitor with dynamic thresholds
resource "datadog_monitor" "adaptive_threshold" {
  name    = "[Production] Adaptive Response Time"
  type    = "anomaly alert"
  query   = "avg(last_1h):anomalies(avg:flask.request.duration{*}.rollup(avg, 60), 'basic', 2, direction='above', alert_window='last_15m', interval=60)"

  message = <<EOF
📊 Response time anomaly detected!

Current response time: {{value}}ms
Historical baseline: 1 hour data
Anomaly threshold: ±2 standard deviations

Service: {{service.name}}
Environment: {{env.name}}

@ops-team
EOF

  tags = ["env:production", "anomaly", "performance", "team:backend"]
}
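
# Planned maintenance can silence these monitors with a downtime resource;
# a sketch assuming the classic datadog_downtime resource (newer provider
# versions also offer datadog_downtime_schedule); timestamps are placeholders
resource "datadog_downtime" "deploy_window" {
  scope = ["env:production"]

  # Epoch seconds; in practice these would come from variables
  start = 1735689600
  end   = 1735693200

  message      = "Scheduled deploy window - alerts muted"
  monitor_tags = ["team:backend"]
}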