Elysia Tools
Navigation mobile
Monitoring & Observability
Exemples Datadog Monitoring
Exemples complets de configuration Datadog monitoring et observabilité incluant APM, logs, métriques et dashboards
Exemples
Entrées de cette collection
Configuration Datadog APM
Configuration complète Datadog APM pour le monitoring de performance d'application avec spans et métriques personnalisées
Difficulté
6/10
Temps estimé
25 min
Étiquettes
datadog, apm, monitoring, javascript, node.js
Prérequis
Node.js, npm, Datadog account
// Datadog APM Configuration Example
// Install: npm install dd-trace
const tracer = require('dd-trace')
// Initialize the tracer with its configuration. This has to run before any
// instrumented module (express, database drivers, ...) is required.
tracer.init({
  service: 'my-web-app',
  env: process.env.NODE_ENV || 'development',
  version: process.env.APP_VERSION || '1.0.0',
  // Inject trace and span ids into supported loggers
  logInjection: true,
  analytics: true,
  // Sample rate for traces (0 to 1)
  sampleRate: 1.0,
  // Enable runtime metrics
  runtimeMetrics: true,
  // NOTE(review): `healthCheckMetrics` does not appear among the documented
  // tracer options; confirm it has an effect before relying on it.
  healthCheckMetrics: true,
  // URL of the Datadog Agent that receives the traces. The tracer submits to
  // the Agent (port 8126 by default), not to a Datadog intake host, and takes
  // no authentication headers: the API key is configured on the Agent.
  url: process.env.DD_TRACE_AGENT_URL || 'http://localhost:8126'
})
// Custom span example
/**
 * Runs a business action for a user inside a custom `user.process_request` span.
 *
 * @param {*} userId - Identifier recorded in the `user.id` tag.
 * @param {*} action - Action name; recorded as the span resource and `action.type` tag.
 * @returns {*} Whatever `performBusinessLogic(userId, action)` returns.
 * @throws Rethrows any error raised by `performBusinessLogic` after tagging the span.
 */
function processUserRequest(userId, action) {
  // Take our own timestamp: the span object has no public `startTime`, so
  // `Date.now() - span.startTime` evaluates to NaN.
  const startedAt = Date.now()
  const span = tracer.startSpan('user.process_request', {
    tags: {
      // `resource` is not a startSpan option; the resource is set through its tag
      'resource.name': action,
      'user.id': userId,
      'action.type': action
    }
  })
  try {
    const result = performBusinessLogic(userId, action)
    span.setTag('process.success', true)
    return result
  } catch (error) {
    span.setTag('process.success', false)
    // Passing the error object marks the span as errored
    span.setTag('error', error)
    span.setTag('error.message', error.message)
    throw error
  } finally {
    // Record the duration for failures too, then always close the span
    span.setTag('process.duration_ms', Date.now() - startedAt)
    span.finish()
  }
}
// Distributed tracing example
/**
 * Processes an order in three sequential steps (validate, payment, inventory),
 * each traced as a child span of a parent `order.process` span.
 *
 * @param {*} orderId - Identifier of the order, recorded in the `order.id` tag.
 * @returns {Promise<{success: boolean, orderId: *}>} Resolves once all steps succeed.
 * @throws Rethrows the first step failure after tagging the parent span as failed.
 */
async function processOrder(orderId) {
  const span = tracer.startSpan('order.process', {
    tags: {
      // `resource` is not a startSpan option; the resource is set through its tag
      'resource.name': 'order_processing',
      'order.id': orderId
    }
  })
  try {
    // Step 1: Validate order
    await runOrderStep(span, 'order.validate', null, () => validateOrder(orderId))
    // Step 2: Process payment
    await runOrderStep(span, 'order.payment', { 'payment.provider': 'stripe' }, () => processPayment(orderId))
    // Step 3: Update inventory
    await runOrderStep(span, 'order.inventory', null, () => updateInventory(orderId))
    span.setTag('order.status', 'completed')
    return { success: true, orderId }
  } catch (error) {
    span.setTag('order.status', 'failed')
    span.setTag('error.message', error.message)
    throw error
  } finally {
    span.finish()
  }
}

/**
 * Runs one pipeline step inside a child span of `parent`. The child span is
 * finished whether the step resolves or rejects, so no span is left open.
 */
async function runOrderStep(parent, name, tags, step) {
  const options = tags ? { childOf: parent, tags } : { childOf: parent }
  const child = tracer.startSpan(name, options)
  try {
    return await step()
  } catch (error) {
    child.setTag('error', error)
    throw error
  } finally {
    child.finish()
  }
}
// Custom metrics example
// dd-trace exposes custom metrics through `tracer.dogstatsd`, which sends them
// to the Agent's DogStatsD endpoint. There is no `.metrics` export and no
// timer helper, so durations are measured by hand below.
const api = require('dd-trace').dogstatsd
// Increment counter (tags are passed as an object)
api.increment('api.requests.total', 1, { method: 'GET', endpoint: '/api/users' })
// Gauge value
api.gauge('database.connections.active', activeConnections)
// Histogram
api.histogram('api.response.time', responseTime, { endpoint: '/api/users' })
// Timer: report the query duration in milliseconds, even when the query fails
async function trackDatabaseQueryTime() {
  const startedAt = Date.now()
  try {
    await database.query('SELECT * FROM users')
  } finally {
    api.histogram('database.query.time', Date.now() - startedAt)
  }
}
// Express.js integration example
const express = require('express')
const app = express()
// Datadog middleware for Express
// Opens an `express.request` span per incoming request, closes it when the
// response has been sent, and makes it the active span for later handlers.
app.use((req, res, next) => {
  // Own timestamp: the span has no public `startTime`, so subtracting it gives NaN
  const startedAt = Date.now()
  const span = tracer.startSpan('express.request', {
    tags: {
      // `resource` is not a startSpan option; the resource is set through its tag
      'resource.name': `${req.method} ${req.path}`,
      'http.method': req.method,
      'http.url': req.url,
      'http.user_agent': req.get('User-Agent')
    }
  })
  res.on('finish', () => {
    span.setTag('http.status_code', res.statusCode)
    span.setTag('http.response_time_ms', Date.now() - startedAt)
    span.finish()
  })
  // Run the rest of the chain with this span active, so that
  // `tracer.scope().active()` in route handlers returns it.
  tracer.scope().activate(span, next)
})
// GET /api/users: returns all users and annotates the active span, if any.
app.get('/api/users', async (req, res) => {
  // `active()` returns null when no span is active (e.g. tracing disabled);
  // tagging must never turn a working request into a 500.
  const span = tracer.scope().active()
  try {
    if (span) {
      span.setTag('operation.name', 'get_users')
    }
    const users = await database.getUsers()
    if (span) {
      span.setTag('users.count', users.length)
    }
    res.json({ users })
  } catch (error) {
    if (span) {
      // Passing the error object marks the span as errored
      span.setTag('error', error)
      span.setTag('error.message', error.message)
    }
    res.status(500).json({ error: 'Internal server error' })
  }
})
// Start server
const port = process.env.PORT || 3000
app.listen(port, () => {
console.log(`Server running on port ${port}`)
})
Gestion des Logs Datadog
Configuration de collecte, parsing et gestion des logs pour le logging centralisé
Difficulté
6/10
Temps estimé
30 min
Étiquettes
datadog, logs, logging, monitoring
Prérequis
Datadog account, Basic logging concepts
# Datadog Agent Configuration for Log Management
# Location: /etc/datadog-agent/datadog.yaml
# Enable log collection. This is a top-level `logs_enabled` key, not a nested
# `logs: enabled:` mapping.
logs_enabled: true
# Log collection and processing settings
logs_config:
  # Collect logs from all discovered containers
  container_collect_all: true
  # Detect multi-line logs (stack traces, ...) automatically
  auto_multi_line_detection: true
  # NOTE(review): the three keys below are kept from the original listing but
  # were not found among the documented Agent settings. Verify them against
  # the Agent's config template before use.
  container_collect_using_logs_agent: true
  processing_workers: 2
  logs_max_limit: 256000
# Python application logging with Datadog
"""
requirements.txt:
ddtrace>=1.0.0
datadog>=0.44.0
"""
import logging
import ddtrace
from datadog import initialize, statsd
# Initialize Datadog
initialize({
'api_key': 'your-api-key',
'app_key': 'your-app-key',
'statsd_host': 'localhost',
'statsd_port': 8125
})
# Configure logger with Datadog formatter
class DatadogFormatter(logging.Formatter):
    """Formatter that attaches Datadog trace context and deployment fields.

    Before delegating to ``logging.Formatter.format`` it sets on the record:

    * ``dd``: ``trace_id``, ``span_id`` and ``service`` of the current ddtrace
      span, or ``0`` / ``0`` / ``None`` when no span is active.
    * ``environment`` and ``version``: defaults, applied only when the caller
      did not already supply them through ``extra=``.
    """

    def format(self, record):
        # Add DD trace context
        span = ddtrace.tracer.current_span()
        # Always set ``dd`` so every record has the same shape, whether or not
        # the log call happens inside a traced section.
        record.dd = {
            'trace_id': span.trace_id if span else 0,
            'span_id': span.span_id if span else 0,
            'service': span.service if span else None,
        }
        # Add custom fields without overwriting values passed via ``extra=``
        if not hasattr(record, 'environment'):
            record.environment = 'production'
        if not hasattr(record, 'version'):
            record.version = '1.0.0'
        return super().format(record)
# Set up logger
logger = logging.getLogger(__name__)
# Without an explicit level the logger inherits the root default (WARNING),
# so every logger.info() call in this module would be dropped silently.
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(DatadogFormatter(
    '%(asctime)s %(levelname)s %(name)s %(message)s'
))
logger.addHandler(handler)
# Example usage
def process_user_data(user_id):
    """Run ``perform_operation`` for a user, logging start, success and failure.

    Args:
        user_id: Identifier passed to ``perform_operation`` and attached to
            every log entry through ``extra``.

    Returns:
        The value returned by ``perform_operation(user_id)``.

    Raises:
        Exception: Any exception from the operation is logged and re-raised.
    """
    logger.info("Processing user data", extra={
        'user_id': user_id,
        'operation': 'process_user_data',
        'environment': 'production'
    })
    try:
        # Business logic here
        result = perform_operation(user_id)
        logger.info("User data processed successfully", extra={
            'user_id': user_id,
            'result_count': len(result)
        })
        return result
    except Exception as e:
        # logger.exception logs at ERROR level and attaches the traceback;
        # %s formatting is deferred to the logging framework.
        logger.exception("Failed to process user data: %s", e, extra={
            'user_id': user_id,
            'error_type': type(e).__name__
        })
        raise
// Node.js structured logging with winston
/*
package.json dependencies:
{
"winston": "^3.8.0",
"dd-trace": "^1.0.0",
"winston-datadog-logs-transport": "^2.0.0"
}
*/
const winston = require('winston')
const tracer = require('dd-trace')
const { DatadogLogsTransport } = require('winston-datadog-logs-transport')
// Configure winston logger with Datadog transport
// Application logger: JSON entries with timestamp, error stacks and a
// `metadata` bucket, written to the console and shipped to Datadog.
const logger = winston.createLogger({
  level: 'info',
  format: winston.format.combine(
    winston.format.timestamp(),
    winston.format.errors({ stack: true }),
    // metadata() must come before json(): json() is a finalizing format that
    // serializes the entry, so a metadata() placed after it cannot change
    // the serialized output.
    winston.format.metadata({
      fillExcept: ['message', 'level', 'timestamp', 'label']
    }),
    winston.format.json()
  ),
  transports: [
    // Console transport
    new winston.transports.Console({
      format: winston.format.simple()
    }),
    // Datadog logs transport
    new DatadogLogsTransport({
      apiKey: process.env.DD_API_KEY,
      service: 'my-web-app',
      hostname: process.env.HOSTNAME,
      env: process.env.NODE_ENV,
      // Source and tags attached to every shipped log entry
      ddsource: 'nodejs',
      ddtags: 'env:production,version:1.0.0'
    })
  ]
})
// Add trace context to logs
/**
 * Returns log metadata enriched with the ids of the currently active span.
 *
 * @param {object} [metadata={}] - Fields to log; never mutated.
 * @returns {object} `metadata` itself when no span is active, otherwise a
 *   shallow copy with a `dd: { trace_id, span_id }` entry added.
 */
function addTraceContext(metadata = {}) {
  const activeSpan = tracer.scope().active()
  // Nothing to correlate with: hand the caller's object back untouched
  if (!activeSpan) {
    return metadata
  }
  return {
    ...metadata,
    dd: {
      trace_id: activeSpan.context().toTraceId(),
      span_id: activeSpan.context().toSpanId()
    }
  }
}
// Usage example
/**
 * Handles a request: logs its arrival, runs `processRequest`, and replies with
 * the result as JSON. On any error it logs the failure and replies with a
 * generic 500 payload.
 *
 * @param {object} req - Request; `method`, `url` and `get('User-Agent')` are logged.
 * @param {object} res - Response; receives `json(result)` or `status(500).json(...)`.
 */
function handleRequest(req, res) {
  const requestInfo = {
    method: req.method,
    url: req.url,
    userAgent: req.get('User-Agent')
  }
  logger.info('Request received', addTraceContext(requestInfo))

  try {
    // Process request
    const result = processRequest(req)
    const summary = addTraceContext({ result_count: result.length })
    logger.info('Request processed successfully', summary)
    res.json(result)
  } catch (error) {
    const failure = addTraceContext({
      error: error.message,
      stack: error.stack
    })
    logger.error('Request processing failed', failure)
    res.status(500).json({ error: 'Internal server error' })
  }
}
# Docker configuration for log collection
# docker-compose.yml
version: '3.8'
services:
  # Application container; its logs are picked up by the Agent below
  app:
    image: my-app:latest
    environment:
      - DD_API_KEY=${DD_API_KEY}
      - DD_SERVICE=my-app
      - DD_ENV=production
      - DD_VERSION=1.0.0
      - DD_LOGS_INJECTION=true
    labels:
      # Mapping form with a single-quoted value keeps the JSON payload's
      # double quotes intact; nesting them in a double-quoted string is
      # invalid YAML.
      com.datadoghq.ad.logs: '[{"source": "app", "service": "my-app"}]'
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
  # Datadog Agent collecting logs from every container except itself
  datadog-agent:
    image: gcr.io/datadoghq/agent:latest
    environment:
      - DD_API_KEY=${DD_API_KEY}
      - DD_SITE=datadoghq.com
      - DD_LOGS_ENABLED=true
      - DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL=true
      # No quotes around the value: in list form they would become part of it
      - DD_AC_EXCLUDE=name:datadog-agent
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /proc/:/host/proc/:ro
      - /sys/fs/cgroup/:/host/sys/fs/cgroup:ro
Dashboards Datadog
Créer et configurer des dashboards de monitoring complets avec widgets et graphiques
Difficulté
7/10
Temps estimé
35 min
Étiquettes
datadog, dashboards, monitoring, visualization
Prérequis
Datadog account, Dashboard concepts
{
"title": "Application Performance Dashboard",
"description": "Real-time monitoring of application performance and health",
"widgets": [
{
"id": 1,
"title": "Request Rate",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:flask.request.count{*}.rollup(sum, 60)",
"display_type": "line",
"style": {
"palette": "dog_classic",
"line_type": "solid",
"line_width": "normal"
}
}
],
"yaxis": {
"scale": "linear",
"label": "Requests per second",
"min": "auto",
"max": "auto"
}
}
},
{
"id": 2,
"title": "Response Time",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:flask.request.duration{*}.rollup(avg, 60)",
"display_type": "line",
"style": {
"palette": "warm"
}
}
],
"yaxis": {
"scale": "linear",
"label": "Response time (ms)",
"min": "auto"
}
}
},
{
"id": 3,
"title": "Error Rate",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "sum:flask.request.errors{*}.rollup(sum, 60) / sum:flask.request.count{*}.rollup(sum, 60) * 100",
"display_type": "line",
"style": {
"palette": "critical"
}
}
],
"yaxis": {
"scale": "linear",
"label": "Error rate (%)",
"min": 0,
"max": 100
}
}
},
{
"id": 4,
"title": "Active Users",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:web.active_users{*}.rollup(avg, 300)",
"display_type": "area",
"style": {
"palette": "orange"
}
}
]
}
},
{
"id": 5,
"title": "Database Connections",
"definition": {
"type": "query_value",
"requests": [
{
"q": "avg:postgres.connections{*}",
"display_type": "scalar",
"conditional_formats": [
{
"comparator": ">",
"value": 80,
"palette": "white_on_red"
},
{
"comparator": ">",
"value": 60,
"palette": "white_on_yellow"
}
]
}
]
}
},
{
"id": 6,
"title": "CPU Usage by Service",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:system.cpu.total{*} by {service}",
"display_type": "stacked",
"style": {
"palette": "purple"
}
}
]
}
},
{
"id": 7,
"title": "Memory Usage",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:system.mem.total{*} - avg:system.mem.usable{*}",
"display_type": "area",
"style": {
"palette": "blue"
}
}
]
}
},
{
"id": 8,
"title": "Top 10 Slowest Endpoints",
"definition": {
"type": "table",
"requests": [
{
"q": "top(avg:flask.request.duration{*} by {resource_name}, 10, 'mean', 'desc')",
"display_type": "table"
}
],
"style": {
"palette": "blue"
}
}
},
{
"id": 9,
"title": "API Response Status Codes",
"definition": {
"type": "toplist",
"requests": [
{
"q": "sum:flask.request.count{*} by {http.status_code}",
"display_type": "categorical"
}
]
}
},
{
"id": 10,
"title": "Error Logs",
"definition": {
"type": "log_stream",
"requests": [
{
"query": "status:error",
"columns": ["host", "service", "error.stack"],
"sort": {
"order": "desc",
"column": "timestamp"
},
"message_display": "expanded"
}
],
"log_query": "source:python status:error",
"event_size": "l",
"title": "Error Logs",
"title_align": "left",
"title_size": "16"
}
}
],
"layout_type": "ordered",
"description": "Comprehensive application monitoring dashboard",
"notify_list": [],
"template_variables": [
{
"name": "env",
"prefix": null,
"default": "production"
},
{
"name": "service",
"prefix": null,
"default": "my-web-app"
}
],
"is_read_only": false,
"created_at": "2025-12-11T00:00:00.000Z",
"modified_at": "2025-12-11T00:00:00.000Z"
}
# Terraform configuration for dashboard management
# dashboard.tf
# Application dashboard with request rate, response time and error rate
# widgets, parameterized by the `env` and `service` template variables.
resource "datadog_dashboard" "application_dashboard" {
  title       = "Application Performance Dashboard"
  description = "Real-time monitoring of application performance"
  # Required by the provider; "ordered" matches the layout_type of the JSON
  # dashboard definition.
  layout_type = "ordered"

  template_variable {
    name    = "env"
    default = "production"
  }

  template_variable {
    name    = "service"
    default = "my-web-app"
  }

  widget {
    timeseries_definition {
      title = "Request Rate"
      request {
        q            = "avg:flask.request.count{*}.rollup(sum, 60)"
        display_type = "line"
      }
    }
  }

  widget {
    timeseries_definition {
      title = "Response Time"
      request {
        q            = "avg:flask.request.duration{*}.rollup(avg, 60)"
        display_type = "line"
      }
    }
  }

  widget {
    query_value_definition {
      title = "Error Rate"
      request {
        q = "sum:flask.request.errors{*}.rollup(sum, 60) / sum:flask.request.count{*}.rollup(sum, 60) * 100"
      }
      autoscale = true
      precision = 2
    }
  }
}
// Custom widget configuration
const dashboardConfig = {
widgets: [
// Service Map widget
{
id: 11,
title: "Service Dependencies",
definition: {
"type": "servicemap",
"service": "my-web-app",
"env": "production",
"display_format": "list"
}
},
// Heatmap widget
{
id: 12,
title: "Response Time Distribution",
definition: {
"type": "heatmap",
"requests": [
{
"q": "avg:flask.request.duration{*} by {resource_name}.rollup(avg, 60)",
"display_type": "heatmap"
}
],
"yaxis": {
"scale": "log"
}
}
},
// Change widget
{
id: 13,
title: "Deployment Changes",
definition: {
"type": "change",
"requests": [
{
"q": "avg:flask.request.duration{*} before(last:30m, last:24h)",
"compare_to": "week_before"
}
]
}
}
]
}
Alertes et Moniteurs Datadog
Configurer des alertes intelligentes et moniteurs pour le monitoring proactif et la réponse aux incidents
Difficulté
8/10
Temps estimé
45 min
Étiquettes
datadog, alerts, monitors, incident-response
Prérequis
Datadog account, Monitoring concepts, Alerting best practices
# Datadog Monitor Configuration Examples
# 1. High Error Rate Monitor
apiVersion: v1
kind: Monitor
metadata:
name: high-error-rate
spec:
name: "[Production] High Error Rate Alert"
type: "query alert"
query: "avg(last_15m):avg:flask.request.errors{*}.rollup(sum, 60) / avg:flask.request.count{*}.rollup(sum, 60) * 100 > 5"
message: |
🚨 High error rate detected!
Error rate: {{value}}%
Threshold: 5%
Duration: 15 minutes
Service: {{service.name}}
Environment: {{env.name}}
@pagerduty-service-escalation
@slack-alerts
options:
threshold_windows:
window: "last_15m"
thresholds:
critical: 5
warning: 3
notify_audit: false
locked: false
timeout_h: 0
require_full_window: true
new_host_delay: 300
notify_no_data: false
renotify_interval: 0
escalation_message: ""
include_tags: true
tags: ["env:production", "team:backend", "severity:critical"]
# 2. High Response Time Monitor
apiVersion: v1
kind: Monitor
metadata:
name: high-response-time
spec:
name: "[Production] High Response Time Alert"
type: "query alert"
query: "avg(last_10m):avg:flask.request.duration{*}.rollup(avg, 60) > 1000"
message: |
⚠️ High response time detected!
Current average: {{value}}ms
Threshold: 1000ms
Duration: 10 minutes
Service: {{service.name}}
Monitor: {{#is_alert}}Response time is critically high{{/is_alert}}
{{#is_warning}}Response time is elevated{{/is_warning}}
{{#is_recovery}}Response time has recovered{{/is_recovery}}
options:
thresholds:
critical: 1000
warning: 500
threshold_windows:
window: "last_10m"
notify_audit: true
require_full_window: false
tags: ["env:production", "team:backend", "performance"]
# 3. Database Connection Monitor
apiVersion: v1
kind: Monitor
metadata:
name: database-connections
spec:
name: "[Production] Database Connections High"
type: "query alert"
query: "avg(last_5m):avg:postgres.connections{*} > 80"
message: |
🗄️ Database connection pool is nearly full!
Connections: {{value}}
Max connections: 100
Utilization: {{value}}%
Server: {{host.name}}
Database: postgres
Action required: Check for connection leaks or consider scaling database.
@database-team
options:
thresholds:
critical: 80
warning: 70
require_full_window: false
evaluation_delay: 300
tags: ["env:production", "database", "postgres", "team:backend"]
# 4. Memory Usage Monitor
apiVersion: v1
kind: Monitor
metadata:
name: memory-usage
spec:
name: "[Production] High Memory Usage"
type: "metric alert"
query: "avg(last_10m):(avg:system.mem.total{*} - avg:system.mem.usable{*}) / avg:system.mem.total{*} * 100 > 85"
message: |
💾 High memory usage detected!
Memory usage: {{value}}%
Threshold: 85%
Duration: 10 minutes
Host: {{host.name}}
Service: {{service.name}}
@ops-team
options:
thresholds:
critical: 85
warning: 75
notify_audit: true
require_full_window: true
tags: ["env:production", "infrastructure", "memory"]
# 5. Custom Service Health Monitor
# NOTE(review): nesting below is reconstructed; the source listing lost its
# indentation. Confirm where `options` and `tags` belong for your tooling.
apiVersion: v1
kind: Monitor
metadata:
  name: service-health
spec:
  name: "[Production] Service Health Check"
  type: "service check"
  # Single-quoted so the double quotes required inside the check query
  # survive; nesting them in a double-quoted scalar is invalid YAML.
  query: '"my_app.health_check".over("env:production").by("service").last(2).count_by_status()'
  message: |
    🏥 Service health check failing!
    Service: {{service.name}}
    Status: {{#is_alert}}CRITICAL{{/is_alert}}{{#is_warning}}WARNING{{/is_warning}}{{#is_recovery}}RECOVERED{{/is_recovery}}
    Last 2 checks: {{check_message}}
    @oncall
  options:
    thresholds:
      ok: 1
      critical: 1
    notify_audit: true
    renotify_interval: 15
  tags: ["env:production", "health-check", "team:backend"]
# 6. Anomaly Detection Monitor
# NOTE(review): nesting below is reconstructed; the source listing lost its
# indentation. Confirm where `options` and `tags` belong for your tooling.
apiVersion: v1
kind: Monitor
metadata:
  name: traffic-anomaly
spec:
  name: "[Production] Traffic Anomaly Detection"
  # Anomaly monitors use the "query alert" type; "anomaly alert" is not a
  # monitor type.
  type: "query alert"
  # An anomalies() query must end with a comparison against the threshold.
  query: "avg(last_1h):anomalies(avg:flask.request.count{*}.rollup(sum, 60), 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true') >= 1"
  message: |
    📊 Traffic anomaly detected!
    Current requests/sec: {{value}}
    Expected range: {{threshold}} ± {{#is_alert}}2σ{{/is_alert}}
    Time window: last 15 minutes
    Historical baseline: last 1 hour
    This might indicate:
    - Sudden traffic spike
    - Load testing
    - Potential attack
    @ops-team @slack-alerts
  options:
    thresholds:
      critical: 1
      critical_recovery: 0
    # Anomaly monitors take trigger/recovery windows; the trigger window
    # matches the query's alert_window.
    threshold_windows:
      trigger_window: "last_15m"
      recovery_window: "last_15m"
    notify_no_data: false
  tags: ["env:production", "anomaly", "traffic", "team:backend"]
# 7. Composite Monitor (Multi-Condition)
apiVersion: v1
kind: Monitor
metadata:
name: composite-health
spec:
name: "[Production] Composite System Health"
type: "composite"
query: "avg(last_15m):avg:flask.request.duration{*} < 500 and avg(last_15m):(sum:flask.request.errors{*}.rollup(sum, 60) / sum:flask.request.count{*}.rollup(sum, 60) * 100) < 1"
message: |
🔍 Composite system health status:
{{#is_alert}}⚠️ SYSTEM UNHEALTHY{{/is_alert}}
{{#is_warning}}⚠️ SYSTEM DEGRADED{{/is_warning}}
{{#is_recovery}}✅ SYSTEM HEALTHY{{/is_recovery}}
Conditions:
- Response time < 500ms: {{#is_ok_response_time}}✅{{/is_ok_response_time}}{{#is_warning_response_time}}⚠️{{/is_warning_response_time}}
- Error rate < 1%: {{#is_ok_error_rate}}✅{{/is_ok_error_rate}}{{#is_warning_error_rate}}⚠️{{/is_warning_error_rate}}
@oncall @ops-team
options:
require_full_window: false
tags: ["env:production", "composite", "health", "team:backend"]
# Terraform configuration for monitors
# monitors.tf
# Alerts when the 15-minute error rate exceeds 5% (warning at 3%) and
# re-notifies every 60 minutes while the alert stays open.
resource "datadog_monitor" "high_error_rate" {
  name  = "[Production] High Error Rate Alert"
  type  = "query alert"
  query = "avg(last_15m):avg:flask.request.errors{*}.rollup(sum, 60) / avg:flask.request.count{*}.rollup(sum, 60) * 100 > 5"

  message = <<EOF
🚨 High error rate detected!
Error rate: {{value}}%
Threshold: 5%
Duration: 15 minutes
Service: {{service.name}}
Environment: {{env.name}}
@pagerduty-service-escalation
@slack-alerts
EOF

  tags = ["env:production", "team:backend", "severity:critical"]

  monitor_thresholds {
    critical = 5
    warning  = 3
  }

  # The former `threshold_windows` block was removed: the provider has no such
  # block, and its `monitor_threshold_windows` applies to anomaly monitors
  # only. The evaluation window is already set by `avg(last_15m)` in the query.
  notify_no_data    = false
  renotify_interval = 60
}
# Alerts when the 5-minute average of postgres.connections exceeds 80
# (warning at 70) and notifies @database-team.
resource "datadog_monitor" "database_connections" {
name = "[Production] Database Connections High"
type = "metric alert"
# The comparison value in the query must agree with the critical threshold below
query = "avg(last_5m):avg:postgres.connections{*} > 80"
# NOTE(review): the message reports {{value}} both as a connection count and as
# a percentage, which is only equivalent while max connections is 100. Confirm.
message = <<EOF
🗄️ Database connection pool is nearly full!
Connections: {{value}}
Max connections: 100
Utilization: {{value}}%
Server: {{host.name}}
Database: postgres
Action required: Check for connection leaks or consider scaling database.
@database-team
EOF
tags = ["env:production", "database", "postgres", "team:backend"]
# Alert levels: warning at 70 connections, critical at 80
monitor_thresholds {
critical = 80
warning = 70
}
}
# Monitor with dynamic thresholds
# Anomaly monitor on response time: alerts when the metric rises above the
# band predicted by the 'basic' algorithm (2 deviations).
resource "datadog_monitor" "adaptive_threshold" {
  name = "[Production] Adaptive Response Time"
  # Anomaly monitors use the "query alert" type; "anomaly alert" is not a
  # valid monitor type.
  type = "query alert"
  # An anomalies() query must end with a comparison against the threshold.
  query = "avg(last_1h):anomalies(avg:flask.request.duration{*}.rollup(avg, 60), 'basic', 2, direction='above', alert_window='last_15m', interval=60) >= 1"

  message = <<EOF
📊 Response time anomaly detected!
Current response time: {{value}}ms
Historical baseline: 1 hour data
Anomaly threshold: ±2 standard deviations
Service: {{service.name}}
Environment: {{env.name}}
@ops-team
EOF

  tags = ["env:production", "anomaly", "performance", "team:backend"]

  # Must agree with the comparison value in the query
  monitor_thresholds {
    critical = 1
  }

  # The trigger window matches the query's alert_window
  monitor_threshold_windows {
    trigger_window  = "last_15m"
    recovery_window = "last_15m"
  }
}
Outils
Outils souvent utilisés avec cet exemple
Associé