🎯 Recommended examples
A balanced collection of samples from various categories for you to explore
Datadog Monitoring Examples
Complete Datadog monitoring and observability configuration examples, including APM, logs, metrics, and dashboards
💻 Datadog APM Configuration javascript
🟡 intermediate
⭐⭐⭐
Complete Datadog APM configuration for application performance monitoring with custom spans and metrics
⏱️ 25 min
🏷️ datadog, apm, monitoring, javascript, node.js
Prerequisites:
Node.js, npm, Datadog account
// Datadog APM Configuration Example
// Install: npm install dd-trace
const tracer = require('dd-trace')
// Initialize tracer with configuration
tracer.init({
service: 'my-web-app',
env: process.env.NODE_ENV || 'development',
version: process.env.APP_VERSION || '1.0.0',
logInjection: true,
analytics: true,
// Sample rate for traces (0 to 1)
sampleRate: 1.0,
// Enable runtime metrics
runtimeMetrics: true,
// Datadog Agent URL (traces are sent to the local Agent, which
// authenticates to Datadog with its own API key)
url: process.env.DD_TRACE_AGENT_URL || 'http://localhost:8126'
})
// Custom span example
function processUserRequest(userId, action) {
const startTime = Date.now()
const span = tracer.startSpan('user.process_request', {
resource: action,
tags: {
'user.id': userId,
'action.type': action
}
})
try {
// Simulate processing
const result = performBusinessLogic(userId, action)
// Add custom metrics
span.setTag('process.success', true)
span.setTag('process.duration_ms', Date.now() - startTime)
return result
} catch (error) {
span.setTag('process.success', false)
span.setTag('error.message', error.message)
throw error
} finally {
span.finish()
}
}
// Distributed tracing example
async function processOrder(orderId) {
const span = tracer.startSpan('order.process', {
resource: 'order_processing',
tags: {
'order.id': orderId
}
})
try {
// Step 1: Validate order
const validateSpan = tracer.startSpan('order.validate', {
childOf: span
})
await validateOrder(orderId)
validateSpan.finish()
// Step 2: Process payment
const paymentSpan = tracer.startSpan('order.payment', {
childOf: span,
tags: { 'payment.provider': 'stripe' }
})
await processPayment(orderId)
paymentSpan.finish()
// Step 3: Update inventory
const inventorySpan = tracer.startSpan('order.inventory', {
childOf: span
})
await updateInventory(orderId)
inventorySpan.finish()
span.setTag('order.status', 'completed')
return { success: true, orderId }
} catch (error) {
span.setTag('order.status', 'failed')
span.setTag('error.message', error.message)
throw error
} finally {
span.finish()
}
}
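// Alternative sketch (not from the original example): tracer.trace() wraps a
// function and finishes the span automatically, including error tagging.
// The validateOrder/processPayment/updateInventory helpers are the hypothetical
// service calls used above.
async function processOrderWrapped(orderId) {
return tracer.trace('order.process', { resource: 'order_processing' }, async () => {
await validateOrder(orderId)
await processPayment(orderId)
await updateInventory(orderId)
return { success: true, orderId }
})
}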
// Custom metrics example using a DogStatsD client
// Install: npm install hot-shots (metrics go to the local Agent's DogStatsD port 8125)
const StatsD = require('hot-shots')
const metrics = new StatsD()
// Increment counter
metrics.increment('api.requests.total', 1, ['method:GET', 'endpoint:/api/users'])
// Gauge value
metrics.gauge('database.connections.active', activeConnections)
// Histogram
metrics.histogram('api.response.time', responseTime, ['endpoint:/api/users'])
// Timing a database query
async function timedQuery() {
const start = Date.now()
const rows = await database.query('SELECT * FROM users')
metrics.timing('database.query.time', Date.now() - start)
return rows
}
// Express.js integration example
const express = require('express')
const app = express()
// Datadog middleware for Express
app.use((req, res, next) => {
const startTime = Date.now()
const span = tracer.startSpan('express.request', {
resource: `${req.method} ${req.path}`,
tags: {
'http.method': req.method,
'http.url': req.url,
'http.user_agent': req.get('User-Agent')
}
})
res.on('finish', () => {
span.setTag('http.status_code', res.statusCode)
span.setTag('http.response_time_ms', Date.now() - startTime)
span.finish()
})
next()
})
app.get('/api/users', async (req, res) => {
const span = tracer.scope().active() // active span for this request, if any
try {
if (span) span.setTag('operation.name', 'get_users')
const users = await database.getUsers()
if (span) span.setTag('users.count', users.length)
res.json({ users })
} catch (error) {
if (span) span.setTag('error.message', error.message)
res.status(500).json({ error: 'Internal server error' })
}
})
// Start server
const port = process.env.PORT || 3000
app.listen(port, () => {
console.log(`Server running on port ${port}`)
})
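Because dd-trace patches libraries at require time, the tracer must be initialized before any instrumented module (express, pg, http, etc.) is loaded. A common pattern is to isolate the init call in its own module, or to preload it with node --require dd-trace/init. A minimal bootstrap sketch, assuming the standard DD_SERVICE / DD_ENV / DD_VERSION environment variables:
// tracer.js — bootstrap sketch; require this file before any instrumented module
const tracer = require('dd-trace').init({
  service: process.env.DD_SERVICE || 'my-web-app',
  env: process.env.DD_ENV || 'development',
  version: process.env.DD_VERSION || '1.0.0',
  logInjection: true
})
module.exports = tracer

// app.js — import order matters:
// require('./tracer')            // first, so auto-instrumentation can patch modules
// const express = require('express')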
💻 Datadog Log Management yaml
🟡 intermediate
⭐⭐⭐
Configure log collection, processing, and management for centralized logging
⏱️ 30 min
🏷️ datadog, logs, logging, monitoring
Prerequisites:
Datadog account, Basic logging concepts
# Datadog Agent Configuration for Log Management
# Location: /etc/datadog-agent/datadog.yaml
# Enable log collection
logs_enabled: true
# Log collection options
logs_config:
# Collect logs from all discovered containers
container_collect_all: true
# Automatically detect and group multi-line logs
auto_multi_line_detection: true
# Python application logging with Datadog
"""
requirements.txt:
ddtrace>=1.0.0
datadog>=0.44.0
"""
import logging
import ddtrace
from datadog import initialize, statsd
# Initialize the Datadog API / DogStatsD client
initialize(
api_key='your-api-key',
app_key='your-app-key',
statsd_host='localhost',
statsd_port=8125
)
# Configure logger with Datadog formatter
class DatadogFormatter(logging.Formatter):
def format(self, record):
# Add DD trace context
span = ddtrace.tracer.current_span()
if span:
record.dd = {
'trace_id': span.trace_id,
'span_id': span.span_id,
'service': span.service
}
# Add custom fields
record.environment = 'production'
record.version = '1.0.0'
return super().format(record)
# Set up logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(DatadogFormatter(
'%(asctime)s %(levelname)s %(name)s %(message)s'
))
logger.addHandler(handler)
# Example usage
def process_user_data(user_id):
logger.info(f"Processing user data", extra={
'user_id': user_id,
'operation': 'process_user_data',
'environment': 'production'
})
try:
# Business logic here
result = perform_operation(user_id)
logger.info("User data processed successfully", extra={
'user_id': user_id,
'result_count': len(result)
})
return result
except Exception as e:
logger.error(f"Failed to process user data: {e}", extra={
'user_id': user_id,
'error_type': type(e).__name__
})
raise
// Node.js structured logging with winston
/*
package.json dependencies:
{
"winston": "^3.8.0",
"dd-trace": "^1.0.0",
"winston-datadog-logs-transport": "^2.0.0"
}
*/
const winston = require('winston')
const tracer = require('dd-trace')
const { DatadogLogsTransport } = require('winston-datadog-logs-transport')
// Configure winston logger with Datadog transport
const logger = winston.createLogger({
level: 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json(),
winston.format.metadata({
fillExcept: ['message', 'level', 'timestamp', 'label']
})
),
transports: [
// Console transport
new winston.transports.Console({
format: winston.format.simple()
}),
// Datadog logs transport
new DatadogLogsTransport({
apiKey: process.env.DD_API_KEY,
service: 'my-web-app',
hostname: process.env.HOSTNAME,
env: process.env.NODE_ENV,
// Add trace context
ddsource: 'nodejs',
ddtags: 'env:production,version:1.0.0'
})
]
})
// Add trace context to logs
function addTraceContext(metadata = {}) {
const span = tracer.scope().active()
if (span) {
return {
...metadata,
dd: {
trace_id: span.context().toTraceId(),
span_id: span.context().toSpanId()
}
}
}
return metadata
}
// Usage example
function handleRequest(req, res) {
logger.info('Request received', addTraceContext({
method: req.method,
url: req.url,
userAgent: req.get('User-Agent')
}))
try {
// Process request
const result = processRequest(req)
logger.info('Request processed successfully', addTraceContext({
result_count: result.length
}))
res.json(result)
} catch (error) {
logger.error('Request processing failed', addTraceContext({
error: error.message,
stack: error.stack
}))
res.status(500).json({ error: 'Internal server error' })
}
}
# Docker configuration for log collection
# docker-compose.yml
version: '3.8'
services:
app:
image: my-app:latest
environment:
- DD_API_KEY=${DD_API_KEY}
- DD_SERVICE=my-app
- DD_ENV=production
- DD_VERSION=1.0.0
- DD_LOGS_INJECTION=true
labels:
- 'com.datadoghq.ad.logs=[{"source": "app", "service": "my-app"}]'
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
datadog-agent:
image: gcr.io/datadoghq/agent:latest
environment:
- DD_API_KEY=${DD_API_KEY}
- DD_SITE=datadoghq.com
- DD_LOGS_ENABLED=true
- DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL=true
- DD_AC_EXCLUDE=name:datadog-agent
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
- /proc/:/host/proc/:ro
- /sys/fs/cgroup/:/host/sys/fs/cgroup:ro
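If logs need to be shipped without going through the Agent, winston's built-in HTTP transport can post directly to the Datadog log intake. A minimal sketch, assuming the US1 site (other sites use a different intake hostname) and an API key in DD_API_KEY:
// Direct-to-intake logging sketch (no Agent involved)
const { createLogger, format, transports } = require('winston')

const directLogger = createLogger({
  level: 'info',
  format: format.json(),
  transports: [
    new transports.Http({
      host: 'http-intake.logs.datadoghq.com',
      path: `/api/v2/logs?dd-api-key=${process.env.DD_API_KEY}&ddsource=nodejs&service=my-app`,
      ssl: true
    })
  ]
})

directLogger.info('Shipped straight to the Datadog log intake', { env: 'production' })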
💻 Datadog Dashboards json
🟡 intermediate
⭐⭐⭐⭐
Create and configure complete monitoring dashboards with widgets and charts
⏱️ 35 min
🏷️ datadog, dashboards, monitoring, visualization
Prerequisites:
Datadog account, Dashboard concepts
{
"title": "Application Performance Dashboard",
"description": "Real-time monitoring of application performance and health",
"widgets": [
{
"id": 1,
"title": "Request Rate",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:flask.request.count{*}.rollup(sum, 60)",
"display_type": "line",
"style": {
"palette": "dog_classic",
"line_type": "solid",
"line_width": "normal"
}
}
],
"yaxis": {
"scale": "linear",
"label": "Requests per second",
"min": "auto",
"max": "auto"
}
}
},
{
"id": 2,
"title": "Response Time",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:flask.request.duration{*}.rollup(avg, 60)",
"display_type": "line",
"style": {
"palette": "warm"
}
}
],
"yaxis": {
"scale": "linear",
"label": "Response time (ms)",
"min": "auto"
}
}
},
{
"id": 3,
"title": "Error Rate",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "sum:flask.request.errors{*}.rollup(sum, 60) / sum:flask.request.count{*}.rollup(sum, 60) * 100",
"display_type": "line",
"style": {
"palette": "critical"
}
}
],
"yaxis": {
"scale": "linear",
"label": "Error rate (%)",
"min": 0,
"max": 100
}
}
},
{
"id": 4,
"title": "Active Users",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:web.active_users{*}.rollup(avg, 300)",
"display_type": "area",
"style": {
"palette": "orange"
}
}
]
}
},
{
"id": 5,
"title": "Database Connections",
"definition": {
"type": "query_value",
"requests": [
{
"q": "avg:postgres.connections{*}",
"display_type": "scalar",
"conditional_formats": [
{
"comparator": ">",
"value": 80,
"palette": "white_on_red"
},
{
"comparator": ">",
"value": 60,
"palette": "white_on_yellow"
}
]
}
]
}
},
{
"id": 6,
"title": "CPU Usage by Service",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:system.cpu.total{*} by {service}",
"display_type": "stacked",
"style": {
"palette": "purple"
}
}
]
}
},
{
"id": 7,
"title": "Memory Usage",
"definition": {
"type": "timeseries",
"requests": [
{
"q": "avg:system.mem.total{*} - avg:system.mem.usable{*}",
"display_type": "area",
"style": {
"palette": "blue"
}
}
]
}
},
{
"id": 8,
"title": "Top 10 Slowest Endpoints",
"definition": {
"type": "table",
"requests": [
{
"q": "top(avg:flask.request.duration{*} by {resource_name}, 10, 'mean', 'desc')",
"display_type": "table"
}
],
"style": {
"palette": "blue"
}
}
},
{
"id": 9,
"title": "API Response Status Codes",
"definition": {
"type": "toplist",
"requests": [
{
"q": "sum:flask.request.count{*} by {http.status_code}",
"display_type": "categorical"
}
]
}
},
{
"id": 10,
"title": "Error Logs",
"definition": {
"type": "log_stream",
"requests": [
{
"query": "status:error",
"columns": ["host", "service", "error.stack"],
"sort": {
"order": "desc",
"column": "timestamp"
},
"message_display": "expanded"
}
],
"log_query": "source:python status:error",
"event_size": "l",
"title": "Error Logs",
"title_align": "left",
"title_size": "16"
}
}
],
"layout_type": "ordered",
"description": "Comprehensive application monitoring dashboard",
"notify_list": [],
"template_variables": [
{
"name": "env",
"prefix": null,
"default": "production"
},
{
"name": "service",
"prefix": null,
"default": "my-web-app"
}
],
"is_read_only": false,
"created_at": "2025-12-11T00:00:00.000Z",
"modified_at": "2025-12-11T00:00:00.000Z"
}
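Note that the template_variables declared above only filter widgets whose queries actually reference them; in a query a template variable is written as $name inside the scope braces. A hypothetical widget request scoped by the env and service variables would look like this:
// Hypothetical widget request that honors the dashboard's template variables
const scopedRequest = {
  q: 'avg:flask.request.count{$env,$service}.rollup(sum, 60)',
  display_type: 'line'
}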
# Terraform configuration for dashboard management
# dashboard.tf
resource "datadog_dashboard" "application_dashboard" {
title = "Application Performance Dashboard"
description = "Real-time monitoring of application performance"
layout_type = "ordered"
template_variable {
name = "env"
default = "production"
}
template_variable {
name = "service"
default = "my-web-app"
}
widget {
timeseries_definition {
title = "Request Rate"
request {
q = "avg:flask.request.count{*}.rollup(sum, 60)"
display_type = "line"
}
}
}
widget {
timeseries_definition {
title = "Response Time"
request {
q = "avg:flask.request.duration{*}.rollup(avg, 60)"
display_type = "line"
}
}
}
widget {
query_value_definition {
title = "Error Rate"
request {
q = "sum:flask.request.errors{*}.rollup(sum, 60) / sum:flask.request.count{*}.rollup(sum, 60) * 100"
}
autoscale = true
precision = 2
}
}
}
// Custom widget configuration (JavaScript)
const dashboardConfig = {
widgets: [
// Service Map widget
{
id: 11,
title: "Service Dependencies",
definition: {
"type": "servicemap",
"service": "my-web-app",
"env": "production",
"display_format": "list"
}
},
// Heatmap widget
{
id: 12,
title: "Response Time Distribution",
definition: {
"type": "heatmap",
"requests": [
{
"q": "avg:flask.request.duration{*} by {resource_name}.rollup(avg, 60)",
"display_type": "heatmap"
}
],
"yaxis": {
"scale": "log"
}
}
},
// Change widget
{
id: 13,
title: "Deployment Changes",
definition: {
"type": "change",
"requests": [
{
"q": "avg:flask.request.duration{*} before(last:30m, last:24h)",
"compare_to": "week_before"
}
]
}
}
]
}
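Besides Terraform, the same dashboard JSON can be created programmatically through the Dashboards API (POST /api/v1/dashboard). A minimal sketch using Node 18's built-in fetch, assuming the dashboard object shown earlier and API/application keys in DD_API_KEY and DD_APP_KEY:
// Sketch: create a dashboard through the Datadog API (Node 18+ for global fetch)
async function createDashboard(dashboardJson) {
  const response = await fetch('https://api.datadoghq.com/api/v1/dashboard', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'DD-API-KEY': process.env.DD_API_KEY,
      'DD-APPLICATION-KEY': process.env.DD_APP_KEY
    },
    body: JSON.stringify(dashboardJson)
  })
  if (!response.ok) throw new Error(`Dashboard creation failed: ${response.status}`)
  return response.json() // includes the new dashboard id and URL
}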
💻 Datadog Alerts and Monitors yaml
🔴 complex
⭐⭐⭐⭐
Configure intelligent alerts and monitors for proactive monitoring and incident response
⏱️ 45 min
🏷️ datadog, alerts, monitors, incident-response
Prerequisites:
Datadog account, Monitoring concepts, Alerting best practices
# Datadog Monitor Configuration Examples
# 1. High Error Rate Monitor
apiVersion: v1
kind: Monitor
metadata:
name: high-error-rate
spec:
name: "[Production] High Error Rate Alert"
type: "query alert"
query: "avg(last_15m):avg:flask.request.errors{*}.rollup(sum, 60) / avg:flask.request.count{*}.rollup(sum, 60) * 100 > 5"
message: |
🚨 High error rate detected!
Error rate: {{value}}%
Threshold: 5%
Duration: 15 minutes
Service: {{service.name}}
Environment: {{env.name}}
@pagerduty-service-escalation
@slack-alerts
options:
threshold_windows:
window: "last_15m"
thresholds:
critical: 5
warning: 3
notify_audit: false
locked: false
timeout_h: 0
require_full_window: true
new_host_delay: 300
notify_no_data: false
renotify_interval: 0
escalation_message: ""
include_tags: true
tags: ["env:production", "team:backend", "severity:critical"]
# 2. High Response Time Monitor
apiVersion: v1
kind: Monitor
metadata:
name: high-response-time
spec:
name: "[Production] High Response Time Alert"
type: "query alert"
query: "avg(last_10m):avg:flask.request.duration{*}.rollup(avg, 60) > 1000"
message: |
⚠️ High response time detected!
Current average: {{value}}ms
Threshold: 1000ms
Duration: 10 minutes
Service: {{service.name}}
Monitor: {{#is_alert}}Response time is critically high{{/is_alert}}
{{#is_warning}}Response time is elevated{{/is_warning}}
{{#is_recovery}}Response time has recovered{{/is_recovery}}
options:
thresholds:
critical: 1000
warning: 500
threshold_windows:
window: "last_10m"
notify_audit: true
require_full_window: false
tags: ["env:production", "team:backend", "performance"]
# 3. Database Connection Monitor
apiVersion: v1
kind: Monitor
metadata:
name: database-connections
spec:
name: "[Production] Database Connections High"
type: "query alert"
query: "avg(last_5m):avg:postgres.connections{*} > 80"
message: |
🗄️ Database connection pool is nearly full!
Connections: {{value}}
Max connections: 100
Utilization: {{value}}%
Server: {{host.name}}
Database: postgres
Action required: Check for connection leaks or consider scaling database.
@database-team
options:
thresholds:
critical: 80
warning: 70
require_full_window: false
evaluation_delay: 300
tags: ["env:production", "database", "postgres", "team:backend"]
# 4. Memory Usage Monitor
apiVersion: v1
kind: Monitor
metadata:
name: memory-usage
spec:
name: "[Production] High Memory Usage"
type: "metric alert"
query: "avg(last_10m):(avg:system.mem.total{*} - avg:system.mem.usable{*}) / avg:system.mem.total{*} * 100 > 85"
message: |
💾 High memory usage detected!
Memory usage: {{value}}%
Threshold: 85%
Duration: 10 minutes
Host: {{host.name}}
Service: {{service.name}}
@ops-team
options:
thresholds:
critical: 85
warning: 75
notify_audit: true
require_full_window: true
tags: ["env:production", "infrastructure", "memory"]
# 5. Custom Service Health Monitor
apiVersion: v1
kind: Monitor
metadata:
name: service-health
spec:
name: "[Production] Service Health Check"
type: "service check"
query: ""my_app.health_check".over("env:production").by("service").last(2).count_by_status()"
message: |
🏥 Service health check failing!
Service: {{service.name}}
Status: {{#is_alert}}CRITICAL{{/is_alert}}{{#is_warning}}WARNING{{/is_warning}}{{#is_recovery}}RECOVERED{{/is_recovery}}
Last 2 checks: {{check_message}}
@oncall
options:
thresholds:
ok: 1
critical: 1
notify_audit: true
renotify_interval: 15
tags: ["env:production", "health-check", "team:backend"]
# 6. Anomaly Detection Monitor
apiVersion: v1
kind: Monitor
metadata:
name: traffic-anomaly
spec:
name: "[Production] Traffic Anomaly Detection"
type: "anomaly alert"
query: "avg(last_1h):anomalies(avg:flask.request.count{*}.rollup(sum, 60), 'basic', 2, direction='both', alert_window='last_15m', interval=60, count_default_zero='true')"
message: |
📊 Traffic anomaly detected!
Current requests/sec: {{value}}
Expected range: {{threshold}} ± {{#is_alert}}2σ{{/is_alert}}
Time window: last 15 minutes
Historical baseline: last 1 hour
This might indicate:
- Sudden traffic spike
- Load testing
- Potential attack
@ops-team @slack-alerts
options:
threshold_windows:
window: "last_1h"
notify_no_data: false
tags: ["env:production", "anomaly", "traffic", "team:backend"]
# 7. Composite Monitor (Multi-Condition)
apiVersion: v1
kind: Monitor
metadata:
name: composite-health
spec:
name: "[Production] Composite System Health"
type: "composite"
query: "avg(last_15m):avg:flask.request.duration{*} < 500 and avg(last_15m):(sum:flask.request.errors{*}.rollup(sum, 60) / sum:flask.request.count{*}.rollup(sum, 60) * 100) < 1"
message: |
🔍 Composite system health status:
{{#is_alert}}⚠️ SYSTEM UNHEALTHY{{/is_alert}}
{{#is_warning}}⚠️ SYSTEM DEGRADED{{/is_warning}}
{{#is_recovery}}✅ SYSTEM HEALTHY{{/is_recovery}}
Conditions:
- Response time < 500ms: {{#is_ok_response_time}}✅{{/is_ok_response_time}}{{#is_warning_response_time}}⚠️{{/is_warning_response_time}}
- Error rate < 1%: {{#is_ok_error_rate}}✅{{/is_ok_error_rate}}{{#is_warning_error_rate}}⚠️{{/is_warning_error_rate}}
@oncall @ops-team
options:
require_full_window: false
tags: ["env:production", "composite", "health", "team:backend"]
# Terraform configuration for monitors
# monitors.tf
resource "datadog_monitor" "high_error_rate" {
name = "[Production] High Error Rate Alert"
type = "query alert"
query = "avg(last_15m):avg:flask.request.errors{*}.rollup(sum, 60) / avg:flask.request.count{*}.rollup(sum, 60) * 100 > 5"
message = <<EOF
🚨 High error rate detected!
Error rate: {{value}}%
Threshold: 5%
Duration: 15 minutes
Service: {{service.name}}
Environment: {{env.name}}
@pagerduty-service-escalation
@slack-alerts
EOF
tags = ["env:production", "team:backend", "severity:critical"]
monitor_thresholds {
critical = 5
warning = 3
}
notify_no_data = false
renotify_interval = 60
}
resource "datadog_monitor" "database_connections" {
name = "[Production] Database Connections High"
type = "metric alert"
query = "avg(last_5m):avg:postgres.connections{*} > 80"
message = <<EOF
🗄️ Database connection pool is nearly full!
Connections: {{value}}
Max connections: 100
Utilization: {{value}}%
Server: {{host.name}}
Database: postgres
Action required: Check for connection leaks or consider scaling database.
@database-team
EOF
tags = ["env:production", "database", "postgres", "team:backend"]
monitor_thresholds {
critical = 80
warning = 70
}
}
# Monitor with dynamic thresholds
resource "datadog_monitor" "adaptive_threshold" {
name = "[Production] Adaptive Response Time"
type = "anomaly alert"
query = "avg(last_1h):anomalies(avg:flask.request.duration{*}.rollup(avg, 60), 'basic', 2, direction='above', alert_window='last_15m', interval=60)"
message = <<EOF
📊 Response time anomaly detected!
Current response time: {{value}}ms
Historical baseline: 1 hour data
Anomaly threshold: ±2 standard deviations
Service: {{service.name}}
Environment: {{env.name}}
@ops-team
EOF
tags = ["env:production", "anomaly", "performance", "team:backend"]
}
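When Terraform is not an option, the same monitors can be created from a provisioning script through the Monitors API (POST /api/v1/monitor). A minimal sketch with Node's built-in fetch, reusing the high-error-rate definition above; the DD_API_KEY / DD_APP_KEY environment variables are assumed:
// Sketch: create the high-error-rate monitor via the Monitors API
const monitorDefinition = {
  name: '[Production] High Error Rate Alert',
  type: 'query alert',
  query: 'avg(last_15m):avg:flask.request.errors{*}.rollup(sum, 60) / avg:flask.request.count{*}.rollup(sum, 60) * 100 > 5',
  message: '🚨 High error rate detected! @slack-alerts',
  tags: ['env:production', 'team:backend', 'severity:critical'],
  options: { thresholds: { critical: 5, warning: 3 } }
}

async function createMonitor(definition) {
  const response = await fetch('https://api.datadoghq.com/api/v1/monitor', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'DD-API-KEY': process.env.DD_API_KEY,
      'DD-APPLICATION-KEY': process.env.DD_APP_KEY
    },
    body: JSON.stringify(definition)
  })
  if (!response.ok) throw new Error(`Monitor creation failed: ${response.status}`)
  return response.json()
}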