Kubernetes YAML Samples
Complete Kubernetes YAML manifests for Deployment, Service, ConfigMap, Secret, Ingress and other essential resources
Key Facts
- Category
- Container Orchestration
- Items
- 4
- Format Families
- yaml, sql
Sample Overview
Complete Kubernetes YAML manifests for Deployment, Service, ConfigMap, Secret, Ingress and other essential resources. This sample set belongs to Container Orchestration and can be used to test related workflows inside Elysia Tools.
💻 Basic Kubernetes Deployment
Simple deployment manifest for running a web application with multiple replicas
# Basic Kubernetes Deployment Example
# This file creates a simple deployment for a web application
apiVersion: apps/v1
kind: Deployment
metadata:
  name: web-app-deployment
  namespace: default
  labels:
    app: web-app
    version: v1
    environment: production
spec:
  replicas: 3  # Number of pod replicas
  selector:
    matchLabels:
      app: web-app
  template:
    metadata:
      labels:
        app: web-app
        version: v1
    spec:
      containers:
        - name: web-app-container
          image: nginx:1.21-alpine  # Container image
          ports:
            - containerPort: 80
              name: http
              protocol: TCP
          resources:
            requests:
              memory: "64Mi"
              cpu: "250m"  # 0.25 CPU cores
            limits:
              memory: "128Mi"
              cpu: "500m"  # 0.5 CPU cores
          livenessProbe:
            httpGet:
              path: /
              port: 80
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /
              port: 80
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          env:
            - name: APP_NAME
              value: "MyWebApp"
            - name: APP_VERSION
              value: "1.0.0"
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
              readOnly: true
      volumes:
        - name: config-volume
          configMap:
            name: web-app-config
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      dnsPolicy: ClusterFirst
      # NOTE(review): the stock nginx image listens on port 80, which an
      # unprivileged user cannot bind — with runAsNonRoot this pod may
      # crash-loop; confirm, or use nginxinc/nginx-unprivileged (port 8080).
      securityContext:
        runAsNonRoot: true
        runAsUser: 101
        fsGroup: 101
---
# Service to expose the deployment
apiVersion: v1
kind: Service
metadata:
  name: web-app-service
  namespace: default
  labels:
    app: web-app
spec:
  type: ClusterIP  # Internal service
  ports:
    - port: 80
      targetPort: 80
      protocol: TCP
      name: http
  selector:
    app: web-app
---
# ConfigMap for application configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: web-app-config
  namespace: default
  labels:
    app: web-app
data:
  app.properties: |
    server.port=8080
    app.name=WebApp
    app.version=1.0.0
    logging.level.root=INFO
  nginx.conf: |
    server {
      listen 80;
      server_name localhost;
      location / {
        root /usr/share/nginx/html;
        index index.html;
      }
      location /api {
        proxy_pass http://backend:3000;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
      }
      location /health {
        access_log off;
        return 200 "healthy\n";
        add_header Content-Type text/plain;
      }
    }
💻 Node.js Application with Database
Complete Node.js application setup with PostgreSQL database, environment variables, and health checks
# Node.js Application with PostgreSQL Database
# This example shows a complete web application setup
apiVersion: v1
kind: Namespace
metadata:
  name: node-app
---
# ConfigMap for application settings
apiVersion: v1
kind: ConfigMap
metadata:
  name: node-app-config
  namespace: node-app
data:
  NODE_ENV: "production"
  LOG_LEVEL: "info"
  API_PORT: "3000"
  DATABASE_NAME: "nodeappdb"
  REDIS_HOST: "redis-service"
  REDIS_PORT: "6379"
  app.conf: |
    {
      "server": {
        "port": 3000,
        "host": "0.0.0.0"
      },
      "database": {
        "host": "$DATABASE_HOST",
        "port": "$DATABASE_PORT",
        "name": "$DATABASE_NAME",
        "ssl": true
      },
      "redis": {
        "host": "$REDIS_HOST",
        "port": "$REDIS_PORT",
        "ttl": 3600
      },
      "logging": {
        "level": "$LOG_LEVEL",
        "format": "json"
      }
    }
---
# Secret for sensitive data
apiVersion: v1
kind: Secret
metadata:
  name: node-app-secrets
  namespace: node-app
type: Opaque
data:
  # Base64 encoded placeholder values — never commit real credentials
  DATABASE_HOST: cG9zdGdyZXNxbC1zZXJ2aWNl  # postgresql-service
  DATABASE_PORT: NTQzMg==  # 5432
  DATABASE_NAME: bm9kZWFwcGRi  # nodeappdb
  DATABASE_USER: YXBwX3VzZXI=  # app_user
  DATABASE_PASSWORD: c3VwZXJfc2VjcmV0X3Bhc3M=  # super_secret_pass
  JWT_SECRET: bm90X3NvX3NlY3JldF9qd3Rfa2V5  # not_so_secret_jwt_key
  REDIS_PASSWORD: cmVkaXNfcGFzc3dvcmQ=  # redis_password
---
# PostgreSQL StatefulSet
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: postgresql
  namespace: node-app
spec:
  serviceName: postgresql-service
  replicas: 1
  selector:
    matchLabels:
      app: postgresql
  template:
    metadata:
      labels:
        app: postgresql
    spec:
      containers:
        - name: postgresql
          image: postgres:15-alpine
          env:
            - name: POSTGRES_DB
              valueFrom:
                configMapKeyRef:
                  name: node-app-config
                  key: DATABASE_NAME
            - name: POSTGRES_USER
              valueFrom:
                secretKeyRef:
                  name: node-app-secrets
                  key: DATABASE_USER
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: node-app-secrets
                  key: DATABASE_PASSWORD
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          ports:
            - containerPort: 5432
              name: postgresql
          volumeMounts:
            - name: postgresql-storage
              mountPath: /var/lib/postgresql/data
            # NOTE(review): mounting postgresql.conf alone does not make the
            # server read it — confirm the container is started with
            # `-c config_file=/etc/postgresql/postgresql.conf` if these
            # custom settings are required.
            - name: postgresql-config
              mountPath: /etc/postgresql/postgresql.conf
              subPath: postgresql.conf
          resources:
            requests:
              memory: "256Mi"
              cpu: "250m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          livenessProbe:
            exec:
              command:
                - pg_isready
                - -U
                - postgres
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            exec:
              command:
                - pg_isready
                - -U
                - postgres
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: postgresql-config
          configMap:
            name: postgresql-config
  volumeClaimTemplates:
    - metadata:
        name: postgresql-storage
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 10Gi
        storageClassName: fast-ssd  # Adjust based on your cluster
---
# PostgreSQL Service
apiVersion: v1
kind: Service
metadata:
  name: postgresql-service
  namespace: node-app
  labels:
    app: postgresql
spec:
  type: ClusterIP
  ports:
    - port: 5432
      targetPort: 5432
      protocol: TCP
      name: postgresql
  selector:
    app: postgresql
---
# PostgreSQL Configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: postgresql-config
  namespace: node-app
data:
  postgresql.conf: |
    # PostgreSQL Configuration
    listen_addresses = '*'
    port = 5432
    max_connections = 100
    shared_buffers = 128MB
    effective_cache_size = 4GB
    maintenance_work_mem = 64MB
    checkpoint_completion_target = 0.9
    wal_buffers = 16MB
    default_statistics_target = 100
    random_page_cost = 1.1
    effective_io_concurrency = 200
    work_mem = 4MB
    min_wal_size = 1GB
    max_wal_size = 4GB
    logging_collector = on
    log_directory = 'pg_log'
    log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
    log_statement = 'all'
    log_min_duration_statement = 1000
---
# Redis Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
  namespace: node-app
spec:
  replicas: 1
  selector:
    matchLabels:
      app: redis
  template:
    metadata:
      labels:
        app: redis
    spec:
      containers:
        - name: redis
          image: redis:7-alpine
          # $(REDIS_PASSWORD) is expanded by the kubelet from the env vars
          # declared below (no shell involved)
          command:
            - redis-server
            - --requirepass
            - $(REDIS_PASSWORD)
          env:
            - name: REDIS_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: node-app-secrets
                  key: REDIS_PASSWORD
          ports:
            - containerPort: 6379
              name: redis
          resources:
            requests:
              memory: "128Mi"
              cpu: "100m"
            limits:
              memory: "256Mi"
              cpu: "200m"
          # NOTE(review): confirm $(VAR) expansion applies to probe exec
          # commands on your cluster version; otherwise wrap the probe in
          # `sh -c 'redis-cli -a "$REDIS_PASSWORD" ping'`.
          livenessProbe:
            exec:
              command:
                - redis-cli
                - -a
                - $(REDIS_PASSWORD)
                - ping
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            exec:
              command:
                - redis-cli
                - -a
                - $(REDIS_PASSWORD)
                - ping
            initialDelaySeconds: 5
            periodSeconds: 5
---
# Redis Service
apiVersion: v1
kind: Service
metadata:
  name: redis-service
  namespace: node-app
  labels:
    app: redis
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: 6379
      protocol: TCP
      name: redis
  selector:
    app: redis
---
# Node.js Application Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: node-app
  namespace: node-app
  labels:
    app: node-app
    version: v1
spec:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app: node-app
  template:
    metadata:
      labels:
        app: node-app
        version: v1
    spec:
      containers:
        - name: node-app
          image: node-app:1.0.0  # Replace with your actual image
          envFrom:
            - configMapRef:
                name: node-app-config
          env:
            - name: DATABASE_HOST
              valueFrom:
                secretKeyRef:
                  name: node-app-secrets
                  key: DATABASE_HOST
            - name: DATABASE_PORT
              valueFrom:
                secretKeyRef:
                  name: node-app-secrets
                  key: DATABASE_PORT
            - name: DATABASE_NAME
              valueFrom:
                configMapKeyRef:
                  name: node-app-config
                  key: DATABASE_NAME
            - name: DATABASE_USER
              valueFrom:
                secretKeyRef:
                  name: node-app-secrets
                  key: DATABASE_USER
            - name: DATABASE_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: node-app-secrets
                  key: DATABASE_PASSWORD
            - name: JWT_SECRET
              valueFrom:
                secretKeyRef:
                  name: node-app-secrets
                  key: JWT_SECRET
            - name: REDIS_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: node-app-secrets
                  key: REDIS_PASSWORD
          ports:
            - containerPort: 3000
              name: http
          resources:
            requests:
              memory: "256Mi"
              cpu: "200m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /ready
              port: 3000
            initialDelaySeconds: 10
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          startupProbe:
            httpGet:
              path: /health
              port: 3000
            initialDelaySeconds: 10
            periodSeconds: 10
            timeoutSeconds: 3
            failureThreshold: 30
          volumeMounts:
            - name: config-volume
              mountPath: /app/config
              readOnly: true
            - name: logs-volume
              mountPath: /app/logs
      volumes:
        - name: config-volume
          configMap:
            name: node-app-config
            items:
              - key: app.conf
                path: app.conf
        - name: logs-volume
          emptyDir: {}
      imagePullSecrets:
        - name: registry-secret  # For private Docker registry
---
# Node.js Application Service
apiVersion: v1
kind: Service
metadata:
  name: node-app-service
  namespace: node-app
  labels:
    app: node-app
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: 3000
      protocol: TCP
      name: http
  selector:
    app: node-app
---
# Horizontal Pod Autoscaler
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: node-app-hpa
  namespace: node-app
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: node-app
  minReplicas: 2
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
        - type: Percent
          value: 10
          periodSeconds: 60
    scaleUp:
      stabilizationWindowSeconds: 0
      policies:
        - type: Percent
          value: 100
          periodSeconds: 15
        - type: Pods
          value: 4
          periodSeconds: 15
      selectPolicy: Max
💻 Ingress with SSL Termination
Advanced ingress configuration with SSL certificates, path-based routing, and load balancing
# Ingress Controller with SSL Termination and Path-Based Routing
# This example shows how to expose multiple services with HTTPS
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: web-app-ingress
  namespace: default
  annotations:
    cert-manager.io/cluster-issuer: "letsencrypt-prod"  # For automatic SSL certificates
    nginx.ingress.kubernetes.io/ssl-redirect: "true"  # Redirect HTTP to HTTPS
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    nginx.ingress.kubernetes.io/limit-connections: "100"  # Rate limiting
    nginx.ingress.kubernetes.io/limit-rps: "50"
    nginx.ingress.kubernetes.io/limit-burst: "100"
    nginx.ingress.kubernetes.io/proxy-body-size: "50m"  # Max file upload size
    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "300"
    nginx.ingress.kubernetes.io/rate-limit: "100"  # Rate limiting per IP
    nginx.ingress.kubernetes.io/rate-limit-window: "1m"
    # CORS settings
    nginx.ingress.kubernetes.io/enable-cors: "true"
    nginx.ingress.kubernetes.io/cors-allow-origin: "*"
    nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS"
    nginx.ingress.kubernetes.io/cors-allow-headers: "DNT,X-CustomHeader,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Authorization"
    # Custom error pages
    nginx.ingress.kubernetes.io/default-backend: "error-page-service"
spec:
  # Use the NGINX Ingress Controller; replaces the deprecated
  # kubernetes.io/ingress.class annotation
  ingressClassName: nginx
  tls:
    - hosts:
        - api.example.com
        - app.example.com
        - admin.example.com
      secretName: example-com-tls  # TLS certificate secret
  rules:
    - host: api.example.com
      http:
        paths:
          - path: /api/v1
            pathType: Prefix
            backend:
              service:
                name: api-v1-service
                port:
                  number: 80
          - path: /api/v2
            pathType: Prefix
            backend:
              service:
                name: api-v2-service
                port:
                  number: 80
          - path: /health
            pathType: Prefix
            backend:
              service:
                name: api-health-service
                port:
                  number: 80
    - host: app.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: frontend-service
                port:
                  number: 80
          - path: /static
            pathType: Prefix
            backend:
              service:
                name: static-assets-service
                port:
                  number: 80
    - host: admin.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: admin-dashboard-service
                port:
                  number: 80
---
# Certificate for Ingress (using cert-manager)
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
spec:
  acme:
    server: https://acme-v02.api.letsencrypt.org/directory
    email: [email protected]
    privateKeySecretRef:
      name: letsencrypt-prod-account-key
    solvers:
      - http01:
          ingress:
            class: nginx
---
# TLS Certificate Secret (if not using cert-manager)
apiVersion: v1
kind: Secret
metadata:
  name: example-com-tls
  namespace: default
type: kubernetes.io/tls
data:
  tls.crt: LS0tLS1CRUdJTi...  # Base64 encoded certificate
  tls.key: LS0tLS1CRUdJTi...  # Base64 encoded private key
---
# Backend Services for Ingress
apiVersion: v1
kind: Service
metadata:
  name: api-v1-service
  namespace: default
  labels:
    app: api
    version: v1
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: 3000
      protocol: TCP
      name: http
  selector:
    app: api
    version: v1
---
apiVersion: v1
kind: Service
metadata:
  name: frontend-service
  namespace: default
  labels:
    app: frontend
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: 80
      protocol: TCP
      name: http
  selector:
    app: frontend
---
# Network Policy for Security
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: ingress-network-policy
  namespace: default
spec:
  podSelector: {}
  policyTypes:
    - Ingress
  ingress:
    - from:
        # Either the ingress-nginx namespace or any pod in this namespace
        - namespaceSelector:
            matchLabels:
              name: ingress-nginx
        - podSelector: {}
      ports:
        - protocol: TCP
          port: 80
        - protocol: TCP
          port: 443
---
# Rate Limiting ConfigMap for Ingress
apiVersion: v1
kind: ConfigMap
metadata:
  name: nginx-configuration
  namespace: ingress-nginx
  labels:
    app.kubernetes.io/name: ingress-nginx
data:
  use-proxy-protocol: "true"
  # Single-quoted: double quotes would treat \[ and \. as (invalid) YAML
  # escape sequences
  proxy-protocol-header-pattern: '^\[?([0-9a-zA-Z\.-]+)\]?:([0-9]+)$'
  limit-connections: "100"
  limit-rps: "50"
  limit-burst: "100"
  client-body-buffer-size: "64k"
  proxy-buffering: "on"
  proxy-buffer-size: "4k"
  proxy-buffers-number: "4"
---
# Custom Error Page Backend
apiVersion: v1
kind: Service
metadata:
  name: error-page-service
  namespace: default
spec:
  type: ClusterIP
  ports:
    - port: 80
      targetPort: 80
  selector:
    app: error-pages
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: error-pages
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: error-pages
  template:
    metadata:
      labels:
        app: error-pages
    spec:
      containers:
        - name: error-pages
          image: nginx:alpine
          ports:
            - containerPort: 80
          volumeMounts:
            - name: error-pages-config
              mountPath: /usr/share/nginx/html
      volumes:
        - name: error-pages-config
          configMap:
            name: error-pages-content
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: error-pages-content
  namespace: default
data:
  404.html: |
    <!DOCTYPE html>
    <html>
    <head>
      <title>404 - Page Not Found</title>
      <style>
        body { font-family: Arial, sans-serif; text-align: center; margin-top: 100px; }
        h1 { font-size: 48px; color: #e74c3c; }
        p { font-size: 18px; color: #7f8c8d; }
      </style>
    </head>
    <body>
      <h1>404</h1>
      <p>Oops! The page you're looking for doesn't exist.</p>
      <p><a href="/">Go back home</a></p>
    </body>
    </html>
  500.html: |
    <!DOCTYPE html>
    <html>
    <head>
      <title>500 - Server Error</title>
      <style>
        body { font-family: Arial, sans-serif; text-align: center; margin-top: 100px; }
        h1 { font-size: 48px; color: #e74c3c; }
        p { font-size: 18px; color: #7f8c8d; }
      </style>
    </head>
    <body>
      <h1>500</h1>
      <p>Oops! Something went wrong on our end.</p>
      <p>Please try again later or contact support.</p>
    </body>
    </html>
---
# Pod Disruption Budget for High Availability
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: api-pdb
  namespace: default
spec:
  minAvailable: 2
  selector:
    matchLabels:
      app: api
---
# Resource Quota for Namespace
apiVersion: v1
kind: ResourceQuota
metadata:
  name: default-quota
  namespace: default
spec:
  hard:
    requests.cpu: "2"
    requests.memory: 4Gi
    limits.cpu: "4"
    limits.memory: 8Gi
    persistentvolumeclaims: "10"
    services: "20"
    secrets: "20"
    configmaps: "20"
💻 CronJob with Monitoring
Scheduled jobs with monitoring, alerting, and failure handling mechanisms
# Kubernetes CronJob with Monitoring and Alerting
# This example shows scheduled jobs with comprehensive monitoring
apiVersion: batch/v1
kind: CronJob
metadata:
  name: data-backup-cronjob
  namespace: default
  labels:
    app: backup
    type: database
spec:
  schedule: "0 2 * * *"  # Run at 2 AM every day
  concurrencyPolicy: Forbid  # Don't run concurrent jobs
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 5
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: backup
            type: database-backup
        spec:
          # Bind the RBAC defined later in this file to the job pods
          serviceAccountName: backup-service-account
          restartPolicy: OnFailure
          activeDeadlineSeconds: 3600  # 1 hour timeout
          containers:
            - name: backup-container
              # NOTE(review): postgres:15-alpine ships pg_dump but not the
              # AWS CLI or curl — use/build an image that includes them
              # before relying on the S3 upload and Slack steps below.
              image: postgres:15-alpine
              env:
                - name: PGPASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: database-secrets
                      key: POSTGRES_PASSWORD
                - name: PGHOST
                  value: "postgresql-service.default.svc.cluster.local"
                - name: PGUSER
                  value: "postgres"
                - name: PGDATABASE
                  value: "production"
                - name: S3_BUCKET
                  value: "s3://company-backups/database"
                - name: SLACK_WEBHOOK
                  valueFrom:
                    secretKeyRef:
                      name: notifications-secrets
                      key: SLACK_WEBHOOK_URL
              command:
                - /bin/bash
                - -c
                - |
                  set -e
                  echo "Starting backup process at $(date)"
                  # Kubernetes performs no command substitution in `env:`
                  # values, so derive the backup date in the script itself
                  # (a "$(date ...)" env value would stay a literal string).
                  BACKUP_DATE="$(date +%Y-%m-%d)"
                  # Create backup directory
                  BACKUP_DIR="/tmp/backup-$(date +%Y%m%d-%H%M%S)"
                  mkdir -p "$BACKUP_DIR"
                  # Database backup
                  echo "Creating database backup..."
                  pg_dump -h "$PGHOST" -U "$PGUSER" -d "$PGDATABASE" > "$BACKUP_DIR/database_$(date +%Y%m%d).sql"
                  # Compress backup
                  echo "Compressing backup..."
                  gzip "$BACKUP_DIR/database_$(date +%Y%m%d).sql"
                  # Upload to S3
                  echo "Uploading to S3..."
                  aws s3 cp "$BACKUP_DIR/database_$(date +%Y%m%d).sql.gz" "$S3_BUCKET/$(date +%Y)/"
                  # Verify upload
                  if aws s3 ls "$S3_BUCKET/$(date +%Y)/database_$(date +%Y%m%d).sql.gz"; then
                    echo "Backup completed successfully!"
                    # Send success notification
                    curl -X POST -H 'Content-type: application/json' --data '{"text":"✅ Database backup completed successfully for '$BACKUP_DATE'"}' "$SLACK_WEBHOOK"
                  else
                    echo "Backup verification failed!"
                    # Send failure notification
                    curl -X POST -H 'Content-type: application/json' --data '{"text":"❌ Database backup verification failed for '$BACKUP_DATE'"}' "$SLACK_WEBHOOK"
                    exit 1
                  fi
                  # Cleanup
                  rm -rf "$BACKUP_DIR"
                  echo "Backup process completed at $(date)"
              resources:
                requests:
                  memory: "512Mi"
                  cpu: "250m"
                limits:
                  memory: "1Gi"
                  cpu: "500m"
              volumeMounts:
                - name: backup-temp
                  mountPath: /tmp
                - name: backup-config
                  mountPath: /etc/backup-config
                  readOnly: true
          volumes:
            - name: backup-temp
              emptyDir: {}
            - name: backup-config
              configMap:
                name: backup-config
          # Node selector for backup jobs
          nodeSelector:
            node-type: worker
            backup-capable: "true"
          tolerations:
            - key: "backup-job"
              operator: "Equal"
              value: "true"
              effect: "NoSchedule"
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: node-type
                        operator: In
                        values: ["worker"]
---
# Backup Configuration ConfigMap
apiVersion: v1
kind: ConfigMap
metadata:
  name: backup-config
  namespace: default
data:
  backup.conf: |
    # Backup Configuration
    RETENTION_DAYS=30
    S3_REGION=us-west-2
    S3_STORAGE_CLASS=STANDARD_IA
    BACKUP_TYPE=full
    ENCRYPTION_ENABLED=true
  retention-policy.sh: |
    #!/bin/bash
    # Cleanup old backups (retention policy)
    S3_BUCKET="s3://company-backups/database"
    RETENTION_DAYS=30
    aws s3 ls "$S3_BUCKET/" --recursive | while read -r line; do
      file_date=$(echo "$line" | awk '{print $1, $2}')
      file_path=$(echo "$line" | awk '{print $4}')
      file_timestamp=$(date -d"$file_date" +%s)
      current_timestamp=$(date +%s)
      age_days=$(( (current_timestamp - file_timestamp) / 86400 ))
      if [ "$age_days" -gt "$RETENTION_DAYS" ]; then
        echo "Deleting old backup: $file_path (age: $age_days days)"
        aws s3 rm "$S3_BUCKET/$file_path"
      fi
    done
---
# Service Account for Backup Jobs
apiVersion: v1
kind: ServiceAccount
metadata:
  name: backup-service-account
  namespace: default
automountServiceAccountToken: true
---
# Role for Backup Operations
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: backup-role
  namespace: default
rules:
  - apiGroups: [""]
    resources: ["pods", "pods/log"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["batch"]
    resources: ["jobs", "cronjobs"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "patch"]
---
# Role Binding
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: backup-role-binding
  namespace: default
subjects:
  - kind: ServiceAccount
    name: backup-service-account
    namespace: default
roleRef:
  kind: Role
  name: backup-role
  apiGroup: rbac.authorization.k8s.io
---
# Monitoring Service for Backup Jobs
apiVersion: v1
kind: Service
metadata:
  name: backup-metrics
  namespace: default
  labels:
    app: backup-metrics
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "9090"
    prometheus.io/path: "/metrics"
spec:
  type: ClusterIP
  ports:
    - port: 9090
      targetPort: 9090
      name: metrics
  selector:
    app: backup-metrics
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: backup-metrics
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: backup-metrics
  template:
    metadata:
      labels:
        app: backup-metrics
    spec:
      serviceAccountName: backup-service-account
      containers:
        - name: metrics-exporter
          # NOTE(review): this image does not appear to ship kubectl or jq —
          # confirm, or swap in an image that bundles both before using this
          # export loop.
          image: prometheus-operator/prometheus-config-reloader:v0.43.2
          command:
            - /bin/sh
            - -c
            - |
              # Export backup job metrics
              while true; do
                kubectl get cronjobs -n default -o json | jq '.items[] |
                  select(.metadata.name | startswith("backup")) |
                  {
                    job_name: .metadata.name,
                    last_schedule: .status.lastScheduleTime,
                    successful_jobs: .status.successfulJobsHistoryLimit,
                    failed_jobs: .status.failedJobsHistoryLimit
                  }'
                kubectl get jobs -n default -l type=database-backup -o json | jq '.items[] |
                  {
                    job_name: .metadata.name,
                    start_time: .status.startTime,
                    completion_time: .status.completionTime,
                    succeeded: .status.succeeded,
                    failed: .status.failed,
                    active: .status.active
                  }'
                sleep 30
              done
          ports:
            - containerPort: 9090
              name: metrics
          resources:
            requests:
              memory: "64Mi"
              cpu: "50m"
            limits:
              memory: "128Mi"
              cpu: "100m"
---
# Prometheus Rule for Backup Alerting
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: backup-alerting-rules
  namespace: default
  labels:
    app: prometheus-operator
spec:
  groups:
    - name: backup.rules
      rules:
        - alert: BackupJobFailed
          expr: kube_job_status_failed{job=~".*backup.*"} == 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Backup job {{ $labels.job_name }} failed"
            description: "Backup job {{ $labels.job_name }} has failed. Check the job logs for details."
        - alert: BackupJobNotRun
          expr: time() - kube_cronjob_status_last_schedule_time{job=~".*backup.*"} > 86400
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Backup job {{ $labels.job_name }} hasn't run in 24 hours"
            description: "Backup job {{ $labels.job_name }} was scheduled but hasn't run in the last 24 hours."
        - alert: BackupJobLongRunning
          expr: kube_job_status_active{job=~".*backup.*"} > 0
          for: 2h
          labels:
            severity: warning
          annotations:
            summary: "Backup job {{ $labels.job_name }} is running for more than 2 hours"
            description: "Backup job {{ $labels.job_name }} has been active for more than 2 hours."
---
# Horizontal Pod Autoscaler for Backup Metrics
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: backup-metrics-hpa
  namespace: default
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: backup-metrics
  minReplicas: 1
  maxReplicas: 3
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
---
# PodMonitor for Custom Metrics
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: backup-pods
  namespace: default
  labels:
    app: prometheus-operator
spec:
  selector:
    matchLabels:
      type: database-backup
  namespaceSelector:
    matchNames:
      - default
  podMetricsEndpoints:
    - port: metrics
      interval: 30s
      path: /metrics