🎯 Recommended Samples
A balanced collection of samples from various categories for you to explore
Kubernetes YAML Samples
Complete Kubernetes YAML manifests for Deployment, Service, ConfigMap, Secret, Ingress, and other essential resources
💻 Basic Kubernetes Deployment
🟢 simple
⭐
Simple deployment manifest for running a web application with multiple replicas
⏱️ 10 min
🏷️ kubernetes, deployment, containers
Prerequisites:
Kubernetes basics, YAML syntax
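If you want to sanity-check the manifest before creating anything, a dry run is enough (assuming you save the sample below as basic-deployment.yaml; the filename is arbitrary):
# Validate the YAML client-side without touching the cluster
kubectl apply --dry-run=client -f basic-deployment.yaml
# Let the API server validate it as well (requires cluster access)
kubectl apply --dry-run=server -f basic-deployment.yaml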
# Basic Kubernetes Deployment Example
# This file creates a simple deployment for a web application
apiVersion: apps/v1
kind: Deployment
metadata:
name: web-app-deployment
namespace: default
labels:
app: web-app
version: v1
environment: production
spec:
replicas: 3 # Number of pod replicas
selector:
matchLabels:
app: web-app
template:
metadata:
labels:
app: web-app
version: v1
spec:
containers:
- name: web-app-container
image: nginx:1.21-alpine # Container image
ports:
- containerPort: 80
name: http
protocol: TCP
resources:
requests:
memory: "64Mi"
cpu: "250m" # 0.25 CPU cores
limits:
memory: "128Mi"
cpu: "500m" # 0.5 CPU cores
livenessProbe:
httpGet:
path: /
port: 80
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
httpGet:
path: /
port: 80
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
env:
- name: APP_NAME
value: "MyWebApp"
- name: APP_VERSION
value: "1.0.0"
volumeMounts:
- name: config-volume
mountPath: /etc/config
readOnly: true
volumes:
- name: config-volume
configMap:
name: web-app-config
restartPolicy: Always
terminationGracePeriodSeconds: 30
dnsPolicy: ClusterFirst
      # NOTE: the stock nginx image needs to write root-owned paths at startup; to run as
      # non-root as configured here, consider an unprivileged variant such as
      # nginxinc/nginx-unprivileged (which listens on port 8080).
      securityContext:
runAsNonRoot: true
runAsUser: 101
fsGroup: 101
---
# Service to expose the deployment
apiVersion: v1
kind: Service
metadata:
name: web-app-service
namespace: default
labels:
app: web-app
spec:
type: ClusterIP # Internal service
ports:
- port: 80
targetPort: 80
protocol: TCP
name: http
selector:
app: web-app
---
# ConfigMap for application configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: web-app-config
namespace: default
labels:
app: web-app
data:
app.properties: |
server.port=8080
app.name=WebApp
app.version=1.0.0
logging.level.root=INFO
  # Example server block; for nginx to actually load it, mount this key at /etc/nginx/conf.d/default.conf
  nginx.conf: |
server {
listen 80;
server_name localhost;
location / {
root /usr/share/nginx/html;
index index.html;
}
location /api {
proxy_pass http://backend:3000;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
}
location /health {
access_log off;
return 200 "healthy\n";
add_header Content-Type text/plain;
}
}
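A minimal way to roll out the sample above and confirm it is serving traffic (the resource names match the manifests; port 8080 on the local side is arbitrary):
kubectl apply -f basic-deployment.yaml
kubectl rollout status deployment/web-app-deployment
kubectl get pods -l app=web-app
# Forward the ClusterIP service locally and open http://localhost:8080
kubectl port-forward service/web-app-service 8080:80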
💻 Node.js Application with Database
🟡 intermediate
⭐⭐⭐
Complete Node.js application setup with a PostgreSQL database, Redis cache, environment variables, health checks, and horizontal autoscaling
⏱️ 25 min
🏷️ kubernetes, nodejs, database, microservices
Prerequisites:
Kubernetes basics, Node.js, Database concepts, HPA
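Rather than hand-encoding the base64 values in the Secret below, you can let kubectl generate the manifest; a minimal sketch using the same placeholder values as this sample (add the remaining keys the same way):
kubectl create secret generic node-app-secrets \
  --namespace node-app \
  --from-literal=DATABASE_HOST=postgresql-service \
  --from-literal=DATABASE_PORT=5432 \
  --from-literal=DATABASE_PASSWORD=super_secret_pass \
  --dry-run=client -o yaml > node-app-secrets.yaml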
# Node.js Application with PostgreSQL Database
# This example shows a complete web application setup
apiVersion: v1
kind: Namespace
metadata:
name: node-app
---
# ConfigMap for application settings
apiVersion: v1
kind: ConfigMap
metadata:
name: node-app-config
namespace: node-app
data:
NODE_ENV: "production"
LOG_LEVEL: "info"
API_PORT: "3000"
REDIS_HOST: "redis-service"
REDIS_PORT: "6379"
  # NOTE: Kubernetes does not substitute the $VARIABLE placeholders below; the application
  # must resolve them from its environment (or via a tool such as envsubst) at startup.
  app.conf: |
{
"server": {
"port": 3000,
"host": "0.0.0.0"
},
"database": {
"host": "$DATABASE_HOST",
"port": "$DATABASE_PORT",
"name": "$DATABASE_NAME",
"ssl": true
},
"redis": {
"host": "$REDIS_HOST",
"port": "$REDIS_PORT",
"ttl": 3600
},
"logging": {
"level": "$LOG_LEVEL",
"format": "json"
}
}
---
# Secret for sensitive data
apiVersion: v1
kind: Secret
metadata:
name: node-app-secrets
namespace: node-app
type: Opaque
data:
# Base64 encoded values
  DATABASE_HOST: cG9zdGdyZXNxbC1zZXJ2aWNl # postgresql-service
  DATABASE_PORT: NTQzMg== # 5432
DATABASE_NAME: bm9kZWFwcGRi # nodeappdb
DATABASE_USER: YXBwX3VzZXI= # app_user
  DATABASE_PASSWORD: c3VwZXJfc2VjcmV0X3Bhc3M= # super_secret_pass
JWT_SECRET: bm90X3NvX3NlY3JldF9qd3Rfa2V5 # not_so_secret_jwt_key
REDIS_PASSWORD: cmVkaXNfcGFzc3dvcmQ= # redis_password
---
# PostgreSQL StatefulSet
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: postgresql
namespace: node-app
spec:
serviceName: postgresql-service
replicas: 1
selector:
matchLabels:
app: postgresql
template:
metadata:
labels:
app: postgresql
spec:
containers:
- name: postgresql
        image: postgres:15-alpine
        args: ["-c", "config_file=/etc/postgresql/postgresql.conf"] # load the ConfigMap-mounted config below
env:
        - name: POSTGRES_DB
          valueFrom:
            secretKeyRef:
              name: node-app-secrets # DATABASE_NAME is defined in the Secret, not the ConfigMap
              key: DATABASE_NAME
- name: POSTGRES_USER
valueFrom:
secretKeyRef:
name: node-app-secrets
key: DATABASE_USER
- name: POSTGRES_PASSWORD
valueFrom:
secretKeyRef:
name: node-app-secrets
key: DATABASE_PASSWORD
- name: PGDATA
value: /var/lib/postgresql/data/pgdata
ports:
- containerPort: 5432
name: postgresql
volumeMounts:
- name: postgresql-storage
mountPath: /var/lib/postgresql/data
- name: postgresql-config
mountPath: /etc/postgresql/postgresql.conf
subPath: postgresql.conf
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
livenessProbe:
exec:
command:
- pg_isready
- -U
- postgres
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
exec:
command:
- pg_isready
- -U
- postgres
initialDelaySeconds: 5
periodSeconds: 5
volumes:
- name: postgresql-config
configMap:
name: postgresql-config
volumeClaimTemplates:
- metadata:
name: postgresql-storage
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 10Gi
storageClassName: fast-ssd # Adjust based on your cluster
---
# PostgreSQL Service
apiVersion: v1
kind: Service
metadata:
name: postgresql-service
namespace: node-app
labels:
app: postgresql
spec:
type: ClusterIP
ports:
- port: 5432
targetPort: 5432
protocol: TCP
name: postgresql
selector:
app: postgresql
---
# PostgreSQL Configuration
apiVersion: v1
kind: ConfigMap
metadata:
name: postgresql-config
namespace: node-app
data:
postgresql.conf: |
# PostgreSQL Configuration
listen_addresses = '*'
port = 5432
max_connections = 100
shared_buffers = 128MB
    effective_cache_size = 384MB # keep in line with the 512Mi container memory limit
maintenance_work_mem = 64MB
checkpoint_completion_target = 0.9
wal_buffers = 16MB
default_statistics_target = 100
random_page_cost = 1.1
effective_io_concurrency = 200
work_mem = 4MB
min_wal_size = 1GB
max_wal_size = 4GB
logging_collector = on
log_directory = 'pg_log'
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
log_statement = 'all'
log_min_duration_statement = 1000
---
# Redis Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: redis
namespace: node-app
spec:
replicas: 1
selector:
matchLabels:
app: redis
template:
metadata:
labels:
app: redis
spec:
containers:
- name: redis
image: redis:7-alpine
command:
- redis-server
- --requirepass
- $(REDIS_PASSWORD)
env:
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: node-app-secrets
key: REDIS_PASSWORD
ports:
- containerPort: 6379
name: redis
resources:
requests:
memory: "128Mi"
cpu: "100m"
limits:
memory: "256Mi"
cpu: "200m"
        # Wrap the check in a shell: $(VAR) expansion in probe commands is not performed for
        # env vars sourced from a Secret, but the shell sees them at runtime.
        livenessProbe:
          exec:
            command:
            - sh
            - -c
            - redis-cli -a "$REDIS_PASSWORD" ping
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          exec:
            command:
            - sh
            - -c
            - redis-cli -a "$REDIS_PASSWORD" ping
          initialDelaySeconds: 5
          periodSeconds: 5
---
# Redis Service
apiVersion: v1
kind: Service
metadata:
name: redis-service
namespace: node-app
labels:
app: redis
spec:
type: ClusterIP
ports:
- port: 6379
targetPort: 6379
protocol: TCP
name: redis
selector:
app: redis
---
# Node.js Application Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: node-app
namespace: node-app
labels:
app: node-app
version: v1
spec:
replicas: 3
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1
maxUnavailable: 0
selector:
matchLabels:
app: node-app
template:
metadata:
labels:
app: node-app
version: v1
spec:
containers:
- name: node-app
image: node-app:1.0.0 # Replace with your actual image
envFrom:
- configMapRef:
name: node-app-config
env:
- name: DATABASE_HOST
valueFrom:
secretKeyRef:
name: node-app-secrets
key: DATABASE_HOST
- name: DATABASE_PORT
valueFrom:
secretKeyRef:
name: node-app-secrets
key: DATABASE_PORT
        - name: DATABASE_NAME
          valueFrom:
            secretKeyRef:
              name: node-app-secrets # defined in the Secret above, not in the ConfigMap
              key: DATABASE_NAME
- name: DATABASE_USER
valueFrom:
secretKeyRef:
name: node-app-secrets
key: DATABASE_USER
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: node-app-secrets
key: DATABASE_PASSWORD
- name: JWT_SECRET
valueFrom:
secretKeyRef:
name: node-app-secrets
key: JWT_SECRET
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: node-app-secrets
key: REDIS_PASSWORD
ports:
- containerPort: 3000
name: http
resources:
requests:
memory: "256Mi"
cpu: "200m"
limits:
memory: "512Mi"
cpu: "500m"
livenessProbe:
httpGet:
path: /health
port: 3000
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
httpGet:
path: /ready
port: 3000
initialDelaySeconds: 10
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
startupProbe:
httpGet:
path: /health
port: 3000
initialDelaySeconds: 10
periodSeconds: 10
timeoutSeconds: 3
failureThreshold: 30
volumeMounts:
- name: config-volume
mountPath: /app/config
readOnly: true
- name: logs-volume
mountPath: /app/logs
volumes:
- name: config-volume
configMap:
name: node-app-config
items:
- key: app.conf
path: app.conf
- name: logs-volume
emptyDir: {}
imagePullSecrets:
- name: registry-secret # For private Docker registry
---
# Node.js Application Service
apiVersion: v1
kind: Service
metadata:
name: node-app-service
namespace: node-app
labels:
app: node-app
spec:
type: ClusterIP
ports:
- port: 80
targetPort: 3000
protocol: TCP
name: http
selector:
app: node-app
---
# Horizontal Pod Autoscaler
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: node-app-hpa
namespace: node-app
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: node-app
minReplicas: 2
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
behavior:
scaleDown:
stabilizationWindowSeconds: 300
policies:
- type: Percent
value: 10
periodSeconds: 60
scaleUp:
stabilizationWindowSeconds: 0
policies:
- type: Percent
value: 100
periodSeconds: 15
- type: Pods
value: 4
periodSeconds: 15
selectPolicy: Max
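Once the container image reference is updated, the whole stack can be applied in one pass and checked roughly like this (assuming the manifests above are saved as node-app.yaml; the HPA only reports utilization if the metrics-server add-on is installed):
kubectl apply -f node-app.yaml
kubectl get pods -n node-app -w                  # wait for postgresql, redis and node-app to become Ready
kubectl rollout status deployment/node-app -n node-app
kubectl get hpa node-app-hpa -n node-app         # confirm CPU/memory metrics are being picked up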
💻 Ingress with SSL Termination
🟡 intermediate
⭐⭐⭐⭐
Advanced ingress configuration with SSL certificates, path-based routing, and load balancing
⏱️ 20 min
🏷️ kubernetes, ingress, ssl, load-balancer
Prerequisites:
Kubernetes Ingress, SSL certificates, NGINX controller
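The manifests below assume an NGINX Ingress Controller and cert-manager are already installed; a quick sanity check, assuming they live in their conventional namespaces:
kubectl get pods -n ingress-nginx
kubectl get pods -n cert-manager
kubectl get ingressclass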
# Ingress Controller with SSL Termination and Path-Based Routing
# This example shows how to expose multiple services with HTTPS
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: web-app-ingress
namespace: default
annotations:
    kubernetes.io/ingress.class: "nginx" # Deprecated annotation; on newer clusters prefer spec.ingressClassName
cert-manager.io/cluster-issuer: "letsencrypt-prod" # For automatic SSL certificates
nginx.ingress.kubernetes.io/ssl-redirect: "true" # Redirect HTTP to HTTPS
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    nginx.ingress.kubernetes.io/limit-connections: "100" # Per-client-IP rate limiting
    nginx.ingress.kubernetes.io/limit-rps: "50"
    nginx.ingress.kubernetes.io/limit-burst-multiplier: "2" # burst = 2 x limit-rps
    nginx.ingress.kubernetes.io/proxy-body-size: "50m" # Max request body / upload size
    nginx.ingress.kubernetes.io/proxy-read-timeout: "300"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "300"
# CORS settings
nginx.ingress.kubernetes.io/enable-cors: "true"
nginx.ingress.kubernetes.io/cors-allow-origin: "*"
nginx.ingress.kubernetes.io/cors-allow-methods: "GET, POST, PUT, DELETE, OPTIONS"
nginx.ingress.kubernetes.io/cors-allow-headers: "DNT,X-CustomHeader,Keep-Alive,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Authorization"
# Custom error pages
nginx.ingress.kubernetes.io/default-backend: "error-page-service"
spec:
tls:
- hosts:
- api.example.com
- app.example.com
- admin.example.com
secretName: example-com-tls # TLS certificate secret
rules:
- host: api.example.com
http:
paths:
- path: /api/v1
pathType: Prefix
backend:
service:
name: api-v1-service
port:
number: 80
- path: /api/v2
pathType: Prefix
backend:
service:
name: api-v2-service
port:
number: 80
- path: /health
pathType: Prefix
backend:
service:
name: api-health-service
port:
number: 80
- host: app.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: frontend-service
port:
number: 80
- path: /static
pathType: Prefix
backend:
service:
name: static-assets-service
port:
number: 80
- host: admin.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: admin-dashboard-service
port:
number: 80
---
# ClusterIssuer for automatic certificate issuance (cert-manager)
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
name: letsencrypt-prod
spec:
acme:
server: https://acme-v02.api.letsencrypt.org/directory
email: [email protected]
privateKeySecretRef:
name: letsencrypt-prod-account-key
solvers:
- http01:
ingress:
class: nginx
---
# TLS Certificate Secret (if not using cert-manager)
apiVersion: v1
kind: Secret
metadata:
name: example-com-tls
namespace: default
type: kubernetes.io/tls
data:
tls.crt: LS0tLS1CRUdJTi... # Base64 encoded certificate
tls.key: LS0tLS1CRUdJTi... # Base64 encoded private key
---
# Backend Services for Ingress
apiVersion: v1
kind: Service
metadata:
name: api-v1-service
namespace: default
labels:
app: api
version: v1
spec:
type: ClusterIP
ports:
- port: 80
targetPort: 3000
protocol: TCP
name: http
selector:
app: api
version: v1
---
apiVersion: v1
kind: Service
metadata:
name: frontend-service
namespace: default
labels:
app: frontend
spec:
type: ClusterIP
ports:
- port: 80
targetPort: 80
protocol: TCP
name: http
selector:
app: frontend
---
# Network Policy for Security
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: ingress-network-policy
namespace: default
spec:
podSelector: {}
policyTypes:
- Ingress
ingress:
- from:
- namespaceSelector:
matchLabels:
name: ingress-nginx
- podSelector: {}
ports:
- protocol: TCP
port: 80
- protocol: TCP
port: 443
---
# NGINX Ingress Controller ConfigMap (global connection and proxy tuning)
apiVersion: v1
kind: ConfigMap
metadata:
name: nginx-configuration
namespace: ingress-nginx
labels:
app.kubernetes.io/name: ingress-nginx
data:
use-proxy-protocol: "true"
proxy-protocol-header-pattern: "^\[?([0-9a-zA-Z\.-]+)\]?:([0-9]+)$"
limit-connections: "100"
limit-rps: "50"
limit-burst: "100"
client-body-buffer-size: "64k"
proxy-buffering: "on"
proxy-buffer-size: "4k"
proxy-buffers-number: "4"
---
# Custom Error Page Backend
apiVersion: v1
kind: Service
metadata:
name: error-page-service
namespace: default
spec:
type: ClusterIP
ports:
- port: 80
targetPort: 80
selector:
app: error-pages
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: error-pages
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: error-pages
template:
metadata:
labels:
app: error-pages
spec:
containers:
- name: error-pages
image: nginx:alpine
ports:
- containerPort: 80
volumeMounts:
- name: error-pages-config
mountPath: /usr/share/nginx/html
volumes:
- name: error-pages-config
configMap:
name: error-pages-content
---
apiVersion: v1
kind: ConfigMap
metadata:
name: error-pages-content
namespace: default
data:
404.html: |
<!DOCTYPE html>
<html>
<head>
<title>404 - Page Not Found</title>
<style>
body { font-family: Arial, sans-serif; text-align: center; margin-top: 100px; }
h1 { font-size: 48px; color: #e74c3c; }
p { font-size: 18px; color: #7f8c8d; }
</style>
</head>
<body>
<h1>404</h1>
<p>Oops! The page you're looking for doesn't exist.</p>
<p><a href="/">Go back home</a></p>
</body>
</html>
500.html: |
<!DOCTYPE html>
<html>
<head>
<title>500 - Server Error</title>
<style>
body { font-family: Arial, sans-serif; text-align: center; margin-top: 100px; }
h1 { font-size: 48px; color: #e74c3c; }
p { font-size: 18px; color: #7f8c8d; }
</style>
</head>
<body>
<h1>500</h1>
<p>Oops! Something went wrong on our end.</p>
<p>Please try again later or contact support.</p>
</body>
</html>
---
# Pod Disruption Budget for High Availability
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: api-pdb
namespace: default
spec:
minAvailable: 2
selector:
matchLabels:
app: api
---
# Resource Quota for Namespace
apiVersion: v1
kind: ResourceQuota
metadata:
name: default-quota
namespace: default
spec:
hard:
requests.cpu: "2"
requests.memory: 4Gi
limits.cpu: "4"
limits.memory: 8Gi
persistentvolumeclaims: "10"
services: "20"
secrets: "20"
configmaps: "20"
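After applying, it is worth confirming that the Ingress received an address and that cert-manager issued the certificate (the Certificate object is created automatically from the cluster-issuer annotation; the example.com hostnames are the placeholders used above):
kubectl get ingress web-app-ingress
kubectl describe certificate example-com-tls     # requires the cert-manager CRDs
kubectl get secret example-com-tls -o jsonpath='{.type}'
curl -I https://app.example.com/                 # expect a response once DNS points at the ingress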
💻 CronJob with Monitoring
🔴 complex
⭐⭐⭐⭐
Scheduled jobs with monitoring, alerting, and failure handling mechanisms
⏱️ 35 min
🏷️ kubernetes, cronjob, monitoring, backup
Prerequisites:
Kubernetes CronJob, Prometheus monitoring, RBAC
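The PrometheusRule and PodMonitor at the end of this sample only apply cleanly if the Prometheus Operator CRDs are present; you can check up front:
kubectl get crd prometheusrules.monitoring.coreos.com podmonitors.monitoring.coreos.com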
# Kubernetes CronJob with Monitoring and Alerting
# This example shows scheduled jobs with comprehensive monitoring
apiVersion: batch/v1
kind: CronJob
metadata:
name: data-backup-cronjob
namespace: default
labels:
app: backup
type: database
spec:
schedule: "0 2 * * *" # Run at 2 AM every day
concurrencyPolicy: Forbid # Don't run concurrent jobs
successfulJobsHistoryLimit: 3
failedJobsHistoryLimit: 5
jobTemplate:
spec:
template:
metadata:
labels:
app: backup
type: database-backup
spec:
restartPolicy: OnFailure
activeDeadlineSeconds: 3600 # 1 hour timeout
containers:
- name: backup-container
            image: postgres:15-alpine # NOTE: the script also needs aws-cli and curl; in practice use a custom backup image that bundles them
env:
- name: PGPASSWORD
valueFrom:
secretKeyRef:
name: database-secrets
key: POSTGRES_PASSWORD
- name: PGHOST
value: "postgresql-service.default.svc.cluster.local"
- name: PGUSER
value: "postgres"
- name: PGDATABASE
value: "production"
- name: S3_BUCKET
value: "s3://company-backups/database"
- name: SLACK_WEBHOOK
valueFrom:
secretKeyRef:
name: notifications-secrets
key: SLACK_WEBHOOK_URL
command:
- /bin/bash
- -c
- |
              set -e
              BACKUP_DATE=$(date +%Y-%m-%d) # computed in the shell; $(...) is not evaluated in Kubernetes env values
              echo "Starting backup process at $(date)"
# Create backup directory
BACKUP_DIR="/tmp/backup-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$BACKUP_DIR"
# Database backup
echo "Creating database backup..."
pg_dump -h "$PGHOST" -U "$PGUSER" -d "$PGDATABASE" > "$BACKUP_DIR/database_$(date +%Y%m%d).sql"
# Compress backup
echo "Compressing backup..."
gzip "$BACKUP_DIR/database_$(date +%Y%m%d).sql"
# Upload to S3
echo "Uploading to S3..."
aws s3 cp "$BACKUP_DIR/database_$(date +%Y%m%d).sql.gz" "$S3_BUCKET/$(date +%Y)/"
# Verify upload
if aws s3 ls "$S3_BUCKET/$(date +%Y)/database_$(date +%Y%m%d).sql.gz"; then
echo "Backup completed successfully!"
# Send success notification
curl -X POST -H 'Content-type: application/json' --data '{"text":"✅ Database backup completed successfully for '$BACKUP_DATE'"}' "$SLACK_WEBHOOK"
else
echo "Backup verification failed!"
# Send failure notification
curl -X POST -H 'Content-type: application/json' --data '{"text":"❌ Database backup verification failed for '$BACKUP_DATE'"}' "$SLACK_WEBHOOK"
exit 1
fi
# Cleanup
rm -rf "$BACKUP_DIR"
echo "Backup process completed at $(date)"
resources:
requests:
memory: "512Mi"
cpu: "250m"
limits:
memory: "1Gi"
cpu: "500m"
volumeMounts:
- name: backup-temp
mountPath: /tmp
- name: backup-config
mountPath: /etc/backup-config
readOnly: true
volumes:
- name: backup-temp
emptyDir: {}
- name: backup-config
configMap:
name: backup-config
# Node selector for backup jobs
nodeSelector:
node-type: worker
backup-capable: "true"
tolerations:
- key: "backup-job"
operator: "Equal"
value: "true"
effect: "NoSchedule"
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-type
operator: In
values: ["worker"]
---
# Backup Configuration ConfigMap
apiVersion: v1
kind: ConfigMap
metadata:
name: backup-config
namespace: default
data:
backup.conf: |
# Backup Configuration
RETENTION_DAYS=30
S3_REGION=us-west-2
S3_STORAGE_CLASS=STANDARD_IA
BACKUP_TYPE=full
ENCRYPTION_ENABLED=true
retention-policy.sh: |
#!/bin/bash
# Cleanup old backups (retention policy)
S3_BUCKET="s3://company-backups/database"
RETENTION_DAYS=30
aws s3 ls "$S3_BUCKET/" --recursive | while read -r line; do
file_date=$(echo "$line" | awk '{print $1, $2}')
file_path=$(echo "$line" | awk '{print $4}')
file_timestamp=$(date -d"$file_date" +%s)
current_timestamp=$(date +%s)
age_days=$(( (current_timestamp - file_timestamp) / 86400 ))
if [ "$age_days" -gt "$RETENTION_DAYS" ]; then
echo "Deleting old backup: $file_path (age: $age_days days)"
aws s3 rm "$S3_BUCKET/$file_path"
fi
done
---
# Service Account for Backup Jobs
apiVersion: v1
kind: ServiceAccount
metadata:
name: backup-service-account
namespace: default
automountServiceAccountToken: true
---
# Role for Backup Operations
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: backup-role
namespace: default
rules:
- apiGroups: [""]
resources: ["pods", "pods/log"]
verbs: ["get", "list", "watch"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "patch"]
---
# Role Binding
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: backup-role-binding
namespace: default
subjects:
- kind: ServiceAccount
name: backup-service-account
namespace: default
roleRef:
kind: Role
name: backup-role
apiGroup: rbac.authorization.k8s.io
---
# Monitoring Service for Backup Jobs
apiVersion: v1
kind: Service
metadata:
name: backup-metrics
namespace: default
labels:
app: backup-metrics
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9090"
prometheus.io/path: "/metrics"
spec:
type: ClusterIP
ports:
- port: 9090
targetPort: 9090
name: metrics
selector:
app: backup-metrics
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: backup-metrics
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: backup-metrics
template:
metadata:
labels:
app: backup-metrics
spec:
serviceAccountName: backup-service-account
containers:
- name: metrics-exporter
        image: prometheus-operator/prometheus-config-reloader:v0.43.2 # replace with an image that actually provides kubectl and jq
command:
- /bin/sh
- -c
- |
          # Log backup job status (this simple loop only writes to stdout;
          # the alerting rules below rely on kube-state-metrics, not on this exporter)
while true; do
kubectl get cronjobs -n default -o json | jq '.items[] |
            select(.metadata.name | contains("backup")) |
{
job_name: .metadata.name,
last_schedule: .status.lastScheduleTime,
              successful_history_limit: .spec.successfulJobsHistoryLimit,
              failed_history_limit: .spec.failedJobsHistoryLimit
}'
kubectl get jobs -n default -l type=database-backup -o json | jq '.items[] |
{
job_name: .metadata.name,
start_time: .status.startTime,
completion_time: .status.completionTime,
succeeded: .status.succeeded,
failed: .status.failed,
active: .status.active
}'
sleep 30
done
ports:
- containerPort: 9090
name: metrics
resources:
requests:
memory: "64Mi"
cpu: "50m"
limits:
memory: "128Mi"
cpu: "100m"
---
# Prometheus Rule for Backup Alerting
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: backup-alerting-rules
namespace: default
labels:
app: prometheus-operator
spec:
groups:
- name: backup.rules
rules:
- alert: BackupJobFailed
      expr: kube_job_status_failed{job_name=~".*backup.*"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: "Backup job {{ $labels.job_name }} failed"
description: "Backup job {{ $labels.job_name }} has failed. Check the job logs for details."
- alert: BackupJobNotRun
      expr: time() - kube_cronjob_status_last_schedule_time{cronjob=~".*backup.*"} > 86400
for: 15m
labels:
severity: warning
annotations:
        summary: "Backup CronJob {{ $labels.cronjob }} hasn't run in 24 hours"
        description: "CronJob {{ $labels.cronjob }} was scheduled but hasn't run in the last 24 hours."
- alert: BackupJobLongRunning
      expr: kube_job_status_active{job_name=~".*backup.*"} > 0
for: 2h
labels:
severity: warning
annotations:
summary: "Backup job {{ $labels.job_name }} is running for more than 2 hours"
description: "Backup job {{ $labels.job_name }} has been active for more than 2 hours."
---
# Horizontal Pod Autoscaler for Backup Metrics
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: backup-metrics-hpa
namespace: default
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: backup-metrics
minReplicas: 1
maxReplicas: 3
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
---
# PodMonitor for Custom Metrics
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: backup-pods
namespace: default
labels:
app: prometheus-operator
spec:
selector:
matchLabels:
type: database-backup
namespaceSelector:
matchNames:
- default
podMetricsEndpoints:
- port: metrics
interval: 30s
path: /metrics
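Instead of waiting for the 2 AM schedule, you can trigger a one-off run from the CronJob and follow its logs (the job name manual-backup-test is arbitrary):
kubectl create job --from=cronjob/data-backup-cronjob manual-backup-test
kubectl get pods -l type=database-backup
kubectl logs job/manual-backup-test -f
kubectl delete job manual-backup-test            # clean up the test run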