Docker Compose 生产实践
Docker Compose 不仅适用于开发环境,也可以用于生产环境部署。本文将介绍如何在生产环境中安全、高效地使用 Docker Compose。
目录
生产环境考虑
1.1 生产环境 vs 开发环境
生产环境关键差异:
┌─────────────────┬──────────────────┬──────────────────┐
│ 方面 │ 开发环境 │ 生产环境 │
├─────────────────┼──────────────────┼──────────────────┤
│ 安全性 │ 宽松 │ 严格 │
│ 性能 │ 可接受 │ 优化 │
│ 可用性 │ 可中断 │ 高可用 │
│ 监控 │ 基础 │ 全面 │
│ 备份 │ 可选 │ 必需 │
│ 资源限制 │ 宽松 │ 严格 │
│ 日志管理 │ 控制台输出 │ 集中收集 │
│ 更新策略 │ 随时 │ 计划窗口 │
└─────────────────┴──────────────────┴──────────────────┘
1.2 生产环境检查清单
- [ ] 使用特定版本标签(避免 latest)
- [ ] 配置资源限制
- [ ] 启用健康检查
- [ ] 配置日志轮转
- [ ] 使用非 root 用户
- [ ] 配置重启策略
- [ ] 启用 TLS/SSL
- [ ] 配置防火墙规则
- [ ] 设置监控告警
- [ ] 制定备份策略
- [ ] 配置 CI/CD 流程
- [ ] 编写运维文档
安全配置
2.1 镜像安全
yaml
version: '3.8'
services:
web:
# 使用特定版本
image: nginx:1.25.3-alpine
# 不使用 latest
# image: nginx:latest # ❌ 不推荐
# 使用私有仓库
# image: myregistry.com/nginx:1.25.3-alpine
# 安全选项
read_only: true
user: "1000:1000"
security_opt:
- no-new-privileges:true
# 注意:seccomp:unconfined 会禁用默认的 seccomp 系统调用过滤,
# 生产环境应使用默认 profile 或自定义 profile,而非 unconfined
# - seccomp:unconfined # ❌ 不推荐
cap_drop:
- ALL
cap_add:
- NET_BIND_SERVICE
# 只读根文件系统 + 临时写入点
tmpfs:
- /tmp:noexec,nosuid,size=100m
- /var/cache/nginx:noexec,nosuid,size=100m
- /var/run:noexec,nosuid,size=100m
2.2 网络安全
yaml
version: '3.8'
services:
web:
image: nginx:alpine
ports:
- "80:80"
- "443:443"
networks:
- frontend
api:
image: myapi:latest
networks:
- frontend
- backend
# 不暴露端口到主机
expose:
- "8080"
db:
image: postgres:15-alpine
networks:
- backend
# 数据库不暴露到外部
# 只能通过 backend 网络访问
networks:
frontend:
driver: bridge
backend:
driver: bridge
internal: true # 无外部访问
ipam:
config:
- subnet: 172.20.0.0/16
2.3 密钥管理
yaml
version: '3.8'
services:
api:
image: myapi:latest
environment:
# 使用文件引用敏感数据
DB_PASSWORD_FILE: /run/secrets/db-password
API_KEY_FILE: /run/secrets/api-key
JWT_SECRET_FILE: /run/secrets/jwt-secret
secrets:
- db-password
- api-key
- jwt-secret
# 不使用环境变量传递敏感数据
# environment: # ❌ 不推荐
# DB_PASSWORD: mypassword
# API_KEY: myapikey
db:
image: postgres:15-alpine
environment:
POSTGRES_PASSWORD_FILE: /run/secrets/db-password
secrets:
- db-password
secrets:
db-password:
file: ./secrets/db-password.txt
# 或使用外部密钥管理
# external: true
api-key:
file: ./secrets/api-key.txt
jwt-secret:
file: ./secrets/jwt-secret.txt
2.4 TLS/SSL 配置
yaml
version: '3.8'
services:
nginx:
image: nginx:alpine
ports:
- "80:80"
- "443:443"
volumes:
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
- ./nginx/ssl:/etc/nginx/ssl:ro
- ./nginx/dhparam.pem:/etc/nginx/dhparam.pem:ro
configs:
- source: nginx-config
target: /etc/nginx/nginx.conf
secrets:
- ssl-certificate
- ssl-certificate-key
configs:
nginx-config:
file: ./nginx/nginx.conf
secrets:
ssl-certificate:
file: ./secrets/ssl/cert.pem
ssl-certificate-key:
file: ./secrets/ssl/key.pem
Nginx SSL 配置示例:
nginx
server {
listen 443 ssl http2;
server_name example.com;
ssl_certificate /run/secrets/ssl-certificate;
ssl_certificate_key /run/secrets/ssl-certificate-key;
# SSL 安全配置
ssl_protocols TLSv1.2 TLSv1.3;
ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256;
ssl_prefer_server_ciphers off;
ssl_dhparam /etc/nginx/dhparam.pem;
# HSTS
add_header Strict-Transport-Security "max-age=63072000" always;
# 其他配置...
}
server {
listen 80;
server_name example.com;
return 301 https://$server_name$request_uri;
}
高可用配置
3.1 服务副本和重启策略
yaml
version: '3.8'
services:
web:
image: nginx:alpine
deploy:
# 副本数
replicas: 3
# 更新配置
update_config:
parallelism: 1 # 每次更新 1 个
delay: 10s # 更新间隔
failure_action: rollback # 失败回滚
order: start-first # 先启动新容器
# 回滚配置
rollback_config:
parallelism: 1
delay: 10s
failure_action: pause
# 重启策略
restart_policy:
condition: any # 任何情况都重启
delay: 5s # 重启延迟
max_attempts: 3 # 最大重试次数
window: 120s # 统计窗口
# 放置约束
placement:
constraints:
- node.role == worker
preferences:
- spread: node.labels.zone
# 资源限制
resources:
limits:
cpus: '0.5'
memory: 256M
reservations:
cpus: '0.25'
memory: 128M
healthcheck:
test: ["CMD", "wget", "--quiet", "--tries=1", "--spider", "http://localhost/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
3.2 负载均衡
yaml
version: '3.8'
services:
nginx:
image: nginx:alpine
ports:
- "80:80"
- "443:443"
volumes:
- ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
depends_on:
- web
networks:
- frontend
web:
image: myapp:latest
deploy:
replicas: 3
restart_policy:
condition: on-failure
expose:
- "3000"
networks:
- frontend
- backend
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3000/health"]
interval: 30s
timeout: 10s
retries: 3
networks:
frontend:
driver: bridge
backend:
driver: bridge
internal: true
Nginx 负载均衡配置:
nginx
upstream web {
least_conn; # 最少连接算法
server web_1:3000 max_fails=3 fail_timeout=30s;
server web_2:3000 max_fails=3 fail_timeout=30s;
server web_3:3000 max_fails=3 fail_timeout=30s;
}
server {
listen 80;
location / {
proxy_pass http://web;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# 健康检查
proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
}
location /health {
access_log off;
return 200 "healthy\n";
add_header Content-Type text/plain;
}
}
3.3 数据库高可用
yaml
version: '3.8'
services:
db-primary:
image: postgres:15-alpine
environment:
POSTGRES_USER: app
POSTGRES_DB: app
POSTGRES_INITDB_ARGS: "--encoding=UTF-8"
PGDATA: /var/lib/postgresql/data/pgdata
volumes:
- db-primary-data:/var/lib/postgresql/data
- ./db/primary:/docker-entrypoint-initdb.d
networks:
- backend
healthcheck:
test: ["CMD-SHELL", "pg_isready -U app"]
interval: 10s
timeout: 5s
retries: 5
deploy:
placement:
constraints:
- node.labels.database == primary
db-replica:
image: postgres:15-alpine
environment:
POSTGRES_USER: app
POSTGRES_DB: app
PGDATA: /var/lib/postgresql/data/pgdata
REPLICATE_FROM: db-primary
volumes:
- db-replica-data:/var/lib/postgresql/data
- ./db/replica:/docker-entrypoint-initdb.d
networks:
- backend
depends_on:
- db-primary
healthcheck:
test: ["CMD-SHELL", "pg_isready -U app"]
interval: 10s
timeout: 5s
retries: 5
deploy:
placement:
constraints:
- node.labels.database == replica
pgpool:
image: pgpool/pgpool:latest
environment:
PGPOOL_BACKEND_NODES: "0:db-primary:5432,1:db-replica:5432"
PGPOOL_SR_CHECK_USER: app
PGPOOL_LOAD_BALANCE_MODE: "on"
ports:
- "5432:5432"
networks:
- backend
depends_on:
- db-primary
- db-replica
volumes:
db-primary-data:
db-replica-data:
networks:
backend:
internal: true
监控和日志
4.1 日志配置
yaml
version: '3.8'
services:
web:
image: myapp:latest
logging:
driver: "json-file"
options:
max-size: "10m"
max-file: "3"
labels: "service_name,environment"
env: "OS_VERSION,NODE_VERSION"
# 或使用 syslog
# logging:
# driver: syslog
# options:
# syslog-address: "tcp://192.168.1.100:514"
# tag: "docker/{{.Name}}"
# 或使用 fluentd
# logging:
# driver: fluentd
# options:
# fluentd-address: localhost:24224
# tag: docker.web
labels:
- "service_name=web"
- "environment=production"4.2 监控配置
yaml
version: '3.8'
services:
prometheus:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus-data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
networks:
- monitoring
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
volumes:
- grafana-data:/var/lib/grafana
- ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
- ./grafana/datasources:/etc/grafana/provisioning/datasources:ro
environment:
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD}
networks:
- monitoring
node-exporter:
image: prom/node-exporter:latest
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.rootfs=/rootfs'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
networks:
- monitoring
cadvisor:
image: gcr.io/cadvisor/cadvisor:latest
privileged: true
devices:
- /dev/kmsg:/dev/kmsg
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker:/var/lib/docker:ro
- /cgroup:/cgroup:ro
networks:
- monitoring
volumes:
prometheus-data:
grafana-data:
networks:
monitoring:
driver: bridge
4.3 告警配置
yaml
# prometheus/alert.rules.yml
groups:
- name: docker-alerts
rules:
- alert: ContainerDown
expr: up{job="docker-containers"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Container {{ $labels.name }} is down"
description: "Container {{ $labels.name }} has been down for more than 1 minute."
- alert: HighMemoryUsage
expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.85
for: 5m
labels:
severity: warning
annotations:
summary: "High memory usage on {{ $labels.name }}"
description: "Container {{ $labels.name }} is using more than 85% of memory limit."
- alert: HighCPUUsage
expr: rate(container_cpu_usage_seconds_total[5m]) > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage on {{ $labels.name }}"
description: "Container {{ $labels.name }} is using more than 80% of CPU."备份和恢复
5.1 数据备份
yaml
version: '3.8'
services:
backup:
image: offen/docker-volume-backup:latest
environment:
BACKUP_CRON_EXPRESSION: "0 2 * * *"
BACKUP_RETENTION_DAYS: "30"
AWS_S3_BUCKET_NAME: ${AWS_S3_BUCKET}
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY}
AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_KEY}
AWS_DEFAULT_REGION: ${AWS_REGION}
BACKUP_FILENAME: backup-%Y-%m-%dT%H-%M-%S.tar.gz
volumes:
- db-data:/backup/data:ro
- redis-data:/backup/redis:ro
- /var/run/docker.sock:/var/run/docker.sock:ro
networks:
- backend
volumes:
db-data:
external: true
redis-data:
external: true
networks:
backend:
external: true
5.2 手动备份脚本
bash
#!/bin/bash
# backup.sh
BACKUP_DIR="/backup/$(date +%Y%m%d_%H%M%S)"
mkdir -p $BACKUP_DIR
# 备份数据库
docker-compose exec -T db pg_dump -U app app > $BACKUP_DIR/database.sql
# 备份卷数据
docker run --rm \
-v myapp_db-data:/data:ro \
-v $BACKUP_DIR:/backup \
alpine tar czf /backup/db-data.tar.gz -C /data .
docker run --rm \
-v myapp_redis-data:/data:ro \
-v $BACKUP_DIR:/backup \
alpine tar czf /backup/redis-data.tar.gz -C /data .
# 备份配置
cp docker-compose.yml $BACKUP_DIR/
cp -r config $BACKUP_DIR/
# 压缩备份
tar czf $BACKUP_DIR.tar.gz -C $(dirname $BACKUP_DIR) $(basename $BACKUP_DIR)
rm -rf $BACKUP_DIR
# 上传到 S3 (可选)
aws s3 cp $BACKUP_DIR.tar.gz s3://my-backup-bucket/docker/
# 清理旧备份 (保留最近 30 天)
find /backup -name "*.tar.gz" -mtime +30 -delete
echo "Backup completed: $BACKUP_DIR.tar.gz"5.3 恢复脚本
bash
#!/bin/bash
# restore.sh
BACKUP_FILE=$1
if [ -z "$BACKUP_FILE" ]; then
echo "Usage: $0 <backup-file>"
exit 1
fi
# 停止服务
docker-compose down
# 恢复卷数据
docker run --rm \
-v myapp_db-data:/data \
-v $(dirname $BACKUP_FILE):/backup:ro \
alpine tar xzf /backup/$(basename $BACKUP_FILE) -C /data
# 启动服务
docker-compose up -d
# 恢复数据库(如果需要)
# docker-compose exec -T db psql -U app app < database.sql
echo "Restore completed"部署策略
6.1 蓝绿部署
bash
#!/bin/bash
# blue-green-deploy.sh
VERSION=$1
BLUE="myapp-blue"
GREEN="myapp-green"
NGINX_CONF="./nginx/nginx.conf"
# 确定当前活跃版本
CURRENT=$(docker-compose ps -q web-blue | wc -l)
if [ $CURRENT -gt 0 ]; then
# 当前是 blue,部署到 green
echo "Deploying to green..."
docker-compose up -d web-green
# 健康检查
sleep 10
if docker-compose exec web-green wget --quiet --tries=1 --spider http://localhost:3000/health; then
# 切换流量到 green
sed -i 's/upstream web {/upstream web {\n server web-green:3000;/' $NGINX_CONF
docker-compose exec nginx nginx -s reload
# 停止 blue
docker-compose stop web-blue
echo "Deployment to green completed"
else
echo "Health check failed, rollback..."
docker-compose stop web-green
exit 1
fi
else
# 当前是 green,部署到 blue
echo "Deploying to blue..."
docker-compose up -d web-blue
# 健康检查
sleep 10
if docker-compose exec web-blue wget --quiet --tries=1 --spider http://localhost:3000/health; then
# 切换流量到 blue
sed -i 's/upstream web {/upstream web {\n server web-blue:3000;/' $NGINX_CONF
docker-compose exec nginx nginx -s reload
# 停止 green
docker-compose stop web-green
echo "Deployment to blue completed"
else
echo "Health check failed, rollback..."
docker-compose stop web-blue
exit 1
fi
fi
6.2 滚动部署
yaml
version: '3.8'
services:
web:
image: myapp:${VERSION}
deploy:
replicas: 3
update_config:
parallelism: 1 # 每次更新 1 个容器
delay: 10s # 更新间隔
failure_action: rollback # 失败回滚
order: start-first # 先启动新容器
monitor: 60s # 监控时间
rollback_config:
parallelism: 1
delay: 10s
failure_action: pause
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
6.3 GitOps 部署
yaml
# .github/workflows/deploy.yml
name: Deploy to Production
on:
push:
branches: [main]
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Configure SSH
uses: webfactory/ssh-agent@v0.7.0
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
- name: Deploy
run: |
ssh -o StrictHostKeyChecking=no user@server << 'EOF'
cd /opt/myapp
git pull origin main
docker-compose pull
docker-compose up -d
docker-compose ps
EOF
生产环境最佳实践
7.1 目录结构
myapp-production/
├── docker-compose.yml # 基础配置
├── docker-compose.prod.yml # 生产配置
├── docker-compose.override.yml # 本地覆盖(不提交)
├── .env # 环境变量(不提交)
├── .env.example # 环境变量示例
├── config/
│ ├── nginx/
│ │ ├── nginx.conf
│ │ └── ssl/
│ ├── prometheus/
│ │ └── prometheus.yml
│ └── grafana/
├── secrets/ # 密钥目录(不提交)
├── scripts/
│ ├── backup.sh
│ ├── restore.sh
│ └── deploy.sh
├── logs/ # 日志目录
└── README.md
7.2 环境变量管理
bash
# .env.example
# 复制为 .env 并填入实际值
# 应用配置
APP_NAME=myapp
APP_ENV=production
APP_VERSION=1.0.0
# 数据库
DB_HOST=db
DB_PORT=5432
DB_NAME=app
DB_USER=app
# DB_PASSWORD 在 secrets/db-password.txt
# Redis
REDIS_HOST=redis
REDIS_PORT=6379
# 监控
GRAFANA_PASSWORD=changeme
# 备份
AWS_S3_BUCKET=my-backup-bucket
AWS_ACCESS_KEY=AKIA...
AWS_SECRET_KEY=...
AWS_REGION=us-east-1
7.3 运维脚本
bash
#!/bin/bash
# ops.sh - 运维脚本
case "$1" in
start)
docker-compose -f docker-compose.yml -f docker-compose.prod.yml up -d
;;
stop)
docker-compose -f docker-compose.yml -f docker-compose.prod.yml down
;;
restart)
docker-compose -f docker-compose.yml -f docker-compose.prod.yml restart
;;
logs)
docker-compose -f docker-compose.yml -f docker-compose.prod.yml logs -f
;;
status)
docker-compose -f docker-compose.yml -f docker-compose.prod.yml ps
;;
update)
docker-compose -f docker-compose.yml -f docker-compose.prod.yml pull
docker-compose -f docker-compose.yml -f docker-compose.prod.yml up -d
;;
backup)
./scripts/backup.sh
;;
restore)
./scripts/restore.sh $2
;;
*)
echo "Usage: $0 {start|stop|restart|logs|status|update|backup|restore}"
exit 1
;;
esac