Presence Service Deployment Guide
1. Deployment Overview
1.1 Deployment Architecture
              ┌─────────────────────────┐
              │      Load Balancer      │
              │      (Nginx / ALB)      │
              └────────────┬────────────┘
                           │
          ┌────────────────┼────────────────┐
          │                │                │
    ┌─────▼─────┐    ┌─────▼─────┐    ┌─────▼─────┐
    │ Presence  │    │ Presence  │    │ Presence  │
    │ Service   │    │ Service   │    │ Service   │
    │ (Pod 1)   │    │ (Pod 2)   │    │ (Pod N)   │
    └─────┬─────┘    └─────┬─────┘    └─────┬─────┘
          │                │                │
          └────────────────┼────────────────┘
                           │
        ┌──────────────────┼────────────────────┐
        │                  │                    │
  ┌─────▼─────┐    ┌───────▼───────┐    ┌───────▼───────┐
  │PostgreSQL │    │     Redis     │    │     Kafka     │
  │ (Primary) │    │    Cluster    │    │    Cluster    │
  └───────────┘    └───────────────┘    └───────────────┘
1.2 Environments
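A typical environment breakdown (illustrative values only; substitute your own clusters and domains):

| Environment | Purpose | Replicas | Domain (example) |
| --- | --- | --- | --- |
| development | Local development and debugging | 1 | localhost:3000 |
| staging | Pre-release verification | 2 | presence-staging.example.com |
| production | Live traffic | 3+ | presence.example.com |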
2. Docker Deployment
2.1 Dockerfile
# =============================================================================
# Presence Service - Production Dockerfile
# =============================================================================
# Stage 1: Build
FROM node:20-alpine AS builder
WORKDIR /app
# Install all dependencies (devDependencies included; needed for the build)
COPY package*.json ./
RUN npm ci
# Copy source and build
COPY prisma ./prisma/
COPY src ./src/
COPY tsconfig*.json ./
RUN npx prisma generate
RUN npm run build
# Remove dev dependencies
RUN npm prune --omit=dev
# Stage 2: Production
FROM node:20-alpine AS production
WORKDIR /app
# Install security updates
RUN apk update && apk upgrade && apk add --no-cache dumb-init
# Create non-root user
RUN addgroup -g 1001 -S nodejs && adduser -S nestjs -u 1001
# Copy built application
COPY --from=builder --chown=nestjs:nodejs /app/dist ./dist
COPY --from=builder --chown=nestjs:nodejs /app/node_modules ./node_modules
COPY --from=builder --chown=nestjs:nodejs /app/prisma ./prisma
COPY --from=builder --chown=nestjs:nodejs /app/package.json ./
# Switch to non-root user
USER nestjs
# Environment
ENV NODE_ENV=production
ENV PORT=3000
EXPOSE 3000
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
  CMD wget -q --spider http://localhost:3000/api/v1/health || exit 1
# Start application
ENTRYPOINT ["dumb-init", "--"]
CMD ["node", "dist/main.js"]
2.2 Building the Image
# Build the image
docker build -t presence-service:latest .
# Build with a version tag
docker build -t presence-service:v1.0.0 .
# Push to the image registry
docker tag presence-service:v1.0.0 registry.example.com/presence-service:v1.0.0
docker push registry.example.com/presence-service:v1.0.0
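Before pushing, the image can be smoke-tested locally. A quick check, assuming the health endpoint responds without live backing services (otherwise pass real DATABASE_URL/REDIS_HOST values with -e):

# Run the image and probe the health endpoint
docker run -d --name presence-smoke -p 3000:3000 presence-service:v1.0.0
curl -fsS http://localhost:3000/api/v1/health
docker rm -f presence-smoke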
2.3 Docker Compose (Development / Testing)
# docker-compose.yml
version: '3.8'
services:
  presence-service:
    build: .
    ports:
      - "3000:3000"
    environment:
      NODE_ENV: production
      DATABASE_URL: postgresql://postgres:postgres@postgres:5432/presence?schema=public
      REDIS_HOST: redis
      REDIS_PORT: 6379
      # Assumes a reachable Kafka broker; no kafka service is defined in this file
      KAFKA_BROKERS: kafka:9092
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  postgres:
    image: postgres:15-alpine
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: presence
    volumes:
      - postgres-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 10s
      timeout: 5s
      retries: 5

  redis:
    image: redis:7-alpine
    volumes:
      - redis-data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

volumes:
  postgres-data:
  redis-data:
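Typical lifecycle commands for this stack:

# Start everything in the background, rebuilding the service image
docker compose up -d --build
# Follow the service logs
docker compose logs -f presence-service
# Tear down (add -v to also remove the named volumes)
docker compose down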
3. Kubernetes Deployment
3.1 Namespace
# k8s/namespace.yaml
apiVersion: v1
kind: Namespace
metadata:
  name: presence
  labels:
    app.kubernetes.io/name: presence-service
3.2 ConfigMap
# k8s/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: presence-service-config
  namespace: presence
data:
  NODE_ENV: "production"
  PORT: "3000"
  REDIS_PORT: "6379"
  KAFKA_BROKERS: "kafka-cluster.kafka:9092"
3.3 Secret
# k8s/secret.yaml
apiVersion: v1
kind: Secret
metadata:
  name: presence-service-secret
  namespace: presence
type: Opaque
stringData:
  DATABASE_URL: "postgresql://user:password@postgres-cluster:5432/presence?schema=public"
  REDIS_HOST: "redis-cluster.redis"
  REDIS_PASSWORD: "redis-password"
  JWT_SECRET: "your-jwt-secret"
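Committing real credentials to secret.yaml is risky. In practice the Secret is usually created out-of-band with the same keys; a sketch:

kubectl create secret generic presence-service-secret \
  --namespace presence \
  --from-literal=DATABASE_URL='postgresql://user:password@postgres-cluster:5432/presence?schema=public' \
  --from-literal=REDIS_HOST='redis-cluster.redis' \
  --from-literal=REDIS_PASSWORD='redis-password' \
  --from-literal=JWT_SECRET='your-jwt-secret'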
3.4 Deployment
# k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: presence-service
  namespace: presence
  labels:
    app: presence-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: presence-service
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: presence-service
    spec:
      # Assumes a ServiceAccount named presence-service exists in the namespace
      serviceAccountName: presence-service
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        fsGroup: 1001
      containers:
        - name: presence-service
          image: registry.example.com/presence-service:v1.0.0
          imagePullPolicy: Always
          ports:
            - containerPort: 3000
              protocol: TCP
          envFrom:
            - configMapRef:
                name: presence-service-config
            - secretRef:
                name: presence-service-secret
          resources:
            requests:
              cpu: "100m"
              memory: "256Mi"
            limits:
              cpu: "500m"
              memory: "512Mi"
          livenessProbe:
            httpGet:
              path: /api/v1/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /api/v1/health
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          volumeMounts:
            - name: tmp
              mountPath: /tmp
      volumes:
        - name: tmp
          emptyDir: {}
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: presence-service
                topologyKey: kubernetes.io/hostname
3.5 Service
# k8s/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: presence-service
  namespace: presence
  labels:
    # Matched by the ServiceMonitor selector in section 6.2
    app: presence-service
spec:
  type: ClusterIP
  selector:
    app: presence-service
  ports:
    - name: http   # referenced by name from the ServiceMonitor endpoints
      port: 80
      targetPort: 3000
      protocol: TCP
3.6 Ingress
# k8s/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: presence-service
  namespace: presence
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    # ingress-nginx rate limiting: 100 requests per minute per client IP
    nginx.ingress.kubernetes.io/limit-rpm: "100"
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - presence.example.com
      secretName: presence-tls
  rules:
    - host: presence.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: presence-service
                port:
                  number: 80
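Once DNS and the cert-manager certificate are in place, the route can be verified end to end:

# Expect a 200 with a valid certificate
curl -i https://presence.example.com/api/v1/health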
3.7 HorizontalPodAutoscaler
# k8s/hpa.yaml
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: presence-service
  namespace: presence
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: presence-service
  minReplicas: 3
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80
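With two metrics configured, the HPA scales on whichever one demands more replicas; the live decision can be watched with:

kubectl get hpa presence-service -n presence -w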
3.8 Deployment Commands
# Apply all manifests
kubectl apply -f k8s/
# Check deployment status
kubectl get pods -n presence
# Tail logs
kubectl logs -f deployment/presence-service -n presence
# Rolling update to a new image
kubectl set image deployment/presence-service \
  presence-service=registry.example.com/presence-service:v1.1.0 \
  -n presence
# Roll back to the previous revision
kubectl rollout undo deployment/presence-service -n presence
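In CI it is safer to block until a rolling update actually converges rather than fire-and-forget:

# Wait for the rollout to complete (fails the pipeline on timeout)
kubectl rollout status deployment/presence-service -n presence --timeout=300s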
4. Database Migrations
4.1 Migration Strategy
# Development - push the schema directly (no migration files)
npx prisma db push
# Production - apply committed migration files
npx prisma migrate deploy
4.2 Migration via a Kubernetes Job
# k8s/migration-job.yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: prisma-migrate
  namespace: presence
spec:
  template:
    spec:
      containers:
        - name: migrate
          image: registry.example.com/presence-service:v1.0.0
          # Assumes the prisma CLI is available in the production image
          # (otherwise npx will fetch it at run time)
          command: ["npx", "prisma", "migrate", "deploy"]
          envFrom:
            - secretRef:
                name: presence-service-secret
      restartPolicy: Never
  backoffLimit: 3
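Because Job specs are immutable, a re-run requires removing the previous Job first; a typical sequence:

kubectl delete job prisma-migrate -n presence --ignore-not-found
kubectl apply -f k8s/migration-job.yaml
kubectl wait --for=condition=complete job/prisma-migrate -n presence --timeout=300s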
4.3 CI/CD Migration Script
#!/bin/bash
# scripts/migrate.sh
set -e

echo "Running database migrations..."

# Wait for the database to become ready
# (assumes DB_HOST / DB_PORT / DB_USER are exported alongside DATABASE_URL)
until pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER"; do
  echo "Waiting for database..."
  sleep 2
done

# Apply migrations
npx prisma migrate deploy
echo "Migrations completed successfully"
5. Environment Variables
5.1 Required Variables
| Variable | Description | Example |
| --- | --- | --- |
| NODE_ENV | Runtime environment | production |
| PORT | Service port | 3000 |
| DATABASE_URL | PostgreSQL connection string | postgresql://... |
| REDIS_HOST | Redis host | redis-cluster |
| REDIS_PORT | Redis port | 6379 |
| JWT_SECRET | JWT signing secret | xxx |
5.2 Optional Variables
| Variable | Description | Default |
| --- | --- | --- |
| REDIS_PASSWORD | Redis password | - |
| REDIS_DB | Redis database index | 0 |
| KAFKA_BROKERS | Kafka broker addresses | - |
| LOG_LEVEL | Log level | info |
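For local development these variables are commonly collected in a .env file; an illustrative example (never commit real secrets):

# .env (local development, example values)
NODE_ENV=development
PORT=3000
DATABASE_URL=postgresql://postgres:postgres@localhost:5432/presence?schema=public
REDIS_HOST=localhost
REDIS_PORT=6379
JWT_SECRET=change-me
LOG_LEVEL=debug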
6. Monitoring and Alerting
6.1 Health Check Endpoint
GET /api/v1/health
Response:
{
  "status": "ok",
  "service": "presence-service",
  "timestamp": "2025-01-01T00:00:00Z"
}
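From a workstation, the in-cluster endpoint can be reached through a port-forward:

# Forward the Service locally and probe it
kubectl port-forward svc/presence-service 8080:80 -n presence &
curl -s http://localhost:8080/api/v1/health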
6.2 Prometheus Metrics
# k8s/servicemonitor.yaml
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: presence-service
  namespace: presence
spec:
  selector:
    matchLabels:
      app: presence-service
  endpoints:
    - port: http   # the named Service port from section 3.5
      path: /metrics
      interval: 30s
6.3 Key Metrics
| Metric | Description | Alert Threshold |
| --- | --- | --- |
| http_request_duration_seconds | Request latency | P99 > 1s |
| http_requests_total | Total requests | - |
| http_request_errors_total | Failed requests | Error rate > 1% |
| presence_online_users | Online user count | - |
| nodejs_heap_size_used_bytes | Heap memory usage | > 400MB |
6.4 Alerting Rules
# prometheus/alerts.yaml
groups:
  - name: presence-service
    rules:
      - alert: HighErrorRate
        expr: |
          sum(rate(http_request_errors_total{service="presence-service"}[5m]))
          / sum(rate(http_requests_total{service="presence-service"}[5m])) > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate in presence-service
      - alert: HighLatency
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket{service="presence-service"}[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High latency in presence-service
      - alert: PodNotReady
        expr: |
          kube_pod_status_ready{namespace="presence", condition="true"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Presence service pod not ready
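Rule files are worth validating before Prometheus loads them:

# Syntax-check the alert rules
promtool check rules prometheus/alerts.yaml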
7. Log Management
7.1 Log Format
{
  "timestamp": "2025-01-01T12:00:00.000Z",
  "level": "info",
  "context": "PresenceController",
  "message": "Heartbeat recorded",
  "userId": "12345",
  "installId": "xxx",
  "requestId": "uuid",
  "duration": 15
}
7.2 Log Collection (Fluentd)
# fluentd/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluentd-config
data:
  fluent.conf: |
    <source>
      @type tail
      path /var/log/containers/presence-*.log
      pos_file /var/log/presence.pos
      tag kubernetes.presence
      <parse>
        @type json
      </parse>
    </source>
    <match kubernetes.presence>
      @type elasticsearch
      host elasticsearch
      port 9200
      index_name presence-logs
    </match>
8. Backup and Recovery
8.1 Database Backup
#!/bin/bash
# scripts/backup.sh
set -euo pipefail

DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="presence_backup_${DATE}.sql"

# Dump
pg_dump "$DATABASE_URL" > "/backups/${BACKUP_FILE}"

# Compress
gzip "/backups/${BACKUP_FILE}"

# Upload to S3
aws s3 cp "/backups/${BACKUP_FILE}.gz" s3://backups/presence/

# Prune local backups older than 7 days
find /backups -name "*.gz" -mtime +7 -delete
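Recovery is the mirror image of the backup. A minimal restore sketch (scripts/restore.sh is not part of the original setup; it assumes an empty or disposable target database):

#!/bin/bash
# scripts/restore.sh (illustrative)
set -euo pipefail
BACKUP_FILE="$1"   # e.g. presence_backup_20250101_020000.sql.gz

# Fetch the dump from S3 and replay it into the target database
aws s3 cp "s3://backups/presence/${BACKUP_FILE}" /tmp/
gunzip -c "/tmp/${BACKUP_FILE}" | psql "$DATABASE_URL"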
8.2 Kubernetes CronJob
# k8s/backup-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
  name: db-backup
  namespace: presence
spec:
  schedule: "0 2 * * *"  # daily at 02:00
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: backup
              # Note: postgres:15-alpine does not ship the AWS CLI; the upload
              # step in backup.sh needs an image that bundles pg_dump and aws
              image: postgres:15-alpine
              command: ["/scripts/backup.sh"]
              envFrom:
                - secretRef:
                    name: presence-service-secret
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
                - name: backups
                  mountPath: /backups
          volumes:
            - name: scripts
              configMap:
                name: backup-scripts
                defaultMode: 0755   # ConfigMap files are not executable by default
            - name: backups
              persistentVolumeClaim:
                claimName: backup-pvc
          restartPolicy: OnFailure
9. Security Configuration
9.1 Network Policy
# k8s/network-policy.yaml
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: presence-service
  namespace: presence
spec:
  podSelector:
    matchLabels:
      app: presence-service
  policyTypes:
    - Ingress
    - Egress
  ingress:
    - from:
        - namespaceSelector:
            matchLabels:
              name: ingress-nginx
      ports:
        - protocol: TCP
          port: 3000
  egress:
    # Allow DNS lookups; without this rule a deny-by-default egress
    # policy breaks all name resolution
    - to:
        - namespaceSelector: {}
          podSelector:
            matchLabels:
              k8s-app: kube-dns
      ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
    - to:
        - namespaceSelector:
            matchLabels:
              name: postgres
      ports:
        - protocol: TCP
          port: 5432
    - to:
        - namespaceSelector:
            matchLabels:
              name: redis
      ports:
        - protocol: TCP
          port: 6379
    - to:
        - namespaceSelector:
            matchLabels:
              name: kafka
      ports:
        - protocol: TCP
          port: 9092
9.2 Pod Security Policy
# k8s/pod-security.yaml
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  name: presence-service
spec:
  privileged: false
  runAsUser:
    rule: MustRunAsNonRoot
  seLinux:
    rule: RunAsAny
  fsGroup:
    rule: RunAsAny
  volumes:
    - 'configMap'
    - 'secret'
    - 'emptyDir'
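Note that PodSecurityPolicy (policy/v1beta1) was removed in Kubernetes 1.25. On newer clusters the closest built-in equivalent is Pod Security Admission, enforced with a namespace label; a sketch:

# Enforce the "restricted" Pod Security Standard on the namespace
kubectl label namespace presence \
  pod-security.kubernetes.io/enforce=restricted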
10. Troubleshooting
10.1 Common Issues
Database connection failures
# Check database connectivity (DATABASE_URL expands from your local shell)
kubectl run -it --rm debug --restart=Never --image=postgres:15-alpine -- \
  psql "$DATABASE_URL" -c "SELECT 1"
# Inspect the Secret
kubectl get secret presence-service-secret -n presence -o yaml
Redis connection failures
# Check Redis connectivity (REDIS_HOST / REDIS_PORT expand from your local shell)
kubectl run -it --rm debug --restart=Never --image=redis:7-alpine -- \
  redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" ping
Pod CrashLoopBackOff
# Inspect logs from the previous (crashed) container
kubectl logs -f <pod-name> -n presence --previous
# Inspect pod events
kubectl describe pod <pod-name> -n presence
10.2 Performance Tuning
# Raise resource requests/limits under sustained load
resources:
  requests:
    cpu: "200m"
    memory: "512Mi"
  limits:
    cpu: "1000m"
    memory: "1Gi"
# Node.js heap configuration: keep the heap cap below the container
# memory limit to leave headroom for buffers and the runtime itself
env:
  - name: NODE_OPTIONS
    value: "--max-old-space-size=768"