rwadurian/backend/services/presence-service/docs/DEPLOYMENT.md

19 KiB

Presence Service 部署文档

1. 部署概述

1.1 部署架构

                         ┌─────────────────────────────────────┐
                         │           Load Balancer             │
                         │         (Nginx / ALB)               │
                         └──────────────┬──────────────────────┘
                                        │
                    ┌───────────────────┼───────────────────┐
                    │                   │                   │
              ┌─────▼─────┐       ┌─────▼─────┐       ┌─────▼─────┐
              │ Presence  │       │ Presence  │       │ Presence  │
              │ Service   │       │ Service   │       │ Service   │
              │ (Pod 1)   │       │ (Pod 2)   │       │ (Pod N)   │
              └─────┬─────┘       └─────┬─────┘       └─────┬─────┘
                    │                   │                   │
                    └───────────────────┼───────────────────┘
                                        │
              ┌─────────────────────────┼─────────────────────────┐
              │                         │                         │
        ┌─────▼─────┐           ┌───────▼───────┐         ┌───────▼───────┐
        │PostgreSQL │           │    Redis      │         │    Kafka      │
        │ (Primary) │           │   Cluster     │         │   Cluster     │
        └───────────┘           └───────────────┘         └───────────────┘

1.2 环境列表

环境 用途 URL
Development 本地开发 http://localhost:3000
Staging 预发布测试 https://staging-presence.example.com
Production 生产环境 https://presence.example.com

2. Docker 部署

2.1 Dockerfile

# =============================================================================
# Presence Service - Production Dockerfile
# =============================================================================

# Stage 1: Build
FROM node:20-alpine AS builder

WORKDIR /app

# Install ALL dependencies (dev deps are required to compile TypeScript).
# `--include=dev` is the supported flag; the old `--only=production=false`
# form is a garbled legacy npm workaround.
COPY package*.json ./
RUN npm ci --include=dev

# Copy source and build
COPY prisma ./prisma/
COPY src ./src/
COPY tsconfig*.json ./

# Generate the Prisma client before compiling — the build imports it.
RUN npx prisma generate
RUN npm run build

# Drop dev dependencies before copying node_modules to the final stage
# (`--omit=dev` replaces the deprecated `--production` flag).
RUN npm prune --omit=dev

# Stage 2: Production
FROM node:20-alpine AS production

WORKDIR /app

# Install security updates; dumb-init runs as PID 1 and forwards signals,
# so SIGTERM from the orchestrator reaches the node process for graceful shutdown.
RUN apk update && apk upgrade && apk add --no-cache dumb-init

# Create non-root user (uid/gid 1001 — matches runAsUser in the k8s Deployment)
RUN addgroup -g 1001 -S nodejs && adduser -S nestjs -u 1001

# Copy built application
COPY --from=builder --chown=nestjs:nodejs /app/dist ./dist
COPY --from=builder --chown=nestjs:nodejs /app/node_modules ./node_modules
COPY --from=builder --chown=nestjs:nodejs /app/prisma ./prisma
COPY --from=builder --chown=nestjs:nodejs /app/package.json ./

# Switch to non-root user
USER nestjs

# Environment
ENV NODE_ENV=production
ENV PORT=3000

EXPOSE 3000

# Health check (busybox wget ships with alpine, so no extra package needed)
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
  CMD wget --no-verbose --tries=1 --spider http://localhost:3000/api/v1/health || exit 1

# Start application
ENTRYPOINT ["dumb-init", "--"]
CMD ["node", "dist/main.js"]

2.2 构建镜像

# 构建镜像
docker build -t presence-service:latest .

# 带版本标签构建
docker build -t presence-service:v1.0.0 .

# 推送到镜像仓库
docker tag presence-service:v1.0.0 registry.example.com/presence-service:v1.0.0
docker push registry.example.com/presence-service:v1.0.0

2.3 Docker Compose (开发/测试)

# docker-compose.yml
# Local development / test stack: the service plus Postgres and Redis.
# NOTE(review): `version:` is obsolete under the Compose Specification
# (Compose v2 warns and ignores it); kept here for legacy-tool compatibility.
version: '3.8'

services:
  presence-service:
    build: .
    ports:
      - "3000:3000"
    environment:
      NODE_ENV: production
      # Hostnames below resolve to sibling services via the compose network.
      DATABASE_URL: postgresql://postgres:postgres@postgres:5432/presence?schema=public
      REDIS_HOST: redis
      REDIS_PORT: 6379
      # NOTE(review): no `kafka` service is defined in this file, so this
      # hostname will not resolve — add a kafka service, or confirm the app
      # tolerates an unreachable broker at startup.
      KAFKA_BROKERS: kafka:9092
    depends_on:
      # Gate startup on dependency health checks, not mere container start.
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:3000/api/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 3

  postgres:
    image: postgres:15-alpine
    # Dev-only credentials — never reuse outside local environments.
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: presence
    volumes:
      - postgres-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 10s
      timeout: 5s
      retries: 5

  redis:
    image: redis:7-alpine
    volumes:
      - redis-data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

# Named volumes so data survives container recreation.
volumes:
  postgres-data:
  redis-data:

3. Kubernetes 部署

3.1 Namespace

# k8s/namespace.yaml
# Dedicated namespace isolating all presence-service resources.
apiVersion: v1
kind: Namespace
metadata:
  name: presence
  labels:
    app.kubernetes.io/name: presence-service

3.2 ConfigMap

# k8s/configmap.yaml
# Non-sensitive runtime configuration, injected via envFrom in the Deployment.
# Values are quoted so numeric-looking settings stay strings (env vars are strings).
apiVersion: v1
kind: ConfigMap
metadata:
  name: presence-service-config
  namespace: presence
data:
  NODE_ENV: "production"
  PORT: "3000"
  REDIS_PORT: "6379"
  KAFKA_BROKERS: "kafka-cluster.kafka:9092"

3.3 Secret

# k8s/secret.yaml
# Sensitive configuration, injected via envFrom in the Deployment.
# NOTE(review): the values below are placeholders — never commit real
# credentials to VCS; populate from a secret manager (e.g. external-secrets,
# sealed-secrets) at deploy time.
apiVersion: v1
kind: Secret
metadata:
  name: presence-service-secret
  namespace: presence
type: Opaque
# stringData accepts plain text; the API server base64-encodes it into `data`.
stringData:
  DATABASE_URL: "postgresql://user:password@postgres-cluster:5432/presence?schema=public"
  REDIS_HOST: "redis-cluster.redis"
  REDIS_PASSWORD: "redis-password"
  JWT_SECRET: "your-jwt-secret"

3.4 Deployment

# k8s/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: presence-service
  namespace: presence
  labels:
    app: presence-service
spec:
  replicas: 3
  selector:
    matchLabels:
      app: presence-service
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0   # zero-downtime rollout: never drop below desired count
  template:
    metadata:
      labels:
        app: presence-service
    spec:
      serviceAccountName: presence-service
      # Pod-level security: uid/gid 1001 matches the non-root user created
      # in the Dockerfile.
      securityContext:
        runAsNonRoot: true
        runAsUser: 1001
        fsGroup: 1001
      containers:
        - name: presence-service
          image: registry.example.com/presence-service:v1.0.0
          imagePullPolicy: Always
          # Container-level hardening: read-only root FS (the emptyDir below
          # supplies a writable /tmp), no privilege escalation, no capabilities.
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          ports:
            - containerPort: 3000
              protocol: TCP
          envFrom:
            - configMapRef:
                name: presence-service-config
            - secretRef:
                name: presence-service-secret
          resources:
            requests:
              cpu: "100m"
              memory: "256Mi"
            limits:
              cpu: "500m"
              memory: "512Mi"
          livenessProbe:
            httpGet:
              path: /api/v1/health
              port: 3000
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /api/v1/health
              port: 3000
            initialDelaySeconds: 5
            periodSeconds: 5
            timeoutSeconds: 3
            failureThreshold: 3
          volumeMounts:
            - name: tmp
              mountPath: /tmp
      volumes:
        - name: tmp
          emptyDir: {}
      # Prefer spreading replicas across nodes so one node failure cannot
      # take out all pods at once.
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: presence-service
                topologyKey: kubernetes.io/hostname

3.5 Service

# k8s/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: presence-service
  namespace: presence
spec:
  type: ClusterIP
  selector:
    app: presence-service
  ports:
    # Named "http" so the ServiceMonitor (section 6.2) can select it via
    # `port: http` — an unnamed port cannot be matched by name.
    - name: http
      port: 80
      targetPort: 3000
      protocol: TCP

3.6 Ingress

# k8s/ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: presence-service
  namespace: presence
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/rate-limit: "100"
    nginx.ingress.kubernetes.io/rate-limit-window: "1m"
spec:
  # `ingressClassName` is the networking.k8s.io/v1 replacement for the
  # deprecated `kubernetes.io/ingress.class` annotation.
  ingressClassName: nginx
  tls:
    - hosts:
        - presence.example.com
      secretName: presence-tls   # certificate issued by cert-manager
  rules:
    - host: presence.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: presence-service
                port:
                  number: 80

3.7 HorizontalPodAutoscaler

# k8s/hpa.yaml
# Scales the Deployment between 3 and 10 replicas on average CPU (70%) and
# memory (80%) utilization, measured against the pods' resource *requests*.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: presence-service
  namespace: presence
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: presence-service
  minReplicas: 3
  maxReplicas: 10
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: 70
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: 80

3.8 部署命令

# 应用所有配置
kubectl apply -f k8s/

# 查看部署状态
kubectl get pods -n presence

# 查看日志
kubectl logs -f deployment/presence-service -n presence

# 滚动更新
kubectl set image deployment/presence-service \
  presence-service=registry.example.com/presence-service:v1.1.0 \
  -n presence

# 回滚
kubectl rollout undo deployment/presence-service -n presence

4. 数据库迁移

4.1 迁移策略

# 开发环境 - 直接同步
npx prisma db push

# 生产环境 - 使用迁移
npx prisma migrate deploy

4.2 Kubernetes Job 迁移

# k8s/migration-job.yaml
# One-shot Job that applies pending Prisma migrations. It reuses the service
# image so the migration files always match the code being deployed.
apiVersion: batch/v1
kind: Job
metadata:
  name: prisma-migrate
  namespace: presence
spec:
  template:
    spec:
      containers:
        - name: migrate
          image: registry.example.com/presence-service:v1.0.0
          command: ["npx", "prisma", "migrate", "deploy"]
          # The secret provides DATABASE_URL, the only variable migrations need.
          envFrom:
            - secretRef:
                name: presence-service-secret
      restartPolicy: Never
  backoffLimit: 3   # retry a failed migration pod up to 3 times

4.3 CI/CD 迁移脚本

#!/bin/bash
# scripts/migrate.sh
# Wait for the database to accept connections, then apply pending Prisma
# migrations. Intended as a CI/CD step or init container.

# Fail fast on errors, unset variables, and failures inside pipelines.
set -euo pipefail

echo "Running database migrations..."

# Wait for the database to be ready. Expansions are quoted (shellcheck SC2086).
# NOTE(review): DB_HOST / DB_PORT / DB_USER are not listed among the documented
# environment variables in section 5 — confirm where they are provided.
until pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER"; do
  echo "Waiting for database..."
  sleep 2
done

# Apply migrations (uses DATABASE_URL from the environment).
npx prisma migrate deploy

echo "Migrations completed successfully"

5. 环境变量

5.1 必需变量

变量 描述 示例
NODE_ENV 运行环境 production
PORT 服务端口 3000
DATABASE_URL PostgreSQL 连接串 postgresql://...
REDIS_HOST Redis 主机 redis-cluster
REDIS_PORT Redis 端口 6379
JWT_SECRET JWT 密钥 xxx

5.2 可选变量

变量 描述 默认值
REDIS_PASSWORD Redis 密码 -
REDIS_DB Redis 数据库 0
KAFKA_BROKERS Kafka 集群地址 -
LOG_LEVEL 日志级别 info

6. 监控和告警

6.1 健康检查端点

GET /api/v1/health

Response:
{
  "status": "ok",
  "service": "presence-service",
  "timestamp": "2025-01-01T00:00:00Z"
}

6.2 Prometheus 指标

# k8s/servicemonitor.yaml
# Prometheus Operator scrape target: pulls /metrics every 30s from endpoints
# of Services labeled app=presence-service.
# NOTE(review): `port: http` matches a Service port *name* — the Service in
# section 3.5 must expose a port named "http" for scraping to work; confirm.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: presence-service
  namespace: presence
spec:
  selector:
    matchLabels:
      app: presence-service
  endpoints:
    - port: http
      path: /metrics
      interval: 30s

6.3 关键指标

指标 描述 告警阈值
http_request_duration_seconds 请求延迟 P99 > 1s
http_requests_total 请求总数 -
http_request_errors_total 错误请求数 错误率 > 1%
presence_online_users 在线用户数 -
nodejs_heap_size_used_bytes 堆内存使用 > 400MB

6.4 告警规则

# prometheus/alerts.yaml
# Alerting rules for presence-service. (Trailing whitespace inside the
# `expr: |` block scalars has been stripped — block scalars preserve it as
# part of the PromQL string.)
groups:
  - name: presence-service
    rules:
      # Error rate above 1% of total requests for 5 consecutive minutes.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_request_errors_total{service="presence-service"}[5m]))
          / sum(rate(http_requests_total{service="presence-service"}[5m])) > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: High error rate in presence-service

      # P99 request latency above 1 second for 5 minutes.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.99,
            sum(rate(http_request_duration_seconds_bucket{service="presence-service"}[5m])) by (le)
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: High latency in presence-service

      # A pod in the presence namespace reports not-ready for 5 minutes.
      - alert: PodNotReady
        expr: |
          kube_pod_status_ready{namespace="presence", condition="true"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Presence service pod not ready

7. 日志管理

7.1 日志格式

{
  "timestamp": "2025-01-01T12:00:00.000Z",
  "level": "info",
  "context": "PresenceController",
  "message": "Heartbeat recorded",
  "userId": "12345",
  "installId": "xxx",
  "requestId": "uuid",
  "duration": 15
}

7.2 日志收集 (Fluentd)

# fluentd/configmap.yaml
# Tails presence-service container logs (JSON lines) and ships them to
# Elasticsearch under the "presence-logs" index.
# NOTE(review): pos_file lives under /var/log — confirm the fluentd pod
# mounts it writable, or the tail position will not persist.
apiVersion: v1
kind: ConfigMap
metadata:
  name: fluentd-config
data:
  fluent.conf: |
    <source>
      @type tail
      path /var/log/containers/presence-*.log
      pos_file /var/log/presence.pos
      tag kubernetes.presence
      <parse>
        @type json
      </parse>
    </source>

    <match kubernetes.presence>
      @type elasticsearch
      host elasticsearch
      port 9200
      index_name presence-logs
    </match>    

8. 备份和恢复

8.1 数据库备份

#!/bin/bash
# scripts/backup.sh
# Dump the presence database, compress it, upload it to S3, and prune
# local backups older than 7 days.

# Fail fast on errors, unset variables, and failures inside pipelines.
set -euo pipefail

DATE=$(date +%Y%m%d_%H%M%S)
BACKUP_FILE="presence_backup_${DATE}.sql"

# Dump (quote the URL — it contains `?schema=...` and credentials).
pg_dump "$DATABASE_URL" > "/backups/${BACKUP_FILE}"

# Compress
gzip "/backups/${BACKUP_FILE}"

# Upload to S3
aws s3 cp "/backups/${BACKUP_FILE}.gz" s3://backups/presence/

# Remove local backups older than 7 days.
find /backups -name "*.gz" -mtime +7 -delete

8.2 Kubernetes CronJob

# k8s/backup-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
  name: db-backup
  namespace: presence
spec:
  schedule: "0 2 * * *"  # daily at 02:00
  jobTemplate:
    spec:
      template:
        spec:
          containers:
            - name: backup
              # NOTE(review): backup.sh uploads via the `aws` CLI, which the
              # postgres:15-alpine image does not ship — use a custom image
              # or install the CLI in an init step; confirm before relying
              # on this job.
              image: postgres:15-alpine
              command: ["/scripts/backup.sh"]
              envFrom:
                - secretRef:
                    name: presence-service-secret
              volumeMounts:
                - name: scripts
                  mountPath: /scripts
                - name: backups
                  mountPath: /backups
          volumes:
            - name: scripts
              configMap:
                name: backup-scripts
                # ConfigMap volumes default to mode 0644 (not executable);
                # the script is exec'd directly, so it needs the execute bit.
                defaultMode: 0755
            - name: backups
              persistentVolumeClaim:
                claimName: backup-pvc
          restartPolicy: OnFailure

9. 安全配置

9.1 Network Policy

# k8s/network-policy.yaml
# Default-deny both directions for app=presence-service pods (both types are
# listed in policyTypes), then allow only the traffic the service needs.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: presence-service
  namespace: presence
spec:
  podSelector:
    matchLabels:
      app: presence-service
  policyTypes:
    - Ingress
    - Egress
  ingress:
    # Only the ingress controller may reach the application port.
    - from:
        - namespaceSelector:
            matchLabels:
              name: ingress-nginx
      ports:
        - protocol: TCP
          port: 3000
  egress:
    # DNS: under a default-deny egress policy the pod cannot resolve the
    # postgres/redis/kafka hostnames unless lookups to cluster DNS are
    # explicitly allowed.
    - to:
        - namespaceSelector: {}
          podSelector:
            matchLabels:
              k8s-app: kube-dns
      ports:
        - protocol: UDP
          port: 53
        - protocol: TCP
          port: 53
    - to:
        - namespaceSelector:
            matchLabels:
              name: postgres
      ports:
        - protocol: TCP
          port: 5432
    - to:
        - namespaceSelector:
            matchLabels:
              name: redis
      ports:
        - protocol: TCP
          port: 6379
    - to:
        - namespaceSelector:
            matchLabels:
              name: kafka
      ports:
        - protocol: TCP
          port: 9092

9.2 Pod Security Policy

# k8s/pod-security.yaml
# NOTE(review): PodSecurityPolicy (policy/v1beta1) was deprecated in
# Kubernetes 1.21 and REMOVED in 1.25 — this manifest will not apply on
# current clusters. Replace with Pod Security Admission namespace labels
# (pod-security.kubernetes.io/enforce) or a policy engine such as
# Kyverno/Gatekeeper.
apiVersion: policy/v1beta1
kind: PodSecurityPolicy
metadata:
  name: presence-service
spec:
  privileged: false
  runAsUser:
    rule: MustRunAsNonRoot
  seLinux:
    rule: RunAsAny
  fsGroup:
    rule: RunAsAny
  volumes:
    - 'configMap'
    - 'secret'
    - 'emptyDir'

10. 故障排查

10.1 常见问题

数据库连接失败

# 检查数据库连通性
kubectl run -it --rm debug --image=postgres:15-alpine -- \
  psql $DATABASE_URL -c "SELECT 1"

# 检查 Secret 配置
kubectl get secret presence-service-secret -n presence -o yaml

Redis 连接失败

# 检查 Redis 连通性
kubectl run -it --rm debug --image=redis:7-alpine -- \
  redis-cli -h $REDIS_HOST -p $REDIS_PORT ping

Pod CrashLoopBackOff

# 查看日志
kubectl logs -f <pod-name> -n presence --previous

# 查看事件
kubectl describe pod <pod-name> -n presence

10.2 性能调优

# 调整资源限制
resources:
  requests:
    cpu: "200m"
    memory: "512Mi"
  limits:
    cpu: "1000m"
    memory: "1Gi"

# Node.js 内存配置
env:
  - name: NODE_OPTIONS
    value: "--max-old-space-size=768"