# =============================================================================
# RWA microservice alerting rules
#
# Prometheus rule file. Each group below bundles related alerts; every rule
# fires once its `expr` has returned a result continuously for the `for`
# duration. The `summary` / `description` annotations are user-facing text.
# =============================================================================

groups:
  # ===========================================================================
  # Service availability alerts
  # ===========================================================================
  - name: service-availability
    rules:
      # A scrape target whose job name ends in "-service" reports up == 0.
      - alert: ServiceDown
        expr: up{job=~".*-service"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "服务不可用: {{ $labels.job }}"
          description: "服务 {{ $labels.job }} 已经停止响应超过 1 分钟"

      # Per-job ratio of 5xx responses to all responses over a 5m window
      # exceeds 0.05 (5%).
      - alert: ServiceHighErrorRate
        expr: |
          sum(rate(http_requests_total{job=~".*-service", status=~"5.."}[5m])) by (job)
          /
          sum(rate(http_requests_total{job=~".*-service"}[5m])) by (job)
          > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "服务错误率过高: {{ $labels.job }}"
          description: "服务 {{ $labels.job }} 的 5xx 错误率超过 5%"

      # Per-job 95th percentile of the request-duration histogram exceeds 2.
      - alert: ServiceHighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum(rate(http_request_duration_seconds_bucket{job=~".*-service"}[5m])) by (le, job)
          ) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "服务响应延迟过高: {{ $labels.job }}"
          description: "服务 {{ $labels.job }} 的 P95 响应时间超过 2 秒"

  # ===========================================================================
  # Kong gateway alerts
  # ===========================================================================
  - name: kong-gateway
    rules:
      # Gateway-wide 99th percentile of request latency exceeds 5000.
      # NOTE(review): the description says 5 seconds, so the metric is
      # presumably reported in milliseconds -- confirm against the Kong plugin.
      - alert: KongHighLatency
        expr: |
          histogram_quantile(
            0.99,
            sum(rate(kong_latency_bucket{type="request"}[5m])) by (le)
          ) > 5000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Kong 网关延迟过高"
          description: "Kong 网关 P99 延迟超过 5 秒"

      # Gateway-wide ratio of 5xx responses to all responses exceeds 0.01 (1%).
      - alert: KongHighErrorRate
        expr: |
          sum(rate(kong_http_status{code=~"5.."}[5m]))
          /
          sum(rate(kong_http_status[5m]))
          > 0.01
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Kong 网关错误率过高"
          description: "Kong 网关 5xx 错误率超过 1%"

      # rate() yields a per-second average over the 5m window, so this
      # threshold means "more than 10 over-limit requests per second".
      # The description is worded to match that unit.
      - alert: KongRateLimitTriggered
        expr: sum(rate(kong_rate_limiting_requests_total{status="over_limit"}[5m])) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "触发限流保护"
          description: "过去 5 分钟内平均每秒有超过 10 个请求被限流"

  # ===========================================================================
  # Infrastructure alerts
  # ===========================================================================
  - name: infrastructure
    rules:
      # Any Consul service-health series with a status other than "passing"
      # has a value above 0.
      # NOTE(review): the annotations read the "service" label; the upstream
      # consul_exporter appears to name this label "service_name" -- confirm
      # that a relabel rule provides "service", otherwise it renders empty.
      - alert: ConsulServiceUnhealthy
        expr: consul_health_service_status{status!="passing"} > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Consul 服务健康检查失败: {{ $labels.service }}"
          description: "服务 {{ $labels.service }} 在 Consul 中的健康检查未通过"

      # Fires when Loki's distributor receives zero bytes. The absent() arm
      # covers the case where the metric has no series at all: sum() over an
      # empty vector returns nothing, so "== 0" alone would never fire then.
      - alert: LokiIngestionErrors
        expr: |
          sum(rate(loki_distributor_bytes_received_total[5m])) == 0
          or
          absent(loki_distributor_bytes_received_total)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Loki 没有接收到日志"
          description: "Loki 在过去 5 分钟内没有接收到任何日志数据"

  # ===========================================================================
  # Business metric alerts (examples)
  # ===========================================================================
  - name: business-metrics
    rules:
      # Daily-active-user gauge stays below 100 for an hour.
      - alert: LowDAU
        expr: rwa_presence_dau < 100
        for: 1h
        labels:
          severity: info
        annotations:
          summary: "日活用户数较低"
          description: "当前 DAU 仅为 {{ $value }}"

      # Pending-reward count stays above 10000 for 30 minutes.
      - alert: HighPendingRewards
        expr: rwa_rewards_pending_count > 10000
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "待领取奖励积压"
          description: "待领取奖励数量超过 10000"