rwadurian/backend/infrastructure/prometheus/rules/rwa-alerts.yml

127 lines
4.4 KiB
YAML

# =============================================================================
# RWA microservice alerting rules
#
# Prometheus alerting rule file. Each entry under `groups` is a named rule
# group; each rule pairs a PromQL expression (`expr`) with a hold duration
# (`for`), a `severity` label, and summary/description annotations.
# =============================================================================
groups:
# ===========================================================================
# Service availability alerts
# ===========================================================================
- name: service-availability
  rules:
  # Fires when the `up` series of any job whose name matches `.*-service`
  # has been 0 for one minute.
  - alert: ServiceDown
    expr: up{job=~".*-service"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '服务不可用: {{ $labels.job }}'
      description: '服务 {{ $labels.job }} 已经停止响应超过 1 分钟'

  # Per-job ratio of the 5xx request rate to the total request rate, both
  # taken over 5-minute windows; fires when the ratio stays above 0.05 (5%)
  # for five minutes.
  - alert: ServiceHighErrorRate
    expr: |
      sum(rate(http_requests_total{job=~".*-service", status=~"5.."}[5m])) by (job)
      /
      sum(rate(http_requests_total{job=~".*-service"}[5m])) by (job)
      > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '服务错误率过高: {{ $labels.job }}'
      description: '服务 {{ $labels.job }} 的 5xx 错误率超过 5%'

  # Per-job 0.95 quantile estimated from the request-duration histogram
  # buckets; fires when it stays above 2 (seconds, per the metric name) for
  # five minutes.
  - alert: ServiceHighLatency
    expr: |
      histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~".*-service"}[5m])) by (le, job))
      > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '服务响应延迟过高: {{ $labels.job }}'
      description: '服务 {{ $labels.job }} 的 P95 响应时间超过 2 秒'
# ===========================================================================
# Kong gateway alerts
# ===========================================================================
- name: kong-gateway
  rules:
  # 0.99 quantile of the Kong latency histogram for type="request"; fires
  # when it stays above 5000 for five minutes.
  # NOTE(review): the description says "5 seconds", so the buckets are
  # presumably in milliseconds -- confirm against the Kong Prometheus plugin
  # version in use.
  - alert: KongHighLatency
    expr: |
      histogram_quantile(0.99, sum(rate(kong_latency_bucket{type="request"}[5m])) by (le))
      > 5000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Kong 网关延迟过高'
      description: 'Kong 网关 P99 延迟超过 5 秒'

  # Ratio of the 5xx response rate to the overall response rate across the
  # gateway; fires when it stays above 0.01 (1%) for five minutes.
  - alert: KongHighErrorRate
    expr: |
      sum(rate(kong_http_status{code=~"5.."}[5m]))
      /
      sum(rate(kong_http_status[5m]))
      > 0.01
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Kong 网关错误率过高'
      description: 'Kong 网关 5xx 错误率超过 1%'

  # rate() yields a per-second average, but the description states the
  # threshold as "more than 10 rate-limited requests per minute". Multiply
  # by 60 so the compared value is requests per minute (averaged over the
  # 5-minute window) and the expression matches the documented threshold.
  - alert: KongRateLimitTriggered
    expr: sum(rate(kong_rate_limiting_requests_total{status="over_limit"}[5m])) * 60 > 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: '触发限流保护'
      description: '每分钟有超过 10 个请求被限流'
# ===========================================================================
# Infrastructure alerts
# ===========================================================================
- name: infrastructure
  rules:
  # Fires when any Consul service health series with a status other than
  # "passing" has a value above 0 for two minutes.
  # NOTE(review): the annotations read `$labels.service`; the upstream
  # consul_exporter labels this metric with `service_name` / `service_id`.
  # Confirm a `service` label exists here (e.g. via relabeling), otherwise
  # the templated name renders empty.
  - alert: ConsulServiceUnhealthy
    expr: consul_health_service_status{status!="passing"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'Consul 服务健康检查失败: {{ $labels.service }}'
      description: '服务 {{ $labels.service }} 在 Consul 中的健康检查未通过'

  # Fires when Loki distributors received no bytes over the last 5 minutes.
  # `sum(rate(...)) == 0` alone returns an empty result when the metric does
  # not exist at all (Loki down or not scraped), so the alert would stay
  # silent in exactly the worst case; `or absent(...)` covers the
  # missing-series situation as well.
  - alert: LokiIngestionErrors
    expr: |
      sum(rate(loki_distributor_bytes_received_total[5m])) == 0
      or
      absent(loki_distributor_bytes_received_total)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'Loki 没有接收到日志'
      description: 'Loki 在过去 5 分钟内没有接收到任何日志数据'
# ===========================================================================
# Business metric alerts (examples)
# ===========================================================================
- name: business-metrics
  rules:
  # Informational: fires when the rwa_presence_dau gauge stays below 100
  # for one hour. The description embeds the current sample value.
  - alert: LowDAU
    expr: rwa_presence_dau < 100
    for: 1h
    labels:
      severity: info
    annotations:
      summary: '日活用户数较低'
      description: '当前 DAU 仅为 {{ $value }}'

  # Fires when rwa_rewards_pending_count stays above 10000 for 30 minutes.
  - alert: HighPendingRewards
    expr: rwa_rewards_pending_count > 10000
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: '待领取奖励积压'
      description: '待领取奖励数量超过 10000'