127 lines
4.4 KiB
YAML
127 lines
4.4 KiB
YAML
# =============================================================================
|
|
# RWA 微服务告警规则
|
|
# =============================================================================
|
|
|
|
groups:
# ===========================================================================
# Service availability alerts
# ===========================================================================
- name: service-availability
  rules:
  # Fires when a scrape target whose job name ends in "-service" has
  # reported up == 0 continuously for 1 minute.
  - alert: ServiceDown
    expr: up{job=~".*-service"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '服务不可用: {{ $labels.job }}'
      description: '服务 {{ $labels.job }} 已经停止响应超过 1 分钟'

  # Per-job ratio of requests with a 5xx status to all requests, both taken
  # as 5-minute rates. Fires when the ratio stays above 0.05 (5%) for
  # 5 minutes.
  - alert: ServiceHighErrorRate
    expr: |
      sum(rate(http_requests_total{job=~".*-service", status=~"5.."}[5m])) by (job)
      /
      sum(rate(http_requests_total{job=~".*-service"}[5m])) by (job)
      > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '服务错误率过高: {{ $labels.job }}'
      description: '服务 {{ $labels.job }} 的 5xx 错误率超过 5%'

  # Per-job 0.95 quantile estimated from the request-duration histogram
  # buckets (5-minute rate). Fires when it stays above 2 for 5 minutes; the
  # metric name indicates the unit is seconds.
  - alert: ServiceHighLatency
    expr: |
      histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~".*-service"}[5m])) by (le, job))
      > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '服务响应延迟过高: {{ $labels.job }}'
      description: '服务 {{ $labels.job }} 的 P95 响应时间超过 2 秒'

# ===========================================================================
# Kong gateway alerts
# ===========================================================================
# NOTE(review): Kong 3.x renamed several Prometheus plugin metrics
# (kong_http_status / kong_latency) -- confirm the deployed Kong version
# still exports the metric names used below.
- name: kong-gateway
  rules:
  # Gateway-wide 0.99 quantile of the type="request" latency histogram
  # (5-minute rate). The threshold 5000 matches the "5 秒" wording only if
  # the histogram is recorded in milliseconds -- presumably so; confirm.
  - alert: KongHighLatency
    expr: |
      histogram_quantile(0.99, sum(rate(kong_latency_bucket{type="request"}[5m])) by (le))
      > 5000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Kong 网关延迟过高"
      description: "Kong 网关 P99 延迟超过 5 秒"

  # Gateway-wide share of 5xx responses among all responses (5-minute
  # rates). Fires when it stays above 0.01 (1%) for 5 minutes.
  - alert: KongHighErrorRate
    expr: |
      sum(rate(kong_http_status{code=~"5.."}[5m]))
      /
      sum(rate(kong_http_status[5m]))
      > 0.01
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kong 网关错误率过高"
      description: "Kong 网关 5xx 错误率超过 1%"

  # rate() returns a per-second average, while the description promises
  # "more than 10 requests per minute". Scale by 60 so the comparison is
  # made in requests per minute and matches the documented threshold.
  - alert: KongRateLimitTriggered
    expr: sum(rate(kong_rate_limiting_requests_total{status="over_limit"}[5m])) * 60 > 10
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "触发限流保护"
      description: "每分钟有超过 10 个请求被限流"

# ===========================================================================
# Infrastructure alerts
# ===========================================================================
- name: infrastructure
  rules:
  # Fires when any Consul service health-check series with a status other
  # than "passing" has a value above 0 for 2 minutes.
  # consul_exporter labels this metric with `service_name` (not `service`),
  # so the annotations prefer `service_name` and fall back to `service` in
  # case the scrape config relabels it.
  - alert: ConsulServiceUnhealthy
    expr: consul_health_service_status{status!="passing"} > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Consul 服务健康检查失败: {{ or $labels.service_name $labels.service }}"
      description: "服务 {{ or $labels.service_name $labels.service }} 在 Consul 中的健康检查未通过"

  # Fires when Loki's distributor has received no bytes for 5 minutes.
  # Despite the alert name, this detects missing ingestion, not ingestion
  # errors. The `or absent(...)` arm covers the case where the series does
  # not exist at all (e.g. the distributor is down or not scraped): there
  # sum(rate(...)) returns an empty result and `== 0` alone never fires.
  - alert: LokiIngestionErrors
    expr: |
      sum(rate(loki_distributor_bytes_received_total[5m])) == 0
      or
      absent(loki_distributor_bytes_received_total)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Loki 没有接收到日志"
      description: "Loki 在过去 5 分钟内没有接收到任何日志数据"

# ===========================================================================
# Business metric alerts (examples)
# ===========================================================================
- name: business-metrics
  rules:
  # Informational alert: the rwa_presence_dau gauge has stayed below 100
  # for a full hour. The current value is interpolated into the description.
  - alert: LowDAU
    expr: rwa_presence_dau < 100
    for: 1h
    labels:
      severity: info
    annotations:
      summary: '日活用户数较低'
      description: '当前 DAU 仅为 {{ $value }}'

  # Backlog alert: rwa_rewards_pending_count has stayed above 10000 for
  # 30 minutes.
  - alert: HighPendingRewards
    expr: rwa_rewards_pending_count > 10000
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: '待领取奖励积压'
      description: '待领取奖励数量超过 10000'