From bc82f58549fce0bc59448ecc34a4974b5474b477 Mon Sep 17 00:00:00 2001 From: hailin Date: Sat, 6 Dec 2025 17:51:29 -0800 Subject: [PATCH] feat: add infrastructure components for observability and service discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add modular infrastructure stack with: - Consul: service discovery and configuration center - Jaeger: distributed tracing - Loki + Promtail: log aggregation - Prometheus: metrics collection with alert rules - Grafana: unified visualization dashboard All components are optional and can be enabled on-demand using Docker profiles. No changes required to existing microservices. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- backend/infrastructure/.env.example | 47 +++ backend/infrastructure/README.md | 257 ++++++++++++ .../consul/config/kv-defaults.json | 46 +++ .../consul/config/services.json | 232 +++++++++++ backend/infrastructure/deploy.sh | 273 ++++++++++++ backend/infrastructure/docker-compose.yml | 254 ++++++++++++ .../provisioning/dashboards/dashboards.yml | 18 + .../dashboards/rwa-services-overview.json | 387 ++++++++++++++++++ .../provisioning/datasources/datasources.yml | 104 +++++ backend/infrastructure/loki/loki-config.yml | 81 ++++ .../infrastructure/loki/promtail-config.yml | 129 ++++++ .../infrastructure/prometheus/prometheus.yml | 147 +++++++ .../prometheus/rules/rwa-alerts.yml | 126 ++++++ 13 files changed, 2101 insertions(+) create mode 100644 backend/infrastructure/.env.example create mode 100644 backend/infrastructure/README.md create mode 100644 backend/infrastructure/consul/config/kv-defaults.json create mode 100644 backend/infrastructure/consul/config/services.json create mode 100644 backend/infrastructure/deploy.sh create mode 100644 backend/infrastructure/docker-compose.yml create mode 100644 backend/infrastructure/grafana/provisioning/dashboards/dashboards.yml create mode 100644 
backend/infrastructure/grafana/provisioning/dashboards/rwa-services-overview.json create mode 100644 backend/infrastructure/grafana/provisioning/datasources/datasources.yml create mode 100644 backend/infrastructure/loki/loki-config.yml create mode 100644 backend/infrastructure/loki/promtail-config.yml create mode 100644 backend/infrastructure/prometheus/prometheus.yml create mode 100644 backend/infrastructure/prometheus/rules/rwa-alerts.yml diff --git a/backend/infrastructure/.env.example b/backend/infrastructure/.env.example new file mode 100644 index 00000000..c4f11eeb --- /dev/null +++ b/backend/infrastructure/.env.example @@ -0,0 +1,47 @@ +# ============================================================================= +# RWA Infrastructure - 环境变量配置 +# ============================================================================= +# 复制此文件为 .env 并修改配置 +# ============================================================================= + +# ----------------------------------------------------------------------------- +# Consul 配置 +# ----------------------------------------------------------------------------- +CONSUL_HTTP_PORT=8500 +CONSUL_DNS_PORT=8600 + +# ----------------------------------------------------------------------------- +# Jaeger 配置 +# ----------------------------------------------------------------------------- +JAEGER_UI_PORT=16686 + +# ----------------------------------------------------------------------------- +# Loki 配置 +# ----------------------------------------------------------------------------- +LOKI_PORT=3100 + +# ----------------------------------------------------------------------------- +# Grafana 配置 +# ----------------------------------------------------------------------------- +GRAFANA_PORT=3030 +GRAFANA_ADMIN_USER=admin +GRAFANA_ADMIN_PASSWORD=admin123 +GRAFANA_ROOT_URL=http://localhost:3030 +GRAFANA_LOG_LEVEL=info + +# ----------------------------------------------------------------------------- +# Prometheus 配置 +# 
----------------------------------------------------------------------------- +PROMETHEUS_PORT=9090 + +# ----------------------------------------------------------------------------- +# 后端服务器 IP (用于 Prometheus 抓取) +# ----------------------------------------------------------------------------- +BACKEND_SERVER_IP=192.168.1.111 +KONG_SERVER_IP=192.168.1.100 + +# ----------------------------------------------------------------------------- +# PostgreSQL 配置 (用于 Grafana 数据源) +# ----------------------------------------------------------------------------- +POSTGRES_USER=rwa_user +POSTGRES_PASSWORD=your_password_here diff --git a/backend/infrastructure/README.md b/backend/infrastructure/README.md new file mode 100644 index 00000000..3a905ed3 --- /dev/null +++ b/backend/infrastructure/README.md @@ -0,0 +1,257 @@ +# RWA Infrastructure - 可观测性与服务治理 + +可插拔的基础设施组件,支持按需启用,不影响现有微服务代码。 + +## 架构概览 + +``` + ┌─────────────────────────────────────────────────────┐ + │ Grafana │ + │ (统一可视化仪表盘 :3030) │ + └───────┬─────────────┬─────────────┬─────────────────┘ + │ │ │ + ┌──────────────▼──┐ ┌───────▼───────┐ ┌──▼──────────────┐ + │ Prometheus │ │ Loki │ │ Jaeger │ + │ (指标 :9090) │ │ (日志 :3100) │ │ (追踪 :16686) │ + └────────┬────────┘ └───────┬───────┘ └────────┬────────┘ + │ │ │ + │ ┌──────▼───────┐ │ + │ │ Promtail │ │ + │ │ (日志收集) │ │ + │ └──────┬───────┘ │ + │ │ │ + ┌──────────────────▼───────────────────▼────────────────────▼────────┐ + │ │ + │ RWA 微服务集群 │ + │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ + │ │ identity │ │ wallet │ │ mpc │ │ reward │ │ presence │ │ + │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ + │ ... │ + └─────────────────────────────────┬───────────────────────────────────┘ + │ + ┌────────▼────────┐ + │ Consul │ + │ (服务发现 :8500) │ + └─────────────────┘ +``` + +## 快速开始 + +### 1. 启动所有组件 + +```bash +cd infrastructure +./deploy.sh up +``` + +### 2. 
按需启动 + +```bash +# 只启动服务发现 +./deploy.sh up consul + +# 启动日志系统 +./deploy.sh up loki grafana + +# 启动追踪系统 +./deploy.sh up jaeger grafana + +# 启动监控系统 +./deploy.sh up prometheus grafana +``` + +### 3. 访问地址 + +| 服务 | 地址 | 用途 | +|------|------|------| +| Consul | http://localhost:8500 | 服务发现 & 配置中心 | +| Jaeger | http://localhost:16686 | 链路追踪 UI | +| Grafana | http://localhost:3030 | 统一仪表盘 | +| Prometheus | http://localhost:9090 | 指标查询 | +| Loki | http://localhost:3100 | 日志 API | + +## 组件说明 + +### Consul - 服务发现与配置中心 + +**功能:** +- 服务注册与发现 +- 健康检查 +- KV 配置存储 +- 多数据中心支持 + +**配置文件:** +- `consul/config/services.json` - 服务注册定义 +- `consul/config/kv-defaults.json` - 默认 KV 配置 + +**使用示例:** +```bash +# 查看已注册服务 +curl http://localhost:8500/v1/catalog/services + +# 读取配置 +curl http://localhost:8500/v1/kv/rwa/config/global/log_level?raw + +# 更新配置 +curl -X PUT -d 'debug' http://localhost:8500/v1/kv/rwa/config/global/log_level +``` + +### Jaeger - 分布式链路追踪 + +**功能:** +- 请求链路追踪 +- 性能瓶颈分析 +- 服务依赖可视化 +- 错误定位 + +**接入方式(NestJS):** +```typescript +// 安装依赖 +npm install @opentelemetry/sdk-node @opentelemetry/exporter-jaeger + +// 在 main.ts 中初始化 +import { NodeSDK } from '@opentelemetry/sdk-node'; +import { JaegerExporter } from '@opentelemetry/exporter-jaeger'; + +const sdk = new NodeSDK({ + traceExporter: new JaegerExporter({ + endpoint: 'http://jaeger:14268/api/traces', + }), + serviceName: 'identity-service', +}); +sdk.start(); +``` + +### Loki + Promtail - 日志聚合 + +**功能:** +- 自动收集 Docker 容器日志 +- 日志标签化与索引 +- LogQL 查询 +- 与 Grafana 深度集成 + +**日志查询示例(Grafana):** +```logql +# 查看所有错误日志 +{job="rwa-backend"} |~ "error|Error|ERROR" + +# 按服务筛选 +{service="identity-service"} | json | level="error" + +# 查看特定 trace +{trace_id="abc123"} +``` + +### Prometheus - 指标监控 + +**功能:** +- 指标收集 +- 告警规则 +- PromQL 查询 + +**告警规则:** +- `prometheus/rules/rwa-alerts.yml` - 预定义告警规则 + +### Grafana - 统一可视化 + +**预置仪表盘:** +- RWA Services Overview - 服务概览 +- Kong Dashboard - API 网关监控 +- Presence Dashboard - 用户在线状态 + +**数据源:** +- 
Prometheus (指标) +- Loki (日志) +- Jaeger (追踪) + +## 目录结构 + +``` +infrastructure/ +├── docker-compose.yml # 主编排文件 +├── deploy.sh # 部署脚本 +├── .env.example # 环境变量模板 +├── README.md # 本文档 +│ +├── consul/ +│ └── config/ +│ ├── services.json # 服务注册 +│ └── kv-defaults.json # KV 默认配置 +│ +├── jaeger/ # Jaeger 配置 (使用默认) +│ +├── loki/ +│ ├── loki-config.yml # Loki 配置 +│ └── promtail-config.yml # Promtail 配置 +│ +├── prometheus/ +│ ├── prometheus.yml # Prometheus 配置 +│ └── rules/ +│ └── rwa-alerts.yml # 告警规则 +│ +└── grafana/ + └── provisioning/ + ├── datasources/ + │ └── datasources.yml # 数据源配置 + └── dashboards/ + ├── dashboards.yml # 仪表盘配置 + └── rwa-services-overview.json +``` + +## 常用命令 + +```bash +# 启动 +./deploy.sh up # 启动所有 +./deploy.sh up consul jaeger # 启动指定组件 + +# 管理 +./deploy.sh status # 查看状态 +./deploy.sh health # 健康检查 +./deploy.sh logs grafana # 查看日志 +./deploy.sh restart # 重启 + +# 停止 +./deploy.sh down # 停止所有 +``` + +## 与现有服务集成 + +这些组件是**完全可选的**,不需要修改现有微服务代码即可获得以下能力: + +| 能力 | 无需改代码 | 需要少量改动 | +|------|-----------|-------------| +| 服务健康监控 | ✅ Consul 健康检查 | - | +| 日志聚合 | ✅ Docker 日志自动收集 | - | +| 基础指标 | ✅ Kong Prometheus 插件 | - | +| 详细指标 | - | 添加 Prometheus 中间件 | +| 链路追踪 | - | 添加 OpenTelemetry SDK | +| 动态配置 | - | 集成 Consul KV | + +## 扩展配置 + +### 添加告警通知 + +编辑 `prometheus/prometheus.yml`,配置 Alertmanager: + +```yaml +alerting: + alertmanagers: + - static_configs: + - targets: ['alertmanager:9093'] +``` + +### 添加更多服务到 Consul + +编辑 `consul/config/services.json`,添加新服务定义。 + +### 自定义 Grafana 仪表盘 + +将 JSON 文件放入 `grafana/provisioning/dashboards/` 目录即可自动加载。 + +## 生产环境建议 + +1. **持久化存储**:当前使用 Docker volumes,生产环境建议使用外部存储 +2. **高可用**:Consul 建议 3-5 节点集群 +3. **安全**:配置 TLS 和访问控制 +4. 
**资源限制**:添加 Docker 资源限制配置 diff --git a/backend/infrastructure/consul/config/kv-defaults.json b/backend/infrastructure/consul/config/kv-defaults.json new file mode 100644 index 00000000..086bf138 --- /dev/null +++ b/backend/infrastructure/consul/config/kv-defaults.json @@ -0,0 +1,46 @@ +{ + "description": "Consul KV 默认配置 - 可通过 API 或 UI 动态修改", + "usage": "consul kv import @kv-defaults.json", + "config": [ + { + "key": "rwa/config/global/log_level", + "value": "info" + }, + { + "key": "rwa/config/global/environment", + "value": "production" + }, + { + "key": "rwa/config/rate-limit/default", + "value": "{\"requests_per_minute\": 100, \"requests_per_hour\": 5000}" + }, + { + "key": "rwa/config/rate-limit/auth", + "value": "{\"requests_per_minute\": 20, \"requests_per_hour\": 200}" + }, + { + "key": "rwa/config/cache/ttl", + "value": "{\"default\": 300, \"session\": 3600, \"static\": 86400}" + }, + { + "key": "rwa/config/features/maintenance_mode", + "value": "false" + }, + { + "key": "rwa/config/features/new_user_registration", + "value": "true" + }, + { + "key": "rwa/config/features/kyc_required", + "value": "true" + }, + { + "key": "rwa/config/database/pool_size", + "value": "20" + }, + { + "key": "rwa/config/kafka/batch_size", + "value": "100" + } + ] +} diff --git a/backend/infrastructure/consul/config/services.json b/backend/infrastructure/consul/config/services.json new file mode 100644 index 00000000..304e2b73 --- /dev/null +++ b/backend/infrastructure/consul/config/services.json @@ -0,0 +1,232 @@ +{ + "services": [ + { + "name": "identity-service", + "id": "identity-service-1", + "address": "192.168.1.111", + "port": 3000, + "tags": ["rwa", "api", "identity"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3000/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "wallet-service", + "id": "wallet-service-1", + 
"address": "192.168.1.111", + "port": 3001, + "tags": ["rwa", "api", "wallet"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3001/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "backup-service", + "id": "backup-service-1", + "address": "192.168.1.111", + "port": 3002, + "tags": ["rwa", "api", "backup", "mpc"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3002/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "planting-service", + "id": "planting-service-1", + "address": "192.168.1.111", + "port": 3003, + "tags": ["rwa", "api", "planting"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3003/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "referral-service", + "id": "referral-service-1", + "address": "192.168.1.111", + "port": 3004, + "tags": ["rwa", "api", "referral"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3004/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "reward-service", + "id": "reward-service-1", + "address": "192.168.1.111", + "port": 3005, + "tags": ["rwa", "api", "reward"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3005/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "mpc-service", + "id": "mpc-service-1", + "address": "192.168.1.111", + "port": 3006, + "tags": ["rwa", "api", "mpc", "crypto"], + "meta": { 
+ "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3006/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "leaderboard-service", + "id": "leaderboard-service-1", + "address": "192.168.1.111", + "port": 3007, + "tags": ["rwa", "api", "leaderboard"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3007/api/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "reporting-service", + "id": "reporting-service-1", + "address": "192.168.1.111", + "port": 3008, + "tags": ["rwa", "api", "reporting"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3008/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "authorization-service", + "id": "authorization-service-1", + "address": "192.168.1.111", + "port": 3009, + "tags": ["rwa", "api", "authorization", "rbac"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3009/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "admin-service", + "id": "admin-service-1", + "address": "192.168.1.111", + "port": 3010, + "tags": ["rwa", "api", "admin"], + "meta": { + "version": "1.0.0", + "environment": "production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3010/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + }, + { + "name": "presence-service", + "id": "presence-service-1", + "address": "192.168.1.111", + "port": 3011, + "tags": ["rwa", "api", "presence", "realtime"], + "meta": { + "version": "1.0.0", + "environment": 
"production" + }, + "checks": [ + { + "http": "http://192.168.1.111:3011/api/v1/health", + "interval": "10s", + "timeout": "5s", + "deregister_critical_service_after": "1m" + } + ] + } + ] +} diff --git a/backend/infrastructure/deploy.sh b/backend/infrastructure/deploy.sh new file mode 100644 index 00000000..770ecfd6 --- /dev/null +++ b/backend/infrastructure/deploy.sh @@ -0,0 +1,273 @@ +#!/bin/bash +# ============================================================================= +# RWA Infrastructure - 部署脚本 +# ============================================================================= +# +# 用法: +# ./deploy.sh up [组件...] 启动组件 (默认: full) +# ./deploy.sh down 停止所有组件 +# ./deploy.sh restart [组件...] 重启组件 +# ./deploy.sh logs [组件] 查看日志 +# ./deploy.sh status 查看状态 +# ./deploy.sh health 健康检查 +# +# 可用组件: +# consul - 服务发现与配置中心 +# jaeger - 分布式链路追踪 +# loki - 日志聚合 (包含 promtail) +# grafana - 可视化仪表盘 +# prometheus- 指标收集 +# full - 所有组件 +# +# 示例: +# ./deploy.sh up # 启动所有组件 +# ./deploy.sh up consul jaeger # 只启动 Consul 和 Jaeger +# ./deploy.sh up loki grafana # 启动日志和可视化 +# ./deploy.sh logs jaeger # 查看 Jaeger 日志 +# +# ============================================================================= + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +# 颜色定义 +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# 配置 +COMPOSE_FILE="docker-compose.yml" +ENV_FILE=".env" +ENV_EXAMPLE=".env.example" + +# ============================================================================= +# 工具函数 +# ============================================================================= + +log_info() { + echo -e "${BLUE}[INFO]${NC} $1" +} + +log_success() { + echo -e "${GREEN}[SUCCESS]${NC} $1" +} + +log_warning() { + echo -e "${YELLOW}[WARNING]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# 检查 .env 文件 +check_env() { + if [ ! 
-f "$ENV_FILE" ]; then + if [ -f "$ENV_EXAMPLE" ]; then + log_info "创建 .env 文件..." + cp "$ENV_EXAMPLE" "$ENV_FILE" + log_warning "请检查 .env 文件并配置必要的环境变量" + else + log_warning ".env 文件不存在,使用默认配置" + fi + fi +} + +# 获取 profile 参数 +get_profiles() { + local profiles="" + if [ $# -eq 0 ] || [ "$1" = "full" ]; then + profiles="--profile full" + else + for component in "$@"; do + case "$component" in + consul|jaeger|loki|logging|grafana|prometheus|metrics|tracing) + profiles="$profiles --profile $component" + ;; + *) + log_error "未知组件: $component" + echo "可用组件: consul, jaeger, loki, grafana, prometheus, full" + exit 1 + ;; + esac + done + fi + echo "$profiles" +} + +# ============================================================================= +# 命令实现 +# ============================================================================= + +cmd_up() { + check_env + local profiles=$(get_profiles "$@") + + log_info "启动基础设施组件..." + docker compose -f "$COMPOSE_FILE" $profiles up -d + + log_success "组件已启动!" + echo "" + cmd_status +} + +cmd_down() { + log_info "停止所有组件..." + docker compose -f "$COMPOSE_FILE" --profile full down + log_success "所有组件已停止" +} + +cmd_restart() { + local profiles=$(get_profiles "$@") + + log_info "重启组件..." 
+ docker compose -f "$COMPOSE_FILE" $profiles restart + log_success "组件已重启" +} + +cmd_logs() { + local service="${1:-}" + if [ -n "$service" ]; then + docker compose -f "$COMPOSE_FILE" logs -f "$service" + else + docker compose -f "$COMPOSE_FILE" --profile full logs -f + fi +} + +cmd_status() { + echo "" + echo "==========================================" + echo " RWA Infrastructure 状态" + echo "==========================================" + echo "" + + docker compose -f "$COMPOSE_FILE" --profile full ps --format "table {{.Name}}\t{{.Status}}\t{{.Ports}}" + + echo "" + echo "==========================================" + echo " 访问地址" + echo "==========================================" + echo "" + echo " Consul UI: http://localhost:${CONSUL_HTTP_PORT:-8500}" + echo " Jaeger UI: http://localhost:${JAEGER_UI_PORT:-16686}" + echo " Grafana: http://localhost:${GRAFANA_PORT:-3030}" + echo " Prometheus: http://localhost:${PROMETHEUS_PORT:-9090}" + echo " Loki: http://localhost:${LOKI_PORT:-3100}" + echo "" +} + +cmd_health() { + echo "" + echo "==========================================" + echo " 健康检查" + echo "==========================================" + echo "" + + # Consul + if curl -s http://localhost:${CONSUL_HTTP_PORT:-8500}/v1/status/leader > /dev/null 2>&1; then + echo -e " Consul: ${GREEN}✓ Healthy${NC}" + else + echo -e " Consul: ${RED}✗ Unhealthy${NC}" + fi + + # Jaeger + if curl -s http://localhost:${JAEGER_UI_PORT:-16686} > /dev/null 2>&1; then + echo -e " Jaeger: ${GREEN}✓ Healthy${NC}" + else + echo -e " Jaeger: ${RED}✗ Unhealthy${NC}" + fi + + # Grafana + if curl -s http://localhost:${GRAFANA_PORT:-3030}/api/health > /dev/null 2>&1; then + echo -e " Grafana: ${GREEN}✓ Healthy${NC}" + else + echo -e " Grafana: ${RED}✗ Unhealthy${NC}" + fi + + # Prometheus + if curl -s http://localhost:${PROMETHEUS_PORT:-9090}/-/healthy > /dev/null 2>&1; then + echo -e " Prometheus: ${GREEN}✓ Healthy${NC}" + else + echo -e " Prometheus: ${RED}✗ Unhealthy${NC}" + fi + + # Loki + 
if curl -s http://localhost:${LOKI_PORT:-3100}/ready > /dev/null 2>&1; then + echo -e " Loki: ${GREEN}✓ Healthy${NC}" + else + echo -e " Loki: ${RED}✗ Unhealthy${NC}" + fi + + echo "" +} + +cmd_help() { + echo "" + echo "RWA Infrastructure 部署工具" + echo "" + echo "用法: $0 <命令> [参数...]" + echo "" + echo "命令:" + echo " up [组件...] 启动组件 (默认启动全部)" + echo " down 停止所有组件" + echo " restart [组件...] 重启组件" + echo " logs [组件] 查看日志" + echo " status 查看运行状态" + echo " health 健康检查" + echo " help 显示帮助" + echo "" + echo "可用组件:" + echo " consul 服务发现与配置中心" + echo " jaeger 分布式链路追踪" + echo " loki 日志聚合系统" + echo " grafana 可视化仪表盘" + echo " prometheus 指标收集" + echo " full 所有组件 (默认)" + echo "" + echo "示例:" + echo " $0 up # 启动所有组件" + echo " $0 up consul jaeger # 只启动 Consul 和 Jaeger" + echo " $0 logs grafana # 查看 Grafana 日志" + echo " $0 health # 检查所有组件健康状态" + echo "" +} + +# ============================================================================= +# 主入口 +# ============================================================================= + +case "${1:-help}" in + up) + shift + cmd_up "$@" + ;; + down) + cmd_down + ;; + restart) + shift + cmd_restart "$@" + ;; + logs) + shift + cmd_logs "$@" + ;; + status) + cmd_status + ;; + health) + cmd_health + ;; + help|--help|-h) + cmd_help + ;; + *) + log_error "未知命令: $1" + cmd_help + exit 1 + ;; +esac diff --git a/backend/infrastructure/docker-compose.yml b/backend/infrastructure/docker-compose.yml new file mode 100644 index 00000000..b2251bcf --- /dev/null +++ b/backend/infrastructure/docker-compose.yml @@ -0,0 +1,254 @@ +# ============================================================================= +# RWA Infrastructure - 可观测性与服务治理基础设施 +# ============================================================================= +# +# 模块化设计,可按需启用: +# - consul: 服务发现与配置中心 +# - jaeger: 分布式链路追踪 +# - loki: 日志聚合 +# - grafana: 统一可视化仪表盘 +# +# 使用方法: +# ./deploy.sh up # 启动所有组件 +# ./deploy.sh up consul # 只启动 Consul +# ./deploy.sh up jaeger loki # 启动指定组件 +# ./deploy.sh down # 
停止所有组件 +# +# ============================================================================= + +services: + # =========================================================================== + # Consul - 服务发现与配置中心 + # =========================================================================== + # 功能: + # - 服务注册与发现 + # - 健康检查 + # - KV 配置存储 + # - 多数据中心支持 + # =========================================================================== + consul: + image: docker.io/hashicorp/consul:1.18 + container_name: rwa-consul + command: agent -server -bootstrap-expect=1 -ui -client=0.0.0.0 -datacenter=rwa-dc1 + environment: + CONSUL_BIND_INTERFACE: eth0 + ports: + - "${CONSUL_HTTP_PORT:-8500}:8500" # HTTP API + UI + - "${CONSUL_DNS_PORT:-8600}:8600/udp" # DNS + - "8301:8301" # Serf LAN + - "8302:8302" # Serf WAN + volumes: + - consul_data:/consul/data + - ./consul/config:/consul/config:ro + healthcheck: + test: ["CMD", "consul", "members"] + interval: 10s + timeout: 5s + retries: 5 + restart: unless-stopped + networks: + - rwa-infra + profiles: + - consul + - full + + # =========================================================================== + # Jaeger - 分布式链路追踪 + # =========================================================================== + # 功能: + # - 请求链路追踪 + # - 性能瓶颈分析 + # - 服务依赖可视化 + # - 错误定位 + # =========================================================================== + jaeger: + image: docker.io/jaegertracing/all-in-one:1.54 + container_name: rwa-jaeger + environment: + COLLECTOR_ZIPKIN_HOST_PORT: :9411 + COLLECTOR_OTLP_ENABLED: true + SPAN_STORAGE_TYPE: badger + BADGER_EPHEMERAL: false + BADGER_DIRECTORY_VALUE: /badger/data + BADGER_DIRECTORY_KEY: /badger/key + ports: + - "${JAEGER_UI_PORT:-16686}:16686" # UI + - "6831:6831/udp" # Thrift compact (agent) + - "6832:6832/udp" # Thrift binary (agent) + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + - "14250:14250" # gRPC (collector) + - "14268:14268" # HTTP (collector) + - "9411:9411" # Zipkin compatible + volumes: + - 
jaeger_data:/badger + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://localhost:16686 || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + restart: unless-stopped + networks: + - rwa-infra + profiles: + - jaeger + - tracing + - full + + # =========================================================================== + # Loki - 日志聚合系统 + # =========================================================================== + # 功能: + # - 日志收集与存储 + # - 日志查询 (LogQL) + # - 与 Grafana 深度集成 + # - 低资源占用 + # =========================================================================== + loki: + image: docker.io/grafana/loki:2.9.4 + container_name: rwa-loki + command: -config.file=/etc/loki/loki-config.yml + ports: + - "${LOKI_PORT:-3100}:3100" + volumes: + - ./loki/loki-config.yml:/etc/loki/loki-config.yml:ro + - loki_data:/loki + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://localhost:3100/ready || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + restart: unless-stopped + networks: + - rwa-infra + profiles: + - loki + - logging + - full + + # =========================================================================== + # Promtail - 日志收集代理 + # =========================================================================== + # 功能: + # - 收集 Docker 容器日志 + # - 日志标签化 + # - 推送到 Loki + # =========================================================================== + promtail: + image: docker.io/grafana/promtail:2.9.4 + container_name: rwa-promtail + command: -config.file=/etc/promtail/promtail-config.yml + volumes: + - ./loki/promtail-config.yml:/etc/promtail/promtail-config.yml:ro + - /var/lib/docker/containers:/var/lib/docker/containers:ro + - /var/run/docker.sock:/var/run/docker.sock:ro + - promtail_positions:/tmp + depends_on: + loki: + condition: service_healthy + restart: unless-stopped + networks: + - rwa-infra + profiles: + - loki + - logging + - full + + # =========================================================================== + # Grafana - 统一可视化平台 + # 
=========================================================================== + # 功能: + # - 多数据源集成 (Prometheus, Loki, Jaeger) + # - 自定义仪表盘 + # - 告警管理 + # - 团队协作 + # =========================================================================== + grafana: + image: docker.io/grafana/grafana:10.3.1 + container_name: rwa-grafana + environment: + - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin} + - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin123} + - GF_USERS_ALLOW_SIGN_UP=false + # 服务器配置 + - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3030} + - GF_SERVER_SERVE_FROM_SUB_PATH=false + # 安全配置 + - GF_SECURITY_ALLOW_EMBEDDING=true + - GF_SECURITY_COOKIE_SAMESITE=lax + # 功能开关 + - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor tempoSearch tempoBackendSearch + # 日志级别 + - GF_LOG_LEVEL=${GRAFANA_LOG_LEVEL:-info} + ports: + - "${GRAFANA_PORT:-3030}:3000" + volumes: + - grafana_data:/var/lib/grafana + - ./grafana/provisioning:/etc/grafana/provisioning:ro + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://localhost:3000/api/health || exit 1"] + interval: 10s + timeout: 5s + retries: 5 + restart: unless-stopped + networks: + - rwa-infra + profiles: + - grafana + - full + + # =========================================================================== + # Prometheus - 指标收集 (可选,如果 api-gateway 已有可跳过) + # =========================================================================== + prometheus: + image: docker.io/prom/prometheus:v2.49.1 + container_name: rwa-prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--storage.tsdb.retention.time=15d' + - '--web.enable-lifecycle' + - '--web.enable-admin-api' + ports: + - "${PROMETHEUS_PORT:-9090}:9090" + volumes: + - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./prometheus/rules:/etc/prometheus/rules:ro + - prometheus_data:/prometheus + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://localhost:9090/-/healthy || exit 1"] + 
interval: 10s + timeout: 5s + retries: 5 + restart: unless-stopped + networks: + - rwa-infra + profiles: + - prometheus + - metrics + - full + +# ============================================================================= +# Volumes - 持久化存储 +# ============================================================================= +volumes: + consul_data: + driver: local + jaeger_data: + driver: local + loki_data: + driver: local + promtail_positions: + driver: local + grafana_data: + driver: local + prometheus_data: + driver: local + +# ============================================================================= +# Networks +# ============================================================================= +networks: + rwa-infra: + driver: bridge + name: rwa-infra diff --git a/backend/infrastructure/grafana/provisioning/dashboards/dashboards.yml b/backend/infrastructure/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 00000000..41e8f9f6 --- /dev/null +++ b/backend/infrastructure/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,18 @@ +# ============================================================================= +# Grafana Dashboard Provisioning +# ============================================================================= + +apiVersion: 1 + +providers: + - name: 'RWA Dashboards' + orgId: 1 + folder: 'RWA' + folderUid: 'rwa' + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards + foldersFromFilesStructure: false diff --git a/backend/infrastructure/grafana/provisioning/dashboards/rwa-services-overview.json b/backend/infrastructure/grafana/provisioning/dashboards/rwa-services-overview.json new file mode 100644 index 00000000..1c8faa4f --- /dev/null +++ b/backend/infrastructure/grafana/provisioning/dashboards/rwa-services-overview.json @@ -0,0 +1,387 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": 
"-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "RWA 微服务集群监控概览", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { "h": 3, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "options": { + "code": { "language": "plaintext", "showLineNumbers": false }, + "content": "# RWA 微服务监控中心\n\n实时监控 12 个微服务的健康状态、性能指标和日志", + "mode": "markdown" + }, + "pluginVersion": "10.3.1", + "title": "", + "transparent": true, + "type": "text" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "none" + } + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 3 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "pluginVersion": "10.3.1", + "targets": [ + { + "expr": "up{job=\"identity-service\"}", + "refId": "A" + } + ], + "title": "Identity Service", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 4, 
"y": 3 }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "up{job=\"wallet-service\"}", + "refId": "A" + } + ], + "title": "Wallet Service", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 3 }, + "id": 4, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "up{job=\"mpc-service\"}", + "refId": "A" + } + ], + "title": "MPC Service", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 3 }, + "id": 5, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "up{job=\"reward-service\"}", + "refId": "A" + } 
+ ], + "title": "Reward Service", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 3 }, + "id": 6, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "up{job=\"presence-service\"}", + "refId": "A" + } + ], + "title": "Presence Service", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + } + } + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 3 }, + "id": 7, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { + "expr": "up{job=\"backup-service\"}", + "refId": "A" + } + ], + "title": "Backup Service", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": 
"", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "reqps" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 }, + "id": 8, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "sum(rate(http_requests_total{job=~\".*-service\"}[5m])) by (job)", + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Request Rate by Service", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "unit": "ms" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 }, + "id": 9, + "options": { + "legend": { "calcs": ["mean", "p95"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, 
sum(rate(http_request_duration_seconds_bucket{job=~\".*-service\"}[5m])) by (le, job)) * 1000", + "legendFormat": "{{job}} P95", + "refId": "A" + } + ], + "title": "Response Time P95 by Service", + "type": "timeseries" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 15 }, + "id": 10, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": true, + "showCommonLabels": false, + "showLabels": true, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "expr": "{job=\"rwa-backend\"} |~ \"error|Error|ERROR|warn|Warn|WARN\" | json", + "refId": "A" + } + ], + "title": "Error & Warning Logs", + "type": "logs" + } + ], + "refresh": "10s", + "schemaVersion": 39, + "tags": ["rwa", "microservices", "overview"], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "RWA Services Overview", + "uid": "rwa-services-overview", + "version": 1, + "weekStart": "" +} diff --git a/backend/infrastructure/grafana/provisioning/datasources/datasources.yml b/backend/infrastructure/grafana/provisioning/datasources/datasources.yml new file mode 100644 index 00000000..35e65b5c --- /dev/null +++ b/backend/infrastructure/grafana/provisioning/datasources/datasources.yml @@ -0,0 +1,106 @@ +# ============================================================================= +# Grafana Datasources - automatically provisioned data sources +# ============================================================================= + +apiVersion: 1 + +datasources: + # =========================================================================== + # Prometheus - metrics data source + # =========================================================================== + - name: Prometheus + type: prometheus + uid: prometheus # fixed UID: the dashboard panels and the Jaeger tracesToMetrics link reference it + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false + jsonData: + httpMethod: POST + manageAlerts: true +
prometheusType: Prometheus + prometheusVersion: 2.49.1 + + # =========================================================================== + # Loki - logs data source + # =========================================================================== + - name: Loki + type: loki + uid: loki # fixed UID: the logs panel and the Jaeger tracesToLogsV2 link reference it + access: proxy + url: http://loki:3100 + editable: false + jsonData: + maxLines: 1000 + derivedFields: + # Extract the trace ID from log lines and link it to Jaeger (accepts both trace_id and traceId keys) + - name: TraceID + matcherRegex: '"(?:trace_id|traceId)":"([a-f0-9]+)"' + url: '$${__value.raw}' # with datasourceUid set, url is the query sent to that data source + datasourceUid: jaeger + urlDisplayLabel: View Trace + + # =========================================================================== + # Jaeger - distributed tracing data source + # =========================================================================== + - name: Jaeger + type: jaeger + uid: jaeger + access: proxy + url: http://jaeger:16686 + editable: false + jsonData: + tracesToLogsV2: + datasourceUid: loki + spanStartTimeShift: '-1h' + spanEndTimeShift: '1h' + filterByTraceID: true + filterBySpanID: true + tracesToMetrics: + datasourceUid: prometheus + spanStartTimeShift: '-1h' + spanEndTimeShift: '1h' + nodeGraph: + enabled: true + traceQuery: + timeShiftEnabled: true + spanStartTimeShift: '1h' + spanEndTimeShift: '-1h' + + # =========================================================================== + # Kong Prometheus (if the api-gateway's Prometheus is deployed separately) + # =========================================================================== + - name: Kong-Prometheus + type: prometheus + access: proxy + url: http://192.168.1.100:9099 + editable: false + jsonData: + httpMethod: POST + + # =========================================================================== + # PostgreSQL - direct database queries (optional) + # =========================================================================== + - name: PostgreSQL-RWA + type: postgres + access: proxy + url: 192.168.1.111:5432 + user: ${POSTGRES_USER} # Grafana expands only $VAR / ${VAR}; set POSTGRES_USER in Grafana's environment + editable: false + jsonData: + database: rwa_identity + sslmode: disable + maxOpenConns: 5 +
maxIdleConns: 2 + connMaxLifetime: 14400 + secureJsonData: + password: ${POSTGRES_PASSWORD} # Grafana expands only $VAR / ${VAR}; the ${VAR:-default} form is not supported + + # =========================================================================== + # Redis - cache monitoring (requires the Redis plugin) + # =========================================================================== + # - name: Redis + # type: redis-datasource + # access: proxy + # url: redis://192.168.1.111:6379 + # editable: false diff --git a/backend/infrastructure/loki/loki-config.yml b/backend/infrastructure/loki/loki-config.yml new file mode 100644 index 00000000..53474927 --- /dev/null +++ b/backend/infrastructure/loki/loki-config.yml @@ -0,0 +1,77 @@ +# ============================================================================= +# Loki Configuration - log aggregation system +# ============================================================================= + +auth_enabled: false + +server: + http_listen_port: 3100 + grpc_listen_port: 9096 + log_level: info + +common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + +query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + +ruler: + alertmanager_url: http://localhost:9093 + storage: + type: local + local: + directory: /loki/rules + rule_path: /loki/rules-temp + ring: + kvstore: + store: inmemory + enable_api: true + +limits_config: + reject_old_samples: true + reject_old_samples_max_age: 168h # 7 days + max_cache_freshness_per_query: 10m + split_queries_by_interval: 15m + ingestion_rate_mb: 10 + ingestion_burst_size_mb: 20 + per_stream_rate_limit: 5MB + per_stream_rate_limit_burst: 15MB + # Keep logs for 14 days. With the TSDB index the compactor (retention_enabled below) + # enforces retention and reads the period from this limit, not from table_manager. + retention_period: 336h +
+compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + delete_request_store: filesystem diff --git a/backend/infrastructure/loki/promtail-config.yml b/backend/infrastructure/loki/promtail-config.yml new file mode 100644 index 00000000..786e6594 --- /dev/null +++ b/backend/infrastructure/loki/promtail-config.yml @@ -0,0 +1,129 @@ +# ============================================================================= +# Promtail Configuration - log collection agent +# ============================================================================= +# Features: +# - Auto-discover Docker container logs +# - Parse and label log entries +# - Push to Loki +# ============================================================================= + +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + tenant_id: rwa + +scrape_configs: + # =========================================================================== + # Docker container log collection + # =========================================================================== + - job_name: docker + docker_sd_configs: + - host: unix:///var/run/docker.sock + refresh_interval: 5s + filters: + - name: label + values: ["com.docker.compose.project"] + + relabel_configs: + # Use the container name as a log label + - source_labels: ['__meta_docker_container_name'] + regex: '/(.*)' + target_label: 'container' + + # Use the compose project name + - source_labels: ['__meta_docker_container_label_com_docker_compose_project'] + target_label: 'project' + + # Use the compose service name + - source_labels: ['__meta_docker_container_label_com_docker_compose_service'] + target_label: 'service' + + # Container ID + - source_labels: ['__meta_docker_container_id'] + target_label: 'container_id' + + pipeline_stages: + # Parse JSON-formatted logs (NestJS default format) + - json: +
expressions: + level: level + message: message + context: context + timestamp: timestamp + trace_id: traceId + span_id: spanId + + # Only level and context become labels. trace_id and span_id are unique per + # request and would create one Loki stream per request, so they stay in the + # log line and are filtered at query time (e.g. with `| json`). + - labels: + level: + context: + + # Timestamp parsing + - timestamp: + source: timestamp + format: RFC3339Nano + fallback_formats: + - RFC3339 + - UnixMs + + # Drop health-probe request logs (optional, reduces noise). The pattern is + # anchored on a URL path segment so ordinary words such as "already" or + # "delivered" no longer get unrelated lines dropped. + - match: + selector: '{service=~".+"}' + stages: + - drop: + expression: '.*/(health|healthcheck|ready|live)([/?" ]|$).*' + drop_counter_reason: healthcheck_noise + + # =========================================================================== + # RWA microservice logs (collected directly from the server) + # =========================================================================== + - job_name: rwa-services + static_configs: + - targets: + - localhost + labels: + job: rwa-backend + __path__: /var/log/rwa/*.log + + pipeline_stages: + - json: + expressions: + level: level + message: message + context: context + service: service + - labels: + level: + context: + service: + + # =========================================================================== + # System logs (optional) + # =========================================================================== + - job_name: system + static_configs: + - targets: + - localhost + labels: + job: system + __path__: /var/log/syslog + + pipeline_stages: + - regex: + expression: '^(?P<timestamp>\w+\s+\d+\s+\d+:\d+:\d+)\s+(?P<host>\S+)\s+(?P<process>\S+):\s+(?P<message>.*)$' + - labels: + host: + process: + - timestamp: + source: timestamp + format: 'Jan _2 15:04:05' # syslog pads single-digit days with a space, which "_2" accepts diff --git a/backend/infrastructure/prometheus/prometheus.yml b/backend/infrastructure/prometheus/prometheus.yml new file mode 100644 index 00000000..d4d34f30 --- /dev/null +++ b/backend/infrastructure/prometheus/prometheus.yml @@ -0,0 +1,147 @@ +# ============================================================================= +# Prometheus Configuration - RWA microservice monitoring +# ============================================================================= + +global: +
scrape_interval: 15s + evaluation_interval: 15s + external_labels: + cluster: 'rwa-production' + env: 'production' + +# Alerting rule files +rule_files: + - /etc/prometheus/rules/*.yml + +# Alertmanager configuration (optional). While this stays commented out, the rules above are evaluated but no notifications are sent. +# alerting: +# alertmanagers: +# - static_configs: +# - targets: +# - alertmanager:9093 + +scrape_configs: + # =========================================================================== + # Prometheus self-monitoring + # =========================================================================== + - job_name: 'prometheus' + static_configs: + - targets: ['localhost:9090'] + + # =========================================================================== + # Kong API Gateway monitoring + # =========================================================================== + - job_name: 'kong' + static_configs: + - targets: ['192.168.1.100:8001'] + metrics_path: /metrics + scrape_interval: 10s + + # =========================================================================== + # RWA microservice monitoring + # =========================================================================== + - job_name: 'identity-service' + static_configs: + - targets: ['192.168.1.111:3000'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + - job_name: 'wallet-service' + static_configs: + - targets: ['192.168.1.111:3001'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + - job_name: 'backup-service' + static_configs: + - targets: ['192.168.1.111:3002'] + metrics_path: /metrics # NOTE(review): every other service here uses /api/v1/metrics; confirm this path is intentional + scrape_interval: 15s + + - job_name: 'planting-service' + static_configs: + - targets: ['192.168.1.111:3003'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + - job_name: 'referral-service' + static_configs: + - targets: ['192.168.1.111:3004'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + - job_name: 'reward-service' + static_configs: + - targets: ['192.168.1.111:3005'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + - job_name: 'mpc-service' + static_configs: + - targets: ['192.168.1.111:3006'] + metrics_path:
/api/v1/metrics + scrape_interval: 15s + + - job_name: 'leaderboard-service' + static_configs: + - targets: ['192.168.1.111:3007'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + - job_name: 'reporting-service' + static_configs: + - targets: ['192.168.1.111:3008'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + - job_name: 'authorization-service' + static_configs: + - targets: ['192.168.1.111:3009'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + - job_name: 'admin-service' + static_configs: + - targets: ['192.168.1.111:3010'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + - job_name: 'presence-service' + static_configs: + - targets: ['192.168.1.111:3011'] + metrics_path: /api/v1/metrics + scrape_interval: 15s + + # =========================================================================== + # Infrastructure monitoring + # =========================================================================== + - job_name: 'consul' + static_configs: + - targets: ['consul:8500'] + metrics_path: /v1/agent/metrics + params: + format: ['prometheus'] + + - job_name: 'jaeger' + static_configs: + - targets: ['jaeger:14269'] + metrics_path: /metrics + + - job_name: 'loki' + static_configs: + - targets: ['loki:3100'] + metrics_path: /metrics + + # =========================================================================== + # Docker container monitoring (requires cAdvisor) + # =========================================================================== + # - job_name: 'cadvisor' + # static_configs: + # - targets: ['cadvisor:8080'] + + # =========================================================================== + # Node Exporter (host monitoring) + # =========================================================================== + # - job_name: 'node' + # static_configs: + # - targets: ['192.168.1.111:9100', '192.168.1.100:9100'] diff --git a/backend/infrastructure/prometheus/rules/rwa-alerts.yml b/backend/infrastructure/prometheus/rules/rwa-alerts.yml new file mode 100644 index
00000000..55bfd863 --- /dev/null +++ b/backend/infrastructure/prometheus/rules/rwa-alerts.yml @@ -0,0 +1,129 @@ +# ============================================================================= +# RWA microservice alerting rules +# ============================================================================= + +groups: + # =========================================================================== + # Service availability alerts + # =========================================================================== + - name: service-availability + rules: + - alert: ServiceDown + expr: up{job=~".*-service"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "服务不可用: {{ $labels.job }}" + description: "服务 {{ $labels.job }} 已经停止响应超过 1 分钟" + + - alert: ServiceHighErrorRate + expr: | + sum(rate(http_requests_total{job=~".*-service", status=~"5.."}[5m])) by (job) + / + sum(rate(http_requests_total{job=~".*-service"}[5m])) by (job) + > 0.05 + for: 5m + labels: + severity: warning + annotations: + summary: "服务错误率过高: {{ $labels.job }}" + description: "服务 {{ $labels.job }} 的 5xx 错误率超过 5%" + + - alert: ServiceHighLatency + expr: | + histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{job=~".*-service"}[5m])) by (le, job)) + > 2 + for: 5m + labels: + severity: warning + annotations: + summary: "服务响应延迟过高: {{ $labels.job }}" + description: "服务 {{ $labels.job }} 的 P95 响应时间超过 2 秒" + + # =========================================================================== + # Kong gateway alerts + # =========================================================================== + - name: kong-gateway + rules: + - alert: KongHighLatency + expr: | + histogram_quantile(0.99, sum(rate(kong_latency_bucket{type="request"}[5m])) by (le)) + > 5000 + for: 5m + labels: + severity: warning + annotations: + summary: "Kong 网关延迟过高" + description: "Kong 网关 P99 延迟超过 5 秒" + + - alert: KongHighErrorRate + expr: | + sum(rate(kong_http_status{code=~"5.."}[5m])) + / + sum(rate(kong_http_status[5m])) + > 0.01 + for: 5m + labels: +
severity: critical + annotations: + summary: "Kong 网关错误率过高" + description: "Kong 网关 5xx 错误率超过 1%" + + - alert: KongRateLimitTriggered + # rate() is per second; scale by 60 so the threshold matches the "more than 10 per minute" wording of the description. + expr: sum(rate(kong_rate_limiting_requests_total{status="over_limit"}[5m])) * 60 > 10 + for: 2m + labels: + severity: warning + annotations: + summary: "触发限流保护" + description: "每分钟有超过 10 个请求被限流" + + # =========================================================================== + # Infrastructure alerts + # =========================================================================== + - name: infrastructure + rules: + - alert: ConsulServiceUnhealthy + # NOTE(review): consul_health_service_status is exported by consul_exporter, not by Consul's own + # /v1/agent/metrics endpoint; confirm an exporter is scraped, otherwise this rule never fires. + expr: consul_health_service_status{status!="passing"} > 0 + for: 2m + labels: + severity: warning + annotations: + summary: "Consul 服务健康检查失败: {{ $labels.service }}" + description: "服务 {{ $labels.service }} 在 Consul 中的健康检查未通过" + + - alert: LokiIngestionErrors + expr: sum(rate(loki_distributor_bytes_received_total[5m])) == 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Loki 没有接收到日志" + description: "Loki 在过去 5 分钟内没有接收到任何日志数据" + + # =========================================================================== + # Business metric alerts (examples) + # =========================================================================== + - name: business-metrics + rules: + - alert: LowDAU + expr: rwa_presence_dau < 100 + for: 1h + labels: + severity: info + annotations: + summary: "日活用户数较低" + description: "当前 DAU 仅为 {{ $value }}" + + - alert: HighPendingRewards + expr: rwa_rewards_pending_count > 10000 + for: 30m + labels: + severity: warning + annotations: + summary: "待领取奖励积压" + description: "待领取奖励数量超过 10000"