feat(agents): add Redis checkpoint for agent loop crash recovery
- New RedisClientService: optional ioredis wrapper, gracefully degrades without REDIS_URL - New RedisModule: global NestJS module providing Redis connectivity - AgentCheckpoint interface: captures turn, messages, cost, agents, timestamp - Agent loop saves checkpoint after each tool execution batch (TTL=10min) - On restart with same conversationId+requestId, loads checkpoint and resumes from saved state - Checkpoint auto-deleted after load to prevent stale recovery - Coordinator injects @Optional() RedisClientService, builds save/load callbacks - Zero impact when Redis is not configured — checkpoint silently skipped Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
1a1573dda3
commit
0d488ac68b
|
|
@ -49,6 +49,9 @@ import { EVALUATION_RULE_REPOSITORY } from '../../domain/repositories/evaluation
|
|||
// Tool Hooks
|
||||
import { ToolHooksService } from './hooks/tool-hooks.service';
|
||||
|
||||
// Redis (optional — checkpoint persistence)
|
||||
import { RedisModule } from '../cache/redis.module';
|
||||
|
||||
/**
|
||||
* Anthropic Client Provider
|
||||
* 共享的 Anthropic SDK 实例,所有 Agent 共用
|
||||
|
|
@ -82,6 +85,7 @@ const AnthropicClientProvider = {
|
|||
TypeOrmModule.forFeature([TokenUsageORM, EvaluationRuleORM, ConversationORM]),
|
||||
McpModule,
|
||||
PaymentModule,
|
||||
RedisModule,
|
||||
],
|
||||
providers: [
|
||||
// Shared Anthropic client
|
||||
|
|
|
|||
|
|
@ -125,14 +125,43 @@ export async function* agentLoop(
|
|||
traceId,
|
||||
} = params;
|
||||
|
||||
const currentTurn = params.currentTurnCount || 0;
|
||||
const currentCost = params.currentCostUsd || 0;
|
||||
let currentTurn = params.currentTurnCount || 0;
|
||||
let currentCost = params.currentCostUsd || 0;
|
||||
/** 日志前缀 — 包含 traceId 用于端到端追踪 */
|
||||
const lp = traceId ? `[trace:${traceId}]` : `[conv:${conversationId}]`;
|
||||
let totalInputTokens = 0;
|
||||
let totalOutputTokens = 0;
|
||||
const agentsUsed: SpecialistAgentType[] = [];
|
||||
|
||||
// ---- Checkpoint Recovery (if available) ----
|
||||
// 首轮(currentTurn=0)尝试从 Redis 恢复上次崩溃前的状态
|
||||
if (currentTurn === 0 && params.checkpoint) {
|
||||
try {
|
||||
const saved = await params.checkpoint.load();
|
||||
if (saved) {
|
||||
logger.log(`${lp} Recovering from checkpoint at turn ${saved.turn} (${Date.now() - saved.timestamp}ms ago)`);
|
||||
// 用 checkpoint 的 messages 替代当前 messages,跳到 saved turn 继续
|
||||
yield* agentLoop(
|
||||
anthropicClient,
|
||||
{
|
||||
...params,
|
||||
messages: saved.messages,
|
||||
currentTurnCount: saved.turn,
|
||||
currentCostUsd: saved.costSoFar,
|
||||
checkpoint: undefined, // 恢复后不再尝试加载,只保存
|
||||
},
|
||||
toolExecutor,
|
||||
additionalTools,
|
||||
additionalConcurrencyMap,
|
||||
progressEventSink,
|
||||
);
|
||||
return;
|
||||
}
|
||||
} catch (err) {
|
||||
logger.warn(`${lp} Checkpoint load failed (starting fresh): ${err}`);
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Safety Checks ----
|
||||
if (currentTurn >= maxTurns) {
|
||||
yield {
|
||||
|
|
@ -545,6 +574,22 @@ export async function* agentLoop(
|
|||
{ role: 'user', content: toolResultMessages },
|
||||
];
|
||||
|
||||
// ---- Save Checkpoint (if configured) ----
|
||||
if (params.checkpoint) {
|
||||
try {
|
||||
await params.checkpoint.save({
|
||||
turn: currentTurn + 1,
|
||||
messages: nextMessages,
|
||||
costSoFar: currentCost + turnCost,
|
||||
agentsUsed,
|
||||
timestamp: Date.now(),
|
||||
});
|
||||
logger.debug(`${lp} Checkpoint saved at turn ${currentTurn + 1}`);
|
||||
} catch (err) {
|
||||
logger.warn(`${lp} Checkpoint save failed (non-fatal): ${err}`);
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Recurse ----
|
||||
yield* agentLoop(
|
||||
anthropicClient,
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@
|
|||
* 4. 产出流式事件给 ConversationService
|
||||
*/
|
||||
|
||||
import { Injectable, OnModuleInit } from '@nestjs/common';
|
||||
import { Injectable, OnModuleInit, Optional } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import Anthropic from '@anthropic-ai/sdk';
|
||||
import { Logger } from '@nestjs/common';
|
||||
|
|
@ -37,6 +37,7 @@ import {
|
|||
ClaudeMessage,
|
||||
SystemPromptBlock,
|
||||
SpecialistAgentType,
|
||||
CheckpointCallbacks,
|
||||
} from '../types/agent.types';
|
||||
import { StreamEvent } from '../types/stream.types';
|
||||
import { ConversationContext } from '../types/context.types';
|
||||
|
|
@ -61,6 +62,9 @@ import { InputGateService } from './input-gate.service';
|
|||
import { ToolHooksService } from '../hooks/tool-hooks.service';
|
||||
import { ToolType } from '../hooks/tool-hooks.types';
|
||||
|
||||
// Redis (optional checkpoint persistence)
|
||||
import { RedisClientService } from '../../cache/redis-client.service';
|
||||
|
||||
// ============================================================
|
||||
// Compatibility Types (与 ClaudeAgentServiceV2 的 StreamChunk 兼容)
|
||||
// ============================================================
|
||||
|
|
@ -157,6 +161,8 @@ export class CoordinatorAgentService implements OnModuleInit {
|
|||
private readonly tenantContext: TenantContextService,
|
||||
// Tool hooks (PreToolUse/PostToolUse interception)
|
||||
private readonly toolHooks: ToolHooksService,
|
||||
// Redis (optional — for checkpoint persistence)
|
||||
@Optional() private readonly redisClient?: RedisClientService,
|
||||
) {}
|
||||
|
||||
onModuleInit() {
|
||||
|
|
@ -287,6 +293,9 @@ export class CoordinatorAgentService implements OnModuleInit {
|
|||
return gateResult;
|
||||
};
|
||||
|
||||
// 5.5. Build checkpoint callbacks (if Redis available)
|
||||
const checkpoint = this.buildCheckpointCallbacks(context.conversationId, traceId);
|
||||
|
||||
// 6. Build agent loop params
|
||||
const loopParams: AgentLoopParams = {
|
||||
messages: enrichedMessages,
|
||||
|
|
@ -301,6 +310,7 @@ export class CoordinatorAgentService implements OnModuleInit {
|
|||
currentTurnCount: 0,
|
||||
currentCostUsd: 0,
|
||||
evaluationGate: evaluationGateCallback,
|
||||
checkpoint,
|
||||
};
|
||||
|
||||
// 6. Create progress event sink (shared between tool executor and agent loop)
|
||||
|
|
@ -782,6 +792,42 @@ export class CoordinatorAgentService implements OnModuleInit {
|
|||
}
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Checkpoint Persistence
|
||||
// ============================================================
|
||||
|
||||
/** Checkpoint TTL: 10 minutes */
|
||||
private readonly CHECKPOINT_TTL_SECONDS = 600;
|
||||
|
||||
/**
|
||||
* 构建 checkpoint 回调 — 如果 Redis 不可用则返回 undefined
|
||||
*/
|
||||
private buildCheckpointCallbacks(
|
||||
conversationId: string,
|
||||
requestId: string,
|
||||
): CheckpointCallbacks | undefined {
|
||||
if (!this.redisClient?.isAvailable()) return undefined;
|
||||
|
||||
const key = `agent:checkpoint:${conversationId}:${requestId}`;
|
||||
|
||||
return {
|
||||
save: async (checkpoint) => {
|
||||
await this.redisClient!.setex(
|
||||
key,
|
||||
this.CHECKPOINT_TTL_SECONDS,
|
||||
JSON.stringify(checkpoint),
|
||||
);
|
||||
},
|
||||
load: async () => {
|
||||
const raw = await this.redisClient!.get(key);
|
||||
if (!raw) return null;
|
||||
// 加载后删除 — 避免过期 checkpoint 重复恢复
|
||||
await this.redisClient!.del(key);
|
||||
return JSON.parse(raw);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================
|
||||
// Event Mapping
|
||||
// ============================================================
|
||||
|
|
|
|||
|
|
@ -262,6 +262,21 @@ export interface CoordinatorTurnStats {
|
|||
// Coordinator Loop Parameters
|
||||
// ============================================================
|
||||
|
||||
/** Agent Loop 状态快照 — 用于崩溃恢复 */
|
||||
export interface AgentCheckpoint {
|
||||
turn: number;
|
||||
messages: ClaudeMessage[];
|
||||
costSoFar: number;
|
||||
agentsUsed: string[];
|
||||
timestamp: number;
|
||||
}
|
||||
|
||||
/** Checkpoint 回调 — save/load 由 coordinator 注入,agent-loop 调用 */
|
||||
export interface CheckpointCallbacks {
|
||||
save: (checkpoint: AgentCheckpoint) => Promise<void>;
|
||||
load: () => Promise<AgentCheckpoint | null>;
|
||||
}
|
||||
|
||||
/** Agent Loop 参数 */
|
||||
export interface AgentLoopParams {
|
||||
messages: ClaudeMessage[];
|
||||
|
|
@ -284,6 +299,8 @@ export interface AgentLoopParams {
|
|||
turnCount: number,
|
||||
agentsUsed: string[],
|
||||
) => Promise<import('../coordinator/evaluation-gate.service').GateResult>;
|
||||
/** 可选 checkpoint 回调 — 提供时在每轮 tool 执行后保存状态快照 */
|
||||
checkpoint?: CheckpointCallbacks;
|
||||
}
|
||||
|
||||
/** Claude API 消息格式 */
|
||||
|
|
|
|||
82
packages/services/conversation-service/src/infrastructure/cache/redis-client.service.ts
vendored
Normal file
82
packages/services/conversation-service/src/infrastructure/cache/redis-client.service.ts
vendored
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
/**
|
||||
* Redis Client Service
|
||||
* 可选的 Redis 连接 — 如果 REDIS_URL 未配置则静默跳过
|
||||
*
|
||||
* 用途:Agent Loop 状态快照(checkpoint)
|
||||
*/
|
||||
|
||||
import { Injectable, OnModuleInit, OnModuleDestroy, Logger, Optional } from '@nestjs/common';
|
||||
import { ConfigService } from '@nestjs/config';
|
||||
import Redis from 'ioredis';
|
||||
|
||||
@Injectable()
|
||||
export class RedisClientService implements OnModuleInit, OnModuleDestroy {
|
||||
private readonly logger = new Logger(RedisClientService.name);
|
||||
private client: Redis | null = null;
|
||||
|
||||
constructor(@Optional() private readonly configService?: ConfigService) {}
|
||||
|
||||
async onModuleInit() {
|
||||
const redisUrl = this.configService?.get<string>('REDIS_URL');
|
||||
if (!redisUrl) {
|
||||
this.logger.log('REDIS_URL not configured — checkpoint persistence disabled');
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
this.client = new Redis(redisUrl, {
|
||||
maxRetriesPerRequest: 3,
|
||||
lazyConnect: true,
|
||||
connectTimeout: 5000,
|
||||
});
|
||||
await this.client.connect();
|
||||
this.logger.log(`Connected to Redis: ${redisUrl.replace(/\/\/.*@/, '//*****@')}`);
|
||||
} catch (error) {
|
||||
this.logger.warn(`Redis connection failed (checkpoint disabled): ${error}`);
|
||||
this.client = null;
|
||||
}
|
||||
}
|
||||
|
||||
async onModuleDestroy() {
|
||||
if (this.client) {
|
||||
await this.client.quit();
|
||||
this.client = null;
|
||||
}
|
||||
}
|
||||
|
||||
/** Redis 是否可用 */
|
||||
isAvailable(): boolean {
|
||||
return this.client !== null && this.client.status === 'ready';
|
||||
}
|
||||
|
||||
/** SET with TTL (seconds) */
|
||||
async setex(key: string, ttlSeconds: number, value: string): Promise<void> {
|
||||
if (!this.client) return;
|
||||
try {
|
||||
await this.client.setex(key, ttlSeconds, value);
|
||||
} catch (error) {
|
||||
this.logger.debug(`Redis setex failed: ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
/** GET */
|
||||
async get(key: string): Promise<string | null> {
|
||||
if (!this.client) return null;
|
||||
try {
|
||||
return await this.client.get(key);
|
||||
} catch (error) {
|
||||
this.logger.debug(`Redis get failed: ${error}`);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/** DEL */
|
||||
async del(key: string): Promise<void> {
|
||||
if (!this.client) return;
|
||||
try {
|
||||
await this.client.del(key);
|
||||
} catch (error) {
|
||||
this.logger.debug(`Redis del failed: ${error}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
16
packages/services/conversation-service/src/infrastructure/cache/redis.module.ts
vendored
Normal file
16
packages/services/conversation-service/src/infrastructure/cache/redis.module.ts
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
/**
|
||||
* Redis Module
|
||||
* 可选的 Redis 连接模块 — 提供 RedisClientService
|
||||
*/
|
||||
|
||||
import { Module, Global } from '@nestjs/common';
|
||||
import { ConfigModule } from '@nestjs/config';
|
||||
import { RedisClientService } from './redis-client.service';
|
||||
|
||||
@Global()
|
||||
@Module({
|
||||
imports: [ConfigModule],
|
||||
providers: [RedisClientService],
|
||||
exports: [RedisClientService],
|
||||
})
|
||||
export class RedisModule {}
|
||||
Loading…
Reference in New Issue