feat: voice mode event filtering — skip tool/thinking events for Agent SDK

1. Remove the on_enter greeting entirely — the agent now waits for the user to speak first (no more race condition)
2. voice-agent sends voiceMode: true when engine_type is claude_agent_sdk
3. AgentController.runTaskStream() drops thinking, tool_use, and tool_result
   events in voice mode — every other event type (text, completed, error, …)
   still reaches the client
4. Detailed logging: every event is still logged, and skipped events carry a [FILTERED-voice] tag

Claude API mode is completely unaffected (voiceMode defaults to false).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-02 02:56:41 -08:00
parent 7c9fabd891
commit da17488389
3 changed files with 27 additions and 13 deletions

View File

@ -38,6 +38,7 @@ export class AgentController {
allowedTools?: string[]; allowedTools?: string[];
engineType?: string; engineType?: string;
maxContextMessages?: number; maxContextMessages?: number;
voiceMode?: boolean;
attachments?: Array<{ base64Data: string; mediaType: string; fileName?: string }>; attachments?: Array<{ base64Data: string; mediaType: string; fileName?: string }>;
}, },
) { ) {
@ -118,6 +119,7 @@ export class AgentController {
maxTurns: body.maxTurns || 10, maxTurns: body.maxTurns || 10,
conversationHistory: historyForEngine.length > 0 ? historyForEngine : undefined, conversationHistory: historyForEngine.length > 0 ? historyForEngine : undefined,
resumeSessionId, resumeSessionId,
voiceMode: body.voiceMode ?? false,
}); });
return { sessionId: session.id, taskId: task.id }; return { sessionId: session.id, taskId: task.id };
@ -406,9 +408,17 @@ export class AgentController {
maxTurns: number; maxTurns: number;
conversationHistory?: Array<{ role: 'user' | 'assistant'; content: string | any[] }>; conversationHistory?: Array<{ role: 'user' | 'assistant'; content: string | any[] }>;
resumeSessionId?: string; resumeSessionId?: string;
voiceMode?: boolean;
}, },
) { ) {
const isSdkEngine = engine.engineType === AgentEngineType.CLAUDE_AGENT_SDK; const isSdkEngine = engine.engineType === AgentEngineType.CLAUDE_AGENT_SDK;
const voiceMode = params.voiceMode ?? false;
/** Event types to suppress in voice mode (only forward text/completed/error) */
const voiceFilteredTypes = new Set(['thinking', 'tool_use', 'tool_result']);
if (voiceMode) {
this.logger.log(`[Task ${task.id}] Voice mode ON — filtering ${[...voiceFilteredTypes].join(', ')} events`);
}
const taskPromise = (async () => { const taskPromise = (async () => {
let finished = false; let finished = false;
@ -431,8 +441,14 @@ export class AgentController {
for await (const event of stream) { for await (const event of stream) {
eventCount++; eventCount++;
this.logger.log(`[Task ${task.id}] Event #${eventCount}: type=${event.type}${event.type === 'text' ? ` len=${(event as any).content?.length}` : ''}${event.type === 'error' ? ` msg=${(event as any).message}` : ''}`); const isFiltered = voiceMode && voiceFilteredTypes.has(event.type);
this.gateway.emitStreamEvent(session.id, event); this.logger.log(`[Task ${task.id}] Event #${eventCount}: type=${event.type}${event.type === 'text' ? ` len=${(event as any).content?.length}` : ''}${event.type === 'error' ? ` msg=${(event as any).message}` : ''}${isFiltered ? ' [FILTERED-voice]' : ''}`);
// In voice mode, skip intermediate events (tool_use, tool_result, thinking)
// but still process lifecycle events below (completed/error/approval)
if (!isFiltered) {
this.gateway.emitStreamEvent(session.id, event);
}
// Collect text for assistant message // Collect text for assistant message
if (event.type === 'text') { if (event.type === 'text') {

View File

@ -93,14 +93,8 @@ class IT0VoiceAgent(Agent):
) )
async def on_enter(self): async def on_enter(self):
"""Called when the agent becomes active — greet the user. """No greeting — wait for the user to speak first."""
pass
Uses session.say() with a static message instead of generate_reply()
to avoid triggering the Agent SDK / LLM pipeline for a simple greeting.
This prevents a race condition when the user speaks before the
greeting LLM task completes.
"""
self.session.say("你好我是IT0运维助手有什么可以帮你的")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

View File

@ -225,19 +225,23 @@ class AgentServiceLLMStream(llm.LLMStream):
f"\n用户说:{user_text}" f"\n用户说:{user_text}"
) )
# Voice mode flag: tell agent-service to filter intermediate events
# (tool_use, tool_result, thinking) — only stream text + completed + error
voice_mode = engine_type == "claude_agent_sdk"
body: dict[str, Any] = { body: dict[str, Any] = {
"prompt": prompt, "prompt": prompt,
"engineType": engine_type, "engineType": engine_type,
"voiceMode": voice_mode,
} }
if self._llm_instance._agent_session_id: if self._llm_instance._agent_session_id:
body["sessionId"] = self._llm_instance._agent_session_id body["sessionId"] = self._llm_instance._agent_session_id
logger.info( logger.info(
"POST /tasks engine=%s wrapped=%s user_text=%s body.engineType=%s", "POST /tasks engine=%s voiceMode=%s user_text=%s",
engine_type, engine_type,
engine_type == "claude_agent_sdk", voice_mode,
user_text[:80], user_text[:80],
body["engineType"],
) )
async with httpx.AsyncClient( async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10), timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),