feat: voice mode event filtering — skip tool/thinking events for Agent SDK

1. Remove on_enter greeting entirely (no more race condition)
2. voice-agent sends voiceMode: true when engine_type is claude_agent_sdk
3. AgentController.runTaskStream() filters thinking, tool_use, and tool_result
   events in voice mode — they are not emitted to the client; all other events
   (text, completed, error, etc.) are still forwarded
4. Detailed logging: every event is logged, with a [FILTERED-voice] tag appended when it is skipped

Claude API mode is completely unaffected (voiceMode defaults to false).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-02 02:56:41 -08:00
parent 7c9fabd891
commit da17488389
3 changed files with 27 additions and 13 deletions

View File

@ -38,6 +38,7 @@ export class AgentController {
allowedTools?: string[];
engineType?: string;
maxContextMessages?: number;
voiceMode?: boolean;
attachments?: Array<{ base64Data: string; mediaType: string; fileName?: string }>;
},
) {
@ -118,6 +119,7 @@ export class AgentController {
maxTurns: body.maxTurns || 10,
conversationHistory: historyForEngine.length > 0 ? historyForEngine : undefined,
resumeSessionId,
voiceMode: body.voiceMode ?? false,
});
return { sessionId: session.id, taskId: task.id };
@ -406,9 +408,17 @@ export class AgentController {
maxTurns: number;
conversationHistory?: Array<{ role: 'user' | 'assistant'; content: string | any[] }>;
resumeSessionId?: string;
voiceMode?: boolean;
},
) {
const isSdkEngine = engine.engineType === AgentEngineType.CLAUDE_AGENT_SDK;
const voiceMode = params.voiceMode ?? false;
/** Event types to suppress in voice mode (only forward text/completed/error) */
const voiceFilteredTypes = new Set(['thinking', 'tool_use', 'tool_result']);
if (voiceMode) {
this.logger.log(`[Task ${task.id}] Voice mode ON — filtering ${[...voiceFilteredTypes].join(', ')} events`);
}
const taskPromise = (async () => {
let finished = false;
@ -431,8 +441,14 @@ export class AgentController {
for await (const event of stream) {
eventCount++;
this.logger.log(`[Task ${task.id}] Event #${eventCount}: type=${event.type}${event.type === 'text' ? ` len=${(event as any).content?.length}` : ''}${event.type === 'error' ? ` msg=${(event as any).message}` : ''}`);
this.gateway.emitStreamEvent(session.id, event);
const isFiltered = voiceMode && voiceFilteredTypes.has(event.type);
this.logger.log(`[Task ${task.id}] Event #${eventCount}: type=${event.type}${event.type === 'text' ? ` len=${(event as any).content?.length}` : ''}${event.type === 'error' ? ` msg=${(event as any).message}` : ''}${isFiltered ? ' [FILTERED-voice]' : ''}`);
// In voice mode, skip intermediate events (tool_use, tool_result, thinking)
// but still process lifecycle events below (completed/error/approval)
if (!isFiltered) {
this.gateway.emitStreamEvent(session.id, event);
}
// Collect text for assistant message
if (event.type === 'text') {

View File

@ -93,14 +93,8 @@ class IT0VoiceAgent(Agent):
)
async def on_enter(self):
"""Called when the agent becomes active — greet the user.
Uses session.say() with a static message instead of generate_reply()
to avoid triggering the Agent SDK / LLM pipeline for a simple greeting.
This prevents a race condition when the user speaks before the
greeting LLM task completes.
"""
self.session.say("你好我是IT0运维助手有什么可以帮你的")
"""No greeting — wait for the user to speak first."""
pass
# ---------------------------------------------------------------------------

View File

@ -225,19 +225,23 @@ class AgentServiceLLMStream(llm.LLMStream):
f"\n用户说:{user_text}"
)
# Voice mode flag: tell agent-service to filter intermediate events
# (tool_use, tool_result, thinking) — only stream text + completed + error
voice_mode = engine_type == "claude_agent_sdk"
body: dict[str, Any] = {
"prompt": prompt,
"engineType": engine_type,
"voiceMode": voice_mode,
}
if self._llm_instance._agent_session_id:
body["sessionId"] = self._llm_instance._agent_session_id
logger.info(
"POST /tasks engine=%s wrapped=%s user_text=%s body.engineType=%s",
"POST /tasks engine=%s voiceMode=%s user_text=%s",
engine_type,
engine_type == "claude_agent_sdk",
voice_mode,
user_text[:80],
body["engineType"],
)
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),