diff --git a/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts b/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts index 80aead7..aa6a5eb 100644 --- a/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts +++ b/packages/services/agent-service/src/interfaces/rest/controllers/agent.controller.ts @@ -38,6 +38,7 @@ export class AgentController { allowedTools?: string[]; engineType?: string; maxContextMessages?: number; + voiceMode?: boolean; attachments?: Array<{ base64Data: string; mediaType: string; fileName?: string }>; }, ) { @@ -118,6 +119,7 @@ export class AgentController { maxTurns: body.maxTurns || 10, conversationHistory: historyForEngine.length > 0 ? historyForEngine : undefined, resumeSessionId, + voiceMode: body.voiceMode ?? false, }); return { sessionId: session.id, taskId: task.id }; @@ -406,9 +408,17 @@ export class AgentController { maxTurns: number; conversationHistory?: Array<{ role: 'user' | 'assistant'; content: string | any[] }>; resumeSessionId?: string; + voiceMode?: boolean; }, ) { const isSdkEngine = engine.engineType === AgentEngineType.CLAUDE_AGENT_SDK; + const voiceMode = params.voiceMode ?? false; + /** Event types to suppress in voice mode (only forward text/completed/error) */ + const voiceFilteredTypes = new Set(['thinking', 'tool_use', 'tool_result']); + + if (voiceMode) { + this.logger.log(`[Task ${task.id}] Voice mode ON — filtering ${[...voiceFilteredTypes].join(', ')} events`); + } const taskPromise = (async () => { let finished = false; @@ -431,8 +441,14 @@ export class AgentController { for await (const event of stream) { eventCount++; - this.logger.log(`[Task ${task.id}] Event #${eventCount}: type=${event.type}${event.type === 'text' ? ` len=${(event as any).content?.length}` : ''}${event.type === 'error' ? ` msg=${(event as any).message}` : ''}`); - this.gateway.emitStreamEvent(session.id, event); + const isFiltered = voiceMode && voiceFilteredTypes.has(event.type); + this.logger.log(`[Task ${task.id}] Event #${eventCount}: type=${event.type}${event.type === 'text' ? ` len=${(event as any).content?.length}` : ''}${event.type === 'error' ? ` msg=${(event as any).message}` : ''}${isFiltered ? ' [FILTERED-voice]' : ''}`); + + // In voice mode, skip intermediate events (tool_use, tool_result, thinking) + // but still process lifecycle events below (completed/error/approval) + if (!isFiltered) { + this.gateway.emitStreamEvent(session.id, event); + } // Collect text for assistant message if (event.type === 'text') { diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py index 9e1e66b..1d647d0 100644 --- a/packages/services/voice-agent/src/agent.py +++ b/packages/services/voice-agent/src/agent.py @@ -93,14 +93,8 @@ class IT0VoiceAgent(Agent): ) async def on_enter(self): - """Called when the agent becomes active — greet the user. - - Uses session.say() with a static message instead of generate_reply() - to avoid triggering the Agent SDK / LLM pipeline for a simple greeting. - This prevents a race condition when the user speaks before the - greeting LLM task completes. - """ - self.session.say("你好,我是IT0运维助手,有什么可以帮你的?") + """No greeting — wait for the user to speak first.""" + pass # --------------------------------------------------------------------------- diff --git a/packages/services/voice-agent/src/plugins/agent_llm.py b/packages/services/voice-agent/src/plugins/agent_llm.py index 399979d..3bbae12 100644 --- a/packages/services/voice-agent/src/plugins/agent_llm.py +++ b/packages/services/voice-agent/src/plugins/agent_llm.py @@ -225,19 +225,23 @@ class AgentServiceLLMStream(llm.LLMStream): f"\n用户说:{user_text}" ) + # Voice mode flag: tell agent-service to filter intermediate events + # (tool_use, tool_result, thinking) — only stream text + completed + error + voice_mode = engine_type == "claude_agent_sdk" + body: dict[str, Any] = { "prompt": prompt, "engineType": engine_type, + "voiceMode": voice_mode, } if self._llm_instance._agent_session_id: body["sessionId"] = self._llm_instance._agent_session_id logger.info( - "POST /tasks engine=%s wrapped=%s user_text=%s body.engineType=%s", + "POST /tasks engine=%s voiceMode=%s user_text=%s", engine_type, - engine_type == "claude_agent_sdk", + voice_mode, user_text[:80], - body["engineType"], ) async with httpx.AsyncClient( timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),