fix: improve voice pipeline robustness for poor network conditions

Flutter (agent_call_page.dart):
- Add ConnectOptions with 15s timeouts for connection/peerConnection/iceRestart/publish
- Add RoomReconnectingEvent/RoomAttemptReconnectEvent/RoomReconnectedEvent listeners with a "网络重连中" UI indicator shown during reconnection
- Add TimeoutException detection in _friendlyError()

voice-agent (agent.py):
- Wrap entrypoint() in try-except with full traceback logging
- Register a room disconnect listener to close httpx clients (instead of a finally block, since session.start() returns while the session keeps running in the background)
- Add asyncio import for the ensure_future-based cleanup

voice-agent LLM proxy (agent_llm.py):
- Add retry with backoff (max 2 retries, 1s/3s delays) for network errors (ConnectError/ConnectTimeout/OSError) and WS InvalidStatusCode
- Extract _do_stream() method for the single-attempt logic
- Add WebSocket connection params: open_timeout=10, close_timeout=5, ping_interval=20, ping_timeout=10 for keepalive and faster dead-connection detection
- Use granular httpx.Timeout(connect=10, read=30, write=10, pool=10)
- Increase WS recv timeout from 5s to 30s to reduce unnecessary loop iterations

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-01 23:34:55 -08:00 · 2026-03-01 23:34:55 -08:00 · e66c187353
parent 32922c6819
commit e66c187353
3 changed files with 351 additions and 208 deletions
--- a/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart
+++ b/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart
@ -42,6 +42,10 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
  bool _isMuted = false;
  bool _isSpeaker = true;
  // Reconnection state
  bool _isReconnecting = false;
  int _reconnectAttempt = 0;
  // Prevent double-actions
  bool _userEndedCall = false;
@ -95,16 +99,47 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
        ..on<TrackUnsubscribedEvent>((event) {
          // Agent's audio track removed
        })
        ..on<RoomReconnectingEvent>((event) {
          if (mounted) {
            setState(() {
              _isReconnecting = true;
              _reconnectAttempt = 0;
            });
          }
        })
        ..on<RoomAttemptReconnectEvent>((event) {
          if (mounted) {
            setState(() {
              _reconnectAttempt = event.attempt;
            });
          }
        })
        ..on<RoomReconnectedEvent>((event) {
          if (mounted) {
            setState(() {
              _isReconnecting = false;
              _reconnectAttempt = 0;
            });
          }
        })
        ..on<RoomDisconnectedEvent>((event) {
          if (_phase != _CallPhase.ended && !_userEndedCall) {
            _onCallEnded();
          }
        });
-      // 4. Connect to LiveKit room
+      // 4. Connect to LiveKit room (with timeout)
      await _room!.connect(
        livekitUrl,
        token,
        connectOptions: const ConnectOptions(
          timeouts: Timeouts(
            connection: Duration(seconds: 15),
            peerConnection: Duration(seconds: 15),
            iceRestart: Duration(seconds: 15),
            publish: Duration(seconds: 15),
          ),
        ),
        roomOptions: const RoomOptions(
          adaptiveStream: true,
          dynacast: true,
@ -284,6 +319,28 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
                  ),
                ),
              const SizedBox(height: 24),
              if (_phase == _CallPhase.active && _isReconnecting)
                Padding(
                  padding: const EdgeInsets.only(bottom: 12),
                  child: Row(
                    mainAxisAlignment: MainAxisAlignment.center,
                    children: [
                      const SizedBox(
                        width: 14,
                        height: 14,
                        child: CircularProgressIndicator(strokeWidth: 2),
                      ),
                      const SizedBox(width: 8),
                      Text(
                        '网络重连中${_reconnectAttempt > 0 ? ' ($_reconnectAttempt)' : ''}...',
                        style: TextStyle(
                          color: AppColors.warning,
                          fontSize: 13,
                        ),
                      ),
                    ],
                  ),
                ),
              if (_phase == _CallPhase.active) _buildWaveform(),
              const Spacer(flex: 3),
              _buildControls(),
@ -344,6 +401,9 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
    if (s.contains('SocketException') || s.contains('Connection refused')) {
      return '无法连接到服务器';
    }
    if (s.contains('TimeoutException') || s.contains('timed out')) {
      return '连接超时，请检查网络';
    }
    if (s.length > 80) return '${s.substring(0, 80)}...';
    return s;
  }
--- a/packages/services/voice-agent/src/agent.py
+++ b/packages/services/voice-agent/src/agent.py
@ -8,6 +8,7 @@ Usage:
    python -m src.agent start
 """
 import asyncio
 import json
 import logging
 import ssl
@ -167,9 +168,32 @@ server.setup_fnc = prewarm
@server.rtc_session(agent_name="voice-agent")
 async def entrypoint(ctx: JobContext) -> None:
-    """Main entrypoint — called for each voice session."""
+    """Main entrypoint — called for each voice session.
    NOTE: session.start() returns immediately while the session continues
    running in the background.  Resources (httpx clients) must stay alive
    for the session's lifetime and are cleaned up via the room disconnect
    listener, NOT in a finally block.
    """
    logger.info("New voice session: room=%s", ctx.room.name)
    # httpx clients to close when the room disconnects
    _http_clients: list = []
    async def _on_room_disconnect() -> None:
        """Close every httpx client registered in ``_http_clients``.

        Cleanup is best-effort: a failure to close one client is logged and
        does not prevent the remaining clients from being closed.
        """
        closed = 0
        for client in _http_clients:
            try:
                await client.aclose()
                closed += 1
            except Exception as exc:
                # Keep going so one bad client cannot leak the others, but
                # leave a trace instead of swallowing the failure silently.
                logger.warning(
                    "Failed to close httpx client for room %s: %s: %s",
                    ctx.room.name, type(exc).__name__, exc,
                )
        # Report the number actually closed, not merely the number registered.
        logger.info("Cleaned up %d httpx client(s) for room %s",
                    closed, ctx.room.name)
    # Register cleanup before anything else so it fires even on errors
    ctx.room.on("disconnected", lambda *_: asyncio.ensure_future(_on_room_disconnect()))
    try:
        # Extract auth header from job metadata
        # The token endpoint embeds {"auth_header": "Bearer ..."} via RoomAgentDispatch metadata,
        # which LiveKit passes through as job.metadata to the agent worker.
@ -196,6 +220,7 @@ async def entrypoint(ctx: JobContext) -> None:
            # OPENAI_BASE_URL may use a self-signed certificate (e.g. proxy)
            _http_client = _httpx.AsyncClient(verify=False)
            _http_clients.append(_http_client)
            _oai_client = _openai.AsyncOpenAI(http_client=_http_client)
            stt = openai_plugin.STT(
@ -225,6 +250,7 @@ async def entrypoint(ctx: JobContext) -> None:
            import openai as _openai
            _http_client_tts = _httpx.AsyncClient(verify=False)
            _http_clients.append(_http_client_tts)
            _oai_client_tts = _openai.AsyncOpenAI(http_client=_http_client_tts)
            default_instructions = "用自然、友好的中文语气说话，语速稍快，简洁干练，像专业助手一样。"
@ -263,6 +289,12 @@ async def entrypoint(ctx: JobContext) -> None:
        logger.info("Voice session started for room %s", ctx.room.name)
    except Exception as exc:
        logger.error(
            "Voice session failed for room %s: %s: %s",
            ctx.room.name, type(exc).__name__, exc, exc_info=True,
        )
 if __name__ == "__main__":
    cli.run_app(server)
--- a/packages/services/voice-agent/src/plugins/agent_llm.py
+++ b/packages/services/voice-agent/src/plugins/agent_llm.py
@ -89,6 +89,10 @@ class AgentServiceLLMStream(llm.LLMStream):
        )
        self._llm_instance = llm_instance
    # Retry configuration
    _MAX_RETRIES = 2
    _RETRY_DELAYS = [1.0, 3.0]  # seconds between retries
    async def _run(self) -> None:
        # Extract the latest user message from ChatContext
        # items can contain ChatMessage and AgentConfigUpdate; filter by type
@ -114,6 +118,65 @@ class AgentServiceLLMStream(llm.LLMStream):
            logger.warning("No user message found in chat context")
            return
        request_id = f"agent-{uuid.uuid4().hex[:12]}"
        last_error: Exception | None = None
        for attempt in range(self._MAX_RETRIES + 1):
            try:
                if attempt > 0:
                    delay = self._RETRY_DELAYS[min(attempt - 1, len(self._RETRY_DELAYS) - 1)]
                    logger.info("Retry %d/%d after %.1fs", attempt, self._MAX_RETRIES, delay)
                    await asyncio.sleep(delay)
                await self._do_stream(user_text, request_id)
                return  # success
            except (httpx.ConnectError, httpx.ConnectTimeout, OSError) as exc:
                # Network-level errors — retryable
                last_error = exc
                logger.warning(
                    "Agent stream attempt %d failed (network): %s: %s",
                    attempt + 1, type(exc).__name__, exc,
                )
            except websockets.exceptions.InvalidStatusCode as exc:
                last_error = exc
                logger.warning(
                    "Agent WS connect attempt %d failed: status %s",
                    attempt + 1, getattr(exc, "status_code", "?"),
                )
            except Exception as exc:
                # Non-retryable errors — fail immediately
                logger.error("Agent stream error: %s: %s", type(exc).__name__, exc)
                self._event_ch.send_nowait(
                    llm.ChatChunk(
                        id=request_id,
                        delta=llm.ChoiceDelta(
                            role="assistant",
                            content="抱歉，Agent服务暂时不可用。",
                        ),
                    )
                )
                return
        # All retries exhausted
        logger.error(
            "Agent stream failed after %d attempts: %s",
            self._MAX_RETRIES + 1, last_error,
        )
        self._event_ch.send_nowait(
            llm.ChatChunk(
                id=request_id,
                delta=llm.ChoiceDelta(
                    role="assistant",
                    content="抱歉，Agent服务暂时不可用，请稍后再试。",
                ),
            )
        )
    async def _do_stream(self, user_text: str, request_id: str) -> None:
        """Execute a single WS+HTTP streaming attempt."""
        import time
        agent_url = self._llm_instance._agent_service_url
        auth_header = self._llm_instance._auth_header
@ -124,12 +187,16 @@ class AgentServiceLLMStream(llm.LLMStream):
        ws_url = agent_url.replace("http://", "ws://").replace("https://", "wss://")
        ws_url = f"{ws_url}/ws/agent"
        request_id = f"agent-{uuid.uuid4().hex[:12]}"
        timeout_secs = 120
        try:
        logger.info("Connecting to agent-service WS: %s", ws_url)
-            async with websockets.connect(ws_url) as ws:
+        async with websockets.connect(
            ws_url,
            open_timeout=10,
            close_timeout=5,
            ping_interval=20,
            ping_timeout=10,
        ) as ws:
            # 1. Pre-subscribe with existing session ID (for event buffering)
            if self._llm_instance._agent_session_id:
                await ws.send(json.dumps({
@ -137,7 +204,7 @@ class AgentServiceLLMStream(llm.LLMStream):
                    "data": {"sessionId": self._llm_instance._agent_session_id},
                }))
-                # 2. Create agent task
+            # 2. Create agent task (with timeout)
            body: dict[str, Any] = {
                "prompt": user_text,
                "engineType": "claude_api",
@ -146,7 +213,9 @@ class AgentServiceLLMStream(llm.LLMStream):
                body["sessionId"] = self._llm_instance._agent_session_id
            logger.info("POST /tasks prompt=%s", user_text[:80])
-                async with httpx.AsyncClient(timeout=30) as client:
+            async with httpx.AsyncClient(
                timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),
            ) as client:
                resp = await client.post(
                    f"{agent_url}/api/v1/agent/tasks",
                    json=body,
@ -192,21 +261,20 @@ class AgentServiceLLMStream(llm.LLMStream):
            )
            # 5. Stream events → ChatChunk
                import time
            deadline = time.time() + timeout_secs
            while time.time() < deadline:
                remaining = deadline - time.time()
                try:
                    raw = await asyncio.wait_for(
-                            ws.recv(), timeout=min(5.0, remaining)
+                        ws.recv(), timeout=min(30.0, remaining)
                    )
                except asyncio.TimeoutError:
                    if time.time() >= deadline:
                        logger.warning("Agent stream timeout after %ds", timeout_secs)
                    continue
                except websockets.exceptions.ConnectionClosed:
-                        logger.warning("Agent WS connection closed")
+                    logger.warning("Agent WS connection closed during streaming")
                    break
                try:
@ -231,11 +299,6 @@ class AgentServiceLLMStream(llm.LLMStream):
                            )
                    elif evt_type == "completed":
                            # If no text was streamed, use the summary
                            summary = evt_data.get("summary", "")
                            if summary:
                                # Check if we already sent text chunks
                                pass  # LiveKit pipeline handles this
                        logger.info("Agent stream completed")
                        return
@ -251,15 +314,3 @@ class AgentServiceLLMStream(llm.LLMStream):
                            )
                        )
                        return
        except Exception as exc:
            logger.error("Agent stream error: %s: %s", type(exc).__name__, exc)
            self._event_ch.send_nowait(
                llm.ChatChunk(
                    id=request_id,
                    delta=llm.ChoiceDelta(
                        role="assistant",
                        content="抱歉，Agent服务暂时不可用。",
                    ),
                )
            )