fix: improve voice pipeline robustness for poor network conditions

Flutter (agent_call_page.dart):
- Add ConnectOptions with 15s timeouts for connection/peerConnection/iceRestart
- Add RoomReconnectingEvent/RoomAttemptReconnectEvent/RoomReconnectedEvent
  listeners with "网络重连中" UI indicator during reconnection
- Add TimeoutException detection in _friendlyError()

voice-agent (agent.py):
- Wrap entrypoint() in try-except with full traceback logging
- Register room disconnect listener to close httpx clients (instead of
  finally block, since session.start() returns while session runs in bg)
- Add asyncio import for ensure_future cleanup

voice-agent LLM proxy (agent_llm.py):
- Add retry with exponential backoff (max 2 retries, 1s/3s delays) for
  network errors (ConnectError/ConnectTimeout/OSError) and WS InvalidStatusCode
- Extract _do_stream() method for single-attempt logic
- Add WebSocket connection params: open_timeout=10, ping_interval=20,
  ping_timeout=10 for keepalive and faster dead-connection detection
- Use granular httpx.Timeout(connect=10, read=30, write=10, pool=10)
- Increase WS recv timeout from 5s to 30s to reduce unnecessary loops

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-01 23:34:55 -08:00
parent 32922c6819
commit e66c187353
3 changed files with 351 additions and 208 deletions

View File

@ -42,6 +42,10 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
bool _isMuted = false;
bool _isSpeaker = true;
// Reconnection state
bool _isReconnecting = false;
int _reconnectAttempt = 0;
// Prevent double-actions
bool _userEndedCall = false;
@ -95,16 +99,47 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
..on<TrackUnsubscribedEvent>((event) {
// Agent's audio track removed
})
..on<RoomReconnectingEvent>((event) {
if (mounted) {
setState(() {
_isReconnecting = true;
_reconnectAttempt = 0;
});
}
})
..on<RoomAttemptReconnectEvent>((event) {
if (mounted) {
setState(() {
_reconnectAttempt = event.attempt;
});
}
})
..on<RoomReconnectedEvent>((event) {
if (mounted) {
setState(() {
_isReconnecting = false;
_reconnectAttempt = 0;
});
}
})
..on<RoomDisconnectedEvent>((event) {
if (_phase != _CallPhase.ended && !_userEndedCall) {
_onCallEnded();
}
});
// 4. Connect to LiveKit room
// 4. Connect to LiveKit room (with timeout)
await _room!.connect(
livekitUrl,
token,
connectOptions: const ConnectOptions(
timeouts: Timeouts(
connection: Duration(seconds: 15),
peerConnection: Duration(seconds: 15),
iceRestart: Duration(seconds: 15),
publish: Duration(seconds: 15),
),
),
roomOptions: const RoomOptions(
adaptiveStream: true,
dynacast: true,
@ -284,6 +319,28 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
),
),
const SizedBox(height: 24),
if (_phase == _CallPhase.active && _isReconnecting)
Padding(
padding: const EdgeInsets.only(bottom: 12),
child: Row(
mainAxisAlignment: MainAxisAlignment.center,
children: [
const SizedBox(
width: 14,
height: 14,
child: CircularProgressIndicator(strokeWidth: 2),
),
const SizedBox(width: 8),
Text(
'网络重连中${_reconnectAttempt > 0 ? ' ($_reconnectAttempt)' : ''}...',
style: TextStyle(
color: AppColors.warning,
fontSize: 13,
),
),
],
),
),
if (_phase == _CallPhase.active) _buildWaveform(),
const Spacer(flex: 3),
_buildControls(),
@ -344,6 +401,9 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
if (s.contains('SocketException') || s.contains('Connection refused')) {
return '无法连接到服务器';
}
if (s.contains('TimeoutException') || s.contains('timed out')) {
return '连接超时,请检查网络';
}
if (s.length > 80) return '${s.substring(0, 80)}...';
return s;
}

View File

@ -8,6 +8,7 @@ Usage:
python -m src.agent start
"""
import asyncio
import json
import logging
import ssl
@ -167,9 +168,32 @@ server.setup_fnc = prewarm
@server.rtc_session(agent_name="voice-agent")
async def entrypoint(ctx: JobContext) -> None:
"""Main entrypoint — called for each voice session."""
"""Main entrypoint — called for each voice session.
NOTE: session.start() returns immediately while the session continues
running in the background. Resources (httpx clients) must stay alive
for the session's lifetime and are cleaned up via the room disconnect
listener, NOT in a finally block.
"""
logger.info("New voice session: room=%s", ctx.room.name)
# httpx clients to close when the room disconnects
_http_clients: list = []
async def _on_room_disconnect() -> None:
"""Best-effort cleanup of httpx clients when the room disconnects.

Closes every AsyncClient accumulated in the enclosing entrypoint's
_http_clients list. Invoked from the room "disconnected" listener
(scheduled via asyncio.ensure_future), so it must never raise.
"""
for client in _http_clients:
try:
await client.aclose()
except Exception:
# Swallow close errors deliberately: one failed close must not
# prevent the remaining clients from being released.
pass
logger.info("Cleaned up %d httpx client(s) for room %s",
len(_http_clients), ctx.room.name)
# Register cleanup before anything else so it fires even on errors
ctx.room.on("disconnected", lambda *_: asyncio.ensure_future(_on_room_disconnect()))
try:
# Extract auth header from job metadata
# The token endpoint embeds {"auth_header": "Bearer ..."} via RoomAgentDispatch metadata,
# which LiveKit passes through as job.metadata to the agent worker.
@ -196,6 +220,7 @@ async def entrypoint(ctx: JobContext) -> None:
# OPENAI_BASE_URL may use a self-signed certificate (e.g. proxy)
_http_client = _httpx.AsyncClient(verify=False)
_http_clients.append(_http_client)
_oai_client = _openai.AsyncOpenAI(http_client=_http_client)
stt = openai_plugin.STT(
@ -225,6 +250,7 @@ async def entrypoint(ctx: JobContext) -> None:
import openai as _openai
_http_client_tts = _httpx.AsyncClient(verify=False)
_http_clients.append(_http_client_tts)
_oai_client_tts = _openai.AsyncOpenAI(http_client=_http_client_tts)
default_instructions = "用自然、友好的中文语气说话,语速稍快,简洁干练,像专业助手一样。"
@ -263,6 +289,12 @@ async def entrypoint(ctx: JobContext) -> None:
logger.info("Voice session started for room %s", ctx.room.name)
except Exception as exc:
logger.error(
"Voice session failed for room %s: %s: %s",
ctx.room.name, type(exc).__name__, exc, exc_info=True,
)
if __name__ == "__main__":
cli.run_app(server)

View File

@ -89,6 +89,10 @@ class AgentServiceLLMStream(llm.LLMStream):
)
self._llm_instance = llm_instance
# Retry configuration
_MAX_RETRIES = 2
_RETRY_DELAYS = [1.0, 3.0] # seconds between retries
async def _run(self) -> None:
# Extract the latest user message from ChatContext
# items can contain ChatMessage and AgentConfigUpdate; filter by type
@ -114,6 +118,65 @@ class AgentServiceLLMStream(llm.LLMStream):
logger.warning("No user message found in chat context")
return
request_id = f"agent-{uuid.uuid4().hex[:12]}"
last_error: Exception | None = None
for attempt in range(self._MAX_RETRIES + 1):
try:
if attempt > 0:
delay = self._RETRY_DELAYS[min(attempt - 1, len(self._RETRY_DELAYS) - 1)]
logger.info("Retry %d/%d after %.1fs", attempt, self._MAX_RETRIES, delay)
await asyncio.sleep(delay)
await self._do_stream(user_text, request_id)
return # success
except (httpx.ConnectError, httpx.ConnectTimeout, OSError) as exc:
# Network-level errors — retryable
last_error = exc
logger.warning(
"Agent stream attempt %d failed (network): %s: %s",
attempt + 1, type(exc).__name__, exc,
)
except websockets.exceptions.InvalidStatusCode as exc:
last_error = exc
logger.warning(
"Agent WS connect attempt %d failed: status %s",
attempt + 1, getattr(exc, "status_code", "?"),
)
except Exception as exc:
# Non-retryable errors — fail immediately
logger.error("Agent stream error: %s: %s", type(exc).__name__, exc)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(
role="assistant",
content="抱歉Agent服务暂时不可用。",
),
)
)
return
# All retries exhausted
logger.error(
"Agent stream failed after %d attempts: %s",
self._MAX_RETRIES + 1, last_error,
)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(
role="assistant",
content="抱歉Agent服务暂时不可用请稍后再试。",
),
)
)
async def _do_stream(self, user_text: str, request_id: str) -> None:
"""Execute a single WS+HTTP streaming attempt."""
import time
agent_url = self._llm_instance._agent_service_url
auth_header = self._llm_instance._auth_header
@ -124,12 +187,16 @@ class AgentServiceLLMStream(llm.LLMStream):
ws_url = agent_url.replace("http://", "ws://").replace("https://", "wss://")
ws_url = f"{ws_url}/ws/agent"
request_id = f"agent-{uuid.uuid4().hex[:12]}"
timeout_secs = 120
try:
logger.info("Connecting to agent-service WS: %s", ws_url)
async with websockets.connect(ws_url) as ws:
async with websockets.connect(
ws_url,
open_timeout=10,
close_timeout=5,
ping_interval=20,
ping_timeout=10,
) as ws:
# 1. Pre-subscribe with existing session ID (for event buffering)
if self._llm_instance._agent_session_id:
await ws.send(json.dumps({
@ -137,7 +204,7 @@ class AgentServiceLLMStream(llm.LLMStream):
"data": {"sessionId": self._llm_instance._agent_session_id},
}))
# 2. Create agent task
# 2. Create agent task (with timeout)
body: dict[str, Any] = {
"prompt": user_text,
"engineType": "claude_api",
@ -146,7 +213,9 @@ class AgentServiceLLMStream(llm.LLMStream):
body["sessionId"] = self._llm_instance._agent_session_id
logger.info("POST /tasks prompt=%s", user_text[:80])
async with httpx.AsyncClient(timeout=30) as client:
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),
) as client:
resp = await client.post(
f"{agent_url}/api/v1/agent/tasks",
json=body,
@ -192,21 +261,20 @@ class AgentServiceLLMStream(llm.LLMStream):
)
# 5. Stream events → ChatChunk
import time
deadline = time.time() + timeout_secs
while time.time() < deadline:
remaining = deadline - time.time()
try:
raw = await asyncio.wait_for(
ws.recv(), timeout=min(5.0, remaining)
ws.recv(), timeout=min(30.0, remaining)
)
except asyncio.TimeoutError:
if time.time() >= deadline:
logger.warning("Agent stream timeout after %ds", timeout_secs)
continue
except websockets.exceptions.ConnectionClosed:
logger.warning("Agent WS connection closed")
logger.warning("Agent WS connection closed during streaming")
break
try:
@ -231,11 +299,6 @@ class AgentServiceLLMStream(llm.LLMStream):
)
elif evt_type == "completed":
# If no text was streamed, use the summary
summary = evt_data.get("summary", "")
if summary:
# Check if we already sent text chunks
pass # LiveKit pipeline handles this
logger.info("Agent stream completed")
return
@ -251,15 +314,3 @@ class AgentServiceLLMStream(llm.LLMStream):
)
)
return
except Exception as exc:
logger.error("Agent stream error: %s: %s", type(exc).__name__, exc)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(
role="assistant",
content="抱歉Agent服务暂时不可用。",
),
)
)