fix: improve voice pipeline robustness for poor network conditions

Flutter (agent_call_page.dart):
- Add ConnectOptions with 15s timeouts for connection/peerConnection/iceRestart
- Add RoomReconnectingEvent/RoomAttemptReconnectEvent/RoomReconnectedEvent
  listeners with "网络重连中" UI indicator during reconnection
- Add TimeoutException detection in _friendlyError()

voice-agent (agent.py):
- Wrap entrypoint() in try-except with full traceback logging
- Register room disconnect listener to close httpx clients (instead of
  finally block, since session.start() returns while session runs in bg)
- Add asyncio import for ensure_future cleanup

voice-agent LLM proxy (agent_llm.py):
- Add retry with exponential backoff (max 2 retries, 1s/3s delays) for
  network errors (ConnectError/ConnectTimeout/OSError) and WS InvalidStatusCode
- Extract _do_stream() method for single-attempt logic
- Add WebSocket connection params: open_timeout=10, ping_interval=20,
  ping_timeout=10 for keepalive and faster dead-connection detection
- Use granular httpx.Timeout(connect=10, read=30, write=10, pool=10)
- Increase WS recv timeout from 5s to 30s to reduce unnecessary loops

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-01 23:34:55 -08:00
parent 32922c6819
commit e66c187353
3 changed files with 351 additions and 208 deletions

View File

@ -42,6 +42,10 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
bool _isMuted = false;
bool _isSpeaker = true;
// Reconnection state
bool _isReconnecting = false;
int _reconnectAttempt = 0;
// Prevent double-actions
bool _userEndedCall = false;
@ -95,16 +99,47 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
..on<TrackUnsubscribedEvent>((event) {
// Agent's audio track removed
})
..on<RoomReconnectingEvent>((event) {
if (mounted) {
setState(() {
_isReconnecting = true;
_reconnectAttempt = 0;
});
}
})
..on<RoomAttemptReconnectEvent>((event) {
if (mounted) {
setState(() {
_reconnectAttempt = event.attempt;
});
}
})
..on<RoomReconnectedEvent>((event) {
if (mounted) {
setState(() {
_isReconnecting = false;
_reconnectAttempt = 0;
});
}
})
..on<RoomDisconnectedEvent>((event) {
if (_phase != _CallPhase.ended && !_userEndedCall) {
_onCallEnded();
}
});
// 4. Connect to LiveKit room
// 4. Connect to LiveKit room (with timeout)
await _room!.connect(
livekitUrl,
token,
connectOptions: const ConnectOptions(
timeouts: Timeouts(
connection: Duration(seconds: 15),
peerConnection: Duration(seconds: 15),
iceRestart: Duration(seconds: 15),
publish: Duration(seconds: 15),
),
),
roomOptions: const RoomOptions(
adaptiveStream: true,
dynacast: true,
@ -284,6 +319,28 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
),
),
const SizedBox(height: 24),
if (_phase == _CallPhase.active && _isReconnecting)
Padding(
padding: const EdgeInsets.only(bottom: 12),
child: Row(
mainAxisAlignment: MainAxisAlignment.center,
children: [
const SizedBox(
width: 14,
height: 14,
child: CircularProgressIndicator(strokeWidth: 2),
),
const SizedBox(width: 8),
Text(
'网络重连中${_reconnectAttempt > 0 ? ' ($_reconnectAttempt)' : ''}...',
style: TextStyle(
color: AppColors.warning,
fontSize: 13,
),
),
],
),
),
if (_phase == _CallPhase.active) _buildWaveform(),
const Spacer(flex: 3),
_buildControls(),
@ -344,6 +401,9 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
if (s.contains('SocketException') || s.contains('Connection refused')) {
return '无法连接到服务器';
}
if (s.contains('TimeoutException') || s.contains('timed out')) {
return '连接超时,请检查网络';
}
if (s.length > 80) return '${s.substring(0, 80)}...';
return s;
}

View File

@ -8,6 +8,7 @@ Usage:
python -m src.agent start
"""
import asyncio
import json
import logging
import ssl
@ -167,9 +168,32 @@ server.setup_fnc = prewarm
@server.rtc_session(agent_name="voice-agent")
async def entrypoint(ctx: JobContext) -> None:
"""Main entrypoint — called for each voice session."""
"""Main entrypoint — called for each voice session.
NOTE: session.start() returns immediately while the session continues
running in the background. Resources (httpx clients) must stay alive
for the session's lifetime and are cleaned up via the room disconnect
listener, NOT in a finally block.
"""
logger.info("New voice session: room=%s", ctx.room.name)
# httpx clients to close when the room disconnects
_http_clients: list = []
async def _on_room_disconnect() -> None:
"""Best-effort cleanup of httpx clients when the room disconnects.

Closes every AsyncClient accumulated in the enclosing entrypoint's
_http_clients list. Invoked from the room "disconnected" listener
(scheduled via asyncio.ensure_future), so it must never raise.
"""
for client in _http_clients:
try:
await client.aclose()
except Exception:
# Swallow close errors deliberately: one failed close must not
# prevent the remaining clients from being released.
pass
logger.info("Cleaned up %d httpx client(s) for room %s",
len(_http_clients), ctx.room.name)
# Register cleanup before anything else so it fires even on errors
ctx.room.on("disconnected", lambda *_: asyncio.ensure_future(_on_room_disconnect()))
try:
# Extract auth header from job metadata
# The token endpoint embeds {"auth_header": "Bearer ..."} via RoomAgentDispatch metadata,
# which LiveKit passes through as job.metadata to the agent worker.
@ -196,6 +220,7 @@ async def entrypoint(ctx: JobContext) -> None:
# OPENAI_BASE_URL may use a self-signed certificate (e.g. proxy)
_http_client = _httpx.AsyncClient(verify=False)
_http_clients.append(_http_client)
_oai_client = _openai.AsyncOpenAI(http_client=_http_client)
stt = openai_plugin.STT(
@ -225,6 +250,7 @@ async def entrypoint(ctx: JobContext) -> None:
import openai as _openai
_http_client_tts = _httpx.AsyncClient(verify=False)
_http_clients.append(_http_client_tts)
_oai_client_tts = _openai.AsyncOpenAI(http_client=_http_client_tts)
default_instructions = "用自然、友好的中文语气说话,语速稍快,简洁干练,像专业助手一样。"
@ -263,6 +289,12 @@ async def entrypoint(ctx: JobContext) -> None:
logger.info("Voice session started for room %s", ctx.room.name)
except Exception as exc:
logger.error(
"Voice session failed for room %s: %s: %s",
ctx.room.name, type(exc).__name__, exc, exc_info=True,
)
if __name__ == "__main__":
cli.run_app(server)

View File

@ -89,6 +89,10 @@ class AgentServiceLLMStream(llm.LLMStream):
)
self._llm_instance = llm_instance
# Retry configuration
_MAX_RETRIES = 2
_RETRY_DELAYS = [1.0, 3.0] # seconds between retries
async def _run(self) -> None:
# Extract the latest user message from ChatContext
# items can contain ChatMessage and AgentConfigUpdate; filter by type
@ -114,6 +118,65 @@ class AgentServiceLLMStream(llm.LLMStream):
logger.warning("No user message found in chat context")
return
request_id = f"agent-{uuid.uuid4().hex[:12]}"
last_error: Exception | None = None
for attempt in range(self._MAX_RETRIES + 1):
try:
if attempt > 0:
delay = self._RETRY_DELAYS[min(attempt - 1, len(self._RETRY_DELAYS) - 1)]
logger.info("Retry %d/%d after %.1fs", attempt, self._MAX_RETRIES, delay)
await asyncio.sleep(delay)
await self._do_stream(user_text, request_id)
return # success
except (httpx.ConnectError, httpx.ConnectTimeout, OSError) as exc:
# Network-level errors — retryable
last_error = exc
logger.warning(
"Agent stream attempt %d failed (network): %s: %s",
attempt + 1, type(exc).__name__, exc,
)
except websockets.exceptions.InvalidStatusCode as exc:
last_error = exc
logger.warning(
"Agent WS connect attempt %d failed: status %s",
attempt + 1, getattr(exc, "status_code", "?"),
)
except Exception as exc:
# Non-retryable errors — fail immediately
logger.error("Agent stream error: %s: %s", type(exc).__name__, exc)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(
role="assistant",
content="抱歉Agent服务暂时不可用。",
),
)
)
return
# All retries exhausted
logger.error(
"Agent stream failed after %d attempts: %s",
self._MAX_RETRIES + 1, last_error,
)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(
role="assistant",
content="抱歉Agent服务暂时不可用请稍后再试。",
),
)
)
async def _do_stream(self, user_text: str, request_id: str) -> None:
"""Execute a single WS+HTTP streaming attempt."""
import time
agent_url = self._llm_instance._agent_service_url
auth_header = self._llm_instance._auth_header
@ -124,12 +187,16 @@ class AgentServiceLLMStream(llm.LLMStream):
ws_url = agent_url.replace("http://", "ws://").replace("https://", "wss://")
ws_url = f"{ws_url}/ws/agent"
request_id = f"agent-{uuid.uuid4().hex[:12]}"
timeout_secs = 120
try:
logger.info("Connecting to agent-service WS: %s", ws_url)
async with websockets.connect(ws_url) as ws:
async with websockets.connect(
ws_url,
open_timeout=10,
close_timeout=5,
ping_interval=20,
ping_timeout=10,
) as ws:
# 1. Pre-subscribe with existing session ID (for event buffering)
if self._llm_instance._agent_session_id:
await ws.send(json.dumps({
@ -137,7 +204,7 @@ class AgentServiceLLMStream(llm.LLMStream):
"data": {"sessionId": self._llm_instance._agent_session_id},
}))
# 2. Create agent task
# 2. Create agent task (with timeout)
body: dict[str, Any] = {
"prompt": user_text,
"engineType": "claude_api",
@ -146,7 +213,9 @@ class AgentServiceLLMStream(llm.LLMStream):
body["sessionId"] = self._llm_instance._agent_session_id
logger.info("POST /tasks prompt=%s", user_text[:80])
async with httpx.AsyncClient(timeout=30) as client:
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),
) as client:
resp = await client.post(
f"{agent_url}/api/v1/agent/tasks",
json=body,
@ -192,21 +261,20 @@ class AgentServiceLLMStream(llm.LLMStream):
)
# 5. Stream events → ChatChunk
import time
deadline = time.time() + timeout_secs
while time.time() < deadline:
remaining = deadline - time.time()
try:
raw = await asyncio.wait_for(
ws.recv(), timeout=min(5.0, remaining)
ws.recv(), timeout=min(30.0, remaining)
)
except asyncio.TimeoutError:
if time.time() >= deadline:
logger.warning("Agent stream timeout after %ds", timeout_secs)
continue
except websockets.exceptions.ConnectionClosed:
logger.warning("Agent WS connection closed")
logger.warning("Agent WS connection closed during streaming")
break
try:
@ -231,11 +299,6 @@ class AgentServiceLLMStream(llm.LLMStream):
)
elif evt_type == "completed":
# If no text was streamed, use the summary
summary = evt_data.get("summary", "")
if summary:
# Check if we already sent text chunks
pass # LiveKit pipeline handles this
logger.info("Agent stream completed")
return
@ -251,15 +314,3 @@ class AgentServiceLLMStream(llm.LLMStream):
)
)
return
except Exception as exc:
logger.error("Agent stream error: %s: %s", type(exc).__name__, exc)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(
role="assistant",
content="抱歉Agent服务暂时不可用。",
),
)
)