feat(voice): long-lived agent session with proper hangup termination

Replace the per-turn POST /tasks approach for voice calls with a
long-lived agent run loop tied to the call lifecycle:

agent-service:
- Add AsyncQueue<T> utility for blocking message relay
- Add VoiceSessionManager: spawns one background run loop per voice call,
  accepts injected messages, terminates cleanly on hangup
- Add VoiceSessionController with 3 endpoints:
    POST   /api/v1/agent/sessions/voice/start  (call start)
    POST   /api/v1/agent/sessions/:id/voice/inject  (each speech turn)
    DELETE /api/v1/agent/sessions/:id/voice    (user hung up)
- Register VoiceSessionManager + VoiceSessionController in agent.module.ts

voice-agent:
- AgentServiceLLM: add start_voice_session(), terminate_voice_session(),
  inject_text_message() (voice/inject-aware), _do_inject_voice()
- AgentServiceLLMStream._run(): use voice/inject path when voice session
  is active; fall back to per-task POST for text-chat / non-SDK engines
- entrypoint(): call start_voice_session() after session.start();
  register _on_room_disconnect that calls terminate_voice_session()
  so the agent is always killed when the user hangs up

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-04 04:01:02 -08:00
parent 6ca8aab243
commit 635cca18fa
6 changed files with 938 additions and 117 deletions

View File

@ -43,7 +43,9 @@ import { VoiceConfigRepository } from './infrastructure/repositories/voice-confi
import { UsageRecordRepository } from './infrastructure/repositories/usage-record.repository'; import { UsageRecordRepository } from './infrastructure/repositories/usage-record.repository';
import { VoiceConfigService } from './infrastructure/services/voice-config.service'; import { VoiceConfigService } from './infrastructure/services/voice-config.service';
import { VoiceConfigController } from './interfaces/rest/controllers/voice-config.controller'; import { VoiceConfigController } from './interfaces/rest/controllers/voice-config.controller';
import { VoiceSessionController } from './interfaces/rest/controllers/voice-session.controller';
import { ConversationContextService } from './domain/services/conversation-context.service'; import { ConversationContextService } from './domain/services/conversation-context.service';
import { VoiceSessionManager } from './domain/services/voice-session-manager.service';
import { EventPublisherService } from './infrastructure/messaging/event-publisher.service'; import { EventPublisherService } from './infrastructure/messaging/event-publisher.service';
@Module({ @Module({
@ -58,7 +60,8 @@ import { EventPublisherService } from './infrastructure/messaging/event-publishe
], ],
controllers: [ controllers: [
AgentController, SessionController, RiskRulesController, AgentController, SessionController, RiskRulesController,
TenantAgentConfigController, AgentConfigController, VoiceConfigController, SkillsController, HooksController, TenantAgentConfigController, AgentConfigController, VoiceConfigController,
VoiceSessionController, SkillsController, HooksController,
], ],
providers: [ providers: [
AgentStreamGateway, AgentStreamGateway,
@ -72,6 +75,7 @@ import { EventPublisherService } from './infrastructure/messaging/event-publishe
StandingOrderExtractorService, StandingOrderExtractorService,
AllowedToolsResolverService, AllowedToolsResolverService,
ConversationContextService, ConversationContextService,
VoiceSessionManager,
SessionRepository, SessionRepository,
TaskRepository, TaskRepository,
MessageRepository, MessageRepository,

View File

@ -0,0 +1,333 @@
/**
* VoiceSessionManager
*
* Manages long-lived agent run loops for voice calls.
*
* Lifecycle:
* startSession(sessionId) spawn background run loop, ready for messages
* injectMessage(sessionId) enqueue speech turn; loop processes sequentially
* terminateSession(sessionId) send poison-pill + abort; loop exits cleanly
*
* Run-loop design (per voice session):
* while not terminated:
* message queue.dequeue() blocks between speech turns
* executeTask(message, resume) one SDK turn, streams events via gateway
* capture new sdkSessionId for next turn's native resume
*
* This replaces the per-turn "POST /tasks" model used by text chat.
* The SDK session is kept alive across turns via the `resume` option,
* and the run loop is explicitly terminated when the user hangs up.
*/
import { Injectable, Logger } from '@nestjs/common';
import { AsyncQueue } from '../../infrastructure/voice/async-queue';
import { EngineRegistry } from '../../infrastructure/engines/engine-registry';
import { ClaudeAgentSdkEngine } from '../../infrastructure/engines/claude-agent-sdk/claude-agent-sdk-engine';
import { AgentStreamGateway } from '../../interfaces/ws/agent-stream.gateway';
import { SessionRepository } from '../../infrastructure/repositories/session.repository';
import { TaskRepository } from '../../infrastructure/repositories/task.repository';
import { ConversationContextService } from './conversation-context.service';
import { AgentEngineType } from '../value-objects/agent-engine-type.vo';
import { TaskStatus } from '../value-objects/task-status.vo';
import { AgentTask } from '../entities/agent-task.entity';
import { TenantContextService } from '@it0/common';
import * as crypto from 'crypto';
/** Sentinel value enqueued to signal the run loop to exit. */
const TERMINATE: null = null;
/** In-memory handle for a running voice session. */
interface VoiceSessionHandle {
/** Message queue: string = user speech turn; null = terminate signal. */
queue: AsyncQueue<string | null>;
/** Allows aborting the currently-running SDK executeTask call. */
abortController: AbortController;
/** Tenant who owns this voice session. */
tenantId: string;
/** Background run-loop promise (resolved when loop exits). */
runLoop: Promise<void>;
}
@Injectable()
export class VoiceSessionManager {
private readonly logger = new Logger(VoiceSessionManager.name);
/** Map from agentSessionId → handle. */
private readonly sessions = new Map<string, VoiceSessionHandle>();
constructor(
private readonly engineRegistry: EngineRegistry,
private readonly gateway: AgentStreamGateway,
private readonly sessionRepository: SessionRepository,
private readonly taskRepository: TaskRepository,
private readonly contextService: ConversationContextService,
) {}
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
* Start a long-lived voice agent session.
* Safe to call multiple times for the same sessionId (idempotent).
*/
async startSession(sessionId: string, tenantId: string): Promise<void> {
if (this.sessions.has(sessionId)) {
this.logger.warn(`[VoiceSession ${sessionId}] Already active — ignoring duplicate start`);
return;
}
const queue = new AsyncQueue<string | null>();
const abortController = new AbortController();
const handle: VoiceSessionHandle = {
queue,
abortController,
tenantId,
runLoop: this.runLoop(sessionId, tenantId, queue, abortController),
};
this.sessions.set(sessionId, handle);
this.logger.log(`[VoiceSession ${sessionId}] Started for tenant=${tenantId}`);
}
/**
* Inject a new speech-turn message into the running voice session.
* Returns true if injected, false if session not found.
*/
async injectMessage(sessionId: string, message: string): Promise<boolean> {
const handle = this.sessions.get(sessionId);
if (!handle) return false;
handle.queue.enqueue(message);
this.logger.log(`[VoiceSession ${sessionId}] Injected: "${message.slice(0, 80)}"`);
return true;
}
/**
* Terminate the voice session (user hung up or explicitly ended).
* Enqueues the poison-pill, aborts any in-flight SDK call,
* and waits for the run loop to exit.
*/
async terminateSession(sessionId: string): Promise<void> {
const handle = this.sessions.get(sessionId);
if (!handle) {
this.logger.debug(`[VoiceSession ${sessionId}] Already gone — ignoring terminate`);
return;
}
this.logger.log(`[VoiceSession ${sessionId}] Terminating…`);
// Abort any in-flight executeTask call
handle.abortController.abort();
// Poison-pill: unblock the queue.dequeue() if loop is idle between turns
handle.queue.drain(TERMINATE);
handle.queue.enqueue(TERMINATE);
// Wait for loop to exit (max 5 s to avoid hanging the HTTP request)
await Promise.race([
handle.runLoop.catch(() => {}),
new Promise<void>((r) => setTimeout(r, 5_000)),
]);
this.sessions.delete(sessionId);
this.logger.log(`[VoiceSession ${sessionId}] Terminated`);
}
/** Returns true if a run loop is active for this session. */
isActive(sessionId: string): boolean {
return this.sessions.has(sessionId);
}
// ---------------------------------------------------------------------------
// Private: run loop
// ---------------------------------------------------------------------------
/**
* Background run loop one instance per voice session.
* Blocks on queue.dequeue() between speech turns,
* then executes one SDK turn and streams events to the gateway.
*/
private async runLoop(
sessionId: string,
tenantId: string,
queue: AsyncQueue<string | null>,
abortController: AbortController,
): Promise<void> {
// Always get the SDK engine for voice
const engine = this.engineRegistry.switchEngine(AgentEngineType.CLAUDE_AGENT_SDK) as ClaudeAgentSdkEngine;
try {
while (!abortController.signal.aborted) {
// Block until next speech turn (or terminate signal)
const message = await queue.dequeue();
if (message === null) break;
// Execute one turn within the tenant's AsyncLocalStorage context
await TenantContextService.run(
{
tenantId,
tenantName: tenantId,
plan: 'enterprise',
schemaName: `it0_t_${tenantId}`,
maxServers: -1,
maxUsers: -1,
maxStandingOrders: -1,
maxAgentTokensPerMonth: -1,
},
async () => {
await this.executeTurn(engine, sessionId, tenantId, message, abortController);
},
);
}
} catch (err: any) {
if (err?.name !== 'AbortError') {
this.logger.error(`[VoiceSession ${sessionId}] Run loop error: ${err?.message}`);
}
} finally {
this.sessions.delete(sessionId);
this.logger.log(`[VoiceSession ${sessionId}] Run loop exited`);
}
}
/**
* Execute a single speech turn:
* 1. Create a task record
* 2. Save user message to conversation history
* 3. Load context + SDK resume session ID
* 4. Run engine.executeTask() and stream events to gateway
* 5. Save assistant response + capture new SDK session ID for next resume
*/
private async executeTurn(
engine: ClaudeAgentSdkEngine,
sessionId: string,
tenantId: string,
message: string,
abortController: AbortController,
): Promise<void> {
const session = await this.sessionRepository.findById(sessionId);
if (!session) {
this.logger.error(`[VoiceSession ${sessionId}] Session not found in DB — cannot execute turn`);
return;
}
// Create task record for tracking
const task = new AgentTask();
task.id = crypto.randomUUID();
task.tenantId = tenantId;
task.sessionId = sessionId;
task.prompt = message;
task.status = TaskStatus.RUNNING;
task.startedAt = new Date();
task.createdAt = new Date();
await this.taskRepository.save(task);
// Notify WS subscribers about the new task so they can track it
this.gateway.emitStreamEvent(sessionId, { type: 'task_info', taskId: task.id });
// Save user message to conversation history
await this.contextService.saveUserMessage(sessionId, message);
// Load context (max 20 messages) — strip the latest user message since
// the engine adds it as the explicit prompt parameter
const history = await this.contextService.loadContext(sessionId, 20);
const historyForEngine = history.slice(0, -1);
// SDK resume: continue from previous turn if available
const resumeSessionId = (session.metadata as any)?.sdkSessionId as string | undefined;
if (resumeSessionId) {
this.logger.log(`[VoiceSession ${sessionId}] Resuming SDK session: ${resumeSessionId}`);
}
// Voice-mode system prompt: concise oral Chinese, no markdown / tool details
const voiceSystemPrompt =
'你正在通过语音与用户实时对话。请严格遵守以下规则:\n' +
'1. 只输出用户关注的最终答案,不要输出工具调用过程、中间步骤或技术细节\n' +
'2. 用简洁自然的口语中文回答,像面对面对话一样\n' +
'3. 回复要简短精炼适合语音播报通常1-3句话即可\n' +
'4. 不要使用markdown格式、代码块、列表符号等文本格式';
// Events to suppress in voice mode (only text/completed/error reach TTS)
const voiceFilteredTypes = new Set(['thinking', 'tool_use', 'tool_result']);
const textParts: string[] = [];
let finished = false;
try {
const stream = engine.executeTask({
sessionId,
prompt: message,
systemPrompt: voiceSystemPrompt,
allowedTools: [],
maxTurns: 10,
conversationHistory: historyForEngine.length > 0 ? historyForEngine : undefined,
resumeSessionId,
});
for await (const event of stream) {
// Exit early if the voice session was terminated mid-turn
if (abortController.signal.aborted) break;
if (!voiceFilteredTypes.has(event.type)) {
this.gateway.emitStreamEvent(sessionId, event);
}
if (event.type === 'text') {
textParts.push(event.content);
}
if (event.type === 'completed' && !finished) {
finished = true;
task.status = TaskStatus.COMPLETED;
task.result = event.summary;
task.tokensUsed = event.tokensUsed;
task.completedAt = new Date();
await this.taskRepository.save(task);
// Persist assistant response to conversation history
const assistantText = textParts.join('') || event.summary;
if (assistantText) {
await this.contextService.saveAssistantMessage(sessionId, assistantText);
}
// Capture the new SDK session ID so the next turn can resume from here
const sdkSessionId = engine.getSdkSessionId(sessionId);
session.metadata = {
...session.metadata as Record<string, unknown>,
sdkSessionId: sdkSessionId ?? (session.metadata as any)?.sdkSessionId,
};
session.status = 'active';
session.updatedAt = new Date();
await this.sessionRepository.save(session);
}
if (event.type === 'error' && !finished) {
finished = true;
task.status = TaskStatus.FAILED;
task.result = event.message;
task.completedAt = new Date();
await this.taskRepository.save(task);
}
}
} catch (err: any) {
if (!finished && err?.name !== 'AbortError') {
task.status = TaskStatus.FAILED;
task.result = err?.message ?? 'Voice session turn error';
task.completedAt = new Date();
await this.taskRepository.save(task);
this.gateway.emitStreamEvent(sessionId, {
type: 'error',
message: task.result ?? 'Voice session turn error',
code: 'VOICE_TURN_ERROR',
});
}
} finally {
// If aborted mid-turn, save any partial text accumulated before the abort
if (!finished && textParts.length > 0) {
await this.contextService
.saveAssistantMessage(sessionId, textParts.join('') + '\n[中断]')
.catch(() => {});
}
}
}
}

View File

@ -0,0 +1,39 @@
/**
* AsyncQueue simple async message queue with blocking dequeue.
*
* Used by VoiceSessionManager to relay speech-turn messages to the
* long-lived agent run loop. The run loop calls dequeue() which
* suspends until the next message is enqueued (or a sentinel null
* is sent to signal termination).
*/
export class AsyncQueue<T> {
private readonly items: T[] = [];
private readonly waiters: Array<(item: T) => void> = [];
enqueue(item: T): void {
const waiter = this.waiters.shift();
if (waiter) {
waiter(item);
} else {
this.items.push(item);
}
}
dequeue(): Promise<T> {
return new Promise<T>((resolve) => {
if (this.items.length > 0) {
resolve(this.items.shift()!);
} else {
this.waiters.push(resolve);
}
});
}
/** Resolve all pending dequeue waiters with the given sentinel value (e.g. on shutdown). */
drain(sentinel: T): void {
for (const waiter of this.waiters) {
waiter(sentinel);
}
this.waiters.length = 0;
}
}

View File

@ -0,0 +1,148 @@
/**
* VoiceSessionController
*
* HTTP endpoints for the long-lived voice agent session lifecycle:
*
* POST /api/v1/agent/sessions/voice/start called when voice call starts
* POST /api/v1/agent/sessions/:sessionId/voice/inject called per speech turn
* DELETE /api/v1/agent/sessions/:sessionId/voice called when user hangs up
*
* These endpoints replace the per-turn "POST /api/v1/agent/tasks" flow used
* by text chat. The voice-agent calls voice/start once, injects messages as
* the user speaks, and calls voice/terminate on room disconnect.
*/
import {
Controller, Post, Delete, Param, Body,
NotFoundException, BadRequestException, Logger,
} from '@nestjs/common';
import { TenantId } from '@it0/common';
import { VoiceSessionManager } from '../../../domain/services/voice-session-manager.service';
import { SessionRepository } from '../../../infrastructure/repositories/session.repository';
import { AgentEngineType } from '../../../domain/value-objects/agent-engine-type.vo';
import { AgentSession } from '../../../domain/entities/agent-session.entity';
import * as crypto from 'crypto';
@Controller('api/v1/agent/sessions')
export class VoiceSessionController {
private readonly logger = new Logger(VoiceSessionController.name);
constructor(
private readonly voiceSessionManager: VoiceSessionManager,
private readonly sessionRepository: SessionRepository,
) {}
/**
* Start a long-lived voice agent session.
*
* Called ONCE when the user opens a voice call.
* Creates (or reuses) an AgentSession and starts the background run loop.
* Idempotent: safe to call multiple times for the same sessionId.
*/
@Post('voice/start')
async startVoiceSession(
@TenantId() tenantId: string,
@Body() body: { sessionId?: string; systemPrompt?: string },
) {
let session: AgentSession | null = null;
if (body.sessionId) {
session = await this.sessionRepository.findById(body.sessionId);
// Reject cross-tenant access
if (session && session.tenantId !== tenantId) {
session = null;
}
}
if (!session) {
// Create a fresh session pre-marked as voice mode
session = new AgentSession();
session.id = crypto.randomUUID();
session.tenantId = tenantId;
session.engineType = AgentEngineType.CLAUDE_AGENT_SDK;
session.status = 'active';
session.systemPrompt = body.systemPrompt;
session.metadata = { voiceMode: true, title: '', titleSet: true };
session.createdAt = new Date();
session.updatedAt = new Date();
} else {
// Reuse existing session; mark as voice mode
session.metadata = {
...session.metadata as Record<string, unknown>,
voiceMode: true,
};
session.status = 'active';
session.updatedAt = new Date();
}
await this.sessionRepository.save(session);
// Start (or no-op if already running) the long-lived run loop
await this.voiceSessionManager.startSession(session.id, tenantId);
this.logger.log(`Voice session started: ${session.id} tenant=${tenantId}`);
return { sessionId: session.id };
}
/**
* Inject a new speech-turn message into the running voice session.
*
* Called each time STT produces a user utterance.
* The background run loop picks up the message and executes one SDK turn,
* streaming the response back via the WebSocket gateway.
*/
@Post(':sessionId/voice/inject')
async injectVoiceMessage(
@TenantId() tenantId: string,
@Param('sessionId') sessionId: string,
@Body() body: { message: string },
) {
if (!body.message?.trim()) {
throw new BadRequestException('message is required');
}
// Verify session ownership
const session = await this.sessionRepository.findById(sessionId);
if (!session || session.tenantId !== tenantId) {
throw new NotFoundException(`Voice session ${sessionId} not found`);
}
// Auto-start the run loop if the voice-agent reconnected (e.g. container restart)
if (!this.voiceSessionManager.isActive(sessionId)) {
this.logger.warn(`[VoiceSession ${sessionId}] Run loop not active — auto-starting`);
await this.voiceSessionManager.startSession(sessionId, tenantId);
}
const injected = await this.voiceSessionManager.injectMessage(sessionId, body.message);
if (!injected) {
throw new NotFoundException(`Voice session ${sessionId} is not active`);
}
return { sessionId, injected: true };
}
/**
* Terminate the voice session (user hung up).
*
* Sends a poison-pill to the run loop, aborts any in-flight SDK call,
* and marks the AgentSession as completed.
* Idempotent: safe to call even if the session is already gone.
*/
@Delete(':sessionId/voice')
async terminateVoiceSession(
@TenantId() tenantId: string,
@Param('sessionId') sessionId: string,
) {
const session = await this.sessionRepository.findById(sessionId);
if (session && session.tenantId === tenantId) {
await this.voiceSessionManager.terminateSession(sessionId);
session.status = 'completed';
session.updatedAt = new Date();
await this.sessionRepository.save(session);
}
this.logger.log(`Voice session terminated: ${sessionId}`);
return { sessionId, terminated: true };
}
}

View File

@ -200,18 +200,15 @@ async def entrypoint(ctx: JobContext) -> None:
# httpx clients to close when the room disconnects # httpx clients to close when the room disconnects
_http_clients: list = [] _http_clients: list = []
async def _on_room_disconnect() -> None: # _on_room_disconnect is defined AFTER llm is built (it calls terminate_voice_session).
"""Clean up httpx clients when the room disconnects.""" # We register it via a mutable reference cell so the event listener can find it
for client in _http_clients: # even before the function is defined.
try: _cleanup_ref: list = []
await client.aclose()
except Exception:
pass
logger.info("Cleaned up %d httpx client(s) for room %s",
len(_http_clients), ctx.room.name)
# Register cleanup before anything else so it fires even on errors ctx.room.on(
ctx.room.on("disconnected", lambda *_: asyncio.ensure_future(_on_room_disconnect())) "disconnected",
lambda *_: asyncio.ensure_future(_cleanup_ref[0]()) if _cleanup_ref else None,
)
try: try:
# Extract auth header from job metadata # Extract auth header from job metadata
@ -331,6 +328,35 @@ async def entrypoint(ctx: JobContext) -> None:
room_output_options=room_io.RoomOutputOptions(), room_output_options=room_io.RoomOutputOptions(),
) )
# ── Voice session lifecycle ───────────────────────────────────────────
# For Agent SDK engine: start the long-lived voice session in agent-service.
# This spawns a persistent run loop that accepts injected messages for the
# duration of this call, replacing the per-turn POST /tasks approach.
if engine_type == "claude_agent_sdk":
started_session_id = await llm.start_voice_session()
if started_session_id:
logger.info("Long-lived voice session ready: %s", started_session_id)
else:
logger.warning("start_voice_session failed — falling back to per-task mode")
# Register the disconnect handler now that llm is ready.
# This is the ONLY disconnect handler; it terminates the voice session
# (signals the agent to stop) AND closes any httpx clients.
async def _on_room_disconnect() -> None:
logger.info("Room disconnected: %s — terminating voice session", ctx.room.name)
# 1. Terminate the long-lived agent run loop (user hung up)
await llm.terminate_voice_session()
# 2. Close httpx clients
for client in _http_clients:
try:
await client.aclose()
except Exception:
pass
logger.info("Cleaned up %d httpx client(s) for room %s",
len(_http_clients), ctx.room.name)
_cleanup_ref.append(_on_room_disconnect)
# --- Thinking state audio feedback --- # --- Thinking state audio feedback ---
# BackgroundAudioPlayer listens for AgentStateChangedEvent from the # BackgroundAudioPlayer listens for AgentStateChangedEvent from the
# session. When state transitions to "thinking" (STT done, waiting for # session. When state transitions to "thinking" (STT done, waiting for

View File

@ -6,8 +6,22 @@ Instead of calling Claude directly, this plugin:
2. Subscribes to the agent-service WebSocket /ws/agent for streaming text events 2. Subscribes to the agent-service WebSocket /ws/agent for streaming text events
3. Emits ChatChunk objects into the LiveKit pipeline 3. Emits ChatChunk objects into the LiveKit pipeline
In Agent SDK mode, the prompt is wrapped with voice-conversation instructions Voice Session Mode (long-lived agent)
so the agent outputs concise spoken Chinese without tool-call details. --------------------------------------
When engine_type == "claude_agent_sdk", the plugin uses a long-lived voice session:
Voice call starts start_voice_session()
POST /api/v1/agent/sessions/voice/start
agent-service spawns a persistent run loop for this session
User speaks (each turn) AgentServiceLLMStream._run()
POST /api/v1/agent/sessions/{sessionId}/voice/inject
run loop receives the message and executes one SDK turn
streams events back via WebSocket ChatChunks TTS
User hangs up terminate_voice_session()
DELETE /api/v1/agent/sessions/{sessionId}/voice
agent-service kills the run loop and marks session completed
This preserves all agent-service capabilities: Tool Use, conversation history, This preserves all agent-service capabilities: Tool Use, conversation history,
tenant isolation, and session management. tenant isolation, and session management.
@ -48,7 +62,10 @@ class AgentServiceLLM(llm.LLM):
self._auth_header = auth_header self._auth_header = auth_header
self._engine_type = engine_type self._engine_type = engine_type
self._agent_session_id: str | None = None self._agent_session_id: str | None = None
self._injecting = False # guard: don't clear session during inject
# Voice session mode: long-lived agent process tied to the call duration.
# True once start_voice_session() completes successfully.
self._voice_session_started: bool = False
@property @property
def model(self) -> str: def model(self) -> str:
@ -75,79 +92,179 @@ class AgentServiceLLM(llm.LLM):
conn_options=conn_options, conn_options=conn_options,
) )
# -------------------------------------------------------------------------
# Voice session lifecycle
# -------------------------------------------------------------------------
async def start_voice_session(self, session_id: str | None = None) -> str:
"""
Start a long-lived voice agent session in agent-service.
Called ONCE when the LiveKit room is connected. The agent-service
spawns a background run loop that stays alive for the entire call,
accepting injected messages via voice/inject.
Returns the session ID to use for all subsequent inject calls.
"""
headers: dict[str, str] = {"Content-Type": "application/json"}
if self._auth_header:
headers["Authorization"] = self._auth_header
body: dict[str, Any] = {}
if session_id:
body["sessionId"] = session_id
try:
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=15, write=10, pool=10),
) as client:
resp = await client.post(
f"{self._agent_service_url}/api/v1/agent/sessions/voice/start",
json=body,
headers=headers,
)
if resp.status_code not in (200, 201):
logger.error(
"start_voice_session failed: %d %s",
resp.status_code, resp.text[:200],
)
return ""
data = resp.json()
new_session_id: str = data.get("sessionId", "")
self._agent_session_id = new_session_id
self._voice_session_started = True
logger.info("Voice session started: %s", new_session_id)
return new_session_id
except Exception as exc:
logger.error("start_voice_session error: %s: %s", type(exc).__name__, exc)
return ""
async def inject_text_message( async def inject_text_message(
self, self,
*, *,
text: str = "", text: str = "",
attachments: list[dict] | None = None, attachments: list[dict] | None = None,
) -> str: ) -> str:
"""Inject a text message (with optional attachments) into the agent session. """
Inject a text message (sent via data channel, not speech) into the agent.
In voice session mode, uses voice/inject and collects the streamed response.
In text mode, uses POST /tasks directly.
Returns the complete response text for TTS playback via session.say(). Returns the complete response text for TTS playback via session.say().
Uses the same session ID so conversation context is preserved.
""" """
if not text and not attachments: if not text and not attachments:
return "" return ""
self._injecting = True
try: try:
return await self._do_inject(text, attachments) if self._voice_session_started and self._agent_session_id:
return await self._do_inject_voice(text)
else:
return await self._do_inject(text, attachments)
except Exception as exc: except Exception as exc:
logger.error("inject_text_message error: %s: %s", type(exc).__name__, exc) logger.error("inject_text_message error: %s: %s", type(exc).__name__, exc)
return "" return ""
finally:
self._injecting = False async def _do_inject_voice(self, text: str) -> str:
"""Inject via voice/inject endpoint and collect response text from WS."""
import time
agent_url = self._agent_service_url
session_id = self._agent_session_id
headers: dict[str, str] = {"Content-Type": "application/json"}
if self._auth_header:
headers["Authorization"] = self._auth_header
ws_url = agent_url.replace("http://", "ws://").replace("https://", "wss://")
ws_url = f"{ws_url}/ws/agent"
timeout_secs = 120
collected_text = ""
async with websockets.connect(
ws_url,
open_timeout=10,
close_timeout=5,
ping_interval=20,
ping_timeout=10,
) as ws:
await ws.send(json.dumps({
"event": "subscribe_session",
"data": {"sessionId": session_id},
}))
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=15, write=10, pool=10),
) as client:
resp = await client.post(
f"{agent_url}/api/v1/agent/sessions/{session_id}/voice/inject",
json={"message": text},
headers=headers,
)
if resp.status_code not in (200, 201):
logger.error("_do_inject_voice failed: %d", resp.status_code)
return ""
deadline = time.time() + timeout_secs
while time.time() < deadline:
remaining = deadline - time.time()
try:
raw = await asyncio.wait_for(ws.recv(), timeout=min(30.0, remaining))
except asyncio.TimeoutError:
continue
except websockets.exceptions.ConnectionClosed:
break
try:
msg = json.loads(raw)
except (json.JSONDecodeError, TypeError):
continue
if msg.get("event") != "stream_event":
continue
evt_data = msg.get("data", {})
evt_type = evt_data.get("type", "")
if evt_type == "text":
collected_text += evt_data.get("content", "")
elif evt_type in ("completed", "error"):
break
return collected_text
async def _do_inject( async def _do_inject(
self, self,
text: str, text: str,
attachments: list[dict] | None, attachments: list[dict] | None,
) -> str: ) -> str:
"""Execute inject: WS+HTTP stream, collect full response text.""" """Text-mode inject: POST /tasks and collect streamed response text."""
import time import time
agent_url = self._agent_service_url agent_url = self._agent_service_url
auth_header = self._auth_header
headers: dict[str, str] = {"Content-Type": "application/json"} headers: dict[str, str] = {"Content-Type": "application/json"}
if auth_header: if self._auth_header:
headers["Authorization"] = auth_header headers["Authorization"] = self._auth_header
ws_url = agent_url.replace("http://", "ws://").replace("https://", "wss://") ws_url = agent_url.replace("http://", "ws://").replace("https://", "wss://")
ws_url = f"{ws_url}/ws/agent" ws_url = f"{ws_url}/ws/agent"
timeout_secs = 120 timeout_secs = 120
engine_type = self._engine_type engine_type = self._engine_type
voice_mode = engine_type == "claude_agent_sdk"
body: dict[str, Any] = { body: dict[str, Any] = {
"prompt": text if text else "(see attachments)", "prompt": text if text else "(see attachments)",
"engineType": engine_type, "engineType": engine_type,
"voiceMode": voice_mode, "voiceMode": False,
} }
if voice_mode:
body["systemPrompt"] = (
"你正在通过语音与用户实时对话。请严格遵守以下规则:\n"
"1. 只输出用户关注的最终答案,不要输出工具调用过程、中间步骤或技术细节\n"
"2. 用简洁自然的口语中文回答,像面对面对话一样\n"
"3. 回复要简短精炼适合语音播报通常1-3句话即可\n"
"4. 不要使用markdown格式、代码块、列表符号等文本格式"
)
if self._agent_session_id: if self._agent_session_id:
body["sessionId"] = self._agent_session_id body["sessionId"] = self._agent_session_id
if attachments: if attachments:
body["attachments"] = attachments body["attachments"] = attachments
logger.info(
"inject POST /tasks engine=%s text=%s attachments=%d",
engine_type,
text[:80] if text else "(empty)",
len(attachments) if attachments else 0,
)
collected_text = "" collected_text = ""
async with websockets.connect( async with websockets.connect(
@ -157,14 +274,12 @@ class AgentServiceLLM(llm.LLM):
ping_interval=20, ping_interval=20,
ping_timeout=10, ping_timeout=10,
) as ws: ) as ws:
# Pre-subscribe
if self._agent_session_id: if self._agent_session_id:
await ws.send(json.dumps({ await ws.send(json.dumps({
"event": "subscribe_session", "event": "subscribe_session",
"data": {"sessionId": self._agent_session_id}, "data": {"sessionId": self._agent_session_id},
})) }))
# Create task
async with httpx.AsyncClient( async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10), timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),
) as client: ) as client:
@ -173,44 +288,28 @@ class AgentServiceLLM(llm.LLM):
json=body, json=body,
headers=headers, headers=headers,
) )
if resp.status_code not in (200, 201):
logger.error("_do_inject task failed: %d", resp.status_code)
return ""
if resp.status_code not in (200, 201): data = resp.json()
logger.error( session_id = data.get("sessionId", "")
"inject task creation failed: %d %s", task_id = data.get("taskId", "")
resp.status_code, resp.text[:200], self._agent_session_id = session_id
)
return ""
data = resp.json()
session_id = data.get("sessionId", "")
task_id = data.get("taskId", "")
self._agent_session_id = session_id
logger.info(
"inject task created: session=%s, task=%s",
session_id, task_id,
)
# Subscribe with actual IDs
await ws.send(json.dumps({ await ws.send(json.dumps({
"event": "subscribe_session", "event": "subscribe_session",
"data": {"sessionId": session_id, "taskId": task_id}, "data": {"sessionId": session_id, "taskId": task_id},
})) }))
# Stream events → collect text
deadline = time.time() + timeout_secs deadline = time.time() + timeout_secs
while time.time() < deadline: while time.time() < deadline:
remaining = deadline - time.time() remaining = deadline - time.time()
try: try:
raw = await asyncio.wait_for( raw = await asyncio.wait_for(ws.recv(), timeout=min(30.0, remaining))
ws.recv(), timeout=min(30.0, remaining)
)
except asyncio.TimeoutError: except asyncio.TimeoutError:
if time.time() >= deadline:
logger.warning("inject stream timeout after %ds", timeout_secs)
continue continue
except websockets.exceptions.ConnectionClosed: except websockets.exceptions.ConnectionClosed:
logger.warning("inject WS connection closed")
break break
try: try:
@ -218,33 +317,54 @@ class AgentServiceLLM(llm.LLM):
except (json.JSONDecodeError, TypeError): except (json.JSONDecodeError, TypeError):
continue continue
event_type = msg.get("event", "") if msg.get("event") != "stream_event":
continue
if event_type == "stream_event": evt_data = msg.get("data", {})
evt_data = msg.get("data", {}) evt_type = evt_data.get("type", "")
evt_type = evt_data.get("type", "")
if evt_type == "text": if evt_type == "text":
content = evt_data.get("content", "") collected_text += evt_data.get("content", "")
if content: elif evt_type == "completed":
collected_text += content return collected_text
elif evt_type == "error":
elif evt_type == "completed": break
logger.info(
"inject stream completed, text length=%d",
len(collected_text),
)
return collected_text
elif evt_type == "error":
err_msg = evt_data.get("message", "Unknown error")
logger.error("inject error: %s", err_msg)
if "aborted" in err_msg.lower() or "exited" in err_msg.lower():
self._agent_session_id = None
return collected_text if collected_text else ""
return collected_text return collected_text
async def terminate_voice_session(self) -> None:
"""
Terminate the long-lived voice agent session.
Called when the user hangs up (LiveKit room disconnects).
Signals agent-service to abort any running SDK task and clean up.
"""
if not self._voice_session_started or not self._agent_session_id:
return
session_id = self._agent_session_id
headers: dict[str, str] = {"Content-Type": "application/json"}
if self._auth_header:
headers["Authorization"] = self._auth_header
try:
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=5, read=10, write=5, pool=5),
) as client:
resp = await client.delete(
f"{self._agent_service_url}/api/v1/agent/sessions/{session_id}/voice",
headers=headers,
)
logger.info(
"Voice session terminated: %s (status=%d)",
session_id, resp.status_code,
)
except Exception as exc:
logger.error("terminate_voice_session error: %s: %s", type(exc).__name__, exc)
finally:
self._voice_session_started = False
self._agent_session_id = None
class AgentServiceLLMStream(llm.LLMStream): class AgentServiceLLMStream(llm.LLMStream):
"""Streams text from agent-service via WebSocket.""" """Streams text from agent-service via WebSocket."""
@ -265,13 +385,12 @@ class AgentServiceLLMStream(llm.LLMStream):
) )
self._llm_instance = llm_instance self._llm_instance = llm_instance
# Retry configuration # Retry configuration (used in fallback / non-voice mode only)
_MAX_RETRIES = 2 _MAX_RETRIES = 2
_RETRY_DELAYS = [1.0, 3.0] # seconds between retries _RETRY_DELAYS = [1.0, 3.0] # seconds between retries
async def _run(self) -> None: async def _run(self) -> None:
# Extract the latest user message from ChatContext # Extract the latest user message from ChatContext
# items can contain ChatMessage and AgentConfigUpdate; filter by type
user_text = "" user_text = ""
for item in reversed(self._chat_ctx.items): for item in reversed(self._chat_ctx.items):
if getattr(item, "type", None) != "message": if getattr(item, "type", None) != "message":
@ -295,6 +414,32 @@ class AgentServiceLLMStream(llm.LLMStream):
return return
request_id = f"agent-{uuid.uuid4().hex[:12]}" request_id = f"agent-{uuid.uuid4().hex[:12]}"
# -----------------------------------------------------------------
# Voice session mode: inject into the long-lived run loop
# -----------------------------------------------------------------
if self._llm_instance._voice_session_started:
try:
await self._do_stream_voice(user_text, request_id)
except Exception as exc:
logger.error(
"Voice inject stream error: %s: %s",
type(exc).__name__, exc, exc_info=True,
)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(
role="assistant",
content="抱歉,语音服务暂时不可用。",
),
)
)
return
# -----------------------------------------------------------------
# Fallback / text mode: per-turn POST /tasks with retry
# -----------------------------------------------------------------
last_error: Exception | None = None last_error: Exception | None = None
for attempt in range(self._MAX_RETRIES + 1): for attempt in range(self._MAX_RETRIES + 1):
@ -308,7 +453,6 @@ class AgentServiceLLMStream(llm.LLMStream):
return # success return # success
except (httpx.ConnectError, httpx.ConnectTimeout, OSError) as exc: except (httpx.ConnectError, httpx.ConnectTimeout, OSError) as exc:
# Network-level errors — retryable
last_error = exc last_error = exc
logger.warning( logger.warning(
"Agent stream attempt %d failed (network): %s: %s", "Agent stream attempt %d failed (network): %s: %s",
@ -321,7 +465,6 @@ class AgentServiceLLMStream(llm.LLMStream):
attempt + 1, getattr(exc, "status_code", "?"), attempt + 1, getattr(exc, "status_code", "?"),
) )
except Exception as exc: except Exception as exc:
# Non-retryable errors — fail immediately
logger.error("Agent stream error: %s: %s", type(exc).__name__, exc) logger.error("Agent stream error: %s: %s", type(exc).__name__, exc)
self._event_ch.send_nowait( self._event_ch.send_nowait(
llm.ChatChunk( llm.ChatChunk(
@ -334,7 +477,6 @@ class AgentServiceLLMStream(llm.LLMStream):
) )
return return
# All retries exhausted
logger.error( logger.error(
"Agent stream failed after %d attempts: %s", "Agent stream failed after %d attempts: %s",
self._MAX_RETRIES + 1, last_error, self._MAX_RETRIES + 1, last_error,
@ -349,8 +491,150 @@ class AgentServiceLLMStream(llm.LLMStream):
) )
) )
# -------------------------------------------------------------------------
# Voice session stream: inject + subscribe
# -------------------------------------------------------------------------
async def _do_stream_voice(self, user_text: str, request_id: str) -> None:
"""
Voice session mode: inject the user utterance into the long-lived
agent run loop via POST voice/inject, then subscribe to the WebSocket
to receive the streaming response as ChatChunks.
Unlike the per-task approach (_do_stream), no new task is created here
the VoiceSessionManager's run loop handles task creation internally.
"""
import time
agent_url = self._llm_instance._agent_service_url
auth_header = self._llm_instance._auth_header
session_id = self._llm_instance._agent_session_id
if not session_id:
logger.error("_do_stream_voice: no session_id — falling back to per-task mode")
await self._do_stream(user_text, request_id)
return
headers: dict[str, str] = {"Content-Type": "application/json"}
if auth_header:
headers["Authorization"] = auth_header
ws_url = agent_url.replace("http://", "ws://").replace("https://", "wss://")
ws_url = f"{ws_url}/ws/agent"
timeout_secs = 120
logger.info(
"Voice inject: session=%s text=%s",
session_id, user_text[:80],
)
async with websockets.connect(
ws_url,
open_timeout=10,
close_timeout=5,
ping_interval=20,
ping_timeout=10,
) as ws:
# 1. Pre-subscribe to receive events as soon as the task starts
await ws.send(json.dumps({
"event": "subscribe_session",
"data": {"sessionId": session_id},
}))
# 2. Inject the message (enqueues to the run loop; async return)
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=15, write=10, pool=10),
) as client:
resp = await client.post(
f"{agent_url}/api/v1/agent/sessions/{session_id}/voice/inject",
json={"message": user_text},
headers=headers,
)
if resp.status_code not in (200, 201):
logger.error(
"Voice inject failed: %d %s",
resp.status_code, resp.text[:200],
)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(
role="assistant",
content="抱歉,语音指令注入失败。",
),
)
)
return
# 3. Emit initial role delta (LiveKit convention)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(role="assistant"),
)
)
# 4. Stream events → ChatChunks until completed/error
deadline = time.time() + timeout_secs
while time.time() < deadline:
remaining = deadline - time.time()
try:
raw = await asyncio.wait_for(
ws.recv(), timeout=min(30.0, remaining)
)
except asyncio.TimeoutError:
if time.time() >= deadline:
logger.warning("Voice inject stream timeout after %ds", timeout_secs)
continue
except websockets.exceptions.ConnectionClosed:
logger.warning("Voice inject WS connection closed")
break
try:
msg = json.loads(raw)
except (json.JSONDecodeError, TypeError):
continue
if msg.get("event") != "stream_event":
continue
evt_data = msg.get("data", {})
evt_type = evt_data.get("type", "")
if evt_type == "text":
content = evt_data.get("content", "")
if content:
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(content=content),
)
)
elif evt_type == "completed":
logger.info("Voice inject stream completed")
return
elif evt_type == "error":
err_msg = evt_data.get("message", "Unknown error")
logger.error("Voice inject agent error: %s", err_msg)
self._event_ch.send_nowait(
llm.ChatChunk(
id=request_id,
delta=llm.ChoiceDelta(content=f"Agent 错误: {err_msg}"),
)
)
return
# -------------------------------------------------------------------------
# Legacy per-task stream (text chat / fallback)
# -------------------------------------------------------------------------
async def _do_stream(self, user_text: str, request_id: str) -> None: async def _do_stream(self, user_text: str, request_id: str) -> None:
"""Execute a single WS+HTTP streaming attempt.""" """Execute a single WS+HTTP streaming attempt (text-chat / non-voice mode)."""
import time import time
agent_url = self._llm_instance._agent_service_url agent_url = self._llm_instance._agent_service_url
@ -382,19 +666,14 @@ class AgentServiceLLMStream(llm.LLMStream):
# 2. Create agent task (with timeout) # 2. Create agent task (with timeout)
engine_type = self._llm_instance._engine_type engine_type = self._llm_instance._engine_type
# Voice mode flag: tell agent-service to filter intermediate events
# (tool_use, tool_result, thinking) — only stream text + completed + error
voice_mode = engine_type == "claude_agent_sdk" voice_mode = engine_type == "claude_agent_sdk"
body: dict[str, Any] = { body: dict[str, Any] = {
"prompt": user_text, # always send clean user text (no wrapping) "prompt": user_text,
"engineType": engine_type, "engineType": engine_type,
"voiceMode": voice_mode, "voiceMode": voice_mode,
} }
# Agent SDK mode: set systemPrompt once (not per-message) so
# conversation history stays clean — identical to text chat pattern
if voice_mode: if voice_mode:
body["systemPrompt"] = ( body["systemPrompt"] = (
"你正在通过语音与用户实时对话。请严格遵守以下规则:\n" "你正在通过语音与用户实时对话。请严格遵守以下规则:\n"
@ -409,9 +688,7 @@ class AgentServiceLLMStream(llm.LLMStream):
logger.info( logger.info(
"POST /tasks engine=%s voiceMode=%s user_text=%s", "POST /tasks engine=%s voiceMode=%s user_text=%s",
engine_type, engine_type, voice_mode, user_text[:80],
voice_mode,
user_text[:80],
) )
async with httpx.AsyncClient( async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10), timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),
@ -505,15 +782,9 @@ class AgentServiceLLMStream(llm.LLMStream):
elif evt_type == "error": elif evt_type == "error":
err_msg = evt_data.get("message", "Unknown error") err_msg = evt_data.get("message", "Unknown error")
logger.error("Agent error: %s", err_msg) logger.error("Agent error: %s", err_msg)
# Clear session so next task starts fresh
# (don't try to resume a dead/aborted session)
# But skip if inject is in progress — it owns the session
if "aborted" in err_msg.lower() or "exited" in err_msg.lower(): if "aborted" in err_msg.lower() or "exited" in err_msg.lower():
if not self._llm_instance._injecting: logger.info("Clearing agent session after abort/exit")
logger.info("Clearing agent session after abort/exit") self._llm_instance._agent_session_id = None
self._llm_instance._agent_session_id = None
else:
logger.info("Skipping session clear — inject in progress")
self._event_ch.send_nowait( self._event_ch.send_nowait(
llm.ChatChunk( llm.ChatChunk(
id=request_id, id=request_id,