# it0/packages/services/voice-agent/src/agent.py

"""
IT0 Voice Agent — LiveKit Agents v1.x entry point.
Uses the official AgentServer + @server.rtc_session() pattern.
Pipeline: VAD → STT → LLM (via agent-service) → TTS.
Voice Session Lifecycle (long-lived agent run loop)
----------------------------------------------------
Each voice call maps to ONE long-lived agent session in agent-service,
instead of spawning a new process for every speech turn.
Call starts → POST /api/v1/agent/sessions/voice/start
agent-service creates an AgentSession, starts a background
run loop, and returns a sessionId.
User speaks → LiveKit STT → AgentServiceLLM._run()
→ POST /:sessionId/voice/inject
agent-service enqueues the utterance; run loop picks it up,
calls Claude Agent SDK, streams events back via WebSocket.
User hangs up → room "disconnected" event → _on_room_disconnect()
→ DELETE /:sessionId/voice
agent-service aborts the run loop and marks session completed.
Interruption (mid-turn abort)
------------------------------
When the user speaks while the agent is still responding:
1. LiveKit framework stops TTS playback immediately (client-side).
2. STT produces the new utterance → voice/inject is called.
3. agent-service detects a turn is already running → aborts it (per-turn
AbortController) → enqueues the new message.
4. The SDK loop breaks silently; no error message is emitted to TTS.
5. The new turn starts, producing the response to the interrupting utterance.
Agent State & Thinking Indicator
---------------------------------
LiveKit AgentSession (v1.4.3+) automatically publishes the participant
attribute ``lk.agent.state`` with these values:
initializing → listening → thinking → speaking → listening → ...
The state transition happens inside the framework:
- RoomIO._on_agent_state_changed() calls
local_participant.set_attributes({"lk.agent.state": state})
On the Flutter side (livekit_client v2.6.4), the app listens for
ParticipantAttributesChanged events and reads the ``lk.agent.state``
attribute to drive UI changes:
- "thinking" → pulsing dots animation + "思考中..." text + orange avatar
- "speaking" → waveform animation driven by audio level
- "listening" → default call UI
BackgroundAudioPlayer is configured below to play a keyboard typing
sound effect during the "thinking" state as auditory feedback.
Usage:
python -m src.agent start
"""
import asyncio
import json
import logging
import ssl
import aiohttp
from livekit.agents import (
Agent,
AgentServer,
AgentSession,
JobContext,
JobProcess,
cli,
room_io,
)
from livekit.agents.voice.background_audio import BackgroundAudioPlayer, BuiltinAudioClip
from livekit.agents.utils import http_context
from livekit.plugins import silero
from .config import settings
# ---------------------------------------------------------------------------
# Monkey-patch: disable SSL verification for aiohttp sessions.
#
# The OpenAI Realtime STT uses aiohttp WebSocket (not httpx), so passing
# verify=False to the httpx/OpenAI client does NOT help. LiveKit's
# http_context._new_session_ctx creates an aiohttp.TCPConnector without
# ssl=False, causing SSL errors when OPENAI_BASE_URL points to a proxy
# with a self-signed certificate.
#
# We replace _new_session_ctx to inject ssl=False into the connector.
# ---------------------------------------------------------------------------
# Keep a handle to the library implementation we are replacing (useful for
# debugging / manual restoration; not otherwise used here).
_original_new_session_ctx = http_context._new_session_ctx

# A TLS client context that performs NO certificate or hostname verification.
# Required because OPENAI_BASE_URL may point at a proxy with a self-signed cert.
_no_verify_ssl = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
_no_verify_ssl.check_hostname = False
_no_verify_ssl.verify_mode = ssl.CERT_NONE


def _patched_new_session_ctx():
    """Same as the original but with ssl verification disabled."""
    # Lazily-created, process-wide aiohttp session shared by callers of the
    # returned factory; recreated if it was closed.
    _g_session = None

    def _new_session():
        nonlocal _g_session
        if _g_session is None or _g_session.closed:
            from livekit.agents.job import get_job_context
            try:
                # Inherit the worker's HTTP proxy if a job context is active.
                http_proxy = get_job_context().proc.http_proxy
            except RuntimeError:
                # No job context (e.g. outside a job) — no proxy.
                http_proxy = None
            # Same connector settings the library would use, plus our
            # no-verify SSL context injected.
            connector = aiohttp.TCPConnector(
                limit_per_host=50,
                keepalive_timeout=120,
                ssl=_no_verify_ssl,
            )
            _g_session = aiohttp.ClientSession(proxy=http_proxy, connector=connector)
        return _g_session

    # NOTE(review): mirrors the library's own context-var registration —
    # assumed to match http_context's internal protocol; confirm against the
    # installed livekit-agents version when upgrading.
    http_context._ContextVar.set(_new_session)
    return _new_session


# Install the patched factory so all LiveKit-managed aiohttp sessions skip
# certificate verification.
http_context._new_session_ctx = _patched_new_session_ctx
from .plugins.agent_llm import AgentServiceLLM
from .plugins.whisper_stt import LocalWhisperSTT
from .plugins.kokoro_tts import LocalKokoroTTS, patch_misaki_compat
# Module-wide logging: INFO-level root config plus a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class IT0VoiceAgent(Agent):
    """Voice agent for the IT0 server-operations platform.

    Supplies the (Chinese) system instructions and deliberately produces
    no greeting when it joins a session.
    """

    def __init__(self) -> None:
        # System prompt: concise answers suited to a voice conversation.
        system_prompt = (
            "你是 IT0 服务器运维助手。用户通过语音与你对话,"
            "你帮助管理和监控服务器集群。回答简洁,适合语音对话场景。"
        )
        super().__init__(instructions=system_prompt)

    async def on_enter(self) -> None:
        """Stay silent on entry — the user speaks first."""
# ---------------------------------------------------------------------------
# Server setup
# ---------------------------------------------------------------------------
server = AgentServer()


def prewarm(proc: JobProcess) -> None:
    """Pre-load ML models into shared process memory.

    Runs once per worker process. Everything stored in ``proc.userdata``
    is shared by every session this process handles, so models are never
    re-loaded per call.
    """
    logger.info(
        "Prewarming models (stt=%s, tts=%s, device=%s)",
        settings.stt_provider,
        settings.tts_provider,
        settings.device,
    )

    # Voice-activity detection is needed regardless of provider choice.
    proc.userdata["vad"] = silero.VAD.load()
    logger.info("VAD loaded: Silero VAD")

    # Speech-to-text: local faster-whisper, or defer to OpenAI at runtime.
    if settings.stt_provider != "local":
        proc.userdata["whisper_model"] = None
        logger.info("STT: using OpenAI %s", settings.openai_stt_model)
    else:
        from faster_whisper import WhisperModel

        if settings.device == "cuda":
            precision = "float16"
        else:
            precision = "int8"
        try:
            whisper = WhisperModel(
                settings.whisper_model,
                device=settings.device,
                compute_type=precision,
            )
        except Exception as err:
            # GPU init can fail (driver/CUDA issues); retry on CPU.
            logger.warning("Whisper GPU failed, falling back to CPU: %s", err)
            whisper = WhisperModel(
                settings.whisper_model, device="cpu", compute_type="int8"
            )
        proc.userdata["whisper_model"] = whisper
        logger.info("STT loaded: faster-whisper %s", settings.whisper_model)

    # Text-to-speech: local Kokoro pipeline, or defer to OpenAI at runtime.
    if settings.tts_provider != "local":
        proc.userdata["kokoro_pipeline"] = None
        logger.info("TTS: using OpenAI %s", settings.openai_tts_model)
    else:
        patch_misaki_compat()
        from kokoro import KPipeline

        proc.userdata["kokoro_pipeline"] = KPipeline(lang_code="z")
        logger.info("TTS loaded: Kokoro-82M voice=%s", settings.kokoro_voice)

    logger.info("Prewarm complete.")


server.setup_fnc = prewarm
# ---------------------------------------------------------------------------
# Session entrypoint — called for each voice session (room join)
# ---------------------------------------------------------------------------
@server.rtc_session(agent_name="voice-agent")
async def entrypoint(ctx: JobContext) -> None:
    """Main entrypoint — called for each voice session.

    NOTE: session.start() returns immediately while the session continues
    running in the background. Resources (httpx clients) must stay alive
    for the session's lifetime and are cleaned up via the room disconnect
    listener, NOT in a finally block.
    """
    logger.info("New voice session: room=%s", ctx.room.name)
    # httpx clients to close when the room disconnects
    _http_clients: list = []
    # _on_room_disconnect is defined AFTER llm is built (it calls terminate_voice_session).
    # We register it via a mutable reference cell so the event listener can find it
    # even before the function is defined.
    _cleanup_ref: list = []
    # NOTE: if the room disconnects before _cleanup_ref is populated, the
    # lambda is a no-op — nothing has been created yet that needs cleanup.
    ctx.room.on(
        "disconnected",
        lambda *_: asyncio.ensure_future(_cleanup_ref[0]()) if _cleanup_ref else None,
    )
    try:
        # Extract auth header from job metadata
        # The token endpoint embeds {"auth_header": "Bearer ..."} via RoomAgentDispatch metadata,
        # which LiveKit passes through as job.metadata to the agent worker.
        auth_header = ""
        tts_voice = settings.openai_tts_voice
        tts_style = ""
        engine_type = "claude_agent_sdk"
        meta = {}
        try:
            meta_str = ctx.job.metadata or "{}"
            meta = json.loads(meta_str)
            auth_header = meta.get("auth_header", "")
            tts_voice = meta.get("tts_voice", settings.openai_tts_voice)
            tts_style = meta.get("tts_style", "")
            engine_type = meta.get("engine_type", "claude_agent_sdk")
        except Exception as e:
            # Malformed metadata is non-fatal; the defaults above remain in effect.
            logger.warning("Failed to parse job metadata: %s", e)
        logger.info("Auth header present: %s, TTS: voice=%s, style=%s, engine=%s",
                    bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)", engine_type)
        # ── Resolve STT provider (metadata > agent-service config > env default) ──
        stt_provider = meta.get("stt_provider", "")
        if not stt_provider and auth_header:
            try:
                import httpx as _httpx_cfg
                async with _httpx_cfg.AsyncClient(timeout=_httpx_cfg.Timeout(5)) as _cfg_client:
                    _cfg_resp = await _cfg_client.get(
                        f"{settings.agent_service_url}/api/v1/agent/voice-config",
                        headers={"Authorization": auth_header},
                    )
                    if _cfg_resp.status_code == 200:
                        _voice_cfg = _cfg_resp.json()
                        stt_provider = _voice_cfg.get("stt_provider", "")
                        logger.info("Voice config from agent-service: stt_provider=%s", stt_provider)
            except Exception as e:
                # Config fetch is best-effort; fall through to the env default.
                logger.warning("Failed to fetch voice config from agent-service: %s", e)
        if not stt_provider:
            stt_provider = settings.stt_provider  # env var fallback
        # ── Build STT ──
        if stt_provider == "openai":
            from livekit.plugins import openai as openai_plugin
            import httpx as _httpx
            import openai as _openai
            # OPENAI_BASE_URL may use a self-signed certificate (e.g. proxy)
            _http_client = _httpx.AsyncClient(verify=False)
            _http_clients.append(_http_client)
            _oai_client = _openai.AsyncOpenAI(http_client=_http_client)
            stt = openai_plugin.STT(
                model=settings.openai_stt_model,
                language=settings.whisper_language,
                client=_oai_client,
                use_realtime=True,
                # Increase silence_duration_ms so Chinese speech isn't chopped
                # into tiny fragments (default 350ms is too aggressive).
                turn_detection={
                    "type": "server_vad",
                    "threshold": 0.6,
                    "prefix_padding_ms": 600,
                    "silence_duration_ms": 800,
                },
            )
        else:
            # Local faster-whisper model pre-loaded by prewarm() (may be None
            # if STT_PROVIDER changed after prewarm; LocalWhisperSTT receives
            # whatever was stored).
            stt = LocalWhisperSTT(
                model=ctx.proc.userdata.get("whisper_model"),
                language=settings.whisper_language,
            )
        logger.info("STT provider selected: %s", stt_provider)
        # Build TTS
        if settings.tts_provider == "openai":
            from livekit.plugins import openai as openai_plugin
            import httpx as _httpx
            import openai as _openai
            # Separate unverified client for TTS (same self-signed-cert reason).
            _http_client_tts = _httpx.AsyncClient(verify=False)
            _http_clients.append(_http_client_tts)
            _oai_client_tts = _openai.AsyncOpenAI(http_client=_http_client_tts)
            default_instructions = "用自然、友好的中文语气说话,语速稍快,简洁干练,像专业助手一样。"
            tts = openai_plugin.TTS(
                model=settings.openai_tts_model,
                voice=tts_voice,
                instructions=tts_style if tts_style else default_instructions,
                client=_oai_client_tts,
            )
        else:
            # Local Kokoro pipeline pre-loaded by prewarm().
            tts = LocalKokoroTTS(
                pipeline=ctx.proc.userdata.get("kokoro_pipeline"),
                voice=settings.kokoro_voice,
            )
        # Build custom LLM (proxies to agent-service)
        llm = AgentServiceLLM(
            agent_service_url=settings.agent_service_url,
            auth_header=auth_header,
            engine_type=engine_type,
        )
        # Create and start AgentSession with the full pipeline
        session = AgentSession(
            vad=ctx.proc.userdata["vad"],
            stt=stt,
            llm=llm,
            tts=tts,
        )
        await session.start(
            agent=IT0VoiceAgent(),
            room=ctx.room,
            room_input_options=room_io.RoomInputOptions(),
            room_output_options=room_io.RoomOutputOptions(),
        )
        # ── Voice session lifecycle ───────────────────────────────────────────
        # For Agent SDK engine: start the long-lived voice session in agent-service.
        # This spawns a persistent run loop that accepts injected messages for the
        # duration of this call, replacing the per-turn POST /tasks approach.
        if engine_type == "claude_agent_sdk":
            started_session_id = await llm.start_voice_session()
            if started_session_id:
                logger.info("Long-lived voice session ready: %s", started_session_id)
            else:
                logger.warning("start_voice_session failed — falling back to per-task mode")

        # Register the disconnect handler now that llm is ready.
        # This is the ONLY disconnect handler; it terminates the voice session
        # (signals the agent to stop) AND closes any httpx clients.
        async def _on_room_disconnect() -> None:
            logger.info("Room disconnected: %s — terminating voice session", ctx.room.name)
            # 1. Terminate the long-lived agent run loop (user hung up)
            await llm.terminate_voice_session()
            # 2. Close httpx clients
            for client in _http_clients:
                try:
                    await client.aclose()
                except Exception:
                    # Best-effort close during teardown; nothing useful to do on failure.
                    pass
            logger.info("Cleaned up %d httpx client(s) for room %s",
                        len(_http_clients), ctx.room.name)

        _cleanup_ref.append(_on_room_disconnect)
        # --- Thinking state audio feedback ---
        # BackgroundAudioPlayer listens for AgentStateChangedEvent from the
        # session. When state transitions to "thinking" (STT done, waiting for
        # LLM response), it plays the built-in keyboard typing sound through
        # the LiveKit audio track. The sound stops automatically when the agent
        # enters "speaking" state (TTS begins). This gives the user audible
        # feedback that the AI is processing their request.
        # Available built-in clips: KEYBOARD_TYPING, KEYBOARD_TYPING2,
        # OFFICE_AMBIENCE, CITY_AMBIENCE, FOREST_AMBIENCE, CROWDED_ROOM, HOLD_MUSIC
        bg_audio = BackgroundAudioPlayer(
            thinking_sound=BuiltinAudioClip.KEYBOARD_TYPING,
        )
        await bg_audio.start(room=ctx.room, agent_session=session)
        logger.info("Voice session started for room %s", ctx.room.name)

        # ---------------------------------------------------------------------
        # Data channel listener: receive text + attachments from Flutter client
        # ---------------------------------------------------------------------
        async def _on_data_received(data_packet) -> None:
            try:
                # Only handle packets published on the "text_inject" topic.
                if data_packet.topic != "text_inject":
                    return
                payload = json.loads(data_packet.data.decode("utf-8"))
                text = payload.get("text", "")
                attachments = payload.get("attachments")
                logger.info(
                    "text_inject received: text=%s attachments=%d",
                    text[:80] if text else "(empty)",
                    len(attachments) if attachments else 0,
                )
                if not text and not attachments:
                    return
                # Call agent-service with the same session (context preserved)
                response = await llm.inject_text_message(
                    text=text,
                    attachments=attachments,
                )
                if response:
                    logger.info("inject response: %s", response[:100])
                    # Speak the reply through the TTS pipeline.
                    session.say(response)
                    # Send response text back to Flutter for display
                    try:
                        reply_payload = json.dumps({
                            "type": "text_reply",
                            "text": response,
                        }).encode("utf-8")
                        await ctx.room.local_participant.publish_data(
                            reply_payload,
                            reliable=True,
                            topic="text_reply",
                        )
                    except Exception as pub_err:
                        # Failing to echo the text back is non-fatal; audio was spoken.
                        logger.warning("Failed to publish text_reply: %s", pub_err)
                else:
                    logger.warning("inject_text_message returned empty response")
            except Exception as exc:
                logger.error(
                    "text_inject handler error: %s: %s",
                    type(exc).__name__, exc, exc_info=True,
                )

        # Use ensure_future because ctx.room.on() uses a sync event emitter
        # (same pattern as the "disconnected" handler above)
        ctx.room.on("data_received", lambda dp: asyncio.ensure_future(_on_data_received(dp)))
    except Exception as exc:
        # Top-level boundary for the whole setup path: log and let the job end.
        logger.error(
            "Voice session failed for room %s: %s: %s",
            ctx.room.name, type(exc).__name__, exc, exc_info=True,
        )
if __name__ == "__main__":
cli.run_app(server)