# it0/packages/services/voice-agent/src/agent.py

"""
IT0 Voice Agent — LiveKit Agents v1.x entry point.
Uses the official AgentServer + @server.rtc_session() pattern.
Pipeline: VAD → STT → LLM (via agent-service) → TTS.
Usage:
python -m src.agent start
"""
import asyncio
import json
import logging
import ssl
import aiohttp
from livekit.agents import (
Agent,
AgentServer,
AgentSession,
JobContext,
JobProcess,
cli,
room_io,
)
from livekit.agents.utils import http_context
from livekit.plugins import silero
from .config import settings
# ---------------------------------------------------------------------------
# Monkey-patch: disable SSL verification for aiohttp sessions.
#
# The OpenAI Realtime STT uses aiohttp WebSocket (not httpx), so passing
# verify=False to the httpx/OpenAI client does NOT help. LiveKit's
# http_context._new_session_ctx creates an aiohttp.TCPConnector without
# ssl=False, causing SSL errors when OPENAI_BASE_URL points to a proxy
# with a self-signed certificate.
#
# We replace _new_session_ctx to inject ssl=False into the connector.
# ---------------------------------------------------------------------------
_original_new_session_ctx = http_context._new_session_ctx

# TLS context that accepts any certificate: no hostname check, no CA check.
_no_verify_ssl = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
_no_verify_ssl.check_hostname = False
_no_verify_ssl.verify_mode = ssl.CERT_NONE


def _patched_new_session_ctx():
    """Same as the original but with ssl verification disabled."""
    _cached = None

    def _get_session():
        nonlocal _cached
        if _cached is not None and not _cached.closed:
            return _cached
        from livekit.agents.job import get_job_context

        try:
            proxy = get_job_context().proc.http_proxy
        except RuntimeError:
            # No active job context — run without a per-job proxy.
            proxy = None
        _cached = aiohttp.ClientSession(
            proxy=proxy,
            connector=aiohttp.TCPConnector(
                limit_per_host=50,
                keepalive_timeout=120,
                ssl=_no_verify_ssl,
            ),
        )
        return _cached

    http_context._ContextVar.set(_get_session)
    return _get_session


http_context._new_session_ctx = _patched_new_session_ctx
from .plugins.agent_llm import AgentServiceLLM
from .plugins.whisper_stt import LocalWhisperSTT
from .plugins.kokoro_tts import LocalKokoroTTS, patch_misaki_compat
# Basic stdout logging for the worker; module-level logger for this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class IT0VoiceAgent(Agent):
    """Voice agent for the IT0 server-operations platform."""

    def __init__(self) -> None:
        # System prompt (Chinese): server-ops assistant, concise answers
        # suited to a spoken conversation.
        super().__init__(
            instructions=(
                "你是 IT0 服务器运维助手。用户通过语音与你对话,"
                "你帮助管理和监控服务器集群。回答简洁,适合语音对话场景。"
            ),
        )

    async def on_enter(self) -> None:
        """Stay silent on join — the user speaks first."""
# ---------------------------------------------------------------------------
# Server setup
# ---------------------------------------------------------------------------
# Module-level AgentServer instance; the CLI entry point at the bottom runs it.
server = AgentServer()
def prewarm(proc: JobProcess) -> None:
    """Pre-load ML models into shared process memory.

    Runs once per worker process. Every session the process handles
    reuses the objects stashed in ``proc.userdata``, avoiding redundant
    model loading.
    """
    logger.info(
        "Prewarming models (stt=%s, tts=%s, device=%s)",
        settings.stt_provider,
        settings.tts_provider,
        settings.device,
    )

    # Silero VAD is needed regardless of the STT/TTS providers.
    proc.userdata["vad"] = silero.VAD.load()
    logger.info("VAD loaded: Silero VAD")

    # STT — load faster-whisper locally; otherwise the session uses OpenAI.
    whisper = None
    if settings.stt_provider == "local":
        from faster_whisper import WhisperModel

        precision = "float16" if settings.device == "cuda" else "int8"
        try:
            whisper = WhisperModel(
                settings.whisper_model,
                device=settings.device,
                compute_type=precision,
            )
        except Exception as e:
            logger.warning("Whisper GPU failed, falling back to CPU: %s", e)
            whisper = WhisperModel(
                settings.whisper_model, device="cpu", compute_type="int8"
            )
    proc.userdata["whisper_model"] = whisper
    if whisper is not None:
        logger.info("STT loaded: faster-whisper %s", settings.whisper_model)
    else:
        logger.info("STT: using OpenAI %s", settings.openai_stt_model)

    # TTS — load Kokoro locally; otherwise the session uses OpenAI.
    pipeline = None
    if settings.tts_provider == "local":
        patch_misaki_compat()
        from kokoro import KPipeline

        pipeline = KPipeline(lang_code="z")
    proc.userdata["kokoro_pipeline"] = pipeline
    if pipeline is not None:
        logger.info("TTS loaded: Kokoro-82M voice=%s", settings.kokoro_voice)
    else:
        logger.info("TTS: using OpenAI %s", settings.openai_tts_model)

    logger.info("Prewarm complete.")


server.setup_fnc = prewarm
# ---------------------------------------------------------------------------
# Session entrypoint — called for each voice session (room join)
# ---------------------------------------------------------------------------
@server.rtc_session(agent_name="voice-agent")
async def entrypoint(ctx: JobContext) -> None:
    """Main entrypoint — called for each voice session.

    NOTE: session.start() returns immediately while the session continues
    running in the background. Resources (httpx clients) must stay alive
    for the session's lifetime and are cleaned up via the room disconnect
    listener, NOT in a finally block.
    """
    logger.info("New voice session: room=%s", ctx.room.name)

    # httpx clients that must be closed when the room disconnects.
    _http_clients: list = []

    async def _on_room_disconnect() -> None:
        """Best-effort close of every httpx client opened for this session."""
        for client in _http_clients:
            try:
                await client.aclose()
            except Exception:
                pass
        logger.info("Cleaned up %d httpx client(s) for room %s",
                    len(_http_clients), ctx.room.name)

    # Register cleanup before anything else so it fires even on errors.
    ctx.room.on("disconnected", lambda *_: asyncio.ensure_future(_on_room_disconnect()))

    try:
        # Extract auth header from job metadata. The token endpoint embeds
        # {"auth_header": "Bearer ..."} via RoomAgentDispatch metadata, which
        # LiveKit passes through as job.metadata to the agent worker.
        auth_header = ""
        tts_voice = settings.openai_tts_voice
        tts_style = ""
        engine_type = "claude_agent_sdk"
        meta = {}
        try:
            meta = json.loads(ctx.job.metadata or "{}")
            auth_header = meta.get("auth_header", "")
            tts_voice = meta.get("tts_voice", settings.openai_tts_voice)
            tts_style = meta.get("tts_style", "")
            engine_type = meta.get("engine_type", "claude_agent_sdk")
        except Exception as e:
            logger.warning("Failed to parse job metadata: %s", e)
        logger.info("Auth header present: %s, TTS: voice=%s, style=%s, engine=%s",
                    bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)", engine_type)

        # ── Resolve STT provider (metadata > agent-service config > env default) ──
        stt_provider = meta.get("stt_provider", "")
        if not stt_provider and auth_header:
            try:
                import httpx as _httpx_cfg

                async with _httpx_cfg.AsyncClient(timeout=_httpx_cfg.Timeout(5)) as cfg_client:
                    cfg_resp = await cfg_client.get(
                        f"{settings.agent_service_url}/api/v1/agent/voice-config",
                        headers={"Authorization": auth_header},
                    )
                    if cfg_resp.status_code == 200:
                        stt_provider = cfg_resp.json().get("stt_provider", "")
                        logger.info("Voice config from agent-service: stt_provider=%s", stt_provider)
            except Exception as e:
                logger.warning("Failed to fetch voice config from agent-service: %s", e)
        if not stt_provider:
            stt_provider = settings.stt_provider  # env var fallback

        # ── Build STT ──
        if stt_provider == "openai":
            from livekit.plugins import openai as openai_plugin
            import httpx as _httpx
            import openai as _openai

            # OPENAI_BASE_URL may use a self-signed certificate (e.g. proxy).
            stt_http = _httpx.AsyncClient(verify=False)
            _http_clients.append(stt_http)
            stt = openai_plugin.STT(
                model=settings.openai_stt_model,
                language=settings.whisper_language,
                client=_openai.AsyncOpenAI(http_client=stt_http),
                use_realtime=True,
                # Increase silence_duration_ms so Chinese speech isn't chopped
                # into tiny fragments (default 350ms is too aggressive).
                turn_detection={
                    "type": "server_vad",
                    "threshold": 0.6,
                    "prefix_padding_ms": 600,
                    "silence_duration_ms": 800,
                },
            )
        else:
            stt = LocalWhisperSTT(
                model=ctx.proc.userdata.get("whisper_model"),
                language=settings.whisper_language,
            )
        logger.info("STT provider selected: %s", stt_provider)

        # ── Build TTS ──
        if settings.tts_provider == "openai":
            from livekit.plugins import openai as openai_plugin
            import httpx as _httpx
            import openai as _openai

            tts_http = _httpx.AsyncClient(verify=False)
            _http_clients.append(tts_http)
            default_instructions = "用自然、友好的中文语气说话,语速稍快,简洁干练,像专业助手一样。"
            tts = openai_plugin.TTS(
                model=settings.openai_tts_model,
                voice=tts_voice,
                instructions=tts_style if tts_style else default_instructions,
                client=_openai.AsyncOpenAI(http_client=tts_http),
            )
        else:
            tts = LocalKokoroTTS(
                pipeline=ctx.proc.userdata.get("kokoro_pipeline"),
                voice=settings.kokoro_voice,
            )

        # Custom LLM that proxies chat turns to agent-service.
        llm = AgentServiceLLM(
            agent_service_url=settings.agent_service_url,
            auth_header=auth_header,
            engine_type=engine_type,
        )

        # Assemble and start the VAD → STT → LLM → TTS pipeline.
        session = AgentSession(
            vad=ctx.proc.userdata["vad"],
            stt=stt,
            llm=llm,
            tts=tts,
        )
        await session.start(
            agent=IT0VoiceAgent(),
            room=ctx.room,
            room_input_options=room_io.RoomInputOptions(),
            room_output_options=room_io.RoomOutputOptions(),
        )
        logger.info("Voice session started for room %s", ctx.room.name)

        # -----------------------------------------------------------------
        # Data channel listener: receive text + attachments from Flutter client
        # -----------------------------------------------------------------
        async def _on_data_received(data_packet) -> None:
            try:
                if data_packet.topic != "text_inject":
                    return
                payload = json.loads(data_packet.data.decode("utf-8"))
                text = payload.get("text", "")
                attachments = payload.get("attachments")
                logger.info(
                    "text_inject received: text=%s attachments=%d",
                    text[:80] if text else "(empty)",
                    len(attachments) if attachments else 0,
                )
                if not text and not attachments:
                    return
                # Call agent-service with the same session (context preserved).
                response = await llm.inject_text_message(
                    text=text,
                    attachments=attachments,
                )
                if not response:
                    logger.warning("inject_text_message returned empty response")
                    return
                logger.info("inject response: %s", response[:100])
                session.say(response)
                # Send response text back to Flutter for display.
                try:
                    await ctx.room.local_participant.publish_data(
                        json.dumps({
                            "type": "text_reply",
                            "text": response,
                        }).encode("utf-8"),
                        reliable=True,
                        topic="text_reply",
                    )
                except Exception as pub_err:
                    logger.warning("Failed to publish text_reply: %s", pub_err)
            except Exception as exc:
                logger.error(
                    "text_inject handler error: %s: %s",
                    type(exc).__name__, exc, exc_info=True,
                )

        # Use ensure_future because ctx.room.on() uses a sync event emitter
        # (same pattern as the "disconnected" handler above).
        ctx.room.on("data_received", lambda dp: asyncio.ensure_future(_on_data_received(dp)))
    except Exception as exc:
        logger.error(
            "Voice session failed for room %s: %s: %s",
            ctx.room.name, type(exc).__name__, exc, exc_info=True,
        )
if __name__ == "__main__":
    # Launch the worker via the LiveKit Agents CLI (`python -m src.agent start`).
    cli.run_app(server)