From a06b489a1e13ee901aa8cefcbea31d7325d01bec Mon Sep 17 00:00:00 2001
From: hailin
Date: Fri, 20 Feb 2026 00:26:06 -0800
Subject: [PATCH] fix: load voice models in background thread to unblock startup

Model downloads (Whisper, Kokoro, Silero VAD) are synchronous blocking
calls that prevent uvicorn from completing startup and responding to
healthchecks. Move all model loading to a daemon thread so the server
starts immediately.

Co-Authored-By: Claude Opus 4.6
---
 .../services/voice-service/src/api/main.py | 74 +++++++++++++------
 1 file changed, 50 insertions(+), 24 deletions(-)

diff --git a/packages/services/voice-service/src/api/main.py b/packages/services/voice-service/src/api/main.py
index f98749d..02530b8 100644
--- a/packages/services/voice-service/src/api/main.py
+++ b/packages/services/voice-service/src/api/main.py
@@ -1,3 +1,5 @@
+import threading
+
 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
 
@@ -23,62 +25,86 @@ app.include_router(session_router, prefix="/api/v1/voice", tags=["sessions"])
 app.include_router(twilio_router, prefix="/api/v1/twilio", tags=["twilio"])
 
 
-@app.on_event("startup")
-async def startup():
-    """Load models on startup (graceful — server starts even if models fail)."""
+def _load_models_sync():
+    """Load ML models in a background thread (all blocking calls)."""
     from ..config.settings import settings
 
-    print("Voice service starting up...")
-    print(f" Device: {settings.device}")
-    print(f" Whisper model: {settings.whisper_model}")
+    def _p(msg):
+        print(msg, flush=True)
 
-    # Initialize STT service
+    _p(f"[bg] Loading models (device={settings.device}, whisper={settings.whisper_model})...")
+
+    # STT
     try:
         from ..stt.whisper_service import WhisperSTTService
+        from faster_whisper import WhisperModel
 
         stt = WhisperSTTService(
             model=settings.whisper_model,
             device=settings.device,
             language=settings.whisper_language,
         )
-        await stt.initialize()
+        compute_type = "float16" if settings.device == "cuda" else "int8"
+        try:
+            stt._model = WhisperModel(stt.model_name, device=stt.device, compute_type=compute_type)
+        except Exception as e:
+            _p(f"[bg] Whisper fallback to CPU: {e}")
+            if stt.device != "cpu":
+                stt._model = WhisperModel(stt.model_name, device="cpu", compute_type="int8")
         app.state.stt = stt
-        print(f"STT model loaded: {settings.whisper_model}")
+        _p(f"[bg] STT loaded: {settings.whisper_model}")
     except Exception as e:
         app.state.stt = None
-        print(f"WARNING: STT model failed to load: {e}")
+        _p(f"[bg] WARNING: STT failed: {e}")
 
-    # Initialize TTS service
+    # TTS
     try:
         from ..tts.kokoro_service import KokoroTTSService
+        from kokoro import KPipeline
 
-        tts = KokoroTTSService(
-            model=settings.kokoro_model,
-            voice=settings.kokoro_voice,
-        )
-        await tts.initialize()
+        tts = KokoroTTSService(model=settings.kokoro_model, voice=settings.kokoro_voice)
+        tts._pipeline = KPipeline(lang_code='z')
         app.state.tts = tts
-        print(f"TTS model loaded: {settings.kokoro_model}")
+        _p(f"[bg] TTS loaded: {settings.kokoro_model}")
     except Exception as e:
         app.state.tts = None
-        print(f"WARNING: TTS model failed to load: {e}")
+        _p(f"[bg] WARNING: TTS failed: {e}")
 
-    # Initialize VAD service
+    # VAD
     try:
         from ..vad.silero_service import SileroVADService
+        import torch
 
         vad = SileroVADService()
-        await vad.initialize()
+        vad._model, vad._utils = torch.hub.load(
+            repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False
+        )
         app.state.vad = vad
-        print("VAD model loaded: Silero VAD")
+        _p("[bg] VAD loaded: Silero VAD")
     except Exception as e:
         app.state.vad = None
-        print(f"WARNING: VAD model failed to load: {e}")
+        _p(f"[bg] WARNING: VAD failed: {e}")
 
-    print("Voice service startup complete.")
+    _p("[bg] Model loading complete.")
+
+
+@app.on_event("startup")
+async def startup():
+    """Start server immediately, load models in background thread."""
+    print("Voice service starting up...", flush=True)
+
+    app.state.stt = None
+    app.state.tts = None
+    app.state.vad = None
+
+    # Load models in background thread so server responds to healthchecks immediately
+    thread = threading.Thread(target=_load_models_sync, daemon=True)
+    thread.start()
+
+    print("Voice service ready (models loading in background).", flush=True)
 
 
 @app.on_event("shutdown")
 async def shutdown():
     """Cleanup on shutdown."""
-    print("Voice service shutting down...")
+    print("Voice service shutting down...", flush=True)