fix: load voice models in background thread to unblock startup

Model downloads (Whisper, Kokoro, Silero VAD) are synchronous blocking
calls that prevent uvicorn from completing startup and responding to
healthchecks. Move all model loading to a daemon thread so the server
starts immediately.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-20 00:26:06 -08:00
parent 3702fa3f52
commit a06b489a1e
1 changed file with 50 additions and 24 deletions

View File

@ -1,3 +1,5 @@
import threading
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
@ -23,62 +25,86 @@ app.include_router(session_router, prefix="/api/v1/voice", tags=["sessions"])
# Twilio webhook/streaming endpoints, mounted under /api/v1/twilio
app.include_router(twilio_router, prefix="/api/v1/twilio", tags=["twilio"])
@app.on_event("startup")
async def startup():
"""Load models on startup (graceful — server starts even if models fail)."""
def _load_models_sync():
    """Load all ML models in a background daemon thread.

    Every load here (Whisper STT, Kokoro TTS, Silero VAD) is a blocking
    download/initialization call, which is why this runs off the event
    loop. Each load is independent: on failure the corresponding
    ``app.state`` slot is left as ``None`` and the remaining models are
    still attempted, so the server keeps serving whatever loaded.
    """
    from ..config.settings import settings

    def _p(msg):
        # flush=True so progress shows up promptly in container logs
        print(msg, flush=True)

    _p(f"[bg] Loading models (device={settings.device}, whisper={settings.whisper_model})...")

    # --- STT (faster-whisper) ---
    try:
        from ..stt.whisper_service import WhisperSTTService
        from faster_whisper import WhisperModel

        stt = WhisperSTTService(
            model=settings.whisper_model,
            device=settings.device,
            language=settings.whisper_language,
        )
        # float16 requires CUDA; int8 keeps CPU memory/latency reasonable
        compute_type = "float16" if settings.device == "cuda" else "int8"
        try:
            stt._model = WhisperModel(stt.model_name, device=stt.device, compute_type=compute_type)
        except Exception as e:
            # GPU init can fail (missing CUDA, OOM) — retry on CPU.
            # If we were already on CPU there is no fallback: re-raise so
            # the outer handler sets app.state.stt = None instead of
            # publishing a service object with no model attached.
            if stt.device == "cpu":
                raise
            _p(f"[bg] Whisper fallback to CPU: {e}")
            stt._model = WhisperModel(stt.model_name, device="cpu", compute_type="int8")
        app.state.stt = stt
        _p(f"[bg] STT loaded: {settings.whisper_model}")
    except Exception as e:
        app.state.stt = None
        _p(f"[bg] WARNING: STT failed: {e}")

    # --- TTS (Kokoro) ---
    try:
        from ..tts.kokoro_service import KokoroTTSService
        from kokoro import KPipeline

        tts = KokoroTTSService(model=settings.kokoro_model, voice=settings.kokoro_voice)
        # NOTE(review): lang_code 'z' — presumably Mandarin Chinese; confirm
        # against the Kokoro pipeline docs and settings.kokoro_voice.
        tts._pipeline = KPipeline(lang_code='z')
        app.state.tts = tts
        _p(f"[bg] TTS loaded: {settings.kokoro_model}")
    except Exception as e:
        app.state.tts = None
        _p(f"[bg] WARNING: TTS failed: {e}")

    # --- VAD (Silero, fetched via torch.hub) ---
    try:
        from ..vad.silero_service import SileroVADService
        import torch

        vad = SileroVADService()
        # force_reload=False reuses the torch.hub cache across restarts
        vad._model, vad._utils = torch.hub.load(
            repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False
        )
        app.state.vad = vad
        _p("[bg] VAD loaded: Silero VAD")
    except Exception as e:
        app.state.vad = None
        _p(f"[bg] WARNING: VAD failed: {e}")

    _p("[bg] Model loading complete.")
@app.on_event("startup")
async def startup():
"""Start server immediately, load models in background thread."""
print("Voice service starting up...", flush=True)
app.state.stt = None
app.state.tts = None
app.state.vad = None
# Load models in background thread so server responds to healthchecks immediately
thread = threading.Thread(target=_load_models_sync, daemon=True)
thread.start()
print("Voice service ready (models loading in background).", flush=True)
@app.on_event("shutdown")
async def shutdown():
"""Cleanup on shutdown."""
print("Voice service shutting down...")
print("Voice service shutting down...", flush=True)