fix: load voice models in background thread to unblock startup

Model downloads (Whisper, Kokoro, Silero VAD) are synchronous blocking
calls that prevent uvicorn from completing startup and responding to
healthchecks. Move all model loading to a daemon thread so the server
starts immediately.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-20 00:26:06 -08:00
parent 3702fa3f52
commit a06b489a1e
1 changed file with 50 additions and 24 deletions

View File

@ -1,3 +1,5 @@
import threading
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
@ -23,62 +25,86 @@ app.include_router(session_router, prefix="/api/v1/voice", tags=["sessions"])
# Twilio webhook/streaming endpoints, mounted under /api/v1/twilio
app.include_router(twilio_router, prefix="/api/v1/twilio", tags=["twilio"])
@app.on_event("startup")
async def startup():
"""Load models on startup (graceful — server starts even if models fail)."""
def _load_models_sync():
    """Load all ML models in a background daemon thread.

    Every load here (Whisper STT, Kokoro TTS, Silero VAD) is a blocking
    download/initialization call, which is why this runs off the event
    loop. Each load is independent: on failure the corresponding
    ``app.state`` slot is left as ``None`` and the remaining models are
    still attempted, so the server keeps serving whatever loaded.
    """
    from ..config.settings import settings

    def _p(msg):
        # flush=True so progress shows up promptly in container logs
        print(msg, flush=True)

    _p(f"[bg] Loading models (device={settings.device}, whisper={settings.whisper_model})...")

    # --- STT (faster-whisper) ---
    try:
        from ..stt.whisper_service import WhisperSTTService
        from faster_whisper import WhisperModel

        stt = WhisperSTTService(
            model=settings.whisper_model,
            device=settings.device,
            language=settings.whisper_language,
        )
        # float16 requires CUDA; int8 keeps CPU memory/latency reasonable
        compute_type = "float16" if settings.device == "cuda" else "int8"
        try:
            stt._model = WhisperModel(stt.model_name, device=stt.device, compute_type=compute_type)
        except Exception as e:
            # GPU init can fail (missing CUDA, OOM) — retry on CPU.
            # If we were already on CPU there is no fallback: re-raise so
            # the outer handler sets app.state.stt = None instead of
            # publishing a service object with no model attached.
            if stt.device == "cpu":
                raise
            _p(f"[bg] Whisper fallback to CPU: {e}")
            stt._model = WhisperModel(stt.model_name, device="cpu", compute_type="int8")
        app.state.stt = stt
        _p(f"[bg] STT loaded: {settings.whisper_model}")
    except Exception as e:
        app.state.stt = None
        _p(f"[bg] WARNING: STT failed: {e}")

    # --- TTS (Kokoro) ---
    try:
        from ..tts.kokoro_service import KokoroTTSService
        from kokoro import KPipeline

        tts = KokoroTTSService(model=settings.kokoro_model, voice=settings.kokoro_voice)
        # NOTE(review): lang_code 'z' — presumably Mandarin Chinese; confirm
        # against the Kokoro pipeline docs and settings.kokoro_voice.
        tts._pipeline = KPipeline(lang_code='z')
        app.state.tts = tts
        _p(f"[bg] TTS loaded: {settings.kokoro_model}")
    except Exception as e:
        app.state.tts = None
        _p(f"[bg] WARNING: TTS failed: {e}")

    # --- VAD (Silero, fetched via torch.hub) ---
    try:
        from ..vad.silero_service import SileroVADService
        import torch

        vad = SileroVADService()
        # force_reload=False reuses the torch.hub cache across restarts
        vad._model, vad._utils = torch.hub.load(
            repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False
        )
        app.state.vad = vad
        _p("[bg] VAD loaded: Silero VAD")
    except Exception as e:
        app.state.vad = None
        _p(f"[bg] WARNING: VAD failed: {e}")

    _p("[bg] Model loading complete.")
@app.on_event("startup")
async def startup():
"""Start server immediately, load models in background thread."""
print("Voice service starting up...", flush=True)
app.state.stt = None
app.state.tts = None
app.state.vad = None
# Load models in background thread so server responds to healthchecks immediately
thread = threading.Thread(target=_load_models_sync, daemon=True)
thread.start()
print("Voice service ready (models loading in background).", flush=True)
@app.on_event("shutdown")
async def shutdown():
"""Cleanup on shutdown."""
print("Voice service shutting down...")
print("Voice service shutting down...", flush=True)