From ba83e433d379a13990457a9c6acd95a77b1cceb8 Mon Sep 17 00:00:00 2001
From: hailin
Date: Sun, 1 Mar 2026 07:49:25 -0800
Subject: [PATCH] feat: enable OpenAI Realtime STT for streaming speech recognition

Switch from batch STT (gpt-4o-transcribe via /audio/transcriptions) to
streaming Realtime API (WebSocket). This eliminates the ~2s batch
upload+process latency per utterance.

Also updated nginx proxy on 67.223.119.33 to support WebSocket upgrade
for /v1/realtime endpoint.

Co-Authored-By: Claude Opus 4.6
---
 packages/services/voice-agent/src/agent.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py
index 3f04de2..e2e9351 100644
--- a/packages/services/voice-agent/src/agent.py
+++ b/packages/services/voice-agent/src/agent.py
@@ -148,6 +148,7 @@ async def entrypoint(ctx: JobContext) -> None:
             model=settings.openai_stt_model,
             language=settings.whisper_language,
             client=_oai_client,
+            use_realtime=True,
         )
     else:
         stt = LocalWhisperSTT(