From 186234bae29ca4cb8a6129f548ff5d7288a4acef Mon Sep 17 00:00:00 2001 From: hailin Date: Sun, 1 Mar 2026 18:37:13 -0800 Subject: [PATCH] fix: increase STT silence_duration_ms to prevent choppy transcription Default silence_duration_ms=350 is too aggressive for Chinese speech, causing sentences to be fragmented into 1-3 character chunks. Increase to 800ms and raise VAD threshold to 0.6 so the STT waits longer before finalizing a turn, producing complete sentences for LLM processing. Co-Authored-By: Claude Opus 4.6 --- packages/services/voice-agent/src/agent.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py index b77d0dd..b0b98a1 100644 --- a/packages/services/voice-agent/src/agent.py +++ b/packages/services/voice-agent/src/agent.py @@ -203,6 +203,14 @@ async def entrypoint(ctx: JobContext) -> None: language=settings.whisper_language, client=_oai_client, use_realtime=True, + # Increase silence_duration_ms so Chinese speech isn't chopped + # into tiny fragments (default 350ms is too aggressive). + turn_detection={ + "type": "server_vad", + "threshold": 0.6, + "prefix_padding_ms": 600, + "silence_duration_ms": 800, + }, ) else: stt = LocalWhisperSTT(