From 186234bae29ca4cb8a6129f548ff5d7288a4acef Mon Sep 17 00:00:00 2001
From: hailin <hailin.zhao@gdzx.xyz>
Date: Sun, 1 Mar 2026 18:37:13 -0800
Subject: [PATCH] fix: increase STT silence_duration_ms to prevent choppy
 transcription

The default silence_duration_ms=350 is too aggressive for Chinese speech,
causing sentences to be fragmented into 1-3 character chunks. Pass an
explicit server_vad turn_detection config to the realtime STT: increase
silence_duration_ms to 800, raise the VAD threshold to 0.6, and set
prefix_padding_ms to 600, so the STT waits longer before finalizing a
turn and produces complete sentences for LLM processing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 packages/services/voice-agent/src/agent.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py
index b77d0dd..b0b98a1 100644
--- a/packages/services/voice-agent/src/agent.py
+++ b/packages/services/voice-agent/src/agent.py
@@ -203,6 +203,14 @@ async def entrypoint(ctx: JobContext) -> None:
             language=settings.whisper_language,
             client=_oai_client,
             use_realtime=True,
+            # Increase silence_duration_ms so Chinese speech isn't chopped
+            # into tiny fragments (default 350ms is too aggressive).
+            turn_detection={
+                "type": "server_vad",
+                "threshold": 0.6,
+                "prefix_padding_ms": 600,
+                "silence_duration_ms": 800,
+            },
         )
     else:
         stt = LocalWhisperSTT(