From 3b0119fe0933ced988ae8dbd33442dc8e4e2c5aa Mon Sep 17 00:00:00 2001 From: hailin Date: Tue, 3 Mar 2026 03:20:12 -0800 Subject: [PATCH] fix: reduce STT latency, add cooldown dedup, enable diarization - Reduce debounce delay from 700ms to 400ms for faster response - Add 1.5s cooldown after emitting FINAL to prevent duplicate triggers that cause LLM abort/retry cycles - Enable speaker diarization (enable_diarization=True) Co-Authored-By: Claude Opus 4.6 --- .../src/plugins/speechmatics_stt.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/packages/services/voice-agent/src/plugins/speechmatics_stt.py b/packages/services/voice-agent/src/plugins/speechmatics_stt.py index dd788f8..c50d32b 100644 --- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py +++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py @@ -9,6 +9,7 @@ by the livekit-plugins-speechmatics package. """ import asyncio import logging +import time from livekit.agents import stt, utils from livekit.plugins.speechmatics import STT, TurnDetectionMode @@ -27,11 +28,11 @@ logger = logging.getLogger(__name__) # # Fix: each partial transcript restarts a debounce timer. When partials # stop arriving (user stops speaking), the timer fires and promotes the -# last partial to FINAL_TRANSCRIPT. The 700ms delay balances latency -# vs. avoiding mid-sentence finals. +# last partial to FINAL_TRANSCRIPT. A cooldown prevents duplicate finals. # --------------------------------------------------------------------------- -_FINALIZE_DELAY = 0.7 # seconds after last partial before emitting FINAL +_FINALIZE_DELAY = 0.4 # seconds after last partial before emitting FINAL +_COOLDOWN = 1.5 # seconds after a FINAL before allowing another _original_handle_partial_segment = SpeechStream._handle_partial_segment @@ -41,11 +42,17 @@ async def _auto_finalize(stream: SpeechStream) -> None: try: await asyncio.sleep(_FINALIZE_DELAY) stored = getattr(stream, "_sm_last_partial_segments", []) - if stored: - text = " | ".join(s.get("text", "") for s in stored) - logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120]) - stream._send_frames(stored, is_final=True) - stream._sm_last_partial_segments = [] # type: ignore[attr-defined] + if not stored: + return + # Cooldown: skip if we recently emitted a FINAL + last_final_ts = getattr(stream, "_sm_last_final_ts", 0.0) + if time.monotonic() - last_final_ts < _COOLDOWN: + return + text = " | ".join(s.get("text", "") for s in stored) + logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120]) + stream._send_frames(stored, is_final=True) + stream._sm_last_partial_segments = [] # type: ignore[attr-defined] + stream._sm_last_final_ts = time.monotonic() # type: ignore[attr-defined] except asyncio.CancelledError: pass @@ -86,17 +93,18 @@ def create_speechmatics_stt(language: str = "cmn") -> STT: 'zh' are automatically mapped to Speechmatics equivalents. Returns: - Configured speechmatics.STT instance. + Configured speechmatics.STT instance with speaker diarization enabled. """ sm_lang = _LANG_MAP.get(language, language) stt = STT( language=sm_lang, include_partials=True, turn_detection_mode=TurnDetectionMode.EXTERNAL, + enable_diarization=True, ) # Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to # ISO 639-1 "zh", but Speechmatics expects "cmn". Override the internal # option after construction so the raw Speechmatics code is sent. stt._stt_options.language = sm_lang # type: ignore[assignment] - logger.info("Speechmatics STT created: language=%s (input=%s)", sm_lang, language) + logger.info("Speechmatics STT created: language=%s (input=%s), diarization=True", sm_lang, language) return stt