fix: reduce STT latency, add cooldown dedup, enable diarization

- Reduce debounce delay from 700ms to 400ms for faster response
- Add 1.5s cooldown after emitting FINAL to prevent duplicate triggers that cause LLM abort/retry cycles
- Enable speaker diarization (enable_diarization=True)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-03 03:20:12 -08:00 · 2026-03-03 03:20:12 -08:00 · 3b0119fe09
parent 8ac1884ab4
commit 3b0119fe09
1 changed file with 18 additions and 10 deletions
--- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py
+++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
@@ -9,6 +9,7 @@ by the livekit-plugins-speechmatics package.
 """
 import asyncio
 import logging
 import time
 from livekit.agents import stt, utils
 from livekit.plugins.speechmatics import STT, TurnDetectionMode
@@ -27,11 +28,11 @@ logger = logging.getLogger(__name__)
 #
 # Fix: each partial transcript restarts a debounce timer.  When partials
 # stop arriving (user stops speaking), the timer fires and promotes the
-# last partial to FINAL_TRANSCRIPT.  The 700ms delay balances latency
+# last partial to FINAL_TRANSCRIPT.  A cooldown prevents duplicate finals; the delay balances latency
 # vs. avoiding mid-sentence finals.
 # ---------------------------------------------------------------------------
-_FINALIZE_DELAY = 0.7  # seconds after last partial before emitting FINAL
+_FINALIZE_DELAY = 0.4  # seconds after last partial before emitting FINAL
 _COOLDOWN = 1.5  # seconds after a FINAL before allowing another
 _original_handle_partial_segment = SpeechStream._handle_partial_segment
@@ -41,11 +42,17 @@ async def _auto_finalize(stream: SpeechStream) -> None:
    try:
        await asyncio.sleep(_FINALIZE_DELAY)
        stored = getattr(stream, "_sm_last_partial_segments", [])
-        if stored:
+        if not stored:
-            text = " | ".join(s.get("text", "") for s in stored)
+            return
-            logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120])
+        # Cooldown: skip if we recently emitted a FINAL
-            stream._send_frames(stored, is_final=True)
+        last_final_ts = getattr(stream, "_sm_last_final_ts", 0.0)
-            stream._sm_last_partial_segments = []  # type: ignore[attr-defined]
+        if time.monotonic() - last_final_ts < _COOLDOWN:
            return
        text = " | ".join(s.get("text", "") for s in stored)
        logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120])
        stream._send_frames(stored, is_final=True)
        stream._sm_last_partial_segments = []  # type: ignore[attr-defined]
        stream._sm_last_final_ts = time.monotonic()  # type: ignore[attr-defined]
    except asyncio.CancelledError:
        pass
@@ -86,17 +93,18 @@ def create_speechmatics_stt(language: str = "cmn") -> STT:
                  'zh' are automatically mapped to Speechmatics equivalents.
    Returns:
-        Configured speechmatics.STT instance.
+        Configured speechmatics.STT instance with speaker diarization enabled.
    """
    sm_lang = _LANG_MAP.get(language, language)
    stt = STT(
        language=sm_lang,
        include_partials=True,
        turn_detection_mode=TurnDetectionMode.EXTERNAL,
        enable_diarization=True,
    )
    # Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to
    # ISO 639-1 "zh", but Speechmatics expects "cmn".  Override the internal
    # option after construction so the raw Speechmatics code is sent.
    stt._stt_options.language = sm_lang  # type: ignore[assignment]
-    logger.info("Speechmatics STT created: language=%s (input=%s)", sm_lang, language)
+    logger.info("Speechmatics STT created: language=%s (input=%s), diarization=True", sm_lang, language)
    return stt