From 3b0119fe0933ced988ae8dbd33442dc8e4e2c5aa Mon Sep 17 00:00:00 2001
From: hailin <hailin.zhao@gdzx.xyz>
Date: Tue, 3 Mar 2026 03:20:12 -0800
Subject: [PATCH] fix: reduce STT latency, add cooldown dedup, enable
 diarization

- Reduce debounce delay from 700ms to 400ms for faster response
- Add a 1.5s cooldown after emitting a FINAL: auto-finalize skips
  promotion during the cooldown, preventing duplicate FINALs that cause
  LLM abort/retry cycles
- Enable speaker diarization (enable_diarization=True)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/plugins/speechmatics_stt.py           | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/packages/services/voice-agent/src/plugins/speechmatics_stt.py b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
index dd788f8..c50d32b 100644
--- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py
+++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
@@ -9,6 +9,7 @@ by the livekit-plugins-speechmatics package.
 """
 import asyncio
 import logging
+import time
 
 from livekit.agents import stt, utils
 from livekit.plugins.speechmatics import STT, TurnDetectionMode
@@ -27,11 +28,11 @@ logger = logging.getLogger(__name__)
 #
 # Fix: each partial transcript restarts a debounce timer.  When partials
 # stop arriving (user stops speaking), the timer fires and promotes the
-# last partial to FINAL_TRANSCRIPT.  The 700ms delay balances latency
-# vs. avoiding mid-sentence finals.
+# last partial to FINAL_TRANSCRIPT.  A cooldown prevents duplicate finals.
 # ---------------------------------------------------------------------------
 
-_FINALIZE_DELAY = 0.7  # seconds after last partial before emitting FINAL
+_FINALIZE_DELAY = 0.4  # seconds after last partial before emitting FINAL
+_COOLDOWN = 1.5  # seconds after a FINAL before allowing another
 
 _original_handle_partial_segment = SpeechStream._handle_partial_segment
 
@@ -41,11 +42,17 @@ async def _auto_finalize(stream: SpeechStream) -> None:
     try:
         await asyncio.sleep(_FINALIZE_DELAY)
         stored = getattr(stream, "_sm_last_partial_segments", [])
-        if stored:
-            text = " | ".join(s.get("text", "") for s in stored)
-            logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120])
-            stream._send_frames(stored, is_final=True)
-            stream._sm_last_partial_segments = []  # type: ignore[attr-defined]
+        if not stored:
+            return
+        # Cooldown: skip (leaving stored partials in place) if a FINAL was emitted < _COOLDOWN s ago
+        last_final_ts = getattr(stream, "_sm_last_final_ts", 0.0)
+        if time.monotonic() - last_final_ts < _COOLDOWN:
+            return
+        text = " | ".join(s.get("text", "") for s in stored)
+        logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120])
+        stream._send_frames(stored, is_final=True)
+        stream._sm_last_partial_segments = []  # type: ignore[attr-defined]
+        stream._sm_last_final_ts = time.monotonic()  # type: ignore[attr-defined]
     except asyncio.CancelledError:
         pass
 
@@ -86,17 +93,18 @@ def create_speechmatics_stt(language: str = "cmn") -> STT:
                   'zh' are automatically mapped to Speechmatics equivalents.
 
     Returns:
-        Configured speechmatics.STT instance.
+        Configured speechmatics.STT instance with speaker diarization enabled.
     """
     sm_lang = _LANG_MAP.get(language, language)
     stt = STT(
         language=sm_lang,
         include_partials=True,
         turn_detection_mode=TurnDetectionMode.EXTERNAL,
+        enable_diarization=True,
     )
     # Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to
     # ISO 639-1 "zh", but Speechmatics expects "cmn".  Override the internal
     # option after construction so the raw Speechmatics code is sent.
     stt._stt_options.language = sm_lang  # type: ignore[assignment]
-    logger.info("Speechmatics STT created: language=%s (input=%s)", sm_lang, language)
+    logger.info("Speechmatics STT created: language=%s (input=%s), diarization=True", sm_lang, language)
     return stt