From f30aa414dda001e19b915a12322a1b4327756f45 Mon Sep 17 00:00:00 2001
From: hailin <hailin.zhao@gdzx.xyz>
Date: Tue, 3 Mar 2026 04:44:21 -0800
Subject: [PATCH] fix: use SMART_TURN mode per Speechmatics official
 recommendation

Replace the EXTERNAL turn-detection mode and its monkey-patch hack with
SMART_TURN mode. SMART_TURN uses Speechmatics server-side turn detection,
which properly emits AddSegment (FINAL_TRANSCRIPT) when the user finishes
speaking, so no client-side finalization or debounce timer is needed.

Ref: https://docs.speechmatics.com/integrations-and-sdks/livekit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../src/plugins/speechmatics_stt.py           | 72 +------------------
 1 file changed, 2 insertions(+), 70 deletions(-)

diff --git a/packages/services/voice-agent/src/plugins/speechmatics_stt.py b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
index bd5cff3..9804bc6 100644
--- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py
+++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
@@ -7,80 +7,12 @@ Mandarin recognition with speaker diarization support.
 The SPEECHMATICS_API_KEY environment variable is read automatically
 by the livekit-plugins-speechmatics package.
 """
-import asyncio
 import logging
 
-from livekit.agents import stt, utils
 from livekit.plugins.speechmatics import STT, TurnDetectionMode
-from livekit.plugins.speechmatics.stt import SpeechStream
 
 logger = logging.getLogger(__name__)
 
-# ---------------------------------------------------------------------------
-# Monkey-patch: auto-finalize partial transcripts after silence.
-#
-# In EXTERNAL turn-detection mode, the Speechmatics server never emits
-# AddSegment (FINAL_TRANSCRIPT) on its own.  The LiveKit agents framework
-# does NOT call stream.flush() — it only pushes silence audio and waits
-# for FINAL_TRANSCRIPT events.
-#
-# Fix: each partial transcript restarts a debounce timer.  When partials
-# stop arriving (user stops speaking), the timer fires and promotes the
-# last partial to FINAL_TRANSCRIPT.  Text-based deduplication prevents
-# the same transcript from being finalized multiple times (Speechmatics
-# re-sends identical partials during silence).
-# ---------------------------------------------------------------------------
-
-_FINALIZE_DELAY = 0.4  # seconds after last partial before emitting FINAL
-
-_original_handle_partial_segment = SpeechStream._handle_partial_segment
-
-
-def _segments_text(segments: list) -> str:
-    """Extract combined text from segment dicts."""
-    return " | ".join(s.get("text", "") for s in segments)
-
-
-async def _auto_finalize(stream: SpeechStream) -> None:
-    """Wait, then promote stored partials to FINAL_TRANSCRIPT."""
-    try:
-        await asyncio.sleep(_FINALIZE_DELAY)
-        stored = getattr(stream, "_sm_last_partial_segments", [])
-        if not stored:
-            return
-        # Text dedup: skip if this exact text was already finalized
-        text = _segments_text(stored)
-        last_final_text = getattr(stream, "_sm_last_final_text", "")
-        if text == last_final_text:
-            return
-        logger.info("[SM] auto-finalize → FINAL: %s", text[:120])
-        stream._send_frames(stored, is_final=True)
-        stream._sm_last_partial_segments = []  # type: ignore[attr-defined]
-        stream._sm_last_final_text = text  # type: ignore[attr-defined]
-    except asyncio.CancelledError:
-        pass
-
-
-def _patched_handle_partial_segment(self: SpeechStream, message: dict) -> None:  # type: ignore[override]
-    """Intercept partial segments, stash them, and reset the finalize timer."""
-    segments = message.get("segments", [])
-    if segments:
-        self._sm_last_partial_segments = segments  # type: ignore[attr-defined]
-
-        # Only start a timer if text actually changed from last finalized
-        text = _segments_text(segments)
-        last_final_text = getattr(self, "_sm_last_final_text", "")
-        if text != last_final_text:
-            timer = getattr(self, "_sm_finalize_timer", None)
-            if timer and not timer.done():
-                timer.cancel()
-            self._sm_finalize_timer = asyncio.create_task(_auto_finalize(self))  # type: ignore[attr-defined]
-
-    _original_handle_partial_segment(self, message)
-
-
-SpeechStream._handle_partial_segment = _patched_handle_partial_segment  # type: ignore[assignment]
-
 # Map Whisper language codes to Speechmatics language codes
 _LANG_MAP = {
     "zh": "cmn",
@@ -106,12 +38,12 @@ def create_speechmatics_stt(language: str = "cmn") -> STT:
     stt = STT(
         language=sm_lang,
         include_partials=True,
-        turn_detection_mode=TurnDetectionMode.EXTERNAL,
+        turn_detection_mode=TurnDetectionMode.SMART_TURN,
         enable_diarization=True,
     )
     # Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to
     # ISO 639-1 "zh", but Speechmatics expects "cmn".  Override the internal
     # option after construction so the raw Speechmatics code is sent.
     stt._stt_options.language = sm_lang  # type: ignore[assignment]
-    logger.info("Speechmatics STT created: language=%s (input=%s), diarization=True", sm_lang, language)
+    logger.info("Speechmatics STT created: language=%s (input=%s), mode=SMART_TURN, diarization=True", sm_lang, language)
     return stt