From f30aa414dda001e19b915a12322a1b4327756f45 Mon Sep 17 00:00:00 2001 From: hailin Date: Tue, 3 Mar 2026 04:44:21 -0800 Subject: [PATCH] fix: use SMART_TURN mode per Speechmatics official recommendation Replace EXTERNAL mode + monkey-patch hack with SMART_TURN mode. SMART_TURN uses Speechmatics server-side turn detection that properly emits AddSegment (FINAL_TRANSCRIPT) when the user finishes speaking. No client-side finalize or debounce timer needed. Ref: https://docs.speechmatics.com/integrations-and-sdks/livekit Co-Authored-By: Claude Opus 4.6 --- .../src/plugins/speechmatics_stt.py | 72 +------------------ 1 file changed, 2 insertions(+), 70 deletions(-) diff --git a/packages/services/voice-agent/src/plugins/speechmatics_stt.py b/packages/services/voice-agent/src/plugins/speechmatics_stt.py index bd5cff3..9804bc6 100644 --- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py +++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py @@ -7,80 +7,12 @@ Mandarin recognition with speaker diarization support. The SPEECHMATICS_API_KEY environment variable is read automatically by the livekit-plugins-speechmatics package. """ -import asyncio import logging -from livekit.agents import stt, utils from livekit.plugins.speechmatics import STT, TurnDetectionMode -from livekit.plugins.speechmatics.stt import SpeechStream logger = logging.getLogger(__name__) -# --------------------------------------------------------------------------- -# Monkey-patch: auto-finalize partial transcripts after silence. -# -# In EXTERNAL turn-detection mode, the Speechmatics server never emits -# AddSegment (FINAL_TRANSCRIPT) on its own. The LiveKit agents framework -# does NOT call stream.flush() — it only pushes silence audio and waits -# for FINAL_TRANSCRIPT events. -# -# Fix: each partial transcript restarts a debounce timer. When partials -# stop arriving (user stops speaking), the timer fires and promotes the -# last partial to FINAL_TRANSCRIPT. Text-based deduplication prevents -# the same transcript from being finalized multiple times (Speechmatics -# re-sends identical partials during silence). -# --------------------------------------------------------------------------- - -_FINALIZE_DELAY = 0.4 # seconds after last partial before emitting FINAL - -_original_handle_partial_segment = SpeechStream._handle_partial_segment - - -def _segments_text(segments: list) -> str: - """Extract combined text from segment dicts.""" - return " | ".join(s.get("text", "") for s in segments) - - -async def _auto_finalize(stream: SpeechStream) -> None: - """Wait, then promote stored partials to FINAL_TRANSCRIPT.""" - try: - await asyncio.sleep(_FINALIZE_DELAY) - stored = getattr(stream, "_sm_last_partial_segments", []) - if not stored: - return - # Text dedup: skip if this exact text was already finalized - text = _segments_text(stored) - last_final_text = getattr(stream, "_sm_last_final_text", "") - if text == last_final_text: - return - logger.info("[SM] auto-finalize → FINAL: %s", text[:120]) - stream._send_frames(stored, is_final=True) - stream._sm_last_partial_segments = [] # type: ignore[attr-defined] - stream._sm_last_final_text = text # type: ignore[attr-defined] - except asyncio.CancelledError: - pass - - -def _patched_handle_partial_segment(self: SpeechStream, message: dict) -> None: # type: ignore[override] - """Intercept partial segments, stash them, and reset the finalize timer.""" - segments = message.get("segments", []) - if segments: - self._sm_last_partial_segments = segments # type: ignore[attr-defined] - - # Only start a timer if text actually changed from last finalized - text = _segments_text(segments) - last_final_text = getattr(self, "_sm_last_final_text", "") - if text != last_final_text: - timer = getattr(self, "_sm_finalize_timer", None) - if timer and not timer.done(): - timer.cancel() - self._sm_finalize_timer = asyncio.create_task(_auto_finalize(self)) # type: ignore[attr-defined] - - _original_handle_partial_segment(self, message) - - -SpeechStream._handle_partial_segment = _patched_handle_partial_segment # type: ignore[assignment] - # Map Whisper language codes to Speechmatics language codes _LANG_MAP = { "zh": "cmn", @@ -106,12 +38,12 @@ def create_speechmatics_stt(language: str = "cmn") -> STT: stt = STT( language=sm_lang, include_partials=True, - turn_detection_mode=TurnDetectionMode.EXTERNAL, + turn_detection_mode=TurnDetectionMode.SMART_TURN, enable_diarization=True, ) # Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to # ISO 639-1 "zh", but Speechmatics expects "cmn". Override the internal # option after construction so the raw Speechmatics code is sent. stt._stt_options.language = sm_lang # type: ignore[assignment] - logger.info("Speechmatics STT created: language=%s (input=%s), diarization=True", sm_lang, language) + logger.info("Speechmatics STT created: language=%s (input=%s), mode=SMART_TURN, diarization=True", sm_lang, language) return stt