fix: use SMART_TURN mode per Speechmatics official recommendation
Replace EXTERNAL mode + monkey-patch hack with SMART_TURN mode.

SMART_TURN uses Speechmatics server-side turn detection, which properly emits AddSegment (FINAL_TRANSCRIPT) when the user finishes speaking, so no client-side finalize or debounce timer is needed.

Ref: https://docs.speechmatics.com/integrations-and-sdks/livekit

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
de99990c4d
commit
f30aa414dd
|
|
@ -7,80 +7,12 @@ Mandarin recognition with speaker diarization support.
|
|||
The SPEECHMATICS_API_KEY environment variable is read automatically
|
||||
by the livekit-plugins-speechmatics package.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from livekit.agents import stt, utils
|
||||
from livekit.plugins.speechmatics import STT, TurnDetectionMode
|
||||
from livekit.plugins.speechmatics.stt import SpeechStream
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Monkey-patch: auto-finalize partial transcripts after silence.
|
||||
#
|
||||
# In EXTERNAL turn-detection mode, the Speechmatics server never emits
|
||||
# AddSegment (FINAL_TRANSCRIPT) on its own. The LiveKit agents framework
|
||||
# does NOT call stream.flush() — it only pushes silence audio and waits
|
||||
# for FINAL_TRANSCRIPT events.
|
||||
#
|
||||
# Fix: each partial transcript restarts a debounce timer. When partials
|
||||
# stop arriving (user stops speaking), the timer fires and promotes the
|
||||
# last partial to FINAL_TRANSCRIPT. Text-based deduplication prevents
|
||||
# the same transcript from being finalized multiple times (Speechmatics
|
||||
# re-sends identical partials during silence).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Debounce window in seconds: _auto_finalize sleeps this long after the most
# recent partial before promoting it to a final transcript.
_FINALIZE_DELAY = 0.4 # seconds after last partial before emitting FINAL
|
||||
|
||||
# Keep a reference to the unpatched method so the patched handler can still
# delegate to it after doing its own bookkeeping.
_original_handle_partial_segment = SpeechStream._handle_partial_segment
|
||||
|
||||
|
||||
def _segments_text(segments: list) -> str:
|
||||
"""Extract combined text from segment dicts."""
|
||||
return " | ".join(s.get("text", "") for s in segments)
|
||||
|
||||
|
||||
async def _auto_finalize(stream: SpeechStream) -> None:
    """Debounce worker: after a quiet period, emit the stashed partials as final.

    Sleeps for ``_FINALIZE_DELAY`` seconds, then reads the segments stashed on
    ``stream._sm_last_partial_segments``. If there are any, and their joined
    text differs from ``stream._sm_last_final_text``, they are sent through
    ``stream._send_frames(..., is_final=True)``; the stash is then cleared and
    the text is recorded as the last finalized one.

    Cancellation (the timer being replaced by a newer one) is swallowed so the
    task simply ends.
    """
    try:
        await asyncio.sleep(_FINALIZE_DELAY)
        pending = getattr(stream, "_sm_last_partial_segments", [])
        if pending:
            pending_text = _segments_text(pending)
            # Text dedup: only finalize when this text has not been finalized yet.
            if pending_text != getattr(stream, "_sm_last_final_text", ""):
                logger.info("[SM] auto-finalize → FINAL: %s", pending_text[:120])
                stream._send_frames(pending, is_final=True)
                stream._sm_last_partial_segments = []  # type: ignore[attr-defined]
                stream._sm_last_final_text = pending_text  # type: ignore[attr-defined]
    except asyncio.CancelledError:
        pass
|
||||
|
||||
|
||||
def _patched_handle_partial_segment(self: SpeechStream, message: dict) -> None:  # type: ignore[override]
    """Stash incoming partial segments and restart the finalize timer.

    When ``message`` carries a non-empty ``"segments"`` list, the list is
    stored on ``self._sm_last_partial_segments``. If its joined text differs
    from ``self._sm_last_final_text``, any still-running finalize task is
    cancelled and a new ``_auto_finalize`` task is scheduled. The original
    handler is always invoked afterwards with the same message.
    """
    partials = message.get("segments", [])
    if partials:
        self._sm_last_partial_segments = partials  # type: ignore[attr-defined]

        # Only (re)start the timer when the text differs from the last finalized one.
        text_changed = _segments_text(partials) != getattr(self, "_sm_last_final_text", "")
        if text_changed:
            running = getattr(self, "_sm_finalize_timer", None)
            if running and not running.done():
                running.cancel()
            self._sm_finalize_timer = asyncio.create_task(_auto_finalize(self))  # type: ignore[attr-defined]

    _original_handle_partial_segment(self, message)
|
||||
|
||||
|
||||
# Install the wrapper on the class so partial-segment handling goes through
# _patched_handle_partial_segment.
SpeechStream._handle_partial_segment = _patched_handle_partial_segment # type: ignore[assignment]
|
||||
|
||||
# Map Whisper language codes to Speechmatics language codes
|
||||
_LANG_MAP = {
|
||||
"zh": "cmn",
|
||||
|
|
@ -106,12 +38,12 @@ def create_speechmatics_stt(language: str = "cmn") -> STT:
|
|||
stt = STT(
|
||||
language=sm_lang,
|
||||
include_partials=True,
|
||||
turn_detection_mode=TurnDetectionMode.EXTERNAL,
|
||||
turn_detection_mode=TurnDetectionMode.SMART_TURN,
|
||||
enable_diarization=True,
|
||||
)
|
||||
# Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to
|
||||
# ISO 639-1 "zh", but Speechmatics expects "cmn". Override the internal
|
||||
# option after construction so the raw Speechmatics code is sent.
|
||||
stt._stt_options.language = sm_lang # type: ignore[assignment]
|
||||
logger.info("Speechmatics STT created: language=%s (input=%s), diarization=True", sm_lang, language)
|
||||
logger.info("Speechmatics STT created: language=%s (input=%s), mode=SMART_TURN, diarization=True", sm_lang, language)
|
||||
return stt
|
||||
|
|
|
|||
Loading…
Reference in New Issue