fix: reduce STT latency, add cooldown dedup, enable diarization

- Reduce debounce delay from 700ms to 400ms for faster response
- Add 1.5s cooldown after emitting FINAL to prevent duplicate triggers
  that cause LLM abort/retry cycles
- Enable speaker diarization (enable_diarization=True)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-03 03:20:12 -08:00
parent 8ac1884ab4
commit 3b0119fe09
1 changed file with 18 additions and 10 deletions

View File

@@ -9,6 +9,7 @@ by the livekit-plugins-speechmatics package.
""" """
import asyncio import asyncio
import logging import logging
import time
from livekit.agents import stt, utils from livekit.agents import stt, utils
from livekit.plugins.speechmatics import STT, TurnDetectionMode from livekit.plugins.speechmatics import STT, TurnDetectionMode
@@ -27,11 +28,11 @@ logger = logging.getLogger(__name__)
# #
# Fix: each partial transcript restarts a debounce timer. When partials # Fix: each partial transcript restarts a debounce timer. When partials
# stop arriving (user stops speaking), the timer fires and promotes the # stop arriving (user stops speaking), the timer fires and promotes the
# last partial to FINAL_TRANSCRIPT. The 700ms delay balances latency # last partial to FINAL_TRANSCRIPT. A cooldown prevents duplicate finals.
# vs. avoiding mid-sentence finals.
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
_FINALIZE_DELAY = 0.7 # seconds after last partial before emitting FINAL _FINALIZE_DELAY = 0.4 # seconds after last partial before emitting FINAL
_COOLDOWN = 1.5 # seconds after a FINAL before allowing another
_original_handle_partial_segment = SpeechStream._handle_partial_segment _original_handle_partial_segment = SpeechStream._handle_partial_segment
@@ -41,11 +42,17 @@ async def _auto_finalize(stream: SpeechStream) -> None:
try: try:
await asyncio.sleep(_FINALIZE_DELAY) await asyncio.sleep(_FINALIZE_DELAY)
stored = getattr(stream, "_sm_last_partial_segments", []) stored = getattr(stream, "_sm_last_partial_segments", [])
if stored: if not stored:
text = " | ".join(s.get("text", "") for s in stored) return
logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120]) # Cooldown: skip if we recently emitted a FINAL
stream._send_frames(stored, is_final=True) last_final_ts = getattr(stream, "_sm_last_final_ts", 0.0)
stream._sm_last_partial_segments = [] # type: ignore[attr-defined] if time.monotonic() - last_final_ts < _COOLDOWN:
return
text = " | ".join(s.get("text", "") for s in stored)
logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120])
stream._send_frames(stored, is_final=True)
stream._sm_last_partial_segments = [] # type: ignore[attr-defined]
stream._sm_last_final_ts = time.monotonic() # type: ignore[attr-defined]
except asyncio.CancelledError: except asyncio.CancelledError:
pass pass
@@ -86,17 +93,18 @@ def create_speechmatics_stt(language: str = "cmn") -> STT:
'zh' are automatically mapped to Speechmatics equivalents. 'zh' are automatically mapped to Speechmatics equivalents.
Returns: Returns:
Configured speechmatics.STT instance. Configured speechmatics.STT instance with speaker diarization enabled.
""" """
sm_lang = _LANG_MAP.get(language, language) sm_lang = _LANG_MAP.get(language, language)
stt = STT( stt = STT(
language=sm_lang, language=sm_lang,
include_partials=True, include_partials=True,
turn_detection_mode=TurnDetectionMode.EXTERNAL, turn_detection_mode=TurnDetectionMode.EXTERNAL,
enable_diarization=True,
) )
# Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to # Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to
# ISO 639-1 "zh", but Speechmatics expects "cmn". Override the internal # ISO 639-1 "zh", but Speechmatics expects "cmn". Override the internal
# option after construction so the raw Speechmatics code is sent. # option after construction so the raw Speechmatics code is sent.
stt._stt_options.language = sm_lang # type: ignore[assignment] stt._stt_options.language = sm_lang # type: ignore[assignment]
logger.info("Speechmatics STT created: language=%s (input=%s)", sm_lang, language) logger.info("Speechmatics STT created: language=%s (input=%s), diarization=True", sm_lang, language)
return stt return stt