fix: reduce STT latency, add cooldown dedup, enable diarization
- Reduce debounce delay from 700ms to 400ms for faster response
- Add 1.5s cooldown after emitting FINAL to prevent duplicate triggers that cause LLM abort/retry cycles
- Enable speaker diarization (enable_diarization=True)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
8ac1884ab4
commit
3b0119fe09
|
|
@ -9,6 +9,7 @@ by the livekit-plugins-speechmatics package.
|
|||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
|
||||
from livekit.agents import stt, utils
|
||||
from livekit.plugins.speechmatics import STT, TurnDetectionMode
|
||||
|
|
@ -27,11 +28,11 @@ logger = logging.getLogger(__name__)
|
|||
#
|
||||
# Fix: each partial transcript restarts a debounce timer. When partials
|
||||
# stop arriving (user stops speaking), the timer fires and promotes the
|
||||
# last partial to FINAL_TRANSCRIPT. The 700ms delay balances latency
|
||||
# vs. avoiding mid-sentence finals.
|
||||
# last partial to FINAL_TRANSCRIPT. A cooldown prevents duplicate finals.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_FINALIZE_DELAY = 0.7 # seconds after last partial before emitting FINAL
|
||||
_FINALIZE_DELAY = 0.4 # seconds after last partial before emitting FINAL
|
||||
_COOLDOWN = 1.5 # seconds after a FINAL before allowing another
|
||||
|
||||
_original_handle_partial_segment = SpeechStream._handle_partial_segment
|
||||
|
||||
|
|
@ -41,11 +42,17 @@ async def _auto_finalize(stream: SpeechStream) -> None:
|
|||
try:
|
||||
await asyncio.sleep(_FINALIZE_DELAY)
|
||||
stored = getattr(stream, "_sm_last_partial_segments", [])
|
||||
if stored:
|
||||
text = " | ".join(s.get("text", "") for s in stored)
|
||||
logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120])
|
||||
stream._send_frames(stored, is_final=True)
|
||||
stream._sm_last_partial_segments = [] # type: ignore[attr-defined]
|
||||
if not stored:
|
||||
return
|
||||
# Cooldown: skip if we recently emitted a FINAL
|
||||
last_final_ts = getattr(stream, "_sm_last_final_ts", 0.0)
|
||||
if time.monotonic() - last_final_ts < _COOLDOWN:
|
||||
return
|
||||
text = " | ".join(s.get("text", "") for s in stored)
|
||||
logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120])
|
||||
stream._send_frames(stored, is_final=True)
|
||||
stream._sm_last_partial_segments = [] # type: ignore[attr-defined]
|
||||
stream._sm_last_final_ts = time.monotonic() # type: ignore[attr-defined]
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
|
|
@ -86,17 +93,18 @@ def create_speechmatics_stt(language: str = "cmn") -> STT:
|
|||
'zh' are automatically mapped to Speechmatics equivalents.
|
||||
|
||||
Returns:
|
||||
Configured speechmatics.STT instance.
|
||||
Configured speechmatics.STT instance with speaker diarization enabled.
|
||||
"""
|
||||
sm_lang = _LANG_MAP.get(language, language)
|
||||
stt = STT(
|
||||
language=sm_lang,
|
||||
include_partials=True,
|
||||
turn_detection_mode=TurnDetectionMode.EXTERNAL,
|
||||
enable_diarization=True,
|
||||
)
|
||||
# Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to
|
||||
# ISO 639-1 "zh", but Speechmatics expects "cmn". Override the internal
|
||||
# option after construction so the raw Speechmatics code is sent.
|
||||
stt._stt_options.language = sm_lang # type: ignore[assignment]
|
||||
logger.info("Speechmatics STT created: language=%s (input=%s)", sm_lang, language)
|
||||
logger.info("Speechmatics STT created: language=%s (input=%s), diarization=True", sm_lang, language)
|
||||
return stt
|
||||
|
|
|
|||
Loading…
Reference in New Issue