111 lines
4.3 KiB
Python
111 lines
4.3 KiB
Python
"""
|
|
Speechmatics STT factory for voice-agent.
|
|
|
|
Creates a livekit-plugins-speechmatics STT instance configured for
|
|
Mandarin recognition with speaker diarization support.
|
|
|
|
The SPEECHMATICS_API_KEY environment variable is read automatically
|
|
by the livekit-plugins-speechmatics package.
|
|
"""
|
|
import asyncio
|
|
import logging
|
|
import time
|
|
|
|
from livekit.agents import stt, utils
|
|
from livekit.plugins.speechmatics import STT, TurnDetectionMode
|
|
from livekit.plugins.speechmatics.stt import SpeechStream
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Monkey-patch: auto-finalize partial transcripts after silence.
|
|
#
|
|
# In EXTERNAL turn-detection mode, the Speechmatics server never emits
|
|
# AddSegment (FINAL_TRANSCRIPT) on its own. The LiveKit agents framework
|
|
# does NOT call stream.flush() — it only pushes silence audio and waits
|
|
# for FINAL_TRANSCRIPT events. So no FlushSentinel ever reaches the
|
|
# stream's _process_audio loop.
|
|
#
|
|
# Fix: each partial transcript restarts a debounce timer. When partials
|
|
# stop arriving (user stops speaking), the timer fires and promotes the
|
|
# last partial to FINAL_TRANSCRIPT. A cooldown prevents duplicate finals.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Debounce/cooldown tuning for the auto-finalize monkey-patch below.
_FINALIZE_DELAY = 0.4  # seconds after last partial before emitting FINAL
_COOLDOWN = 1.5  # seconds after a FINAL before allowing another

# Keep a reference to the unpatched handler so the patched version can
# delegate to it after stashing the partial segments.
_original_handle_partial_segment = SpeechStream._handle_partial_segment
|
|
|
|
|
|
async def _auto_finalize(stream: SpeechStream) -> None:
    """Debounce body: wait, then promote stored partials to FINAL_TRANSCRIPT.

    Sleeps for ``_FINALIZE_DELAY``; if no newer partial cancels this task in
    the meantime, the stashed partial segments are re-emitted as a FINAL
    transcript — unless a FINAL was already emitted within the last
    ``_COOLDOWN`` seconds (duplicate suppression).

    Args:
        stream: The SpeechStream whose stashed partials should be finalized.
    """
    try:
        await asyncio.sleep(_FINALIZE_DELAY)
    except asyncio.CancelledError:
        # Fix: re-raise instead of swallowing. A newer partial restarted the
        # debounce; propagating CancelledError lets asyncio mark this task as
        # cancelled rather than silently "finished", which is the documented
        # convention and keeps cancellation observable to the event loop.
        raise

    stored = getattr(stream, "_sm_last_partial_segments", [])
    if not stored:
        return

    # Cooldown: skip if we recently emitted a FINAL to avoid duplicates.
    last_final_ts = getattr(stream, "_sm_last_final_ts", 0.0)
    if time.monotonic() - last_final_ts < _COOLDOWN:
        return

    text = " | ".join(s.get("text", "") for s in stored)
    logger.info("[SM] auto-finalize: promoting %d segment(s) to FINAL: %s", len(stored), text[:120])
    # Re-emit the stashed segments through the stream's own frame path,
    # flagged as final this time.
    stream._send_frames(stored, is_final=True)
    stream._sm_last_partial_segments = []  # type: ignore[attr-defined]
    stream._sm_last_final_ts = time.monotonic()  # type: ignore[attr-defined]
|
|
|
|
|
|
def _patched_handle_partial_segment(self: SpeechStream, message: dict) -> None:  # type: ignore[override]
    """Intercept partial segments, stash them, and reset the finalize timer."""
    partials = message.get("segments", [])
    if partials:
        # Remember the newest partials so the debounce task can promote them.
        self._sm_last_partial_segments = partials  # type: ignore[attr-defined]

        # Restart the debounce: cancel any still-pending finalize task, then
        # schedule a fresh one.
        pending = getattr(self, "_sm_finalize_timer", None)
        if pending is not None and not pending.done():
            pending.cancel()
        self._sm_finalize_timer = asyncio.create_task(_auto_finalize(self))  # type: ignore[attr-defined]

    # Always delegate to the original handler so interim events still flow.
    _original_handle_partial_segment(self, message)
|
|
|
|
|
|
SpeechStream._handle_partial_segment = _patched_handle_partial_segment # type: ignore[assignment]
|
|
|
|
# Map Whisper language codes to Speechmatics language codes.
# Only "zh" actually differs (Speechmatics uses ISO 639-3 "cmn" for
# Mandarin); the identity entries are listed for explicitness.
_LANG_MAP = {
    "zh": "cmn",
    "en": "en",
    "ja": "ja",
    "ko": "ko",
    "de": "de",
    "fr": "fr",
}
|
|
|
|
|
|
def create_speechmatics_stt(language: str = "cmn") -> STT:
    """Create a Speechmatics STT instance for the voice pipeline.

    Args:
        language: Language code (Whisper or Speechmatics). Whisper codes like
            'zh' are automatically mapped to Speechmatics equivalents.

    Returns:
        Configured speechmatics.STT instance with speaker diarization enabled.
    """
    sm_lang = _LANG_MAP.get(language, language)
    # Fix: local renamed from `stt` to `sm_stt` — the old name shadowed the
    # `livekit.agents.stt` module imported at the top of this file.
    sm_stt = STT(
        language=sm_lang,
        include_partials=True,
        turn_detection_mode=TurnDetectionMode.EXTERNAL,
        enable_diarization=True,
    )
    # Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to
    # ISO 639-1 "zh", but Speechmatics expects "cmn". Override the internal
    # option after construction so the raw Speechmatics code is sent.
    sm_stt._stt_options.language = sm_lang  # type: ignore[assignment]
    logger.info("Speechmatics STT created: language=%s (input=%s), diarization=True", sm_lang, language)
    return sm_stt
|