From 8f951ad31c0ed6f2a274c9e40c77903f360ffe88 Mon Sep 17 00:00:00 2001 From: hailin Date: Tue, 3 Mar 2026 01:44:10 -0800 Subject: [PATCH] fix: use turn_detection=stt for Speechmatics per official docs Speechmatics handles end-of-utterance natively via its Voice Agent API (ADAPTIVE mode). Use turn_detection="stt" on AgentSession so LiveKit delegates turn boundaries to the STT engine instead of conflicting with its own VAD-based turn detection. Co-Authored-By: Claude Opus 4.6 --- packages/services/voice-agent/src/agent.py | 9 +++++++-- .../services/voice-agent/src/plugins/speechmatics_stt.py | 8 ++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py index a637024..d1af8c7 100644 --- a/packages/services/voice-agent/src/agent.py +++ b/packages/services/voice-agent/src/agent.py @@ -297,13 +297,18 @@ async def entrypoint(ctx: JobContext) -> None: engine_type=engine_type, ) - # Create and start AgentSession with the full pipeline - session = AgentSession( + # Create and start AgentSession with the full pipeline. + # Speechmatics handles end-of-utterance natively via its Voice Agent + # API, so we use turn_detection="stt" to let it drive turn boundaries. + session_kwargs = dict( vad=ctx.proc.userdata["vad"], stt=stt, llm=llm, tts=tts, ) + if stt_provider == "speechmatics": + session_kwargs["turn_detection"] = "stt" + session = AgentSession(**session_kwargs) await session.start( agent=IT0VoiceAgent(), diff --git a/packages/services/voice-agent/src/plugins/speechmatics_stt.py b/packages/services/voice-agent/src/plugins/speechmatics_stt.py index fe500da..82a83fe 100644 --- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py +++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py @@ -9,7 +9,7 @@ by the livekit-plugins-speechmatics package. """ import logging -from livekit.plugins.speechmatics import STT, TurnDetectionMode +from livekit.plugins.speechmatics import STT logger = logging.getLogger(__name__) @@ -39,14 +39,10 @@ def create_speechmatics_stt(language: str = "cmn") -> STT: stt = STT( language=sm_lang, include_partials=True, - # Use EXTERNAL turn detection so LiveKit's own VAD handles turn - # boundaries. ADAPTIVE enables a second client-side Silero VAD inside - # the Speechmatics SDK which conflicts with LiveKit's pipeline. - turn_detection_mode=TurnDetectionMode.EXTERNAL, ) # Workaround: LiveKit's LanguageCode normalizes ISO 639-3 "cmn" back to # ISO 639-1 "zh", but Speechmatics expects "cmn". Override the internal # option after construction so the raw Speechmatics code is sent. stt._stt_options.language = sm_lang # type: ignore[assignment] - logger.info("Speechmatics STT created: language=%s (input=%s), turn_detection=EXTERNAL", sm_lang, language) + logger.info("Speechmatics STT created: language=%s (input=%s)", sm_lang, language) return stt