From 7fb0d1de95fc5b6ec791fe1777452b39488384d9 Mon Sep 17 00:00:00 2001 From: hailin Date: Tue, 3 Mar 2026 04:58:38 -0800 Subject: [PATCH] refactor: remove Speechmatics STT integration entirely, default to OpenAI - Delete speechmatics_stt.py plugin - Remove speechmatics branch from voice-agent entrypoint - Remove livekit-plugins-speechmatics dependency - Change default stt_provider to 'openai' in entity, controller, and UI - Remove SPEECHMATICS_API_KEY from docker-compose.yml - Remove speechmatics option from web-admin settings dropdown Co-Authored-By: Claude Opus 4.6 --- deploy/docker/docker-compose.yml | 1 - .../src/app/(admin)/settings/page.tsx | 5 +- .../src/i18n/locales/en/settings.json | 1 - .../src/i18n/locales/zh/settings.json | 1 - .../domain/entities/voice-config.entity.ts | 4 +- .../controllers/voice-config.controller.ts | 2 +- .../services/voice-agent/requirements.txt | 1 - packages/services/voice-agent/src/agent.py | 3 - .../src/plugins/speechmatics_stt.py | 101 ------------------ 9 files changed, 5 insertions(+), 114 deletions(-) delete mode 100644 packages/services/voice-agent/src/plugins/speechmatics_stt.py diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml index 6462b41..1a69bc7 100644 --- a/deploy/docker/docker-compose.yml +++ b/deploy/docker/docker-compose.yml @@ -354,7 +354,6 @@ services: - OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe} - OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts} - OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral} - - SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-} depends_on: livekit-server: condition: service_healthy diff --git a/it0-web-admin/src/app/(admin)/settings/page.tsx b/it0-web-admin/src/app/(admin)/settings/page.tsx index a8d3cc4..41c9c67 100644 --- a/it0-web-admin/src/app/(admin)/settings/page.tsx +++ b/it0-web-admin/src/app/(admin)/settings/page.tsx @@ -796,7 +796,6 @@ interface VoiceSettings { } const STT_PROVIDERS = [ - { value: 'speechmatics', labelKey: 'voice.providers.speechmatics' }, { value: 'openai', labelKey: 'voice.providers.openai' }, ]; @@ -810,11 +809,11 @@ function VoiceSection() { queryFn: () => apiClient('/api/v1/agent/voice-config'), }); - const [sttProvider, setSttProvider] = useState('speechmatics'); + const [sttProvider, setSttProvider] = useState('openai'); const [initialized, setInitialized] = useState(false); if (data && !initialized) { - setSttProvider(data.stt_provider || 'speechmatics'); + setSttProvider(data.stt_provider || 'openai'); setInitialized(true); } diff --git a/it0-web-admin/src/i18n/locales/en/settings.json b/it0-web-admin/src/i18n/locales/en/settings.json index 5be604f..ce188ce 100644 --- a/it0-web-admin/src/i18n/locales/en/settings.json +++ b/it0-web-admin/src/i18n/locales/en/settings.json @@ -74,7 +74,6 @@ "sttProvider": "Speech-to-Text Provider", "sttProviderHint": "Choose the speech recognition engine for voice sessions.", "providers": { - "speechmatics": "Speechmatics (Default)", "openai": "OpenAI (gpt-4o-transcribe)" }, "saved": "Voice settings saved." diff --git a/it0-web-admin/src/i18n/locales/zh/settings.json b/it0-web-admin/src/i18n/locales/zh/settings.json index cdbed38..15546b2 100644 --- a/it0-web-admin/src/i18n/locales/zh/settings.json +++ b/it0-web-admin/src/i18n/locales/zh/settings.json @@ -74,7 +74,6 @@ "sttProvider": "语音转文字引擎", "sttProviderHint": "选择语音通话时使用的语音识别引擎。", "providers": { - "speechmatics": "Speechmatics(默认)", "openai": "OpenAI (gpt-4o-transcribe)" }, "saved": "语音设置已保存。" diff --git a/packages/services/agent-service/src/domain/entities/voice-config.entity.ts b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts index f970f4b..0d593d8 100644 --- a/packages/services/agent-service/src/domain/entities/voice-config.entity.ts +++ b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts @@ -1,7 +1,7 @@ /** * Per-tenant voice configuration entity. * - * Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai'). + * Stores STT provider preference per tenant (e.g. 'openai'). * Queried by voice-agent at session start to select the appropriate STT engine. */ import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm'; @@ -14,7 +14,7 @@ export class VoiceConfig { @Column({ type: 'varchar', length: 20, unique: true }) tenantId!: string; - @Column({ type: 'varchar', length: 30, default: 'speechmatics' }) + @Column({ type: 'varchar', length: 30, default: 'openai' }) sttProvider!: string; @CreateDateColumn({ type: 'timestamptz' }) diff --git a/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts index 1a1e94a..d9f50ad 100644 --- a/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts +++ b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts @@ -10,7 +10,7 @@ import { TenantId } from '@it0/common'; import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service'; const DEFAULT_CONFIG = { - stt_provider: 'speechmatics', + stt_provider: 'openai', }; @Controller('api/v1/agent/voice-config') diff --git a/packages/services/voice-agent/requirements.txt b/packages/services/voice-agent/requirements.txt index 5ddf39d..79deeaf 100644 --- a/packages/services/voice-agent/requirements.txt +++ b/packages/services/voice-agent/requirements.txt @@ -2,7 +2,6 @@ livekit>=1.0.0 livekit-agents>=1.0.0 livekit-plugins-silero>=1.0.0 livekit-plugins-openai>=1.0.0 -livekit-plugins-speechmatics>=1.0.0 faster-whisper==1.2.1 kokoro==0.3.5 misaki[zh]==0.7.17 diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py index a637024..e1f26c6 100644 --- a/packages/services/voice-agent/src/agent.py +++ b/packages/services/voice-agent/src/agent.py @@ -257,9 +257,6 @@ async def entrypoint(ctx: JobContext) -> None: "silence_duration_ms": 800, }, ) - elif stt_provider == "speechmatics": - from .plugins.speechmatics_stt import create_speechmatics_stt - stt = create_speechmatics_stt(language=settings.whisper_language) else: stt = LocalWhisperSTT( model=ctx.proc.userdata.get("whisper_model"), diff --git a/packages/services/voice-agent/src/plugins/speechmatics_stt.py b/packages/services/voice-agent/src/plugins/speechmatics_stt.py deleted file mode 100644 index e5c2055..0000000 --- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py +++ /dev/null @@ -1,101 +0,0 @@ -""" -Speechmatics STT factory for voice-agent. - -Creates a livekit-plugins-speechmatics STT instance configured for -Mandarin recognition with speaker diarization support. - -The SPEECHMATICS_API_KEY environment variable is read automatically -by the livekit-plugins-speechmatics package. - -=========================================================================== -集成笔记 (2026-03-03) -=========================================================================== - -1. 语言码映射 - - Speechmatics 使用 ISO 639-3 语言码,中文普通话为 "cmn" - - LiveKit 的 LanguageCode 类会自动将 "cmn" 归一化为 ISO 639-1 的 "zh" - (见 livekit/agents/_language_data.py: ISO_639_3_TO_1["cmn"] = "zh") - - 但 Speechmatics API 不接受 "zh",会报 "lang pack [zh] is not supported" - - 解决:构造 STT 后手动覆盖 stt._stt_options.language = "cmn" - -2. Turn Detection 模式选择(关键!) - 三种模式在 LiveKit 框架下的实际表现: - - - EXTERNAL: 需要客户端手动调用 client.finalize() 才会产生 FINAL_TRANSCRIPT。 - 但 LiveKit agents 框架(v1.4.4)在 VAD 检测到说话结束后并不调用 - stream.flush()(不发 FlushSentinel),而是推送静音帧 + 等待 FINAL 事件。 - 结果:只有 INTERIM_TRANSCRIPT,永远没有 FINAL → 框架 2 秒超时 → 用户无回复。 - - - ADAPTIVE: 使用 Speechmatics SDK 内置的 Silero VAD 做客户端转弯检测。 - 但 LiveKit 自己也有 Silero VAD 在运行,两个 VAD 冲突。 - 结果:零转写输出,完全静默。 - - - SMART_TURN: 服务器端智能转弯检测,但过于激进,会把连续语音切成碎片 - (如"你好我是..."被切成"你好。"+"我是..."两个 FINAL),每个碎片触发 LLM 请求 - 导致前一个被 abort,实测不可用。 - - - FIXED(当前使用): 服务器检测固定时长静音后发 EndOfUtterance → finalize() → FINAL。 - 通过 end_of_utterance_silence_trigger 参数控制静音阈值(默认 0.5s,当前设 1.0s)。 - 在 VoiceAgentClient 中有内置的 END_OF_UTTERANCE handler 自动调用 finalize()。 - 官方文档: https://docs.speechmatics.com/speech-to-text/realtime/turn-detection - -3. Speaker Diarization(说话人识别) - - enable_diarization=True 开启后,每个 segment 带 speaker_id 和 is_active 标记 - - is_active=True 表示主要说话人(用户),is_active=False 表示被动说话人(如 TTS 回声) - - 解决"说话人混淆"问题:Agent 不会把自己 TTS 的回声当成用户输入 - -4. Docker 部署注意 - - SPEECHMATICS_API_KEY 在服务器 .env 中配置,docker-compose.yml 传入容器 - - 每次改动 src/ 下文件后需 docker compose build voice-agent(注意 COPY 层缓存, - 如改动未生效需加 --no-cache) -""" -import logging - -from livekit.plugins.speechmatics import STT, TurnDetectionMode - -logger = logging.getLogger(__name__) - -# Whisper 语言码 → Speechmatics 语言码映射 -# Speechmatics 使用 ISO 639-3(如 "cmn"),而非 ISO 639-1(如 "zh") -_LANG_MAP = { - "zh": "cmn", # 中文普通话 - "en": "en", # 英语 - "ja": "ja", # 日语 - "ko": "ko", # 韩语 - "de": "de", # 德语 - "fr": "fr", # 法语 -} - - -def create_speechmatics_stt(language: str = "cmn") -> STT: - """Create a Speechmatics STT instance for the voice pipeline. - - Args: - language: Language code (Whisper or Speechmatics). Whisper codes like - 'zh' are automatically mapped to Speechmatics equivalents. - - Returns: - Configured speechmatics.STT instance with speaker diarization enabled. - """ - sm_lang = _LANG_MAP.get(language, language) - - stt = STT( - language=sm_lang, - include_partials=True, - # FIXED: 服务器检测到 1 秒静音后发 FINAL_TRANSCRIPT - # SMART_TURN 会把连续语音切成碎片,EXTERNAL 需手动 finalize,ADAPTIVE 与 LiveKit VAD 冲突 - turn_detection_mode=TurnDetectionMode.FIXED, - end_of_utterance_silence_trigger=1.0, - # 说话人识别:区分用户语音与 TTS 回声 - enable_diarization=True, - ) - - # 绕过 LiveKit LanguageCode 的 ISO 639-3 → 639-1 自动归一化 - # LanguageCode("cmn") 会变成 "zh",但 Speechmatics 只接受 "cmn" - stt._stt_options.language = sm_lang # type: ignore[assignment] - - logger.info( - "Speechmatics STT created: language=%s (input=%s), mode=FIXED(1.0s), diarization=True", - sm_lang, language, - ) - return stt