From 7fb0d1de95fc5b6ec791fe1777452b39488384d9 Mon Sep 17 00:00:00 2001
From: hailin <hailin.zhao@gdzx.xyz>
Date: Tue, 3 Mar 2026 04:58:38 -0800
Subject: [PATCH] refactor: remove Speechmatics STT integration entirely,
 default to OpenAI

- Delete speechmatics_stt.py plugin
- Remove speechmatics branch from voice-agent entrypoint
- Remove livekit-plugins-speechmatics dependency
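- Change default stt_provider to 'openai' in entity, controller, and UI
  (no data migration included: rows still storing 'speechmatics' keep
  that value and now fall through to the voice-agent's else branch,
  i.e. LocalWhisperSTT, rather than OpenAI)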
- Remove SPEECHMATICS_API_KEY from docker-compose.yml
- Remove speechmatics option from web-admin settings dropdown
- Remove speechmatics provider label from en/zh settings.json locales

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 deploy/docker/docker-compose.yml              |   1 -
 .../src/app/(admin)/settings/page.tsx         |   5 +-
 .../src/i18n/locales/en/settings.json         |   1 -
 .../src/i18n/locales/zh/settings.json         |   1 -
 .../domain/entities/voice-config.entity.ts    |   4 +-
 .../controllers/voice-config.controller.ts    |   2 +-
 .../services/voice-agent/requirements.txt     |   1 -
 packages/services/voice-agent/src/agent.py    |   3 -
 .../src/plugins/speechmatics_stt.py           | 101 ------------------
 9 files changed, 5 insertions(+), 114 deletions(-)
 delete mode 100644 packages/services/voice-agent/src/plugins/speechmatics_stt.py
diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml
index 6462b41..1a69bc7 100644
--- a/deploy/docker/docker-compose.yml
+++ b/deploy/docker/docker-compose.yml
@@ -354,7 +354,6 @@ services:
       - OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe}
       - OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts}
       - OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral}
-      - SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-}
     depends_on:
       livekit-server:
         condition: service_healthy
diff --git a/it0-web-admin/src/app/(admin)/settings/page.tsx b/it0-web-admin/src/app/(admin)/settings/page.tsx
index a8d3cc4..41c9c67 100644
--- a/it0-web-admin/src/app/(admin)/settings/page.tsx
+++ b/it0-web-admin/src/app/(admin)/settings/page.tsx
@@ -796,7 +796,6 @@ interface VoiceSettings {
 }
 
 const STT_PROVIDERS = [
-  { value: 'speechmatics', labelKey: 'voice.providers.speechmatics' },
   { value: 'openai', labelKey: 'voice.providers.openai' },
 ];
 
@@ -810,11 +809,11 @@ function VoiceSection() {
     queryFn: () => apiClient<VoiceSettings>('/api/v1/agent/voice-config'),
   });
 
-  const [sttProvider, setSttProvider] = useState('speechmatics');
+  const [sttProvider, setSttProvider] = useState('openai');
   const [initialized, setInitialized] = useState(false);
 
   if (data && !initialized) {
-    setSttProvider(data.stt_provider || 'speechmatics');
+    setSttProvider(data.stt_provider || 'openai');
     setInitialized(true);
   }
 
diff --git a/it0-web-admin/src/i18n/locales/en/settings.json b/it0-web-admin/src/i18n/locales/en/settings.json
index 5be604f..ce188ce 100644
--- a/it0-web-admin/src/i18n/locales/en/settings.json
+++ b/it0-web-admin/src/i18n/locales/en/settings.json
@@ -74,7 +74,6 @@
     "sttProvider": "Speech-to-Text Provider",
     "sttProviderHint": "Choose the speech recognition engine for voice sessions.",
     "providers": {
-      "speechmatics": "Speechmatics (Default)",
       "openai": "OpenAI (gpt-4o-transcribe)"
     },
     "saved": "Voice settings saved."
diff --git a/it0-web-admin/src/i18n/locales/zh/settings.json b/it0-web-admin/src/i18n/locales/zh/settings.json
index cdbed38..15546b2 100644
--- a/it0-web-admin/src/i18n/locales/zh/settings.json
+++ b/it0-web-admin/src/i18n/locales/zh/settings.json
@@ -74,7 +74,6 @@
     "sttProvider": "语音转文字引擎",
     "sttProviderHint": "选择语音通话时使用的语音识别引擎。",
     "providers": {
-      "speechmatics": "Speechmatics（默认）",
       "openai": "OpenAI (gpt-4o-transcribe)"
     },
     "saved": "语音设置已保存。"
diff --git a/packages/services/agent-service/src/domain/entities/voice-config.entity.ts b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts
index f970f4b..0d593d8 100644
--- a/packages/services/agent-service/src/domain/entities/voice-config.entity.ts
+++ b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts
@@ -1,7 +1,7 @@
 /**
  * Per-tenant voice configuration entity.
  *
- * Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai').
+ * Stores STT provider preference per tenant (e.g. 'openai').
  * Queried by voice-agent at session start to select the appropriate STT engine.
  */
 import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
@@ -14,7 +14,7 @@ export class VoiceConfig {
   @Column({ type: 'varchar', length: 20, unique: true })
   tenantId!: string;
 
-  @Column({ type: 'varchar', length: 30, default: 'speechmatics' })
+  @Column({ type: 'varchar', length: 30, default: 'openai' })
   sttProvider!: string;
 
   @CreateDateColumn({ type: 'timestamptz' })
diff --git a/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
index 1a1e94a..d9f50ad 100644
--- a/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
+++ b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
@@ -10,7 +10,7 @@ import { TenantId } from '@it0/common';
 import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service';
 
 const DEFAULT_CONFIG = {
-  stt_provider: 'speechmatics',
+  stt_provider: 'openai',
 };
 
 @Controller('api/v1/agent/voice-config')
diff --git a/packages/services/voice-agent/requirements.txt b/packages/services/voice-agent/requirements.txt
index 5ddf39d..79deeaf 100644
--- a/packages/services/voice-agent/requirements.txt
+++ b/packages/services/voice-agent/requirements.txt
@@ -2,7 +2,6 @@ livekit>=1.0.0
 livekit-agents>=1.0.0
 livekit-plugins-silero>=1.0.0
 livekit-plugins-openai>=1.0.0
-livekit-plugins-speechmatics>=1.0.0
 faster-whisper==1.2.1
 kokoro==0.3.5
 misaki[zh]==0.7.17
diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py
index a637024..e1f26c6 100644
--- a/packages/services/voice-agent/src/agent.py
+++ b/packages/services/voice-agent/src/agent.py
@@ -257,9 +257,6 @@ async def entrypoint(ctx: JobContext) -> None:
                     "silence_duration_ms": 800,
                 },
             )
-        elif stt_provider == "speechmatics":
-            from .plugins.speechmatics_stt import create_speechmatics_stt
-            stt = create_speechmatics_stt(language=settings.whisper_language)
         else:
             stt = LocalWhisperSTT(
                 model=ctx.proc.userdata.get("whisper_model"),
diff --git a/packages/services/voice-agent/src/plugins/speechmatics_stt.py b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
deleted file mode 100644
index e5c2055..0000000
--- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""
-Speechmatics STT factory for voice-agent.
-
-Creates a livekit-plugins-speechmatics STT instance configured for
-Mandarin recognition with speaker diarization support.
-
-The SPEECHMATICS_API_KEY environment variable is read automatically
-by the livekit-plugins-speechmatics package.
-
-===========================================================================
-集成笔记 (2026-03-03)
-===========================================================================
-
-1. 语言码映射
-   - Speechmatics 使用 ISO 639-3 语言码，中文普通话为 "cmn"
-   - LiveKit 的 LanguageCode 类会自动将 "cmn" 归一化为 ISO 639-1 的 "zh"
-     (见 livekit/agents/_language_data.py: ISO_639_3_TO_1["cmn"] = "zh")
-   - 但 Speechmatics API 不接受 "zh"，会报 "lang pack [zh] is not supported"
-   - 解决：构造 STT 后手动覆盖 stt._stt_options.language = "cmn"
-
-2. Turn Detection 模式选择（关键！）
-   三种模式在 LiveKit 框架下的实际表现：
-
-   - EXTERNAL: 需要客户端手动调用 client.finalize() 才会产生 FINAL_TRANSCRIPT。
-     但 LiveKit agents 框架（v1.4.4）在 VAD 检测到说话结束后并不调用
-     stream.flush()（不发 FlushSentinel），而是推送静音帧 + 等待 FINAL 事件。
-     结果：只有 INTERIM_TRANSCRIPT，永远没有 FINAL → 框架 2 秒超时 → 用户无回复。
-
-   - ADAPTIVE: 使用 Speechmatics SDK 内置的 Silero VAD 做客户端转弯检测。
-     但 LiveKit 自己也有 Silero VAD 在运行，两个 VAD 冲突。
-     结果：零转写输出，完全静默。
-
-   - SMART_TURN: 服务器端智能转弯检测，但过于激进，会把连续语音切成碎片
-     （如"你好我是..."被切成"你好。"+"我是..."两个 FINAL），每个碎片触发 LLM 请求
-     导致前一个被 abort，实测不可用。
-
-   - FIXED（当前使用）: 服务器检测固定时长静音后发 EndOfUtterance → finalize() → FINAL。
-     通过 end_of_utterance_silence_trigger 参数控制静音阈值（默认 0.5s，当前设 1.0s）。
-     在 VoiceAgentClient 中有内置的 END_OF_UTTERANCE handler 自动调用 finalize()。
-     官方文档: https://docs.speechmatics.com/speech-to-text/realtime/turn-detection
-
-3. Speaker Diarization（说话人识别）
-   - enable_diarization=True 开启后，每个 segment 带 speaker_id 和 is_active 标记
-   - is_active=True 表示主要说话人（用户），is_active=False 表示被动说话人（如 TTS 回声）
-   - 解决"说话人混淆"问题：Agent 不会把自己 TTS 的回声当成用户输入
-
-4. Docker 部署注意
-   - SPEECHMATICS_API_KEY 在服务器 .env 中配置，docker-compose.yml 传入容器
-   - 每次改动 src/ 下文件后需 docker compose build voice-agent（注意 COPY 层缓存，
-     如改动未生效需加 --no-cache）
-"""
-import logging
-
-from livekit.plugins.speechmatics import STT, TurnDetectionMode
-
-logger = logging.getLogger(__name__)
-
-# Whisper 语言码 → Speechmatics 语言码映射
-# Speechmatics 使用 ISO 639-3（如 "cmn"），而非 ISO 639-1（如 "zh"）
-_LANG_MAP = {
-    "zh": "cmn",   # 中文普通话
-    "en": "en",    # 英语
-    "ja": "ja",    # 日语
-    "ko": "ko",    # 韩语
-    "de": "de",    # 德语
-    "fr": "fr",    # 法语
-}
-
-
-def create_speechmatics_stt(language: str = "cmn") -> STT:
-    """Create a Speechmatics STT instance for the voice pipeline.
-
-    Args:
-        language: Language code (Whisper or Speechmatics). Whisper codes like
-                  'zh' are automatically mapped to Speechmatics equivalents.
-
-    Returns:
-        Configured speechmatics.STT instance with speaker diarization enabled.
-    """
-    sm_lang = _LANG_MAP.get(language, language)
-
-    stt = STT(
-        language=sm_lang,
-        include_partials=True,
-        # FIXED: 服务器检测到 1 秒静音后发 FINAL_TRANSCRIPT
-        # SMART_TURN 会把连续语音切成碎片，EXTERNAL 需手动 finalize，ADAPTIVE 与 LiveKit VAD 冲突
-        turn_detection_mode=TurnDetectionMode.FIXED,
-        end_of_utterance_silence_trigger=1.0,
-        # 说话人识别：区分用户语音与 TTS 回声
-        enable_diarization=True,
-    )
-
-    # 绕过 LiveKit LanguageCode 的 ISO 639-3 → 639-1 自动归一化
-    # LanguageCode("cmn") 会变成 "zh"，但 Speechmatics 只接受 "cmn"
-    stt._stt_options.language = sm_lang  # type: ignore[assignment]
-
-    logger.info(
-        "Speechmatics STT created: language=%s (input=%s), mode=FIXED(1.0s), diarization=True",
-        sm_lang, language,
-    )
-    return stt