refactor: remove Speechmatics STT integration entirely, default to OpenAI

- Delete speechmatics_stt.py plugin
- Remove speechmatics branch from voice-agent entrypoint
- Remove livekit-plugins-speechmatics dependency
- Change default stt_provider to 'openai' in entity, controller, and UI
- Remove SPEECHMATICS_API_KEY from docker-compose.yml
- Remove speechmatics option from web-admin settings dropdown

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-03 04:58:38 -08:00 · 2026-03-03 04:58:38 -08:00 · 7fb0d1de95
parent 191ce2d6b3
commit 7fb0d1de95
9 changed files with 5 additions and 114 deletions
--- a/deploy/docker/docker-compose.yml
+++ b/deploy/docker/docker-compose.yml
@ -354,7 +354,6 @@ services:
      - OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe}
      - OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts}
      - OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral}
      - SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-}
    depends_on:
      livekit-server:
        condition: service_healthy
--- a/it0-web-admin/src/app/(admin)/settings/page.tsx
+++ b/it0-web-admin/src/app/(admin)/settings/page.tsx
@ -796,7 +796,6 @@ interface VoiceSettings {
 }
 const STT_PROVIDERS = [
  { value: 'speechmatics', labelKey: 'voice.providers.speechmatics' },
  { value: 'openai', labelKey: 'voice.providers.openai' },
 ];
@ -810,11 +809,11 @@ function VoiceSection() {
    queryFn: () => apiClient<VoiceSettings>('/api/v1/agent/voice-config'),
  });
-  const [sttProvider, setSttProvider] = useState('speechmatics');
+  const [sttProvider, setSttProvider] = useState('openai');
  const [initialized, setInitialized] = useState(false);
  if (data && !initialized) {
-    setSttProvider(data.stt_provider || 'speechmatics');
+    setSttProvider(data.stt_provider || 'openai');
    setInitialized(true);
  }
--- a/it0-web-admin/src/i18n/locales/en/settings.json
+++ b/it0-web-admin/src/i18n/locales/en/settings.json
@ -74,7 +74,6 @@
    "sttProvider": "Speech-to-Text Provider",
    "sttProviderHint": "Choose the speech recognition engine for voice sessions.",
    "providers": {
      "speechmatics": "Speechmatics (Default)",
      "openai": "OpenAI (gpt-4o-transcribe)"
    },
    "saved": "Voice settings saved."
--- a/it0-web-admin/src/i18n/locales/zh/settings.json
+++ b/it0-web-admin/src/i18n/locales/zh/settings.json
@ -74,7 +74,6 @@
    "sttProvider": "语音转文字引擎",
    "sttProviderHint": "选择语音通话时使用的语音识别引擎。",
    "providers": {
      "speechmatics": "Speechmatics（默认）",
      "openai": "OpenAI (gpt-4o-transcribe)"
    },
    "saved": "语音设置已保存。"
--- a/packages/services/agent-service/src/domain/entities/voice-config.entity.ts
+++ b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts
@ -1,7 +1,7 @@
 /**
 * Per-tenant voice configuration entity.
 *
- * Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai').
+ * Stores STT provider preference per tenant (e.g. 'openai').
 * Queried by voice-agent at session start to select the appropriate STT engine.
 */
 import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
@ -14,7 +14,7 @@ export class VoiceConfig {
  @Column({ type: 'varchar', length: 20, unique: true })
  tenantId!: string;
-  @Column({ type: 'varchar', length: 30, default: 'speechmatics' })
+  @Column({ type: 'varchar', length: 30, default: 'openai' })
  sttProvider!: string;
  @CreateDateColumn({ type: 'timestamptz' })
--- a/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
+++ b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
@ -10,7 +10,7 @@ import { TenantId } from '@it0/common';
 import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service';
 const DEFAULT_CONFIG = {
-  stt_provider: 'speechmatics',
+  stt_provider: 'openai',
 };
@Controller('api/v1/agent/voice-config')
--- a/packages/services/voice-agent/requirements.txt
+++ b/packages/services/voice-agent/requirements.txt
@ -2,7 +2,6 @@ livekit>=1.0.0
 livekit-agents>=1.0.0
 livekit-plugins-silero>=1.0.0
 livekit-plugins-openai>=1.0.0
 livekit-plugins-speechmatics>=1.0.0
 faster-whisper==1.2.1
 kokoro==0.3.5
 misaki[zh]==0.7.17
--- a/packages/services/voice-agent/src/agent.py
+++ b/packages/services/voice-agent/src/agent.py
@ -257,9 +257,6 @@ async def entrypoint(ctx: JobContext) -> None:
                    "silence_duration_ms": 800,
                },
            )
        elif stt_provider == "speechmatics":
            from .plugins.speechmatics_stt import create_speechmatics_stt
            stt = create_speechmatics_stt(language=settings.whisper_language)
        else:
            stt = LocalWhisperSTT(
                model=ctx.proc.userdata.get("whisper_model"),
--- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py
+++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
@ -1,101 +0,0 @@
 """
 Speechmatics STT factory for voice-agent.
 Creates a livekit-plugins-speechmatics STT instance configured for
 Mandarin recognition with speaker diarization support.
 The SPEECHMATICS_API_KEY environment variable is read automatically
 by the livekit-plugins-speechmatics package.
 ===========================================================================
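 Integration notes (2026-03-03)
 ===========================================================================
 1. Language code mapping
   - Speechmatics uses ISO 639-3 language codes; Mandarin Chinese is "cmn".
   - LiveKit's LanguageCode class automatically normalizes "cmn" to the
     ISO 639-1 code "zh"
     (see livekit/agents/_language_data.py: ISO_639_3_TO_1["cmn"] = "zh").
   - The Speechmatics API does not accept "zh" and fails with
     "lang pack [zh] is not supported".
   - Fix: after constructing the STT, manually override
     stt._stt_options.language = "cmn".
 2. Choosing the turn detection mode (critical!)
   Observed behavior of each mode under the LiveKit framework:
   - EXTERNAL: the client must call client.finalize() manually before a
     FINAL_TRANSCRIPT is produced. However, after its VAD detects end of
     speech, the LiveKit agents framework (v1.4.4) does not call
     stream.flush() (no FlushSentinel is sent); it pushes silence frames and
     waits for a FINAL event.
     Result: only INTERIM_TRANSCRIPT events, never a FINAL -> the framework
     times out after 2 seconds -> the user gets no reply.
   - ADAPTIVE: uses the Silero VAD built into the Speechmatics SDK for
     client-side turn detection. LiveKit runs its own Silero VAD as well, and
     the two VADs conflict.
     Result: zero transcription output, complete silence.
   - SMART_TURN: server-side smart turn detection, but too aggressive; it
     chops continuous speech into fragments (e.g. "你好我是..." is split into
     two FINALs, "你好。" + "我是..."). Each fragment triggers an LLM request,
     which aborts the previous one; unusable in testing.
   - FIXED (currently used): the server sends EndOfUtterance after detecting
     a fixed duration of silence -> finalize() -> FINAL.
     The silence threshold is controlled by the
     end_of_utterance_silence_trigger parameter (default 0.5s, currently 1.0s).
     VoiceAgentClient has a built-in END_OF_UTTERANCE handler that calls
     finalize() automatically.
     Official docs: https://docs.speechmatics.com/speech-to-text/realtime/turn-detection
 3. Speaker diarization
   - With enable_diarization=True, every segment carries speaker_id and
     is_active markers.
   - is_active=True marks the primary speaker (the user); is_active=False
     marks a passive speaker (e.g. TTS echo).
   - This solves the "speaker confusion" problem: the agent no longer treats
     the echo of its own TTS as user input.
 4. Docker deployment notes
   - SPEECHMATICS_API_KEY is configured in the server's .env and passed into
     the container by docker-compose.yml.
   - After every change to files under src/, run
     docker compose build voice-agent (beware of COPY layer caching; add
     --no-cache if the change does not take effect).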
 """
 import logging
 from livekit.plugins.speechmatics import STT, TurnDetectionMode
 logger = logging.getLogger(__name__)
 # Whisper language code -> Speechmatics language code mapping.
 # Speechmatics uses ISO 639-3 (e.g. "cmn") rather than ISO 639-1 (e.g. "zh").
 # Codes missing from this table are passed through unchanged by
 # create_speechmatics_stt().
 _LANG_MAP = {
    "zh": "cmn",   # Mandarin Chinese
    "en": "en",    # English
    "ja": "ja",    # Japanese
    "ko": "ko",    # Korean
    "de": "de",    # German
    "fr": "fr",    # French
 }
 def create_speechmatics_stt(language: str = "cmn") -> STT:
    """Build the Speechmatics STT engine used by the voice pipeline.

    Args:
        language: Whisper-style or Speechmatics-style language code. Codes
                  listed in ``_LANG_MAP`` (such as 'zh') are translated to
                  their Speechmatics equivalent; any other value is passed
                  through unchanged.

    Returns:
        A speechmatics.STT instance configured with partial transcripts,
        FIXED turn detection (1.0s silence trigger) and speaker diarization.
    """
    # Translate the incoming code when the table knows it; otherwise keep it.
    speechmatics_lang = _LANG_MAP[language] if language in _LANG_MAP else language

    stt_kwargs = {
        "language": speechmatics_lang,
        "include_partials": True,
        # FIXED: the server emits FINAL_TRANSCRIPT after 1 second of silence.
        # SMART_TURN chops continuous speech into fragments, EXTERNAL needs a
        # manual finalize, and ADAPTIVE conflicts with LiveKit's own VAD.
        "turn_detection_mode": TurnDetectionMode.FIXED,
        "end_of_utterance_silence_trigger": 1.0,
        # Speaker diarization: tell the user's voice apart from TTS echo.
        "enable_diarization": True,
    }
    engine = STT(**stt_kwargs)

    # Bypass LiveKit LanguageCode's automatic ISO 639-3 -> 639-1 normalization:
    # LanguageCode("cmn") becomes "zh", but Speechmatics only accepts "cmn".
    engine._stt_options.language = speechmatics_lang  # type: ignore[assignment]

    logger.info(
        "Speechmatics STT created: language=%s (input=%s), mode=FIXED(1.0s), diarization=True",
        speechmatics_lang,
        language,
    )
    return engine