refactor: remove Speechmatics STT integration entirely, default to OpenAI

- Delete speechmatics_stt.py plugin
- Remove speechmatics branch from voice-agent entrypoint
- Remove livekit-plugins-speechmatics dependency
- Change default stt_provider to 'openai' in entity, controller, and UI
- Remove SPEECHMATICS_API_KEY from docker-compose.yml
- Remove speechmatics option from web-admin settings dropdown

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-03 04:58:38 -08:00
parent 191ce2d6b3
commit 7fb0d1de95
9 changed files with 5 additions and 114 deletions

View File

@ -354,7 +354,6 @@ services:
- OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe}
- OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts}
- OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral}
- SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-}
depends_on:
livekit-server:
condition: service_healthy

View File

@ -796,7 +796,6 @@ interface VoiceSettings {
}
const STT_PROVIDERS = [
{ value: 'speechmatics', labelKey: 'voice.providers.speechmatics' },
{ value: 'openai', labelKey: 'voice.providers.openai' },
];
@ -810,11 +809,11 @@ function VoiceSection() {
queryFn: () => apiClient<VoiceSettings>('/api/v1/agent/voice-config'),
});
const [sttProvider, setSttProvider] = useState('speechmatics');
const [sttProvider, setSttProvider] = useState('openai');
const [initialized, setInitialized] = useState(false);
if (data && !initialized) {
setSttProvider(data.stt_provider || 'speechmatics');
setSttProvider(data.stt_provider || 'openai');
setInitialized(true);
}

View File

@ -74,7 +74,6 @@
"sttProvider": "Speech-to-Text Provider",
"sttProviderHint": "Choose the speech recognition engine for voice sessions.",
"providers": {
"speechmatics": "Speechmatics (Default)",
"openai": "OpenAI (gpt-4o-transcribe)"
},
"saved": "Voice settings saved."

View File

@ -74,7 +74,6 @@
"sttProvider": "语音转文字引擎",
"sttProviderHint": "选择语音通话时使用的语音识别引擎。",
"providers": {
"speechmatics": "Speechmatics(默认)",
"openai": "OpenAI (gpt-4o-transcribe)"
},
"saved": "语音设置已保存。"

View File

@ -1,7 +1,7 @@
/**
* Per-tenant voice configuration entity.
*
* Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai').
* Stores STT provider preference per tenant (e.g. 'openai').
* Queried by voice-agent at session start to select the appropriate STT engine.
*/
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
@ -14,7 +14,7 @@ export class VoiceConfig {
@Column({ type: 'varchar', length: 20, unique: true })
tenantId!: string;
@Column({ type: 'varchar', length: 30, default: 'speechmatics' })
@Column({ type: 'varchar', length: 30, default: 'openai' })
sttProvider!: string;
@CreateDateColumn({ type: 'timestamptz' })

View File

@ -10,7 +10,7 @@ import { TenantId } from '@it0/common';
import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service';
const DEFAULT_CONFIG = {
stt_provider: 'speechmatics',
stt_provider: 'openai',
};
@Controller('api/v1/agent/voice-config')

View File

@ -2,7 +2,6 @@ livekit>=1.0.0
livekit-agents>=1.0.0
livekit-plugins-silero>=1.0.0
livekit-plugins-openai>=1.0.0
livekit-plugins-speechmatics>=1.0.0
faster-whisper==1.2.1
kokoro==0.3.5
misaki[zh]==0.7.17

View File

@ -257,9 +257,6 @@ async def entrypoint(ctx: JobContext) -> None:
"silence_duration_ms": 800,
},
)
elif stt_provider == "speechmatics":
from .plugins.speechmatics_stt import create_speechmatics_stt
stt = create_speechmatics_stt(language=settings.whisper_language)
else:
stt = LocalWhisperSTT(
model=ctx.proc.userdata.get("whisper_model"),

View File

@ -1,101 +0,0 @@
"""
Speechmatics STT factory for voice-agent.
Creates a livekit-plugins-speechmatics STT instance configured for
Mandarin recognition with speaker diarization support.
The SPEECHMATICS_API_KEY environment variable is read automatically
by the livekit-plugins-speechmatics package.
===========================================================================
集成笔记 (2026-03-03)
===========================================================================
1. 语言码映射
- Speechmatics 使用 ISO 639-3 语言码中文普通话为 "cmn"
- LiveKit LanguageCode 类会自动将 "cmn" 归一化为 ISO 639-1 "zh"
( livekit/agents/_language_data.py: ISO_639_3_TO_1["cmn"] = "zh")
- Speechmatics API 不接受 "zh"会报 "lang pack [zh] is not supported"
- 解决构造 STT 后手动覆盖 stt._stt_options.language = "cmn"
2. Turn Detection 模式选择关键
三种模式在 LiveKit 框架下的实际表现
- EXTERNAL: 需要客户端手动调用 client.finalize() 才会产生 FINAL_TRANSCRIPT
LiveKit agents 框架v1.4.4 VAD 检测到说话结束后并不调用
stream.flush()不发 FlushSentinel而是推送静音帧 + 等待 FINAL 事件
结果:只有 INTERIM_TRANSCRIPT,永远没有 FINAL → 框架 2 秒超时 → 用户无回复
- ADAPTIVE: 使用 Speechmatics SDK 内置的 Silero VAD 做客户端转弯检测
LiveKit 自己也有 Silero VAD 在运行两个 VAD 冲突
结果零转写输出完全静默
- SMART_TURN: 服务器端智能转弯检测但过于激进会把连续语音切成碎片
"你好我是..."被切成"你好。"+"我是..."两个 FINAL每个碎片触发 LLM 请求
导致前一个被 abort实测不可用
- FIXED当前使用: 服务器检测固定时长静音后发 EndOfUtterance finalize() FINAL
通过 end_of_utterance_silence_trigger 参数控制静音阈值默认 0.5s当前设 1.0s
VoiceAgentClient 中有内置的 END_OF_UTTERANCE handler 自动调用 finalize()
官方文档: https://docs.speechmatics.com/speech-to-text/realtime/turn-detection
3. Speaker Diarization说话人识别
- enable_diarization=True 开启后每个 segment speaker_id is_active 标记
- is_active=True 表示主要说话人用户is_active=False 表示被动说话人 TTS 回声
- 解决"说话人混淆"问题Agent 不会把自己 TTS 的回声当成用户输入
4. Docker 部署注意
- SPEECHMATICS_API_KEY 在服务器 .env 中配置docker-compose.yml 传入容器
- 每次改动 src/ 下文件后需 docker compose build voice-agent注意 COPY 层缓存
如改动未生效需加 --no-cache
"""
import logging
from livekit.plugins.speechmatics import STT, TurnDetectionMode
logger = logging.getLogger(__name__)
# Whisper language code -> Speechmatics language code mapping.
# Speechmatics uses ISO 639-3 codes (e.g. "cmn"), not ISO 639-1 (e.g. "zh").
_LANG_MAP = {
    "zh": "cmn",  # Mandarin Chinese
    "en": "en",   # English
    "ja": "ja",   # Japanese
    "ko": "ko",   # Korean
    "de": "de",   # German
    "fr": "fr",   # French
}
def create_speechmatics_stt(language: str = "cmn") -> STT:
    """Build a configured Speechmatics STT engine for the voice pipeline.

    Args:
        language: Language code (Whisper or Speechmatics). Whisper codes like
            'zh' are automatically mapped to Speechmatics equivalents.

    Returns:
        Configured speechmatics.STT instance with speaker diarization enabled.
    """
    # Translate Whisper-style codes ("zh") to Speechmatics codes ("cmn");
    # unknown codes pass through unchanged.
    resolved = _LANG_MAP.get(language, language)
    engine = STT(
        language=resolved,
        include_partials=True,
        # FIXED mode: the server emits a FINAL_TRANSCRIPT after 1s of silence.
        # SMART_TURN fragments continuous speech, EXTERNAL needs a manual
        # finalize() call, and ADAPTIVE's built-in VAD clashes with LiveKit's.
        turn_detection_mode=TurnDetectionMode.FIXED,
        end_of_utterance_silence_trigger=1.0,
        # Diarization distinguishes the user's voice from TTS echo.
        enable_diarization=True,
    )
    # Work around LiveKit's LanguageCode normalization: it collapses the
    # ISO 639-3 code "cmn" to "zh", which the Speechmatics API rejects, so
    # force the resolved code back onto the options object.
    engine._stt_options.language = resolved  # type: ignore[assignment]
    logger.info(
        "Speechmatics STT created: language=%s (input=%s), mode=FIXED(1.0s), diarization=True",
        resolved,
        language,
    )
    return engine