refactor: remove Speechmatics STT integration entirely, default to OpenAI
- Delete speechmatics_stt.py plugin
- Remove speechmatics branch from voice-agent entrypoint
- Remove livekit-plugins-speechmatics dependency
- Change default stt_provider to 'openai' in entity, controller, and UI
- Remove SPEECHMATICS_API_KEY from docker-compose.yml
- Remove speechmatics option from web-admin settings dropdown

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
191ce2d6b3
commit
7fb0d1de95
|
|
@ -354,7 +354,6 @@ services:
|
|||
- OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe}
|
||||
- OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts}
|
||||
- OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral}
|
||||
- SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-}
|
||||
depends_on:
|
||||
livekit-server:
|
||||
condition: service_healthy
|
||||
|
|
|
|||
|
|
@ -796,7 +796,6 @@ interface VoiceSettings {
|
|||
}
|
||||
|
||||
const STT_PROVIDERS = [
|
||||
{ value: 'speechmatics', labelKey: 'voice.providers.speechmatics' },
|
||||
{ value: 'openai', labelKey: 'voice.providers.openai' },
|
||||
];
|
||||
|
||||
|
|
@ -810,11 +809,11 @@ function VoiceSection() {
|
|||
queryFn: () => apiClient<VoiceSettings>('/api/v1/agent/voice-config'),
|
||||
});
|
||||
|
||||
const [sttProvider, setSttProvider] = useState('speechmatics');
|
||||
const [sttProvider, setSttProvider] = useState('openai');
|
||||
const [initialized, setInitialized] = useState(false);
|
||||
|
||||
if (data && !initialized) {
|
||||
setSttProvider(data.stt_provider || 'speechmatics');
|
||||
setSttProvider(data.stt_provider || 'openai');
|
||||
setInitialized(true);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -74,7 +74,6 @@
|
|||
"sttProvider": "Speech-to-Text Provider",
|
||||
"sttProviderHint": "Choose the speech recognition engine for voice sessions.",
|
||||
"providers": {
|
||||
"speechmatics": "Speechmatics (Default)",
|
||||
"openai": "OpenAI (gpt-4o-transcribe)"
|
||||
},
|
||||
"saved": "Voice settings saved."
|
||||
|
|
|
|||
|
|
@ -74,7 +74,6 @@
|
|||
"sttProvider": "语音转文字引擎",
|
||||
"sttProviderHint": "选择语音通话时使用的语音识别引擎。",
|
||||
"providers": {
|
||||
"speechmatics": "Speechmatics(默认)",
|
||||
"openai": "OpenAI (gpt-4o-transcribe)"
|
||||
},
|
||||
"saved": "语音设置已保存。"
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
/**
|
||||
* Per-tenant voice configuration entity.
|
||||
*
|
||||
* Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai').
|
||||
* Stores STT provider preference per tenant (e.g. 'openai').
|
||||
* Queried by voice-agent at session start to select the appropriate STT engine.
|
||||
*/
|
||||
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
|
||||
|
|
@ -14,7 +14,7 @@ export class VoiceConfig {
|
|||
@Column({ type: 'varchar', length: 20, unique: true })
|
||||
tenantId!: string;
|
||||
|
||||
@Column({ type: 'varchar', length: 30, default: 'speechmatics' })
|
||||
@Column({ type: 'varchar', length: 30, default: 'openai' })
|
||||
sttProvider!: string;
|
||||
|
||||
@CreateDateColumn({ type: 'timestamptz' })
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ import { TenantId } from '@it0/common';
|
|||
import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service';
|
||||
|
||||
const DEFAULT_CONFIG = {
|
||||
stt_provider: 'speechmatics',
|
||||
stt_provider: 'openai',
|
||||
};
|
||||
|
||||
@Controller('api/v1/agent/voice-config')
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ livekit>=1.0.0
|
|||
livekit-agents>=1.0.0
|
||||
livekit-plugins-silero>=1.0.0
|
||||
livekit-plugins-openai>=1.0.0
|
||||
livekit-plugins-speechmatics>=1.0.0
|
||||
faster-whisper==1.2.1
|
||||
kokoro==0.3.5
|
||||
misaki[zh]==0.7.17
|
||||
|
|
|
|||
|
|
@ -257,9 +257,6 @@ async def entrypoint(ctx: JobContext) -> None:
|
|||
"silence_duration_ms": 800,
|
||||
},
|
||||
)
|
||||
elif stt_provider == "speechmatics":
|
||||
from .plugins.speechmatics_stt import create_speechmatics_stt
|
||||
stt = create_speechmatics_stt(language=settings.whisper_language)
|
||||
else:
|
||||
stt = LocalWhisperSTT(
|
||||
model=ctx.proc.userdata.get("whisper_model"),
|
||||
|
|
|
|||
|
|
@ -1,101 +0,0 @@
|
|||
"""
|
||||
Speechmatics STT factory for voice-agent.
|
||||
|
||||
Creates a livekit-plugins-speechmatics STT instance configured for
|
||||
Mandarin recognition with speaker diarization support.
|
||||
|
||||
The SPEECHMATICS_API_KEY environment variable is read automatically
|
||||
by the livekit-plugins-speechmatics package.
|
||||
|
||||
===========================================================================
|
||||
集成笔记 (2026-03-03)
|
||||
===========================================================================
|
||||
|
||||
1. 语言码映射
|
||||
- Speechmatics 使用 ISO 639-3 语言码,中文普通话为 "cmn"
|
||||
- LiveKit 的 LanguageCode 类会自动将 "cmn" 归一化为 ISO 639-1 的 "zh"
|
||||
(见 livekit/agents/_language_data.py: ISO_639_3_TO_1["cmn"] = "zh")
|
||||
- 但 Speechmatics API 不接受 "zh",会报 "lang pack [zh] is not supported"
|
||||
- 解决:构造 STT 后手动覆盖 stt._stt_options.language = "cmn"
|
||||
|
||||
2. Turn Detection 模式选择(关键!)
|
||||
三种模式在 LiveKit 框架下的实际表现:
|
||||
|
||||
- EXTERNAL: 需要客户端手动调用 client.finalize() 才会产生 FINAL_TRANSCRIPT。
|
||||
但 LiveKit agents 框架(v1.4.4)在 VAD 检测到说话结束后并不调用
|
||||
stream.flush()(不发 FlushSentinel),而是推送静音帧 + 等待 FINAL 事件。
|
||||
结果:只有 INTERIM_TRANSCRIPT,永远没有 FINAL → 框架 2 秒超时 → 用户无回复。
|
||||
|
||||
- ADAPTIVE: 使用 Speechmatics SDK 内置的 Silero VAD 做客户端话轮检测(turn detection)。
|
||||
但 LiveKit 自己也有 Silero VAD 在运行,两个 VAD 冲突。
|
||||
结果:零转写输出,完全静默。
|
||||
|
||||
- SMART_TURN: 服务器端智能话轮检测(turn detection),但过于激进,会把连续语音切成碎片
|
||||
(如"你好我是..."被切成"你好。"+"我是..."两个 FINAL),每个碎片触发 LLM 请求
|
||||
导致前一个被 abort,实测不可用。
|
||||
|
||||
- FIXED(当前使用): 服务器检测固定时长静音后发 EndOfUtterance → finalize() → FINAL。
|
||||
通过 end_of_utterance_silence_trigger 参数控制静音阈值(默认 0.5s,当前设 1.0s)。
|
||||
在 VoiceAgentClient 中有内置的 END_OF_UTTERANCE handler 自动调用 finalize()。
|
||||
官方文档: https://docs.speechmatics.com/speech-to-text/realtime/turn-detection
|
||||
|
||||
3. Speaker Diarization(说话人识别)
|
||||
- enable_diarization=True 开启后,每个 segment 带 speaker_id 和 is_active 标记
|
||||
- is_active=True 表示主要说话人(用户),is_active=False 表示被动说话人(如 TTS 回声)
|
||||
- 解决"说话人混淆"问题:Agent 不会把自己 TTS 的回声当成用户输入
|
||||
|
||||
4. Docker 部署注意
|
||||
- SPEECHMATICS_API_KEY 在服务器 .env 中配置,docker-compose.yml 传入容器
|
||||
- 每次改动 src/ 下文件后需 docker compose build voice-agent(注意 COPY 层缓存,
|
||||
如改动未生效需加 --no-cache)
|
||||
"""
|
||||
import logging
|
||||
|
||||
from livekit.plugins.speechmatics import STT, TurnDetectionMode
|
||||
|
||||
logger = logging.getLogger(__name__)

# Whisper language code -> Speechmatics language code.
# Speechmatics uses ISO 639-3 codes (e.g. "cmn") rather than ISO 639-1
# codes (e.g. "zh"). In this table only Mandarin needs translating; the
# other supported codes map to themselves.
_LANG_MAP = {
    "zh": "cmn",  # Mandarin Chinese
    # English, Japanese, Korean, German, French
    **{code: code for code in ("en", "ja", "ko", "de", "fr")},
}
|
||||
|
||||
|
||||
def create_speechmatics_stt(
    language: str = "cmn",
    *,
    silence_trigger_s: float = 1.0,
) -> STT:
    """Create a Speechmatics STT instance for the voice pipeline.

    Args:
        language: Language code (Whisper or Speechmatics). Whisper codes
            listed in ``_LANG_MAP`` (such as 'zh') are mapped to their
            Speechmatics equivalents; any other value is passed through
            unchanged.
        silence_trigger_s: Value forwarded to the plugin's
            ``end_of_utterance_silence_trigger`` option, i.e. the silence
            duration (in seconds) used by FIXED turn detection. Defaults to
            1.0, the value that was previously hard-coded.

    Returns:
        Configured speechmatics.STT instance with speaker diarization enabled.
    """
    sm_lang = _LANG_MAP.get(language, language)

    stt = STT(
        language=sm_lang,
        include_partials=True,
        # FIXED: the server emits FINAL_TRANSCRIPT once it has detected the
        # configured amount of silence. SMART_TURN chops continuous speech
        # into fragments, EXTERNAL requires a manual finalize, and ADAPTIVE
        # conflicts with LiveKit's own VAD.
        turn_detection_mode=TurnDetectionMode.FIXED,
        end_of_utterance_silence_trigger=silence_trigger_s,
        # Speaker diarization: tell the user's speech apart from TTS echo.
        enable_diarization=True,
    )

    # Work around LiveKit LanguageCode's automatic ISO 639-3 -> 639-1
    # normalisation: LanguageCode("cmn") becomes "zh", but Speechmatics only
    # accepts "cmn", so the option is overwritten after construction.
    stt._stt_options.language = sm_lang  # type: ignore[assignment]

    # Log the threshold actually in use rather than a literal, so the message
    # cannot drift from the configuration (renders "FIXED(1.0s)" by default).
    logger.info(
        "Speechmatics STT created: language=%s (input=%s), mode=FIXED(%ss), diarization=True",
        sm_lang,
        language,
        silence_trigger_s,
    )
    return stt
|
||||
Loading…
Reference in New Issue