refactor: remove Speechmatics STT integration entirely, default to OpenAI
- Delete speechmatics_stt.py plugin
- Remove speechmatics branch from voice-agent entrypoint
- Remove livekit-plugins-speechmatics dependency
- Change default stt_provider to 'openai' in entity, controller, and UI
- Remove SPEECHMATICS_API_KEY from docker-compose.yml
- Remove speechmatics option from web-admin settings dropdown

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
191ce2d6b3
commit
7fb0d1de95
|
|
@ -354,7 +354,6 @@ services:
|
||||||
- OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe}
|
- OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe}
|
||||||
- OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts}
|
- OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts}
|
||||||
- OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral}
|
- OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral}
|
||||||
- SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-}
|
|
||||||
depends_on:
|
depends_on:
|
||||||
livekit-server:
|
livekit-server:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|
|
||||||
|
|
@ -796,7 +796,6 @@ interface VoiceSettings {
|
||||||
}
|
}
|
||||||
|
|
||||||
const STT_PROVIDERS = [
|
const STT_PROVIDERS = [
|
||||||
{ value: 'speechmatics', labelKey: 'voice.providers.speechmatics' },
|
|
||||||
{ value: 'openai', labelKey: 'voice.providers.openai' },
|
{ value: 'openai', labelKey: 'voice.providers.openai' },
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|
@ -810,11 +809,11 @@ function VoiceSection() {
|
||||||
queryFn: () => apiClient<VoiceSettings>('/api/v1/agent/voice-config'),
|
queryFn: () => apiClient<VoiceSettings>('/api/v1/agent/voice-config'),
|
||||||
});
|
});
|
||||||
|
|
||||||
const [sttProvider, setSttProvider] = useState('speechmatics');
|
const [sttProvider, setSttProvider] = useState('openai');
|
||||||
const [initialized, setInitialized] = useState(false);
|
const [initialized, setInitialized] = useState(false);
|
||||||
|
|
||||||
if (data && !initialized) {
|
if (data && !initialized) {
|
||||||
setSttProvider(data.stt_provider || 'speechmatics');
|
setSttProvider(data.stt_provider || 'openai');
|
||||||
setInitialized(true);
|
setInitialized(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -74,7 +74,6 @@
|
||||||
"sttProvider": "Speech-to-Text Provider",
|
"sttProvider": "Speech-to-Text Provider",
|
||||||
"sttProviderHint": "Choose the speech recognition engine for voice sessions.",
|
"sttProviderHint": "Choose the speech recognition engine for voice sessions.",
|
||||||
"providers": {
|
"providers": {
|
||||||
"speechmatics": "Speechmatics (Default)",
|
|
||||||
"openai": "OpenAI (gpt-4o-transcribe)"
|
"openai": "OpenAI (gpt-4o-transcribe)"
|
||||||
},
|
},
|
||||||
"saved": "Voice settings saved."
|
"saved": "Voice settings saved."
|
||||||
|
|
|
||||||
|
|
@ -74,7 +74,6 @@
|
||||||
"sttProvider": "语音转文字引擎",
|
"sttProvider": "语音转文字引擎",
|
||||||
"sttProviderHint": "选择语音通话时使用的语音识别引擎。",
|
"sttProviderHint": "选择语音通话时使用的语音识别引擎。",
|
||||||
"providers": {
|
"providers": {
|
||||||
"speechmatics": "Speechmatics(默认)",
|
|
||||||
"openai": "OpenAI (gpt-4o-transcribe)"
|
"openai": "OpenAI (gpt-4o-transcribe)"
|
||||||
},
|
},
|
||||||
"saved": "语音设置已保存。"
|
"saved": "语音设置已保存。"
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
/**
|
/**
|
||||||
* Per-tenant voice configuration entity.
|
* Per-tenant voice configuration entity.
|
||||||
*
|
*
|
||||||
* Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai').
|
* Stores STT provider preference per tenant (e.g. 'openai').
|
||||||
* Queried by voice-agent at session start to select the appropriate STT engine.
|
* Queried by voice-agent at session start to select the appropriate STT engine.
|
||||||
*/
|
*/
|
||||||
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
|
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
|
||||||
|
|
@ -14,7 +14,7 @@ export class VoiceConfig {
|
||||||
@Column({ type: 'varchar', length: 20, unique: true })
|
@Column({ type: 'varchar', length: 20, unique: true })
|
||||||
tenantId!: string;
|
tenantId!: string;
|
||||||
|
|
||||||
@Column({ type: 'varchar', length: 30, default: 'speechmatics' })
|
@Column({ type: 'varchar', length: 30, default: 'openai' })
|
||||||
sttProvider!: string;
|
sttProvider!: string;
|
||||||
|
|
||||||
@CreateDateColumn({ type: 'timestamptz' })
|
@CreateDateColumn({ type: 'timestamptz' })
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ import { TenantId } from '@it0/common';
|
||||||
import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service';
|
import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service';
|
||||||
|
|
||||||
const DEFAULT_CONFIG = {
|
const DEFAULT_CONFIG = {
|
||||||
stt_provider: 'speechmatics',
|
stt_provider: 'openai',
|
||||||
};
|
};
|
||||||
|
|
||||||
@Controller('api/v1/agent/voice-config')
|
@Controller('api/v1/agent/voice-config')
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,6 @@ livekit>=1.0.0
|
||||||
livekit-agents>=1.0.0
|
livekit-agents>=1.0.0
|
||||||
livekit-plugins-silero>=1.0.0
|
livekit-plugins-silero>=1.0.0
|
||||||
livekit-plugins-openai>=1.0.0
|
livekit-plugins-openai>=1.0.0
|
||||||
livekit-plugins-speechmatics>=1.0.0
|
|
||||||
faster-whisper==1.2.1
|
faster-whisper==1.2.1
|
||||||
kokoro==0.3.5
|
kokoro==0.3.5
|
||||||
misaki[zh]==0.7.17
|
misaki[zh]==0.7.17
|
||||||
|
|
|
||||||
|
|
@ -257,9 +257,6 @@ async def entrypoint(ctx: JobContext) -> None:
|
||||||
"silence_duration_ms": 800,
|
"silence_duration_ms": 800,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
elif stt_provider == "speechmatics":
|
|
||||||
from .plugins.speechmatics_stt import create_speechmatics_stt
|
|
||||||
stt = create_speechmatics_stt(language=settings.whisper_language)
|
|
||||||
else:
|
else:
|
||||||
stt = LocalWhisperSTT(
|
stt = LocalWhisperSTT(
|
||||||
model=ctx.proc.userdata.get("whisper_model"),
|
model=ctx.proc.userdata.get("whisper_model"),
|
||||||
|
|
|
||||||
|
|
@ -1,101 +0,0 @@
|
||||||
"""
|
|
||||||
Speechmatics STT factory for voice-agent.
|
|
||||||
|
|
||||||
Creates a livekit-plugins-speechmatics STT instance configured for
|
|
||||||
Mandarin recognition with speaker diarization support.
|
|
||||||
|
|
||||||
The SPEECHMATICS_API_KEY environment variable is read automatically
|
|
||||||
by the livekit-plugins-speechmatics package.
|
|
||||||
|
|
||||||
===========================================================================
|
|
||||||
集成笔记 (2026-03-03)
|
|
||||||
===========================================================================
|
|
||||||
|
|
||||||
1. 语言码映射
|
|
||||||
- Speechmatics 使用 ISO 639-3 语言码,中文普通话为 "cmn"
|
|
||||||
- LiveKit 的 LanguageCode 类会自动将 "cmn" 归一化为 ISO 639-1 的 "zh"
|
|
||||||
(见 livekit/agents/_language_data.py: ISO_639_3_TO_1["cmn"] = "zh")
|
|
||||||
- 但 Speechmatics API 不接受 "zh",会报 "lang pack [zh] is not supported"
|
|
||||||
- 解决:构造 STT 后手动覆盖 stt._stt_options.language = "cmn"
|
|
||||||
|
|
||||||
2. Turn Detection 模式选择(关键!)
|
|
||||||
三种模式在 LiveKit 框架下的实际表现:
|
|
||||||
|
|
||||||
- EXTERNAL: 需要客户端手动调用 client.finalize() 才会产生 FINAL_TRANSCRIPT。
|
|
||||||
但 LiveKit agents 框架(v1.4.4)在 VAD 检测到说话结束后并不调用
|
|
||||||
stream.flush()(不发 FlushSentinel),而是推送静音帧 + 等待 FINAL 事件。
|
|
||||||
结果:只有 INTERIM_TRANSCRIPT,永远没有 FINAL → 框架 2 秒超时 → 用户无回复。
|
|
||||||
|
|
||||||
- ADAPTIVE: 使用 Speechmatics SDK 内置的 Silero VAD 做客户端转弯检测。
|
|
||||||
但 LiveKit 自己也有 Silero VAD 在运行,两个 VAD 冲突。
|
|
||||||
结果:零转写输出,完全静默。
|
|
||||||
|
|
||||||
- SMART_TURN: 服务器端智能转弯检测,但过于激进,会把连续语音切成碎片
|
|
||||||
(如"你好我是..."被切成"你好。"+"我是..."两个 FINAL),每个碎片触发 LLM 请求
|
|
||||||
导致前一个被 abort,实测不可用。
|
|
||||||
|
|
||||||
- FIXED(当前使用): 服务器检测固定时长静音后发 EndOfUtterance → finalize() → FINAL。
|
|
||||||
通过 end_of_utterance_silence_trigger 参数控制静音阈值(默认 0.5s,当前设 1.0s)。
|
|
||||||
在 VoiceAgentClient 中有内置的 END_OF_UTTERANCE handler 自动调用 finalize()。
|
|
||||||
官方文档: https://docs.speechmatics.com/speech-to-text/realtime/turn-detection
|
|
||||||
|
|
||||||
3. Speaker Diarization(说话人识别)
|
|
||||||
- enable_diarization=True 开启后,每个 segment 带 speaker_id 和 is_active 标记
|
|
||||||
- is_active=True 表示主要说话人(用户),is_active=False 表示被动说话人(如 TTS 回声)
|
|
||||||
- 解决"说话人混淆"问题:Agent 不会把自己 TTS 的回声当成用户输入
|
|
||||||
|
|
||||||
4. Docker 部署注意
|
|
||||||
- SPEECHMATICS_API_KEY 在服务器 .env 中配置,docker-compose.yml 传入容器
|
|
||||||
- 每次改动 src/ 下文件后需 docker compose build voice-agent(注意 COPY 层缓存,
|
|
||||||
如改动未生效需加 --no-cache)
|
|
||||||
"""
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from livekit.plugins.speechmatics import STT, TurnDetectionMode
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Mapping from Whisper language codes to Speechmatics language codes.
|
|
||||||
# Speechmatics uses ISO 639-3 codes (e.g. "cmn") rather than ISO 639-1 (e.g. "zh").
|
|
||||||
_LANG_MAP = {
|
|
||||||
"zh": "cmn", # 中文普通话
|
|
||||||
"en": "en", # 英语
|
|
||||||
"ja": "ja", # 日语
|
|
||||||
"ko": "ko", # 韩语
|
|
||||||
"de": "de", # 德语
|
|
||||||
"fr": "fr", # 法语
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def create_speechmatics_stt(language: str = "cmn") -> STT:
    """Build the Speechmatics STT engine used by the voice pipeline.

    Args:
        language: Whisper-style or Speechmatics-style language code. Codes
            found in ``_LANG_MAP`` (for example ``'zh'``) are translated to
            their Speechmatics equivalent; any other value is passed through
            unchanged.

    Returns:
        A ``speechmatics.STT`` instance configured with partial transcripts,
        FIXED turn detection (1.0 s silence trigger) and speaker diarization.
    """
    # Codes missing from the map are assumed to already be Speechmatics codes.
    target_language = _LANG_MAP.get(language, language)

    engine = STT(
        language=target_language,
        include_partials=True,
        # FIXED: the server emits FINAL_TRANSCRIPT after 1 second of silence.
        # The other modes were rejected: SMART_TURN chops continuous speech
        # into fragments, EXTERNAL requires a manual finalize, and ADAPTIVE
        # conflicts with LiveKit's own VAD.
        turn_detection_mode=TurnDetectionMode.FIXED,
        end_of_utterance_silence_trigger=1.0,
        # Speaker diarization: tells the user's speech apart from TTS echo.
        enable_diarization=True,
    )

    # Work around LiveKit's LanguageCode normalisation (ISO 639-3 -> 639-1):
    # LanguageCode("cmn") becomes "zh", but Speechmatics only accepts "cmn",
    # so the resolved code is written back onto the options after construction.
    engine._stt_options.language = target_language  # type: ignore[assignment]

    logger.info(
        "Speechmatics STT created: language=%s (input=%s), mode=FIXED(1.0s), diarization=True",
        target_language,
        language,
    )
    return engine
|
|
||||||
Loading…
Reference in New Issue