feat: add STT provider switching (OpenAI ↔ Speechmatics) in settings
- Add VoiceConfig entity/repo/service/controller in agent-service for per-tenant STT provider persistence (default: speechmatics)
- Add Speechmatics STT plugin in voice-agent with livekit-plugins-speechmatics
- Modify voice-agent entrypoint for 3-way STT selection: metadata > agent-service config > env var fallback
- Add "Voice" section in web-admin settings page with STT provider dropdown
- Add i18n translations (en/zh) for voice settings
- Add SPEECHMATICS_API_KEY env var in docker-compose

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
7cb185e0cd
commit
f9c47de04b
|
|
@ -354,6 +354,7 @@ services:
|
|||
- OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe}
|
||||
- OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts}
|
||||
- OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral}
|
||||
- SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-}
|
||||
depends_on:
|
||||
livekit-server:
|
||||
condition: service_healthy
|
||||
|
|
|
|||
|
|
@ -48,9 +48,9 @@ interface AccountInfo {
|
|||
/* Constants */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
type SectionId = 'general' | 'notifications' | 'apikeys' | 'theme' | 'account';
|
||||
type SectionId = 'general' | 'notifications' | 'apikeys' | 'theme' | 'account' | 'voice';
|
||||
|
||||
const SECTION_IDS: SectionId[] = ['general', 'notifications', 'apikeys', 'theme', 'account'];
|
||||
const SECTION_IDS: SectionId[] = ['general', 'notifications', 'apikeys', 'theme', 'account', 'voice'];
|
||||
|
||||
const TIMEZONES = [
|
||||
'UTC', 'America/New_York', 'America/Chicago', 'America/Denver',
|
||||
|
|
@ -116,6 +116,7 @@ export default function SettingsPage() {
|
|||
{activeSection === 'apikeys' && <ApiKeysSection />}
|
||||
{activeSection === 'theme' && <ThemeSection />}
|
||||
{activeSection === 'account' && <AccountSection />}
|
||||
{activeSection === 'voice' && <VoiceSection />}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
|
@ -785,3 +786,88 @@ function AccountSection() {
|
|||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Voice Section */
|
||||
/* ------------------------------------------------------------------ */
|
||||
|
||||
/**
 * Payload exchanged with `/api/v1/agent/voice-config`: used both as the
 * response type of the GET query and as the request body of the PUT mutation.
 */
interface VoiceSettings {
  /** Identifier of the speech-to-text provider, e.g. `'speechmatics'` or `'openai'`. */
  stt_provider: string;
}
|
||||
|
||||
/**
 * Speech-to-text providers offered in the dropdown, in display order.
 * Each entry pairs the value sent to the API with the i18n key of its label
 * (`voice.providers.<value>`).
 */
const STT_PROVIDERS = ['speechmatics', 'openai'].map((value) => ({
  value,
  labelKey: `voice.providers.${value}`,
}));
|
||||
|
||||
/**
 * Settings panel for choosing the speech-to-text provider.
 *
 * Reads the stored value from `GET /api/v1/agent/voice-config` and persists a
 * new one with `PUT` on the same path; a successful save invalidates every
 * query under `queryKeys.settings.all`.
 *
 * Only the user's unsaved pick is kept in component state. Until the dropdown
 * is touched, the displayed value is derived from the query result, so no
 * state has to be synchronised during render.
 */
function VoiceSection() {
  const { t } = useTranslation('settings');
  const { t: tc } = useTranslation('common');
  const queryClient = useQueryClient();

  const {
    data,
    isLoading,
    error: loadError,
  } = useQuery<VoiceSettings>({
    queryKey: queryKeys.settings.voice(),
    queryFn: () => apiClient<VoiceSettings>('/api/v1/agent/voice-config'),
  });

  // `null` means "the user has not changed the dropdown yet".
  const [draftProvider, setDraftProvider] = useState<string | null>(null);
  // `||` rather than `??` on purpose: an empty string from the API is also
  // treated as "no provider stored" and falls back to the default.
  const sttProvider = draftProvider ?? (data?.stt_provider || 'speechmatics');

  const mutation = useMutation({
    mutationFn: (body: VoiceSettings) =>
      apiClient('/api/v1/agent/voice-config', { method: 'PUT', body }),
    onSuccess: () => queryClient.invalidateQueries({ queryKey: queryKeys.settings.all }),
  });

  // Errors are typed loosely by the data layer; narrow instead of asserting.
  const describeError = (err: unknown): string =>
    err instanceof Error ? err.message : String(err);

  const handleProviderChange = (next: string) => {
    setDraftProvider(next);
    // A previous "saved"/error banner no longer describes the current selection.
    if (mutation.isSuccess || mutation.isError) mutation.reset();
  };

  // Ties the <label> to the <select> for assistive technology.
  const selectId = 'voice-stt-provider';

  // If loading failed we do not know the stored value, so do not let the
  // displayed default be saved unless the user explicitly picked something.
  const saveBlocked = mutation.isPending || (loadError != null && draftProvider === null);

  return (
    <div className="bg-card border rounded-lg p-6">
      <h2 className="text-lg font-semibold mb-4">{t('voice.title')}</h2>

      {isLoading ? (
        <p className="text-muted-foreground text-sm">{tc('loading')}</p>
      ) : (
        <div className="space-y-4 max-w-lg">
          {loadError != null && (
            <p className="text-sm text-red-500">{describeError(loadError)}</p>
          )}

          <div>
            <label htmlFor={selectId} className="block text-sm font-medium mb-1">
              {t('voice.sttProvider')}
            </label>
            <select
              id={selectId}
              className="w-full border rounded-md px-3 py-2 bg-background text-sm"
              value={sttProvider}
              onChange={(e) => handleProviderChange(e.target.value)}
            >
              {STT_PROVIDERS.map((p) => (
                <option key={p.value} value={p.value}>
                  {t(p.labelKey)}
                </option>
              ))}
            </select>
            <p className="text-xs text-muted-foreground mt-1">
              {t('voice.sttProviderHint')}
            </p>
          </div>

          <button
            type="button"
            onClick={() => mutation.mutate({ stt_provider: sttProvider })}
            disabled={saveBlocked}
            className="px-4 py-2 bg-primary text-primary-foreground rounded-md text-sm font-medium hover:opacity-90 disabled:opacity-50"
          >
            {mutation.isPending ? tc('saving') : tc('save')}
          </button>

          {mutation.isError && (
            <p className="text-sm text-red-500">{describeError(mutation.error)}</p>
          )}
          {mutation.isSuccess && (
            <p className="text-sm text-green-600">{t('voice.saved')}</p>
          )}
        </div>
      )}
    </div>
  );
}
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@
|
|||
"notifications": "Notifications",
|
||||
"apikeys": "API Keys",
|
||||
"theme": "Theme",
|
||||
"account": "Account"
|
||||
"account": "Account",
|
||||
"voice": "Voice"
|
||||
},
|
||||
"general": {
|
||||
"title": "General Settings",
|
||||
|
|
@ -68,6 +69,16 @@
|
|||
"passwordChanged": "Password changed successfully.",
|
||||
"changing": "Changing..."
|
||||
},
|
||||
"voice": {
|
||||
"title": "Voice Settings",
|
||||
"sttProvider": "Speech-to-Text Provider",
|
||||
"sttProviderHint": "Choose the speech recognition engine for voice sessions.",
|
||||
"providers": {
|
||||
"speechmatics": "Speechmatics (Default)",
|
||||
"openai": "OpenAI (gpt-4o-transcribe)"
|
||||
},
|
||||
"saved": "Voice settings saved."
|
||||
},
|
||||
"languages": {
|
||||
"en": "English",
|
||||
"zh": "中文",
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@
|
|||
"notifications": "通知",
|
||||
"apikeys": "API 密钥",
|
||||
"theme": "主题",
|
||||
"account": "账户"
|
||||
"account": "账户",
|
||||
"voice": "语音"
|
||||
},
|
||||
"general": {
|
||||
"title": "通用设置",
|
||||
|
|
@ -68,6 +69,16 @@
|
|||
"passwordChanged": "密码修改成功。",
|
||||
"changing": "修改中..."
|
||||
},
|
||||
"voice": {
|
||||
"title": "语音设置",
|
||||
"sttProvider": "语音转文字引擎",
|
||||
"sttProviderHint": "选择语音通话时使用的语音识别引擎。",
|
||||
"providers": {
|
||||
"speechmatics": "Speechmatics(默认)",
|
||||
"openai": "OpenAI (gpt-4o-transcribe)"
|
||||
},
|
||||
"saved": "语音设置已保存。"
|
||||
},
|
||||
"languages": {
|
||||
"en": "English",
|
||||
"zh": "中文",
|
||||
|
|
|
|||
|
|
@ -130,5 +130,6 @@ export const queryKeys = {
|
|||
apiKeys: () => [...queryKeys.settings.all, 'api-keys'] as const,
|
||||
theme: () => [...queryKeys.settings.all, 'theme'] as const,
|
||||
account: () => [...queryKeys.settings.all, 'account'] as const,
|
||||
voice: () => [...queryKeys.settings.all, 'voice'] as const,
|
||||
},
|
||||
};
|
||||
|
|
|
|||
|
|
@ -35,8 +35,12 @@ import { StandingOrderRef } from './domain/entities/standing-order.entity';
|
|||
import { TenantAgentConfig } from './domain/entities/tenant-agent-config.entity';
|
||||
import { AgentConfig } from './domain/entities/agent-config.entity';
|
||||
import { HookScript } from './domain/entities/hook-script.entity';
|
||||
import { VoiceConfig } from './domain/entities/voice-config.entity';
|
||||
import { ConversationMessage } from './domain/entities/conversation-message.entity';
|
||||
import { MessageRepository } from './infrastructure/repositories/message.repository';
|
||||
import { VoiceConfigRepository } from './infrastructure/repositories/voice-config.repository';
|
||||
import { VoiceConfigService } from './infrastructure/services/voice-config.service';
|
||||
import { VoiceConfigController } from './interfaces/rest/controllers/voice-config.controller';
|
||||
import { ConversationContextService } from './domain/services/conversation-context.service';
|
||||
|
||||
@Module({
|
||||
|
|
@ -45,13 +49,13 @@ import { ConversationContextService } from './domain/services/conversation-conte
|
|||
DatabaseModule.forRoot(),
|
||||
TypeOrmModule.forFeature([
|
||||
AgentSession, AgentTask, CommandRecord, StandingOrderRef,
|
||||
TenantAgentConfig, AgentConfig, HookScript,
|
||||
TenantAgentConfig, AgentConfig, HookScript, VoiceConfig,
|
||||
ConversationMessage,
|
||||
]),
|
||||
],
|
||||
controllers: [
|
||||
AgentController, SessionController, RiskRulesController,
|
||||
TenantAgentConfigController, AgentConfigController, SkillsController, HooksController,
|
||||
TenantAgentConfigController, AgentConfigController, VoiceConfigController, SkillsController, HooksController,
|
||||
],
|
||||
providers: [
|
||||
AgentStreamGateway,
|
||||
|
|
@ -70,9 +74,11 @@ import { ConversationContextService } from './domain/services/conversation-conte
|
|||
MessageRepository,
|
||||
TenantAgentConfigRepository,
|
||||
AgentConfigRepository,
|
||||
VoiceConfigRepository,
|
||||
HookScriptRepository,
|
||||
TenantAgentConfigService,
|
||||
AgentConfigService,
|
||||
VoiceConfigService,
|
||||
AgentSkillService,
|
||||
HookScriptService,
|
||||
],
|
||||
|
|
|
|||
|
|
@ -0,0 +1,25 @@
|
|||
/**
|
||||
* Per-tenant voice configuration entity.
|
||||
*
|
||||
* Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai').
|
||||
* Queried by voice-agent at session start to select the appropriate STT engine.
|
||||
*/
|
||||
import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
|
||||
|
||||
/**
 * Row of the `voice_configs` table: one speech-to-text preference per tenant.
 */
@Entity('voice_configs')
export class VoiceConfig {
  /** Surrogate primary key, generated as a UUID. */
  @PrimaryGeneratedColumn('uuid')
  id!: string;

  /**
   * Tenant this row belongs to; the unique constraint allows at most one row
   * per tenant.
   * NOTE(review): the column is limited to 20 characters — confirm that tenant
   * identifiers never exceed that (a UUID string would not fit).
   */
  @Column({ type: 'varchar', length: 20, unique: true })
  tenantId!: string;

  /** Selected STT provider identifier; the column defaults to `'speechmatics'`. */
  @Column({ type: 'varchar', length: 30, default: 'speechmatics' })
  sttProvider!: string;

  /** Creation timestamp (`timestamptz`), filled in by TypeORM. */
  @CreateDateColumn({ type: 'timestamptz' })
  createdAt!: Date;

  /** Last-modification timestamp (`timestamptz`), filled in by TypeORM. */
  @UpdateDateColumn({ type: 'timestamptz' })
  updatedAt!: Date;
}
|
||||
|
|
@ -0,0 +1,24 @@
|
|||
/**
|
||||
* Repository for VoiceConfig.
|
||||
* Uses standard TypeORM repository (no schema-per-tenant — uses tenantId column filter).
|
||||
*/
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { InjectRepository } from '@nestjs/typeorm';
|
||||
import { Repository } from 'typeorm';
|
||||
import { VoiceConfig } from '../../domain/entities/voice-config.entity';
|
||||
|
||||
/**
 * Thin data-access wrapper around the TypeORM repository for `VoiceConfig`.
 */
@Injectable()
export class VoiceConfigRepository {
  constructor(
    @InjectRepository(VoiceConfig)
    private readonly repo: Repository<VoiceConfig>,
  ) {}

  /**
   * Looks up the configuration row of one tenant.
   *
   * @param tenantId Tenant identifier to filter on.
   * @returns The matching row, or `null` when the tenant has none.
   */
  async findByTenantId(tenantId: string): Promise<VoiceConfig | null> {
    const criteria = { tenantId };
    const match = await this.repo.findOneBy(criteria);
    return match;
  }

  /**
   * Persists the given entity.
   *
   * @param entity Configuration to write.
   * @returns The entity as returned by TypeORM after saving.
   */
  async save(entity: VoiceConfig): Promise<VoiceConfig> {
    const persisted = await this.repo.save(entity);
    return persisted;
  }
}
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
/**
|
||||
* Service for managing per-tenant voice configuration (STT provider selection).
|
||||
*/
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { VoiceConfigRepository } from '../repositories/voice-config.repository';
|
||||
import { VoiceConfig } from '../../domain/entities/voice-config.entity';
|
||||
|
||||
/**
 * Body accepted when updating a tenant's voice configuration.
 * Fields that are left out are not modified.
 */
export interface UpdateVoiceConfigDto {
  /** New speech-to-text provider identifier; omit to keep the current one. */
  stt_provider?: string;
}
|
||||
|
||||
/**
 * Application service for reading and writing per-tenant voice configuration.
 */
@Injectable()
export class VoiceConfigService {
  constructor(private readonly repo: VoiceConfigRepository) {}

  /**
   * Returns the stored configuration of a tenant.
   *
   * @param tenantId Tenant identifier.
   * @returns The stored row, or `null` when nothing has been saved yet.
   */
  async findByTenantId(tenantId: string): Promise<VoiceConfig | null> {
    return this.repo.findByTenantId(tenantId);
  }

  /**
   * Creates the tenant's configuration if it is missing, applies the fields
   * present in `dto`, and saves the result.
   *
   * NOTE(review): the lookup and the save are two separate calls, so two
   * concurrent first-time writes for the same tenant could both try to
   * insert — confirm whether that needs handling.
   *
   * @param tenantId Tenant identifier.
   * @param dto Fields to change; `undefined` fields are left as they are.
   * @returns The saved configuration.
   */
  async upsert(tenantId: string, dto: UpdateVoiceConfigDto): Promise<VoiceConfig> {
    const existing = await this.repo.findByTenantId(tenantId);
    const target = existing ?? Object.assign(new VoiceConfig(), { tenantId });

    const { stt_provider: requestedProvider } = dto;
    if (requestedProvider !== undefined) {
      target.sttProvider = requestedProvider;
    }

    return this.repo.save(target);
  }
}
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* REST controller for per-tenant voice configuration (STT provider selection).
|
||||
*
|
||||
* Endpoints (JWT validated by Kong gateway):
|
||||
* GET /api/v1/agent/voice-config → Get current tenant's voice config
|
||||
* PUT /api/v1/agent/voice-config → Upsert voice config
|
||||
*/
|
||||
import { BadRequestException, Body, Controller, Get, Headers, Put } from '@nestjs/common';
import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service';
|
||||
|
||||
/** Response used when no configuration has been stored yet. */
const DEFAULT_CONFIG = {
  stt_provider: 'speechmatics',
};

/** Tenant key used when the request carries no `x-tenant-id` header. */
const FALLBACK_TENANT_ID = 'default';

/** Longest provider identifier accepted; matches the `varchar(30)` column. */
const MAX_STT_PROVIDER_LENGTH = 30;

/**
 * Extracts and validates `stt_provider` from an untrusted request body.
 *
 * @returns The provider string, or `undefined` when the field (or the whole
 *   body) is absent.
 * @throws BadRequestException when the field is present but is not a string
 *   or is longer than the column can store.
 */
function parseSttProvider(dto: UpdateVoiceConfigDto | undefined): string | undefined {
  const raw: unknown = dto?.stt_provider;
  if (raw === undefined) return undefined;
  if (typeof raw !== 'string' || raw.length > MAX_STT_PROVIDER_LENGTH) {
    throw new BadRequestException(
      `stt_provider must be a string of at most ${MAX_STT_PROVIDER_LENGTH} characters`,
    );
  }
  return raw;
}

/**
 * REST endpoints for the voice configuration of the calling tenant.
 *
 * The tenant is taken from the `x-tenant-id` header. When the header is
 * missing, both endpoints operate on the shared `'default'` tenant so that a
 * value written by PUT is the value returned by GET.
 * NOTE(review): callers that reach this service without the header all share
 * that one row — confirm this is the intended behaviour for such callers.
 */
@Controller('api/v1/agent/voice-config')
export class VoiceConfigController {
  constructor(private readonly voiceConfigService: VoiceConfigService) {}

  /**
   * Returns the stored configuration, or the defaults when nothing is stored.
   *
   * @param tenantId Value of the `x-tenant-id` header (may be empty).
   */
  @Get()
  async getConfig(@Headers('x-tenant-id') tenantId: string) {
    const config = await this.voiceConfigService.findByTenantId(tenantId || FALLBACK_TENANT_ID);
    if (!config) {
      // Keep the historical shape: `tenantId` is echoed only when it was sent.
      return tenantId ? { ...DEFAULT_CONFIG, tenantId } : { ...DEFAULT_CONFIG };
    }
    return {
      tenantId: config.tenantId,
      stt_provider: config.sttProvider,
    };
  }

  /**
   * Creates or updates the configuration of the calling tenant.
   *
   * @param tenantId Value of the `x-tenant-id` header (may be empty).
   * @param dto Request body; only `stt_provider` is read.
   * @throws BadRequestException when `stt_provider` is present but invalid.
   */
  @Put()
  async upsertConfig(
    @Headers('x-tenant-id') tenantId: string,
    @Body() dto: UpdateVoiceConfigDto,
  ) {
    const sttProvider = parseSttProvider(dto);
    // Pass a clean object so a missing body or extra fields never reach the service.
    const changes: UpdateVoiceConfigDto = sttProvider === undefined ? {} : { stt_provider: sttProvider };

    const config = await this.voiceConfigService.upsert(tenantId || FALLBACK_TENANT_ID, changes);
    return {
      tenantId: config.tenantId,
      stt_provider: config.sttProvider,
    };
  }
}
|
||||
|
|
@ -2,6 +2,7 @@ livekit>=1.0.0
|
|||
livekit-agents>=1.0.0
|
||||
livekit-plugins-silero>=1.0.0
|
||||
livekit-plugins-openai>=1.0.0
|
||||
livekit-plugins-speechmatics>=1.0.0
|
||||
faster-whisper==1.2.1
|
||||
kokoro==0.3.5
|
||||
misaki[zh]==0.7.17
|
||||
|
|
|
|||
|
|
@ -199,6 +199,7 @@ async def entrypoint(ctx: JobContext) -> None:
|
|||
tts_voice = settings.openai_tts_voice
|
||||
tts_style = ""
|
||||
engine_type = "claude_agent_sdk"
|
||||
meta = {}
|
||||
try:
|
||||
meta_str = ctx.job.metadata or "{}"
|
||||
meta = json.loads(meta_str)
|
||||
|
|
@ -212,8 +213,27 @@ async def entrypoint(ctx: JobContext) -> None:
|
|||
logger.info("Auth header present: %s, TTS: voice=%s, style=%s, engine=%s",
|
||||
bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)", engine_type)
|
||||
|
||||
# Build STT
|
||||
if settings.stt_provider == "openai":
|
||||
# ── Resolve STT provider (metadata > agent-service config > env default) ──
|
||||
stt_provider = meta.get("stt_provider", "")
|
||||
if not stt_provider and auth_header:
|
||||
try:
|
||||
import httpx as _httpx_cfg
|
||||
async with _httpx_cfg.AsyncClient(timeout=_httpx_cfg.Timeout(5)) as _cfg_client:
|
||||
_cfg_resp = await _cfg_client.get(
|
||||
f"{settings.agent_service_url}/api/v1/agent/voice-config",
|
||||
headers={"Authorization": auth_header},
|
||||
)
|
||||
if _cfg_resp.status_code == 200:
|
||||
_voice_cfg = _cfg_resp.json()
|
||||
stt_provider = _voice_cfg.get("stt_provider", "")
|
||||
logger.info("Voice config from agent-service: stt_provider=%s", stt_provider)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to fetch voice config from agent-service: %s", e)
|
||||
if not stt_provider:
|
||||
stt_provider = settings.stt_provider # env var fallback
|
||||
|
||||
# ── Build STT ──
|
||||
if stt_provider == "openai":
|
||||
from livekit.plugins import openai as openai_plugin
|
||||
import httpx as _httpx
|
||||
import openai as _openai
|
||||
|
|
@ -237,11 +257,15 @@ async def entrypoint(ctx: JobContext) -> None:
|
|||
"silence_duration_ms": 800,
|
||||
},
|
||||
)
|
||||
elif stt_provider == "speechmatics":
|
||||
from .plugins.speechmatics_stt import create_speechmatics_stt
|
||||
stt = create_speechmatics_stt(language=settings.whisper_language)
|
||||
else:
|
||||
stt = LocalWhisperSTT(
|
||||
model=ctx.proc.userdata.get("whisper_model"),
|
||||
language=settings.whisper_language,
|
||||
)
|
||||
logger.info("STT provider selected: %s", stt_provider)
|
||||
|
||||
# Build TTS
|
||||
if settings.tts_provider == "openai":
|
||||
|
|
|
|||
|
|
@ -0,0 +1,32 @@
|
|||
"""
|
||||
Speechmatics STT factory for voice-agent.

Creates a livekit-plugins-speechmatics STT instance for a given language
(Mandarin, 'cmn', by default) with partial transcripts enabled.

The SPEECHMATICS_API_KEY environment variable is read automatically
by the livekit-plugins-speechmatics package.
|
||||
"""
|
||||
import logging
|
||||
|
||||
from livekit.plugins import speechmatics
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Language codes used elsewhere in the voice pipeline (Whisper-style) that
# Speechmatics names differently.
_SPEECHMATICS_LANGUAGE_ALIASES = {"zh": "cmn"}


def create_speechmatics_stt(language: str = "cmn") -> speechmatics.STT:
    """Create a Speechmatics STT instance for the voice pipeline.

    The caller may hand over a Whisper-style code (the entrypoint passes its
    Whisper language setting), so known aliases are translated to the
    Speechmatics code and an empty value falls back to Mandarin.

    Args:
        language: Speechmatics language code. Default 'cmn' for Mandarin Chinese.
            Use 'cmn_en' for Mandarin-English bilingual, 'en' for English.
            'zh' is accepted and mapped to 'cmn'.

    Returns:
        Configured speechmatics.STT instance with partial transcripts enabled.
    """
    requested = (language or "").strip()
    # Alias lookup is case-insensitive; unknown codes are passed through as given.
    resolved = _SPEECHMATICS_LANGUAGE_ALIASES.get(requested.lower(), requested) or "cmn"
    if resolved != language:
        logger.info("Speechmatics language normalised: %r -> %r", language, resolved)

    stt = speechmatics.STT(
        language=resolved,
        enable_partials=True,
    )
    logger.info("Speechmatics STT created: language=%s", resolved)
    return stt
|
||||
Loading…
Reference in New Issue