From f9c47de04b0d91d352043fa13eb317c27a4ae15f Mon Sep 17 00:00:00 2001 From: hailin Date: Mon, 2 Mar 2026 22:13:18 -0800 Subject: [PATCH] =?UTF-8?q?feat:=20add=20STT=20provider=20switching=20(Ope?= =?UTF-8?q?nAI=20=E2=86=94=20Speechmatics)=20in=20settings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add VoiceConfig entity/repo/service/controller in agent-service for per-tenant STT provider persistence (default: speechmatics) - Add Speechmatics STT plugin in voice-agent with livekit-plugins-speechmatics - Modify voice-agent entrypoint for 3-way STT selection: metadata > agent-service config > env var fallback - Add "Voice" section in web-admin settings page with STT provider dropdown - Add i18n translations (en/zh) for voice settings - Add SPEECHMATICS_API_KEY env var in docker-compose Co-Authored-By: Claude Opus 4.6 --- deploy/docker/docker-compose.yml | 1 + .../src/app/(admin)/settings/page.tsx | 90 ++++++++++++++++++- .../src/i18n/locales/en/settings.json | 13 ++- .../src/i18n/locales/zh/settings.json | 13 ++- .../src/infrastructure/api/query-keys.ts | 1 + .../agent-service/src/agent.module.ts | 10 ++- .../domain/entities/voice-config.entity.ts | 25 ++++++ .../repositories/voice-config.repository.ts | 24 +++++ .../services/voice-config.service.ts | 31 +++++++ .../controllers/voice-config.controller.ts | 41 +++++++++ .../services/voice-agent/requirements.txt | 1 + packages/services/voice-agent/src/agent.py | 28 +++++- .../src/plugins/speechmatics_stt.py | 32 +++++++ 13 files changed, 302 insertions(+), 8 deletions(-) create mode 100644 packages/services/agent-service/src/domain/entities/voice-config.entity.ts create mode 100644 packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts create mode 100644 packages/services/agent-service/src/infrastructure/services/voice-config.service.ts create mode 100644 
packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts create mode 100644 packages/services/voice-agent/src/plugins/speechmatics_stt.py diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml index 1a69bc7..6462b41 100644 --- a/deploy/docker/docker-compose.yml +++ b/deploy/docker/docker-compose.yml @@ -354,6 +354,7 @@ services: - OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe} - OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts} - OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral} + - SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-} depends_on: livekit-server: condition: service_healthy diff --git a/it0-web-admin/src/app/(admin)/settings/page.tsx b/it0-web-admin/src/app/(admin)/settings/page.tsx index 25a01a6..a8d3cc4 100644 --- a/it0-web-admin/src/app/(admin)/settings/page.tsx +++ b/it0-web-admin/src/app/(admin)/settings/page.tsx @@ -48,9 +48,9 @@ interface AccountInfo { /* Constants */ /* ------------------------------------------------------------------ */ -type SectionId = 'general' | 'notifications' | 'apikeys' | 'theme' | 'account'; +type SectionId = 'general' | 'notifications' | 'apikeys' | 'theme' | 'account' | 'voice'; -const SECTION_IDS: SectionId[] = ['general', 'notifications', 'apikeys', 'theme', 'account']; +const SECTION_IDS: SectionId[] = ['general', 'notifications', 'apikeys', 'theme', 'account', 'voice']; const TIMEZONES = [ 'UTC', 'America/New_York', 'America/Chicago', 'America/Denver', @@ -116,6 +116,7 @@ export default function SettingsPage() { {activeSection === 'apikeys' && } {activeSection === 'theme' && } {activeSection === 'account' && } + {activeSection === 'voice' && } @@ -785,3 +786,88 @@ function AccountSection() { ); } + +/* ------------------------------------------------------------------ */ +/* Voice Section */ +/* ------------------------------------------------------------------ */ + +interface VoiceSettings { + stt_provider: string; +} + +const STT_PROVIDERS = [ + { 
value: 'speechmatics', labelKey: 'voice.providers.speechmatics' }, + { value: 'openai', labelKey: 'voice.providers.openai' }, +]; + +function VoiceSection() { + const { t } = useTranslation('settings'); + const { t: tc } = useTranslation('common'); + const queryClient = useQueryClient(); + + const { data, isLoading } = useQuery({ + queryKey: queryKeys.settings.voice(), + queryFn: () => apiClient('/api/v1/agent/voice-config'), + }); + + const [sttProvider, setSttProvider] = useState('speechmatics'); + const [initialized, setInitialized] = useState(false); + + if (data && !initialized) { + setSttProvider(data.stt_provider || 'speechmatics'); + setInitialized(true); + } + + const mutation = useMutation({ + mutationFn: (body: VoiceSettings) => + apiClient('/api/v1/agent/voice-config', { method: 'PUT', body }), + onSuccess: () => queryClient.invalidateQueries({ queryKey: queryKeys.settings.all }), + }); + + return ( +
+

{t('voice.title')}

+ + {isLoading ? ( +

{tc('loading')}

+ ) : ( +
+
+ + +

+ {t('voice.sttProviderHint')} +

+
+ + + + {mutation.isError && ( +

{(mutation.error as Error).message}

+ )} + {mutation.isSuccess && ( +

{t('voice.saved')}

+ )} +
+ )} +
+ ); +} diff --git a/it0-web-admin/src/i18n/locales/en/settings.json b/it0-web-admin/src/i18n/locales/en/settings.json index 8dc9234..5be604f 100644 --- a/it0-web-admin/src/i18n/locales/en/settings.json +++ b/it0-web-admin/src/i18n/locales/en/settings.json @@ -6,7 +6,8 @@ "notifications": "Notifications", "apikeys": "API Keys", "theme": "Theme", - "account": "Account" + "account": "Account", + "voice": "Voice" }, "general": { "title": "General Settings", @@ -68,6 +69,16 @@ "passwordChanged": "Password changed successfully.", "changing": "Changing..." }, + "voice": { + "title": "Voice Settings", + "sttProvider": "Speech-to-Text Provider", + "sttProviderHint": "Choose the speech recognition engine for voice sessions.", + "providers": { + "speechmatics": "Speechmatics (Default)", + "openai": "OpenAI (gpt-4o-transcribe)" + }, + "saved": "Voice settings saved." + }, "languages": { "en": "English", "zh": "中文", diff --git a/it0-web-admin/src/i18n/locales/zh/settings.json b/it0-web-admin/src/i18n/locales/zh/settings.json index c88800c..cdbed38 100644 --- a/it0-web-admin/src/i18n/locales/zh/settings.json +++ b/it0-web-admin/src/i18n/locales/zh/settings.json @@ -6,7 +6,8 @@ "notifications": "通知", "apikeys": "API 密钥", "theme": "主题", - "account": "账户" + "account": "账户", + "voice": "语音" }, "general": { "title": "通用设置", @@ -68,6 +69,16 @@ "passwordChanged": "密码修改成功。", "changing": "修改中..." 
}, + "voice": { + "title": "语音设置", + "sttProvider": "语音转文字引擎", + "sttProviderHint": "选择语音通话时使用的语音识别引擎。", + "providers": { + "speechmatics": "Speechmatics(默认)", + "openai": "OpenAI (gpt-4o-transcribe)" + }, + "saved": "语音设置已保存。" + }, "languages": { "en": "English", "zh": "中文", diff --git a/it0-web-admin/src/infrastructure/api/query-keys.ts b/it0-web-admin/src/infrastructure/api/query-keys.ts index 6f0b729..4287cde 100644 --- a/it0-web-admin/src/infrastructure/api/query-keys.ts +++ b/it0-web-admin/src/infrastructure/api/query-keys.ts @@ -130,5 +130,6 @@ export const queryKeys = { apiKeys: () => [...queryKeys.settings.all, 'api-keys'] as const, theme: () => [...queryKeys.settings.all, 'theme'] as const, account: () => [...queryKeys.settings.all, 'account'] as const, + voice: () => [...queryKeys.settings.all, 'voice'] as const, }, }; diff --git a/packages/services/agent-service/src/agent.module.ts b/packages/services/agent-service/src/agent.module.ts index b80d016..b32f678 100644 --- a/packages/services/agent-service/src/agent.module.ts +++ b/packages/services/agent-service/src/agent.module.ts @@ -35,8 +35,12 @@ import { StandingOrderRef } from './domain/entities/standing-order.entity'; import { TenantAgentConfig } from './domain/entities/tenant-agent-config.entity'; import { AgentConfig } from './domain/entities/agent-config.entity'; import { HookScript } from './domain/entities/hook-script.entity'; +import { VoiceConfig } from './domain/entities/voice-config.entity'; import { ConversationMessage } from './domain/entities/conversation-message.entity'; import { MessageRepository } from './infrastructure/repositories/message.repository'; +import { VoiceConfigRepository } from './infrastructure/repositories/voice-config.repository'; +import { VoiceConfigService } from './infrastructure/services/voice-config.service'; +import { VoiceConfigController } from './interfaces/rest/controllers/voice-config.controller'; import { ConversationContextService } from 
'./domain/services/conversation-context.service'; @Module({ @@ -45,13 +49,13 @@ import { ConversationContextService } from './domain/services/conversation-conte DatabaseModule.forRoot(), TypeOrmModule.forFeature([ AgentSession, AgentTask, CommandRecord, StandingOrderRef, - TenantAgentConfig, AgentConfig, HookScript, + TenantAgentConfig, AgentConfig, HookScript, VoiceConfig, ConversationMessage, ]), ], controllers: [ AgentController, SessionController, RiskRulesController, - TenantAgentConfigController, AgentConfigController, SkillsController, HooksController, + TenantAgentConfigController, AgentConfigController, VoiceConfigController, SkillsController, HooksController, ], providers: [ AgentStreamGateway, @@ -70,9 +74,11 @@ import { ConversationContextService } from './domain/services/conversation-conte MessageRepository, TenantAgentConfigRepository, AgentConfigRepository, + VoiceConfigRepository, HookScriptRepository, TenantAgentConfigService, AgentConfigService, + VoiceConfigService, AgentSkillService, HookScriptService, ], diff --git a/packages/services/agent-service/src/domain/entities/voice-config.entity.ts b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts new file mode 100644 index 0000000..f970f4b --- /dev/null +++ b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts @@ -0,0 +1,25 @@ +/** + * Per-tenant voice configuration entity. + * + * Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai'). + * Queried by voice-agent at session start to select the appropriate STT engine. 
+ */ +import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm'; + +@Entity('voice_configs') +export class VoiceConfig { + @PrimaryGeneratedColumn('uuid') + id!: string; + + @Column({ type: 'varchar', length: 20, unique: true }) + tenantId!: string; + + @Column({ type: 'varchar', length: 30, default: 'speechmatics' }) + sttProvider!: string; + + @CreateDateColumn({ type: 'timestamptz' }) + createdAt!: Date; + + @UpdateDateColumn({ type: 'timestamptz' }) + updatedAt!: Date; +} diff --git a/packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts b/packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts new file mode 100644 index 0000000..70e27cf --- /dev/null +++ b/packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts @@ -0,0 +1,24 @@ +/** + * Repository for VoiceConfig. + * Uses standard TypeORM repository (no schema-per-tenant — uses tenantId column filter). 
+ */ +import { Injectable } from '@nestjs/common'; +import { InjectRepository } from '@nestjs/typeorm'; +import { Repository } from 'typeorm'; +import { VoiceConfig } from '../../domain/entities/voice-config.entity'; + +@Injectable() +export class VoiceConfigRepository { + constructor( + @InjectRepository(VoiceConfig) + private readonly repo: Repository, + ) {} + + async findByTenantId(tenantId: string): Promise { + return this.repo.findOneBy({ tenantId }); + } + + async save(entity: VoiceConfig): Promise { + return this.repo.save(entity); + } +} diff --git a/packages/services/agent-service/src/infrastructure/services/voice-config.service.ts b/packages/services/agent-service/src/infrastructure/services/voice-config.service.ts new file mode 100644 index 0000000..3f0b061 --- /dev/null +++ b/packages/services/agent-service/src/infrastructure/services/voice-config.service.ts @@ -0,0 +1,31 @@ +/** + * Service for managing per-tenant voice configuration (STT provider selection). + */ +import { Injectable } from '@nestjs/common'; +import { VoiceConfigRepository } from '../repositories/voice-config.repository'; +import { VoiceConfig } from '../../domain/entities/voice-config.entity'; + +export interface UpdateVoiceConfigDto { + stt_provider?: string; +} + +@Injectable() +export class VoiceConfigService { + constructor(private readonly repo: VoiceConfigRepository) {} + + async findByTenantId(tenantId: string): Promise { + return this.repo.findByTenantId(tenantId); + } + + async upsert(tenantId: string, dto: UpdateVoiceConfigDto): Promise { + let config = await this.repo.findByTenantId(tenantId); + if (!config) { + config = new VoiceConfig(); + config.tenantId = tenantId; + } + if (dto.stt_provider !== undefined) { + config.sttProvider = dto.stt_provider; + } + return this.repo.save(config); + } +} diff --git a/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts 
b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts new file mode 100644 index 0000000..8d65c97 --- /dev/null +++ b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts @@ -0,0 +1,41 @@ +/** + * REST controller for per-tenant voice configuration (STT provider selection). + * + * Endpoints (JWT validated by Kong gateway): + * GET /api/v1/agent/voice-config → Get current tenant's voice config + * PUT /api/v1/agent/voice-config → Upsert voice config + */ +import { Controller, Get, Put, Body, Headers } from '@nestjs/common'; +import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service'; + +const DEFAULT_CONFIG = { + stt_provider: 'speechmatics', +}; + +@Controller('api/v1/agent/voice-config') +export class VoiceConfigController { + constructor(private readonly voiceConfigService: VoiceConfigService) {} + + @Get() + async getConfig(@Headers('x-tenant-id') tenantId: string) { + if (!tenantId) return DEFAULT_CONFIG; + const config = await this.voiceConfigService.findByTenantId(tenantId); + if (!config) return { ...DEFAULT_CONFIG, tenantId }; + return { + tenantId: config.tenantId, + stt_provider: config.sttProvider, + }; + } + + @Put() + async upsertConfig( + @Headers('x-tenant-id') tenantId: string, + @Body() dto: UpdateVoiceConfigDto, + ) { + const config = await this.voiceConfigService.upsert(tenantId || 'default', dto); + return { + tenantId: config.tenantId, + stt_provider: config.sttProvider, + }; + } +} diff --git a/packages/services/voice-agent/requirements.txt b/packages/services/voice-agent/requirements.txt index 79deeaf..5ddf39d 100644 --- a/packages/services/voice-agent/requirements.txt +++ b/packages/services/voice-agent/requirements.txt @@ -2,6 +2,7 @@ livekit>=1.0.0 livekit-agents>=1.0.0 livekit-plugins-silero>=1.0.0 livekit-plugins-openai>=1.0.0 +livekit-plugins-speechmatics>=1.0.0 faster-whisper==1.2.1 kokoro==0.3.5 
misaki[zh]==0.7.17 diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py index 2536c39..a637024 100644 --- a/packages/services/voice-agent/src/agent.py +++ b/packages/services/voice-agent/src/agent.py @@ -199,6 +199,7 @@ async def entrypoint(ctx: JobContext) -> None: tts_voice = settings.openai_tts_voice tts_style = "" engine_type = "claude_agent_sdk" + meta = {} try: meta_str = ctx.job.metadata or "{}" meta = json.loads(meta_str) @@ -212,8 +213,27 @@ async def entrypoint(ctx: JobContext) -> None: logger.info("Auth header present: %s, TTS: voice=%s, style=%s, engine=%s", bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)", engine_type) - # Build STT - if settings.stt_provider == "openai": + # ── Resolve STT provider (metadata > agent-service config > env default) ── + stt_provider = meta.get("stt_provider", "") + if not stt_provider and auth_header: + try: + import httpx as _httpx_cfg + async with _httpx_cfg.AsyncClient(timeout=_httpx_cfg.Timeout(5)) as _cfg_client: + _cfg_resp = await _cfg_client.get( + f"{settings.agent_service_url}/api/v1/agent/voice-config", + headers={"Authorization": auth_header}, + ) + if _cfg_resp.status_code == 200: + _voice_cfg = _cfg_resp.json() + stt_provider = _voice_cfg.get("stt_provider", "") + logger.info("Voice config from agent-service: stt_provider=%s", stt_provider) + except Exception as e: + logger.warning("Failed to fetch voice config from agent-service: %s", e) + if not stt_provider: + stt_provider = settings.stt_provider # env var fallback + + # ── Build STT ── + if stt_provider == "openai": from livekit.plugins import openai as openai_plugin import httpx as _httpx import openai as _openai @@ -237,11 +257,15 @@ async def entrypoint(ctx: JobContext) -> None: "silence_duration_ms": 800, }, ) + elif stt_provider == "speechmatics": + from .plugins.speechmatics_stt import create_speechmatics_stt + stt = 
create_speechmatics_stt(language=settings.whisper_language) else: stt = LocalWhisperSTT( model=ctx.proc.userdata.get("whisper_model"), language=settings.whisper_language, ) + logger.info("STT provider selected: %s", stt_provider) # Build TTS if settings.tts_provider == "openai": diff --git a/packages/services/voice-agent/src/plugins/speechmatics_stt.py b/packages/services/voice-agent/src/plugins/speechmatics_stt.py new file mode 100644 index 0000000..b81b886 --- /dev/null +++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py @@ -0,0 +1,32 @@ +""" +Speechmatics STT factory for voice-agent. + +Creates a livekit-plugins-speechmatics STT instance with partial transcripts +enabled for a caller-supplied language (default 'cmn'); diarization is not configured. + +The SPEECHMATICS_API_KEY environment variable is read automatically +by the livekit-plugins-speechmatics package. +""" +import logging + +from livekit.plugins import speechmatics + +logger = logging.getLogger(__name__) + + +def create_speechmatics_stt(language: str = "cmn") -> speechmatics.STT: + """Create a Speechmatics STT instance for the voice pipeline. + + Args: + language: Speechmatics language code. Default 'cmn' for Mandarin Chinese. + Use 'cmn_en' for Mandarin-English bilingual, 'en' for English. + + Returns: + Configured speechmatics.STT instance. + """ + stt = speechmatics.STT( + language=language, + enable_partials=True, + ) + logger.info("Speechmatics STT created: language=%s", language) + return stt