feat: add STT provider switching (OpenAI ↔ Speechmatics) in settings

- Add VoiceConfig entity/repo/service/controller in agent-service for per-tenant STT provider persistence (default: speechmatics)
- Add Speechmatics STT plugin in voice-agent with livekit-plugins-speechmatics
- Modify voice-agent entrypoint for 3-way STT selection: metadata > agent-service config > env var fallback
- Add "Voice" section in web-admin settings page with STT provider dropdown
- Add i18n translations (en/zh) for voice settings
- Add SPEECHMATICS_API_KEY env var in docker-compose

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 22:13:18 -08:00 · 2026-03-02 22:13:18 -08:00 · f9c47de04b
parent 7cb185e0cd
commit f9c47de04b
13 changed files with 302 additions and 8 deletions
--- a/deploy/docker/docker-compose.yml
+++ b/deploy/docker/docker-compose.yml
@ -354,6 +354,7 @@ services:
      - OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe}
      - OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts}
      - OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral}
+      - SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-}
    depends_on:
      livekit-server:
        condition: service_healthy
--- a/it0-web-admin/src/app/(admin)/settings/page.tsx
+++ b/it0-web-admin/src/app/(admin)/settings/page.tsx
@ -48,9 +48,9 @@ interface AccountInfo {
 /*  Constants                                                          */
 /* ------------------------------------------------------------------ */

-type SectionId = 'general' | 'notifications' | 'apikeys' | 'theme' | 'account';
+type SectionId = 'general' | 'notifications' | 'apikeys' | 'theme' | 'account' | 'voice';

-const SECTION_IDS: SectionId[] = ['general', 'notifications', 'apikeys', 'theme', 'account'];
+const SECTION_IDS: SectionId[] = ['general', 'notifications', 'apikeys', 'theme', 'account', 'voice'];

 const TIMEZONES = [
  'UTC', 'America/New_York', 'America/Chicago', 'America/Denver',
@ -116,6 +116,7 @@ export default function SettingsPage() {
          {activeSection === 'apikeys' && <ApiKeysSection />}
          {activeSection === 'theme' && <ThemeSection />}
          {activeSection === 'account' && <AccountSection />}
+          {activeSection === 'voice' && <VoiceSection />}
        </div>
      </div>
    </div>
@ -785,3 +786,88 @@ function AccountSection() {
    </div>
  );
 }
+
+/* ------------------------------------------------------------------ */
+/*  Voice Section                                                      */
+/* ------------------------------------------------------------------ */
+
+/** Payload read from and written to `/api/v1/agent/voice-config`. */
+interface VoiceSettings {
+  /** Identifier of the selected speech-to-text provider (e.g. 'speechmatics' or 'openai'). */
+  stt_provider: string;
+}
+
+/**
+ * Options rendered in the STT provider dropdown.
+ * `value` is sent to the API; `labelKey` is resolved through the 'settings' i18n namespace.
+ */
+const STT_PROVIDERS = [
+  { value: 'speechmatics', labelKey: 'voice.providers.speechmatics' },
+  { value: 'openai', labelKey: 'voice.providers.openai' },
+];
+
+/**
+ * Settings panel for choosing the speech-to-text (STT) provider.
+ *
+ * Loads the current value from `GET /api/v1/agent/voice-config`, copies it into
+ * local form state once, and persists the selection with `PUT` when the user
+ * presses save.
+ */
+function VoiceSection() {
+  const { t } = useTranslation('settings');
+  const { t: tc } = useTranslation('common');
+  const queryClient = useQueryClient();
+
+  const { data, isLoading } = useQuery<VoiceSettings>({
+    queryKey: queryKeys.settings.voice(),
+    queryFn: () => apiClient<VoiceSettings>('/api/v1/agent/voice-config'),
+  });
+
+  const [sttProvider, setSttProvider] = useState('speechmatics');
+  const [initialized, setInitialized] = useState(false);
+
+  // Seed the form from the server response exactly once. Later refetches
+  // (e.g. after invalidation) must not overwrite what the user is editing.
+  if (data && !initialized) {
+    // `||` (not `??`) on purpose: an empty string from the API also falls back.
+    setSttProvider(data.stt_provider || 'speechmatics');
+    setInitialized(true);
+  }
+
+  const mutation = useMutation({
+    mutationFn: (body: VoiceSettings) =>
+      apiClient('/api/v1/agent/voice-config', { method: 'PUT', body }),
+    onSuccess: () => queryClient.invalidateQueries({ queryKey: queryKeys.settings.all }),
+  });
+
+  /** Updates the selection and clears any stale saved/error banner from a previous save. */
+  const handleProviderChange = (nextProvider: string) => {
+    setSttProvider(nextProvider);
+    // Do not reset while a save is in flight, otherwise its outcome would be dropped.
+    if (!mutation.isPending) {
+      mutation.reset();
+    }
+  };
+
+  return (
+    <div className="bg-card border rounded-lg p-6">
+      <h2 className="text-lg font-semibold mb-4">{t('voice.title')}</h2>
+
+      {isLoading ? (
+        <p className="text-muted-foreground text-sm">{tc('loading')}</p>
+      ) : (
+        <div className="space-y-4 max-w-lg">
+          <div>
+            {/* htmlFor/id tie the label to the control for screen readers and click-to-focus. */}
+            <label htmlFor="voice-stt-provider" className="block text-sm font-medium mb-1">
+              {t('voice.sttProvider')}
+            </label>
+            <select
+              id="voice-stt-provider"
+              className="w-full border rounded-md px-3 py-2 bg-background text-sm"
+              value={sttProvider}
+              onChange={(e) => handleProviderChange(e.target.value)}
+            >
+              {STT_PROVIDERS.map((p) => (
+                <option key={p.value} value={p.value}>
+                  {t(p.labelKey)}
+                </option>
+              ))}
+            </select>
+            <p className="text-xs text-muted-foreground mt-1">
+              {t('voice.sttProviderHint')}
+            </p>
+          </div>
+
+          <button
+            type="button"
+            onClick={() => mutation.mutate({ stt_provider: sttProvider })}
+            disabled={mutation.isPending}
+            className="px-4 py-2 bg-primary text-primary-foreground rounded-md text-sm font-medium hover:opacity-90 disabled:opacity-50"
+          >
+            {mutation.isPending ? tc('saving') : tc('save')}
+          </button>
+
+          {mutation.isError && (
+            <p className="text-sm text-red-500">
+              {mutation.error instanceof Error ? mutation.error.message : String(mutation.error)}
+            </p>
+          )}
+          {mutation.isSuccess && (
+            <p className="text-sm text-green-600">{t('voice.saved')}</p>
+          )}
+        </div>
+      )}
+    </div>
+  );
+}
--- a/it0-web-admin/src/i18n/locales/en/settings.json
+++ b/it0-web-admin/src/i18n/locales/en/settings.json
@ -6,7 +6,8 @@
    "notifications": "Notifications",
    "apikeys": "API Keys",
    "theme": "Theme",
-    "account": "Account"
+    "account": "Account",
+    "voice": "Voice"
  },
  "general": {
    "title": "General Settings",
@ -68,6 +69,16 @@
    "passwordChanged": "Password changed successfully.",
    "changing": "Changing..."
  },
+  "voice": {
+    "title": "Voice Settings",
+    "sttProvider": "Speech-to-Text Provider",
+    "sttProviderHint": "Choose the speech recognition engine for voice sessions.",
+    "providers": {
+      "speechmatics": "Speechmatics (Default)",
+      "openai": "OpenAI (gpt-4o-transcribe)"
+    },
+    "saved": "Voice settings saved."
+  },
  "languages": {
    "en": "English",
    "zh": "中文",
--- a/it0-web-admin/src/i18n/locales/zh/settings.json
+++ b/it0-web-admin/src/i18n/locales/zh/settings.json
@ -6,7 +6,8 @@
    "notifications": "通知",
    "apikeys": "API 密钥",
    "theme": "主题",
-    "account": "账户"
+    "account": "账户",
+    "voice": "语音"
  },
  "general": {
    "title": "通用设置",
@ -68,6 +69,16 @@
    "passwordChanged": "密码修改成功。",
    "changing": "修改中..."
  },
+  "voice": {
+    "title": "语音设置",
+    "sttProvider": "语音转文字引擎",
+    "sttProviderHint": "选择语音通话时使用的语音识别引擎。",
+    "providers": {
+      "speechmatics": "Speechmatics（默认）",
+      "openai": "OpenAI (gpt-4o-transcribe)"
+    },
+    "saved": "语音设置已保存。"
+  },
  "languages": {
    "en": "English",
    "zh": "中文",
--- a/it0-web-admin/src/infrastructure/api/query-keys.ts
+++ b/it0-web-admin/src/infrastructure/api/query-keys.ts
@ -130,5 +130,6 @@ export const queryKeys = {
    apiKeys: () => [...queryKeys.settings.all, 'api-keys'] as const,
    theme: () => [...queryKeys.settings.all, 'theme'] as const,
    account: () => [...queryKeys.settings.all, 'account'] as const,
+    voice: () => [...queryKeys.settings.all, 'voice'] as const,
  },
 };
--- a/packages/services/agent-service/src/agent.module.ts
+++ b/packages/services/agent-service/src/agent.module.ts
@ -35,8 +35,12 @@ import { StandingOrderRef } from './domain/entities/standing-order.entity';
 import { TenantAgentConfig } from './domain/entities/tenant-agent-config.entity';
 import { AgentConfig } from './domain/entities/agent-config.entity';
 import { HookScript } from './domain/entities/hook-script.entity';
+import { VoiceConfig } from './domain/entities/voice-config.entity';
 import { ConversationMessage } from './domain/entities/conversation-message.entity';
 import { MessageRepository } from './infrastructure/repositories/message.repository';
+import { VoiceConfigRepository } from './infrastructure/repositories/voice-config.repository';
+import { VoiceConfigService } from './infrastructure/services/voice-config.service';
+import { VoiceConfigController } from './interfaces/rest/controllers/voice-config.controller';
 import { ConversationContextService } from './domain/services/conversation-context.service';

@Module({
@ -45,13 +49,13 @@ import { ConversationContextService } from './domain/services/conversation-conte
    DatabaseModule.forRoot(),
    TypeOrmModule.forFeature([
      AgentSession, AgentTask, CommandRecord, StandingOrderRef,
-      TenantAgentConfig, AgentConfig, HookScript,
+      TenantAgentConfig, AgentConfig, HookScript, VoiceConfig,
      ConversationMessage,
    ]),
  ],
  controllers: [
    AgentController, SessionController, RiskRulesController,
-    TenantAgentConfigController, AgentConfigController, SkillsController, HooksController,
+    TenantAgentConfigController, AgentConfigController, VoiceConfigController, SkillsController, HooksController,
  ],
  providers: [
    AgentStreamGateway,
@ -70,9 +74,11 @@ import { ConversationContextService } from './domain/services/conversation-conte
    MessageRepository,
    TenantAgentConfigRepository,
    AgentConfigRepository,
+    VoiceConfigRepository,
    HookScriptRepository,
    TenantAgentConfigService,
    AgentConfigService,
+    VoiceConfigService,
    AgentSkillService,
    HookScriptService,
  ],
--- a/packages/services/agent-service/src/domain/entities/voice-config.entity.ts
+++ b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts
@ -0,0 +1,25 @@
+/**
+ * Per-tenant voice configuration entity.
+ *
+ * Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai').
+ * Queried by voice-agent at session start to select the appropriate STT engine.
+ */
+import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
+
+/**
+ * TypeORM entity mapped to the `voice_configs` table.
+ * The unique constraint on `tenantId` means each tenant has at most one row.
+ */
+@Entity('voice_configs')
+export class VoiceConfig {
+  /** Surrogate primary key, generated as a UUID. */
+  @PrimaryGeneratedColumn('uuid')
+  id!: string;
+
+  /**
+   * Identifier of the owning tenant (unique across the table).
+   * NOTE(review): the column is limited to 20 characters — confirm tenant ids
+   * never exceed that (a 36-character UUID would not fit).
+   */
+  @Column({ type: 'varchar', length: 20, unique: true })
+  tenantId!: string;
+
+  /** Name of the selected STT provider; at most 30 characters, column default 'speechmatics'. */
+  @Column({ type: 'varchar', length: 30, default: 'speechmatics' })
+  sttProvider!: string;
+
+  /** Row creation time (timestamptz), populated through `@CreateDateColumn`. */
+  @CreateDateColumn({ type: 'timestamptz' })
+  createdAt!: Date;
+
+  /** Last modification time (timestamptz), populated through `@UpdateDateColumn`. */
+  @UpdateDateColumn({ type: 'timestamptz' })
+  updatedAt!: Date;
+}
--- a/packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts
+++ b/packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts
@ -0,0 +1,24 @@
+/**
+ * Repository for VoiceConfig.
+ * Uses standard TypeORM repository (no schema-per-tenant — uses tenantId column filter).
+ */
+import { Injectable } from '@nestjs/common';
+import { InjectRepository } from '@nestjs/typeorm';
+import { Repository } from 'typeorm';
+import { VoiceConfig } from '../../domain/entities/voice-config.entity';
+
+/** Thin data-access wrapper around the TypeORM repository for `VoiceConfig`. */
+@Injectable()
+export class VoiceConfigRepository {
+  constructor(
+    @InjectRepository(VoiceConfig)
+    private readonly repo: Repository<VoiceConfig>,
+  ) {}
+
+  /** Resolves to the row whose `tenantId` matches, or `null` when there is none. */
+  async findByTenantId(tenantId: string): Promise<VoiceConfig | null> {
+    const match = await this.repo.findOne({ where: { tenantId } });
+    return match;
+  }
+
+  /** Persists the entity and resolves to the saved instance. */
+  async save(entity: VoiceConfig): Promise<VoiceConfig> {
+    const persisted = await this.repo.save(entity);
+    return persisted;
+  }
+}
--- a/packages/services/agent-service/src/infrastructure/services/voice-config.service.ts
+++ b/packages/services/agent-service/src/infrastructure/services/voice-config.service.ts
@ -0,0 +1,31 @@
+/**
+ * Service for managing per-tenant voice configuration (STT provider selection).
+ */
+import { Injectable } from '@nestjs/common';
+import { VoiceConfigRepository } from '../repositories/voice-config.repository';
+import { VoiceConfig } from '../../domain/entities/voice-config.entity';
+
+/** Request body accepted when updating a tenant's voice configuration. */
+export interface UpdateVoiceConfigDto {
+  /** New STT provider name; when omitted, the stored provider is left untouched. */
+  stt_provider?: string;
+}
+
+/** Application service that reads and writes a tenant's voice configuration. */
+@Injectable()
+export class VoiceConfigService {
+  constructor(private readonly repo: VoiceConfigRepository) {}
+
+  /** Resolves to the stored config for `tenantId`, or `null` when none exists. */
+  async findByTenantId(tenantId: string): Promise<VoiceConfig | null> {
+    const stored = await this.repo.findByTenantId(tenantId);
+    return stored;
+  }
+
+  /**
+   * Creates the tenant's config if it does not exist yet, otherwise updates it.
+   * Only fields that are present on `dto` are written to the entity.
+   */
+  async upsert(tenantId: string, dto: UpdateVoiceConfigDto): Promise<VoiceConfig> {
+    const stored = await this.repo.findByTenantId(tenantId);
+    // Start from the existing row, or from a blank entity bound to this tenant.
+    const target = stored ?? Object.assign(new VoiceConfig(), { tenantId });
+
+    const { stt_provider: requestedProvider } = dto;
+    if (requestedProvider !== undefined) {
+      target.sttProvider = requestedProvider;
+    }
+
+    return this.repo.save(target);
+  }
+}
--- a/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
+++ b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
@ -0,0 +1,41 @@
+/**
+ * REST controller for per-tenant voice configuration (STT provider selection).
+ *
+ * Endpoints (JWT validated by Kong gateway):
+ *   GET  /api/v1/agent/voice-config  → Get current tenant's voice config
+ *   PUT  /api/v1/agent/voice-config  → Upsert voice config
+ */
+import { BadRequestException, Body, Controller, Get, Headers, Put } from '@nestjs/common';
+import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service';
+
+/** Response values used when no voice configuration has been stored for the caller. */
+const DEFAULT_CONFIG = {
+  stt_provider: 'speechmatics',
+};
+
+/**
+ * REST endpoints for reading and updating the voice configuration of the
+ * tenant identified by the `x-tenant-id` request header.
+ */
+@Controller('api/v1/agent/voice-config')
+export class VoiceConfigController {
+  /** Tenant key used when the request carries no `x-tenant-id` header. */
+  private static readonly FALLBACK_TENANT_ID = 'default';
+
+  /** Upper bound for `stt_provider`; mirrors the 30-character `sttProvider` column. */
+  private static readonly MAX_STT_PROVIDER_LENGTH = 30;
+
+  constructor(private readonly voiceConfigService: VoiceConfigService) {}
+
+  /**
+   * Returns the stored configuration, or the defaults when nothing is stored.
+   *
+   * A missing tenant header resolves to the same fallback key that PUT writes
+   * to, so a value saved without the header is also returned without it.
+   */
+  @Get()
+  async getConfig(@Headers('x-tenant-id') tenantId: string) {
+    const effectiveTenantId = tenantId || VoiceConfigController.FALLBACK_TENANT_ID;
+    const config = await this.voiceConfigService.findByTenantId(effectiveTenantId);
+    if (!config) return { ...DEFAULT_CONFIG, tenantId: effectiveTenantId };
+    return {
+      tenantId: config.tenantId,
+      stt_provider: config.sttProvider,
+    };
+  }
+
+  /**
+   * Creates or updates the configuration and returns the stored values.
+   *
+   * @throws BadRequestException when `stt_provider` is present but is not a
+   *   string or is longer than the column allows.
+   */
+  @Put()
+  async upsertConfig(
+    @Headers('x-tenant-id') tenantId: string,
+    @Body() dto: UpdateVoiceConfigDto,
+  ) {
+    const sttProvider = VoiceConfigController.parseSttProvider(dto);
+    const config = await this.voiceConfigService.upsert(
+      tenantId || VoiceConfigController.FALLBACK_TENANT_ID,
+      { stt_provider: sttProvider },
+    );
+    return {
+      tenantId: config.tenantId,
+      stt_provider: config.sttProvider,
+    };
+  }
+
+  /**
+   * Extracts and validates `stt_provider` from an untrusted request body.
+   * Rejecting bad input here turns what would otherwise surface as a
+   * database error into a 400 response. A missing body is treated like an
+   * empty one.
+   */
+  private static parseSttProvider(dto: UpdateVoiceConfigDto | undefined): string | undefined {
+    const raw: unknown = dto?.stt_provider;
+    if (raw === undefined) return undefined;
+    if (typeof raw !== 'string') {
+      throw new BadRequestException('stt_provider must be a string');
+    }
+    if (raw.length > VoiceConfigController.MAX_STT_PROVIDER_LENGTH) {
+      throw new BadRequestException(
+        `stt_provider must be at most ${VoiceConfigController.MAX_STT_PROVIDER_LENGTH} characters`,
+      );
+    }
+    return raw;
+  }
+}
--- a/packages/services/voice-agent/requirements.txt
+++ b/packages/services/voice-agent/requirements.txt
@ -2,6 +2,7 @@ livekit>=1.0.0
 livekit-agents>=1.0.0
 livekit-plugins-silero>=1.0.0
 livekit-plugins-openai>=1.0.0
+livekit-plugins-speechmatics>=1.0.0
 faster-whisper==1.2.1
 kokoro==0.3.5
 misaki[zh]==0.7.17
--- a/packages/services/voice-agent/src/agent.py
+++ b/packages/services/voice-agent/src/agent.py
@ -199,6 +199,7 @@ async def entrypoint(ctx: JobContext) -> None:
        tts_voice = settings.openai_tts_voice
        tts_style = ""
        engine_type = "claude_agent_sdk"
+        meta = {}
        try:
            meta_str = ctx.job.metadata or "{}"
            meta = json.loads(meta_str)
@ -212,8 +213,27 @@ async def entrypoint(ctx: JobContext) -> None:
        logger.info("Auth header present: %s, TTS: voice=%s, style=%s, engine=%s",
                    bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)", engine_type)

-        # Build STT
-        if settings.stt_provider == "openai":
+        # ── Resolve STT provider (metadata > agent-service config > env default) ──
+        stt_provider = meta.get("stt_provider", "")
+        if not stt_provider and auth_header:
+            try:
+                import httpx as _httpx_cfg
+                async with _httpx_cfg.AsyncClient(timeout=_httpx_cfg.Timeout(5)) as _cfg_client:
+                    _cfg_resp = await _cfg_client.get(
+                        f"{settings.agent_service_url}/api/v1/agent/voice-config",
+                        headers={"Authorization": auth_header},
+                    )
+                    if _cfg_resp.status_code == 200:
+                        _voice_cfg = _cfg_resp.json()
+                        stt_provider = _voice_cfg.get("stt_provider", "")
+                        logger.info("Voice config from agent-service: stt_provider=%s", stt_provider)
+            except Exception as e:
+                logger.warning("Failed to fetch voice config from agent-service: %s", e)
+        if not stt_provider:
+            stt_provider = settings.stt_provider  # env var fallback
+
+        # ── Build STT ──
+        if stt_provider == "openai":
            from livekit.plugins import openai as openai_plugin
            import httpx as _httpx
            import openai as _openai
@ -237,11 +257,15 @@ async def entrypoint(ctx: JobContext) -> None:
                    "silence_duration_ms": 800,
                },
            )
+        elif stt_provider == "speechmatics":
+            from .plugins.speechmatics_stt import create_speechmatics_stt
+            stt = create_speechmatics_stt(language=settings.whisper_language)
        else:
            stt = LocalWhisperSTT(
                model=ctx.proc.userdata.get("whisper_model"),
                language=settings.whisper_language,
            )
+        logger.info("STT provider selected: %s", stt_provider)

        # Build TTS
        if settings.tts_provider == "openai":
--- a/packages/services/voice-agent/src/plugins/speechmatics_stt.py
+++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
@ -0,0 +1,32 @@
+"""
+Speechmatics STT factory for voice-agent.
+
+Creates a livekit-plugins-speechmatics STT instance for the requested
+language (Mandarin, 'cmn', by default) with partial transcripts enabled.
+Pass 'cmn_en' for Mandarin-English bilingual recognition. Speaker
+diarization is not configured here.
+
+The SPEECHMATICS_API_KEY environment variable is read automatically
+by the livekit-plugins-speechmatics package.
+"""
+import logging
+
+from livekit.plugins import speechmatics
+
+logger = logging.getLogger(__name__)
+
+
+# Whisper-style language codes mapped to the Speechmatics code for the same
+# language. voice-agent passes its Whisper language setting to this factory,
+# and Whisper's "zh" is not the "cmn" code this module documents for Mandarin.
+# NOTE(review): extend this table if other Whisper codes are configured.
+_WHISPER_TO_SPEECHMATICS_LANGUAGE = {
+    "zh": "cmn",
+}
+
+
+def create_speechmatics_stt(language: str = "cmn") -> speechmatics.STT:
+    """Create a Speechmatics STT instance for the voice pipeline.
+
+    Args:
+        language: Speechmatics language code. Default 'cmn' for Mandarin Chinese.
+                  Use 'cmn_en' for Mandarin-English bilingual, 'en' for English.
+                  The Whisper code 'zh' is translated to 'cmn'; an empty or
+                  missing value falls back to 'cmn'.
+
+    Returns:
+        Configured speechmatics.STT instance with partial transcripts enabled.
+    """
+    requested = (language or "").strip()
+    # Look up aliases case-insensitively, but pass unknown codes through untouched.
+    resolved = _WHISPER_TO_SPEECHMATICS_LANGUAGE.get(requested.lower(), requested) or "cmn"
+    if resolved != language:
+        logger.info("Speechmatics language normalized: %r -> %r", language, resolved)
+
+    stt = speechmatics.STT(
+        language=resolved,
+        enable_partials=True,
+    )
+    logger.info("Speechmatics STT created: language=%s", resolved)
+    return stt