From f9c47de04b0d91d352043fa13eb317c27a4ae15f Mon Sep 17 00:00:00 2001
From: hailin <hailin.zhao@gdzx.xyz>
Date: Mon, 2 Mar 2026 22:13:18 -0800
Subject: [PATCH] =?UTF-8?q?feat:=20add=20STT=20provider=20switching=20(Ope?=
 =?UTF-8?q?nAI=20=E2=86=94=20Speechmatics)=20in=20settings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add VoiceConfig entity/repo/service/controller in agent-service
  for per-tenant STT provider persistence (default: speechmatics)
- Add Speechmatics STT plugin in voice-agent with livekit-plugins-speechmatics
- Modify voice-agent entrypoint for 3-way STT selection:
  metadata > agent-service config > env var fallback
- Add "Voice" section in web-admin settings page with STT provider dropdown
- Add i18n translations (en/zh) for voice settings
- Add SPEECHMATICS_API_KEY env var in docker-compose

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 deploy/docker/docker-compose.yml              |  1 +
 .../src/app/(admin)/settings/page.tsx         | 90 ++++++++++++++++++-
 .../src/i18n/locales/en/settings.json         | 13 ++-
 .../src/i18n/locales/zh/settings.json         | 13 ++-
 .../src/infrastructure/api/query-keys.ts      |  1 +
 .../agent-service/src/agent.module.ts         | 10 ++-
 .../domain/entities/voice-config.entity.ts    | 25 ++++++
 .../repositories/voice-config.repository.ts   | 24 +++++
 .../services/voice-config.service.ts          | 31 +++++++
 .../controllers/voice-config.controller.ts    | 41 +++++++++
 .../services/voice-agent/requirements.txt     |  1 +
 packages/services/voice-agent/src/agent.py    | 28 +++++-
 .../src/plugins/speechmatics_stt.py           | 32 +++++++
 13 files changed, 302 insertions(+), 8 deletions(-)
 create mode 100644 packages/services/agent-service/src/domain/entities/voice-config.entity.ts
 create mode 100644 packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts
 create mode 100644 packages/services/agent-service/src/infrastructure/services/voice-config.service.ts
 create mode 100644 packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
 create mode 100644 packages/services/voice-agent/src/plugins/speechmatics_stt.py
diff --git a/deploy/docker/docker-compose.yml b/deploy/docker/docker-compose.yml
index 1a69bc7..6462b41 100644
--- a/deploy/docker/docker-compose.yml
+++ b/deploy/docker/docker-compose.yml
@@ -354,6 +354,7 @@ services:
       - OPENAI_STT_MODEL=${OPENAI_STT_MODEL:-gpt-4o-transcribe}
       - OPENAI_TTS_MODEL=${OPENAI_TTS_MODEL:-gpt-4o-mini-tts}
       - OPENAI_TTS_VOICE=${OPENAI_TTS_VOICE:-coral}
+      - SPEECHMATICS_API_KEY=${SPEECHMATICS_API_KEY:-}
     depends_on:
       livekit-server:
         condition: service_healthy
diff --git a/it0-web-admin/src/app/(admin)/settings/page.tsx b/it0-web-admin/src/app/(admin)/settings/page.tsx
index 25a01a6..a8d3cc4 100644
--- a/it0-web-admin/src/app/(admin)/settings/page.tsx
+++ b/it0-web-admin/src/app/(admin)/settings/page.tsx
@@ -48,9 +48,9 @@ interface AccountInfo {
 /*  Constants                                                          */
 /* ------------------------------------------------------------------ */
 
-type SectionId = 'general' | 'notifications' | 'apikeys' | 'theme' | 'account';
+type SectionId = 'general' | 'notifications' | 'apikeys' | 'theme' | 'account' | 'voice';
 
-const SECTION_IDS: SectionId[] = ['general', 'notifications', 'apikeys', 'theme', 'account'];
+const SECTION_IDS: SectionId[] = ['general', 'notifications', 'apikeys', 'theme', 'account', 'voice'];
 
 const TIMEZONES = [
   'UTC', 'America/New_York', 'America/Chicago', 'America/Denver',
@@ -116,6 +116,7 @@ export default function SettingsPage() {
           {activeSection === 'apikeys' && <ApiKeysSection />}
           {activeSection === 'theme' && <ThemeSection />}
           {activeSection === 'account' && <AccountSection />}
+          {activeSection === 'voice' && <VoiceSection />}
         </div>
       </div>
     </div>
@@ -785,3 +786,88 @@ function AccountSection() {
     </div>
   );
 }
+
+/* ------------------------------------------------------------------ */
+/*  Voice Section                                                      */
+/* ------------------------------------------------------------------ */
+
+interface VoiceSettings {
+  stt_provider: string;
+}
+
+const STT_PROVIDERS = [
+  { value: 'speechmatics', labelKey: 'voice.providers.speechmatics' },
+  { value: 'openai', labelKey: 'voice.providers.openai' },
+];
+
+function VoiceSection() {
+  const { t } = useTranslation('settings');
+  const { t: tc } = useTranslation('common');
+  const queryClient = useQueryClient();
+
+  const { data, isLoading } = useQuery<VoiceSettings>({
+    queryKey: queryKeys.settings.voice(),
+    queryFn: () => apiClient<VoiceSettings>('/api/v1/agent/voice-config'),
+  });
+
+  const [sttProvider, setSttProvider] = useState('speechmatics');
+  const [initialized, setInitialized] = useState(false);
+
+  if (data && !initialized) {
+    setSttProvider(data.stt_provider || 'speechmatics');
+    setInitialized(true);
+  }
+
+  const mutation = useMutation({
+    mutationFn: (body: VoiceSettings) =>
+      apiClient('/api/v1/agent/voice-config', { method: 'PUT', body }),
+    onSuccess: () => queryClient.invalidateQueries({ queryKey: queryKeys.settings.all }),
+  });
+
+  return (
+    <div className="bg-card border rounded-lg p-6">
+      <h2 className="text-lg font-semibold mb-4">{t('voice.title')}</h2>
+
+      {isLoading ? (
+        <p className="text-muted-foreground text-sm">{tc('loading')}</p>
+      ) : (
+        <div className="space-y-4 max-w-lg">
+          <div>
+            <label className="block text-sm font-medium mb-1">
+              {t('voice.sttProvider')}
+            </label>
+            <select
+              className="w-full border rounded-md px-3 py-2 bg-background text-sm"
+              value={sttProvider}
+              onChange={(e) => setSttProvider(e.target.value)}
+            >
+              {STT_PROVIDERS.map((p) => (
+                <option key={p.value} value={p.value}>
+                  {t(p.labelKey)}
+                </option>
+              ))}
+            </select>
+            <p className="text-xs text-muted-foreground mt-1">
+              {t('voice.sttProviderHint')}
+            </p>
+          </div>
+
+          <button
+            onClick={() => mutation.mutate({ stt_provider: sttProvider })}
+            disabled={mutation.isPending}
+            className="px-4 py-2 bg-primary text-primary-foreground rounded-md text-sm font-medium hover:opacity-90 disabled:opacity-50"
+          >
+            {mutation.isPending ? tc('saving') : tc('save')}
+          </button>
+
+          {mutation.isError && (
+            <p className="text-sm text-red-500">{(mutation.error as Error).message}</p>
+          )}
+          {mutation.isSuccess && (
+            <p className="text-sm text-green-600">{t('voice.saved')}</p>
+          )}
+        </div>
+      )}
+    </div>
+  );
+}
diff --git a/it0-web-admin/src/i18n/locales/en/settings.json b/it0-web-admin/src/i18n/locales/en/settings.json
index 8dc9234..5be604f 100644
--- a/it0-web-admin/src/i18n/locales/en/settings.json
+++ b/it0-web-admin/src/i18n/locales/en/settings.json
@@ -6,7 +6,8 @@
     "notifications": "Notifications",
     "apikeys": "API Keys",
     "theme": "Theme",
-    "account": "Account"
+    "account": "Account",
+    "voice": "Voice"
   },
   "general": {
     "title": "General Settings",
@@ -68,6 +69,16 @@
     "passwordChanged": "Password changed successfully.",
     "changing": "Changing..."
   },
+  "voice": {
+    "title": "Voice Settings",
+    "sttProvider": "Speech-to-Text Provider",
+    "sttProviderHint": "Choose the speech recognition engine for voice sessions.",
+    "providers": {
+      "speechmatics": "Speechmatics (Default)",
+      "openai": "OpenAI (gpt-4o-transcribe)"
+    },
+    "saved": "Voice settings saved."
+  },
   "languages": {
     "en": "English",
     "zh": "中文",
diff --git a/it0-web-admin/src/i18n/locales/zh/settings.json b/it0-web-admin/src/i18n/locales/zh/settings.json
index c88800c..cdbed38 100644
--- a/it0-web-admin/src/i18n/locales/zh/settings.json
+++ b/it0-web-admin/src/i18n/locales/zh/settings.json
@@ -6,7 +6,8 @@
     "notifications": "通知",
     "apikeys": "API 密钥",
     "theme": "主题",
-    "account": "账户"
+    "account": "账户",
+    "voice": "语音"
   },
   "general": {
     "title": "通用设置",
@@ -68,6 +69,16 @@
     "passwordChanged": "密码修改成功。",
     "changing": "修改中..."
   },
+  "voice": {
+    "title": "语音设置",
+    "sttProvider": "语音转文字引擎",
+    "sttProviderHint": "选择语音通话时使用的语音识别引擎。",
+    "providers": {
+      "speechmatics": "Speechmatics（默认）",
+      "openai": "OpenAI (gpt-4o-transcribe)"
+    },
+    "saved": "语音设置已保存。"
+  },
   "languages": {
     "en": "English",
     "zh": "中文",
diff --git a/it0-web-admin/src/infrastructure/api/query-keys.ts b/it0-web-admin/src/infrastructure/api/query-keys.ts
index 6f0b729..4287cde 100644
--- a/it0-web-admin/src/infrastructure/api/query-keys.ts
+++ b/it0-web-admin/src/infrastructure/api/query-keys.ts
@@ -130,5 +130,6 @@ export const queryKeys = {
     apiKeys: () => [...queryKeys.settings.all, 'api-keys'] as const,
     theme: () => [...queryKeys.settings.all, 'theme'] as const,
     account: () => [...queryKeys.settings.all, 'account'] as const,
+    voice: () => [...queryKeys.settings.all, 'voice'] as const,
   },
 };
diff --git a/packages/services/agent-service/src/agent.module.ts b/packages/services/agent-service/src/agent.module.ts
index b80d016..b32f678 100644
--- a/packages/services/agent-service/src/agent.module.ts
+++ b/packages/services/agent-service/src/agent.module.ts
@@ -35,8 +35,12 @@ import { StandingOrderRef } from './domain/entities/standing-order.entity';
 import { TenantAgentConfig } from './domain/entities/tenant-agent-config.entity';
 import { AgentConfig } from './domain/entities/agent-config.entity';
 import { HookScript } from './domain/entities/hook-script.entity';
+import { VoiceConfig } from './domain/entities/voice-config.entity';
 import { ConversationMessage } from './domain/entities/conversation-message.entity';
 import { MessageRepository } from './infrastructure/repositories/message.repository';
+import { VoiceConfigRepository } from './infrastructure/repositories/voice-config.repository';
+import { VoiceConfigService } from './infrastructure/services/voice-config.service';
+import { VoiceConfigController } from './interfaces/rest/controllers/voice-config.controller';
 import { ConversationContextService } from './domain/services/conversation-context.service';
 
 @Module({
@@ -45,13 +49,13 @@ import { ConversationContextService } from './domain/services/conversation-conte
     DatabaseModule.forRoot(),
     TypeOrmModule.forFeature([
       AgentSession, AgentTask, CommandRecord, StandingOrderRef,
-      TenantAgentConfig, AgentConfig, HookScript,
+      TenantAgentConfig, AgentConfig, HookScript, VoiceConfig,
       ConversationMessage,
     ]),
   ],
   controllers: [
     AgentController, SessionController, RiskRulesController,
-    TenantAgentConfigController, AgentConfigController, SkillsController, HooksController,
+    TenantAgentConfigController, AgentConfigController, VoiceConfigController, SkillsController, HooksController,
   ],
   providers: [
     AgentStreamGateway,
@@ -70,9 +74,11 @@ import { ConversationContextService } from './domain/services/conversation-conte
     MessageRepository,
     TenantAgentConfigRepository,
     AgentConfigRepository,
+    VoiceConfigRepository,
     HookScriptRepository,
     TenantAgentConfigService,
     AgentConfigService,
+    VoiceConfigService,
     AgentSkillService,
     HookScriptService,
   ],
diff --git a/packages/services/agent-service/src/domain/entities/voice-config.entity.ts b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts
new file mode 100644
index 0000000..f970f4b
--- /dev/null
+++ b/packages/services/agent-service/src/domain/entities/voice-config.entity.ts
@@ -0,0 +1,25 @@
+/**
+ * Per-tenant voice configuration entity.
+ *
+ * Stores STT provider preference per tenant (e.g. 'speechmatics' or 'openai').
+ * Queried by voice-agent at session start to select the appropriate STT engine.
+ */
+import { Entity, PrimaryGeneratedColumn, Column, CreateDateColumn, UpdateDateColumn } from 'typeorm';
+
+@Entity('voice_configs')
+export class VoiceConfig {
+  @PrimaryGeneratedColumn('uuid')
+  id!: string;
+
+  @Column({ type: 'varchar', length: 20, unique: true })
+  tenantId!: string;
+
+  @Column({ type: 'varchar', length: 30, default: 'speechmatics' })
+  sttProvider!: string;
+
+  @CreateDateColumn({ type: 'timestamptz' })
+  createdAt!: Date;
+
+  @UpdateDateColumn({ type: 'timestamptz' })
+  updatedAt!: Date;
+}
diff --git a/packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts b/packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts
new file mode 100644
index 0000000..70e27cf
--- /dev/null
+++ b/packages/services/agent-service/src/infrastructure/repositories/voice-config.repository.ts
@@ -0,0 +1,24 @@
+/**
+ * Repository for VoiceConfig.
+ * Uses standard TypeORM repository (no schema-per-tenant — uses tenantId column filter).
+ */
+import { Injectable } from '@nestjs/common';
+import { InjectRepository } from '@nestjs/typeorm';
+import { Repository } from 'typeorm';
+import { VoiceConfig } from '../../domain/entities/voice-config.entity';
+
+@Injectable()
+export class VoiceConfigRepository {
+  constructor(
+    @InjectRepository(VoiceConfig)
+    private readonly repo: Repository<VoiceConfig>,
+  ) {}
+
+  async findByTenantId(tenantId: string): Promise<VoiceConfig | null> {
+    return this.repo.findOneBy({ tenantId });
+  }
+
+  async save(entity: VoiceConfig): Promise<VoiceConfig> {
+    return this.repo.save(entity);
+  }
+}
diff --git a/packages/services/agent-service/src/infrastructure/services/voice-config.service.ts b/packages/services/agent-service/src/infrastructure/services/voice-config.service.ts
new file mode 100644
index 0000000..3f0b061
--- /dev/null
+++ b/packages/services/agent-service/src/infrastructure/services/voice-config.service.ts
@@ -0,0 +1,31 @@
+/**
+ * Service for managing per-tenant voice configuration (STT provider selection).
+ */
+import { Injectable } from '@nestjs/common';
+import { VoiceConfigRepository } from '../repositories/voice-config.repository';
+import { VoiceConfig } from '../../domain/entities/voice-config.entity';
+
+export interface UpdateVoiceConfigDto {
+  stt_provider?: string;
+}
+
+@Injectable()
+export class VoiceConfigService {
+  constructor(private readonly repo: VoiceConfigRepository) {}
+
+  async findByTenantId(tenantId: string): Promise<VoiceConfig | null> {
+    return this.repo.findByTenantId(tenantId);
+  }
+
+  async upsert(tenantId: string, dto: UpdateVoiceConfigDto): Promise<VoiceConfig> {
+    let config = await this.repo.findByTenantId(tenantId);
+    if (!config) {
+      config = new VoiceConfig();
+      config.tenantId = tenantId;
+    }
+    if (dto.stt_provider !== undefined) {
+      config.sttProvider = dto.stt_provider;
+    }
+    return this.repo.save(config);
+  }
+}
diff --git a/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
new file mode 100644
index 0000000..8d65c97
--- /dev/null
+++ b/packages/services/agent-service/src/interfaces/rest/controllers/voice-config.controller.ts
@@ -0,0 +1,41 @@
+/**
+ * REST controller for per-tenant voice configuration (STT provider selection).
+ *
+ * Endpoints (JWT validated by Kong gateway):
+ *   GET  /api/v1/agent/voice-config  → Get current tenant's voice config
+ *   PUT  /api/v1/agent/voice-config  → Upsert voice config
+ */
+import { Controller, Get, Put, Body, Headers } from '@nestjs/common';
+import { VoiceConfigService, UpdateVoiceConfigDto } from '../../../infrastructure/services/voice-config.service';
+
+const DEFAULT_CONFIG = {
+  stt_provider: 'speechmatics',
+};
+
+@Controller('api/v1/agent/voice-config')
+export class VoiceConfigController {
+  constructor(private readonly voiceConfigService: VoiceConfigService) {}
+
+  @Get()
+  async getConfig(@Headers('x-tenant-id') tenantId: string) {
+    if (!tenantId) return DEFAULT_CONFIG;
+    const config = await this.voiceConfigService.findByTenantId(tenantId);
+    if (!config) return { ...DEFAULT_CONFIG, tenantId };
+    return {
+      tenantId: config.tenantId,
+      stt_provider: config.sttProvider,
+    };
+  }
+
+  @Put()
+  async upsertConfig(
+    @Headers('x-tenant-id') tenantId: string,
+    @Body() dto: UpdateVoiceConfigDto,
+  ) {
+    const config = await this.voiceConfigService.upsert(tenantId || 'default', dto);
+    return {
+      tenantId: config.tenantId,
+      stt_provider: config.sttProvider,
+    };
+  }
+}
diff --git a/packages/services/voice-agent/requirements.txt b/packages/services/voice-agent/requirements.txt
index 79deeaf..5ddf39d 100644
--- a/packages/services/voice-agent/requirements.txt
+++ b/packages/services/voice-agent/requirements.txt
@@ -2,6 +2,7 @@ livekit>=1.0.0
 livekit-agents>=1.0.0
 livekit-plugins-silero>=1.0.0
 livekit-plugins-openai>=1.0.0
+livekit-plugins-speechmatics>=1.0.0
 faster-whisper==1.2.1
 kokoro==0.3.5
 misaki[zh]==0.7.17
diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py
index 2536c39..a637024 100644
--- a/packages/services/voice-agent/src/agent.py
+++ b/packages/services/voice-agent/src/agent.py
@@ -199,6 +199,7 @@ async def entrypoint(ctx: JobContext) -> None:
         tts_voice = settings.openai_tts_voice
         tts_style = ""
         engine_type = "claude_agent_sdk"
+        meta = {}
         try:
             meta_str = ctx.job.metadata or "{}"
             meta = json.loads(meta_str)
@@ -212,8 +213,27 @@ async def entrypoint(ctx: JobContext) -> None:
         logger.info("Auth header present: %s, TTS: voice=%s, style=%s, engine=%s",
                     bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)", engine_type)
 
-        # Build STT
-        if settings.stt_provider == "openai":
+        # ── Resolve STT provider (metadata > agent-service config > env default) ──
+        stt_provider = meta.get("stt_provider", "")
+        if not stt_provider and auth_header:
+            try:
+                import httpx as _httpx_cfg
+                async with _httpx_cfg.AsyncClient(timeout=_httpx_cfg.Timeout(5)) as _cfg_client:
+                    _cfg_resp = await _cfg_client.get(
+                        f"{settings.agent_service_url}/api/v1/agent/voice-config",
+                        headers={"Authorization": auth_header},
+                    )
+                    if _cfg_resp.status_code == 200:
+                        _voice_cfg = _cfg_resp.json()
+                        stt_provider = _voice_cfg.get("stt_provider", "")
+                        logger.info("Voice config from agent-service: stt_provider=%s", stt_provider)
+            except Exception as e:
+                logger.warning("Failed to fetch voice config from agent-service: %s", e)
+        if not stt_provider:
+            stt_provider = settings.stt_provider  # env var fallback
+
+        # ── Build STT ──
+        if stt_provider == "openai":
             from livekit.plugins import openai as openai_plugin
             import httpx as _httpx
             import openai as _openai
@@ -237,11 +257,15 @@ async def entrypoint(ctx: JobContext) -> None:
                     "silence_duration_ms": 800,
                 },
             )
+        elif stt_provider == "speechmatics":
+            from .plugins.speechmatics_stt import create_speechmatics_stt
+            stt = create_speechmatics_stt(language=settings.whisper_language)
         else:
             stt = LocalWhisperSTT(
                 model=ctx.proc.userdata.get("whisper_model"),
                 language=settings.whisper_language,
             )
+        logger.info("STT provider selected: %s", stt_provider)
 
         # Build TTS
         if settings.tts_provider == "openai":
diff --git a/packages/services/voice-agent/src/plugins/speechmatics_stt.py b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
new file mode 100644
index 0000000..b81b886
--- /dev/null
+++ b/packages/services/voice-agent/src/plugins/speechmatics_stt.py
@@ -0,0 +1,32 @@
+"""
+Speechmatics STT factory for voice-agent.
+
+Creates a livekit-plugins-speechmatics STT instance configured for
+Mandarin-English bilingual recognition with speaker diarization support.
+
+The SPEECHMATICS_API_KEY environment variable is read automatically
+by the livekit-plugins-speechmatics package.
+"""
+import logging
+
+from livekit.plugins import speechmatics
+
+logger = logging.getLogger(__name__)
+
+
+def create_speechmatics_stt(language: str = "cmn") -> speechmatics.STT:
+    """Create a Speechmatics STT instance for the voice pipeline.
+
+    Args:
+        language: Speechmatics language code. Default 'cmn' for Mandarin Chinese.
+                  Use 'cmn_en' for Mandarin-English bilingual, 'en' for English.
+
+    Returns:
+        Configured speechmatics.STT instance.
+    """
+    stt = speechmatics.STT(
+        language=language,
+        enable_partials=True,
+    )
+    logger.info("Speechmatics STT created: language=%s", language)
+    return stt