feat: add engine type selection (Agent SDK / Claude API) for voice calls

Full-stack implementation that lets users choose, in the Flutter settings, between the Claude Agent SDK (default; supports tool approval, skill injection, and session resume) and the Claude API (direct, lower latency). In Agent SDK mode, prompts are wrapped with voice-conversation instructions so the agent produces concise spoken Chinese output.

Data flow: Flutter Settings → SharedPreferences → POST /livekit/token → RoomAgentDispatch metadata → voice-agent → AgentServiceLLM(engine_type)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 02:11:51 -08:00 · 2026-03-02 02:11:51 -08:00 · 59a3e60b82
parent c9e196639a
commit 59a3e60b82
8 changed files with 110 additions and 6 deletions
--- a/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart
+++ b/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart
@ -80,6 +80,7 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
        data: {
          if (voiceSettings.ttsVoice.isNotEmpty) 'tts_voice': voiceSettings.ttsVoice,
          if (voiceSettings.ttsStyle.isNotEmpty) 'tts_style': voiceSettings.ttsStyle,
+          'engine_type': voiceSettings.engineType,
        },
      );
      final data = response.data as Map<String, dynamic>;
--- a/it0_app/lib/features/settings/data/datasources/settings_datasource.dart
+++ b/it0_app/lib/features/settings/data/datasources/settings_datasource.dart
@ -16,6 +16,7 @@ class SettingsDatasource {
  static const String _keyBiometric = 'settings_biometric';
  static const String _keyTtsVoice = 'settings_tts_voice';
  static const String _keyTtsStyle = 'settings_tts_style';
+  static const String _keyEngineType = 'settings_engine_type';

  SettingsDatasource(this._prefs);

@ -35,6 +36,7 @@ class SettingsDatasource {
      biometricEnabled: _prefs.getBool(_keyBiometric) ?? false,
      ttsVoice: _prefs.getString(_keyTtsVoice) ?? 'coral',
      ttsStyle: _prefs.getString(_keyTtsStyle) ?? '',
+      engineType: _prefs.getString(_keyEngineType) ?? 'claude_agent_sdk',
    );
  }

@ -61,6 +63,7 @@ class SettingsDatasource {
    await _prefs.setBool(_keyBiometric, settings.biometricEnabled);
    await _prefs.setString(_keyTtsVoice, settings.ttsVoice);
    await _prefs.setString(_keyTtsStyle, settings.ttsStyle);
+    await _prefs.setString(_keyEngineType, settings.engineType);
  }

  /// Removes all settings keys from SharedPreferences.
@ -75,5 +78,6 @@ class SettingsDatasource {
    await _prefs.remove(_keyBiometric);
    await _prefs.remove(_keyTtsVoice);
    await _prefs.remove(_keyTtsStyle);
+    await _prefs.remove(_keyEngineType);
  }
 }
--- a/it0_app/lib/features/settings/domain/entities/app_settings.dart
+++ b/it0_app/lib/features/settings/domain/entities/app_settings.dart
@ -12,6 +12,7 @@ class AppSettings {
  final bool biometricEnabled;
  final String ttsVoice;
  final String ttsStyle;
+  final String engineType;

  const AppSettings({
    this.themeMode = ThemeMode.dark,
@ -24,6 +25,7 @@ class AppSettings {
    this.biometricEnabled = false,
    this.ttsVoice = 'coral',
    this.ttsStyle = '',
+    this.engineType = 'claude_agent_sdk',
  });

  AppSettings copyWith({
@ -37,6 +39,7 @@ class AppSettings {
    bool? biometricEnabled,
    String? ttsVoice,
    String? ttsStyle,
+    String? engineType,
  }) {
    return AppSettings(
      themeMode: themeMode ?? this.themeMode,
@ -49,6 +52,7 @@ class AppSettings {
      biometricEnabled: biometricEnabled ?? this.biometricEnabled,
      ttsVoice: ttsVoice ?? this.ttsVoice,
      ttsStyle: ttsStyle ?? this.ttsStyle,
+      engineType: engineType ?? this.engineType,
    );
  }
 }
--- a/it0_app/lib/features/settings/presentation/pages/settings_page.dart
+++ b/it0_app/lib/features/settings/presentation/pages/settings_page.dart
@ -114,6 +114,16 @@ class _SettingsPageState extends ConsumerState<SettingsPage> {
          _SettingsGroup(
            cardColor: cardColor,
            children: [
+              _SettingsRow(
+                icon: Icons.psychology,
+                iconBg: const Color(0xFF7C3AED),
+                title: '对话引擎',
+                trailing: Text(
+                  settings.engineType == 'claude_agent_sdk' ? 'Agent SDK' : 'Claude API',
+                  style: TextStyle(color: subtitleColor, fontSize: 14),
+                ),
+                onTap: () => _showEngineTypePicker(settings.engineType),
+              ),
              _SettingsRow(
                icon: Icons.record_voice_over,
                iconBg: const Color(0xFF0EA5E9),
@ -376,6 +386,60 @@ class _SettingsPageState extends ConsumerState<SettingsPage> {
    ('fable', 'Fable', '中性 · 叙事'),
  ];

+  void _showEngineTypePicker(String current) { // Opens a modal bottom sheet for picking the conversation engine; [current] is the engine id to highlight.
+    final engines = [ // Records of (engine id, display name, description).
+      ('claude_agent_sdk', 'Agent SDK', '支持工具审批、技能注入、会话恢复'),
+      ('claude_api', 'Claude API', '直连 API，响应更快'),
+    ];
+    showModalBottomSheet(
+      context: context,
+      shape: const RoundedRectangleBorder(
+        borderRadius: BorderRadius.vertical(top: Radius.circular(20)),
+      ),
+      builder: (ctx) => Padding(
+        padding: const EdgeInsets.symmetric(vertical: 16),
+        child: Column(
+          mainAxisSize: MainAxisSize.min, // Keep the column only as tall as its children.
+          children: [
+            Container( // Small rounded grey bar at the top of the sheet (presumably a drag-handle indicator).
+              width: 36,
+              height: 4,
+              decoration: BoxDecoration(
+                color: Colors.grey[400],
+                borderRadius: BorderRadius.circular(2),
+              ),
+            ),
+            const SizedBox(height: 16),
+            Text('选择对话引擎', // Sheet title.
+                style: Theme.of(ctx).textTheme.titleMedium?.copyWith(
+                      fontWeight: FontWeight.w600,
+                    )),
+            const SizedBox(height: 12),
+            ...engines.map((e) => ListTile( // One tile per engine; the entry whose id equals [current] gets the primary color, bold title and a check icon.
+                  leading: Icon(
+                    e.$1 == 'claude_agent_sdk' ? Icons.psychology : Icons.api,
+                    color: e.$1 == current ? AppColors.primary : null,
+                  ),
+                  title: Text(e.$2,
+                      style: TextStyle(
+                        fontWeight: e.$1 == current ? FontWeight.bold : FontWeight.normal,
+                        color: e.$1 == current ? AppColors.primary : null,
+                      )),
+                  subtitle: Text(e.$3, style: const TextStyle(fontSize: 12)),
+                  trailing: e.$1 == current
+                      ? const Icon(Icons.check_circle, color: AppColors.primary)
+                      : null,
+                  onTap: () {
+                    ref.read(settingsProvider.notifier).setEngineType(e.$1); // Hands the chosen id to the settings notifier; the returned Future is not awaited here.
+                    Navigator.pop(ctx); // Close the sheet right after the selection.
+                  },
+                )),
+          ],
+        ),
+      ),
+    );
+  }
+
  void _showVoicePicker(String current) {
    showModalBottomSheet(
      context: context,
--- a/it0_app/lib/features/settings/presentation/providers/settings_providers.dart
+++ b/it0_app/lib/features/settings/presentation/providers/settings_providers.dart
@ -134,6 +134,11 @@ class SettingsNotifier extends StateNotifier<AppSettings> {
    await _repository?.saveSettings(state);
  }

+  Future<void> setEngineType(String type) async { // Stores [type] as the engine type in state, then saves the settings.
+    state = state.copyWith(engineType: type); // Replace state with a copy that carries the new engine type.
+    await _repository?.saveSettings(state); // `?.` skips the save when _repository is null.
+  }
+
  Future<void> resetToDefaults() async {
    await _repository?.resetSettings();
    state = const AppSettings();
--- a/packages/services/voice-agent/src/agent.py
+++ b/packages/services/voice-agent/src/agent.py
@ -200,17 +200,19 @@ async def entrypoint(ctx: JobContext) -> None:
        auth_header = ""
        tts_voice = settings.openai_tts_voice
        tts_style = ""
+        engine_type = "claude_agent_sdk"
        try:
            meta_str = ctx.job.metadata or "{}"
            meta = json.loads(meta_str)
            auth_header = meta.get("auth_header", "")
            tts_voice = meta.get("tts_voice", settings.openai_tts_voice)
            tts_style = meta.get("tts_style", "")
+            engine_type = meta.get("engine_type", "claude_agent_sdk")
        except Exception as e:
            logger.warning("Failed to parse job metadata: %s", e)

-        logger.info("Auth header present: %s, TTS: voice=%s, style=%s",
-                    bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)")
+        logger.info("Auth header present: %s, TTS: voice=%s, style=%s, engine=%s",
+                    bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)", engine_type)

        # Build STT
        if settings.stt_provider == "openai":
@ -270,6 +272,7 @@ async def entrypoint(ctx: JobContext) -> None:
        llm = AgentServiceLLM(
            agent_service_url=settings.agent_service_url,
            auth_header=auth_header,
+            engine_type=engine_type,
        )

        # Create and start AgentSession with the full pipeline
--- a/packages/services/voice-agent/src/plugins/agent_llm.py
+++ b/packages/services/voice-agent/src/plugins/agent_llm.py
@ -2,10 +2,13 @@
 Custom LLM plugin that proxies to IT0 agent-service.

 Instead of calling Claude directly, this plugin:
-1. POSTs to agent-service /api/v1/agent/tasks (creates a task with engineType=claude_api)
+1. POSTs to agent-service /api/v1/agent/tasks (engineType configurable: claude_agent_sdk or claude_api)
 2. Subscribes to the agent-service WebSocket /ws/agent for streaming text events
 3. Emits ChatChunk objects into the LiveKit pipeline

+In Agent SDK mode, the prompt is wrapped with voice-conversation instructions
+so the agent outputs concise spoken Chinese without tool-call details.
+
 This preserves all agent-service capabilities: Tool Use, conversation history,
 tenant isolation, and session management.
 """
@ -38,10 +41,12 @@ class AgentServiceLLM(llm.LLM):
        *,
        agent_service_url: str = "http://agent-service:3002",
        auth_header: str = "",
+        engine_type: str = "claude_agent_sdk",
    ):
        super().__init__()
        self._agent_service_url = agent_service_url
        self._auth_header = auth_header
+        self._engine_type = engine_type
        self._agent_session_id: str | None = None

    @property
@ -205,14 +210,29 @@ class AgentServiceLLMStream(llm.LLMStream):
                }))

            # 2. Create agent task (with timeout)
+            engine_type = self._llm_instance._engine_type
+            prompt = user_text
+
+            # Agent SDK mode: instruct the agent to output concise spoken Chinese
+            # (skip tool-call details and intermediate steps)
+            if engine_type == "claude_agent_sdk":
+                prompt = (
+                    "【语音对话模式】你正在通过语音与用户实时对话。请严格遵守以下规则：\n"
+                    "1. 只输出用户关注的最终答案，不要输出工具调用过程、中间步骤或技术细节\n"
+                    "2. 用简洁自然的口语中文回答，像面对面对话一样\n"
+                    "3. 回复要简短精炼，适合语音播报，通常1-3句话即可\n"
+                    "4. 不要使用markdown格式、代码块、列表符号等文本格式\n"
+                    f"\n用户说：{user_text}"
+                )
+
            body: dict[str, Any] = {
-                "prompt": user_text,
-                "engineType": "claude_api",
+                "prompt": prompt,
+                "engineType": engine_type,
            }
            if self._llm_instance._agent_session_id:
                body["sessionId"] = self._llm_instance._agent_session_id

-            logger.info("POST /tasks prompt=%s", user_text[:80])
+            logger.info("POST /tasks engine=%s prompt=%s", engine_type, user_text[:80])
            async with httpx.AsyncClient(
                timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),
            ) as client:
--- a/packages/services/voice-service/src/api/livekit_token.py
+++ b/packages/services/voice-service/src/api/livekit_token.py
@ -24,6 +24,7 @@ router = APIRouter()
 class TokenRequest(BaseModel):  # Request body model for POST /livekit/token; every field is optional and defaults to None.
    tts_voice: Optional[str] = None  # The handler copies this into `metadata` under "tts_voice".
    tts_style: Optional[str] = None  # Copied into `metadata` only when truthy.
+    engine_type: Optional[str] = None  # Copied into `metadata` only when truthy; the value is not validated in this model.


@router.post("/livekit/token")
@ -44,6 +45,8 @@ async def create_livekit_token(request: Request, body: TokenRequest = TokenReque
        metadata["tts_voice"] = body.tts_voice
    if body.tts_style:
        metadata["tts_style"] = body.tts_style
+    if body.engine_type:
+        metadata["engine_type"] = body.engine_type

    token = (
        livekit_api.AccessToken(settings.livekit_api_key, settings.livekit_api_secret)