From 59a3e60b82bb692f6a202f2db1d80dd5614fddd1 Mon Sep 17 00:00:00 2001 From: hailin Date: Mon, 2 Mar 2026 02:11:51 -0800 Subject: [PATCH] feat: add engine type selection (Agent SDK / Claude API) for voice calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Full-stack implementation allowing users to choose between Claude Agent SDK (default, with tool approval, skill injection, session resume) and Claude API (direct, lower latency) in Flutter settings. Agent SDK mode wraps prompts with voice-conversation instructions for concise spoken Chinese output. Data flow: Flutter Settings → SharedPreferences → POST /livekit/token → RoomAgentDispatch metadata → voice-agent → AgentServiceLLM(engine_type) Co-Authored-By: Claude Opus 4.6 --- .../presentation/pages/agent_call_page.dart | 1 + .../data/datasources/settings_datasource.dart | 4 ++ .../domain/entities/app_settings.dart | 4 ++ .../presentation/pages/settings_page.dart | 64 +++++++++++++++++++ .../providers/settings_providers.dart | 5 ++ packages/services/voice-agent/src/agent.py | 7 +- .../voice-agent/src/plugins/agent_llm.py | 28 ++++++-- .../voice-service/src/api/livekit_token.py | 3 + 8 files changed, 110 insertions(+), 6 deletions(-) diff --git a/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart b/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart index 94d16cd..12a4b0d 100644 --- a/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart +++ b/it0_app/lib/features/agent_call/presentation/pages/agent_call_page.dart @@ -80,6 +80,7 @@ class _AgentCallPageState extends ConsumerState data: { if (voiceSettings.ttsVoice.isNotEmpty) 'tts_voice': voiceSettings.ttsVoice, if (voiceSettings.ttsStyle.isNotEmpty) 'tts_style': voiceSettings.ttsStyle, + 'engine_type': voiceSettings.engineType, }, ); final data = response.data as Map; diff --git a/it0_app/lib/features/settings/data/datasources/settings_datasource.dart b/it0_app/lib/features/settings/data/datasources/settings_datasource.dart index 67fb35c..07715a1 100644 --- a/it0_app/lib/features/settings/data/datasources/settings_datasource.dart +++ b/it0_app/lib/features/settings/data/datasources/settings_datasource.dart @@ -16,6 +16,7 @@ class SettingsDatasource { static const String _keyBiometric = 'settings_biometric'; static const String _keyTtsVoice = 'settings_tts_voice'; static const String _keyTtsStyle = 'settings_tts_style'; + static const String _keyEngineType = 'settings_engine_type'; SettingsDatasource(this._prefs); @@ -35,6 +36,7 @@ class SettingsDatasource { biometricEnabled: _prefs.getBool(_keyBiometric) ?? false, ttsVoice: _prefs.getString(_keyTtsVoice) ?? 'coral', ttsStyle: _prefs.getString(_keyTtsStyle) ?? '', + engineType: _prefs.getString(_keyEngineType) ?? 'claude_agent_sdk', ); } @@ -61,6 +63,7 @@ class SettingsDatasource { await _prefs.setBool(_keyBiometric, settings.biometricEnabled); await _prefs.setString(_keyTtsVoice, settings.ttsVoice); await _prefs.setString(_keyTtsStyle, settings.ttsStyle); + await _prefs.setString(_keyEngineType, settings.engineType); } /// Removes all settings keys from SharedPreferences. @@ -75,5 +78,6 @@ class SettingsDatasource { await _prefs.remove(_keyBiometric); await _prefs.remove(_keyTtsVoice); await _prefs.remove(_keyTtsStyle); + await _prefs.remove(_keyEngineType); } } diff --git a/it0_app/lib/features/settings/domain/entities/app_settings.dart b/it0_app/lib/features/settings/domain/entities/app_settings.dart index d39a2b5..373d5ce 100644 --- a/it0_app/lib/features/settings/domain/entities/app_settings.dart +++ b/it0_app/lib/features/settings/domain/entities/app_settings.dart @@ -12,6 +12,7 @@ class AppSettings { final bool biometricEnabled; final String ttsVoice; final String ttsStyle; + final String engineType; const AppSettings({ this.themeMode = ThemeMode.dark, @@ -24,6 +25,7 @@ class AppSettings { this.biometricEnabled = false, this.ttsVoice = 'coral', this.ttsStyle = '', + this.engineType = 'claude_agent_sdk', }); AppSettings copyWith({ @@ -37,6 +39,7 @@ class AppSettings { bool? biometricEnabled, String? ttsVoice, String? ttsStyle, + String? engineType, }) { return AppSettings( themeMode: themeMode ?? this.themeMode, @@ -49,6 +52,7 @@ class AppSettings { biometricEnabled: biometricEnabled ?? this.biometricEnabled, ttsVoice: ttsVoice ?? this.ttsVoice, ttsStyle: ttsStyle ?? this.ttsStyle, + engineType: engineType ?? this.engineType, ); } } diff --git a/it0_app/lib/features/settings/presentation/pages/settings_page.dart b/it0_app/lib/features/settings/presentation/pages/settings_page.dart index a9c7051..b8f55ee 100644 --- a/it0_app/lib/features/settings/presentation/pages/settings_page.dart +++ b/it0_app/lib/features/settings/presentation/pages/settings_page.dart @@ -114,6 +114,16 @@ class _SettingsPageState extends ConsumerState { _SettingsGroup( cardColor: cardColor, children: [ + _SettingsRow( + icon: Icons.psychology, + iconBg: const Color(0xFF7C3AED), + title: '对话引擎', + trailing: Text( + settings.engineType == 'claude_agent_sdk' ? 'Agent SDK' : 'Claude API', + style: TextStyle(color: subtitleColor, fontSize: 14), + ), + onTap: () => _showEngineTypePicker(settings.engineType), + ), _SettingsRow( icon: Icons.record_voice_over, iconBg: const Color(0xFF0EA5E9), @@ -376,6 +386,60 @@ class _SettingsPageState extends ConsumerState { ('fable', 'Fable', '中性 · 叙事'), ]; + void _showEngineTypePicker(String current) { + final engines = [ + ('claude_agent_sdk', 'Agent SDK', '支持工具审批、技能注入、会话恢复'), + ('claude_api', 'Claude API', '直连 API,响应更快'), + ]; + showModalBottomSheet( + context: context, + shape: const RoundedRectangleBorder( + borderRadius: BorderRadius.vertical(top: Radius.circular(20)), + ), + builder: (ctx) => Padding( + padding: const EdgeInsets.symmetric(vertical: 16), + child: Column( + mainAxisSize: MainAxisSize.min, + children: [ + Container( + width: 36, + height: 4, + decoration: BoxDecoration( + color: Colors.grey[400], + borderRadius: BorderRadius.circular(2), + ), + ), + const SizedBox(height: 16), + Text('选择对话引擎', + style: Theme.of(ctx).textTheme.titleMedium?.copyWith( + fontWeight: FontWeight.w600, + )), + const SizedBox(height: 12), + ...engines.map((e) => ListTile( + leading: Icon( + e.$1 == 'claude_agent_sdk' ? Icons.psychology : Icons.api, + color: e.$1 == current ? AppColors.primary : null, + ), + title: Text(e.$2, + style: TextStyle( + fontWeight: e.$1 == current ? FontWeight.bold : FontWeight.normal, + color: e.$1 == current ? AppColors.primary : null, + )), + subtitle: Text(e.$3, style: const TextStyle(fontSize: 12)), + trailing: e.$1 == current + ? const Icon(Icons.check_circle, color: AppColors.primary) + : null, + onTap: () { + ref.read(settingsProvider.notifier).setEngineType(e.$1); + Navigator.pop(ctx); + }, + )), + ], + ), + ), + ); + } + void _showVoicePicker(String current) { showModalBottomSheet( context: context, diff --git a/it0_app/lib/features/settings/presentation/providers/settings_providers.dart b/it0_app/lib/features/settings/presentation/providers/settings_providers.dart index c738f17..24658b0 100644 --- a/it0_app/lib/features/settings/presentation/providers/settings_providers.dart +++ b/it0_app/lib/features/settings/presentation/providers/settings_providers.dart @@ -134,6 +134,11 @@ class SettingsNotifier extends StateNotifier { await _repository?.saveSettings(state); } + Future setEngineType(String type) async { + state = state.copyWith(engineType: type); + await _repository?.saveSettings(state); + } + Future resetToDefaults() async { await _repository?.resetSettings(); state = const AppSettings(); diff --git a/packages/services/voice-agent/src/agent.py b/packages/services/voice-agent/src/agent.py index b671aa7..07892d3 100644 --- a/packages/services/voice-agent/src/agent.py +++ b/packages/services/voice-agent/src/agent.py @@ -200,17 +200,19 @@ async def entrypoint(ctx: JobContext) -> None: auth_header = "" tts_voice = settings.openai_tts_voice tts_style = "" + engine_type = "claude_agent_sdk" try: meta_str = ctx.job.metadata or "{}" meta = json.loads(meta_str) auth_header = meta.get("auth_header", "") tts_voice = meta.get("tts_voice", settings.openai_tts_voice) tts_style = meta.get("tts_style", "") + engine_type = meta.get("engine_type", "claude_agent_sdk") except Exception as e: logger.warning("Failed to parse job metadata: %s", e) - logger.info("Auth header present: %s, TTS: voice=%s, style=%s", - bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)") + logger.info("Auth header present: %s, TTS: voice=%s, style=%s, engine=%s", + bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)", engine_type) # Build STT if settings.stt_provider == "openai": @@ -270,6 +272,7 @@ async def entrypoint(ctx: JobContext) -> None: llm = AgentServiceLLM( agent_service_url=settings.agent_service_url, auth_header=auth_header, + engine_type=engine_type, ) # Create and start AgentSession with the full pipeline diff --git a/packages/services/voice-agent/src/plugins/agent_llm.py b/packages/services/voice-agent/src/plugins/agent_llm.py index 90c66af..82eab8f 100644 --- a/packages/services/voice-agent/src/plugins/agent_llm.py +++ b/packages/services/voice-agent/src/plugins/agent_llm.py @@ -2,10 +2,13 @@ Custom LLM plugin that proxies to IT0 agent-service. Instead of calling Claude directly, this plugin: -1. POSTs to agent-service /api/v1/agent/tasks (creates a task with engineType=claude_api) +1. POSTs to agent-service /api/v1/agent/tasks (engineType configurable: claude_agent_sdk or claude_api) 2. Subscribes to the agent-service WebSocket /ws/agent for streaming text events 3. Emits ChatChunk objects into the LiveKit pipeline +In Agent SDK mode, the prompt is wrapped with voice-conversation instructions +so the agent outputs concise spoken Chinese without tool-call details. + This preserves all agent-service capabilities: Tool Use, conversation history, tenant isolation, and session management. """ @@ -38,10 +41,12 @@ class AgentServiceLLM(llm.LLM): *, agent_service_url: str = "http://agent-service:3002", auth_header: str = "", + engine_type: str = "claude_agent_sdk", ): super().__init__() self._agent_service_url = agent_service_url self._auth_header = auth_header + self._engine_type = engine_type self._agent_session_id: str | None = None @property @@ -205,14 +210,29 @@ class AgentServiceLLMStream(llm.LLMStream): })) # 2. Create agent task (with timeout) + engine_type = self._llm_instance._engine_type + prompt = user_text + + # Agent SDK mode: instruct the agent to output concise spoken Chinese + # (skip tool-call details and intermediate steps) + if engine_type == "claude_agent_sdk": + prompt = ( + "【语音对话模式】你正在通过语音与用户实时对话。请严格遵守以下规则:\n" + "1. 只输出用户关注的最终答案,不要输出工具调用过程、中间步骤或技术细节\n" + "2. 用简洁自然的口语中文回答,像面对面对话一样\n" + "3. 回复要简短精炼,适合语音播报,通常1-3句话即可\n" + "4. 不要使用markdown格式、代码块、列表符号等文本格式\n" + f"\n用户说:{user_text}" + ) + body: dict[str, Any] = { - "prompt": user_text, - "engineType": "claude_api", + "prompt": prompt, + "engineType": engine_type, } if self._llm_instance._agent_session_id: body["sessionId"] = self._llm_instance._agent_session_id - logger.info("POST /tasks prompt=%s", user_text[:80]) + logger.info("POST /tasks engine=%s prompt=%s", engine_type, user_text[:80]) async with httpx.AsyncClient( timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10), ) as client: diff --git a/packages/services/voice-service/src/api/livekit_token.py b/packages/services/voice-service/src/api/livekit_token.py index a1be55e..39bfd56 100644 --- a/packages/services/voice-service/src/api/livekit_token.py +++ b/packages/services/voice-service/src/api/livekit_token.py @@ -24,6 +24,7 @@ router = APIRouter() class TokenRequest(BaseModel): tts_voice: Optional[str] = None tts_style: Optional[str] = None + engine_type: Optional[str] = None @router.post("/livekit/token") @@ -44,6 +45,8 @@ async def create_livekit_token(request: Request, body: TokenRequest = TokenReque metadata["tts_voice"] = body.tts_voice if body.tts_style: metadata["tts_style"] = body.tts_style + if body.engine_type: + metadata["engine_type"] = body.engine_type token = ( livekit_api.AccessToken(settings.livekit_api_key, settings.livekit_api_secret)