feat: add engine type selection (Agent SDK / Claude API) for voice calls

Full-stack implementation that lets users choose, in the Flutter settings page,
between two engines for voice calls: the Claude Agent SDK (the default; supports
tool approval, skill injection, and session resume) and the Claude API (direct
calls, lower latency). In Agent SDK mode, prompts are wrapped with
voice-conversation instructions so the agent produces concise spoken Chinese output.

Data flow: Flutter Settings → SharedPreferences → POST /livekit/token →
RoomAgentDispatch metadata → voice-agent → AgentServiceLLM(engine_type)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-02 02:11:51 -08:00
parent c9e196639a
commit 59a3e60b82
8 changed files with 110 additions and 6 deletions

View File

@ -80,6 +80,7 @@ class _AgentCallPageState extends ConsumerState<AgentCallPage>
data: {
if (voiceSettings.ttsVoice.isNotEmpty) 'tts_voice': voiceSettings.ttsVoice,
if (voiceSettings.ttsStyle.isNotEmpty) 'tts_style': voiceSettings.ttsStyle,
'engine_type': voiceSettings.engineType,
},
);
final data = response.data as Map<String, dynamic>;

View File

@ -16,6 +16,7 @@ class SettingsDatasource {
static const String _keyBiometric = 'settings_biometric';
static const String _keyTtsVoice = 'settings_tts_voice';
static const String _keyTtsStyle = 'settings_tts_style';
static const String _keyEngineType = 'settings_engine_type';
SettingsDatasource(this._prefs);
@ -35,6 +36,7 @@ class SettingsDatasource {
biometricEnabled: _prefs.getBool(_keyBiometric) ?? false,
ttsVoice: _prefs.getString(_keyTtsVoice) ?? 'coral',
ttsStyle: _prefs.getString(_keyTtsStyle) ?? '',
engineType: _prefs.getString(_keyEngineType) ?? 'claude_agent_sdk',
);
}
@ -61,6 +63,7 @@ class SettingsDatasource {
await _prefs.setBool(_keyBiometric, settings.biometricEnabled);
await _prefs.setString(_keyTtsVoice, settings.ttsVoice);
await _prefs.setString(_keyTtsStyle, settings.ttsStyle);
await _prefs.setString(_keyEngineType, settings.engineType);
}
/// Removes all settings keys from SharedPreferences.
@ -75,5 +78,6 @@ class SettingsDatasource {
await _prefs.remove(_keyBiometric);
await _prefs.remove(_keyTtsVoice);
await _prefs.remove(_keyTtsStyle);
await _prefs.remove(_keyEngineType);
}
}

View File

@ -12,6 +12,7 @@ class AppSettings {
final bool biometricEnabled;
final String ttsVoice;
final String ttsStyle;
final String engineType;
const AppSettings({
this.themeMode = ThemeMode.dark,
@ -24,6 +25,7 @@ class AppSettings {
this.biometricEnabled = false,
this.ttsVoice = 'coral',
this.ttsStyle = '',
this.engineType = 'claude_agent_sdk',
});
AppSettings copyWith({
@ -37,6 +39,7 @@ class AppSettings {
bool? biometricEnabled,
String? ttsVoice,
String? ttsStyle,
String? engineType,
}) {
return AppSettings(
themeMode: themeMode ?? this.themeMode,
@ -49,6 +52,7 @@ class AppSettings {
biometricEnabled: biometricEnabled ?? this.biometricEnabled,
ttsVoice: ttsVoice ?? this.ttsVoice,
ttsStyle: ttsStyle ?? this.ttsStyle,
engineType: engineType ?? this.engineType,
);
}
}

View File

@ -114,6 +114,16 @@ class _SettingsPageState extends ConsumerState<SettingsPage> {
_SettingsGroup(
cardColor: cardColor,
children: [
_SettingsRow(
icon: Icons.psychology,
iconBg: const Color(0xFF7C3AED),
title: '对话引擎',
trailing: Text(
settings.engineType == 'claude_agent_sdk' ? 'Agent SDK' : 'Claude API',
style: TextStyle(color: subtitleColor, fontSize: 14),
),
onTap: () => _showEngineTypePicker(settings.engineType),
),
_SettingsRow(
icon: Icons.record_voice_over,
iconBg: const Color(0xFF0EA5E9),
@ -376,6 +386,60 @@ class _SettingsPageState extends ConsumerState<SettingsPage> {
('fable', 'Fable', '中性 · 叙事'),
];
/// Shows a modal bottom sheet for choosing the conversation engine.
///
/// [current] is the engine identifier that is selected right now; its entry
/// is rendered bold in the primary color with a trailing check mark. Tapping
/// an entry passes that entry's identifier to the settings notifier's
/// `setEngineType` and then closes the sheet.
void _showEngineTypePicker(String current) {
  // Each option is (identifier, display label, description).
  const options = [
    ('claude_agent_sdk', 'Agent SDK', '支持工具审批、技能注入、会话恢复'),
    ('claude_api', 'Claude API', '直连 API响应更快'),
  ];

  // Builds the tile for a single engine option inside the sheet.
  Widget buildOption(
    BuildContext sheetContext,
    (String, String, String) option,
  ) {
    final (id, label, description) = option;
    final isSelected = id == current;
    // Only the selected entry is tinted; others fall back to theme defaults.
    final accent = isSelected ? AppColors.primary : null;

    return ListTile(
      leading: Icon(
        id == 'claude_agent_sdk' ? Icons.psychology : Icons.api,
        color: accent,
      ),
      title: Text(
        label,
        style: TextStyle(
          fontWeight: isSelected ? FontWeight.bold : FontWeight.normal,
          color: accent,
        ),
      ),
      subtitle: Text(description, style: const TextStyle(fontSize: 12)),
      trailing: isSelected
          ? const Icon(Icons.check_circle, color: AppColors.primary)
          : null,
      onTap: () {
        // The save is started but not awaited; the sheet closes right away.
        ref.read(settingsProvider.notifier).setEngineType(id);
        Navigator.pop(sheetContext);
      },
    );
  }

  showModalBottomSheet(
    context: context,
    shape: const RoundedRectangleBorder(
      borderRadius: BorderRadius.vertical(top: Radius.circular(20)),
    ),
    builder: (sheetContext) {
      final headingStyle = Theme.of(sheetContext)
          .textTheme
          .titleMedium
          ?.copyWith(fontWeight: FontWeight.w600);

      return Padding(
        padding: const EdgeInsets.symmetric(vertical: 16),
        child: Column(
          mainAxisSize: MainAxisSize.min,
          children: [
            // Small grey bar drawn at the top of the sheet.
            Container(
              width: 36,
              height: 4,
              decoration: BoxDecoration(
                color: Colors.grey[400],
                borderRadius: BorderRadius.circular(2),
              ),
            ),
            const SizedBox(height: 16),
            Text('选择对话引擎', style: headingStyle),
            const SizedBox(height: 12),
            for (final option in options) buildOption(sheetContext, option),
          ],
        ),
      );
    },
  );
}
void _showVoicePicker(String current) {
showModalBottomSheet(
context: context,

View File

@ -134,6 +134,11 @@ class SettingsNotifier extends StateNotifier<AppSettings> {
await _repository?.saveSettings(state);
}
/// Switches the conversation engine to [type] and saves the result.
///
/// The in-memory state is replaced first, then the updated settings are
/// handed to the repository's `saveSettings`. When no repository is attached
/// the save step is skipped.
Future<void> setEngineType(String type) async {
  final updated = state.copyWith(engineType: type);
  state = updated;
  await _repository?.saveSettings(updated);
}
Future<void> resetToDefaults() async {
await _repository?.resetSettings();
state = const AppSettings();

View File

@ -200,17 +200,19 @@ async def entrypoint(ctx: JobContext) -> None:
auth_header = ""
tts_voice = settings.openai_tts_voice
tts_style = ""
engine_type = "claude_agent_sdk"
try:
meta_str = ctx.job.metadata or "{}"
meta = json.loads(meta_str)
auth_header = meta.get("auth_header", "")
tts_voice = meta.get("tts_voice", settings.openai_tts_voice)
tts_style = meta.get("tts_style", "")
engine_type = meta.get("engine_type", "claude_agent_sdk")
except Exception as e:
logger.warning("Failed to parse job metadata: %s", e)
logger.info("Auth header present: %s, TTS: voice=%s, style=%s",
bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)")
logger.info("Auth header present: %s, TTS: voice=%s, style=%s, engine=%s",
bool(auth_header), tts_voice, tts_style[:50] if tts_style else "(default)", engine_type)
# Build STT
if settings.stt_provider == "openai":
@ -270,6 +272,7 @@ async def entrypoint(ctx: JobContext) -> None:
llm = AgentServiceLLM(
agent_service_url=settings.agent_service_url,
auth_header=auth_header,
engine_type=engine_type,
)
# Create and start AgentSession with the full pipeline

View File

@ -2,10 +2,13 @@
Custom LLM plugin that proxies to IT0 agent-service.
Instead of calling Claude directly, this plugin:
1. POSTs to agent-service /api/v1/agent/tasks (creates a task with engineType=claude_api)
1. POSTs to agent-service /api/v1/agent/tasks (engineType configurable: claude_agent_sdk or claude_api)
2. Subscribes to the agent-service WebSocket /ws/agent for streaming text events
3. Emits ChatChunk objects into the LiveKit pipeline
In Agent SDK mode, the prompt is wrapped with voice-conversation instructions
so the agent outputs concise spoken Chinese without tool-call details.
This preserves all agent-service capabilities: Tool Use, conversation history,
tenant isolation, and session management.
"""
@ -38,10 +41,12 @@ class AgentServiceLLM(llm.LLM):
*,
agent_service_url: str = "http://agent-service:3002",
auth_header: str = "",
engine_type: str = "claude_agent_sdk",
):
super().__init__()
self._agent_service_url = agent_service_url
self._auth_header = auth_header
self._engine_type = engine_type
self._agent_session_id: str | None = None
@property
@ -205,14 +210,29 @@ class AgentServiceLLMStream(llm.LLMStream):
}))
# 2. Create agent task (with timeout)
engine_type = self._llm_instance._engine_type
prompt = user_text
# Agent SDK mode: instruct the agent to output concise spoken Chinese
# (skip tool-call details and intermediate steps)
if engine_type == "claude_agent_sdk":
prompt = (
"【语音对话模式】你正在通过语音与用户实时对话。请严格遵守以下规则:\n"
"1. 只输出用户关注的最终答案,不要输出工具调用过程、中间步骤或技术细节\n"
"2. 用简洁自然的口语中文回答,像面对面对话一样\n"
"3. 回复要简短精炼适合语音播报通常1-3句话即可\n"
"4. 不要使用markdown格式、代码块、列表符号等文本格式\n"
f"\n用户说:{user_text}"
)
body: dict[str, Any] = {
"prompt": user_text,
"engineType": "claude_api",
"prompt": prompt,
"engineType": engine_type,
}
if self._llm_instance._agent_session_id:
body["sessionId"] = self._llm_instance._agent_session_id
logger.info("POST /tasks prompt=%s", user_text[:80])
logger.info("POST /tasks engine=%s prompt=%s", engine_type, user_text[:80])
async with httpx.AsyncClient(
timeout=httpx.Timeout(connect=10, read=30, write=10, pool=10),
) as client:

View File

@ -24,6 +24,7 @@ router = APIRouter()
class TokenRequest(BaseModel):
tts_voice: Optional[str] = None
tts_style: Optional[str] = None
engine_type: Optional[str] = None
@router.post("/livekit/token")
@ -44,6 +45,8 @@ async def create_livekit_token(request: Request, body: TokenRequest = TokenReque
metadata["tts_voice"] = body.tts_voice
if body.tts_style:
metadata["tts_style"] = body.tts_style
if body.engine_type:
metadata["engine_type"] = body.engine_type
token = (
livekit_api.AccessToken(settings.livekit_api_key, settings.livekit_api_secret)