From 2d0bdbd27fd7f3671017fba99f5dd80526bb3111 Mon Sep 17 00:00:00 2001
From: hailin
Date: Sun, 8 Mar 2026 08:49:13 -0700
Subject: [PATCH] feat(agent): voice-triggered DingTalk binding + GET instances by user
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add GET /api/v1/agent/instances/user/:userId endpoint so Claude can look up
  the caller's agent instances without knowing the ID upfront
- Update SystemPromptBuilder DingTalk section with centralized binding flow
  (one-time code via iAgent DingTalk bot, no per-instance creds)
- VoiceSessionController.startVoiceSession now extracts userId from JWT and
  builds a full iAgent system prompt (userId + DingTalk instructions) so Claude
  knows who is speaking and how to call the binding API
- VoiceSessionManager.executeTurn now uses the session's stored system prompt
  (base context + voice rules) and allows the Bash tool so Claude can call
  internal APIs via wget during voice conversations

User flow: speak "帮我绑定钉钉" → Claude lists instances → generates code via
POST /api/v1/agent/channels/dingtalk/bind/:id → speaks code letter-by-letter
→ user sends code in DingTalk → binding completes.
Co-Authored-By: Claude Sonnet 4.6 --- .../services/voice-session-manager.service.ts | 19 +++++++++---- .../claude-code-cli/system-prompt-builder.ts | 28 +++++++++++++------ .../controllers/agent-instance.controller.ts | 6 ++++ .../controllers/voice-session.controller.ts | 21 ++++++++++++-- 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/packages/services/agent-service/src/domain/services/voice-session-manager.service.ts b/packages/services/agent-service/src/domain/services/voice-session-manager.service.ts index 31bea9d..bf1bda2 100644 --- a/packages/services/agent-service/src/domain/services/voice-session-manager.service.ts +++ b/packages/services/agent-service/src/domain/services/voice-session-manager.service.ts @@ -305,13 +305,21 @@ export class VoiceSessionManager { this.logger.log(`[VoiceSession ${sessionId}] Resuming SDK session: ${resumeSessionId}`); } - // Voice-mode system prompt: concise oral Chinese, no markdown / tool details - const voiceSystemPrompt = - '你正在通过语音与用户实时对话。请严格遵守以下规则:\n' + + // Voice-mode rules: always appended to whatever base system prompt the session has. + // These keep responses concise and audio-friendly. + const voiceRules = + '\n\n## 语音对话规则(必须严格遵守)\n' + '1. 只输出用户关注的最终答案,不要输出工具调用过程、中间步骤或技术细节\n' + '2. 用简洁自然的口语中文回答,像面对面对话一样\n' + '3. 回复要简短精炼,适合语音播报,通常1-3句话即可\n' + - '4. 不要使用markdown格式、代码块、列表符号等文本格式'; + '4. 不要使用markdown格式、代码块、列表符号等文本格式\n' + + '5. 朗读验证码时,每个字符之间要有明显停顿,例如 "A - 3 - F - 9 - C - 2"'; + + // Combine the session's full iAgent system prompt (with userId, DingTalk instructions, + // etc.) with the voice-specific rules. Fall back to voice rules only if no stored prompt. + const voiceSystemPrompt = session.systemPrompt + ? 
`${session.systemPrompt}${voiceRules}` + : `你是iAgent服务器运维助手。${voiceRules}`; // Events to suppress in voice mode (only text/completed/error reach TTS) const voiceFilteredTypes = new Set(['thinking', 'tool_use', 'tool_result']); @@ -324,7 +332,8 @@ export class VoiceSessionManager { sessionId, prompt: message, systemPrompt: voiceSystemPrompt, - allowedTools: [], + // Allow Bash so Claude can call internal APIs (e.g., DingTalk binding via wget) + allowedTools: ['Bash'], maxTurns: 10, conversationHistory: historyForEngine.length > 0 ? historyForEngine : undefined, resumeSessionId, diff --git a/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/system-prompt-builder.ts b/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/system-prompt-builder.ts index 669b254..df0ab8a 100644 --- a/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/system-prompt-builder.ts +++ b/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/system-prompt-builder.ts @@ -45,15 +45,25 @@ export class SystemPromptBuilder { '1. Ask for a name if not given\n' + '2. Use the Current User ID from this prompt as userId\n' + '3. 
Call the create API with Bash and report the result (id, status, containerName)\n\n' + - '## DingTalk Channel Binding (钉钉接入)\n' + - 'To bind a DingTalk bot to an OpenClaw instance, the user needs a DingTalk app with Stream mode enabled:\n' + - ' Step 1: Go to https://open.dingtalk.com/developer → "创建应用" → Enterprise Internal App\n' + - ' Step 2: Enable "机器人" → set message receive mode to "Stream模式"\n' + - ' Step 3: Copy the AppKey (ClientId) and AppSecret (ClientSecret)\n' + - ' Step 4: Create/re-deploy the OpenClaw instance with DingTalk creds:\n' + - ' wget -q -O- --post-data=\'{"name":"","userId":"","usePool":true,"dingTalkClientId":"","dingTalkClientSecret":""}\' \\\n' + - ' --header="Content-Type: application/json" http://localhost:3002/api/v1/agent/instances\n' + - ' Step 5: Tell the user to add the bot to a DingTalk group or send it a direct message — it will respond via OpenClaw AI.', + '## DingTalk Channel Binding (钉钉接入 — 语音/对话绑定)\n' + + 'iAgent has a centralized DingTalk bot. Users bind their OpenClaw instance to it using a one-time code.\n' + + 'When a user asks to bind DingTalk (e.g. "帮我绑定钉钉", "bind DingTalk", "连接钉钉"), follow these steps:\n\n' + + ' Step 1 — Find the user\'s active instance:\n' + + ' wget -q -O- http://localhost:3002/api/v1/agent/instances/user/\n' + + ' Pick the first instance with status "running". If none running, pick the first one.\n' + + ' If the user has no instances, tell them to create one first.\n\n' + + ' Step 2 — Generate a binding code for the instance:\n' + + ' wget -q -O- --post-data="" http://localhost:3002/api/v1/agent/channels/dingtalk/bind/\n' + + ' This returns JSON: { "code": "A3F9C2", "expiresAt": "..." 
}\n\n' + + ' Step 3 — Tell the user clearly (voice-friendly):\n' + + ' Speak the code letter-by-letter with pauses: "验证码是 A-3-F-9-C-2"\n' + + ' Instructions: "请在钉钉中找到 iAgent 机器人,向它发送这6位验证码。发送后绑定会自动完成,有效期15分钟。"\n' + + ' If speaking via voice: spell each character slowly for the user to type easily.\n\n' + + ' Check binding status (optional, if user asks):\n' + + ' wget -q -O- http://localhost:3002/api/v1/agent/channels/dingtalk/status/\n' + + ' Returns { "bound": true/false }\n\n' + + ' Unbind DingTalk:\n' + + ' wget -q -O- --post-data="" http://localhost:3002/api/v1/agent/channels/dingtalk/unbind/', ); // Tenant + user context diff --git a/packages/services/agent-service/src/interfaces/rest/controllers/agent-instance.controller.ts b/packages/services/agent-service/src/interfaces/rest/controllers/agent-instance.controller.ts index 7ca6a94..e185cfe 100644 --- a/packages/services/agent-service/src/interfaces/rest/controllers/agent-instance.controller.ts +++ b/packages/services/agent-service/src/interfaces/rest/controllers/agent-instance.controller.ts @@ -32,6 +32,12 @@ export class AgentInstanceController { return this.instanceRepo.findAll(); } + @Get('user/:userId') + async listByUser(@Param('userId') userId: string) { + const instances = await this.instanceRepo.findByUserId(userId); + return instances.map((inst) => this.sanitize(inst)); + } + @Get(':id') async getOne(@Param('id') id: string) { const inst = await this.instanceRepo.findById(id); diff --git a/packages/services/agent-service/src/interfaces/rest/controllers/voice-session.controller.ts b/packages/services/agent-service/src/interfaces/rest/controllers/voice-session.controller.ts index dd8a295..a50148b 100644 --- a/packages/services/agent-service/src/interfaces/rest/controllers/voice-session.controller.ts +++ b/packages/services/agent-service/src/interfaces/rest/controllers/voice-session.controller.ts @@ -12,12 +12,14 @@ * the user speaks, and calls voice/terminate on room disconnect. 
*/ import { - Controller, Post, Delete, Param, Body, + Controller, Post, Delete, Param, Body, Req, NotFoundException, BadRequestException, Logger, } from '@nestjs/common'; +import { Request } from 'express'; import { TenantId } from '@it0/common'; import { VoiceSessionManager } from '../../../domain/services/voice-session-manager.service'; import { SessionRepository } from '../../../infrastructure/repositories/session.repository'; +import { SystemPromptBuilder } from '../../../infrastructure/engines/claude-code-cli/system-prompt-builder'; import { AgentEngineType } from '../../../domain/value-objects/agent-engine-type.vo'; import { AgentSession } from '../../../domain/entities/agent-session.entity'; import * as crypto from 'crypto'; @@ -29,6 +31,7 @@ export class VoiceSessionController { constructor( private readonly voiceSessionManager: VoiceSessionManager, private readonly sessionRepository: SessionRepository, + private readonly systemPromptBuilder: SystemPromptBuilder, ) {} /** @@ -41,6 +44,7 @@ export class VoiceSessionController { @Post('voice/start') async startVoiceSession( @TenantId() tenantId: string, + @Req() req: Request, @Body() body: { sessionId?: string; systemPrompt?: string }, ) { let session: AgentSession | null = null; @@ -53,6 +57,19 @@ export class VoiceSessionController { } } + // Extract user identity from JWT for context-aware system prompt + const jwtUser = (req as any).user; + const userId: string | undefined = jwtUser?.sub ?? jwtUser?.userId; + const userEmail: string | undefined = jwtUser?.email; + + // Build the full iAgent system prompt (includes DingTalk binding instructions, userId, etc.) + // If the caller explicitly overrides with their own systemPrompt, use that instead. 
+ const builtSystemPrompt = body.systemPrompt || this.systemPromptBuilder.build({ + tenantId, + userId, + userEmail, + }); + if (!session) { // Create a fresh session pre-marked as voice mode session = new AgentSession(); @@ -60,7 +77,7 @@ export class VoiceSessionController { session.tenantId = tenantId; session.engineType = AgentEngineType.CLAUDE_AGENT_SDK; session.status = 'active'; - session.systemPrompt = body.systemPrompt; + session.systemPrompt = builtSystemPrompt; session.metadata = { voiceMode: true, title: '', titleSet: true }; session.createdAt = new Date(); session.updatedAt = new Date();