feat(agent): voice-triggered DingTalk binding + GET instances by user

- Add GET /api/v1/agent/instances/user/:userId endpoint so Claude can
  look up the caller's agent instances without knowing the ID upfront
- Update SystemPromptBuilder DingTalk section with centralized binding
  flow (one-time code via iAgent DingTalk bot, no per-instance creds)
- VoiceSessionController.startVoiceSession now extracts userId from JWT
  and builds a full iAgent system prompt (userId + DingTalk instructions)
  so Claude knows who is speaking and how to call the binding API
- VoiceSessionManager.executeTurn now uses the session's stored system
  prompt (base context + voice rules) and allows the Bash tool so Claude
  can call internal APIs via wget during voice conversations

User flow: speak "帮我绑定钉钉" → Claude lists instances → generates
code via POST /api/v1/agent/channels/dingtalk/bind/:id → speaks code
letter-by-letter → user sends code in DingTalk → binding completes.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-03-08 08:49:13 -07:00
parent db0e1f1439
commit 2d0bdbd27f
4 changed files with 58 additions and 16 deletions

View File

@ -305,13 +305,21 @@ export class VoiceSessionManager {
this.logger.log(`[VoiceSession ${sessionId}] Resuming SDK session: ${resumeSessionId}`);
}
// Voice-mode system prompt: concise oral Chinese, no markdown / tool details
const voiceSystemPrompt =
'你正在通过语音与用户实时对话。请严格遵守以下规则:\n' +
// Voice-mode rules: always appended to whatever base system prompt the session has.
// These keep responses concise and audio-friendly.
const voiceRules =
'\n\n## 语音对话规则(必须严格遵守)\n' +
'1. 只输出用户关注的最终答案,不要输出工具调用过程、中间步骤或技术细节\n' +
'2. 用简洁自然的口语中文回答,像面对面对话一样\n' +
'3. 回复要简短精炼适合语音播报通常1-3句话即可\n' +
'4. 不要使用markdown格式、代码块、列表符号等文本格式';
'4. 不要使用markdown格式、代码块、列表符号等文本格式\n' +
'5. 朗读验证码时,每个字符之间要有明显停顿,例如 "A - 3 - F - 9 - C - 2"';
// Combine the session's full iAgent system prompt (with userId, DingTalk instructions,
// etc.) with the voice-specific rules. Fall back to voice rules only if no stored prompt.
const voiceSystemPrompt = session.systemPrompt
? `${session.systemPrompt}${voiceRules}`
: `你是iAgent服务器运维助手。${voiceRules}`;
// Events to suppress in voice mode (only text/completed/error reach TTS)
const voiceFilteredTypes = new Set(['thinking', 'tool_use', 'tool_result']);
@ -324,7 +332,8 @@ export class VoiceSessionManager {
sessionId,
prompt: message,
systemPrompt: voiceSystemPrompt,
allowedTools: [],
// Allow Bash so Claude can call internal APIs (e.g., DingTalk binding via wget)
allowedTools: ['Bash'],
maxTurns: 10,
conversationHistory: historyForEngine.length > 0 ? historyForEngine : undefined,
resumeSessionId,

View File

@ -45,15 +45,25 @@ export class SystemPromptBuilder {
'1. Ask for a name if not given\n' +
'2. Use the Current User ID from this prompt as userId\n' +
'3. Call the create API with Bash and report the result (id, status, containerName)\n\n' +
'## DingTalk Channel Binding (钉钉接入)\n' +
'To bind a DingTalk bot to an OpenClaw instance, the user needs a DingTalk app with Stream mode enabled:\n' +
' Step 1: Go to https://open.dingtalk.com/developer → "创建应用" → Enterprise Internal App\n' +
' Step 2: Enable "机器人" → set message receive mode to "Stream模式"\n' +
' Step 3: Copy the AppKey (ClientId) and AppSecret (ClientSecret)\n' +
' Step 4: Create/re-deploy the OpenClaw instance with DingTalk creds:\n' +
' wget -q -O- --post-data=\'{"name":"<name>","userId":"<userId>","usePool":true,"dingTalkClientId":"<appKey>","dingTalkClientSecret":"<appSecret>"}\' \\\n' +
' --header="Content-Type: application/json" http://localhost:3002/api/v1/agent/instances\n' +
' Step 5: Tell the user to add the bot to a DingTalk group or send it a direct message — it will respond via OpenClaw AI.',
'## DingTalk Channel Binding (钉钉接入 — 语音/对话绑定)\n' +
'iAgent has a centralized DingTalk bot. Users bind their OpenClaw instance to it using a one-time code.\n' +
'When a user asks to bind DingTalk (e.g. "帮我绑定钉钉", "bind DingTalk", "连接钉钉"), follow these steps:\n\n' +
' Step 1 — Find the user\'s active instance:\n' +
' wget -q -O- http://localhost:3002/api/v1/agent/instances/user/<userId>\n' +
' Pick the first instance with status "running". If none running, pick the first one.\n' +
' If the user has no instances, tell them to create one first.\n\n' +
' Step 2 — Generate a binding code for the instance:\n' +
' wget -q -O- --post-data="" http://localhost:3002/api/v1/agent/channels/dingtalk/bind/<instanceId>\n' +
' This returns JSON: { "code": "A3F9C2", "expiresAt": "..." }\n\n' +
' Step 3 — Tell the user clearly (voice-friendly):\n' +
' Speak the code letter-by-letter with pauses: "验证码是 A-3-F-9-C-2"\n' +
' Instructions: "请在钉钉中找到 iAgent 机器人向它发送这6位验证码。发送后绑定会自动完成有效期15分钟。"\n' +
' If speaking via voice: spell each character slowly for the user to type easily.\n\n' +
' Check binding status (optional, if user asks):\n' +
' wget -q -O- http://localhost:3002/api/v1/agent/channels/dingtalk/status/<instanceId>\n' +
' Returns { "bound": true/false }\n\n' +
' Unbind DingTalk:\n' +
' wget -q -O- --post-data="" http://localhost:3002/api/v1/agent/channels/dingtalk/unbind/<instanceId>',
);
// Tenant + user context

View File

@ -32,6 +32,12 @@ export class AgentInstanceController {
return this.instanceRepo.findAll();
}
/**
 * Lists the agent instances that belong to a single user.
 *
 * Looks the instances up through the instance repository and passes each
 * one through `sanitize` before returning it.
 *
 * NOTE(review): `userId` is taken straight from the route and is not compared
 * with the authenticated caller in this handler — confirm that a guard
 * elsewhere restricts who may query another user's instances.
 *
 * @param userId - Owner id taken from the `:userId` route segment.
 * @returns The sanitized instances, in the order the repository returned them.
 */
@Get('user/:userId')
async listByUser(@Param('userId') userId: string) {
  const owned = await this.instanceRepo.findByUserId(userId);
  const sanitized = [];
  for (const entry of owned) {
    sanitized.push(this.sanitize(entry));
  }
  return sanitized;
}
@Get(':id')
async getOne(@Param('id') id: string) {
const inst = await this.instanceRepo.findById(id);

View File

@ -12,12 +12,14 @@
* the user speaks, and calls voice/terminate on room disconnect.
*/
import {
Controller, Post, Delete, Param, Body,
Controller, Post, Delete, Param, Body, Req,
NotFoundException, BadRequestException, Logger,
} from '@nestjs/common';
import { Request } from 'express';
import { TenantId } from '@it0/common';
import { VoiceSessionManager } from '../../../domain/services/voice-session-manager.service';
import { SessionRepository } from '../../../infrastructure/repositories/session.repository';
import { SystemPromptBuilder } from '../../../infrastructure/engines/claude-code-cli/system-prompt-builder';
import { AgentEngineType } from '../../../domain/value-objects/agent-engine-type.vo';
import { AgentSession } from '../../../domain/entities/agent-session.entity';
import * as crypto from 'crypto';
@ -29,6 +31,7 @@ export class VoiceSessionController {
constructor(
private readonly voiceSessionManager: VoiceSessionManager,
private readonly sessionRepository: SessionRepository,
private readonly systemPromptBuilder: SystemPromptBuilder,
) {}
/**
@ -41,6 +44,7 @@ export class VoiceSessionController {
@Post('voice/start')
async startVoiceSession(
@TenantId() tenantId: string,
@Req() req: Request,
@Body() body: { sessionId?: string; systemPrompt?: string },
) {
let session: AgentSession | null = null;
@ -53,6 +57,19 @@ export class VoiceSessionController {
}
}
// Extract user identity from JWT for context-aware system prompt
const jwtUser = (req as any).user;
const userId: string | undefined = jwtUser?.sub ?? jwtUser?.userId;
const userEmail: string | undefined = jwtUser?.email;
// Build the full iAgent system prompt (includes DingTalk binding instructions, userId, etc.)
// If the caller explicitly overrides with their own systemPrompt, use that instead.
const builtSystemPrompt = body.systemPrompt || this.systemPromptBuilder.build({
tenantId,
userId,
userEmail,
});
if (!session) {
// Create a fresh session pre-marked as voice mode
session = new AgentSession();
@ -60,7 +77,7 @@ export class VoiceSessionController {
session.tenantId = tenantId;
session.engineType = AgentEngineType.CLAUDE_AGENT_SDK;
session.status = 'active';
session.systemPrompt = body.systemPrompt;
session.systemPrompt = builtSystemPrompt;
session.metadata = { voiceMode: true, title: '', titleSet: true };
session.createdAt = new Date();
session.updatedAt = new Date();