feat(agents): add capability boundary guardrails — input gate, cascading fallback, output gate rules

Four guardrail improvements to enforce agent capability boundaries:

1. Cascading Fallback (Fix 1+4):
   - Rewrite searchKnowledge() in immigration-tools.service.ts with 3-tier fallback:
     KB (similarity >= 0.55) → Web Search → Built-in Knowledge (clearly labeled)
   - Rewrite executeTool() in policy-expert.service.ts to use retrieveKnowledge()
     with confidence threshold; returns [KB_EMPTY]/[KB_LOW_CONFIDENCE]/[KB_ERROR]
     markers so the model knows to label source reliability

2. Input Gate (Fix 2):
   - New InputGateService using Haiku for lightweight pre-classification
   - Classifications: ON_TOPIC / OFF_TOPIC (threshold >= 0.7) / HARMFUL (>= 0.6)
   - Short messages (< 5 chars) fast-path to ON_TOPIC
   - Gate failure is non-fatal (allows message through)
   - Integrated in CoordinatorAgentService.sendMessage() before agent loop entry
   - OFF_TOPIC/HARMFUL messages get fixed responses without entering agent loop

3. Output Gate Enhancement (Fix 3):
   - Add TOPIC_BOUNDARY and NO_FABRICATION to EvaluationRuleType
   - TOPIC_BOUNDARY: regex detection for code blocks, programming keywords,
     AI identity exposure, off-topic indicators in agent responses
   - NO_FABRICATION: detects policy claims without policy_expert invocation
     or source markers; ensures factual claims are knowledge-backed
   - Both rule types are admin-configurable (zero rules = zero checks)
   - No DB migration needed (ruleType is varchar(50))

Files changed:
- NEW: agents/coordinator/input-gate.service.ts
- MOD: agents/coordinator/coordinator-agent.service.ts (inject InputGate + gate check)
- MOD: agents/agents.module.ts (register InputGateService)
- MOD: agents/coordinator/evaluation-gate.service.ts (2 new evaluators)
- MOD: domain/entities/evaluation-rule.entity.ts (2 new rule types)
- MOD: agents/specialists/policy-expert.service.ts (RAG confidence threshold)
- MOD: claude/tools/immigration-tools.service.ts (cascading fallback)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-06 21:59:10 -08:00
parent d81f03d318
commit 04dbc61131
7 changed files with 290 additions and 24 deletions

View File

@ -10,6 +10,8 @@ export const EvaluationRuleType = {
MUST_CONTAIN: 'MUST_CONTAIN',
STAGE_MIN_TURNS: 'STAGE_MIN_TURNS',
CONVERSION_SIGNAL: 'CONVERSION_SIGNAL',
TOPIC_BOUNDARY: 'TOPIC_BOUNDARY',
NO_FABRICATION: 'NO_FABRICATION',
} as const;
export type EvaluationRuleTypeValue =

View File

@ -16,6 +16,7 @@ import { ConfigService } from '@nestjs/config';
// Coordinator
import { CoordinatorAgentService } from './coordinator/coordinator-agent.service';
import { ContextInjectorService } from './coordinator/context-injector.service';
import { InputGateService } from './coordinator/input-gate.service';
// Specialists
import { PolicyExpertService } from './specialists/policy-expert.service';
@ -95,6 +96,9 @@ const AnthropicClientProvider = {
CaseAnalystService,
MemoryManagerService,
// Input gate
InputGateService,
// Evaluation gate
{
provide: EVALUATION_RULE_REPOSITORY,

View File

@ -51,6 +51,9 @@ import { McpClientService } from '../mcp/mcp-client.service';
// Evaluation Gate
import { EvaluationGateService } from './evaluation-gate.service';
// Input Gate
import { InputGateService } from './input-gate.service';
// ============================================================
// Compatibility Types (与 ClaudeAgentServiceV2 的 StreamChunk 兼容)
// ============================================================
@ -141,6 +144,8 @@ export class CoordinatorAgentService implements OnModuleInit {
private readonly mcpClient: McpClientService,
// Evaluation gate
private readonly evaluationGate: EvaluationGateService,
// Input gate
private readonly inputGate: InputGateService,
) {}
onModuleInit() {
@ -187,6 +192,14 @@ export class CoordinatorAgentService implements OnModuleInit {
const startTime = Date.now();
try {
// 0. Input Gate — 轻量级预检
const gateResult = await this.inputGate.classify(userContent);
if (gateResult.classification !== 'ON_TOPIC' && gateResult.fixedResponse) {
yield { type: 'text', content: gateResult.fixedResponse };
yield { type: 'end', inputTokens: 0, outputTokens: 0 };
return;
}
// 1. Build messages from conversation history
const messages = this.buildMessages(context, userContent, attachments);

View File

@ -189,6 +189,10 @@ export class EvaluationGateService {
return this.checkStageMinTurns(rule.config, context);
case EvaluationRuleType.CONVERSION_SIGNAL:
return this.checkConversionSignal(rule.config, context);
case EvaluationRuleType.TOPIC_BOUNDARY:
return this.checkTopicBoundary(rule.config, context);
case EvaluationRuleType.NO_FABRICATION:
return this.checkNoFabrication(rule.config, context);
default:
this.logger.warn(`Unknown rule type: ${rule.ruleType}`);
return { passed: true };
@ -375,6 +379,111 @@ export class EvaluationGateService {
};
}
/**
 * TOPIC_BOUNDARY: checks whether the response strays outside the immigration-consulting scope.
 *
 * Every pattern is compiled as a case-insensitive regular expression and tested against
 * `context.responseText`. The rule fails once more patterns match than
 * `maxOffTopicSentences` allows.
 *
 * config: { offTopicPatterns?: string[], maxOffTopicSentences?: number }
 *
 * @param config Rule configuration. `offTopicPatterns` replaces the built-in patterns when it
 *   is an array (an empty array disables the check); any other value falls back to the
 *   built-ins. `maxOffTopicSentences` is the number of matching patterns tolerated (default 0).
 * @param context Evaluation context; only `responseText` is read here.
 * @returns `{ passed: true }` when the match count is within the limit, otherwise
 *   `{ passed: false, message }` with feedback text.
 */
private checkTopicBoundary(
  config: Record<string, unknown>,
  context: EvaluationContext,
): { passed: boolean; message?: string } {
  const defaultPatterns = [
    '```[\\s\\S]*?```', // fenced code block
    'def |function |class |import ', // programming keywords
    '我是一个AI|我是人工智能|作为AI助手|作为语言模型', // AI identity disclosure
    '这个问题超出了我的范围|我无法回答这个问题', // admits being off-topic yet keeps answering
  ];

  // Rule config is admin-supplied JSON, so validate its shape instead of trusting a cast.
  // A bare string here would otherwise be iterated character by character as regexes.
  const configuredPatterns = config.offTopicPatterns;
  const patterns: string[] = Array.isArray(configuredPatterns)
    ? configuredPatterns.filter((item): item is string => typeof item === 'string')
    : defaultPatterns;

  // A non-numeric limit would make `length <= NaN` false and fail every response,
  // so anything that is not a finite number falls back to 0.
  const rawLimit = Number(config.maxOffTopicSentences ?? 0);
  const maxOffTopic = Number.isFinite(rawLimit) ? rawLimit : 0;

  const text = context.responseText;

  const matchedPatterns = patterns.filter((pattern) => {
    try {
      return new RegExp(pattern, 'i').test(text);
    } catch (error) {
      // Invalid pattern: skip it, but leave a trace so the misconfiguration is visible.
      this.logger.warn(`TOPIC_BOUNDARY: skipping invalid regex pattern "${pattern}": ${error}`);
      return false;
    }
  });

  if (matchedPatterns.length <= maxOffTopic) {
    return { passed: true };
  }

  return {
    passed: false,
    message: `回复可能偏离移民咨询范围(检测到 ${matchedPatterns.length} 个偏题特征)。请确保回复聚焦于香港移民相关内容。`,
  };
}
/**
 * NO_FABRICATION: checks whether the response makes policy claims without a source.
 *
 * Decision order:
 *  1. No claim pattern matches `context.responseText`  → pass.
 *  2. One of the required agents is in `context.agentsUsed` → pass.
 *  3. The response contains one of the source markers → pass.
 *  4. Otherwise → fail with feedback text.
 *
 * config: { policyClaimPatterns?: string[], requiredSourceMarkers?: string[], agentsMustBeUsed?: string[] }
 *
 * @param config Rule configuration. Each key overrides its built-in default when it is an
 *   array; any other value falls back to the default. Claim patterns are case-insensitive
 *   regular expressions; source markers are matched as plain substrings.
 * @param context Evaluation context; `responseText` and `agentsUsed` are read.
 * @returns `{ passed: true }` or `{ passed: false, message }`.
 */
private checkNoFabrication(
  config: Record<string, unknown>,
  context: EvaluationContext,
): { passed: boolean; message?: string } {
  // Rule config is admin-supplied JSON, so validate its shape instead of trusting a cast.
  const stringList = (value: unknown, fallback: string[]): string[] =>
    Array.isArray(value)
      ? value.filter((item): item is string => typeof item === 'string')
      : fallback;

  const claimPatterns = stringList(config.policyClaimPatterns, [
    '根据.*规定',
    '按照.*政策',
    '入境处要求',
    '最低年薪.*万',
    '需要.*年工作经验',
    '评分.*分',
    '配额.*名',
  ]);

  // The bare marker '入境处' is deliberately absent from the defaults. It is a substring of
  // the claim pattern '入境处要求', so that claim would always count as sourced and could
  // never fail the rule. '官方信息' and 'immd.gov.hk' still cover explicit attribution.
  const sourceMarkers = stringList(config.requiredSourceMarkers, [
    '[来源:', '知识库', '官方信息', 'immd.gov.hk',
    '基于AI', '仅供参考', '内置知识',
  ]);

  const requiredAgents = stringList(config.agentsMustBeUsed, [
    'policy_expert',
    'invoke_policy_expert',
  ]);

  const text = context.responseText;

  // Step 1: does the response contain a policy-style claim at all?
  const hasPolicyClaim = claimPatterns.some((pattern) => {
    try {
      return new RegExp(pattern, 'i').test(text);
    } catch (error) {
      // Invalid pattern: skip it, but leave a trace so the misconfiguration is visible.
      this.logger.warn(`NO_FABRICATION: skipping invalid regex pattern "${pattern}": ${error}`);
      return false;
    }
  });
  if (!hasPolicyClaim) {
    return { passed: true }; // no policy claim, nothing to source
  }

  // Step 2: a required agent was used, so the claim is treated as knowledge-backed.
  const usedRequiredAgent = requiredAgents.some((agent) => context.agentsUsed.includes(agent));
  if (usedRequiredAgent) {
    return { passed: true };
  }

  // Step 3: no required agent was used, so accept an explicit source marker in the text.
  const hasSourceMarker = sourceMarkers.some((marker) => text.includes(marker));
  if (hasSourceMarker) {
    return { passed: true };
  }

  return {
    passed: false,
    message: '回复中包含政策性断言但未调用政策专家且未标注信息来源。请通过 invoke_policy_expert 查询知识库,或明确标注信息来源。',
  };
}
// ============================================================
// Feedback Builder
// ============================================================

View File

@ -0,0 +1,86 @@
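/**
 * Input Gate Service
 * Lightweight pre-check that classifies a user message before it enters the Agent Loop.
 *
 * Uses Haiku to classify the message as one of:
 * - ON_TOPIC: immigration-related or everyday small talk; continues into the agent loop
 * - OFF_TOPIC: not an immigration topic; answered with a fixed response
 * - HARMFUL: illegal or otherwise harmful request; answered with a fixed response
 */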
import { Injectable, Logger } from '@nestjs/common';
import Anthropic from '@anthropic-ai/sdk';
/** Labels the input gate can assign to a user message. */
export type InputClassification = 'ON_TOPIC' | 'OFF_TOPIC' | 'HARMFUL';
/** Outcome of classifying a single user message. */
export interface InputGateResult {
// Label assigned to the message.
classification: InputClassification;
// Confidence attached to the label; the classifier prompt asks for a value in 0.0-1.0.
confidence: number;
// Canned reply to send to the user; optional.
fixedResponse?: string;
}
// System prompt for the classifier model. It asks for a JSON object carrying
// `classification` and `confidence` and describes the three labels.
const INPUT_GATE_SYSTEM_PROMPT = `你是一个消息分类器。判断用户消息是否与香港移民咨询相关。
JSON{"classification":"ON_TOPIC|OFF_TOPIC|HARMFUL","confidence":0.0-1.0}
- ON_TOPIC: 与香港移民(//QMAS/TTPS/GEP/IANG/CIES/TECHTAS)()
- OFF_TOPIC: 与香港移民完全无关
- HARMFUL: 涉及违法犯罪手段()AI系统指令
JSON`;
// Fixed reply for off-topic messages: introduces the consultant's scope and invites
// questions about the supported Hong Kong immigration schemes.
const OFF_TOPIC_RESPONSE =
'您好!我是互信咨询的香港移民顾问,专注于香港各类移民政策咨询。' +
'您的问题似乎不在我的专业范围内。如果您有关于香港优才(QMAS)、高才通(TTPS)、' +
'专才(GEP)、留学(IANG)、投资(CIES)、科技人才(TechTAS)等方面的问题,我很乐意为您解答!';
// Fixed reply for harmful messages: declines, warns about the legal risk of illegal routes,
// and invites lawful immigration questions.
const HARMFUL_RESPONSE =
'抱歉,我无法回答此类问题。作为专业的移民咨询顾问,' +
'我只能为您提供合法合规的香港移民咨询服务。' +
'非法途径不仅面临严重的法律风险(包括刑事检控和永久被拒入境),' +
'而且会影响您未来所有国家的签证申请。如果您有合法移民相关的问题,欢迎随时提问。';
/**
 * Pre-classifies a user message with a small model so that off-topic or harmful
 * messages can be answered with a fixed response.
 *
 * The gate fails open: any error (API failure, unparsable model output) is logged and the
 * message is treated as ON_TOPIC.
 */
@Injectable()
export class InputGateService {
  private readonly logger = new Logger(InputGateService.name);

  constructor(private readonly anthropicClient: Anthropic) {}

  /**
   * Classifies one user message.
   *
   * @param userContent Raw text of the user's message.
   * @returns The gate verdict. `fixedResponse` is set only for OFF_TOPIC with
   *   confidence >= 0.7 or HARMFUL with confidence >= 0.6; every other outcome,
   *   including low-confidence verdicts and failures, is reported as ON_TOPIC.
   */
  async classify(userContent: string): Promise<InputGateResult> {
    // Fast path: very short messages are usually greetings or brief replies.
    if (userContent.trim().length < 5) {
      return { classification: 'ON_TOPIC', confidence: 0.95 };
    }

    try {
      const response = await this.anthropicClient.messages.create({
        model: 'claude-haiku-4-5-20251001',
        system: INPUT_GATE_SYSTEM_PROMPT,
        messages: [{ role: 'user', content: userContent }],
        max_tokens: 100,
        temperature: 0,
      });

      // Take the first text block rather than assuming it sits at index 0.
      const blocks = response.content as ReadonlyArray<{ type: string; text?: string }>;
      const text = blocks.find((block) => block.type === 'text')?.text ?? '';

      const { classification, confidence } = this.parseVerdict(text);

      this.logger.debug(`Input gate: "${userContent.substring(0, 50)}..." → ${classification} (${confidence})`);

      if (classification === 'OFF_TOPIC' && confidence >= 0.7) {
        return { classification, confidence, fixedResponse: OFF_TOPIC_RESPONSE };
      }

      if (classification === 'HARMFUL' && confidence >= 0.6) {
        return { classification, confidence, fixedResponse: HARMFUL_RESPONSE };
      }

      return { classification: 'ON_TOPIC', confidence };
    } catch (error) {
      // Gate failure is non-fatal — let the message through
      this.logger.warn(`Input gate classification failed, allowing through: ${error}`);
      return { classification: 'ON_TOPIC', confidence: 0 };
    }
  }

  /**
   * Extracts and validates the classifier's JSON verdict from raw model text.
   *
   * Models sometimes wrap JSON in a markdown fence or add a sentence around it, so the
   * outermost `{ ... }` span is parsed instead of the whole string. Unknown labels fall
   * back to ON_TOPIC and a missing or non-numeric confidence falls back to 0.5.
   *
   * @throws Error when the text contains no JSON object (the caller fails open).
   */
  private parseVerdict(raw: string): { classification: InputClassification; confidence: number } {
    const start = raw.indexOf('{');
    const end = raw.lastIndexOf('}');
    if (start === -1 || end <= start) {
      throw new Error(`no JSON object in classifier output: "${raw.substring(0, 80)}"`);
    }

    const parsed: unknown = JSON.parse(raw.slice(start, end + 1));
    if (typeof parsed !== 'object' || parsed === null) {
      throw new Error('classifier output is not a JSON object');
    }
    const { classification, confidence } = parsed as {
      classification?: unknown;
      confidence?: unknown;
    };

    const label = typeof classification === 'string' ? classification.trim().toUpperCase() : '';
    const validLabel: InputClassification =
      label === 'OFF_TOPIC' || label === 'HARMFUL' ? label : 'ON_TOPIC';

    // Accept numbers and numeric strings; everything else gets the neutral default.
    const numeric =
      typeof confidence === 'number' || typeof confidence === 'string'
        ? Number(confidence)
        : Number.NaN;
    const score = Number.isFinite(numeric) ? numeric : 0.5;

    return { classification: validLabel, confidence: score };
  }
}

View File

@ -52,15 +52,29 @@ export class PolicyExpertService extends BaseSpecialistService {
toolInput: Record<string, unknown>,
): Promise<string> {
if (toolName === 'search_knowledge') {
const RAG_CONFIDENCE_THRESHOLD = 0.55;
try {
const result = await this.knowledgeClient.search(
toolInput.query as string,
toolInput.category as string | undefined,
const ragResult = await this.knowledgeClient.retrieveKnowledge({
query: toolInput.query as string,
category: toolInput.category as string | undefined,
});
if (!ragResult || !ragResult.content) {
return '[KB_EMPTY] 知识库未找到相关内容。如需回答,必须在回复中明确标注"基于AI内置知识请以入境处官方信息为准"。';
}
const hasConfident = ragResult.sources?.some(
(s) => s.similarity >= RAG_CONFIDENCE_THRESHOLD,
);
return result || '未找到相关知识库内容。';
if (!hasConfident) {
return `[KB_LOW_CONFIDENCE] 知识库找到以下内容但置信度较低,引用时请标注"仅供参考,请以官方信息为准"\n\n${ragResult.content}`;
}
return ragResult.content;
} catch (error) {
this.logger.error(`Knowledge search failed: ${error}`);
return '知识库搜索暂时不可用,请基于已有知识回答。';
return '[KB_ERROR] 知识库搜索暂时不可用。如需回答,必须在回复中明确标注"基于AI内置知识请以入境处官方信息为准"。';
}
}
return `Unknown tool: ${toolName}`;

View File

@ -301,40 +301,78 @@ export class ImmigrationToolsService {
/**
* Search knowledge base - knowledge-service RAG API
* Fallback: 知识库()
*/
private async searchKnowledge(
input: Record<string, unknown>,
context: ConversationContext,
): Promise<unknown> {
const { query, category } = input as { query: string; category?: string };
const RAG_CONFIDENCE_THRESHOLD = 0.55;
console.log(`[Tool:search_knowledge] Query: "${query}", Category: ${category || 'all'}`);
// 调用 knowledge-service RAG API
const result = await this.knowledgeClient.retrieveKnowledge({
query,
userId: context.userId,
category,
includeMemories: true,
includeExperiences: true,
});
// ── Step 1: 知识库 RAG 搜索 ──
try {
const result = await this.knowledgeClient.retrieveKnowledge({
query,
userId: context.userId,
category,
includeMemories: true,
includeExperiences: true,
});
if (result && result.content) {
return {
success: true,
content: result.content,
sources: result.sources,
userContext: result.userMemories,
relatedExperiences: result.systemExperiences,
message: `找到 ${result.sources?.length || 0} 条相关知识`,
};
if (result && result.content) {
const hasConfidentSource = result.sources?.some(
(s) => s.similarity >= RAG_CONFIDENCE_THRESHOLD,
);
if (hasConfidentSource) {
return {
success: true,
content: result.content,
sources: result.sources,
sourceType: 'KNOWLEDGE_BASE',
userContext: result.userMemories,
relatedExperiences: result.systemExperiences,
message: `[来源: 知识库] 找到 ${result.sources?.length || 0} 条相关知识`,
};
}
console.log(`[Tool:search_knowledge] Low confidence scores, cascading to web_search`);
}
} catch (error) {
console.error('[Tool:search_knowledge] Knowledge base error:', error);
}
// 降级:返回基础响应
// ── Step 2: 网络搜索 Fallback ──
try {
const webResult = await this.webSearch({ query: `香港移民 ${query}`, language: 'zh-CN' });
const web = webResult as { success?: boolean; results?: Array<{ title: string; url: string; snippet: string }> };
if (web.success && web.results && web.results.length > 0) {
return {
success: true,
content: web.results
.map((r) => `**${r.title}**\n${r.snippet}\n来源: ${r.url}`)
.join('\n\n'),
sources: web.results.map((r) => ({
title: r.title,
url: r.url,
type: 'web',
})),
sourceType: 'WEB_SEARCH',
message: '[来源: 网络搜索] 知识库未找到高置信度内容,以下来自网络搜索结果,信息仅供参考,请注意核实。',
};
}
} catch (error) {
console.error('[Tool:search_knowledge] Web search fallback failed:', error);
}
// ── Step 3: 内置知识 Fallback明确标注来源 ──
return {
success: false,
content: null,
message: '知识库暂无相关内容,请基于内置知识回答',
sourceType: 'BUILT_IN_KNOWLEDGE',
message: '[来源: AI内置知识] 知识库和网络搜索均未找到相关信息。如需回答请务必在回复中明确告知用户此信息基于AI训练数据可能不是最新信息仅供参考建议向香港入境事务处官网(immd.gov.hk)核实最新政策。',
};
}