feat: add multimodal image support to Claude Agent SDK engine
- SDK engine now constructs AsyncIterable<SDKUserMessage> with image content blocks when attachments are present in conversationHistory, using the SDK's native multimodal prompt format - CLI engine logs a warning when images are detected, since the `-p` flag only accepts text (upstream Claude CLI limitation) - Both SDK and API engines now fully support multimodal image input Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e4c2505048
commit
b9c3bfdf91
|
|
@ -170,8 +170,11 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
|
|||
this.logger.log(`Resuming SDK session: ${params.resumeSessionId} for session ${params.sessionId}`);
|
||||
}
|
||||
|
||||
// Build prompt: use multimodal SDKUserMessage when attachments are present
|
||||
const sdkPrompt = this.buildSdkPrompt(params);
|
||||
|
||||
const sdkQuery = query({
|
||||
prompt: params.prompt,
|
||||
prompt: sdkPrompt,
|
||||
options: sdkOptions,
|
||||
});
|
||||
|
||||
|
|
@ -455,6 +458,50 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
|
|||
this.logger.debug(`Set HOME=${tenantHome} for tenant ${tenantId}`);
|
||||
}
|
||||
|
||||
/**
|
||||
* Build the SDK prompt: plain string for text-only, or AsyncIterable<SDKUserMessage>
|
||||
* for multimodal (images). Checks if the last history message contains image blocks.
|
||||
*/
|
||||
private buildSdkPrompt(
|
||||
params: EngineTaskParams,
|
||||
): string | AsyncIterable<any> {
|
||||
const history = params.conversationHistory;
|
||||
if (!history || history.length === 0) {
|
||||
return params.prompt;
|
||||
}
|
||||
|
||||
// Check if the last message in history is a user message with image content blocks
|
||||
const lastMsg = history[history.length - 1];
|
||||
const hasImageBlocks =
|
||||
lastMsg.role === 'user' &&
|
||||
Array.isArray(lastMsg.content) &&
|
||||
lastMsg.content.some((block: any) => block.type === 'image');
|
||||
|
||||
if (!hasImageBlocks) {
|
||||
return params.prompt;
|
||||
}
|
||||
|
||||
// Build a single SDKUserMessage with multimodal content
|
||||
this.logger.log(`Building multimodal SDK prompt with image content blocks`);
|
||||
|
||||
const userMessage = {
|
||||
type: 'user' as const,
|
||||
message: {
|
||||
role: 'user' as const,
|
||||
content: lastMsg.content,
|
||||
},
|
||||
parent_tool_use_id: null,
|
||||
session_id: '',
|
||||
};
|
||||
|
||||
// Return an async iterable that yields a single SDKUserMessage
|
||||
async function* singleMessage() {
|
||||
yield userMessage;
|
||||
}
|
||||
|
||||
return singleMessage();
|
||||
}
|
||||
|
||||
private classifyToolRisk(toolName: string, toolInput: any): CommandRiskLevel {
|
||||
// Only classify Bash commands for risk; other tools are auto-allowed
|
||||
if (toolName === 'Bash' && typeof toolInput?.command === 'string') {
|
||||
|
|
|
|||
|
|
@ -15,6 +15,17 @@ export class ClaudeCodeCliEngine implements AgentEnginePort {
|
|||
constructor(private readonly configService: ConfigService) {}
|
||||
|
||||
async *executeTask(params: EngineTaskParams): AsyncGenerator<EngineStreamEvent> {
|
||||
// CLI engine does not support multimodal — `-p` only accepts text.
|
||||
// If attachments are present, log a warning. Use SDK or API engine for images.
|
||||
if (params.conversationHistory?.some(
|
||||
(m) => Array.isArray(m.content) && m.content.some((b: any) => b.type === 'image'),
|
||||
)) {
|
||||
this.logger.warn(
|
||||
`[Session ${params.sessionId}] CLI engine does not support multimodal (images). ` +
|
||||
`Image content blocks will be ignored. Use claude_agent_sdk or claude_api engine instead.`,
|
||||
);
|
||||
}
|
||||
|
||||
const args = [
|
||||
'-p', params.prompt,
|
||||
'--output-format', 'stream-json',
|
||||
|
|
|
|||
Loading…
Reference in New Issue