feat: add multimodal image support to Claude Agent SDK engine

- SDK engine now constructs AsyncIterable<SDKUserMessage> with image
  content blocks when the last user message in conversationHistory
  contains image blocks, using the SDK's native multimodal prompt format
- CLI engine logs a warning when images are detected, since the `-p`
  flag only accepts text (upstream Claude CLI limitation)
- Both SDK and API engines now fully support multimodal image input

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-28 03:38:59 -08:00
parent e4c2505048
commit b9c3bfdf91
2 changed files with 59 additions and 1 deletions

View File

@ -170,8 +170,11 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
this.logger.log(`Resuming SDK session: ${params.resumeSessionId} for session ${params.sessionId}`);
}
// Build prompt: use multimodal SDKUserMessage when attachments are present
const sdkPrompt = this.buildSdkPrompt(params);
const sdkQuery = query({
prompt: params.prompt,
prompt: sdkPrompt,
options: sdkOptions,
});
@ -455,6 +458,50 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
this.logger.debug(`Set HOME=${tenantHome} for tenant ${tenantId}`);
}
/**
 * Build the value passed as `prompt` to the SDK `query()` call.
 *
 * - Text-only turn: returns `params.prompt` unchanged (a plain string).
 * - Multimodal turn: when the LAST entry of `params.conversationHistory` is a
 *   user message whose `content` is an array containing at least one `image`
 *   block, returns an async iterable that yields exactly one
 *   SDKUserMessage-shaped object carrying those content blocks.
 *
 * Only the last history entry is inspected; images in earlier messages do not
 * switch the prompt to multimodal mode.
 *
 * If the multimodal content has no non-empty `text` block, `params.prompt`
 * (when non-blank) is appended as a trailing text block so the user's text is
 * not silently dropped. An empty text block is never emitted.
 *
 * @param params Task parameters; reads `prompt` and `conversationHistory`.
 * @returns The plain prompt string, or a single-message async iterable.
 */
private buildSdkPrompt(
  params: EngineTaskParams,
): string | AsyncIterable<any> {
  const history = params.conversationHistory;
  if (!history || history.length === 0) {
    return params.prompt;
  }

  // Only the most recent message decides whether this turn is multimodal.
  const lastMsg = history[history.length - 1];
  const lastContent: unknown = lastMsg?.content;
  if (lastMsg?.role !== 'user' || !Array.isArray(lastContent)) {
    return params.prompt;
  }

  // Content blocks come from stored history with no visible schema, hence `any`.
  const blocks: any[] = lastContent;
  const hasImageBlocks = blocks.some((block) => block?.type === 'image');
  if (!hasImageBlocks) {
    return params.prompt;
  }

  this.logger.log(`Building multimodal SDK prompt with image content blocks`);

  // Keep the text prompt if the stored blocks carry images only.
  const hasTextBlock = blocks.some(
    (block) =>
      block?.type === 'text' &&
      typeof block.text === 'string' &&
      block.text.trim() !== '',
  );
  const promptText = typeof params.prompt === 'string' ? params.prompt : '';
  const needsPromptText = !hasTextBlock && promptText.trim() !== '';

  // Copy the array so the SDK never shares (or mutates) the history entry.
  const content = needsPromptText
    ? [...blocks, { type: 'text' as const, text: promptText }]
    : [...blocks];

  const userMessage = {
    type: 'user' as const,
    message: {
      role: 'user' as const,
      content,
    },
    parent_tool_use_id: null,
    session_id: '',
  };

  // The SDK's multimodal input is an async stream; emit the one message and end.
  async function* singleMessage() {
    yield userMessage;
  }
  return singleMessage();
}
private classifyToolRisk(toolName: string, toolInput: any): CommandRiskLevel {
// Only classify Bash commands for risk; other tools are auto-allowed
if (toolName === 'Bash' && typeof toolInput?.command === 'string') {

View File

@ -15,6 +15,17 @@ export class ClaudeCodeCliEngine implements AgentEnginePort {
/**
 * Receives a ConfigService and keeps it as a private readonly property via
 * the parameter-property shorthand; the constructor body itself is empty.
 */
constructor(private readonly configService: ConfigService) {}
async *executeTask(params: EngineTaskParams): AsyncGenerator<EngineStreamEvent> {
// CLI engine does not support multimodal — `-p` only accepts text.
// If attachments are present, log a warning. Use SDK or API engine for images.
if (params.conversationHistory?.some(
(m) => Array.isArray(m.content) && m.content.some((b: any) => b.type === 'image'),
)) {
this.logger.warn(
`[Session ${params.sessionId}] CLI engine does not support multimodal (images). ` +
`Image content blocks will be ignored. Use claude_agent_sdk or claude_api engine instead.`,
);
}
const args = [
'-p', params.prompt,
'--output-format', 'stream-json',