feat: add multimodal image support to Claude Agent SDK engine
- SDK engine now constructs an AsyncIterable<SDKUserMessage> with image content blocks when attachments are present in conversationHistory, using the SDK's native multimodal prompt format.
- CLI engine logs a warning when images are detected, since the `-p` flag only accepts text (an upstream Claude CLI limitation).
- Both the SDK and API engines now fully support multimodal image input.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e4c2505048
commit
b9c3bfdf91
|
|
@ -170,8 +170,11 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
|
||||||
this.logger.log(`Resuming SDK session: ${params.resumeSessionId} for session ${params.sessionId}`);
|
this.logger.log(`Resuming SDK session: ${params.resumeSessionId} for session ${params.sessionId}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Build prompt: use multimodal SDKUserMessage when attachments are present
|
||||||
|
const sdkPrompt = this.buildSdkPrompt(params);
|
||||||
|
|
||||||
const sdkQuery = query({
|
const sdkQuery = query({
|
||||||
prompt: params.prompt,
|
prompt: sdkPrompt,
|
||||||
options: sdkOptions,
|
options: sdkOptions,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
@ -455,6 +458,50 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
|
||||||
this.logger.debug(`Set HOME=${tenantHome} for tenant ${tenantId}`);
|
this.logger.debug(`Set HOME=${tenantHome} for tenant ${tenantId}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build the value passed as `prompt` to the SDK `query()` call.
 *
 * Returns `params.prompt` unchanged (a plain string) unless the LAST entry of
 * `params.conversationHistory` is a user message whose `content` is an array
 * containing at least one block with `type === 'image'`. In that case the
 * method returns an async iterable that yields exactly one user message
 * carrying that entry's content blocks.
 *
 * NOTE(review): in the multimodal branch `params.prompt` itself is not sent;
 * only the last history entry's content is. This presumably relies on that
 * entry already containing the prompt text — confirm against the caller.
 *
 * @param params - Task parameters; only `prompt` and `conversationHistory` are read.
 * @returns The text prompt, or an async iterable yielding a single user message.
 */
private buildSdkPrompt(
  params: EngineTaskParams,
): string | AsyncIterable<any> {
  const history = params.conversationHistory;
  if (!history || history.length === 0) {
    return params.prompt;
  }

  // Only the most recent history entry is inspected for image blocks.
  const lastMsg = history[history.length - 1];
  const hasImageBlocks =
    lastMsg.role === 'user' &&
    Array.isArray(lastMsg.content) &&
    // Optional chaining: a null/undefined entry in the content array counts as
    // "not an image" instead of throwing a TypeError.
    lastMsg.content.some((block: any) => block?.type === 'image');

  if (!hasImageBlocks) {
    return params.prompt;
  }

  // Build a single user message with multimodal content.
  this.logger.log(`Building multimodal SDK prompt with image content blocks`);

  const userMessage = {
    type: 'user' as const,
    message: {
      role: 'user' as const,
      content: lastMsg.content,
    },
    parent_tool_use_id: null,
    // Left empty here; presumably filled in by the SDK — TODO confirm.
    session_id: '',
  };

  // The returned generator object can be iterated once and yields one message.
  async function* singleMessage() {
    yield userMessage;
  }

  return singleMessage();
}
|
||||||
|
|
||||||
private classifyToolRisk(toolName: string, toolInput: any): CommandRiskLevel {
|
private classifyToolRisk(toolName: string, toolInput: any): CommandRiskLevel {
|
||||||
// Only classify Bash commands for risk; other tools are auto-allowed
|
// Only classify Bash commands for risk; other tools are auto-allowed
|
||||||
if (toolName === 'Bash' && typeof toolInput?.command === 'string') {
|
if (toolName === 'Bash' && typeof toolInput?.command === 'string') {
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,17 @@ export class ClaudeCodeCliEngine implements AgentEnginePort {
|
||||||
constructor(private readonly configService: ConfigService) {}
|
constructor(private readonly configService: ConfigService) {}
|
||||||
|
|
||||||
async *executeTask(params: EngineTaskParams): AsyncGenerator<EngineStreamEvent> {
|
async *executeTask(params: EngineTaskParams): AsyncGenerator<EngineStreamEvent> {
|
||||||
|
// CLI engine does not support multimodal — `-p` only accepts text.
|
||||||
|
// If attachments are present, log a warning. Use SDK or API engine for images.
|
||||||
|
if (params.conversationHistory?.some(
|
||||||
|
(m) => Array.isArray(m.content) && m.content.some((b: any) => b.type === 'image'),
|
||||||
|
)) {
|
||||||
|
this.logger.warn(
|
||||||
|
`[Session ${params.sessionId}] CLI engine does not support multimodal (images). ` +
|
||||||
|
`Image content blocks will be ignored. Use claude_agent_sdk or claude_api engine instead.`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
const args = [
|
const args = [
|
||||||
'-p', params.prompt,
|
'-p', params.prompt,
|
||||||
'--output-format', 'stream-json',
|
'--output-format', 'stream-json',
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue