feat: add multimodal image support to Claude Agent SDK engine

- SDK engine now constructs AsyncIterable<SDKUserMessage> with image content blocks when attachments are present in conversationHistory, using the SDK's native multimodal prompt format
- CLI engine logs a warning when images are detected, since the `-p` flag only accepts text (upstream Claude CLI limitation)
- Both SDK and API engines now fully support multimodal image input

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-28 03:38:59 -08:00 · 2026-02-28 03:38:59 -08:00 · b9c3bfdf91
parent e4c2505048
commit b9c3bfdf91
2 changed files with 59 additions and 1 deletion
--- a/packages/services/agent-service/src/infrastructure/engines/claude-agent-sdk/claude-agent-sdk-engine.ts
+++ b/packages/services/agent-service/src/infrastructure/engines/claude-agent-sdk/claude-agent-sdk-engine.ts
@@ -170,8 +170,11 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
        this.logger.log(`Resuming SDK session: ${params.resumeSessionId} for session ${params.sessionId}`);
      }

+      // Build prompt: use multimodal SDKUserMessage when attachments are present
+      const sdkPrompt = this.buildSdkPrompt(params);
+
      const sdkQuery = query({
-        prompt: params.prompt,
+        prompt: sdkPrompt,
        options: sdkOptions,
      });

@@ -455,6 +458,50 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
    this.logger.debug(`Set HOME=${tenantHome} for tenant ${tenantId}`);
  }

+  /**
+   * Build the prompt value for the SDK call. Returns `params.prompt` unchanged unless the last `conversationHistory` entry is a user message whose content array contains an `image` block; in that case returns an async iterable that yields one user message.
+   * NOTE(review): on the multimodal path only the last message's content is sent and `params.prompt` is not included; presumably the caller already stores the prompt text in that last message. Confirm.
+   */
+  private buildSdkPrompt(
+    params: EngineTaskParams,
+  ): string | AsyncIterable<any> {
+    const history = params.conversationHistory;
+    if (!history || history.length === 0) {
+      return params.prompt;
+    }
+
+    // Only the final history entry is inspected; image blocks in earlier messages do not trigger the multimodal path
+    const lastMsg = history[history.length - 1];
+    const hasImageBlocks =
+      lastMsg.role === 'user' &&
+      Array.isArray(lastMsg.content) &&
+      lastMsg.content.some((block: any) => block.type === 'image');
+
+    if (!hasImageBlocks) {
+      return params.prompt;
+    }
+
+    // Wrap the last message's content blocks (passed through as-is) in one user-message envelope; presumably matches the SDK's SDKUserMessage shape (not type-checked here, verify against the SDK types)
+    this.logger.log(`Building multimodal SDK prompt with image content blocks`);
+
+    const userMessage = {
+      type: 'user' as const,
+      message: {
+        role: 'user' as const,
+        content: lastMsg.content,
+      },
+      parent_tool_use_id: null,
+      session_id: '',
+    };
+
+    // One-shot async generator: yields the message once, then completes
+    async function* singleMessage() {
+      yield userMessage;
+    }
+
+    return singleMessage();
+  }
+
  private classifyToolRisk(toolName: string, toolInput: any): CommandRiskLevel {
    // Only classify Bash commands for risk; other tools are auto-allowed
    if (toolName === 'Bash' && typeof toolInput?.command === 'string') {
--- a/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/claude-code-engine.ts
+++ b/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/claude-code-engine.ts
@@ -15,6 +15,17 @@ export class ClaudeCodeCliEngine implements AgentEnginePort {
  constructor(private readonly configService: ConfigService) {}

  async *executeTask(params: EngineTaskParams): AsyncGenerator<EngineStreamEvent> {
+    // CLI engine does not support multimodal — `-p` only accepts text.
+    // If attachments are present, log a warning. Use SDK or API engine for images.
+    if (params.conversationHistory?.some(
+      (m) => Array.isArray(m.content) && m.content.some((b: any) => b.type === 'image'),
+    )) {
+      this.logger.warn(
+        `[Session ${params.sessionId}] CLI engine does not support multimodal (images). ` +
+        `Image content blocks will be ignored. Use claude_agent_sdk or claude_api engine instead.`,
+      );
+    }
+
    const args = [
      '-p', params.prompt,
      '--output-format', 'stream-json',