diff --git a/packages/services/agent-service/src/infrastructure/engines/claude-agent-sdk/claude-agent-sdk-engine.ts b/packages/services/agent-service/src/infrastructure/engines/claude-agent-sdk/claude-agent-sdk-engine.ts
index 65e9314..0d4b8c7 100644
--- a/packages/services/agent-service/src/infrastructure/engines/claude-agent-sdk/claude-agent-sdk-engine.ts
+++ b/packages/services/agent-service/src/infrastructure/engines/claude-agent-sdk/claude-agent-sdk-engine.ts
@@ -170,8 +170,11 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
       this.logger.log(`Resuming SDK session: ${params.resumeSessionId} for session ${params.sessionId}`);
     }
 
+    // Build prompt: use multimodal SDKUserMessage when attachments are present
+    const sdkPrompt = this.buildSdkPrompt(params);
+
     const sdkQuery = query({
-      prompt: params.prompt,
+      prompt: sdkPrompt,
       options: sdkOptions,
     });
 
@@ -455,6 +458,76 @@ export class ClaudeAgentSdkEngine implements AgentEnginePort {
     this.logger.debug(`Set HOME=${tenantHome} for tenant ${tenantId}`);
   }
 
+  /**
+   * Builds the `prompt` argument passed to the SDK `query()` call.
+   *
+   * Returns `params.prompt` unchanged unless the last entry of
+   * `params.conversationHistory` is a user message whose `content` is an
+   * array containing at least one `image` block. In that case the result is
+   * an async iterable that yields a single SDK user message carrying that
+   * content array.
+   *
+   * NOTE(review): on the multimodal path `params.prompt` itself is not sent;
+   * this presumably relies on the last history message already holding the
+   * prompt text as a content block. Confirm against the caller.
+   *
+   * The return type is derived from `query()` so it always matches what the
+   * `prompt` option accepts.
+   *
+   * @param params - Task parameters; only `prompt` and
+   *   `conversationHistory` are read.
+   * @returns The plain prompt string, or a one-shot async iterable for
+   *   multimodal input.
+   */
+  private buildSdkPrompt(
+    params: EngineTaskParams,
+  ): Parameters<typeof query>[0]['prompt'] {
+    const history = params.conversationHistory;
+    if (!history || history.length === 0) {
+      return params.prompt;
+    }
+
+    // Typed as `unknown` so a null or primitive entry in the content array
+    // counts as "not an image" instead of throwing on `.type`.
+    const isImageBlock = (block: unknown): boolean =>
+      typeof block === 'object' &&
+      block !== null &&
+      (block as { type?: unknown }).type === 'image';
+
+    // Only the last history message is inspected for image blocks.
+    const lastMsg = history[history.length - 1];
+    const hasImageBlocks =
+      lastMsg.role === 'user' &&
+      Array.isArray(lastMsg.content) &&
+      lastMsg.content.some(isImageBlock);
+
+    if (!hasImageBlocks) {
+      return params.prompt;
+    }
+
+    this.logger.log(`Building multimodal SDK prompt with image content blocks`);
+
+    // Single SDK user message wrapping the multimodal content.
+    // NOTE(review): `session_id` is sent empty; presumably the SDK assigns
+    // the real id. Confirm against the SDK documentation.
+    const userMessage = {
+      type: 'user' as const,
+      message: {
+        role: 'user' as const,
+        content: lastMsg.content,
+      },
+      parent_tool_use_id: null,
+      session_id: '',
+    };
+
+    // One-shot async generator: yields the message once, then completes.
+    async function* singleMessage() {
+      yield userMessage;
+    }
+
+    return singleMessage();
+  }
+
   private classifyToolRisk(toolName: string, toolInput: any): CommandRiskLevel {
     // Only classify Bash commands for risk; other tools are auto-allowed
     if (toolName === 'Bash' && typeof toolInput?.command === 'string') {
diff --git a/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/claude-code-engine.ts b/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/claude-code-engine.ts
index 410df86..efb3f10 100644
--- a/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/claude-code-engine.ts
+++ b/packages/services/agent-service/src/infrastructure/engines/claude-code-cli/claude-code-engine.ts
@@ -15,6 +15,23 @@ export class ClaudeCodeCliEngine implements AgentEnginePort {
   constructor(private readonly configService: ConfigService) {}
 
   async *executeTask(params: EngineTaskParams): AsyncGenerator {
+    // CLI engine does not support multimodal — `-p` only accepts text.
+    // If attachments are present, log a warning. Use SDK or API engine for images.
+    // Entries are checked as `unknown` so a null or primitive block cannot throw.
+    if (params.conversationHistory?.some(
+      (m) =>
+        Array.isArray(m.content) &&
+        m.content.some(
+          (b: unknown) =>
+            typeof b === 'object' && b !== null && (b as { type?: unknown }).type === 'image',
+        ),
+    )) {
+      this.logger.warn(
+        `[Session ${params.sessionId}] CLI engine does not support multimodal (images). ` +
+        `Image content blocks will be ignored. Use claude_agent_sdk or claude_api engine instead.`,
+      );
+    }
+
     const args = [
       '-p', params.prompt,
       '--output-format', 'stream-json',