fix(agents): preserve image content blocks in context injection — fixes 209K token overflow

injectIntoMessages() was JSON.stringify-ing array content (with image blocks),
turning base64 data into text tokens (~170K) instead of image tokens (~1,600).
Fix: append context as a new text block in the array, preserving image block format.

Also fixes token estimation to count images at ~1,600 tokens instead of base64 char length,
and adds debug logging for API call token composition.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-07 08:51:24 -08:00
parent 7dc364d9b3
commit fa835e4f56
3 changed files with 83 additions and 13 deletions

View File

@ -144,9 +144,32 @@ export async function* agentLoop(
}
// ---- Call Claude API (Streaming, with retry for rate limits) ----
logger.debug(
`[Turn ${currentTurn + 1}/${maxTurns}] Calling Claude API with ${messages.length} messages`,
);
// Debug: summarize what we are about to send so token blow-ups are diagnosable.
// Counts raw characters per content kind; images/documents are tallied separately
// because the API prices them by pixels/pages, not by their base64 text length.
{
  let totalTextChars = 0;
  let imageBlocks = 0;
  let documentBlocks = 0;
  for (const msg of messages) {
    const { content } = msg;
    if (typeof content === 'string') {
      totalTextChars += content.length;
      continue;
    }
    if (!Array.isArray(content)) continue;
    for (const block of content as any[]) {
      switch (block.type) {
        case 'text':
          totalTextChars += (block.text || '').length;
          break;
        case 'image':
          imageBlocks++;
          break;
        case 'document':
          documentBlocks++;
          break;
        case 'tool_result':
          totalTextChars += String(block.content || '').length;
          break;
      }
    }
  }
  // System prompt may be an array of text blocks; sum their lengths when so.
  const systemChars = Array.isArray(systemPrompt)
    ? systemPrompt.reduce((sum: number, b: any) => sum + (b.text?.length || 0), 0)
    : 0;
  // NOTE: /3 chars-per-token is a rough heuristic (tuned for Chinese text per
  // the sibling estimator) — this is a log line, not an accurate count.
  logger.log(
    `[Turn ${currentTurn + 1}/${maxTurns}] API call: ${messages.length} msgs, ` +
      `~${Math.round(totalTextChars / 3)} text tokens, ${imageBlocks} images (~${imageBlocks * 1600} tokens), ` +
      `${documentBlocks} docs, system prompt ~${Math.round(systemChars / 3)} tokens`,
  );
}
let stream!: ReturnType<typeof anthropicClient.messages.stream>;
const MAX_RETRIES = 2;

View File

@ -428,14 +428,26 @@ export class ContextInjectorService {
if (lastUserIndex >= 0) {
const lastUser = result[lastUserIndex];
const originalContent = typeof lastUser.content === 'string'
? lastUser.content
: JSON.stringify(lastUser.content);
const contextSuffix = `\n\n<system-context>\n${injectionText}\n</system-context>`;
result[lastUserIndex] = {
...lastUser,
content: `${originalContent}\n\n<system-context>\n${injectionText}\n</system-context>`,
};
if (typeof lastUser.content === 'string') {
// Simple string content — append as text
result[lastUserIndex] = {
...lastUser,
content: lastUser.content + contextSuffix,
};
} else if (Array.isArray(lastUser.content)) {
// Array content (has image/document blocks) — append as additional text block
// IMPORTANT: Do NOT JSON.stringify array content, as this would turn
// image base64 data into text tokens (~170K) instead of image tokens (~1,600)
result[lastUserIndex] = {
...lastUser,
content: [
...lastUser.content,
{ type: 'text' as const, text: contextSuffix },
],
};
}
}
return result;
@ -457,8 +469,9 @@ export class ContextInjectorService {
return { messages, wasCompacted: false };
}
// Rough token estimation: ~3 chars per token for Chinese
const estimatedTokens = JSON.stringify(messages).length / 3;
// Token estimation: strip base64 image/document data before measuring
// (images are counted by pixels, not by base64 text size)
const estimatedTokens = this.estimateMessageTokens(messages);
if (estimatedTokens < this.config.compactionThreshold) {
return { messages, wasCompacted: false };
@ -518,6 +531,39 @@ export class ContextInjectorService {
}
}
// ============================================================
// Token Estimation
// ============================================================
/**
 * Rough token estimate for a message list that deliberately ignores base64
 * payload size.
 *
 * Text-like content is estimated at ~3 characters per token; each image is
 * charged a flat ~1,600 tokens (the API bills images by pixel dimensions,
 * not by the length of their base64 encoding) and each document a flat
 * ~3,000 tokens. Tool-use blocks are estimated from their serialized input
 * plus a small fixed overhead.
 *
 * @param messages conversation messages to measure
 * @returns approximate total token count
 */
private estimateMessageTokens(messages: ClaudeMessage[]): number {
  // ~3 chars/token heuristic shared by all text-like content below.
  const textTokens = (s: string): number => Math.ceil(s.length / 3);
  let total = 0;
  for (const msg of messages) {
    const { content } = msg;
    if (typeof content === 'string') {
      total += textTokens(content);
      continue;
    }
    if (!Array.isArray(content)) continue;
    for (const block of content) {
      switch (block.type) {
        case 'text':
          if ('text' in block) total += textTokens(block.text);
          break;
        case 'image':
          total += 1600; // ~1,600 tokens per image (Claude API counts by pixels)
          break;
        case 'document':
          total += 3000; // Approximate for PDF documents
          break;
        case 'tool_result':
          if ('content' in block) total += textTokens(String(block.content));
          break;
        case 'tool_use':
          // Serialized input plus a fixed allowance for the tool name/ids.
          total += textTokens(JSON.stringify(block.input || {})) + 50;
          break;
      }
    }
  }
  return total;
}
// ============================================================
// Cache Helpers
// ============================================================

View File

@ -302,7 +302,8 @@ export type ContentBlock =
| { type: 'text'; text: string }
| { type: 'tool_use'; id: string; name: string; input: Record<string, unknown> }
| { type: 'tool_result'; tool_use_id: string; content: string; is_error?: boolean }
| { type: 'image'; source: { type: 'base64'; media_type: string; data: string } };
| { type: 'image'; source: { type: 'base64'; media_type: string; data: string } }
| { type: 'document'; source: { type: 'base64'; media_type: string; data: string }; title?: string };
/** Tool Definition for Claude API */
export interface ToolDefinition {