fix(agents): preserve image content blocks in context injection — fixes 209K token overflow

injectIntoMessages() was JSON.stringify-ing array content (with image blocks),
turning base64 data into text tokens (~170K) instead of image tokens (~1,600).
Fix: append context as a new text block in the array, preserving image block format.

Also fixes token estimation to count images at ~1,600 tokens instead of base64 char length,
and adds debug logging for API call token composition.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
hailin 2026-02-07 08:51:24 -08:00
parent 7dc364d9b3
commit fa835e4f56
3 changed files with 83 additions and 13 deletions

View File

@ -144,9 +144,32 @@ export async function* agentLoop(
}
// ---- Call Claude API (Streaming, with retry for rate limits) ----
logger.debug(
`[Turn ${currentTurn + 1}/${maxTurns}] Calling Claude API with ${messages.length} messages`,
);
// Debug: summarize what we are about to send so token blow-ups are diagnosable.
// Counts raw characters per content kind; images/documents are tallied separately
// because the API prices them by pixels/pages, not by their base64 text length.
{
  let totalTextChars = 0;
  let imageBlocks = 0;
  let documentBlocks = 0;
  for (const msg of messages) {
    const { content } = msg;
    if (typeof content === 'string') {
      totalTextChars += content.length;
      continue;
    }
    if (!Array.isArray(content)) continue;
    for (const block of content as any[]) {
      switch (block.type) {
        case 'text':
          totalTextChars += (block.text || '').length;
          break;
        case 'image':
          imageBlocks++;
          break;
        case 'document':
          documentBlocks++;
          break;
        case 'tool_result':
          totalTextChars += String(block.content || '').length;
          break;
      }
    }
  }
  // System prompt may be an array of text blocks; sum their lengths when so.
  const systemChars = Array.isArray(systemPrompt)
    ? systemPrompt.reduce((sum: number, b: any) => sum + (b.text?.length || 0), 0)
    : 0;
  // NOTE: /3 chars-per-token is a rough heuristic (tuned for Chinese text per
  // the sibling estimator) — this is a log line, not an accurate count.
  logger.log(
    `[Turn ${currentTurn + 1}/${maxTurns}] API call: ${messages.length} msgs, ` +
      `~${Math.round(totalTextChars / 3)} text tokens, ${imageBlocks} images (~${imageBlocks * 1600} tokens), ` +
      `${documentBlocks} docs, system prompt ~${Math.round(systemChars / 3)} tokens`,
  );
}
let stream!: ReturnType<typeof anthropicClient.messages.stream>;
const MAX_RETRIES = 2;

View File

@ -428,14 +428,26 @@ export class ContextInjectorService {
if (lastUserIndex >= 0) {
const lastUser = result[lastUserIndex];
const originalContent = typeof lastUser.content === 'string'
? lastUser.content
: JSON.stringify(lastUser.content);
const contextSuffix = `\n\n<system-context>\n${injectionText}\n</system-context>`;
result[lastUserIndex] = {
...lastUser,
content: `${originalContent}\n\n<system-context>\n${injectionText}\n</system-context>`,
};
if (typeof lastUser.content === 'string') {
// Simple string content — append as text
result[lastUserIndex] = {
...lastUser,
content: lastUser.content + contextSuffix,
};
} else if (Array.isArray(lastUser.content)) {
// Array content (has image/document blocks) — append as additional text block
// IMPORTANT: Do NOT JSON.stringify array content, as this would turn
// image base64 data into text tokens (~170K) instead of image tokens (~1,600)
result[lastUserIndex] = {
...lastUser,
content: [
...lastUser.content,
{ type: 'text' as const, text: contextSuffix },
],
};
}
}
return result;
@ -457,8 +469,9 @@ export class ContextInjectorService {
return { messages, wasCompacted: false };
}
// Rough token estimation: ~3 chars per token for Chinese
const estimatedTokens = JSON.stringify(messages).length / 3;
// Token estimation: strip base64 image/document data before measuring
// (images are counted by pixels, not by base64 text size)
const estimatedTokens = this.estimateMessageTokens(messages);
if (estimatedTokens < this.config.compactionThreshold) {
return { messages, wasCompacted: false };
@ -518,6 +531,39 @@ export class ContextInjectorService {
}
}
// ============================================================
// Token Estimation
// ============================================================
/**
 * Rough token estimate for a message list that deliberately ignores base64
 * payload size.
 *
 * Text-like content is estimated at ~3 characters per token; each image is
 * charged a flat ~1,600 tokens (the API bills images by pixel dimensions,
 * not by the length of their base64 encoding) and each document a flat
 * ~3,000 tokens. Tool-use blocks are estimated from their serialized input
 * plus a small fixed overhead.
 *
 * @param messages conversation messages to measure
 * @returns approximate total token count
 */
private estimateMessageTokens(messages: ClaudeMessage[]): number {
  // ~3 chars/token heuristic shared by all text-like content below.
  const textTokens = (s: string): number => Math.ceil(s.length / 3);
  let total = 0;
  for (const msg of messages) {
    const { content } = msg;
    if (typeof content === 'string') {
      total += textTokens(content);
      continue;
    }
    if (!Array.isArray(content)) continue;
    for (const block of content) {
      switch (block.type) {
        case 'text':
          if ('text' in block) total += textTokens(block.text);
          break;
        case 'image':
          total += 1600; // ~1,600 tokens per image (Claude API counts by pixels)
          break;
        case 'document':
          total += 3000; // Approximate for PDF documents
          break;
        case 'tool_result':
          if ('content' in block) total += textTokens(String(block.content));
          break;
        case 'tool_use':
          // Serialized input plus a fixed allowance for the tool name/ids.
          total += textTokens(JSON.stringify(block.input || {})) + 50;
          break;
      }
    }
  }
  return total;
}
// ============================================================
// Cache Helpers
// ============================================================

View File

@ -302,7 +302,8 @@ export type ContentBlock =
| { type: 'text'; text: string }
| { type: 'tool_use'; id: string; name: string; input: Record<string, unknown> }
| { type: 'tool_result'; tool_use_id: string; content: string; is_error?: boolean }
| { type: 'image'; source: { type: 'base64'; media_type: string; data: string } };
| { type: 'image'; source: { type: 'base64'; media_type: string; data: string } }
| { type: 'document'; source: { type: 'base64'; media_type: string; data: string }; title?: string };
/** Tool Definition for Claude API */
export interface ToolDefinition {