fix(agents): preserve image content blocks in context injection — fixes 209K token overflow
injectIntoMessages() was calling JSON.stringify on array content (including image blocks), which turned base64 image data into text tokens (~170K) instead of image tokens (~1,600). Fix: append the context as a new text block in the array, preserving the image block format. Also fixes token estimation to count each image at ~1,600 tokens (and each document at a flat ~3,000) instead of by base64 character length, and adds debug logging of the token composition of each API call. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
7dc364d9b3
commit
fa835e4f56
|
|
@ -144,9 +144,32 @@ export async function* agentLoop(
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- Call Claude API (Streaming, with retry for rate limits) ----
|
// ---- Call Claude API (Streaming, with retry for rate limits) ----
|
||||||
logger.debug(
|
// Debug: log message composition to diagnose token issues
|
||||||
`[Turn ${currentTurn + 1}/${maxTurns}] Calling Claude API with ${messages.length} messages`,
|
{
|
||||||
);
|
let textChars = 0;
|
||||||
|
let imageCount = 0;
|
||||||
|
let docCount = 0;
|
||||||
|
for (const msg of messages) {
|
||||||
|
if (typeof msg.content === 'string') {
|
||||||
|
textChars += msg.content.length;
|
||||||
|
} else if (Array.isArray(msg.content)) {
|
||||||
|
for (const block of msg.content as any[]) {
|
||||||
|
if (block.type === 'text') textChars += (block.text || '').length;
|
||||||
|
else if (block.type === 'image') imageCount++;
|
||||||
|
else if (block.type === 'document') docCount++;
|
||||||
|
else if (block.type === 'tool_result') textChars += String(block.content || '').length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const systemChars = Array.isArray(systemPrompt)
|
||||||
|
? systemPrompt.reduce((sum: number, b: any) => sum + (b.text?.length || 0), 0)
|
||||||
|
: 0;
|
||||||
|
logger.log(
|
||||||
|
`[Turn ${currentTurn + 1}/${maxTurns}] API call: ${messages.length} msgs, ` +
|
||||||
|
`~${Math.round(textChars / 3)} text tokens, ${imageCount} images (~${imageCount * 1600} tokens), ` +
|
||||||
|
`${docCount} docs, system prompt ~${Math.round(systemChars / 3)} tokens`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
let stream!: ReturnType<typeof anthropicClient.messages.stream>;
|
let stream!: ReturnType<typeof anthropicClient.messages.stream>;
|
||||||
const MAX_RETRIES = 2;
|
const MAX_RETRIES = 2;
|
||||||
|
|
|
||||||
|
|
@ -428,14 +428,26 @@ export class ContextInjectorService {
|
||||||
|
|
||||||
if (lastUserIndex >= 0) {
|
if (lastUserIndex >= 0) {
|
||||||
const lastUser = result[lastUserIndex];
|
const lastUser = result[lastUserIndex];
|
||||||
const originalContent = typeof lastUser.content === 'string'
|
const contextSuffix = `\n\n<system-context>\n${injectionText}\n</system-context>`;
|
||||||
? lastUser.content
|
|
||||||
: JSON.stringify(lastUser.content);
|
|
||||||
|
|
||||||
result[lastUserIndex] = {
|
if (typeof lastUser.content === 'string') {
|
||||||
...lastUser,
|
// Simple string content — append as text
|
||||||
content: `${originalContent}\n\n<system-context>\n${injectionText}\n</system-context>`,
|
result[lastUserIndex] = {
|
||||||
};
|
...lastUser,
|
||||||
|
content: lastUser.content + contextSuffix,
|
||||||
|
};
|
||||||
|
} else if (Array.isArray(lastUser.content)) {
|
||||||
|
// Array content (has image/document blocks) — append as additional text block
|
||||||
|
// IMPORTANT: Do NOT JSON.stringify array content, as this would turn
|
||||||
|
// image base64 data into text tokens (~170K) instead of image tokens (~1,600)
|
||||||
|
result[lastUserIndex] = {
|
||||||
|
...lastUser,
|
||||||
|
content: [
|
||||||
|
...lastUser.content,
|
||||||
|
{ type: 'text' as const, text: contextSuffix },
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
|
|
@ -457,8 +469,9 @@ export class ContextInjectorService {
|
||||||
return { messages, wasCompacted: false };
|
return { messages, wasCompacted: false };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rough token estimation: ~3 chars per token for Chinese
|
// Token estimation: strip base64 image/document data before measuring
|
||||||
const estimatedTokens = JSON.stringify(messages).length / 3;
|
// (images are counted by pixels, not by base64 text size)
|
||||||
|
const estimatedTokens = this.estimateMessageTokens(messages);
|
||||||
|
|
||||||
if (estimatedTokens < this.config.compactionThreshold) {
|
if (estimatedTokens < this.config.compactionThreshold) {
|
||||||
return { messages, wasCompacted: false };
|
return { messages, wasCompacted: false };
|
||||||
|
|
@ -518,6 +531,39 @@ export class ContextInjectorService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// Token Estimation
|
||||||
|
// ============================================================
|
||||||
|
|
||||||
|
/**
 * Produce a rough token estimate for a list of messages without measuring
 * base64 image/document payloads as if they were text.
 *
 * Per-block heuristics:
 * - plain string content and `text` blocks: ceil(chars / 3)
 * - `image` blocks: a flat 1,600 tokens each
 * - `document` blocks: a flat 3,000 tokens each
 * - `tool_result` blocks: ceil(String(content).length / 3)
 * - `tool_use` blocks: ceil(JSON-serialized input length / 3) plus 50
 *
 * Blocks of any other type, and content that is neither a string nor an
 * array, contribute nothing to the estimate.
 *
 * @param messages - Messages whose content should be measured.
 * @returns The summed heuristic token estimate.
 */
private estimateMessageTokens(messages: ClaudeMessage[]): number {
  // Shared chars-to-tokens conversion (~3 characters per token, rounded up).
  const textCost = (text: string): number => Math.ceil(text.length / 3);

  let total = 0;

  for (const message of messages) {
    const content = message.content;

    if (typeof content === 'string') {
      total += textCost(content);
      continue;
    }

    if (!Array.isArray(content)) {
      continue;
    }

    for (const part of content) {
      switch (part.type) {
        case 'text':
          if ('text' in part) {
            total += textCost(part.text);
          }
          break;
        case 'image':
          // Flat per-image cost: the base64 payload length is deliberately ignored.
          total += 1600;
          break;
        case 'document':
          // Flat approximation for PDF documents.
          total += 3000;
          break;
        case 'tool_result':
          if ('content' in part) {
            total += textCost(String(part.content));
          }
          break;
        case 'tool_use':
          // Serialized input plus a fixed 50-token allowance per call.
          total += textCost(JSON.stringify(part.input || {})) + 50;
          break;
        default:
          break;
      }
    }
  }

  return total;
}
|
||||||
|
|
||||||
// ============================================================
|
// ============================================================
|
||||||
// Cache Helpers
|
// Cache Helpers
|
||||||
// ============================================================
|
// ============================================================
|
||||||
|
|
|
||||||
|
|
@ -302,7 +302,8 @@ export type ContentBlock =
|
||||||
| { type: 'text'; text: string }
|
| { type: 'text'; text: string }
|
||||||
| { type: 'tool_use'; id: string; name: string; input: Record<string, unknown> }
|
| { type: 'tool_use'; id: string; name: string; input: Record<string, unknown> }
|
||||||
| { type: 'tool_result'; tool_use_id: string; content: string; is_error?: boolean }
|
| { type: 'tool_result'; tool_use_id: string; content: string; is_error?: boolean }
|
||||||
| { type: 'image'; source: { type: 'base64'; media_type: string; data: string } };
|
| { type: 'image'; source: { type: 'base64'; media_type: string; data: string } }
|
||||||
|
| { type: 'document'; source: { type: 'base64'; media_type: string; data: string }; title?: string };
|
||||||
|
|
||||||
/** Tool Definition for Claude API */
|
/** Tool Definition for Claude API */
|
||||||
export interface ToolDefinition {
|
export interface ToolDefinition {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue