diff --git a/packages/services/conversation-service/src/infrastructure/agents/coordinator/agent-loop.ts b/packages/services/conversation-service/src/infrastructure/agents/coordinator/agent-loop.ts index 7add432..e91091d 100644 --- a/packages/services/conversation-service/src/infrastructure/agents/coordinator/agent-loop.ts +++ b/packages/services/conversation-service/src/infrastructure/agents/coordinator/agent-loop.ts @@ -144,9 +144,32 @@ export async function* agentLoop( } // ---- Call Claude API (Streaming, with retry for rate limits) ---- - logger.debug( - `[Turn ${currentTurn + 1}/${maxTurns}] Calling Claude API with ${messages.length} messages`, - ); + // Debug: log message composition to diagnose token issues + { + let textChars = 0; + let imageCount = 0; + let docCount = 0; + for (const msg of messages) { + if (typeof msg.content === 'string') { + textChars += msg.content.length; + } else if (Array.isArray(msg.content)) { + for (const block of msg.content as any[]) { + if (block.type === 'text') textChars += (block.text || '').length; + else if (block.type === 'image') imageCount++; + else if (block.type === 'document') docCount++; + else if (block.type === 'tool_result') textChars += String(block.content || '').length; + } + } + } + const systemChars = Array.isArray(systemPrompt) + ? 
systemPrompt.reduce((sum: number, b: any) => sum + (b.text?.length || 0), 0) + : 0; + logger.log( + `[Turn ${currentTurn + 1}/${maxTurns}] API call: ${messages.length} msgs, ` + + `~${Math.round(textChars / 3)} text tokens, ${imageCount} images (~${imageCount * 1600} tokens), ` + + `${docCount} docs, system prompt ~${Math.round(systemChars / 3)} tokens`, + ); + } let stream!: ReturnType; const MAX_RETRIES = 2; diff --git a/packages/services/conversation-service/src/infrastructure/agents/coordinator/context-injector.service.ts b/packages/services/conversation-service/src/infrastructure/agents/coordinator/context-injector.service.ts index 35935fc..c1cc057 100644 --- a/packages/services/conversation-service/src/infrastructure/agents/coordinator/context-injector.service.ts +++ b/packages/services/conversation-service/src/infrastructure/agents/coordinator/context-injector.service.ts @@ -428,14 +428,26 @@ export class ContextInjectorService { if (lastUserIndex >= 0) { const lastUser = result[lastUserIndex]; - const originalContent = typeof lastUser.content === 'string' - ? 
lastUser.content - : JSON.stringify(lastUser.content); + const contextSuffix = `\n\n\n${injectionText}\n`; - result[lastUserIndex] = { - ...lastUser, - content: `${originalContent}\n\n\n${injectionText}\n`, - }; + if (typeof lastUser.content === 'string') { + // Simple string content — append as text + result[lastUserIndex] = { + ...lastUser, + content: lastUser.content + contextSuffix, + }; + } else if (Array.isArray(lastUser.content)) { + // Array content (has image/document blocks) — append as additional text block + // IMPORTANT: Do NOT JSON.stringify array content, as this would turn + // image base64 data into text tokens (~170K) instead of image tokens (~1,600) + result[lastUserIndex] = { + ...lastUser, + content: [ + ...lastUser.content, + { type: 'text' as const, text: contextSuffix }, + ], + }; + } } return result; @@ -457,8 +469,9 @@ export class ContextInjectorService { return { messages, wasCompacted: false }; } - // Rough token estimation: ~3 chars per token for Chinese - const estimatedTokens = JSON.stringify(messages).length / 3; + // Token estimation: strip base64 image/document data before measuring + // (images are counted by pixels, not by base64 text size) + const estimatedTokens = this.estimateMessageTokens(messages); if (estimatedTokens < this.config.compactionThreshold) { return { messages, wasCompacted: false }; @@ -518,6 +531,39 @@ export class ContextInjectorService { } } + // ============================================================ + // Token Estimation + // ============================================================ + + /** + * Estimate token count for messages, excluding base64 binary data. + * Images are counted as ~1,600 tokens each (by pixels, not base64 size); documents as ~3,000 tokens each. + * Text content is estimated at ~3 chars/token. 
+ */ + private estimateMessageTokens(messages: ClaudeMessage[]): number { + let tokens = 0; + for (const msg of messages) { + if (typeof msg.content === 'string') { + tokens += Math.ceil(msg.content.length / 3); + } else if (Array.isArray(msg.content)) { + for (const block of msg.content) { + if (block.type === 'text' && 'text' in block) { + tokens += Math.ceil(block.text.length / 3); + } else if (block.type === 'image') { + tokens += 1600; // ~1,600 tokens per image (Claude API counts by pixels) + } else if (block.type === 'document') { + tokens += 3000; // Approximate for PDF documents + } else if (block.type === 'tool_result' && 'content' in block) { + tokens += Math.ceil(String(block.content).length / 3); + } else if (block.type === 'tool_use') { + tokens += Math.ceil(JSON.stringify(block.input || {}).length / 3) + 50; + } + } + } + } + return tokens; + } + // ============================================================ // Cache Helpers // ============================================================ diff --git a/packages/services/conversation-service/src/infrastructure/agents/types/agent.types.ts b/packages/services/conversation-service/src/infrastructure/agents/types/agent.types.ts index 602e8fc..f50c919 100644 --- a/packages/services/conversation-service/src/infrastructure/agents/types/agent.types.ts +++ b/packages/services/conversation-service/src/infrastructure/agents/types/agent.types.ts @@ -302,7 +302,8 @@ export type ContentBlock = | { type: 'text'; text: string } | { type: 'tool_use'; id: string; name: string; input: Record } | { type: 'tool_result'; tool_use_id: string; content: string; is_error?: boolean } - | { type: 'image'; source: { type: 'base64'; media_type: string; data: string } }; + | { type: 'image'; source: { type: 'base64'; media_type: string; data: string } } + | { type: 'document'; source: { type: 'base64'; media_type: string; data: string }; title?: string }; /** Tool Definition for Claude API */ export interface ToolDefinition {