Claude Code internals: AutoDream, Buddy companion, prompt cache economics, microcompact

12 min read Original article ↗
import { feature } from 'bun:bundle' import type { ToolResultBlockParam } from '@anthropic-ai/sdk/resources/index.mjs' import type { QuerySource } from '../../constants/querySource.js' import type { ToolUseContext } from '../../Tool.js' import { FILE_EDIT_TOOL_NAME } from '../../tools/FileEditTool/constants.js' import { FILE_READ_TOOL_NAME } from '../../tools/FileReadTool/prompt.js' import { FILE_WRITE_TOOL_NAME } from '../../tools/FileWriteTool/prompt.js' import { GLOB_TOOL_NAME } from '../../tools/GlobTool/prompt.js' import { GREP_TOOL_NAME } from '../../tools/GrepTool/prompt.js' import { WEB_FETCH_TOOL_NAME } from '../../tools/WebFetchTool/prompt.js' import { WEB_SEARCH_TOOL_NAME } from '../../tools/WebSearchTool/prompt.js' import type { Message } from '../../types/message.js' import { logForDebugging } from '../../utils/debug.js' import { getMainLoopModel } from '../../utils/model/model.js' import { SHELL_TOOL_NAMES } from '../../utils/shell/shellToolUtils.js' import { jsonStringify } from '../../utils/slowOperations.js' import { type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS, logEvent, } from '../analytics/index.js' import { notifyCacheDeletion } from '../api/promptCacheBreakDetection.js' import { roughTokenCountEstimation } from '../tokenEstimation.js' import { clearCompactWarningSuppression, suppressCompactWarning, } from './compactWarningState.js' import { getTimeBasedMCConfig, type TimeBasedMCConfig, } from './timeBasedMCConfig.js' // Inline from utils/toolResultStorage.ts — importing that file pulls in // sessionStorage → utils/messages → services/api/errors, completing a // circular-deps loop back through this file via promptCacheBreakDetection. // Drift is caught by a test asserting equality with the source-of-truth. 
export const TIME_BASED_MC_CLEARED_MESSAGE = '[Old tool result content cleared]'

// Flat token cost charged per image/document block in the estimates below.
const IMAGE_MAX_TOKEN_SIZE = 2000

// Only compact these tools
const COMPACTABLE_TOOLS = new Set<string>([
  FILE_READ_TOOL_NAME,
  ...SHELL_TOOL_NAMES,
  GREP_TOOL_NAME,
  GLOB_TOOL_NAME,
  WEB_SEARCH_TOOL_NAME,
  WEB_FETCH_TOOL_NAME,
  FILE_EDIT_TOOL_NAME,
  FILE_WRITE_TOOL_NAME,
])

// --- Cached microcompact state (ant-only, gated by feature('CACHED_MICROCOMPACT')) ---

// Lazy-initialized cached MC module and state to avoid importing in external builds.
// The imports and state live inside feature() checks for dead code elimination.
let cachedMCModule: typeof import('./cachedMicrocompact.js') | null = null
let cachedMCState: import('./cachedMicrocompact.js').CachedMCState | null = null
let pendingCacheEdits:
  | import('./cachedMicrocompact.js').CacheEditsBlock
  | null = null

/**
 * Dynamically import the cached-microcompact module on first use and memoize
 * it in `cachedMCModule` so later calls reuse the same module object.
 */
async function getCachedMCModule(): Promise<
  typeof import('./cachedMicrocompact.js')
> {
  if (!cachedMCModule) {
    cachedMCModule = await import('./cachedMicrocompact.js')
  }
  return cachedMCModule
}

/**
 * Return the module-level cached MC state, creating it on first access.
 *
 * @throws Error when the state does not exist and the module has not been
 *   loaded yet (i.e. getCachedMCModule() was never awaited).
 */
function ensureCachedMCState(): import('./cachedMicrocompact.js').CachedMCState {
  if (!cachedMCState && cachedMCModule) {
    cachedMCState = cachedMCModule.createCachedMCState()
  }
  if (!cachedMCState) {
    throw new Error(
      'cachedMCState not initialized — getCachedMCModule() must be called first',
    )
  }
  return cachedMCState
}

/**
 * Get new pending cache edits to be included in the next API request.
 * Returns null if there are no new pending edits.
 * Clears the pending state (caller must pin them after insertion).
 */
export function consumePendingCacheEdits():
  | import('./cachedMicrocompact.js').CacheEditsBlock
  | null {
  const edits = pendingCacheEdits
  pendingCacheEdits = null
  return edits
}

/**
 * Get all previously-pinned cache edits that must be re-sent at their
 * original positions for cache hits.
 *
 * Returns an empty array when cached MC state has not been created. Otherwise
 * the state's own `pinnedEdits` array is returned (not a copy).
 */
export function getPinnedCacheEdits(): import('./cachedMicrocompact.js').PinnedCacheEdits[] {
  if (!cachedMCState) {
    return []
  }
  return cachedMCState.pinnedEdits
}

/**
 * Pin a new cache_edits block to a specific user message position.
 * Called after inserting new edits so they are re-sent in subsequent calls.
 * No-op when cached MC state has not been created.
 */
export function pinCacheEdits(
  userMessageIndex: number,
  block: import('./cachedMicrocompact.js').CacheEditsBlock,
): void {
  if (cachedMCState) {
    cachedMCState.pinnedEdits.push({ userMessageIndex, block })
  }
}

/**
 * Marks all registered tools as sent to the API.
 * Called after a successful API response.
 * No-op when the cached MC module or state has not been initialized.
 */
export function markToolsSentToAPIState(): void {
  if (cachedMCState && cachedMCModule) {
    cachedMCModule.markToolsSentToAPI(cachedMCState)
  }
}

/**
 * Reset cached-microcompact bookkeeping: delegates to the module's
 * resetCachedMCState when both module and state exist, and always drops any
 * queued (not yet consumed) pending cache edits.
 */
export function resetMicrocompactState(): void {
  if (cachedMCState && cachedMCModule) {
    cachedMCModule.resetCachedMCState(cachedMCState)
  }
  pendingCacheEdits = null
}

// Helper to calculate tool result tokens
function calculateToolResultTokens(block: ToolResultBlockParam): number {
  if (!block.content) {
    return 0
  }

  if (typeof block.content === 'string') {
    return roughTokenCountEstimation(block.content)
  }

  // Array of TextBlockParam | ImageBlockParam | DocumentBlockParam
  return block.content.reduce((sum, item) => {
    if (item.type === 'text') {
      return sum + roughTokenCountEstimation(item.text)
    } else if (item.type === 'image' || item.type === 'document') {
      // Images/documents are approximately 2000 tokens regardless of format
      return sum + IMAGE_MAX_TOKEN_SIZE
    }
    return sum
  }, 0)
}

/**
 * Estimate token count for messages by extracting text content
 * Used for rough token estimation when we don't have accurate API counts
 * Pads estimate by 4/3 to be conservative since we're approximating
 *
 * Only 'user' and 'assistant' messages are counted. A message whose content
 * is a plain string is counted as text; array content is counted block by
 * block.
 *
 * @param messages Conversation messages to estimate.
 * @returns Padded (x 4/3, rounded up) rough token estimate.
 */
export function estimateMessageTokens(messages: Message[]): number {
  let totalTokens = 0

  for (const message of messages) {
    if (message.type !== 'user' && message.type !== 'assistant') {
      continue
    }

    const content = message.message.content

    // Plain-string content is ordinary text. Skipping it would make such
    // messages count as zero tokens and bias the estimate low, which defeats
    // the conservative padding applied below.
    if (typeof content === 'string') {
      totalTokens += roughTokenCountEstimation(content)
      continue
    }

    if (!Array.isArray(content)) {
      continue
    }

    for (const block of content) {
      if (block.type === 'text') {
        totalTokens += roughTokenCountEstimation(block.text)
      } else if (block.type === 'tool_result') {
        totalTokens += calculateToolResultTokens(block)
      } else if (block.type === 'image' || block.type === 'document') {
        totalTokens += IMAGE_MAX_TOKEN_SIZE
      } else if (block.type === 'thinking') {
        // Match roughTokenCountEstimationForBlock: count only the thinking
        // text, not the JSON wrapper or signature (signature is metadata,
        // not model-tokenized content).
        totalTokens += roughTokenCountEstimation(block.thinking)
      } else if (block.type === 'redacted_thinking') {
        totalTokens += roughTokenCountEstimation(block.data)
      } else if (block.type === 'tool_use') {
        // Match roughTokenCountEstimationForBlock: count name + input,
        // not the JSON wrapper or id field.
        totalTokens += roughTokenCountEstimation(
          block.name + jsonStringify(block.input ?? {}),
        )
      } else {
        // server_tool_use, web_search_tool_result, etc.
        totalTokens += roughTokenCountEstimation(jsonStringify(block))
      }
    }
  }

  // Pad estimate by 4/3 to be conservative since we're approximating
  return Math.ceil(totalTokens * (4 / 3))
}

export type PendingCacheEdits = {
  trigger: 'auto'
  deletedToolIds: string[]
  // Baseline cumulative cache_deleted_input_tokens from the previous API response,
  // used to compute the per-operation delta (the API value is sticky/cumulative)
  baselineCacheDeletedTokens: number
}

export type MicrocompactResult = {
  messages: Message[]
  compactionInfo?: {
    pendingCacheEdits?: PendingCacheEdits
  }
}

/**
 * Walk messages and collect tool_use IDs whose tool name is in
 * COMPACTABLE_TOOLS, in encounter order. Shared by both microcompact paths.
 */
function collectCompactableToolIds(messages: Message[]): string[] {
  const ids: string[] = []
  for (const message of messages) {
    if (
      message.type === 'assistant' &&
      Array.isArray(message.message.content)
    ) {
      for (const block of message.message.content) {
        if (block.type === 'tool_use' && COMPACTABLE_TOOLS.has(block.name)) {
          ids.push(block.id)
        }
      }
    }
  }
  return ids
}

/**
 * True when the query source is missing or starts with 'repl_main_thread'.
 *
 * Prefix-match because promptCategory.ts sets the querySource to
 * 'repl_main_thread:outputStyle:<style>' when a non-default output style
 * is active. The bare 'repl_main_thread' is only used for the default style.
 * query.ts:350/1451 use the same startsWith pattern; the pre-existing
 * cached-MC `=== 'repl_main_thread'` check was a latent bug — users with a
 * non-default output style were silently excluded from cached MC.
 */
function isMainThreadSource(querySource: QuerySource | undefined): boolean {
  return !querySource || querySource.startsWith('repl_main_thread')
}

/**
 * Entry point for microcompaction before a request.
 *
 * Order of evaluation:
 *  1. Time-based microcompact — if it fires, its result is returned directly.
 *  2. Cached microcompact — only behind feature('CACHED_MICROCOMPACT'), when
 *     the module reports it enabled, the model supports cache editing, and
 *     the source is the main thread.
 *  3. Otherwise the messages are returned unchanged.
 *
 * @param messages Conversation so far.
 * @param toolUseContext Optional; supplies options.mainLoopModel, falling
 *   back to getMainLoopModel() when absent.
 * @param querySource Optional origin of the query.
 */
export async function microcompactMessages(
  messages: Message[],
  toolUseContext?: ToolUseContext,
  querySource?: QuerySource,
): Promise<MicrocompactResult> {
  // Clear suppression flag at start of new microcompact attempt
  clearCompactWarningSuppression()

  // Time-based trigger runs first and short-circuits. If the gap since the
  // last assistant message exceeds the threshold, the server cache has expired
  // and the full prefix will be rewritten regardless — so content-clear old
  // tool results now, before the request, to shrink what gets rewritten.
  // Cached MC (cache-editing) is skipped when this fires: editing assumes a
  // warm cache, and we just established it's cold.
  const timeBasedResult = maybeTimeBasedMicrocompact(messages, querySource)
  if (timeBasedResult) {
    return timeBasedResult
  }

  // Only run cached MC for the main thread to prevent forked agents
  // (session_memory, prompt_suggestion, etc.) from registering their
  // tool_results in the global cachedMCState, which would cause the main
  // thread to try deleting tools that don't exist in its own conversation.
  if (feature('CACHED_MICROCOMPACT')) {
    const mod = await getCachedMCModule()
    const model = toolUseContext?.options.mainLoopModel ?? getMainLoopModel()
    if (
      mod.isCachedMicrocompactEnabled() &&
      mod.isModelSupportedForCacheEditing(model) &&
      isMainThreadSource(querySource)
    ) {
      return await cachedMicrocompactPath(messages, querySource)
    }
  }

  // Legacy microcompact path removed — tengu_cache_plum_violet is always true.
  // For contexts where cached microcompact is not available (external builds,
  // non-ant users, unsupported models, sub-agents), no compaction happens here;
  // autocompact handles context pressure instead.
  return { messages }
}

/**
 * Cached microcompact path - uses cache editing API to remove tool results
 * without invalidating the cached prefix.
 *
 * Key differences from regular microcompact:
 * - Does NOT modify local message content (cache_reference and cache_edits are added at API layer)
 * - Uses count-based trigger/keep thresholds from GrowthBook config
 * - Takes precedence over regular microcompact (no disk persistence)
 * - Tracks tool results and queues cache edits for the API layer
 */
async function cachedMicrocompactPath(
  messages: Message[],
  querySource: QuerySource | undefined,
): Promise<MicrocompactResult> {
  const mod = await getCachedMCModule()
  const state = ensureCachedMCState()
  const config = mod.getCachedMCConfig()

  const compactableToolIds = new Set(collectCompactableToolIds(messages))
  // Second pass: register tool results grouped by user message
  for (const message of messages) {
    if (message.type === 'user' && Array.isArray(message.message.content)) {
      const groupIds: string[] = []
      for (const block of message.message.content) {
        if (
          block.type === 'tool_result' &&
          compactableToolIds.has(block.tool_use_id) &&
          !state.registeredTools.has(block.tool_use_id)
        ) {
          mod.registerToolResult(state, block.tool_use_id)
          groupIds.push(block.tool_use_id)
        }
      }
      mod.registerToolMessage(state, groupIds)
    }
  }

  const toolsToDelete = mod.getToolResultsToDelete(state)

  if (toolsToDelete.length > 0) {
    // Create and queue the cache_edits block for the API layer
    const cacheEdits = mod.createCacheEditsBlock(state, toolsToDelete)
    if (cacheEdits) {
      pendingCacheEdits = cacheEdits
    }

    logForDebugging(
      `Cached MC deleting ${toolsToDelete.length} tool(s): ${toolsToDelete.join(', ')}`,
    )

    // Log the event
    logEvent('tengu_cached_microcompact', {
      toolsDeleted: toolsToDelete.length,
      deletedToolIds: toolsToDelete.join(
        ',',
      ) as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      activeToolCount: state.toolOrder.length - state.deletedRefs.size,
      triggerType:
        'auto' as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      threshold: config.triggerThreshold,
      keepRecent: config.keepRecent,
    })

    // Suppress warning after successful compaction
    suppressCompactWarning()

    // Notify cache break detection that cache reads will legitimately drop
    if (feature('PROMPT_CACHE_BREAK_DETECTION')) {
      // Pass the actual querySource — isMainThreadSource now prefix-matches
      // so output-style variants enter here, and getTrackingKey keys on the
      // full source string, not the 'repl_main_thread' prefix.
      notifyCacheDeletion(querySource ?? 'repl_main_thread')
    }

    // Return messages unchanged - cache_reference and cache_edits are added at API layer
    // Boundary message is deferred until after API response so we can use
    // actual cache_deleted_input_tokens from the API instead of client-side estimates
    // Capture the baseline cumulative cache_deleted_input_tokens from the last
    // assistant message so we can compute a per-operation delta after the API call
    const lastAsst = messages.findLast(m => m.type === 'assistant')
    const baseline =
      lastAsst?.type === 'assistant'
        ? ((
            lastAsst.message.usage as unknown as Record<
              string,
              number | undefined
            >
          )?.cache_deleted_input_tokens ?? 0)
        : 0

    return {
      messages,
      compactionInfo: {
        pendingCacheEdits: {
          trigger: 'auto',
          deletedToolIds: toolsToDelete,
          baselineCacheDeletedTokens: baseline,
        },
      },
    }
  }

  // No compaction needed, return messages unchanged
  return { messages }
}

/**
 * Check whether the time-based trigger should fire for this request.
 *
 * Returns the measured gap (minutes since last assistant message) when the
 * trigger fires, or null when it doesn't (disabled, wrong source, under
 * threshold, no prior assistant, unparseable timestamp).
 *
 * Extracted so other pre-request paths (e.g. snip force-apply) can consult
 * the same predicate without coupling to the tool-result clearing action.
 */
export function evaluateTimeBasedTrigger(
  messages: Message[],
  querySource: QuerySource | undefined,
): { gapMinutes: number; config: TimeBasedMCConfig } | null {
  const config = getTimeBasedMCConfig()
  // Require an explicit main-thread querySource. isMainThreadSource treats
  // undefined as main-thread (for cached-MC backward-compat), but several
  // callers (/context, /compact, analyzeContext) invoke microcompactMessages
  // without a source for analysis-only purposes — they should not trigger.
  if (!config.enabled || !querySource || !isMainThreadSource(querySource)) {
    return null
  }
  const lastAssistant = messages.findLast(m => m.type === 'assistant')
  if (!lastAssistant) {
    return null
  }
  const gapMinutes =
    (Date.now() - new Date(lastAssistant.timestamp).getTime()) / 60_000
  // A NaN gap (unparseable timestamp) fails the isFinite check and is treated
  // as "do not fire".
  if (!Number.isFinite(gapMinutes) || gapMinutes < config.gapThresholdMinutes) {
    return null
  }
  return { gapMinutes, config }
}

/**
 * Time-based microcompact: when the gap since the last main-loop assistant
 * message exceeds the configured threshold, content-clear all but the most
 * recent N compactable tool results.
 *
 * Returns null when the trigger doesn't fire (disabled, wrong source, gap
 * under threshold, nothing to clear) — caller falls through to other paths.
 *
 * Unlike cached MC, this mutates message content directly. The cache is cold,
 * so there's no cached prefix to preserve via cache_edits.
 *
 * The input array and its message objects are not modified in place: touched
 * messages are replaced by shallow copies carrying the cleared content.
 */
function maybeTimeBasedMicrocompact(
  messages: Message[],
  querySource: QuerySource | undefined,
): MicrocompactResult | null {
  const trigger = evaluateTimeBasedTrigger(messages, querySource)
  if (!trigger) {
    return null
  }
  const { gapMinutes, config } = trigger

  const compactableIds = collectCompactableToolIds(messages)

  // Floor at 1: slice(-0) returns the full array (paradoxically keeps
  // everything), and clearing ALL results leaves the model with zero working
  // context. Neither degenerate is sensible — always keep at least the last.
  const keepRecent = Math.max(1, config.keepRecent)
  const keepSet = new Set(compactableIds.slice(-keepRecent))
  const clearSet = new Set(compactableIds.filter(id => !keepSet.has(id)))

  if (clearSet.size === 0) {
    return null
  }

  let tokensSaved = 0
  const result: Message[] = messages.map(message => {
    if (message.type !== 'user' || !Array.isArray(message.message.content)) {
      return message
    }
    let touched = false
    const newContent = message.message.content.map(block => {
      if (
        block.type === 'tool_result' &&
        clearSet.has(block.tool_use_id) &&
        block.content !== TIME_BASED_MC_CLEARED_MESSAGE
      ) {
        tokensSaved += calculateToolResultTokens(block)
        touched = true
        return { ...block, content: TIME_BASED_MC_CLEARED_MESSAGE }
      }
      return block
    })
    if (!touched) return message
    return {
      ...message,
      message: { ...message.message, content: newContent },
    }
  })

  // Nothing was actually reclaimed (e.g. everything was already cleared on an
  // earlier pass), so report "did not fire" and let the caller fall through.
  if (tokensSaved === 0) {
    return null
  }

  logEvent('tengu_time_based_microcompact', {
    gapMinutes: Math.round(gapMinutes),
    gapThresholdMinutes: config.gapThresholdMinutes,
    toolsCleared: clearSet.size,
    toolsKept: keepSet.size,
    keepRecent: config.keepRecent,
    tokensSaved,
  })

  logForDebugging(
    `[TIME-BASED MC] gap ${Math.round(gapMinutes)}min > ${config.gapThresholdMinutes}min, cleared ${clearSet.size} tool results (~${tokensSaved} tokens), kept last ${keepSet.size}`,
  )

  suppressCompactWarning()
  // Cached-MC state (module-level) holds tool IDs registered on prior turns.
  // We just content-cleared some of those tools AND invalidated the server
  // cache by changing prompt content. If cached-MC runs next turn with the
  // stale state, it would try to cache_edit tools whose server-side entries
  // no longer exist. Reset it.
  resetMicrocompactState()
  // We just changed the prompt content — the next response's cache read will
  // be low, but that's us, not a break. Tell the detector to expect a drop.
  // notifyCacheDeletion (not notifyCompaction) because it's already imported
  // here and achieves the same false-positive suppression — adding the second
  // symbol to the import was flagged by the circular-deps check.
  // Pass the actual querySource: getTrackingKey returns the full source string
  // (e.g. 'repl_main_thread:outputStyle:custom'), not just the prefix.
  if (feature('PROMPT_CACHE_BREAK_DETECTION') && querySource) {
    notifyCacheDeletion(querySource)
  }

  return { messages: result }
}