UNPKG

shipdeck

Ship MVPs in 48 hours. Fix bugs in 30 seconds. The command deck for developers who ship.

422 lines (355 loc) 13.3 kB
/**
 * Token Manager for Anthropic API
 * Handles token counting, context window management, and optimization
 */

// Token estimation based on Claude's tokenizer patterns
const TOKEN_ESTIMATION = {
  // Average characters per token for different content types
  code: 3.5,
  prose: 4.0,
  structured: 3.0,

  // Special tokens
  systemTokens: 10,   // Overhead for system messages
  messageOverhead: 5, // Overhead per message

  // Safety margins
  contextSafetyMargin: 0.1,  // Reserve 10% of context
  responseSafetyMargin: 0.2  // Reserve 20% for response
};

// Model configurations
const MODEL_LIMITS = {
  'claude-3-5-sonnet-20241022': { contextWindow: 200000, maxOutput: 8192 },
  'claude-3-5-haiku-20241022': { contextWindow: 200000, maxOutput: 4096 },
  'claude-3-opus-20240229': { contextWindow: 200000, maxOutput: 4096 },
  'claude-opus-4-1-20250805': { contextWindow: 200000, maxOutput: 8192 } // Claude 4.1 Opus
};

class TokenManager {
  constructor(model = 'claude-opus-4-1-20250805') {
    this.model = model;
    this.modelLimits = MODEL_LIMITS[model];

    if (!this.modelLimits) {
      throw new Error(`Unsupported model: ${model}. Supported models: ${Object.keys(MODEL_LIMITS).join(', ')}`);
    }

    // Cache for token estimates, plus hit/miss counters for getCacheStats()
    this.tokenCache = new Map();
    this.cacheHits = 0;
    this.cacheMisses = 0;
  }

  /**
   * Estimate tokens for text content
   */
  estimateTokens(text, contentType = 'prose') {
    if (typeof text !== 'string') {
      text = JSON.stringify(text);
    }

    // Check cache
    const cacheKey = `${contentType}:${text.length}:${text.substring(0, 100)}`;
    if (this.tokenCache.has(cacheKey)) {
      this.cacheHits++;
      return this.tokenCache.get(cacheKey);
    }
    this.cacheMisses++;

    const charsPerToken = TOKEN_ESTIMATION[contentType] || TOKEN_ESTIMATION.prose;

    // Basic estimation
    let estimate = Math.ceil(text.length / charsPerToken);

    // Adjustments for different content patterns
    if (contentType === 'code') {
      // Code has more special characters and keywords
      const specialChars = (text.match(/[{}();,.\[\]]/g) || []).length;
      const keywords = (text.match(/\b(function|class|interface|import|export|const|let|var|if|else|for|while|return)\b/g) || []).length;
      estimate += specialChars * 0.1 + keywords * 0.2;
    } else if (contentType === 'structured') {
      // JSON/YAML structures are more compact
      const brackets = (text.match(/[{}[\]]/g) || []).length;
      estimate += brackets * 0.1;
    }

    // Round up and cache
    estimate = Math.ceil(estimate);
    this.tokenCache.set(cacheKey, estimate);

    // Limit cache size (evict the oldest entry)
    if (this.tokenCache.size > 1000) {
      const firstKey = this.tokenCache.keys().next().value;
      this.tokenCache.delete(firstKey);
    }

    return estimate;
  }

  /**
   * Estimate tokens for a message
   */
  estimateMessageTokens(message) {
    let tokens = TOKEN_ESTIMATION.messageOverhead;

    if (message.role === 'system') {
      tokens += TOKEN_ESTIMATION.systemTokens;
    }

    if (typeof message.content === 'string') {
      tokens += this.estimateTokens(message.content);
    } else if (Array.isArray(message.content)) {
      // Multi-modal content
      for (const content of message.content) {
        if (content.type === 'text') {
          tokens += this.estimateTokens(content.text);
        } else if (content.type === 'image') {
          // Image tokens - rough estimate
          tokens += 1000; // Base cost for image processing
        }
      }
    } else {
      tokens += this.estimateTokens(JSON.stringify(message.content), 'structured');
    }

    return tokens;
  }

  /**
   * Estimate tokens for a conversation
   */
  estimateConversationTokens(messages) {
    return messages.reduce((total, message) => {
      return total + this.estimateMessageTokens(message);
    }, 0);
  }

  /**
   * Check if conversation fits within context window
   */
  checkContextWindow(messages, maxOutputTokens = 4096) {
    const inputTokens = this.estimateConversationTokens(messages);
    const safeContextLimit = Math.floor(this.modelLimits.contextWindow * (1 - TOKEN_ESTIMATION.contextSafetyMargin));
    const availableForOutput = safeContextLimit - inputTokens;

    return {
      inputTokens,
      maxOutputTokens: Math.min(maxOutputTokens, this.modelLimits.maxOutput),
      availableForOutput,
      fitsInContext: inputTokens + maxOutputTokens <= safeContextLimit,
      utilizationPercent: (inputTokens / this.modelLimits.contextWindow) * 100,
      recommendedMaxOutput: Math.min(
        Math.floor(availableForOutput * (1 - TOKEN_ESTIMATION.responseSafetyMargin)),
        this.modelLimits.maxOutput
      )
    };
  }

  /**
   * Optimize conversation for context window
   */
  optimizeConversation(messages, maxOutputTokens = 4096, preserveSystemMessages = true) {
    const analysis = this.checkContextWindow(messages, maxOutputTokens);

    if (analysis.fitsInContext) {
      return { optimized: false, messages, analysis };
    }

    let optimizedMessages = [...messages];

    // Step 1: Preserve system and recent messages
    // (assumes any system messages sit at the front of the array)
    const systemMessages = optimizedMessages.filter(msg => msg.role === 'system');
    const recentMessages = optimizedMessages.slice(-3); // Keep last 3 messages
    const middleMessages = optimizedMessages.slice(
      preserveSystemMessages ? systemMessages.length : 0,
      optimizedMessages.length - 3
    );

    // Step 2: Calculate target tokens (a more aggressive 80% limit when trimming)
    const safeContextLimit = Math.floor(this.modelLimits.contextWindow * 0.8);
    const targetInputTokens = safeContextLimit - maxOutputTokens;
    const currentTokens = this.estimateConversationTokens(optimizedMessages);
    const reductionNeeded = currentTokens - targetInputTokens;

    // Step 3: Compress or summarize middle messages
    let finalMiddleMessages;
    if (middleMessages.length > 10) {
      // For many messages, create a summary
      const summary = this._createMessageSummary(middleMessages);
      finalMiddleMessages = [{
        role: 'assistant',
        content: `[Previous conversation summary: ${summary}]`
      }];
    } else {
      // For fewer messages, try compression
      finalMiddleMessages = this._compressMessages(middleMessages, reductionNeeded);
    }

    // Step 4: Reconstruct conversation
    const finalMessages = [
      ...(preserveSystemMessages ? systemMessages : []),
      ...finalMiddleMessages,
      ...recentMessages
    ];

    return {
      optimized: true,
      messages: finalMessages,
      analysis: this.checkContextWindow(finalMessages, maxOutputTokens),
      originalTokens: analysis.inputTokens,
      optimizedTokens: this.estimateConversationTokens(finalMessages),
      compressionRatio: this.estimateConversationTokens(finalMessages) / analysis.inputTokens
    };
  }

  /**
   * Compress messages to fit within token budget
   */
  _compressMessages(messages, targetTokenReduction) {
    if (messages.length === 0 || targetTokenReduction <= 0) {
      return messages;
    }

    const compressed = [];
    let tokensSaved = 0;

    for (const message of messages) {
      const messageTokens = this.estimateMessageTokens(message);

      if (tokensSaved >= targetTokenReduction) {
        // We've saved enough tokens, keep remaining messages
        compressed.push(message);
      } else if (messageTokens > 1000) {
        // Compress long messages
        const compressionRatio = Math.max(0.3, 1 - (targetTokenReduction - tokensSaved) / messageTokens);
        const compressedContent = this._compressContent(message.content, compressionRatio);
        compressed.push({ ...message, content: compressedContent });
        tokensSaved += messageTokens - this.estimateTokens(compressedContent);
      } else {
        // Skip short messages to save tokens
        tokensSaved += messageTokens;
      }
    }

    // If we haven't saved enough, fall back to a single summary message
    if (tokensSaved < targetTokenReduction && messages.length > 0) {
      const summary = this._createMessageSummary(messages);
      return [{
        role: 'assistant',
        content: `[Conversation summary: ${summary}]`
      }];
    }

    return compressed;
  }

  /**
   * Compress content while preserving key information
   */
  _compressContent(content, ratio) {
    if (typeof content !== 'string') {
      content = JSON.stringify(content);
    }

    const targetLength = Math.floor(content.length * ratio);

    // Simple compression: keep beginning and end, summarize middle
    if (content.length <= targetLength) {
      return content;
    }

    const beginningLength = Math.floor(targetLength * 0.3);
    const endLength = Math.floor(targetLength * 0.3);
    const summaryLength = targetLength - beginningLength - endLength - 20; // Buffer for summary text

    const beginning = content.substring(0, beginningLength);
    const end = content.substring(content.length - endLength);

    if (summaryLength > 20) {
      const middleStart = beginningLength;
      const middleEnd = content.length - endLength;
      const middle = content.substring(middleStart, middleEnd);
      const summary = middle.length > summaryLength
        ? `...[${Math.floor(middle.length / 1000)}k chars omitted]...`
        : middle.substring(0, summaryLength) + '...';
      return `${beginning}${summary}${end}`;
    }

    return `${beginning}...${end}`;
  }

  /**
   * Create a summary of messages
   */
  _createMessageSummary(messages) {
    const keyPoints = [];

    for (const message of messages) {
      if (message.role === 'user') {
        const content = typeof message.content === 'string'
          ? message.content
          : JSON.stringify(message.content);
        if (content.length > 50) {
          keyPoints.push(`User asked: ${content.substring(0, 100)}...`);
        }
      } else if (message.role === 'assistant') {
        const content = typeof message.content === 'string'
          ? message.content
          : JSON.stringify(message.content);
        if (content.includes('```')) {
          keyPoints.push('Assistant provided code solution');
        } else if (content.length > 50) {
          keyPoints.push(`Assistant: ${content.substring(0, 100)}...`);
        }
      }
    }

    return keyPoints.slice(0, 5).join('; '); // Keep top 5 points
  }

  /**
   * Get optimal chunk size for long content
   */
  getOptimalChunkSize(totalTokens, overlapTokens = 200) {
    const maxChunkTokens = Math.floor(this.modelLimits.contextWindow * 0.7); // 70% of context window

    if (totalTokens <= maxChunkTokens) {
      return {
        chunks: 1,
        chunkSize: totalTokens,
        overlap: 0,
        processingStrategy: 'single'
      };
    }

    const effectiveChunkSize = maxChunkTokens - overlapTokens;
    const numChunks = Math.ceil(totalTokens / effectiveChunkSize);

    return {
      chunks: numChunks,
      chunkSize: effectiveChunkSize,
      overlap: overlapTokens,
      processingStrategy: 'chunked',
      estimatedTime: numChunks * 30 // Rough estimate in seconds
    };
  }

  /**
   * Split content into optimal chunks
   */
  chunkContent(content, chunkInfo) {
    if (chunkInfo.chunks === 1) {
      return [content];
    }

    const chunks = [];
    const contentLength = content.length;
    const charsPerToken = TOKEN_ESTIMATION.prose;
    const chunkChars = chunkInfo.chunkSize * charsPerToken;
    const overlapChars = chunkInfo.overlap * charsPerToken;

    let start = 0;
    let chunkIndex = 0;

    while (start < contentLength) {
      const end = Math.min(start + chunkChars, contentLength);
      let chunkContent = content.substring(start, end);

      // Try to break at natural boundaries (paragraphs, then sentences)
      if (end < contentLength) {
        const lastNewline = chunkContent.lastIndexOf('\n\n');
        const lastSentence = chunkContent.lastIndexOf('. ');
        const breakPoint = Math.max(lastNewline, lastSentence);
        if (breakPoint > chunkChars * 0.8) { // Don't break too early
          chunkContent = content.substring(start, start + breakPoint + 1);
        }
      }

      chunks.push({
        index: chunkIndex,
        content: chunkContent,
        tokens: this.estimateTokens(chunkContent),
        start: start,
        end: start + chunkContent.length
      });

      // Stop once this chunk reaches the end of the content; otherwise the
      // overlap would pull `start` backwards and loop forever on the last chunk
      if (start + chunkContent.length >= contentLength) break;

      // Move start position back by the overlap for the next chunk
      start = start + chunkContent.length - overlapChars;
      chunkIndex++;
    }

    return chunks;
  }

  /**
   * Get model information
   */
  getModelInfo() {
    return {
      model: this.model,
      limits: this.modelLimits,
      estimation: TOKEN_ESTIMATION
    };
  }

  /**
   * Clear token cache (and reset the hit/miss counters with it)
   */
  clearCache() {
    this.tokenCache.clear();
    this.cacheHits = 0;
    this.cacheMisses = 0;
  }

  /**
   * Get cache statistics
   */
  getCacheStats() {
    return {
      size: this.tokenCache.size,
      hitRate: this.cacheHits / (this.cacheHits + this.cacheMisses) || 0
    };
  }
}

module.exports = { TokenManager, MODEL_LIMITS, TOKEN_ESTIMATION };
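
Below is a minimal usage sketch, not part of the published file, showing the exported API end to end: token estimation, the context-window check, conversation optimization, and chunking. The require path ./token-manager and the sample messages are assumptions for illustration; everything else uses only the methods defined above.

// usage-example.js (illustrative; assumes the file above is saved as token-manager.js)
const { TokenManager } = require('./token-manager');

const manager = new TokenManager('claude-3-5-sonnet-20241022');

// A small sample conversation (contents made up for the example)
const messages = [
  { role: 'user', content: 'Explain how the chunking strategy works.' },
  { role: 'assistant', content: 'Content is split at paragraph or sentence boundaries, with a token overlap between chunks.' }
];

// Check whether the conversation plus the desired output fits in context
const check = manager.checkContextWindow(messages, 4096);
console.log(`Input tokens: ${check.inputTokens}, fits: ${check.fitsInContext}`);

// If it does not fit, trim it down; optimizeConversation summarizes or
// compresses the middle of the conversation while keeping system and
// recent messages intact
if (!check.fitsInContext) {
  const result = manager.optimizeConversation(messages, 4096);
  console.log(`Compressed to ${(result.compressionRatio * 100).toFixed(1)}% of original size`);
}

// Chunk a long document for piecewise processing
const longText = 'lorem ipsum '.repeat(100000);
const chunkInfo = manager.getOptimalChunkSize(manager.estimateTokens(longText));
const chunks = manager.chunkContent(longText, chunkInfo);
console.log(`${chunks.length} chunk(s), strategy: ${chunkInfo.processingStrategy}`);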