@afterxleep/doc-bot

Generic MCP server for intelligent documentation access in any project

/**
 * TokenEstimator - Realistic token counting for LLM content
 *
 * Implements proper tokenization estimation based on modern LLM patterns.
 * Much more accurate than naive character-count approaches.
 */
export class TokenEstimator {
  /**
   * Estimate token count using realistic tokenization patterns
   * Based on GPT-style tokenization rules and observed patterns
   *
   * @param {string} text - Text to analyze
   * @returns {number} Estimated token count
   */
  static estimateTokens(text) {
    if (!text) return 0;

    let tokens = 0;

    // Split by whitespace and punctuation patterns
    const words = text.split(/(\s+|[^\w\s])/);

    for (const word of words) {
      if (!word) continue;

      // Whitespace is often merged with adjacent tokens
      if (/^\s+$/.test(word)) {
        continue; // Don't count pure whitespace as tokens
      }

      // Single punctuation marks are usually 1 token
      if (/^[^\w\s]$/.test(word)) {
        tokens += 1;
        continue;
      }

      // Handle different word types
      if (/^\w+$/.test(word)) {
        // Regular words: estimate based on length and common patterns
        if (word.length <= 3) {
          tokens += 1; // Short words: 1 token
        } else if (word.length <= 6) {
          tokens += 1; // Medium words: usually 1 token
        } else if (word.length <= 10) {
          tokens += Math.ceil(word.length / 5); // Longer words: ~5 chars per token
        } else {
          // Very long words (often technical terms): ~4 chars per token
          tokens += Math.ceil(word.length / 4);
        }
      } else {
        // Mixed content (URLs, emails, code, etc.)
        // These are often tokenized more aggressively
        if (word.includes('://') || word.includes('@')) {
          // URLs and emails: roughly 3-4 chars per token
          tokens += Math.ceil(word.length / 3.5);
        } else if (/[A-Z]{2,}/.test(word) || /\d+/.test(word)) {
          // Acronyms and numbers: often 2-3 chars per token
          tokens += Math.ceil(word.length / 2.5);
        } else {
          // Other mixed content: 4 chars per token
          tokens += Math.ceil(word.length / 4);
        }
      }
    }

    // Account for special sequences that are tokenized differently
    // Code blocks, markdown, JSON, etc. tend to have more tokens
    const specialPatterns = [
      /```[\s\S]*?```/g,       // Code blocks
      /`[^`]+`/g,              // Inline code
      /\[[^\]]*\]\([^)]*\)/g,  // Markdown links
      /\*\*[^*]+\*\*/g,        // Bold text
      /\*[^*]+\*/g,            // Italic text
      /{[^}]*}/g,              // JSON-like structures
      /\([^)]*\)/g,            // Parenthetical content
    ];

    let specialTokens = 0;
    for (const pattern of specialPatterns) {
      const matches = text.match(pattern);
      if (matches) {
        for (const match of matches) {
          // Special content has higher token density
          specialTokens += Math.ceil(match.length / 3);
        }
      }
    }

    // Use the higher of the two estimates (word-based vs special-pattern-based)
    // This accounts for content that's heavily formatted vs plain text
    const wordBasedEstimate = tokens;
    const specialContentRatio = specialTokens / Math.max(1, text.length);

    if (specialContentRatio > 0.1) {
      // High special content - use pattern-based estimate with adjustment
      tokens = Math.max(wordBasedEstimate, Math.ceil(text.length / 3.2));
    } else {
      // Regular content - use word-based estimate
      tokens = wordBasedEstimate;
    }

    // Add buffer for control tokens, formatting, etc. (5-10% overhead)
    tokens = Math.ceil(tokens * 1.08);

    return tokens;
  }

  /**
   * Get the average characters per token for specific text
   * Useful for chunking operations
   */
  static getAvgCharsPerToken(text) {
    if (!text) return 4; // Fallback
    const tokens = this.estimateTokens(text);
    return tokens > 0 ? text.length / tokens : 4;
  }

  /**
   * Check if text exceeds a token limit
   */
  static exceedsLimit(text, maxTokens) {
    return this.estimateTokens(text) > maxTokens;
  }

  /**
   * Estimate how many characters would fit within a token budget
   * for a given text style
   */
  static estimateCharsForTokens(sampleText, targetTokens) {
    const avgCharsPerToken = this.getAvgCharsPerToken(sampleText);
    return Math.floor(targetTokens * avgCharsPerToken);
  }
}
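
A minimal usage sketch of the four static methods above. This is not part of the published file; the relative import path is an assumption for illustration, not the package's documented entry point.

// Illustrative only; adjust the import path to wherever this file lives in the package.
import { TokenEstimator } from './TokenEstimator.js';

const doc = '# Heading\n\nSome `inline code` and a [link](https://example.com).';

const tokens = TokenEstimator.estimateTokens(doc);                    // rough token count
const avgChars = TokenEstimator.getAvgCharsPerToken(doc);             // chars per token for this text style
const withinBudget = !TokenEstimator.exceedsLimit(doc, 8000);         // check against a token limit
const charBudget = TokenEstimator.estimateCharsForTokens(doc, 500);   // chars likely to fit in 500 tokens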