ultratoken

UltraToken Utility - CLI tool for token cost analysis
const fs = require('fs').promises;
const path = require('path');
const readline = require('readline');

// ANSI color codes for terminal output
const colors = {
  reset: '\x1b[0m',
  red: '\x1b[31m',
  green: '\x1b[32m',
  yellow: '\x1b[33m',
  blue: '\x1b[34m',
  cyan: '\x1b[36m',
  white: '\x1b[37m',
  bold: '\x1b[1m',
  gray: '\x1b[90m'
};

const colorize = (color, text) => `${colors[color]}${text}${colors.reset}`;

// ============================================================================
// Tiktoken Core Implementation (based on js-tiktoken)
// ============================================================================

function base64ToBytes(base64) {
  const binaryString = Buffer.from(base64, 'base64').toString('binary');
  const bytes = new Uint8Array(binaryString.length);
  for (let i = 0; i < binaryString.length; i++) {
    bytes[i] = binaryString.charCodeAt(i);
  }
  return bytes;
}

// Greedy byte-pair merging: repeatedly merge the adjacent pair with the
// lowest rank until no mergeable pair remains.
function bytePairMerge(piece, ranks) {
  let parts = Array.from({ length: piece.length }, (_, i) => ({ start: i, end: i + 1 }));

  while (parts.length > 1) {
    let minIndex = -1;
    let minRankValue = Infinity;

    // Scan for the adjacent pair with the lowest merge rank
    for (let i = 0; i < parts.length - 1; i++) {
      const slice = piece.slice(parts[i].start, parts[i + 1].end);
      const key = Array.from(slice).join(',');
      const rank = ranks.get(key);
      if (rank != null && rank < minRankValue) {
        minRankValue = rank;
        minIndex = i;
      }
    }

    if (minIndex !== -1) {
      // Merge the winning pair into a single part
      parts[minIndex] = { start: parts[minIndex].start, end: parts[minIndex + 1].end };
      parts.splice(minIndex + 1, 1);
    } else {
      break;
    }
  }

  return parts;
}

function bytePairEncode(piece, ranks) {
  if (piece.length === 1) {
    const key = Array.from(piece).join(',');
    return [ranks.get(key)];
  }

  const parts = bytePairMerge(piece, ranks);
  const tokens = [];
  for (const part of parts) {
    const slice = piece.slice(part.start, part.end);
    const key = Array.from(slice).join(',');
    const rank = ranks.get(key);
    if (rank != null) {
      tokens.push(rank);
    }
  }
  return tokens;
}

function escapeRegex(str) {
  return str.replace(/[\\^$*+?.()|[\]{}]/g, '\\$&');
}

class Tiktoken {
  constructor(ranks) {
    this.patStr = ranks.pat_str;
    this.specialTokens = ranks.special_tokens || {};
    this.textEncoder = new TextEncoder();
    this.textDecoder = new TextDecoder('utf-8');
    this.rankMap = new Map();
    this.textMap = new Map();

    // Parse the BPE ranks: each line carries a rank offset (parts[1])
    // followed by base64-encoded token byte sequences
    if (ranks.bpe_ranks) {
      const lines = ranks.bpe_ranks.split('\n').filter(Boolean);
      for (const line of lines) {
        const parts = line.split(' ');
        const offsetStr = parts[1];
        const offset = parseInt(offsetStr, 10);
        for (let i = 2; i < parts.length; i++) {
          const token = parts[i];
          const rank = offset + (i - 2);
          const bytes = base64ToBytes(token);
          const key = Array.from(bytes).join(',');
          this.rankMap.set(key, rank);
          this.textMap.set(rank, bytes);
        }
      }
    }

    // Create inverse special tokens map (rank -> UTF-8 bytes) for decoding
    this.inverseSpecialTokens = {};
    for (const [text, rank] of Object.entries(this.specialTokens)) {
      this.inverseSpecialTokens[rank] = this.textEncoder.encode(text);
    }
  }

  static specialTokenRegex(tokens) {
    return new RegExp(tokens.map(t => escapeRegex(t)).join('|'), 'g');
  }

  encode(text, allowedSpecial = [], disallowedSpecial = 'all') {
    const regex = new RegExp(this.patStr, 'ug');
    const specialRegex = Tiktoken.specialTokenRegex(Object.keys(this.specialTokens));
    const ret = [];

    const allowedSpecialSet = new Set(
      allowedSpecial === 'all' ? Object.keys(this.specialTokens) : allowedSpecial
    );
    const disallowedSpecialSet = new Set(
      disallowedSpecial === 'all'
        ? Object.keys(this.specialTokens).filter(x => !allowedSpecialSet.has(x))
        : disallowedSpecial
    );

    // Check for disallowed special tokens
    if (disallowedSpecialSet.size > 0) {
      const disallowedRegex = Tiktoken.specialTokenRegex([...disallowedSpecialSet]);
      const specialMatch = text.match(disallowedRegex);
      if (specialMatch != null) {
        throw new Error(`Text contains disallowed special token: ${specialMatch[0]}`);
      }
    }

    let start = 0;
    while (true) {
      let nextSpecial = null;
      let startFind = start;

      // Find next allowed special token
      while (true) {
        specialRegex.lastIndex = startFind;
        nextSpecial = specialRegex.exec(text);
        if (nextSpecial == null || allowedSpecialSet.has(nextSpecial[0])) break;
        startFind = nextSpecial.index + 1;
      }

      const end = nextSpecial?.index ?? text.length;

      // Process regular text before the special token
      const substring = text.substring(start, end);
      const matches = substring.matchAll(regex);
      for (const match of matches) {
        const piece = this.textEncoder.encode(match[0]);
        const key = Array.from(piece).join(',');
        const token = this.rankMap.get(key);
        if (token != null) {
          // Whole piece is already a known token
          ret.push(token);
        } else {
          // Apply BPE
          const encoded = bytePairEncode(piece, this.rankMap);
          ret.push(...encoded);
        }
      }

      // Handle special token
      if (nextSpecial == null) break;
      const token = this.specialTokens[nextSpecial[0]];
      ret.push(token);
      start = nextSpecial.index + nextSpecial[0].length;
    }

    return ret;
  }

  decode(tokens) {
    const chunks = [];
    let totalLength = 0;
    for (const token of tokens) {
      const bytes = this.textMap.get(token) ?? this.inverseSpecialTokens[token];
      if (bytes != null) {
        chunks.push(bytes);
        totalLength += bytes.length;
      }
    }

    // Merge all chunks into one byte array, then decode as UTF-8
    const result = new Uint8Array(totalLength);
    let offset = 0;
    for (const chunk of chunks) {
      result.set(chunk, offset);
      offset += chunk.length;
    }
    return this.textDecoder.decode(result);
  }

  countTokens(text) {
    return this.encode(text).length;
  }
}

// ============================================================================
// UltraToken Implementation
// ============================================================================

let tokenizer = null;

// Initialize tokenizer
function initializeTokenizer() {
  try {
    const vocabPath = path.join(__dirname, '..', 'data', 'o200k_base.js');
    if (!require('fs').existsSync(vocabPath)) {
      throw new Error('o200k_base.js vocabulary not found in data folder');
    }
    const ranks = require(vocabPath);
    tokenizer = new Tiktoken(ranks);
    console.log(colorize('green', `āœ“ Loaded ${ranks.modelName} tokenizer (${ranks.vocabSize} tokens)`));
  } catch (error) {
    console.error(colorize('red', 'Failed to load tokenizer:'), error.message);
    process.exit(1);
  }
}

// Initialize on module load
initializeTokenizer();

/**
 * Get token count for text
 */
function getTokenCount(text) {
  if (!text || typeof text !== 'string') {
    throw new Error('Invalid input: text must be a non-empty string');
  }
  try {
    return tokenizer.countTokens(text);
  } catch (error) {
    throw new Error(`Tokenization failed: ${error.message}`);
  }
}

/**
 * Process a file in place, appending a token count to each non-empty line
 */
async function processWordFile(filename) {
  try {
    await fs.access(filename, fs.constants.R_OK);
    const content = await fs.readFile(filename, 'utf-8');
    const lines = content.split('\n');
    const processedLines = [];

    console.log(colorize('blue', `šŸ“ Processing ${lines.length} lines...`));

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i].trim();
      if (line === '') {
        processedLines.push('');
        continue;
      }
      try {
        const tokenCount = getTokenCount(line);
        processedLines.push(`${line} ${tokenCount}`);
        if (lines.length > 100 && (i + 1) % 100 === 0) {
          console.log(colorize('gray', `Progress: ${i + 1}/${lines.length} lines processed`));
        }
      } catch (error) {
        console.warn(colorize('yellow', `Warning: Could not process line ${i + 1}: "${line}"`));
        processedLines.push(line);
      }
    }

    const outputContent = processedLines.join('\n');
    await fs.writeFile(filename, outputContent, 'utf-8');
    console.log(colorize('green', `āœ… Processed ${processedLines.filter(line => line.trim() !== '').length} words`));
  } catch (error) {
    if (error.code === 'ENOENT') {
      throw new Error(`File not found: ${filename}`);
    } else if (error.code === 'EACCES') {
      throw new Error(`Permission denied: ${filename}`);
    } else {
      throw new Error(`File processing error: ${error.message}`);
    }
  }
}

/**
 * Start interactive mode
 */
function startInteractiveMode() {
  console.log(colorize('cyan', colorize('bold', 'šŸš€ UltraToken TikToken Utility')));
  console.log(colorize('gray', 'Interactive Mode - Type words to get token counts'));
  console.log(colorize('gray', 'Commands: "jump" to exit, "help" for help\n'));

  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
    prompt: colorize('blue', 'ultratoken> ')
  });

  rl.prompt();

  rl.on('line', (input) => {
    const trimmedInput = input.trim();

    if (trimmedInput === '') {
      rl.prompt();
      return;
    }

    if (trimmedInput === 'jump') {
      console.log(colorize('yellow', 'šŸ‘‹ UltraToken terminated. Goodbye!'));
      rl.close();
      return;
    }

    if (trimmedInput === 'help') {
      showHelp();
      rl.prompt();
      return;
    }

    try {
      const tokenCount = getTokenCount(trimmedInput);
      console.log(colorize('green', `"${colorize('bold', trimmedInput)}" = ${colorize('bold', tokenCount.toString())} tokens`));
    } catch (error) {
      console.log(colorize('red', `Error: ${error.message}`));
    }

    rl.prompt();
  });

  rl.on('close', () => {
    console.log(colorize('yellow', '\nšŸ‘‹ UltraToken terminated. Goodbye!'));
    process.exit(0);
  });

  rl.on('SIGINT', () => {
    console.log(colorize('yellow', '\nšŸ‘‹ UltraToken terminated. Goodbye!'));
    process.exit(0);
  });
}

/**
 * Show help information
 */
function showHelp() {
  console.log(colorize('cyan', '\nšŸ“– UltraToken Help:'));
  console.log(colorize('white', ' • Type any word or text to get its token count'));
  console.log(colorize('white', ' • "jump" - Exit the program'));
  console.log(colorize('white', ' • "help" - Show this help message'));
  console.log(colorize('gray', '\nšŸ’” CLI Usage:'));
  console.log(colorize('gray', ' ultratoken <word> - Get token count for a word'));
  console.log(colorize('gray', ' ultratoken economy <file> - Process word file'));
  console.log(colorize('gray', ' ultratoken jump - Exit program\n'));
}

module.exports = {
  getTokenCount,
  processWordFile,
  startInteractiveMode,
  Tiktoken
};
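// ----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the module). Assumes the
// package is installed as "ultratoken" and that data/o200k_base.js ships with
// it, since the tokenizer initializes eagerly when this file is required.
//
//   const { getTokenCount, processWordFile, startInteractiveMode } = require('ultratoken');
//
//   // Count o200k_base tokens for a string:
//   console.log(getTokenCount('hello world'));
//
//   // Rewrite words.txt in place, appending a token count to each non-empty line:
//   processWordFile('words.txt').catch(err => console.error(err.message));
//
//   // Or start the interactive prompt (type "jump" to exit):
//   startInteractiveMode();
// ----------------------------------------------------------------------------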