// UltraToken Utility - CLI tool for token cost analysis
const fs = require('fs').promises;
const path = require('path');
const readline = require('readline');
// ANSI escape sequences used to style terminal output.
const colors = {
  reset: '\x1b[0m',
  red: '\x1b[31m',
  green: '\x1b[32m',
  yellow: '\x1b[33m',
  blue: '\x1b[34m',
  cyan: '\x1b[36m',
  white: '\x1b[37m',
  bold: '\x1b[1m',
  gray: '\x1b[90m'
};

// Wrap `text` in the escape code for `color`, then reset styling afterwards.
function colorize(color, text) {
  return colors[color] + text + colors.reset;
}
// ============================================================================
// Tiktoken Core Implementation (based on js-tiktoken)
// ============================================================================
/**
 * Decode a base64 string into raw bytes.
 *
 * Uses Buffer's native base64 decoder instead of round-tripping through a
 * 'binary'-encoded string and copying char codes one at a time.
 *
 * @param {string} base64 - Base64-encoded data.
 * @returns {Uint8Array} Decoded bytes (an independent copy, not a Buffer view).
 */
function base64ToBytes(base64) {
  const buf = Buffer.from(base64, 'base64');
  // Copy into a standalone Uint8Array so callers never alias Buffer's pool.
  return new Uint8Array(buf);
}
/**
 * Greedily merge adjacent spans of `piece` according to BPE ranks.
 *
 * Starts from one span per byte and repeatedly merges the adjacent pair
 * whose combined byte sequence has the lowest rank in `ranks`, until no
 * adjacent pair appears in the rank table.
 *
 * Fix: removed the dead `minRank` local, which was declared but never
 * read or written.
 *
 * @param {Uint8Array} piece - Raw bytes of one pre-tokenized chunk.
 * @param {Map<string, number>} ranks - Comma-joined byte sequence -> BPE rank.
 * @returns {{start: number, end: number}[]} Final spans into `piece`.
 */
function bytePairMerge(piece, ranks) {
  // One initial part per byte: [i, i+1).
  let parts = Array.from({ length: piece.length }, (_, i) => ({
    start: i,
    end: i + 1
  }));
  while (parts.length > 1) {
    // Find the adjacent pair with the lowest (best) merge rank.
    let minIndex = -1;
    let minRankValue = Infinity;
    for (let i = 0; i < parts.length - 1; i++) {
      const slice = piece.slice(parts[i].start, parts[i + 1].end);
      const key = Array.from(slice).join(',');
      const rank = ranks.get(key);
      if (rank != null && rank < minRankValue) {
        minRankValue = rank;
        minIndex = i;
      }
    }
    if (minIndex === -1) break; // No mergeable pair left.
    // Merge parts[minIndex] with its right neighbor.
    parts[minIndex] = {
      start: parts[minIndex].start,
      end: parts[minIndex + 1].end
    };
    parts.splice(minIndex + 1, 1);
  }
  return parts;
}
/**
 * Encode one pre-tokenized byte chunk into BPE token ids.
 *
 * @param {Uint8Array} piece - Raw bytes of the chunk.
 * @param {Map<string, number>} ranks - Comma-joined byte sequence -> rank.
 * @returns {number[]} Token ids for the chunk.
 */
function bytePairEncode(piece, ranks) {
  // Single byte: look it up directly, no merging needed.
  if (piece.length === 1) {
    return [ranks.get(Array.from(piece).join(','))];
  }
  const tokens = [];
  for (const { start, end } of bytePairMerge(piece, ranks)) {
    const key = Array.from(piece.slice(start, end)).join(',');
    const rank = ranks.get(key);
    // Spans absent from the rank table are skipped silently.
    if (rank != null) tokens.push(rank);
  }
  return tokens;
}
// Backslash-escape every regex metacharacter in `str` so the string can be
// embedded verbatim inside a RegExp pattern.
const escapeRegex = (str) => str.replace(/[\\^$*+?.()|[\]{}]/g, '\\$&');
/**
 * Byte-pair-encoding tokenizer modeled on js-tiktoken.
 *
 * Holds the rank table (byte sequence -> token id), the pre-tokenization
 * regex, and the special-token maps needed to encode text into token ids
 * and decode ids back to text.
 */
class Tiktoken {
  /**
   * @param {Object} ranks - Encoding definition.
   * @param {string} ranks.pat_str - Pre-tokenization regex source (compiled with 'ug' flags in encode()).
   * @param {Object<string, number>} [ranks.special_tokens] - Special-token text -> token id.
   * @param {string} [ranks.bpe_ranks] - Serialized rank table. One line per group;
   *   fields are space-separated: field 1 is a numeric id offset and fields 2..n
   *   are base64-encoded byte sequences assigned ids offset, offset+1, ...
   *   (field 0 is ignored by this parser).
   */
  constructor(ranks) {
    this.patStr = ranks.pat_str;
    this.specialTokens = ranks.special_tokens || {};
    this.textEncoder = new TextEncoder();
    this.textDecoder = new TextDecoder('utf-8');
    // rankMap: comma-joined byte sequence -> token id (used by encode()).
    this.rankMap = new Map();
    // textMap: token id -> raw bytes (inverse of rankMap, used by decode()).
    this.textMap = new Map();
    // Parse the BPE ranks
    if (ranks.bpe_ranks) {
      const lines = ranks.bpe_ranks.split('\n').filter(Boolean);
      for (const line of lines) {
        const parts = line.split(' ');
        const offsetStr = parts[1]; // field 0 of each line is not used here
        const offset = parseInt(offsetStr, 10);
        for (let i = 2; i < parts.length; i++) {
          const token = parts[i];
          const rank = offset + (i - 2);
          const bytes = base64ToBytes(token);
          const key = Array.from(bytes).join(',');
          this.rankMap.set(key, rank);
          this.textMap.set(rank, bytes);
        }
      }
    }
    // Create inverse special tokens map (id -> UTF-8 bytes) for decode().
    this.inverseSpecialTokens = {};
    for (const [text, rank] of Object.entries(this.specialTokens)) {
      this.inverseSpecialTokens[rank] = this.textEncoder.encode(text);
    }
  }
  /**
   * Build a global regex matching any of the given special-token strings
   * literally (metacharacters escaped).
   * @param {string[]} tokens - Special-token strings.
   * @returns {RegExp} Alternation regex with the 'g' flag.
   */
  static specialTokenRegex(tokens) {
    return new RegExp(tokens.map(t => escapeRegex(t)).join('|'), 'g');
  }
  /**
   * Encode text into token ids.
   *
   * @param {string} text - Input text.
   * @param {string[]|'all'} [allowedSpecial] - Special tokens that may appear in
   *   `text` and be encoded to their ids; 'all' allows every known special token.
   * @param {string[]|'all'} [disallowedSpecial] - Special tokens whose presence in
   *   `text` is an error; 'all' means every special token not explicitly allowed.
   * @returns {number[]} Token ids.
   * @throws {Error} If `text` contains a disallowed special token.
   */
  encode(text, allowedSpecial = [], disallowedSpecial = 'all') {
    const regex = new RegExp(this.patStr, 'ug');
    const specialRegex = Tiktoken.specialTokenRegex(Object.keys(this.specialTokens));
    const ret = [];
    const allowedSpecialSet = new Set(
      allowedSpecial === 'all'
        ? Object.keys(this.specialTokens)
        : allowedSpecial
    );
    const disallowedSpecialSet = new Set(
      disallowedSpecial === 'all'
        ? Object.keys(this.specialTokens).filter(x => !allowedSpecialSet.has(x))
        : disallowedSpecial
    );
    // Check for disallowed special tokens
    if (disallowedSpecialSet.size > 0) {
      const disallowedRegex = Tiktoken.specialTokenRegex([...disallowedSpecialSet]);
      const specialMatch = text.match(disallowedRegex);
      if (specialMatch != null) {
        throw new Error(`Text contains disallowed special token: ${specialMatch[0]}`);
      }
    }
    let start = 0;
    while (true) {
      let nextSpecial = null;
      let startFind = start;
      // Find next allowed special token. Matches that are not in the allowed
      // set are skipped by resuming the stateful search one character past
      // each rejected match (specialRegex.lastIndex drives the scan).
      while (true) {
        specialRegex.lastIndex = startFind;
        nextSpecial = specialRegex.exec(text);
        if (nextSpecial == null || allowedSpecialSet.has(nextSpecial[0])) break;
        startFind = nextSpecial.index + 1;
      }
      // Encode everything up to the special token (or to end of text).
      const end = nextSpecial?.index ?? text.length;
      // Process regular text before special token
      const substring = text.substring(start, end);
      const matches = substring.matchAll(regex);
      for (const match of matches) {
        const piece = this.textEncoder.encode(match[0]);
        const key = Array.from(piece).join(',');
        const token = this.rankMap.get(key);
        if (token != null) {
          // Whole piece is already a known token: emit it directly.
          ret.push(token);
        } else {
          // Apply BPE
          const encoded = bytePairEncode(piece, this.rankMap);
          ret.push(...encoded);
        }
      }
      // Handle special token
      if (nextSpecial == null) break;
      const token = this.specialTokens[nextSpecial[0]];
      ret.push(token);
      start = nextSpecial.index + nextSpecial[0].length;
    }
    return ret;
  }
  /**
   * Decode token ids back into a UTF-8 string. Ids that are neither regular
   * nor special tokens are silently skipped.
   * @param {number[]} tokens - Token ids.
   * @returns {string} Decoded text.
   */
  decode(tokens) {
    const chunks = [];
    let totalLength = 0;
    for (const token of tokens) {
      // Regular tokens come from textMap; special tokens from the inverse map.
      const bytes = this.textMap.get(token) ?? this.inverseSpecialTokens[token];
      if (bytes != null) {
        chunks.push(bytes);
        totalLength += bytes.length;
      }
    }
    // Merge all chunks
    const result = new Uint8Array(totalLength);
    let offset = 0;
    for (const chunk of chunks) {
      result.set(chunk, offset);
      offset += chunk.length;
    }
    return this.textDecoder.decode(result);
  }
  /**
   * Count tokens in `text` (length of encode() with default special-token settings).
   * @param {string} text - Input text.
   * @returns {number} Token count.
   */
  countTokens(text) {
    return this.encode(text).length;
  }
}
// ============================================================================
// UltraToken Implementation
// ============================================================================
// Module-level tokenizer instance; populated once at load time.
let tokenizer = null;

// Load the o200k_base vocabulary from the data folder and build the
// tokenizer. Any failure is fatal: the CLI cannot do anything without it.
function initializeTokenizer() {
  const vocabPath = path.join(__dirname, '..', 'data', 'o200k_base.js');
  try {
    if (!require('fs').existsSync(vocabPath)) {
      throw new Error('o200k_base.js vocabulary not found in data folder');
    }
    const ranks = require(vocabPath);
    tokenizer = new Tiktoken(ranks);
    // NOTE(review): the leading 'ā' glyph looks mojibake-garbled — confirm the
    // originally intended character before changing it.
    console.log(colorize('green', `ā Loaded ${ranks.modelName} tokenizer (${ranks.vocabSize} tokens)`));
  } catch (error) {
    console.error(colorize('red', 'Failed to load tokenizer:'), error.message);
    process.exit(1);
  }
}

// Initialize on module load
initializeTokenizer();
/**
 * Count tokens in `text` using the module-level tokenizer.
 *
 * @param {string} text - Non-empty string to tokenize.
 * @returns {number} Number of tokens.
 * @throws {Error} If `text` is not a non-empty string, or if tokenization fails.
 */
function getTokenCount(text) {
  if (!text || typeof text !== 'string') {
    throw new Error('Invalid input: text must be a non-empty string');
  }
  try {
    return tokenizer.countTokens(text);
  } catch (error) {
    // Preserve the original error and its stack via `cause` instead of
    // discarding them when wrapping.
    throw new Error(`Tokenization failed: ${error.message}`, { cause: error });
  }
}
/**
 * Process a word file: append each non-empty line's token count to the line,
 * then write the result back over the same file in place.
 *
 * @param {string} filename - Path of the file to read and rewrite.
 * @returns {Promise<void>}
 * @throws {Error} If the file is missing, unreadable, or processing fails.
 */
async function processWordFile(filename) {
  try {
    await fs.access(filename, fs.constants.R_OK);
    const content = await fs.readFile(filename, 'utf-8');
    const lines = content.split('\n');
    const processedLines = [];
    console.log(colorize('blue', `š Processing ${lines.length} lines...`));
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i].trim();
      // Preserve blank lines untouched.
      if (line === '') {
        processedLines.push('');
        continue;
      }
      try {
        const tokenCount = getTokenCount(line);
        processedLines.push(`${line} ${tokenCount}`);
        // Progress report every 100 lines for large files.
        if (lines.length > 100 && (i + 1) % 100 === 0) {
          console.log(colorize('gray', `Progress: ${i + 1}/${lines.length} lines processed`));
        }
      } catch (error) {
        // Keep the original line rather than dropping it on failure.
        console.warn(colorize('yellow', `Warning: Could not process line ${i + 1}: "${line}"`));
        processedLines.push(line);
      }
    }
    const outputContent = processedLines.join('\n');
    await fs.writeFile(filename, outputContent, 'utf-8');
    console.log(colorize('green', `ā Processed ${processedLines.filter(line => line.trim() !== '').length} words`));
  } catch (error) {
    // Fix: these messages previously interpolated the literal text
    // "$(unknown)" (shell-style syntax) instead of the actual filename.
    if (error.code === 'ENOENT') {
      throw new Error(`File not found: ${filename}`);
    } else if (error.code === 'EACCES') {
      throw new Error(`Permission denied: ${filename}`);
    } else {
      throw new Error(`File processing error: ${error.message}`);
    }
  }
}
/**
 * Start the interactive REPL: reads input lines from stdin and prints their
 * token counts until the user types "jump" or closes the stream.
 */
function startInteractiveMode() {
  console.log(colorize('cyan', colorize('bold', 'š UltraToken TikToken Utility')));
  console.log(colorize('gray', 'Interactive Mode - Type words to get token counts'));
  console.log(colorize('gray', 'Commands: "jump" to exit, "help" for help\n'));

  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
    prompt: colorize('blue', 'ultratoken> ')
  });

  // Shared farewell message; `prefix` is '' or '\n' depending on the caller.
  const sayGoodbye = (prefix) => {
    console.log(colorize('yellow', `${prefix}š UltraToken terminated. Goodbye!`));
  };

  rl.on('line', (input) => {
    const word = input.trim();
    switch (word) {
      case '':
        break;
      case 'jump':
        sayGoodbye('');
        rl.close();
        return; // 'close' handler takes over from here
      case 'help':
        showHelp();
        break;
      default:
        try {
          const count = getTokenCount(word);
          console.log(colorize('green', `"${colorize('bold', word)}" = ${colorize('bold', count.toString())} tokens`));
        } catch (error) {
          console.log(colorize('red', `Error: ${error.message}`));
        }
    }
    rl.prompt();
  });

  rl.on('close', () => {
    sayGoodbye('\n');
    process.exit(0);
  });

  rl.on('SIGINT', () => {
    sayGoodbye('\n');
    process.exit(0);
  });

  rl.prompt();
}
/**
 * Print usage help for interactive mode and the CLI.
 */
function showHelp() {
  // [color, text] pairs printed in order.
  const helpLines = [
    ['cyan', '\nš UltraToken Help:'],
    ['white', ' ⢠Type any word or text to get its token count'],
    ['white', ' ⢠"jump" - Exit the program'],
    ['white', ' ⢠"help" - Show this help message'],
    ['gray', '\nš” CLI Usage:'],
    ['gray', ' ultratoken <word> - Get token count for a word'],
    ['gray', ' ultratoken economy <file> - Process word file'],
    ['gray', ' ultratoken jump - Exit program\n']
  ];
  for (const [color, text] of helpLines) {
    console.log(colorize(color, text));
  }
}
// Public API: token counting, batch file processing, the interactive REPL,
// and the underlying Tiktoken class for direct use.
module.exports = {
getTokenCount,
processWordFile,
startInteractiveMode,
Tiktoken
};