UNPKG

tokenx

Version:

Fast and lightweight token estimation for any LLM without requiring a full tokenizer

64 lines (63 loc) 2.56 kB
//#region src/index.ts
/**
 * Regular expressions used to classify a single text segment.
 *
 * - `whitespace`: the segment consists only of whitespace.
 * - `cjk`: the segment contains at least one character from the listed CJK,
 *   Katakana, Hangul and full-width/half-width ranges (unanchored).
 * - `numeric`: the segment is a run of digits, optionally grouped by `.` or `,`.
 * - `punctuation`: the segment contains at least one punctuation character
 *   (unanchored); also the building block of `TOKEN_SPLIT_PATTERN`.
 * - `alphanumeric`: the segment consists only of ASCII letters/digits and
 *   Latin-1 accented letters (U+00C0-U+00FF without the multiplication and
 *   division signs).
 */
const PATTERNS = {
  whitespace: /^\s+$/,
  cjk: /[\u4E00-\u9FFF\u3400-\u4DBF\u3000-\u303F\uFF00-\uFFEF\u30A0-\u30FF\u2E80-\u2EFF\u31C0-\u31EF\u3200-\u32FF\u3300-\u33FF\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F\uA960-\uA97F\uD7B0-\uD7FF]/,
  numeric: /^\d+(?:[.,]\d+)*$/,
  punctuation: /[.,!?;(){}[\]<>:/\\|@#$%^&*+=`~-]/,
  alphanumeric: /^[a-zA-Z0-9\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]+$/
};

/**
 * Splits text on runs of whitespace or runs of punctuation. The capture group
 * makes `String.prototype.split` keep the separators in its result, so they
 * are scored as segments too.
 */
const TOKEN_SPLIT_PATTERN = new RegExp(`(\\s+|${PATTERNS.punctuation.source}+)`);

/** Fallback average characters per token when no language config matches. */
const DEFAULT_CHARS_PER_TOKEN = 6;

/** Segments of at most this many UTF-16 code units count as one token. */
const SHORT_TOKEN_THRESHOLD = 3;

/**
 * Built-in language hints: when a segment contains one of the listed accented
 * letters, the paired `averageCharsPerToken` replaces the default ratio.
 * NOTE(review): the three sets look like German, French/Spanish and
 * Polish/Czech letters respectively - confirm against upstream docs.
 */
const DEFAULT_LANGUAGE_CONFIGS = [
  { pattern: /[äöüßẞ]/i, averageCharsPerToken: 3 },
  { pattern: /[éèêëàâîïôûùüÿçœæáíóúñ]/i, averageCharsPerToken: 3 },
  { pattern: /[ąćęłńóśźżěščřžýůúďťň]/i, averageCharsPerToken: 3.5 }
];

/**
 * Checks if a text string is within a specified token limit.
 *
 * @param text - Text to measure.
 * @param tokenLimit - Maximum allowed estimated token count (inclusive).
 * @param options - Forwarded unchanged to `estimateTokenCount`.
 * @returns `true` when the estimate is less than or equal to `tokenLimit`.
 */
function isWithinTokenLimit(text, tokenLimit, options) {
  return estimateTokenCount(text, options) <= tokenLimit;
}

/** Alias of `estimateTokenCount`. */
const approximateTokenSize = estimateTokenCount;

/**
 * Estimates the number of tokens in a text string using heuristic rules.
 *
 * The text is split on whitespace and punctuation runs, each resulting
 * segment is scored by `estimateSegmentTokens`, and the scores are summed.
 *
 * @param text - Text to measure; any falsy value yields 0.
 * @param options - Optional settings; `undefined` and `null` both mean
 *   "use the defaults".
 * @param options.defaultCharsPerToken - Ratio for alphanumeric segments that
 *   match no language config (defaults to `DEFAULT_CHARS_PER_TOKEN`).
 * @param options.languageConfigs - List of `{ pattern, averageCharsPerToken }`
 *   entries (defaults to `DEFAULT_LANGUAGE_CONFIGS`).
 * @returns The estimated token count.
 */
function estimateTokenCount(text, options = {}) {
  if (!text) return 0;

  // The parameter default only covers `undefined`; `?? {}` additionally keeps
  // an explicit `null` from throwing during destructuring.
  const {
    defaultCharsPerToken = DEFAULT_CHARS_PER_TOKEN,
    languageConfigs = DEFAULT_LANGUAGE_CONFIGS
  } = options ?? {};

  let tokenCount = 0;
  // `filter(Boolean)` drops the empty strings `split` emits between adjacent
  // separators and at the edges of the text.
  for (const segment of text.split(TOKEN_SPLIT_PATTERN).filter(Boolean)) {
    tokenCount += estimateSegmentTokens(segment, languageConfigs, defaultCharsPerToken);
  }
  return tokenCount;
}

/**
 * Scores one segment. The checks run in a fixed order and the first match wins:
 *
 * 1. whitespace only -> 0
 * 2. contains a CJK-range character -> one token per code point
 * 3. numeric -> 1
 * 4. at most `SHORT_TOKEN_THRESHOLD` UTF-16 code units -> 1
 * 5. contains punctuation -> one token per two characters, rounded up
 * 6. alphanumeric -> length divided by the language-specific (or default)
 *    chars-per-token ratio, rounded up
 * 7. anything else -> one token per code point
 *
 * Because rule 6 only accepts ASCII and Latin-1 letters, a word containing a
 * letter outside that range falls through to rule 7 without consulting
 * `languageConfigs`.
 */
function estimateSegmentTokens(segment, languageConfigs, defaultCharsPerToken) {
  if (PATTERNS.whitespace.test(segment)) return 0;
  if (PATTERNS.cjk.test(segment)) return getCharacterCount(segment);
  if (PATTERNS.numeric.test(segment)) return 1;
  if (segment.length <= SHORT_TOKEN_THRESHOLD) return 1;

  // `Math.ceil(1 / 2)` is already 1, so no special case is needed for a
  // single punctuation character.
  if (PATTERNS.punctuation.test(segment)) return Math.ceil(segment.length / 2);

  if (PATTERNS.alphanumeric.test(segment)) {
    const charsPerToken =
      getLanguageSpecificCharsPerToken(segment, languageConfigs) ?? defaultCharsPerToken;
    return Math.ceil(segment.length / charsPerToken);
  }

  return getCharacterCount(segment);
}

/**
 * Returns the `averageCharsPerToken` of the first config whose pattern matches
 * the segment, or `undefined` when none matches.
 */
function getLanguageSpecificCharsPerToken(segment, languageConfigs) {
  for (const config of languageConfigs) {
    const { pattern } = config;
    // A regex with the `g` or `y` flag resumes matching from `lastIndex`, which
    // the previous `test()` call on a different segment left behind. Rewind it
    // so every segment is examined from its start. Regexes without those flags
    // ignore `lastIndex`, so they are left untouched (they may be frozen).
    if (pattern.global || pattern.sticky) pattern.lastIndex = 0;
    if (pattern.test(segment)) return config.averageCharsPerToken;
  }
  return undefined;
}

/**
 * Counts code points rather than UTF-16 code units, so a character encoded as
 * a surrogate pair counts once.
 */
function getCharacterCount(text) {
  return Array.from(text).length;
}

//#endregion
export { approximateTokenSize, estimateTokenCount, isWithinTokenLimit };