gemini-token-estimator
Version:
Estimate the number of tokens for Gemini models
67 lines (66 loc) • 3.17 kB
JavaScript
;
// Flag the CommonJS exports object with `__esModule: true`.
// NOTE(review): presumably emitted by the TypeScript/ESM-to-CJS build so that
// interop helpers treat this module as a transpiled ES module — confirm against the build config.
Object.defineProperty(exports, "__esModule", { value: true });
// Reset the three public export slots to undefined up front; each one is
// assigned its real function further down in this file.
exports.truncateTextToMaxTokens = exports.getTokenCount = exports.tokenize = void 0;
// Word-shaped patterns. Each one accepts an optional single leading space so that
// the space is folded into the following word instead of becoming its own token.

// One to three syllable-like groups: up to three lowercase consonants, one to
// three lowercase vowels, then up to three lowercase consonants.
const lowercaseWord = /[ ]?(?:[bcdfghjklmnpqrstvwxzßçñ]{0,3}[aeiouyàáâäèéêëìíîïòóôöùúûüýÿæœ]{1,3}[bcdfghjklmnpqrstvwxzßçñ]{0,3}){1,3}/;
// Same shape as lowercaseWord, but built from uppercase letters only.
const uppercaseWord = /[ ]?(?:[BCDFGHJKLMNPQRSTVWXZÇÑ]{0,3}[AEIOUYÀÁÂÄÈÉÊËÌÍÎÏÒÓÔÖÙÚÛÜÝŸÆŒ]{1,3}[BCDFGHJKLMNPQRSTVWXZÇÑ]{0,3}){1,3}/;
// One uppercase letter followed by one to eight lowercase letters.
const titlecaseWord = /[ ]?[A-ZÀÁÂÄÈÉÊËÌÍÎÏÒÓÔÖÙÚÛÜÝŸÆŒÇÑ][a-zàáâäèéêëìíîïòóôöùúûüýÿæœßçñ]{1,8}/;
// Sequences of consonants that are common and should be treated as a single token.
// `HTTPS?` must be a single alternative: with a bare `HTTP` listed first, the
// alternation stops at "HTTP" and "HTTPS" is split into "HTTP" + "S", unlike
// lowercase "https" which is kept whole.
const commonAbbreviations = /pdf|png|https?|rfp|www|PDF|PNG|HTTPS?|RFP|WWW/;
// One to five characters outside Basic Latin (U+0000-007F), Latin-1 letters and
// Latin Extended-A (U+00A0-017F), and the private use area (U+E000-F8FF).
const nonLatinWord = /[ ]?[^\u0000-\u007F\u00A0-\u00FF\u0100-\u017F\ue000-\uf8ff]{1,5}/;
// Alternatives are tried left to right at every position, so the order below is
// significant: earlier entries win over later ones.
const regexPatterns = [
    /\d/, // Single digit
    /\n+/, // One or more newlines
    /\r+/, // One or more carriage returns
    /\t+/, // One or more tabs
    /\v+/, // One or more vertical tabs
    /\f+/, // One or more form feeds
    /[\ue000-\uf8ff]/, // Single private use area character
    commonAbbreviations, // Common abbreviations
    lowercaseWord, // Lowercase word
    titlecaseWord, // Titlecase word
    uppercaseWord, // Uppercase word
    /[bcdfghjklmnpqrstvwxzßçñ]{1,2}/, // One or two lowercase consonants
    /[BCDFGHJKLMNPQRSTVWXZÇÑ]{1,2}/, // One or two uppercase consonants
    nonLatinWord, // Run of up to five non-Latin characters
    /\(\)/, // Empty pair of parentheses
    /\[\]/, // Empty pair of brackets
    /\{\}/, // Empty pair of braces
    // Run of 2 to 16 identical characters from . = # _ -
    // The \1 backreference relies on this being the only capturing group in the
    // combined pattern; every other group in this list must stay non-capturing.
    /([.=#_-])\1{1,15}/,
    /[ ]?[!@#$%^&*()_+\-=\[\]{}\\|;:'",.<>/?`~]{1,3}/, // One to three special characters
    /[ ]+/, // One or more spaces
    /./, // Any other character
];
// Join all pattern sources into one alternation.
//  - 'g' makes String.prototype.match return every match instead of the first.
//  - 'u' makes `.` and the {1,5} quantifier in nonLatinWord operate on whole code
//    points, so characters outside the BMP (e.g. emoji) are never cut in the
//    middle of a surrogate pair.
const combinedPattern = new RegExp(regexPatterns.map((pattern) => pattern.source).join('|'), 'gu');
/**
 * Split a string into approximate tokens, in the spirit of Gemma's tokenizer,
 * by collecting every match of the combined pattern in input order.
 * @param input The text to tokenize
 * @returns The matched tokens, or an empty array when nothing matches
 * @example
 * const tokens = tokenize('Hello, world!')
 * console.log(tokens)
 * Output: ['Hello', ',', ' world', '!']
 */
const tokenize = (input) => {
    const matches = input.match(combinedPattern);
    // A global match yields null rather than an empty array when there are no matches.
    if (!matches) {
        return [];
    }
    return matches;
};
exports.tokenize = tokenize;
/**
 * Count how many tokens the exported tokenizer produces for a text.
 * @param text The text to tokenize
 * @returns The number of tokens found in the text
 */
const getTokenCount = (text) => {
    const tokens = (0, exports.tokenize)(text);
    return tokens.length;
};
exports.getTokenCount = getTokenCount;
/**
 * Truncate text to a maximum number of tokens.
 * @param text The text to truncate
 * @param maxTokens The maximum number of tokens to keep. Negative values are
 * treated as 0; `undefined` keeps every token.
 * @returns An object with `truncatedText` (the kept tokens joined back into a
 * string) and `truncatedTokenCount` (how many tokens were kept)
 */
const truncateTextToMaxTokens = (text, maxTokens) => {
    const tokens = (0, exports.tokenize)(text);
    // Array.prototype.slice interprets a negative end as an offset from the end
    // of the array, so a negative budget would keep almost the whole text.
    // Clamp it to zero so an exhausted budget yields an empty result.
    const limit = maxTokens < 0 ? 0 : maxTokens;
    const truncatedTokens = tokens.slice(0, limit);
    const truncatedText = truncatedTokens.join('');
    return { truncatedText, truncatedTokenCount: truncatedTokens.length };
};
exports.truncateTextToMaxTokens = truncateTextToMaxTokens;