UNPKG

gemini-token-estimator

Version:

Estimate the number of tokens for Gemini models

github.com/smartprocure/gemini-token-estimator

smartprocure/gemini-token-estimator

67 lines (66 loc) • 3.17 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.truncateTextToMaxTokens = exports.getTokenCount = exports.tokenize = void 0; const lowercaseWord = /[ ]?(?:[bcdfghjklmnpqrstvwxzßçñ]{0,3}[aeiouyàáâäèéêëìíîïòóôöùúûüýÿæœ]{1,3}[bcdfghjklmnpqrstvwxzßçñ]{0,3}){1,3}/; const uppercaseWord = /[ ]?(?:[BCDFGHJKLMNPQRSTVWXZÇÑ]{0,3}[AEIOUYÀÁÂÄÈÉÊËÌÍÎÏÒÓÔÖÙÚÛÜÝŸÆŒ]{1,3}[BCDFGHJKLMNPQRSTVWXZÇÑ]{0,3}){1,3}/; const titlecaseWord = /[ ]?[A-ZÀÁÂÄÈÉÊËÌÍÎÏÒÓÔÖÙÚÛÜÝŸÆŒÇÑ][a-zàáâäèéêëìíîïòóôöùúûüýÿæœßçñ]{1,8}/; // Sequences of consonants that are common and should be treated as a single token const commonAbbreviations = /pdf|png|http(?:s)?|rfp|www|PDF|PNG|HTTP|HTTP(?:S)?|RFP|WWW/; // Match anything that's not in common Latin character ranges and not in private use area const nonLatinWord = /[ ]?[^\u0000-\u007F\u00A0-\u00FF\u0100-\u017F\ue000-\uf8ff]{1,5}/; const regexPatterns = [ /\d/, // Single digit /\n+/, // One or more newlines /\r+/, // One or more carriage returns /\t+/, // One or more tabs /\v+/, // One or more vertical tabs /\f+/, // One or more form feeds /[\ue000-\uf8ff]/, // Private use area commonAbbreviations, // Common abbreviations lowercaseWord, // Lowercase word titlecaseWord, // Titlecase word uppercaseWord, // Uppercase word /[bcdfghjklmnpqrstvwxzßçñ]{1,2}/, // One or two consonants /[BCDFGHJKLMNPQRSTVWXZÇÑ]{1,2}/, // One or two uppercase consonants nonLatinWord, /\(\)/, // Parentheses /\[\]/, // Brackets /\{\}/, // Braces /([.=#_-])\1{1,15}/, // Handle repeated special characters with lengths of up to 16 /[ ]?[!@#$%^&*()_+\-=\[\]{}\\|;:'",.<>/?`~]{1,3}/, // One to three special characters /[ ]+/, // One or more spaces /./, // Any other character ]; // Combine all patterns with the 'g' flag for global matching const combinedPattern = new RegExp(regexPatterns.map((pattern) => pattern.source).join('|'), 'g'); /** * Tokenize a string similar to Gemma's tokenizer * @param input The text to tokenize * @returns The tokens * @example * const tokens = tokenize('Hello, world!') * console.log(tokens) * Output: ['Hello', ',', ' world', '!'] */ const tokenize = (input) => input.match(combinedPattern) || []; exports.tokenize = tokenize; /** * Get the count of tokens in a text * @param text The text to tokenize * @returns The count of tokens in the text */ const getTokenCount = (text) => (0, exports.tokenize)(text).length; exports.getTokenCount = getTokenCount; /** * Truncate text to a maximum number of tokens * @param text The text to truncate * @param maxTokens The maximum number of tokens to truncate to * @returns The truncated text */ const truncateTextToMaxTokens = (text, maxTokens) => { const tokens = (0, exports.tokenize)(text); const truncatedTokens = tokens.slice(0, maxTokens); const truncatedText = truncatedTokens.join(''); return { truncatedText, truncatedTokenCount: truncatedTokens.length }; }; exports.truncateTextToMaxTokens = truncateTextToMaxTokens;