/**
 * tokenx — GPT token estimation and context size utilities without a
 * full tokenizer. (CommonJS build; ~92 lines / 3.25 kB of JavaScript.)
 */
// Context window sizes (in tokens) for known OpenAI models, keyed by model
// name. Consulted by `getModelContextSize`; unknown models fall back there.
const modelContextSizeMap = /* @__PURE__ */ new Map(
  Object.entries({
    "gpt-3.5-turbo-16k": 16384,
    "gpt-3.5-turbo": 4096,
    "gpt-4-1106-preview": 128e3,
    "gpt-4-32k": 32768,
    "gpt-4": 8192,
    "text-davinci-003": 4097,
    "text-curie-001": 2048,
    "text-babbage-001": 2048,
    "text-ada-001": 2048,
    "code-davinci-002": 8e3,
    "code-cushman-001": 2048
  })
);
/**
 * Collapse a dated or variant model identifier to its base family name.
 * Order matters: "gpt-3.5-" is checked before "gpt-4-", and "gpt-4o" models
 * are mapped onto the "gpt-4" family. Unrecognized names pass through as-is.
 *
 * @param {string} modelName - Raw model identifier (e.g. "gpt-4-0613").
 * @returns {string} The base model name.
 */
function resolveModelName(modelName) {
  const prefixAliases = [
    ["gpt-3.5-", "gpt-3.5-turbo"],
    ["gpt-4-", "gpt-4"],
    ["gpt-4o", "gpt-4"]
  ];
  for (const [prefix, base] of prefixAliases) {
    if (modelName.startsWith(prefix)) {
      return base;
    }
  }
  return modelName;
}
/**
 * Look up the maximum context window (in tokens) for a model.
 *
 * Checks the map for an exact match before resolving to the base family:
 * variants such as "gpt-4-32k" (32768) and "gpt-4-1106-preview" (128000)
 * have their own entries that would otherwise be shadowed by
 * `resolveModelName` collapsing them to "gpt-4" (8192).
 *
 * @param {string} modelName - Model identifier, exact or variant.
 * @returns {number} Context size in tokens; 4097 for unknown models.
 */
function getModelContextSize(modelName) {
  const exactSize = modelContextSizeMap.get(modelName);
  if (exactSize !== undefined) {
    return exactSize;
  }
  return modelContextSizeMap.get(resolveModelName(modelName)) ?? 4097;
}
// A token consisting entirely of whitespace.
const WHITESPACE_RE = /^\s+$/;
// Any CJK (Chinese/Japanese/Korean) character, including Hangul ranges.
const CJK_RE = /[\u4E00-\u9FFF\u3400-\u4DBF\u3000-\u303F\uFF00-\uFFEF\u30A0-\u30FF\u2E80-\u2EFF\u31C0-\u31EF\u3200-\u32FF\u3300-\u33FF\uAC00-\uD7AF\u1100-\u11FF\u3130-\u318F\uA960-\uA97F\uD7B0-\uD7FF]/;
// A whole-token number with an optional "." or "," decimal separator.
const NUMERIC_SEQUENCE_RE = /^\d+(?:[.,]\d+)?$/;
// Any single ASCII or typographic punctuation character.
const PUNCTUATION_RE = /[.,!?;'"„“”‘’\-(){}[\]<>:/\\|@#$%^&*+=`~]/;
// Splitter that keeps separators: runs of whitespace or punctuation become
// their own tokens via the capture group.
const CACHED_SPLIT_REGEX = new RegExp(`(\\s+|${PUNCTUATION_RE.source}+)`);
// Latin letters and digits, including Latin-1 accented ranges.
const ALPHANUMERIC_RE = /^[a-zA-Z0-9\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]+$/;
// Heuristic for English-like text: ~6 characters per GPT token.
const DEFAULT_AVERAGE_CHARS_PER_TOKEN = 6;
// Languages with distinctive characters tokenize more densely (~3 chars/token).
const LANGUAGE_METRICS = [
  { regex: /[äöüßẞ]/i, averageCharsPerToken: 3 },
  // German
  { regex: /[éèêëàâîïôûùüÿçœæ]/i, averageCharsPerToken: 3 },
  // French
  { regex: /[áéíóúüñ]/i, averageCharsPerToken: 3 }
  // Spanish
];
/**
 * Approximate the number of GPT tokens in a string without a real tokenizer.
 *
 * Heuristics per token (after splitting on whitespace/punctuation runs):
 * whitespace is free; CJK counts one token per code point; numbers and short
 * (<= 3 char) tokens count as one; punctuation runs count ~2 chars per token;
 * word-like tokens use a chars-per-token average (language-adjusted);
 * everything else counts one token per code point.
 *
 * @param {string} input - Text to estimate; falsy input yields 0.
 * @returns {number} Approximate token count.
 */
function approximateTokenSize(input) {
  if (!input)
    return 0;
  const tokenizedInput = input.split(CACHED_SPLIT_REGEX).filter(Boolean);
  let tokenCount = 0;
  for (const token of tokenizedInput) {
    if (WHITESPACE_RE.test(token)) {
      // Separator whitespace carries no tokens of its own.
      continue;
    } else if (CJK_RE.test(token)) {
      // Count code points (Array.from), not UTF-16 units.
      tokenCount += Array.from(token).length;
    } else if (NUMERIC_SEQUENCE_RE.test(token)) {
      // Numeric sequences average out to a single token.
      tokenCount += 1;
    } else if (token.length <= 3) {
      tokenCount += 1;
    } else if (PUNCTUATION_RE.test(token)) {
      tokenCount += token.length > 1 ? Math.ceil(token.length / 2) : 1;
    } else {
      // Only word-like tokens use the chars-per-token heuristic, so run the
      // language-metric regexes here rather than for every token (the
      // original tested all three regexes even for whitespace/CJK tokens).
      let averageCharsPerToken;
      for (const language of LANGUAGE_METRICS) {
        if (language.regex.test(token)) {
          averageCharsPerToken = language.averageCharsPerToken;
          break;
        }
      }
      if (ALPHANUMERIC_RE.test(token) || averageCharsPerToken) {
        tokenCount += Math.ceil(
          token.length / (averageCharsPerToken ?? DEFAULT_AVERAGE_CHARS_PER_TOKEN)
        );
      } else {
        // Fallback (emoji, symbols, mixed scripts): one token per code point.
        tokenCount += Array.from(token).length;
      }
    }
  }
  return tokenCount;
}
/**
 * Estimate how many tokens remain in a model's context window after the
 * prompt and a reserved response budget are accounted for.
 *
 * @param {object} options
 * @param {string} options.prompt - Prompt text to be sent to the model.
 * @param {string} options.modelName - Model whose context size applies.
 * @param {number} [options.maxTokensInResponse=0] - Tokens reserved for the reply.
 * @returns {number} Remaining token budget, clamped to a minimum of 0.
 */
function approximateMaxTokenSize({
  prompt,
  modelName,
  maxTokensInResponse = 0
}) {
  const contextSize = getModelContextSize(modelName);
  const promptTokens = approximateTokenSize(prompt);
  const remainingTokens = contextSize - promptTokens - maxTokensInResponse;
  return Math.max(0, remainingTokens);
}
/**
 * Check whether `input`'s approximate token count fits within `tokenLimit`.
 *
 * @param {string} input - Text to measure.
 * @param {number} tokenLimit - Maximum allowed token count (inclusive).
 * @returns {boolean} True when the estimate does not exceed the limit.
 */
function isWithinTokenLimit(input, tokenLimit) {
  const estimatedTokens = approximateTokenSize(input);
  return estimatedTokens <= tokenLimit;
}
// Public API (CommonJS named exports).
exports.approximateMaxTokenSize = approximateMaxTokenSize;
exports.approximateTokenSize = approximateTokenSize;
exports.getModelContextSize = getModelContextSize;
exports.isWithinTokenLimit = isWithinTokenLimit;
exports.resolveModelName = resolveModelName;
;