/**
 * tokenx — fast token estimation at 94% accuracy of a full tokenizer in a 2 kB bundle.
 * Bundled TypeScript type declarations (48 lines • 1.88 kB). Version: unspecified.
 */
//#region src/types.d.ts
/**
 * Configuration options for token estimation.
 *
 * All fields are optional; omitted fields presumably fall back to the
 * library's built-in defaults (NOTE(review): implementation not visible
 * in this declaration file — confirm against src/index).
 */
interface TokenEstimationOptions {
/** Default average characters per token when no language-specific rule applies */
defaultCharsPerToken?: number;
/** Custom language configurations to override defaults */
languageConfigs?: LanguageConfig[];
}
/**
 * Language-specific token estimation configuration.
 *
 * Pairs a detection pattern with a per-language character-to-token ratio.
 * NOTE(review): exact matching semantics (first match wins? all matches
 * averaged?) are not visible here — confirm in the implementation.
 */
interface LanguageConfig {
/** Regular expression to detect the language */
pattern: RegExp;
/** Average number of characters per token for this language */
averageCharsPerToken: number;
}
/**
 * Configuration options for splitting text by tokens.
 *
 * Extends {@link TokenEstimationOptions}, so estimation tuning
 * (default ratio, language configs) applies to chunking as well.
 */
interface SplitByTokensOptions extends TokenEstimationOptions {
/** Number of tokens to overlap between consecutive chunks (default: 0) */
overlap?: number;
}
//#endregion
//#region src/index.d.ts
/**
 * Checks if a text string is within a specified token limit.
 *
 * @param text - Text whose token count is estimated
 * @param tokenLimit - Maximum number of tokens allowed
 * @param options - Optional tuning for the token estimation heuristics
 * @returns Whether the estimated token count is within `tokenLimit`
 *   (NOTE(review): inclusive/exclusive boundary not visible here — confirm)
 */
declare function isWithinTokenLimit(text: string, tokenLimit: number, options?: TokenEstimationOptions): boolean;
/**
 * Legacy alias with the same signature as `estimateTokenCount`
 * (declared via `typeof`, so the types are guaranteed identical).
 *
 * @deprecated Use `estimateTokenCount` instead.
 */
declare const approximateTokenSize: typeof estimateTokenCount;
/**
 * Estimates the number of tokens in a text string using heuristic rules.
 *
 * @param text - Text to analyze; optional (NOTE(review): presumably returns 0
 *   when omitted — confirm against the implementation)
 * @param options - Optional tuning for the estimation heuristics
 * @returns Estimated token count
 */
declare function estimateTokenCount(text?: string, options?: TokenEstimationOptions): number;
/**
 * Extracts a portion of text based on token positions, similar to
 * Array.prototype.slice().
 *
 * @param text - Source text to slice
 * @param start - Token index to start from (slice semantics suggest negative
 *   indices count from the end — NOTE(review): confirm in implementation)
 * @param end - Token index to end before (exclusive, per slice convention)
 * @param options - Optional tuning for the token estimation heuristics
 * @returns The extracted substring
 */
declare function sliceByTokens(text: string, start?: number, end?: number, options?: TokenEstimationOptions): string;
/**
 * Splits text into chunks based on token count.
 *
 * @param text - Text to split
 * @param tokensPerChunk - Target number of tokens per chunk
 * @param options - Chunking options; `overlap` sets how many tokens
 *   consecutive chunks share (default: 0)
 * @returns Array of text chunks, in original order
 */
declare function splitByTokens(text: string, tokensPerChunk: number, options?: SplitByTokensOptions): string[];
//#endregion
export { LanguageConfig, SplitByTokensOptions, TokenEstimationOptions, approximateTokenSize, estimateTokenCount, isWithinTokenLimit, sliceByTokens, splitByTokens };