gpt-tokenizer
Version: (not captured in this excerpt)
A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models.
86 lines (85 loc) • 11 kB
TypeScript
import { ALL_SPECIAL_TOKENS } from './constants.js';
import { type EncodingName, type ModelName } from './mapping.js';
import { type GetMergeableRanksFn } from './modelParams.js';
import { type CostEstimate } from './models.js';
/**
 * Options controlling how special tokens in the input text are treated
 * by `encode` / `encodeGenerator` / `countTokens`.
 */
export interface EncodeOptions {
/**
 * A list of special tokens that are allowed in the input.
 * If set to 'all', all special tokens are allowed except those in disallowedSpecial.
 * @default undefined
 */
allowedSpecial?: Set<string> | typeof ALL_SPECIAL_TOKENS;
/**
 * A list of special tokens that are disallowed in the input.
 * If set to 'all', all special tokens are disallowed except those in allowedSpecial.
 * @default 'all'
 */
disallowedSpecial?: Set<string> | typeof ALL_SPECIAL_TOKENS;
}
/**
 * A single message of an OpenAI chat, as consumed by `encodeChat` and
 * `encodeChatGenerator`.
 */
export interface ChatMessage {
/** Author of the message. NOTE(review): behavior when omitted is implementation-defined — confirm against the encoder. */
role?: 'system' | 'user' | 'assistant';
/** Optional participant name; presumably included in the encoded message header when present — TODO confirm. */
name?: string;
/** The message text to tokenize. */
content: string;
}
/**
 * Options for chat encoding.
 * NOTE(review): not referenced by any signature visible in this excerpt —
 * verify where the implementation accepts it.
 */
export interface EncodeChatOptions {
/** Presumably a partial assistant response used to prime the encoded chat — TODO confirm semantics in the implementation. */
primeWithAssistantResponse?: string;
}
/**
 * Public tokenizer API for a single BPE encoding.
 *
 * Instances are obtained through the static factories `getEncodingApi` /
 * `getEncodingApiForModel`; the constructor is private.
 *
 * Fix: the generated declaration inlined the full `ModelName` union
 * (~100 string literals) three times; it is replaced here by the
 * `ModelName` alias imported from './mapping.js', which the class already
 * uses for its `modelName` property — type-identical, but readable and
 * immune to drift between the three copies.
 */
export declare class GptEncoding {
  /** Special-token string constants recognized by the encoder. */
  static EndOfPrompt: string;
  static EndOfText: string;
  static FimMiddle: string;
  static FimPrefix: string;
  static FimSuffix: string;
  /** Model this instance was created for (set when constructed via `getEncodingApiForModel`). */
  modelName?: ModelName;
  private bytePairEncodingCoreProcessor;
  private specialTokensEncoder;
  private specialTokensSet;
  private allSpecialTokenRegex;
  private defaultSpecialTokenConfig;
  /** Total number of tokens in the vocabulary. */
  readonly vocabularySize: number;
  private constructor();
  /** Creates an API instance for a named encoding, using the supplied rank loader. */
  static getEncodingApi(encodingName: EncodingName, getMergeableRanks: GetMergeableRanksFn): GptEncoding;
  /** Creates an API instance for the encoding used by the given model. */
  static getEncodingApiForModel(modelName: ModelName, getMergeableRanks: GetMergeableRanksFn): GptEncoding;
  private processSpecialTokens;
  /**
   * Streaming variant of `encode`: yields chunks of token ids and returns the
   * total token count.
   */
  encodeGenerator(lineToEncode: string, encodeOptions?: EncodeOptions): Generator<number[], number, undefined>;
  /** Encodes a string into an array of token ids. */
  encode(lineToEncode: string, encodeOptions?: EncodeOptions): number[];
  /**
   * Progressively tokenizes an OpenAI chat.
   * Warning: gpt-3.5-turbo and gpt-4 chat format may change over time.
   * Returns tokens assuming the 'gpt-3.5-turbo-0301' / 'gpt-4-0314' format.
   * Based on OpenAI's guidelines: https://github.com/openai/openai-python/blob/main/chatml.md
   * Also mentioned in section 6 of this document: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
   */
  encodeChatGenerator(chat: Iterable<ChatMessage>, model?: ModelName): Generator<number[], void, undefined>;
  /**
   * Encodes a chat into a single array of tokens.
   * Warning: gpt-3.5-turbo and gpt-4 chat format may change over time.
   * Returns tokens assuming the 'gpt-3.5-turbo-0301' / 'gpt-4-0314' format.
   * Based on OpenAI's guidelines: https://github.com/openai/openai-python/blob/main/chatml.md
   * Also mentioned in section 6 of this document: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
   */
  encodeChat(chat: readonly ChatMessage[], model?: ModelName): number[];
  /**
   * @returns {false | number} false if token limit is exceeded, otherwise the number of tokens
   */
  isWithinTokenLimit(input: string | Iterable<ChatMessage>, tokenLimit: number): false | number;
  /**
   * Counts the number of tokens in the input.
   * @returns {number} The number of tokens.
   */
  countTokens(input: string | Iterable<ChatMessage>, encodeOptions?: EncodeOptions): number;
  /** Sets the size of the BPE merge cache. */
  setMergeCacheSize(size: number): void;
  /** Clears the BPE merge cache. */
  clearMergeCache(): void;
  /** Decodes a sequence of token ids back into a string. */
  decode(inputTokensToDecode: Iterable<number>): string;
  /** Streaming variant of `decode`: yields decoded string fragments. */
  decodeGenerator(inputTokensToDecode: Iterable<number>): Generator<string, void, void>;
  /** Async-streaming variant of `decode` over an async iterable of token ids. */
  decodeAsyncGenerator(inputTokensToDecode: AsyncIterable<number>): AsyncGenerator<string, void>;
  /** Async variant of `decode`; resolves to the full decoded string. */
  decodeAsync(inputTokensToDecode: AsyncIterable<number>): Promise<string>;
  /**
   * Estimates the cost of processing a given token count using the model's pricing.
   *
   * @param tokenCount - The number of tokens to estimate cost for
   * @param modelName - Optional model name to use for cost calculation (defaults to this.modelName)
   * @returns Cost estimate object with applicable price components (input, output, batchInput, batchOutput)
   */
  estimateCost(tokenCount: number, modelName?: ModelName): CostEstimate;
}