gpt-tokenizer

Version:

A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models

58 lines (49 loc) • 1.96 kB

text/typescript

import type {
  BytePairEncodingConfig,
  RawBytePairRanks,
} from './BytePairEncodingCore.js'
import { Cl100KBase } from './encodingParams/cl100k_base.js'
import { O200KBase } from './encodingParams/o200k_base.js'
import { P50KBase } from './encodingParams/p50k_base.js'
import { P50KEdit } from './encodingParams/p50k_edit.js'
import { R50KBase } from './encodingParams/r50k_base.js'
import type { EncodingName, ModelName } from './mapping.js'

/**
 * Parameters describing a single encoding: everything in
 * {@link BytePairEncodingConfig} plus the text-splitting pattern and the
 * special-token table declared below.
 */
export interface EncodingParams extends BytePairEncodingConfig {
  /**
   * Expected size of the complete vocabulary, counting regular and special
   * tokens together. It exists as a consistency check: the number of regular
   * tokens plus the number of special tokens should add up to this value.
   */
  expectedVocabularySize?: number
  /**
   * Pattern used to break input text into pieces before encoding. The pieces
   * can be contractions, letters, numbers, or other characters, and line
   * terminators and spaces receive dedicated handling. The pattern is
   * intricate because it has to cope with a wide range of text-processing
   * cases.
   */
  tokenSplitRegex: RegExp
  /**
   * Lookup from special-token text to a number.
   * NOTE(review): presumably the number is the token id — confirm against the
   * encoder that consumes this map.
   */
  specialTokensEncoder: Map<string, number>
  /** Optional model name attached to these parameters. */
  modelName?: ModelName
}

/**
 * Callback that supplies the raw byte-pair ranks for a given encoding name.
 */
export type GetMergeableRanksFn = (
  encodingName: EncodingName,
) => RawBytePairRanks

/**
 * Builds the {@link EncodingParams} for a named encoding.
 *
 * The ranks are requested from `getMergeableRanks` first, using the name
 * exactly as the caller supplied it. The matching parameter factory is then
 * chosen by comparing the lower-cased name against the known encodings.
 *
 * @param encodingName - Name of the encoding to build parameters for.
 * @param getMergeableRanks - Provider of the raw byte-pair ranks for the
 *   encoding; invoked exactly once, before the name is matched.
 * @returns The parameters produced by the factory for the matched encoding.
 * @throws Error with the message `Unknown encoding name: <name>` when the
 *   lower-cased name matches none of the supported encodings.
 */
export function getEncodingParams(
  encodingName: EncodingName,
  getMergeableRanks: GetMergeableRanksFn,
): EncodingParams {
  // Fetch the ranks before any name matching so the provider is always called,
  // even when the name later turns out to be unsupported.
  const ranks = getMergeableRanks(encodingName)

  // Only the dispatch is case-insensitive; the provider above saw the raw name.
  const normalizedName = encodingName.toLowerCase()

  if (normalizedName === 'r50k_base') {
    return R50KBase(ranks)
  }
  if (normalizedName === 'p50k_base') {
    return P50KBase(ranks)
  }
  if (normalizedName === 'p50k_edit') {
    return P50KEdit(ranks)
  }
  if (normalizedName === 'cl100k_base') {
    return Cl100KBase(ranks)
  }
  if (normalizedName === 'o200k_base') {
    return O200KBase(ranks)
  }

  throw new Error(`Unknown encoding name: ${encodingName}`)
}