gpt-tokenizer
A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models
/* eslint-disable no-magic-numbers */
import type { RawBytePairRanks } from '../BytePairEncodingCore.js'
import { type EncodingParams } from '../modelParams.js'
import { EndOfText } from '../specialTokens.js'
import { R50K_TOKEN_SPLIT_REGEX } from './constants.js'

/**
 * Builds the `EncodingParams` for the `p50k_base` encoding, used by
 * models such as text-davinci-002/003 and the Codex family.
 */
export function P50KBase(
  bytePairRankDecoder: RawBytePairRanks,
): EncodingParams {
  return {
    // 50,280 BPE merge ranks plus the single special token below
    expectedVocabularySize: 50_281,
    // p50k reuses the r50k (GPT-2) pre-tokenization split pattern
    tokenSplitRegex: R50K_TOKEN_SPLIT_REGEX,
    bytePairRankDecoder,
    // <|endoftext|> is the only special token, assigned rank 50,256
    specialTokensEncoder: new Map<string, number>([[EndOfText, 50_256]]),
  }
}
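
For orientation, a minimal usage sketch follows. The import path for this module and the origin of the raw rank data are assumptions for illustration; in the published package the rank data is bundled, and consumers normally use the higher-level encode/decode API rather than calling this factory directly.

// Hypothetical usage sketch: `rawP50kRanks` stands in for the real
// rank data that the library ships, and the import paths are assumed.
import type { RawBytePairRanks } from '../BytePairEncodingCore.js'
import { EndOfText } from '../specialTokens.js'
import { P50KBase } from './P50KBase.js'

declare const rawP50kRanks: RawBytePairRanks

const params = P50KBase(rawP50kRanks)
console.log(params.expectedVocabularySize) // 50281
console.log(params.specialTokensEncoder.get(EndOfText)) // 50256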