gpt-tokenizer
Version:
A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models
97 lines (81 loc) • 3.01 kB
text/typescript
/* eslint-disable no-bitwise */
/* eslint-disable no-magic-numbers */
/**
 * Tells whether a code point lies within the 7-bit ASCII range.
 *
 * @param codePoint - Numeric code point to classify.
 * @returns `true` when `codePoint` is at most U+007F.
 */
export const isAscii = (codePoint: number): boolean => codePoint <= 0x7f
/** First UTF-16 code unit of the high (lead) surrogate range, U+D800. */
const HIGH_SURROGATE_START = 0xd800
/** Last UTF-16 code unit of the high (lead) surrogate range, U+DBFF. */
const HIGH_SURROGATE_END = 0xdbff

/**
 * Reports whether a string stops on a high surrogate, i.e. on the first half
 * of a UTF-16 surrogate pair whose low half has not been appended yet.
 *
 * @param string - Text to inspect; only its final UTF-16 code unit is read.
 * @returns `true` when the final code unit lies in U+D800..U+DBFF, `false`
 *   otherwise (including for the empty string).
 */
export function endsWithIncompleteUtfPairSurrogate(string: string): boolean {
  const lastIndex = string.length - 1
  if (lastIndex < 0) return false

  // The raw UTF-16 code unit is wanted here, not a combined code point.
  // eslint-disable-next-line unicorn/prefer-code-point
  const tailUnit = string.charCodeAt(lastIndex)

  const belowRange = tailUnit < HIGH_SURROGATE_START
  const aboveRange = tailUnit > HIGH_SURROGATE_END
  return !(belowRange || aboveRange)
}
/**
 * Checks that a byte array is a well-formed UTF-8 sequence.
 *
 * The input is rejected when it contains a byte that cannot start a sequence,
 * a sequence cut short by the end of the array, a malformed continuation
 * byte, an overlong encoding, an encoded surrogate half (U+D800..U+DFFF), or
 * a code point above U+10FFFF.
 *
 * @param bytes - Raw bytes to validate.
 * @returns `true` when every sequence in `bytes` is valid (also for an empty
 *   array), `false` otherwise.
 */
function isValidUTF8(bytes: Uint8Array): boolean {
  const total = bytes.length
  let offset = 0

  while (offset < total) {
    const lead = bytes[offset]!

    // ASCII bytes stand alone and need no further checks.
    if (lead < 0x80) {
      offset += 1
      continue
    }

    let width: number
    let scalar: number
    // Smallest code point that genuinely requires `width` bytes; anything
    // lower is an overlong encoding.
    let minCodePoint: number

    if (lead >= 0xc2 && lead <= 0xdf) {
      // 2-byte lead (0xC0 and 0xC1 could only produce overlong forms)
      width = 2
      scalar = lead & 0x1f
      minCodePoint = 0x80
    } else if (lead >= 0xe0 && lead <= 0xef) {
      // 3-byte lead
      width = 3
      scalar = lead & 0x0f
      minCodePoint = 0x800
    } else if (lead >= 0xf0 && lead <= 0xf4) {
      // 4-byte lead (0xF5..0xF7 would exceed U+10FFFF)
      width = 4
      scalar = lead & 0x07
      minCodePoint = 0x10000
    } else {
      // Stray continuation byte or a byte that never appears in UTF-8
      return false
    }

    // The whole sequence must fit in the remaining input.
    if (total - offset < width) return false

    // Fold in the continuation bytes; each must look like 10xxxxxx.
    const end = offset + width
    for (let k = offset + 1; k < end; k++) {
      const trail = bytes[k]!
      if ((trail & 0xc0) !== 0x80) return false
      scalar = (scalar << 6) | (trail & 0x3f)
    }

    const isOverlong = scalar < minCodePoint
    const isSurrogate = scalar >= 0xd800 && scalar <= 0xdfff
    const isOutOfRange = scalar > 0x10ffff
    if (isOverlong || isSurrogate || isOutOfRange) return false

    offset = end
  }

  return true
}
/**
 * Shared UTF-8 decoder used by {@link tryConvertToString}.
 *
 * `ignoreBOM: true` keeps a leading U+FEFF in the decoded text. Without it the
 * decoder silently drops the bytes EF BB BF at the start of the input, so a
 * valid byte sequence would decode to a string that no longer round-trips to
 * the same bytes (and a BOM-only input would decode to the empty string).
 * `fatal` stays `false` because input is validated before decoding.
 */
const textDecoder = new TextDecoder('utf8', { fatal: false, ignoreBOM: true })

/**
 * Decodes a byte array to a string, but only when the bytes are valid UTF-8.
 *
 * @param arr - Raw bytes to decode.
 * @returns The decoded string, including any leading U+FEFF, or `undefined`
 *   when `isValidUTF8` rejects the input.
 */
export function tryConvertToString(arr: Uint8Array): string | undefined {
  if (!isValidUTF8(arr)) {
    return undefined
  }
  return textDecoder.decode(arr)
}
/**
 * Three-way lexicographic comparison of two byte arrays.
 *
 * Bytes are compared position by position over the shared prefix; if the
 * prefix is identical, the shorter array orders first.
 *
 * @param a - Left-hand array.
 * @param b - Right-hand array.
 * @returns A negative number when `a` sorts before `b`, a positive number
 *   when it sorts after, and `0` when both hold the same bytes. The magnitude
 *   is the difference of the first mismatching bytes, or of the lengths.
 */
export function compareUint8Arrays(a: Uint8Array, b: Uint8Array): number {
  const shared = a.length < b.length ? a.length : b.length

  let idx = 0
  while (idx < shared) {
    const delta = a[idx]! - b[idx]!
    if (delta !== 0) return delta
    idx += 1
  }

  return a.length - b.length
}