UNPKG

gpt-tokenizer

Version:

A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models

github.com/niieani/gpt-tokenizer

niieani/gpt-tokenizer

94 lines • 3.37 kB

JavaScript

/* eslint-disable no-bitwise */ /* eslint-disable no-magic-numbers */ export const isAscii = (codePoint) => codePoint <= 0x7f; const HIGH_SURROGATE_START = 55_296; const HIGH_SURROGATE_END = 56_319; export function endsWithIncompleteUtfPairSurrogate(string) { if (string.length === 0) return false; // Check if the last character is a high surrogate // eslint-disable-next-line unicorn/prefer-code-point const lastCharCode = string.charCodeAt(string.length - 1); return (lastCharCode >= HIGH_SURROGATE_START && lastCharCode <= HIGH_SURROGATE_END); } function isValidUTF8(bytes) { let i = 0; while (i < bytes.length) { const byte1 = bytes[i]; let numBytes = 0; let codePoint = 0; // Determine the number of bytes in the current UTF-8 character if (byte1 <= 0x7f) { // 1-byte character (ASCII) numBytes = 1; codePoint = byte1; } else if ((byte1 & 0xe0) === 0xc0) { // 2-byte character numBytes = 2; codePoint = byte1 & 0x1f; if (byte1 <= 0xc1) return false; // Overlong encoding not allowed } else if ((byte1 & 0xf0) === 0xe0) { // 3-byte character numBytes = 3; codePoint = byte1 & 0x0f; } else if ((byte1 & 0xf8) === 0xf0) { // 4-byte character numBytes = 4; codePoint = byte1 & 0x07; if (byte1 > 0xf4) return false; // Code points above U+10FFFF not allowed } else { // Invalid first byte of UTF-8 character return false; } // Ensure there are enough continuation bytes if (i + numBytes > bytes.length) return false; // Process the continuation bytes for (let j = 1; j < numBytes; j++) { const byte = bytes[i + j]; if (byte === undefined || (byte & 0xc0) !== 0x80) return false; // Continuation bytes must start with '10' codePoint = (codePoint << 6) | (byte & 0x3f); } // Check for overlong encodings if (numBytes === 2 && codePoint < 0x80) return false; // Overlong 2-byte sequence if (numBytes === 3 && codePoint < 2_048) return false; // Overlong 3-byte sequence if (numBytes === 4 && codePoint < 65_536) return false; // Overlong 4-byte sequence // Check for surrogate halves (U+D800 to U+DFFF) if (codePoint >= 55_296 && codePoint <= 57_343) return false; // Check for code points above U+10FFFF if (codePoint > 1_114_111) return false; // Move to the next character i += numBytes; } return true; } const textDecoder = new TextDecoder('utf8', { fatal: false }); export function tryConvertToString(arr) { if (!isValidUTF8(arr)) { return undefined; } return textDecoder.decode(arr); } // Helper function to compare two Uint8Arrays lexicographically export function compareUint8Arrays(a, b) { const len = Math.min(a.length, b.length); for (let i = 0; i < len; i++) { if (a[i] !== b[i]) { return a[i] - b[i]; } } return a.length - b.length; } //# sourceMappingURL=utfUtil.js.map