gpt-tokenizer

A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models
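The file shown below, generateJsBpe.js, is the package's build script: it converts OpenAI's .tiktoken BPE rank files into plain JavaScript modules that the tokenizer imports at runtime. For context, the package's user-facing API is the encoder/decoder itself; a minimal usage sketch, assuming the top-level `encode`/`decode` exports described in the project README, looks roughly like this:

import { encode, decode } from 'gpt-tokenizer';

const text = 'Hello, world!';
const tokens = encode(text); // text -> array of BPE token ids
console.log(tokens.length);  // number of tokens the model would see
console.log(decode(tokens)); // token ids -> original text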

generateJsBpe.js (98 lines, 4.06 kB)
/* eslint-disable no-console */
import * as fs from 'fs/promises';
import * as path from 'path';
import { fileURLToPath } from 'url';

const DEBUG = process.env.DEBUG === 'true';

const processFilesInDirectory = async (directoryPath, fn) => {
  try {
    const files = await fs.readdir(directoryPath, { withFileTypes: true });
    for (const file of files) {
      // eslint-disable-next-line no-continue
      if (!file.isFile()) continue;
      const filePath = path.join(directoryPath, file.name);
      // eslint-disable-next-line no-await-in-loop
      await fn(filePath);
    }
  } catch (error) {
    console.error('An error occurred:', error);
  }
};

// eslint-disable-next-line no-underscore-dangle
const __dirname = path.dirname(fileURLToPath(import.meta.url));

const textDecoder = new TextDecoder('utf8', { fatal: true });
const textEncoder = new TextEncoder();

function safeDecodeUtf8(bytes) {
  try {
    const v = textDecoder.decode(bytes);
    const encoded = textEncoder.encode(v);
    if (encoded.byteLength !== bytes.byteLength) {
      if (DEBUG) {
        console.log('Mismatch:', new Uint8Array(bytes), encoded);
      }
      return undefined;
    }
    return v;
  } catch {
    return undefined;
  }
}

await processFilesInDirectory(path.join(__dirname, '../../data'), async (filePath) => {
  if (!filePath.endsWith('.tiktoken')) return;
  const modelName = path.basename(filePath, '.tiktoken');
  console.log(`Processing ${modelName}`);
  const bpeFile = await fs.readFile(filePath, 'utf8');
  const lines = bpeFile.split('\n');
  const encoder = lines.slice(0, -1).map((x) => {
    const [token, rank] = x.split(' ');
    if (!token || token.length === 0 || !rank || rank.length === 0) {
      throw new Error(`Invalid token encoding: ${x}`);
    }
    const tokenArray = Buffer.from(token, 'base64');
    return [tokenArray, Number.parseInt(rank, 10)];
  });
  console.log(`${modelName} token count: ${encoder.length}`);

  const stringifiedBpeItems = [];
  let lastRank = 0;
  for (const [token, rank] of encoder) {
    const decoded = safeDecodeUtf8(token) ?? token;
    // add array holes if rank is not consecutive
    let holesToInsert = rank - lastRank - 1;
    while (holesToInsert-- > 0) {
      stringifiedBpeItems.push('');
    }
    const rankPrefix = DEBUG ? `\n/** ${rank} = */ ` : '';
    stringifiedBpeItems.push(
      rankPrefix + (typeof decoded === 'string' ? JSON.stringify(decoded) : `[${token.join(',')}]`),
    );
    lastRank = rank;
  }

  // if the array is too large, Safari on iOS will throw RangeError: Maximum call stack size exceeded.
  // so we split the array into smaller chunks
  const chunkSize = 100_000;
  const jsCodeConstsForEachChunk = [];
  const chunks = stringifiedBpeItems.length / chunkSize;
  for (let i = 0; i < chunks; i++) {
    jsCodeConstsForEachChunk.push(
      `const c${i} = [${stringifiedBpeItems.slice(i * chunkSize, (i + 1) * chunkSize)}]`,
    );
  }
  // now let's create the code that will create a single array from the chunks using .concat
  const jsCodeBpeArray = `c0.concat(${jsCodeConstsForEachChunk
    .slice(1)
    .map((_, i) => `c${i + 1}`)
    .join(', ')})`;
  // now reset the helper arrays to free up memory
  const jsCodeToResetHelperArrays = jsCodeConstsForEachChunk.map((_, i) => `c${i}.length = 0;`);

  await fs.mkdir(path.join(__dirname, '../bpeRanks'), { recursive: true });
  await fs.writeFile(
    path.join(__dirname, `../bpeRanks/${modelName}.js`),
    `/* eslint-disable */\n// @ts-nocheck\n// prettier-ignore
${jsCodeConstsForEachChunk.join('\n')}
/** @type {(string | number[])[]} */
const bpe = ${jsCodeBpeArray};
${jsCodeToResetHelperArrays.join('\n')}
export default bpe;`,
  );
  console.log(`Wrote ${modelName}.js`);
});
//# sourceMappingURL=generateJsBpe.js.map
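For reference, each input .tiktoken file is a plain-text list of base64-encoded token bytes and their rank, one pair per line, and the generated bpeRanks/<model>.js module has the chunked shape assembled above. The following is an illustrative sketch only (made-up, tiny chunks; the real files contain one entry per rank, split into 100,000-item chunks, with array holes for any skipped ranks):

// data/<model>.tiktoken (input): "base64Token rank" pairs, one per line, e.g.
//   IQ== 0
//   Ig== 1
//   Iw== 2

// bpeRanks/<model>.js (generated output), illustrative shape only:
/* eslint-disable */
// @ts-nocheck
// prettier-ignore
const c0 = ["!", "\"", "#"]; // tokens that decode as valid UTF-8 are stored as strings
const c1 = [[226, 128]];     // non-decodable byte sequences are stored as number arrays
/** @type {(string | number[])[]} */
const bpe = c0.concat(c1);
c0.length = 0; // free the chunk arrays once the combined array exists
c1.length = 0;
export default bpe;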