gpt-tokenizer
A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models
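A minimal usage sketch (the import path and the encode/decode names are assumptions based on the per-model modules generated by the script below, not the package's documented API):

// Sketch only: entry point and function names are assumptions.
import { encode, decode } from 'gpt-tokenizer'

const tokens = encode('hello world') // text -> array of BPE token ids
const text = decode(tokens)          // token ids -> back to 'hello world'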
generateByModel.js: the build script that writes a model/<modelName>.ts entry point for every model listed in modelToEncodingMap.
import * as fs from 'fs/promises';
import * as path from 'path';
import { fileURLToPath } from 'url';
import { chatModelParams, modelToEncodingMap } from '../mapping.js';
// eslint-disable-next-line no-underscore-dangle
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const chatModels = Object.keys(chatModelParams);
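// cl100k_base.ts serves as the template that chat-model entry points are generated from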
const template = await fs.readFile(path.join(__dirname, '../encoding/cl100k_base.ts'), 'utf8');
await Promise.all(Object.entries(modelToEncodingMap).map(async ([modelName, encoding]) => {
  await fs.mkdir(path.join(__dirname, '../model'), { recursive: true });
  const isChatModel = chatModels.includes(modelName);
  // Chat models get a copy of the cl100k_base encoding module rewired to their own
  // encoding; every other model gets a plain re-export of its encoding module.
  const content = isChatModel
    ? template
        .replace(`getEncodingApi('cl100k_base'`, `getEncodingApiForModel('${modelName}'`)
        .replace('\nconst api =', '// prettier-ignore\nconst api =')
        .replaceAll(`cl100k_base.js`, `${encoding}.js`)
    : `// eslint-disable-next-line no-restricted-exports, import/no-default-export\nexport { default } from '../encoding/${encoding}.js'\nexport * from '../encoding/${encoding}.js'\n`;
  await fs.writeFile(path.join(__dirname, `../model/${modelName}.ts`), content);
  // eslint-disable-next-line no-console
  console.log(`wrote model/${modelName}.ts`);
}));
//# sourceMappingURL=generateByModel.js.map
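For a non-chat model, the generated model/<modelName>.ts is nothing more than a re-export of its encoding module. As an illustration only, a model that mapping.js pairs with the r50k_base encoding (a hypothetical pairing here) would get a file containing:

// Illustrative output for a hypothetical model mapped to r50k_base.
// eslint-disable-next-line no-restricted-exports, import/no-default-export
export { default } from '../encoding/r50k_base.js'
export * from '../encoding/r50k_base.js'

Chat models instead receive a full copy of the cl100k_base module with its constructor call and import paths rewritten to their own encoding, which is why the template file is read once up front.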