ai-utils.js

Build AI applications, chatbots, and agents with JavaScript and TypeScript.

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.TikTokenTokenizer = void 0; const js_tiktoken_1 = require("js-tiktoken"); const never_js_1 = require("../../util/never.cjs"); /** * TikToken tokenizer for OpenAI language models. * * @see https://github.com/openai/tiktoken * * @example * const tokenizer = new TikTokenTokenizer({ model: "gpt-4" }); * * const text = "At first, Nox didn't know what to do with the pup."; * * const tokenCount = await countTokens(tokenizer, text); * const tokens = await tokenizer.tokenize(text); * const tokensAndTokenTexts = await tokenizer.tokenizeWithTexts(text); * const reconstructedText = await tokenizer.detokenize(tokens); */ class TikTokenTokenizer { /** * Get a TikToken tokenizer for a specific model or encoding. */ constructor(options) { Object.defineProperty(this, "tiktoken", { enumerable: true, configurable: true, writable: true, value: void 0 }); this.tiktoken = (0, js_tiktoken_1.getEncoding)("model" in options ? getEncodingNameForModel(options.model) : options.encoding); } async tokenize(text) { return this.tiktoken.encode(text); } async tokenizeWithTexts(text) { const tokens = this.tiktoken.encode(text); return { tokens, tokenTexts: tokens.map((token) => this.tiktoken.decode([token])), }; } async detokenize(tokens) { return this.tiktoken.decode(tokens); } } exports.TikTokenTokenizer = TikTokenTokenizer; // implemented here (instead of using js-tiktoken) to be able to quickly updated it // when new models are released function getEncodingNameForModel(model) { switch (model) { case "code-davinci-002": case "text-davinci-002": case "text-davinci-003": { return "p50k_base"; } case "ada": case "babbage": case "curie": case "davinci": case "text-ada-001": case "text-babbage-001": case "text-curie-001": case "gpt-3.5-turbo": case "gpt-3.5-turbo-0301": case "gpt-3.5-turbo-0613": case "gpt-3.5-turbo-16k": case "gpt-3.5-turbo-16k-0613": case "gpt-4": case "gpt-4-0314": case "gpt-4-0613": case "gpt-4-32k": case "gpt-4-32k-0314": case "gpt-4-32k-0613": case "text-embedding-ada-002": { return "cl100k_base"; } default: { (0, never_js_1.never)(model); throw new Error(`Unknown model: ${model}`); } } }