"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.TikTokenTokenizer = void 0;
const js_tiktoken_1 = require("js-tiktoken");
const never_js_1 = require("../../util/never.cjs");
/**
* TikToken tokenizer for OpenAI language models.
*
* @see https://github.com/openai/tiktoken
*
* @example
* const tokenizer = new TikTokenTokenizer({ model: "gpt-4" });
*
* const text = "At first, Nox didn't know what to do with the pup.";
*
* const tokenCount = await countTokens(tokenizer, text);
* const tokens = await tokenizer.tokenize(text);
* const tokensAndTokenTexts = await tokenizer.tokenizeWithTexts(text);
* const reconstructedText = await tokenizer.detokenize(tokens);
*/
class TikTokenTokenizer {
/**
* Get a TikToken tokenizer for a specific model or encoding.
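 *
 * A brief usage sketch (assuming either a `model` name or an `encoding` name
 * is passed, as handled in the constructor below):
 *
 * @example
 * const byModel = new TikTokenTokenizer({ model: "gpt-3.5-turbo" });
 * const byEncoding = new TikTokenTokenizer({ encoding: "cl100k_base" });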
*/
constructor(options) {
Object.defineProperty(this, "tiktoken", {
enumerable: true,
configurable: true,
writable: true,
value: void 0
});
this.tiktoken = (0, js_tiktoken_1.getEncoding)("model" in options
? getEncodingNameForModel(options.model)
: options.encoding);
}
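    /**
     * Encodes the given text into an array of token ids using the underlying
     * tiktoken encoding.
     */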
async tokenize(text) {
return this.tiktoken.encode(text);
}
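    /**
     * Encodes the text and additionally decodes each token id back into its
     * text fragment, returning both the token ids and the per-token texts.
     */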
async tokenizeWithTexts(text) {
const tokens = this.tiktoken.encode(text);
return {
tokens,
tokenTexts: tokens.map((token) => this.tiktoken.decode([token])),
};
}
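    /**
     * Decodes an array of token ids back into text.
     */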
async detokenize(tokens) {
return this.tiktoken.decode(tokens);
}
}
exports.TikTokenTokenizer = TikTokenTokenizer;
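// Usage sketch (illustrative; not part of the compiled module). `tokenizeWithTexts`
// returns both the token ids and the text fragment that each id decodes to:
//
//   const tokenizer = new TikTokenTokenizer({ encoding: "cl100k_base" });
//   const { tokens, tokenTexts } = await tokenizer.tokenizeWithTexts("Hello world");
//   // `tokens` holds the numeric token ids; `tokenTexts` holds the matching text
//   // pieces, which concatenate back to "Hello world" for this input.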
// implemented here (instead of using js-tiktoken) to be able to quickly update it
// when new models are released
function getEncodingNameForModel(model) {
switch (model) {
case "code-davinci-002":
case "text-davinci-002":
case "text-davinci-003": {
return "p50k_base";
}
case "ada":
case "babbage":
case "curie":
case "davinci":
case "text-ada-001":
case "text-babbage-001":
case "text-curie-001":
case "gpt-3.5-turbo":
case "gpt-3.5-turbo-0301":
case "gpt-3.5-turbo-0613":
case "gpt-3.5-turbo-16k":
case "gpt-3.5-turbo-16k-0613":
case "gpt-4":
case "gpt-4-0314":
case "gpt-4-0613":
case "gpt-4-32k":
case "gpt-4-32k-0314":
case "gpt-4-32k-0613":
case "text-embedding-ada-002": {
return "cl100k_base";
}
default: {
(0, never_js_1.never)(model);
throw new Error(`Unknown model: ${model}`);
}
}
}