@lenml/tokenizers
Version:
A lightweight, dependency-free fork of transformers.js (tokenizers only)
82 lines (77 loc) • 3.09 kB
text/typescript
import {
AutoTokenizer as _AutoTokenizer,
PreTrainedTokenizer,
} from "./tokenizers/tokenizers";
import { NSTokenizerConfig, NSTokenizerJSON } from "./types";
/**
 * In-memory tokenizer model data accepted by `TokenizerLoader.fromPreTrained`.
 * Both fields are required by that method, which throws when either is missing.
 */
interface ITokenizerModelJsonData {
/** Tokenizer definition, typed as a partial `NSTokenizerJSON.Root`; passed as the first constructor argument of the tokenizer class. */
tokenizerJSON: Partial<NSTokenizerJSON.Root>;
/** Tokenizer configuration, typed as a partial `NSTokenizerConfig.Root`; its `tokenizer_class` selects the tokenizer class to construct. */
tokenizerConfig: Partial<NSTokenizerConfig.Root>;
}
/**
 * Locations of the two JSON documents fetched by
 * `TokenizerLoader.fromPreTrainedUrls`.
 */
interface ITokenizerModelUrls {
/** URL passed to `fetch` to retrieve the tokenizer JSON document. */
tokenizerJSON: string;
/** URL passed to `fetch` to retrieve the tokenizer configuration document. */
tokenizerConfig: string;
}
export class TokenizerLoader {
  /**
   * Creates a pre-trained tokenizer from the provided model data.
   *
   * The tokenizer class is looked up in `AutoTokenizer.TOKENIZER_CLASS_MAPPING`
   * by `tokenizerConfig.tokenizer_class` with a trailing "Fast" removed. When
   * the config names no class, the lookup name defaults to
   * "PreTrainedTokenizer". If the name is not present in the mapping, a warning
   * is logged and the base `PreTrainedTokenizer` is constructed instead.
   *
   * @param {ITokenizerModelJsonData} model - The model data containing the tokenizer JSON and configuration.
   * @return {PreTrainedTokenizer} pre-trained tokenizer.
   * @throws {Error} If the tokenizer JSON or configuration is missing.
   */
  static fromPreTrained(model: ITokenizerModelJsonData): PreTrainedTokenizer {
    const { tokenizerJSON, tokenizerConfig } = model;
    if (!tokenizerJSON) {
      throw new Error("tokenizerJSON is required.");
    }
    if (!tokenizerConfig) {
      throw new Error("tokenizerConfig is required.");
    }
    // Some tokenizers are saved with the "Fast" suffix, so we remove that if present.
    const tokenizerName =
      tokenizerConfig.tokenizer_class?.replace(/Fast$/, "") ??
      "PreTrainedTokenizer";
    let cls = (_AutoTokenizer as any).TOKENIZER_CLASS_MAPPING[tokenizerName];
    if (!cls) {
      console.warn(
        `Unknown tokenizer class "${tokenizerName}", attempting to construct from base class.`
      );
      cls = PreTrainedTokenizer;
    }
    return new cls(tokenizerJSON, tokenizerConfig);
  }

  /**
   * Creates a pre-trained tokenizer from the provided model URLs.
   *
   * Both documents are requested concurrently and parsed as JSON. Any
   * `options.tokenizerJSON` / `options.tokenizerConfig` values are then
   * shallow-merged over the fetched objects (top-level keys in `options` win)
   * before the result is handed to {@link TokenizerLoader.fromPreTrained}.
   *
   * @param {ITokenizerModelUrls} model - The model URLs containing the tokenizer JSON and configuration.
   * @param {Object} [options] - Optional parameters.
   * @param {any} [options.fetch] - The fetch function to use for making HTTP requests. Defaults to the global `fetch`.
   * @param {Partial<NSTokenizerJSON.Root>} [options.tokenizerJSON] - Additional tokenizer JSON data to merge with the fetched data.
   * @param {Partial<NSTokenizerConfig.Root>} [options.tokenizerConfig] - Additional tokenizer configuration data to merge with the fetched data.
   * @return {Promise<PreTrainedTokenizer>} A promise that resolves to the pre-trained tokenizer.
   * @throws {Error} If no fetch implementation is available, or if either
   *   request completes with a non-OK HTTP status.
   */
  static async fromPreTrainedUrls(
    model: ITokenizerModelUrls,
    options?: {
      fetch?: any;
    } & Partial<ITokenizerModelJsonData>
  ): Promise<PreTrainedTokenizer> {
    // Prefer the caller's fetch; otherwise fall back to the global one, bound
    // to globalThis so it can be called as a bare function.
    const fetchImpl: typeof globalThis.fetch | undefined =
      options?.fetch ??
      (typeof globalThis.fetch === "function"
        ? globalThis.fetch.bind(globalThis)
        : undefined);
    if (fetchImpl == null) {
      throw new Error(
        "No fetch implementation available: pass `options.fetch` or run in an environment that provides a global `fetch`."
      );
    }

    const loadJson = async (url: string, label: string) => {
      const res = await fetchImpl(url);
      // Only an explicit `ok === false` counts as failure, so minimal custom
      // fetch shims that expose nothing but `json()` keep working.
      if (res.ok === false) {
        const statusText = res.statusText ? ` ${res.statusText}` : "";
        throw new Error(
          `Failed to fetch ${label} from "${url}": HTTP ${res.status}${statusText}`
        );
      }
      return res.json();
    };

    // Both calls are started before either is awaited, so the requests overlap.
    const [tokenizerJSON, tokenizerConfig] = await Promise.all([
      loadJson(model.tokenizerJSON, "tokenizerJSON"),
      loadJson(model.tokenizerConfig, "tokenizerConfig"),
    ]);

    return TokenizerLoader.fromPreTrained({
      tokenizerJSON: {
        ...tokenizerJSON,
        ...options?.tokenizerJSON,
      },
      tokenizerConfig: {
        ...tokenizerConfig,
        ...options?.tokenizerConfig,
      },
    });
  }
}