kitoken

Version:

Fast tokenizer for language models, supporting BPE, Unigram and WordPiece tokenization

62 lines (60 loc) • 2.78 kB

TypeScript

/* tslint:disable */
/* eslint-disable */
/**
 * Kitoken tokenizer.
 *
 * A fast and versatile tokenizer for language models.
 */
export class Kitoken {
  /**
   * Initializes the tokenizer from a serialized `kitoken` definition.
   *
   * @param data - The serialized `kitoken` definition.
   */
  constructor(data: Uint8Array);

  // NOTE(review): `free` and `[Symbol.dispose]` carry no documentation in this
  // declaration. Presumably they release the resources held by the instance
  // (the usual shape of generated bindings) — confirm against the generator.
  free(): void;
  [Symbol.dispose](): void;

  /**
   * Returns the configuration of the tokenizer.
   */
  config(): any;

  /**
   * Returns the metadata of the tokenizer.
   */
  meta(): any;

  /**
   * Encodes a single text into a sequence of tokens.
   *
   * @param text - The text to encode.
   * @param encode_specials - Selects which special tokens are tokenized with
   *   the special vocabulary instead of the regular vocabulary. Accepted are
   *   arrays of the strings "control", "priority", "unknown", and the boolean
   *   values `true` and `false`. With `true`, all special token categories
   *   from the special vocabulary are used.
   * @returns A list of tokens, or an error if no token for a part exists in
   *   the encoder and no unknown token id is set in the configuration.
   */
  encode(text: string, encode_specials: any): Uint32Array;

  /**
   * Encodes several texts into sequences of tokens.
   *
   * @param text - The texts to encode.
   * @param encode_specials - Selects which special tokens are tokenized with
   *   the special vocabulary instead of the regular vocabulary. Accepted are
   *   arrays of the strings "control", "priority", "unknown", and the boolean
   *   values `true` and `false`. With `true`, all special token categories
   *   from the special vocabulary are used.
   * @returns A list of lists of tokens, or an error if no token for a part
   *   exists in the encoder and no unknown token id is set in the
   *   configuration.
   */
  encode_all(text: string[], encode_specials: any): any[];

  /**
   * Decodes a single sequence of tokens into text.
   *
   * @param tokens - The token sequence to decode.
   * @param decode_specials - Selects which tokens from the special vocabulary
   *   are included in the output. Accepted are arrays of the strings
   *   "control", "priority", "unknown", and the boolean values `true` and
   *   `false`.
   * @returns A list of bytes, or an error if no byte sequence for a token
   *   exists in the decoder and no unknown token is set in the configuration.
   */
  decode(tokens: Uint32Array, decode_specials: any): Uint8Array;

  /**
   * Decodes several sequences of tokens into texts.
   *
   * @param tokens - The token sequences to decode.
   * @param decode_specials - Selects which tokens from the special vocabulary
   *   are included in the output. Accepted are arrays of the strings
   *   "control", "priority", "unknown", and the boolean values `true` and
   *   `false`.
   * @returns A list of lists of bytes, or an error if no byte sequence for a
   *   token exists in the decoder and no unknown token is set in the
   *   configuration.
   */
  decode_all(tokens: any[], decode_specials: any): any[];
}