kitoken
Version:
Fast tokenizer for language models, supporting BPE, Unigram and WordPiece tokenization
62 lines (60 loc) • 2.78 kB
TypeScript
/* tslint:disable */
/* eslint-disable */
/**
 * Kitoken tokenizer.
 *
 * A fast and versatile tokenizer for language models.
 */
export class Kitoken {
  /**
   * Initializes the tokenizer from a serialized `kitoken` definition.
   *
   * @param data - Bytes of the serialized `kitoken` definition.
   */
  constructor(data: Uint8Array);

  /**
   * Encodes the given text into a sequence of tokens.
   *
   * @param text - The text to encode.
   * @param encode_specials - Selects which special tokens are tokenized with
   *   the special vocabulary instead of the regular vocabulary. Accepted are
   *   arrays of the strings "control", "priority", "unknown", and the boolean
   *   values `true` and `false`. When `true`, all special token categories
   *   from the special vocabulary are used.
   * @returns A list of tokens. An error results if no token for a part exists
   *   in the encoder and no unknown token id is set in the configuration.
   */
  encode(text: string, encode_specials: any): Uint32Array;

  /**
   * Encodes the given texts into sequences of tokens.
   *
   * @param text - The texts to encode.
   * @param encode_specials - Selects which special tokens are tokenized with
   *   the special vocabulary instead of the regular vocabulary. Accepted are
   *   arrays of the strings "control", "priority", "unknown", and the boolean
   *   values `true` and `false`. When `true`, all special token categories
   *   from the special vocabulary are used.
   * @returns A list of lists of tokens. An error results if no token for a
   *   part exists in the encoder and no unknown token id is set in the
   *   configuration.
   */
  encode_all(text: string[], encode_specials: any): any[];

  /**
   * Decodes the given sequence of tokens into text.
   *
   * @param tokens - The sequence of tokens to decode.
   * @param decode_specials - Selects which tokens from the special vocabulary
   *   are included in the output. Accepted are arrays of the strings
   *   "control", "priority", "unknown", and the boolean values `true` and
   *   `false`.
   * @returns A list of bytes. An error results if no byte sequence for a token
   *   exists in the decoder and no unknown token is set in the configuration.
   */
  decode(tokens: Uint32Array, decode_specials: any): Uint8Array;

  /**
   * Decodes the given sequences of tokens into texts.
   *
   * @param tokens - The sequences of tokens to decode.
   * @param decode_specials - Selects which tokens from the special vocabulary
   *   are included in the output. Accepted are arrays of the strings
   *   "control", "priority", "unknown", and the boolean values `true` and
   *   `false`.
   * @returns A list of lists of bytes. An error results if no byte sequence
   *   for a token exists in the decoder and no unknown token is set in the
   *   configuration.
   */
  decode_all(tokens: any[], decode_specials: any): any[];

  /**
   * Returns the configuration of the tokenizer.
   */
  config(): any;

  /**
   * Returns the metadata of the tokenizer.
   */
  meta(): any;

  /**
   * NOTE(review): undocumented in the original declarations. Presumably
   * releases the resources held by this instance, after which it should no
   * longer be used — confirm against the tool that generated these bindings.
   */
  free(): void;

  /**
   * NOTE(review): undocumented in the original declarations. Presumably the
   * explicit-resource-management hook (enabling `using` declarations) and
   * equivalent to `free()` — confirm against the generated implementation.
   */
  [Symbol.dispose](): void;
}