kitoken
Version:
Fast and versatile tokenizer for language models, supporting BPE, Unigram and WordPiece tokenization
82 lines (81 loc) • 2.82 kB
TypeScript
/* tslint:disable */
/* eslint-disable */
/**
* Kitoken tokenizer.
* A fast and versatile tokenizer for language models.
*/
export class Kitoken {
/**
* NOTE(review): undocumented in the source. Presumably releases the resources held by this
* instance, after which it must no longer be used — confirm against the binding generator's docs.
*/
free(): void;
/**
* Initializes the tokenizer from a serialized `kitoken` definition.
*
* @param data - Bytes of the serialized `kitoken` definition.
*/
constructor(data: Uint8Array);
/**
* Encodes the given text into a sequence of tokens.
*
* If `encode_specials` is `true`, the text is first split around special tokens which are separately encoded with the special encoder.
*
* Returns a list of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
*
* NOTE(review): the signature declares no error return value, so the error is presumably thrown — confirm.
*
* @param text - Text to encode.
* @param encode_specials - Optional; whether special tokens in `text` are split out and encoded with the special encoder.
* @returns The token ids for `text`.
*/
encode(text: string, encode_specials?: boolean): Uint32Array;
/**
* Encodes the given texts into sequences of tokens.
*
* If `encode_specials` is `true`, each text is first split around special tokens which are separately encoded with the special encoder.
*
* Returns a list of lists of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
*
* NOTE(review): the result is typed `any[]`; each element is presumably a `Uint32Array` as returned by `encode` — confirm.
*
* @param text - Texts to encode.
* @param encode_specials - Optional; whether special tokens in each text are split out and encoded with the special encoder.
* @returns One token sequence per input text.
*/
encode_all(text: (string)[], encode_specials?: boolean): any[];
/**
* Decodes the given sequence of tokens into text.
*
* Returns a list of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
*
* The decoded text is returned as raw bytes (`Uint8Array`), not as a `string`.
*
* NOTE(review): `decode_specials` is undocumented in the source; presumably it controls whether special tokens are decoded — confirm.
*
* @param tokens - Token ids to decode.
* @param decode_specials - Optional; see the review note above.
* @returns The decoded bytes.
*/
decode(tokens: Uint32Array, decode_specials?: boolean): Uint8Array;
/**
* Decodes the given sequences of tokens into texts.
*
* Returns a list of lists of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
*
* NOTE(review): input and result are typed `any[]`; elements are presumably `Uint32Array` (input) and `Uint8Array` (output) as in `decode` — confirm.
* NOTE(review): `decode_specials` is undocumented in the source; presumably it controls whether special tokens are decoded — confirm.
*
* @param tokens - Token sequences to decode.
* @param decode_specials - Optional; see the review note above.
* @returns One decoded byte sequence per input token sequence.
*/
decode_all(tokens: any[], decode_specials?: boolean): any[];
/**
* Returns the definition of the tokenizer.
*
* NOTE(review): typed `any`; the shape of the definition object is not declared here.
*/
definition(): any;
/**
* Sets the definition of the tokenizer.
*
* Returns an error if the definition is invalid.
*
* NOTE(review): the declared return type is `void`, so the error is presumably thrown — confirm.
*
* @param definition - New definition; presumably the same shape as returned by `definition()` — confirm.
*/
set_definition(definition: any): void;
/**
* Returns the configuration of the tokenizer.
*
* NOTE(review): typed `any`; the shape of the configuration object is not declared here.
*/
config(): any;
/**
* Sets the configuration of the tokenizer.
*
* Returns an error if the configuration is invalid.
*
* NOTE(review): the declared return type is `void`, so the error is presumably thrown — confirm.
*
* @param config - New configuration; presumably the same shape as returned by `config()` — confirm.
*/
set_config(config: any): void;
/**
* Creates a definition from this tokenizer and serializes it to bytes.
*
* @returns The serialized definition.
*/
to_bytes(): Uint8Array;
/**
* Initializes the tokenizer from a serialized `sentencepiece` model.
*
* @param data - Bytes of the serialized `sentencepiece` model.
*/
static from_sentencepiece(data: Uint8Array): Kitoken;
/**
* Initializes the tokenizer from a serialized `tiktoken` model.
*
* @param data - Bytes of the serialized `tiktoken` model.
*/
static from_tiktoken(data: Uint8Array): Kitoken;
/**
* Initializes the tokenizer from a serialized `tokenizers` model.
*
* @param data - Bytes of the serialized `tokenizers` model.
*/
static from_tokenizers(data: Uint8Array): Kitoken;
/**
* Initializes the tokenizer from a serialized `tekken` model.
*
* @param data - Bytes of the serialized `tekken` model.
*/
static from_tekken(data: Uint8Array): Kitoken;
}