UNPKG

kitoken

Version:

Fast and versatile tokenizer for language models, supporting BPE, Unigram and WordPiece tokenization

82 lines (81 loc) 2.82 kB
/* tslint:disable */
/* eslint-disable */
/**
 * Kitoken tokenizer.
 * A fast and versatile tokenizer for language models.
 *
 * This is a type declaration only: every member below is a signature without
 * an implementation.
 *
 * NOTE(review): several doc comments say a method "returns ... or an error".
 * The declared return types contain no error variant, so failures presumably
 * surface as thrown exceptions — confirm against the implementation.
 */
export class Kitoken {
  /**
   * NOTE(review): undocumented in the original declaration. Presumably
   * releases the resources held by this instance, after which it must not be
   * used — confirm against the generated bindings.
   */
  free(): void;
  /**
   * Initializes the tokenizer from a serialized `kitoken` definition.
   *
   * @param data - Bytes of the serialized `kitoken` definition.
   */
  constructor(data: Uint8Array);
  /**
   * Encodes the given text into a sequence of tokens.
   *
   * If `encode_specials` is `true`, the text is first split around special tokens which are separately encoded with the special encoder.
   *
   * Returns a list of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
   *
   * @param text - The text to encode.
   * @param encode_specials - Optional; when `true`, special tokens in the text are split out and encoded with the special encoder.
   * @returns The token ids as a `Uint32Array`.
   */
  encode(text: string, encode_specials?: boolean): Uint32Array;
  /**
   * Encodes the given texts into sequences of tokens.
   *
   * If `encode_specials` is `true`, each text is first split around special tokens which are separately encoded with the special encoder.
   *
   * Returns a list of lists of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
   *
   * @param text - The texts to encode.
   * @param encode_specials - Optional; when `true`, special tokens in each text are split out and encoded with the special encoder.
   * @returns One token sequence per input text. The element type is declared as `any`;
   * presumably each element is a `Uint32Array` as returned by `encode` — TODO confirm.
   */
  encode_all(text: (string)[], encode_specials?: boolean): any[];
  /**
   * Decodes the given sequence of tokens into text.
   *
   * Returns a list of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
   *
   * Note that the result is raw bytes (`Uint8Array`), not a `string`.
   *
   * @param tokens - The token ids to decode.
   * @param decode_specials - Optional. NOTE(review): not described in the original
   * declaration; presumably the decoding counterpart of `encode_specials` — confirm.
   * @returns The decoded bytes.
   */
  decode(tokens: Uint32Array, decode_specials?: boolean): Uint8Array;
  /**
   * Decodes the given sequences of tokens into texts.
   *
   * Returns a list of lists of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
   *
   * @param tokens - The token sequences to decode. The element type is declared as `any`;
   * presumably each element is a `Uint32Array` as accepted by `decode` — TODO confirm.
   * @param decode_specials - Optional. NOTE(review): not described in the original
   * declaration; presumably the decoding counterpart of `encode_specials` — confirm.
   * @returns One decoded byte sequence per input sequence. The element type is declared as
   * `any`; presumably each element is a `Uint8Array` as returned by `decode` — TODO confirm.
   */
  decode_all(tokens: any[], decode_specials?: boolean): any[];
  /**
   * Returns the definition of the tokenizer.
   *
   * The shape of the returned value is not described by this declaration (typed as `any`).
   */
  definition(): any;
  /**
   * Sets the definition of the tokenizer.
   *
   * Returns an error if the definition is invalid.
   *
   * @param definition - The new definition. Its expected shape is not described by this
   * declaration (typed as `any`).
   */
  set_definition(definition: any): void;
  /**
   * Returns the configuration of the tokenizer.
   *
   * The shape of the returned value is not described by this declaration (typed as `any`).
   */
  config(): any;
  /**
   * Sets the configuration of the tokenizer.
   *
   * Returns an error if the configuration is invalid.
   *
   * @param config - The new configuration. Its expected shape is not described by this
   * declaration (typed as `any`).
   */
  set_config(config: any): void;
  /**
   * Creates a definition from this tokenizer and serializes it to bytes.
   *
   * @returns The serialized definition.
   */
  to_bytes(): Uint8Array;
  /**
   * Initializes the tokenizer from a serialized `sentencepiece` model.
   *
   * @param data - Bytes of the serialized `sentencepiece` model.
   */
  static from_sentencepiece(data: Uint8Array): Kitoken;
  /**
   * Initializes the tokenizer from a serialized `tiktoken` model.
   *
   * @param data - Bytes of the serialized `tiktoken` model.
   */
  static from_tiktoken(data: Uint8Array): Kitoken;
  /**
   * Initializes the tokenizer from a serialized `tokenizers` model.
   *
   * @param data - Bytes of the serialized `tokenizers` model.
   */
  static from_tokenizers(data: Uint8Array): Kitoken;
  /**
   * Initializes the tokenizer from a serialized `tekken` model.
   *
   * @param data - Bytes of the serialized `tekken` model.
   */
  static from_tekken(data: Uint8Array): Kitoken;
}