kitoken

Version:

Fast tokenizer for language models, supporting BPE, Unigram and WordPiece tokenization

98 lines (96 loc) • 3.88 kB

TypeScript

/* tslint:disable */
/* eslint-disable */
/**
 * Kitoken tokenizer.
 * A fast and versatile tokenizer for language models.
 *
 * Instances are created with the constructor (from a serialized `kitoken` definition)
 * or with one of the static `from_*` factories (from another serialized model format).
 *
 * NOTE(review): several methods are documented as returning "an error", but none of the
 * declared return types has an error variant, so failures are presumably raised as
 * exceptions — confirm against the implementation.
 */
export class Kitoken {
  /**
   * Releases the resources held by this instance.
   *
   * NOTE(review): presumably the instance must not be used after this call — confirm.
   */
  free(): void;
  /**
   * Disposal hook keyed by `Symbol.dispose`, the method that `using` declarations invoke
   * when the binding goes out of scope.
   *
   * NOTE(review): presumably equivalent to calling `free()` — confirm.
   */
  [Symbol.dispose](): void;
  /**
   * Returns the configuration of the tokenizer.
   */
  config(): any;
  /**
   * Decodes the given sequence of tokens into text.
   *
   * `decode_specials` specifies which tokens from the special vocabulary are included in the output.
   * Accepted are arrays of strings "control", "priority", "unknown", and boolean values `true` and `false`.
   *
   * Returns a list of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
   *
   * @param tokens - The sequence of tokens to decode.
   * @param decode_specials - Which special token categories to include in the output (see above).
   * @returns The decoded text as bytes.
   */
  decode(tokens: Uint32Array, decode_specials: any): Uint8Array;
  /**
   * Decodes the given sequences of tokens into texts.
   *
   * `decode_specials` specifies which tokens from the special vocabulary are included in the output.
   * Accepted are arrays of strings "control", "priority", "unknown", and boolean values `true` and `false`.
   *
   * Returns a list of lists of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
   *
   * @param tokens - The sequences of tokens to decode, one entry per sequence.
   *   NOTE(review): presumably each entry is a `Uint32Array`, as accepted by `decode` — confirm.
   * @param decode_specials - Which special token categories to include in the output (see above).
   * @returns One decoded byte list per input sequence.
   *   NOTE(review): presumably each entry is a `Uint8Array`, as returned by `decode` — confirm.
   */
  decode_all(tokens: any[], decode_specials: any): any[];
  /**
   * Returns the definition of the tokenizer.
   */
  definition(): any;
  /**
   * Encodes the given text into a sequence of tokens.
   *
   * `encode_specials` specifies which special tokens are tokenized with the special vocabulary instead of the regular vocabulary.
   * Accepted are arrays of strings "control", "priority", "unknown", and boolean values `true` and `false`.
   * When `true`, all special token categories from the special vocabulary are used.
   *
   * Returns a list of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
   *
   * @param text - The text to encode.
   * @param encode_specials - Which special token categories are tokenized with the special vocabulary (see above).
   * @returns The encoded sequence of tokens.
   */
  encode(text: string, encode_specials: any): Uint32Array;
  /**
   * Encodes the given texts into sequences of tokens.
   *
   * `encode_specials` specifies which special tokens are tokenized with the special vocabulary instead of the regular vocabulary.
   * Accepted are arrays of strings "control", "priority", "unknown", and boolean values `true` and `false`.
   * When `true`, all special token categories from the special vocabulary are used.
   *
   * Returns a list of lists of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
   *
   * @param text - The texts to encode, one entry per text.
   * @param encode_specials - Which special token categories are tokenized with the special vocabulary (see above).
   * @returns One token sequence per input text.
   *   NOTE(review): presumably each entry is a `Uint32Array`, as returned by `encode` — confirm.
   */
  encode_all(text: string[], encode_specials: any): any[];
  /**
   * Initializes the tokenizer from a serialized `sentencepiece` model.
   *
   * @param data - The serialized model bytes.
   */
  static from_sentencepiece(data: Uint8Array): Kitoken;
  /**
   * Initializes the tokenizer from a serialized `tekken` model.
   *
   * @param data - The serialized model bytes.
   */
  static from_tekken(data: Uint8Array): Kitoken;
  /**
   * Initializes the tokenizer from a serialized `tiktoken` model.
   *
   * @param data - The serialized model bytes.
   */
  static from_tiktoken(data: Uint8Array): Kitoken;
  /**
   * Initializes the tokenizer from a serialized `tokenizers` model.
   *
   * @param data - The serialized model bytes.
   */
  static from_tokenizers(data: Uint8Array): Kitoken;
  /**
   * Returns the metadata of the tokenizer.
   */
  meta(): any;
  /**
   * Initializes the tokenizer from a serialized `kitoken` definition.
   *
   * @param data - The serialized definition bytes.
   *   NOTE(review): presumably the format produced by `to_bytes()` — confirm.
   */
  constructor(data: Uint8Array);
  /**
   * Sets the configuration of the tokenizer.
   *
   * Returns an error if the configuration is invalid.
   *
   * @param config - The new configuration.
   *   NOTE(review): presumably the same shape that `config()` returns — confirm.
   */
  set_config(config: any): void;
  /**
   * Sets the definition of the tokenizer.
   *
   * Returns an error if the definition is invalid.
   *
   * @param definition - The new definition.
   *   NOTE(review): presumably the same shape that `definition()` returns — confirm.
   */
  set_definition(definition: any): void;
  /**
   * Creates a definition from this tokenizer and serializes it to bytes.
   *
   * @returns The serialized definition.
   */
  to_bytes(): Uint8Array;
}