kitoken

Version:

Fast and versatile tokenizer for language models, supporting BPE, Unigram and WordPiece tokenization

42 lines (41 loc) • 1.71 kB

TypeScript

/* tslint:disable */
/* eslint-disable */
/**
 * Kitoken tokenizer: a fast and versatile tokenizer for language models.
 */
export class Kitoken {
  // NOTE(review): not documented in the original declaration; presumably
  // releases the resources held by this instance — confirm against the
  // generated bindings before relying on it.
  free(): void;

  /**
   * Creates a tokenizer from a serialized `kitoken` definition.
   *
   * @param data - Bytes of the serialized `kitoken` definition.
   */
  constructor(data: Uint8Array);

  /**
   * Tokenizes a single text.
   *
   * @param text - The text to encode.
   * @param encode_specials - When `true`, the text is first split around
   *   special tokens, which are then encoded separately with the special
   *   encoder.
   * @returns The list of tokens. An error results instead when some part has
   *   no token in the encoder and the configuration sets no unknown token id.
   */
  encode(text: string, encode_specials?: boolean): Uint32Array;

  /**
   * Tokenizes several texts in one call.
   *
   * @param text - The texts to encode.
   * @param encode_specials - When `true`, the text is first split around
   *   special tokens, which are then encoded separately with the special
   *   encoder.
   * @returns A list of lists of tokens. An error results instead when some
   *   part has no token in the encoder and the configuration sets no unknown
   *   token id.
   */
  encode_all(text: (string)[], encode_specials?: boolean): any[];

  /**
   * Converts a sequence of tokens back into text.
   *
   * @param tokens - The sequence of tokens to decode.
   * @param decode_specials - NOTE(review): not described in the original
   *   declaration; presumably controls whether special tokens are decoded —
   *   TODO confirm.
   * @returns The decoded text as a list of bytes. An error results instead
   *   when some token has no byte sequence in the decoder and the
   *   configuration sets no unknown token.
   */
  decode(tokens: Uint32Array, decode_specials?: boolean): Uint8Array;

  /**
   * Converts several sequences of tokens back into texts.
   *
   * @param tokens - The sequences of tokens to decode.
   * @param decode_specials - NOTE(review): not described in the original
   *   declaration; presumably controls whether special tokens are decoded —
   *   TODO confirm.
   * @returns A list of lists of bytes. An error results instead when some
   *   token has no byte sequence in the decoder and the configuration sets no
   *   unknown token.
   */
  decode_all(tokens: any[], decode_specials?: boolean): any[];
}