UNPKG

kitoken

Version:

Fast and versatile tokenizer for language models, supporting BPE, Unigram and WordPiece tokenization

129 lines (124 loc) 5.31 kB
/* tslint:disable */
/* eslint-disable */
/**
 * Kitoken tokenizer.
 * A fast and versatile tokenizer for language models.
 */
export class Kitoken {
  /**
   * Releases this instance.
   *
   * NOTE(review): presumably frees the WebAssembly-side object backing this
   * instance (compare `__wbg_kitoken_free` in `InitOutput`), after which the
   * instance should no longer be used — confirm against the generated glue code.
   */
  free(): void;
  /**
   * Initializes the tokenizer from a serialized `kitoken` definition.
   */
  constructor(data: Uint8Array);
  /**
   * Encodes the given text into a sequence of tokens.
   *
   * If `encode_specials` is `true`, the text is first split around special tokens which are separately encoded with the special encoder.
   *
   * Returns a list of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
   */
  encode(text: string, encode_specials?: boolean): Uint32Array;
  /**
   * Encodes the given texts into sequences of tokens.
   *
   * If `encode_specials` is `true`, each text is first split around special tokens which are separately encoded with the special encoder.
   *
   * Returns a list of lists of tokens, or an error if no token for a part exists in the encoder and no unknown token id is set in the configuration.
   *
   * NOTE(review): the result is typed `any[]`; the elements are presumably `Uint32Array`s as returned by `encode` — confirm.
   */
  encode_all(text: (string)[], encode_specials?: boolean): any[];
  /**
   * Decodes the given sequence of tokens into text.
   *
   * NOTE(review): `decode_specials` presumably controls whether special tokens are decoded, mirroring `encode_specials` — confirm.
   *
   * Returns a list of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
   */
  decode(tokens: Uint32Array, decode_specials?: boolean): Uint8Array;
  /**
   * Decodes the given sequences of tokens into texts.
   *
   * NOTE(review): `decode_specials` presumably controls whether special tokens are decoded, mirroring `encode_specials` — confirm.
   *
   * Returns a list of lists of bytes, or an error if no byte sequence for a token exists in the decoder and no unknown token is set in the configuration.
   *
   * NOTE(review): input and result are typed `any[]`; the elements are presumably `Uint32Array`s in and `Uint8Array`s out, matching `decode` — confirm.
   */
  decode_all(tokens: any[], decode_specials?: boolean): any[];
  /**
   * Returns the definition of the tokenizer.
   *
   * The shape of the returned value is not described by these typings (`any`).
   */
  definition(): any;
  /**
   * Sets the definition of the tokenizer.
   *
   * Returns an error if the definition is invalid.
   */
  set_definition(definition: any): void;
  /**
   * Returns the configuration of the tokenizer.
   *
   * The shape of the returned value is not described by these typings (`any`).
   */
  config(): any;
  /**
   * Sets the configuration of the tokenizer.
   *
   * Returns an error if the configuration is invalid.
   */
  set_config(config: any): void;
  /**
   * Creates a definition from this tokenizer and serializes it to bytes.
   */
  to_bytes(): Uint8Array;
  /**
   * Initializes the tokenizer from a serialized `sentencepiece` model.
   */
  static from_sentencepiece(data: Uint8Array): Kitoken;
  /**
   * Initializes the tokenizer from a serialized `tiktoken` model.
   */
  static from_tiktoken(data: Uint8Array): Kitoken;
  /**
   * Initializes the tokenizer from a serialized `tokenizers` model.
   */
  static from_tokenizers(data: Uint8Array): Kitoken;
  /**
   * Initializes the tokenizer from a serialized `tekken` model.
   */
  static from_tekken(data: Uint8Array): Kitoken;
}

/**
 * Inputs accepted by the default (asynchronous) initializer, either directly
 * or wrapped in a promise.
 */
export type InitInput = RequestInfo | URL | Response | BufferSource | WebAssembly.Module;

/**
 * Value returned by `initSync` and resolved by the default initializer.
 *
 * NOTE(review): these members look like the raw exports of the instantiated
 * WebAssembly module (linear memory plus low-level entry points taking numeric
 * arguments); they are presumably intended for the generated glue code rather
 * than for direct use — confirm.
 */
export interface InitOutput {
  readonly memory: WebAssembly.Memory;
  readonly __wbg_kitoken_free: (a: number, b: number) => void;
  readonly kitoken_new: (a: number, b: number, c: number) => void;
  readonly kitoken_encode: (a: number, b: number, c: number, d: number, e: number) => void;
  readonly kitoken_encode_all: (a: number, b: number, c: number, d: number, e: number) => void;
  readonly kitoken_decode: (a: number, b: number, c: number, d: number, e: number) => void;
  readonly kitoken_decode_all: (a: number, b: number, c: number, d: number, e: number) => void;
  readonly kitoken_definition: (a: number) => number;
  readonly kitoken_set_definition: (a: number, b: number, c: number) => void;
  readonly kitoken_config: (a: number) => number;
  readonly kitoken_set_config: (a: number, b: number, c: number) => void;
  readonly kitoken_to_bytes: (a: number, b: number) => void;
  readonly kitoken_from_sentencepiece: (a: number, b: number, c: number) => void;
  readonly kitoken_from_tiktoken: (a: number, b: number, c: number) => void;
  readonly kitoken_from_tokenizers: (a: number, b: number, c: number) => void;
  readonly kitoken_from_tekken: (a: number, b: number, c: number) => void;
  readonly __wbindgen_export_0: (a: number, b: number) => number;
  readonly __wbindgen_export_1: (a: number, b: number, c: number, d: number) => number;
  readonly __wbindgen_export_2: (a: number) => void;
  readonly __wbindgen_export_3: (a: number, b: number, c: number) => void;
  readonly __wbindgen_add_to_stack_pointer: (a: number) => number;
}

/**
 * Inputs accepted by `initSync`: module bytes or a precompiled `WebAssembly.Module`.
 */
export type SyncInitInput = BufferSource | WebAssembly.Module;
/**
 * Instantiates the given `module`, which can either be bytes or
 * a precompiled `WebAssembly.Module`.
 *
 * @param {{ module: SyncInitInput }} module - Passing `SyncInitInput` directly is deprecated.
 *
 * @returns {InitOutput}
 */
export function initSync(module: { module: SyncInitInput } | SyncInitInput): InitOutput;

/**
 * If `module_or_path` is {RequestInfo} or {URL}, makes a request and
 * for everything else, calls `WebAssembly.instantiate` directly.
 *
 * @param {{ module_or_path: InitInput | Promise<InitInput> }} module_or_path - Passing `InitInput` directly is deprecated.
 *
 * @returns {Promise<InitOutput>}
 */
export default function __wbg_init (module_or_path?: { module_or_path: InitInput | Promise<InitInput> } | InitInput | Promise<InitInput>): Promise<InitOutput>;