@lenml/llama2-tokenizer

Version:

Our library `@lenml/llama2-tokenizer` has been deprecated. We are excited to introduce our new library `@lenml/tokenizers` as its replacement, offering a broader set of features and an enhanced experience.

github.com/lenML/llama2-tokenizer.js

lenML/llama2-tokenizer.js

150 lines (147 loc) • 5.05 kB

TypeScript

/**
 * Prefix tree (trie) built from a list of words.
 *
 * The trie is used to split a text on `added_tokens` in a single pass.
 * Loosely based on https://en.wikipedia.org/wiki/Trie
 */
declare class Trie {
    private data;
    private _tokens;
    constructor();
    /**
     * Inserts `word` into the trie.
     *
     * Every character of `word` is visited in turn and added to the internal
     * nested `data` representation. The special key `""` marks the end of a
     * word.
     *
     * The operation is idempotent: adding the same word twice leaves the trie
     * unchanged.
     *
     * @param word - The word to insert.
     *
     * @example
     * ```typescript
     * // NOTE: `data` is declared private; it is read here for illustration only.
     * const trie = new Trie();
     * trie.add("Hello 友達");
     * console.log(trie.data);
     * // {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
     *
     * trie.add("Hello");
     * console.log(trie.data);
     * // {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
     * ```
     */
    add(word: string): void;
    /**
     * Searches `text` for the words previously added to the trie and returns
     * the original string split along the boundaries of the words found.
     *
     * When several added words could match, the longest one wins.
     *
     * @param text - The text to split.
     * @returns The pieces of `text`, in order.
     *
     * @example
     * ```typescript
     * const trie = new Trie();
     * console.log(trie.split("[CLS] This is a extra_id_100"));
     * // ["[CLS] This is a extra_id_100"]
     *
     * trie.add("[CLS]");
     * trie.add("extra_id_1");
     * trie.add("extra_id_100");
     * console.log(trie.split("[CLS] This is a extra_id_100"));
     * // ["[CLS]", " This is a ", "extra_id_100"]
     * ```
     */
    split(text: string): string[];
    /**
     * NOTE(review): undocumented upstream. Presumably cuts `text` into pieces
     * at the positions listed in `offsets`; the implementation is not visible
     * from this declaration, so confirm before relying on the exact semantics.
     */
    protected cutText(text: string, offsets: number[]): string[];
}

/**
 * Tokenizer declaration exported by this module.
 *
 * Holds a vocabulary (token -> id and id -> token), a set of special tokens,
 * and a {@link Trie} of tokens.
 */
declare class Llama2Tokenizer {
    protected tokens_trie: Trie;
    protected special_tokens: Record<string, number>;
    protected vocab: Record<string, number>;
    protected vocab_ids: Record<number, string>;
    constructor();
    /**
     * Installs the provided vocabulary into this instance.
     *
     * @param vocab - The vocabulary to install, mapping token to id.
     */
    install_vocab(vocab: Record<string, number>): void;
    /**
     * The size of the vocabulary, including special tokens.
     */
    get vocab_size(): number;
    /**
     * The maximum id found across `vocab_ids` and `special_tokens`.
     */
    get max_id(): number;
    /**
     * Adds a single special token, optionally with an explicit id.
     *
     * @param token - The special token to add.
     * @param token_id - Optional id to associate with the token.
     */
    add_special_token(token: string, token_id?: number): void;
    /**
     * Adds several special tokens at once.
     *
     * @param tokens - The tokens to add. Each entry is either a plain string
     *   or an object carrying both `token` and `token_id`.
     */
    add_special_tokens(tokens: (string | {
        token: string;
        token_id: number;
    })[]): void;
    /**
     * Converts an id to its token.
     *
     * @param id - The id to look up.
     * @returns The token corresponding to `id`.
     */
    ids_to_token(id: number): string;
    /**
     * Converts a token to its id.
     *
     * @param token - The token to look up.
     * @returns The id corresponding to `token`.
     * @throws When `token` is not found in the vocabulary.
     */
    token_to_id(token: string): number;
    /**
     * Retrieves the vocabulary.
     *
     * @returns A shallow copy of the vocabulary.
     */
    get_vocab(): {
        [x: string]: number;
    };
    /**
     * Checks whether `token` is a valid token.
     *
     * @param token - The token to check.
     * @returns `true` if the token is valid, `false` otherwise.
     */
    valid_token(token: string): boolean;
    /**
     * Converts a string into a sequence of tokens, using the tokenizer.
     *
     * @param text - The text to tokenize.
     * @returns The tokens, in order.
     */
    tokenize(text: string): string[];
    /**
     * Converts a string into a sequence of integer ids, using the tokenizer
     * and vocabulary.
     *
     * @param text - The text to encode.
     * @returns The ids, in order.
     */
    encode(text: string): number[];
    /**
     * Converts a sequence of ids back into a single string, using the
     * vocabulary and added tokens.
     *
     * @param ids - The ids to decode.
     * @returns The decoded string.
     */
    decode(ids: number[]): string;
    /**
     * Converts a sequence of tokens into a single string.
     *
     * @param tokens - The tokens to join.
     * @returns The resulting string.
     */
    convert_tokens_to_string(tokens: string[]): string;
    /**
     * Converts a sequence of tokens into a sequence of integer ids, using the
     * vocabulary.
     *
     * @param tokens - The tokens to convert.
     * @returns The ids, in order.
     */
    convert_tokens_to_ids(tokens: string[]): number[];
    /**
     * Converts a sequence of ids into a sequence of tokens, using the
     * vocabulary and added tokens.
     *
     * @param ids - The ids to convert.
     * @returns The tokens, in order.
     */
    convert_ids_to_tokens(ids: number[]): string[];
}

export { Llama2Tokenizer };