@lenml/llama2-tokenizer
Version:
Our library `@lenml/llama2-tokenizer` has been deprecated. We are excited to introduce our new library `@lenml/tokenizers` as its replacement, offering a broader set of features and an enhanced experience.
150 lines (147 loc) • 5.05 kB
TypeScript
/**
* Trie in TypeScript. Builds a trie out of a list of words. The trie is used to split text on `added_tokens`
* in a single pass.
* Loose reference: https://en.wikipedia.org/wiki/Trie
*
* This is an ambient declaration only; the behavior described here is taken from the accompanying comments
* and signatures, not from a visible implementation.
*/
declare class Trie {
// Nested trie structure; its shape is illustrated in the `add` example below.
private data;
// NOTE(review): presumably the collection of words that have been added — confirm against the implementation.
private _tokens;
/** Creates a trie (the `split` example below shows that a fresh trie matches nothing). */
constructor();
/**
* Passes over every character of `word` and adds it to the internal `data` trie representation.
* The special key `""` is used to represent termination.
*
* NOTE(review): the original comment says "utf-8 char"; JavaScript strings are UTF-16, so whether iteration
* is per code unit or per code point should be confirmed against the implementation.
*
* This function is idempotent: adding the same word twice leaves the trie unchanged.
*
* Example (`data` is declared `private`, so the `trie.data` reads below are illustrative only and will not
* type-check from outside the class):
*
* ```typescript
* const trie = new Trie();
* trie.add("Hello 友達");
* console.log(trie.data);
* // {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
*
* trie.add("Hello");
* console.log(trie.data);
* // {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
* ```
*
* @param {string} word - the word to insert into the trie
* @return {void}
*/
add(word: string): void;
/**
* Looks for the words added to the trie within `text`. The output is the original string split along the
* boundaries of the words found.
*
* The trie matches the longest possible word first (see `extra_id_100` vs. `extra_id_1` below).
*
* Example:
*
* ```typescript
* const trie = new Trie();
* console.log(trie.split("[CLS] This is a extra_id_100"));
* // ["[CLS] This is a extra_id_100"]
*
* trie.add("[CLS]");
* trie.add("extra_id_1");
* trie.add("extra_id_100");
* console.log(trie.split("[CLS] This is a extra_id_100"));
* // ["[CLS]", " This is a ", "extra_id_100"]
* ```
*
* @param {string} text - the text to split
* @return {string[]} the pieces of `text`, with matched words as separate elements
*/
split(text: string): string[];
/**
* NOTE(review): presumably slices `text` at the given `offsets` to build the pieces returned by `split` —
* confirm against the implementation.
*
* @param {string} text - the text to cut
* @param {number[]} offsets - positions within `text` (exact semantics not visible here)
* @return {string[]} the resulting pieces
*/
protected cutText(text: string, offsets: number[]): string[];
}
/**
* Tokenizer that converts between text, token strings and integer ids, based on an installed vocabulary
* plus registered special tokens.
*
* Note: the package page states that `@lenml/llama2-tokenizer` is deprecated in favor of `@lenml/tokenizers`.
*
* This is an ambient declaration only; the implementation is not visible here.
*/
declare class Llama2Tokenizer {
// Trie used for splitting text on added tokens.
// NOTE(review): presumably filled by add_special_token(s) — confirm against the implementation.
protected tokens_trie: Trie;
// Per the declared type: special token string -> id.
protected special_tokens: Record<string, number>;
// Per the declared type: token string -> id.
protected vocab: Record<string, number>;
// Per the declared type: id -> token string (reverse lookup).
protected vocab_ids: Record<number, string>;
/** Creates a tokenizer; a vocabulary is supplied separately through `install_vocab`. */
constructor();
/**
* Install the provided vocabulary into the class instance.
*
* @param {Record<string, number>} vocab - The vocabulary to be installed (token string -> id)
*/
install_vocab(vocab: Record<string, number>): void;
/**
* Get the size of the vocabulary, including special tokens.
*
* @return {number} the size of the vocabulary
*/
get vocab_size(): number;
/**
* Get the maximum id from the vocab_ids and special_tokens.
*
* @return {number} the maximum id
*/
get max_id(): number;
/**
* Adds a special token with an optional token ID.
*
* NOTE(review): how an id is chosen when `token_id` is omitted is not visible here (presumably derived
* from `max_id`) — confirm against the implementation.
*
* @param {string} token - the special token to be added
* @param {number} [token_id] - the optional token ID
* @return {void}
*/
add_special_token(token: string, token_id?: number): void;
/**
* Adds special tokens to the list of tokens.
*
* Unlike `add_special_token`, the object form declared here requires `token_id`; pass a plain string
* when no explicit id is wanted.
*
* @param {Array} tokens - An array of tokens to add. Each token can be a string or an object with `token` and `token_id` properties.
*/
add_special_tokens(tokens: (string | {
token: string;
token_id: number;
})[]): void;
/**
* Convert a single id to its token (despite the plural name, exactly one id is accepted).
*
* @param {number} id - The id to be converted to a token.
* @return {string} The corresponding token for the given id.
*/
ids_to_token(id: number): string;
/**
* Takes a token as input and returns its corresponding id if found in the vocabulary; otherwise throws an error.
*
* @param {string} token - the input token
* @return {number} the corresponding id of the input token
* @throws {Error} if the token is not found
*/
token_to_id(token: string): number;
/**
* Retrieve the vocabulary.
*
* @return {Object} a shallow copy of the vocabulary
*/
get_vocab(): {
[x: string]: number;
};
/**
* Checks if the token is a valid token.
*
* NOTE(review): "valid" presumably means known to the vocabulary or special tokens — confirm against
* the implementation.
*
* @param {string} token - the token to be checked
* @return {boolean} true if the token is valid, false otherwise
*/
valid_token(token: string): boolean;
/**
* Converts a string into a sequence of tokens, using the tokenizer.
*
* @param {string} text - the text to tokenize
* @return {string[]} the token strings
*/
tokenize(text: string): string[];
/**
* Converts a string to a sequence of ids (integers), using the tokenizer and vocabulary.
*
* @param {string} text - the text to encode
* @return {number[]} the token ids
*/
encode(text: string): number[];
/**
* Converts a sequence of ids into a single string, using the vocabulary and added tokens.
*
* @param {number[]} ids - the token ids to decode
* @return {string} the decoded text
*/
decode(ids: number[]): string;
/**
* Converts a sequence of tokens (strings) into a single string.
*
* @param {string[]} tokens - the token strings to join
* @return {string} the resulting text
*/
convert_tokens_to_string(tokens: string[]): string;
/**
* Converts a sequence of token strings into a sequence of integer ids, using the vocabulary.
*
* @param {string[]} tokens - the token strings to convert
* @return {number[]} the corresponding ids
*/
convert_tokens_to_ids(tokens: string[]): number[];
/**
* Converts a sequence of ids into a sequence of token strings, using the vocabulary and added tokens.
*
* @param {number[]} ids - the ids to convert
* @return {string[]} the corresponding token strings
*/
convert_ids_to_tokens(ids: number[]): string[];
}
// Only `Llama2Tokenizer` is exported here; `Trie` is declared above but not exported by this statement.
export { Llama2Tokenizer };