@huggingface/transformers

State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!
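A minimal usage sketch (not part of the package page) showing a full encode/decode round trip, complementing the encoding-only example in the module comment below. The decoded string assumes `Xenova/bert-base-uncased`'s lowercasing and the tokenizer's default clean-up of tokenization spaces.

```javascript
import { AutoTokenizer } from '@huggingface/transformers';

const tokenizer = await AutoTokenizer.from_pretrained('Xenova/bert-base-uncased');

// Encode: returns Tensors (input_ids, attention_mask, ...)
const { input_ids } = await tokenizer('I love transformers!');

// Decode: a [1, n] tensor is accepted directly (see `prepareTensorForDecode` below)
const text = tokenizer.decode(input_ids, { skip_special_tokens: true });
// 'i love transformers!'
```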
/** * @file Tokenizers are used to prepare textual inputs for a model. * * **Example:** Create an `AutoTokenizer` and use it to tokenize a sentence. * This will automatically detect the tokenizer type based on the tokenizer class defined in `tokenizer.json`. * ```javascript * import { AutoTokenizer } from '@huggingface/transformers'; * * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/bert-base-uncased'); * const { input_ids } = await tokenizer('I love transformers!'); * // Tensor { * // data: BigInt64Array(6) [101n, 1045n, 2293n, 19081n, 999n, 102n], * // dims: [1, 6], * // type: 'int64', * // size: 6, * // } * ``` * * @module tokenizers */ import { Callable, } from './utils/generic.js'; import { reverseDictionary, escapeRegExp, isIntegralNumber, mergeArrays, len, } from './utils/core.js'; import { getModelJSON, } from './utils/hub.js'; import { max, min, round } from './utils/maths.js'; import { Tensor } from './utils/tensor.js'; import { PriorityQueue, TokenLattice, CharTrie, DictionarySplitter, LRUCache, } from './utils/data-structures.js'; import { Template } from '@huggingface/jinja'; import { WHISPER_LANGUAGE_MAPPING } from './models/whisper/common_whisper.js'; /** * @typedef {Object} TokenizerProperties Additional tokenizer-specific properties. * @property {boolean} [legacy=false] Whether or not the `legacy` behavior of the tokenizer should be used. * @typedef {import('./utils/hub.js').PretrainedOptions & TokenizerProperties} PretrainedTokenizerOptions */ /** * Loads a tokenizer from the specified path. * @param {string} pretrained_model_name_or_path The path to the tokenizer directory. * @param {PretrainedTokenizerOptions} options Additional options for loading the tokenizer. * @returns {Promise<any[]>} A promise that resolves with information about the loaded tokenizer. */ async function loadTokenizer(pretrained_model_name_or_path, options) { const info = await Promise.all([ getModelJSON(pretrained_model_name_or_path, 'tokenizer.json', true, options), getModelJSON(pretrained_model_name_or_path, 'tokenizer_config.json', true, options), ]) // Override legacy option if `options.legacy` is not null if (options.legacy !== null) { info[1].legacy = options.legacy; } return info; } /** * Helper function to split a string on a regex, but keep the delimiters. * This is required, because the JavaScript `.split()` method does not keep the delimiters, * and wrapping in a capturing group causes issues with existing capturing groups (due to nesting). * @param {string} text The text to split. * @param {RegExp} regex The regex to split on. * @returns {string[]} The split string. */ function regexSplit(text, regex) { const result = []; let prev = 0; for (const match of text.matchAll(regex)) { const fullMatch = match[0]; if (prev < match.index) { result.push(text.slice(prev, match.index)); } if (fullMatch.length > 0) { result.push(fullMatch); } prev = match.index + fullMatch.length; } if (prev < text.length) { result.push(text.slice(prev)); } return result; } /** * Helper method to construct a pattern from a config object. * @param {Object} pattern The pattern object. * @param {boolean} invert Whether to invert the pattern. * @returns {RegExp|null} The compiled pattern. */ function createPattern(pattern, invert = true) { if (pattern.Regex !== undefined) { // In certain cases, the pattern may contain unnecessary escape sequences (e.g., \# or \& or \~). // i.e., valid in Python (where the patterns are exported from) but invalid in JavaScript (where the patterns are parsed). 
// This isn't an issue when creating the regex w/o the 'u' flag, but it is when the 'u' flag is used. // For this reason, it is necessary to remove these backslashes before creating the regex. // See https://stackoverflow.com/a/63007777/13989043 for more information let regex = pattern.Regex.replace(/\\([#&~])/g, '$1'); // TODO: add more characters to this list if necessary // We also handle special cases where the regex contains invalid (non-JS compatible) syntax. for (const [key, value] of PROBLEMATIC_REGEX_MAP) { regex = regex.replaceAll(key, value); } return new RegExp(regex, 'gu'); } else if (pattern.String !== undefined) { const escaped = escapeRegExp(pattern.String); // NOTE: if invert is true, we wrap the pattern in a group so that it is kept when performing .split() return new RegExp(invert ? escaped : `(${escaped})`, 'gu'); } else { console.warn('Unknown pattern type:', pattern) return null; } } /** * Helper function to convert an Object to a Map * @param {Object} obj The object to convert. * @returns {Map<string, any>} The map. */ function objectToMap(obj) { return new Map(Object.entries(obj)); } /** * Helper function to convert a tensor to a list before decoding. * @param {Tensor} tensor The tensor to convert. * @returns {number[]} The tensor as a list. */ function prepareTensorForDecode(tensor) { const dims = tensor.dims; switch (dims.length) { case 1: return tensor.tolist(); case 2: if (dims[0] !== 1) { throw new Error('Unable to decode tensor with `batch size !== 1`. Use `tokenizer.batch_decode(...)` for batched inputs.'); } return tensor.tolist()[0]; default: throw new Error(`Expected tensor to have 1-2 dimensions, got ${dims.length}.`) } } /** * Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms * @param {string} text The text to clean up. * @returns {string} The cleaned up text. */ function clean_up_tokenization(text) { // Clean up a list of simple English tokenization artifacts // like spaces before punctuations and abbreviated forms return text.replace(/ \./g, '.') .replace(/ \?/g, '?') .replace(/ \!/g, '!') .replace(/ ,/g, ',') .replace(/ \' /g, "'") .replace(/ n\'t/g, "n't") .replace(/ \'m/g, "'m") .replace(/ \'s/g, "'s") .replace(/ \'ve/g, "'ve") .replace(/ \'re/g, "'re"); } /** * Helper function to remove accents from a string. * @param {string} text The text to remove accents from. * @returns {string} The text with accents removed. */ function remove_accents(text) { return text.replace(/\p{M}/gu, ''); } /** * Helper function to lowercase a string and remove accents. * @param {string} text The text to lowercase and remove accents from. * @returns {string} The lowercased text with accents removed. */ function lowercase_and_remove_accent(text) { return remove_accents(text.toLowerCase()); } /** * Checks whether the given Unicode codepoint represents a CJK (Chinese, Japanese, or Korean) character. * * A "chinese character" is defined as anything in the CJK Unicode block: * https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) * * Note that the CJK Unicode block is NOT all Japanese and Korean characters, despite its name. * The modern Korean Hangul alphabet is a different block, as is Japanese Hiragana and Katakana. * Those alphabets are used to write space-separated words, so they are not treated specially * and are handled like all other languages. * * @param {number|bigint} cp The Unicode codepoint to check. * @returns {boolean} True if the codepoint represents a CJK character, false otherwise. 
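*
* **Example (illustrative, not part of the original docs):** a few codepoints checked against the
* ranges listed above.
* ```javascript
* is_chinese_char('中'.codePointAt(0)); // true  (U+4E2D, CJK Unified Ideographs)
* is_chinese_char('あ'.codePointAt(0)); // false (U+3042, Hiragana is handled like other scripts)
* is_chinese_char('a'.codePointAt(0));  // false
* ```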
*/ export function is_chinese_char(cp) { return ( (cp >= 0x4E00 && cp <= 0x9FFF) || (cp >= 0x3400 && cp <= 0x4DBF) || (cp >= 0x20000 && cp <= 0x2A6DF) || (cp >= 0x2A700 && cp <= 0x2B73F) || (cp >= 0x2B740 && cp <= 0x2B81F) || (cp >= 0x2B820 && cp <= 0x2CEAF) || (cp >= 0xF900 && cp <= 0xFAFF) || (cp >= 0x2F800 && cp <= 0x2FA1F) ) } /** * Helper function to fuse consecutive unknown tokens. * @param {string[]} arr The list of input tokens * @param {Map<string, any>} tokens_to_ids The mapping from tokens to token ids. * @param {number} unk_token_id The value to fuse on. * @private */ function fuse_unk(arr, tokens_to_ids, unk_token_id) { const fused = []; let i = 0; while (i < arr.length) { fused.push(arr[i]) if ((tokens_to_ids.get(arr[i]) ?? unk_token_id) !== unk_token_id) { ++i; continue; } while (++i < arr.length && (tokens_to_ids.get(arr[i]) ?? unk_token_id) === unk_token_id) { if (tokens_to_ids.get(fused.at(-1)) !== unk_token_id) { fused[fused.length - 1] += arr[i]; } } } return fused; } /** * Split a string on whitespace. * @param {string} text The text to split. * @returns {string[]} The split string. */ function whitespace_split(text) { return text.match(/\S+/g) || []; } const PUNCTUATION_REGEX = '\\p{P}\\u0021-\\u002F\\u003A-\\u0040\\u005B-\\u0060\\u007B-\\u007E'; const PUNCTUATION_ONLY_REGEX = new RegExp(`^[${PUNCTUATION_REGEX}]+$`, 'gu'); const BLOOM_SPLIT_CHARS = '.,!?\u2026\u3002\uff0c\u3001\u0964\u06d4\u060c'; // A mapping of regex patterns to their equivalent (but possibly longer) JS-compatible versions. const PROBLEMATIC_REGEX_MAP = new Map([ // This uses the case insensitive group modifier, which is not supported in JavaScript. // When parsing the regex, an "Invalid group" error is thrown. ["(?i:'s|'t|'re|'ve|'m|'ll|'d)", "(?:'([sS]|[tT]|[rR][eE]|[vV][eE]|[mM]|[lL][lL]|[dD]))"], // Used to override the default (invalid) regex of the bloom pretokenizer. // For more information, see https://github.com/huggingface/transformers.js/issues/94 [` ?[^(\\s|[${BLOOM_SPLIT_CHARS}])]+`, ` ?[^\\s${BLOOM_SPLIT_CHARS}]+`], ]) /** * Represent a token added by the user on top of the existing Model vocabulary. * AddedToken can be configured to specify the behavior they should have in various situations like: * - Whether they should only match single words * - Whether to include any whitespace on its left or right */ class AddedToken { /** * Creates a new instance of AddedToken. * @param {Object} config Added token configuration object. * @param {string} config.content The content of the added token. * @param {number} config.id The id of the added token. * @param {boolean} [config.single_word=false] Whether this token must be a single word or can break words. * @param {boolean} [config.lstrip=false] Whether this token should strip whitespaces on its left. * @param {boolean} [config.rstrip=false] Whether this token should strip whitespaces on its right. * @param {boolean} [config.normalized=false] Whether this token should be normalized. * @param {boolean} [config.special=false] Whether this token is special. */ constructor(config) { this.content = config.content; this.id = config.id; this.single_word = config.single_word ?? false; this.lstrip = config.lstrip ?? false; this.rstrip = config.rstrip ?? false; this.special = config.special ?? false; this.normalized = config.normalized ?? null; } } /** * Abstract base class for tokenizer models. * * @extends Callable */ export class TokenizerModel extends Callable { /** * Creates a new instance of TokenizerModel. 
* @param {Object} config The configuration object for the TokenizerModel. */ constructor(config) { super(); this.config = config; /** @type {string[]} */ this.vocab = []; /** * A mapping of tokens to ids. * @type {Map<string, number>} */ this.tokens_to_ids = new Map(); this.unk_token_id = undefined; this.unk_token = undefined; this.end_of_word_suffix = undefined; /** @type {boolean} Whether to fuse unknown tokens when encoding. Defaults to false. */ this.fuse_unk = this.config.fuse_unk ?? false; } /** * Instantiates a new TokenizerModel instance based on the configuration object provided. * @param {Object} config The configuration object for the TokenizerModel. * @param {...*} args Optional arguments to pass to the specific TokenizerModel constructor. * @returns {TokenizerModel} A new instance of a TokenizerModel. * @throws Will throw an error if the TokenizerModel type in the config is not recognized. */ static fromConfig(config, ...args) { switch (config.type) { case 'WordPiece': return new WordPieceTokenizer(config); case 'Unigram': // @ts-ignore return new Unigram(config, ...args); case 'BPE': return new BPE(config); default: // Some older tokenizers, like `google-t5/t5-small` and `distilbert/distilbert-base-uncased`, do not have a `type` field. // In this case, we can infer the tokenizer type based on the structure of the `vocab` field and other properties. if (config.vocab) { if (Array.isArray(config.vocab)) { // config.vocab is of type `[string, number][]` // @ts-ignore return new Unigram(config, ...args); } else if (typeof config.vocab === 'object' && config.continuing_subword_prefix && config.unk_token) { return new WordPieceTokenizer(config); } else { // @ts-ignore return new LegacyTokenizerModel(config, ...args); } } throw new Error(`Unknown TokenizerModel type: ${config.type}`); } } /** * Internal function to call the TokenizerModel instance. * @param {string[]} tokens The tokens to encode. * @returns {string[]} The encoded tokens. */ _call(tokens) { tokens = this.encode(tokens); if (this.fuse_unk) { // Fuse unknown tokens tokens = fuse_unk(tokens, this.tokens_to_ids, this.unk_token_id); } return tokens; } /** * Encodes a list of tokens into a list of token IDs. * @param {string[]} tokens The tokens to encode. * @returns {string[]} The encoded tokens. * @throws Will throw an error if not implemented in a subclass. */ encode(tokens) { throw Error("encode should be implemented in subclass.") } /** * Converts a list of tokens into a list of token IDs. * @param {string[]} tokens The tokens to convert. * @returns {number[]} The converted token IDs. */ convert_tokens_to_ids(tokens) { return tokens.map(t => this.tokens_to_ids.get(t) ?? this.unk_token_id); } /** * Converts a list of token IDs into a list of tokens. * @param {number[]|bigint[]} ids The token IDs to convert. * @returns {string[]} The converted tokens. */ convert_ids_to_tokens(ids) { return ids.map(i => this.vocab[i] ?? this.unk_token); } } /** * A subclass of TokenizerModel that uses WordPiece encoding to encode tokens. * @extends TokenizerModel */ class WordPieceTokenizer extends TokenizerModel { /** * @param {Object} config The configuration object. * @param {Object} config.vocab A mapping of tokens to ids. * @param {string} config.unk_token The unknown token string. * @param {string} config.continuing_subword_prefix The prefix to use for continuing subwords. * @param {number} [config.max_input_chars_per_word=100] The maximum number of characters per word. 
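*
* **Example (an illustrative sketch; the tiny vocabulary is made up):** WordPiece greedily matches the
* longest vocabulary entry from the start of the word and prefixes continuations with `##`. In practice
* the model is built for you from `tokenizer.json` via `AutoTokenizer.from_pretrained`.
* ```javascript
* const model = TokenizerModel.fromConfig({
*   type: 'WordPiece',
*   vocab: { '[UNK]': 0, 'un': 1, '##aff': 2, '##able': 3 },
*   unk_token: '[UNK]',
*   continuing_subword_prefix: '##',
* });
* model.encode(['unaffable']);                  // ['un', '##aff', '##able']
* model.convert_tokens_to_ids(['un', '##aff']); // [1, 2]
* ```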
*/ constructor(config) { super(config); /** * A mapping of tokens to ids. * @type {Map<string, number>} */ this.tokens_to_ids = objectToMap(config.vocab); /** * The id of the unknown token. * @type {number} */ this.unk_token_id = this.tokens_to_ids.get(config.unk_token); /** * The unknown token string. * @type {string} */ this.unk_token = config.unk_token; /** * The maximum number of characters allowed per word. * @type {number} */ this.max_input_chars_per_word = config.max_input_chars_per_word ?? 100; /** * An array of tokens. * @type {string[]} */ this.vocab = new Array(this.tokens_to_ids.size); for (const [key, value] of this.tokens_to_ids) { this.vocab[value] = key; } } /** * Encodes an array of tokens using WordPiece encoding. * @param {string[]} tokens The tokens to encode. * @returns {string[]} An array of encoded tokens. */ encode(tokens) { const outputTokens = []; for (const token of tokens) { const chars = [...token]; if (chars.length > this.max_input_chars_per_word) { outputTokens.push(this.unk_token); continue; } let isUnknown = false; let start = 0; const subTokens = []; while (start < chars.length) { let end = chars.length; let currentSubstring = null; while (start < end) { let substr = chars.slice(start, end).join(''); if (start > 0) { substr = this.config.continuing_subword_prefix + substr; } if (this.tokens_to_ids.has(substr)) { currentSubstring = substr; break; } --end; } if (currentSubstring === null) { isUnknown = true; break; } subTokens.push(currentSubstring); start = end; } if (isUnknown) { outputTokens.push(this.unk_token); } else { outputTokens.push(...subTokens); } } return outputTokens; } } /** * Class representing a Unigram tokenizer model. * @extends TokenizerModel */ class Unigram extends TokenizerModel { /** * Create a new Unigram tokenizer model. * @param {Object} config The configuration object for the Unigram model. * @param {number} config.unk_id The ID of the unknown token * @param {[string, number][]} config.vocab A 2D array representing a mapping of tokens to scores. * @param {Object} moreConfig Additional configuration object for the Unigram model. */ constructor(config, moreConfig) { super(config); const vocabSize = config.vocab.length; this.vocab = new Array(vocabSize); /** @type {number[]} */ this.scores = new Array(vocabSize); for (let i = 0; i < vocabSize; ++i) { [this.vocab[i], this.scores[i]] = config.vocab[i]; } this.unk_token_id = config.unk_id; this.unk_token = this.vocab[config.unk_id]; this.tokens_to_ids = new Map(this.vocab.map((x, i) => [x, i])); this.bos_token = ' '; // beginning of a sentence token this.bos_token_id = this.tokens_to_ids.get(this.bos_token); // NOTE: may be undefined this.eos_token = moreConfig.eos_token; this.eos_token_id = this.tokens_to_ids.get(this.eos_token); this.unk_token = this.vocab[this.unk_token_id]; this.minScore = min(this.scores)[0]; this.unk_score = this.minScore - 10.0; this.scores[this.unk_token_id] = this.unk_score; this.trie = new CharTrie(); this.trie.extend(this.vocab); // NOTE: `fuse_unk` is hardcoded to true for Unigram models // See: https://github.com/huggingface/tokenizers/blob/b58227c7f1ccf8b73ee2268354336da56d91e492/tokenizers/src/models/unigram/model.rs#L119 this.fuse_unk = true; } /** * Populates lattice nodes. * @param {TokenLattice} lattice The token lattice to populate with nodes. 
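*
* **Example (a toy sketch with made-up scores):** the lattice collects every vocabulary entry that
* matches at each position, and the subsequent Viterbi search keeps the segmentation with the highest
* total (log-probability) score.
* ```javascript
* const scores = { 'he': -1.0, 'llo': -1.5, 'hello': -2.0 }; // hypothetical unigram scores
* const whole = scores['hello'];              // -2.0
* const split = scores['he'] + scores['llo']; // -2.5
* console.log(whole > split); // true → "hello" is kept as a single token
* ```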
*/ populateNodes(lattice) { const chars = lattice.chars; const mblen = 1; let beginPos = 0; while (beginPos < chars.length) { let hasSingleNode = false; const tokens = []; const sliced = chars.slice(beginPos).join(''); const prefixedTokens = this.trie.commonPrefixSearch(sliced); for (const token of prefixedTokens) { tokens.push(token); const tokenId = this.tokens_to_ids.get(token); const tokenScore = this.scores[tokenId]; const n = len(token); lattice.insert(beginPos, n, tokenScore, tokenId); if (!hasSingleNode && n === mblen) { hasSingleNode = true; } } if (!hasSingleNode) { lattice.insert(beginPos, mblen, this.unk_score, this.unk_token_id); } beginPos += mblen; } } /** * Encodes an array of tokens into an array of subtokens using the unigram model. * * @param {string} normalized The normalized string. * @returns {string[]} An array of subtokens obtained by encoding the input tokens using the unigram model. */ tokenize(normalized) { const lattice = new TokenLattice(normalized, this.bos_token_id, this.eos_token_id); this.populateNodes(lattice); return lattice.tokens(); } /** * Encodes an array of tokens using Unigram encoding. * @param {string[]} tokens The tokens to encode. * @returns {string[]} An array of encoded tokens. */ encode(tokens) { const toReturn = []; for (const token of tokens) { const tokenized = this.tokenize(token); toReturn.push(...tokenized); } return toReturn; } } /** * Returns list of utf-8 byte and a mapping to unicode strings. * Specifically avoids mapping to whitespace/control characters the BPE code barfs on. * @returns {Object} Object with utf-8 byte keys and unicode string values. */ const BYTES_TO_UNICODE = (() => { // Returns list of utf-8 byte and a mapping to unicode strings. // We specifically avoids mapping to whitespace/control characters // the bpe code barfs on. const bs = [ ...Array.from({ length: "~".charCodeAt(0) - "!".charCodeAt(0) + 1 }, (_, i) => i + "!".charCodeAt(0)), ...Array.from({ length: "¬".charCodeAt(0) - "¡".charCodeAt(0) + 1 }, (_, i) => i + "¡".charCodeAt(0)), ...Array.from({ length: "ÿ".charCodeAt(0) - "®".charCodeAt(0) + 1 }, (_, i) => i + "®".charCodeAt(0)), ]; const cs = bs.slice(); let n = 0; for (let b = 0; b < 256; ++b) { if (!bs.includes(b)) { bs.push(b); cs.push(256 + n); n += 1; } } const ccs = cs.map(n => String.fromCharCode(n)); return Object.fromEntries(bs.map((b, i) => [b, ccs[i]])); })(); const UNICODE_TO_BYTES = reverseDictionary(BYTES_TO_UNICODE); /** * @typedef {Object} BPENode * @property {string} token The token associated with the node * @property {number} bias A positional bias for the node. * @property {number} [score] The score of the node. * @property {BPENode} [prev] The previous node in the linked list. * @property {BPENode} [next] The next node in the linked list. */ /** * BPE class for encoding text into Byte-Pair-Encoding (BPE) tokens. * @extends TokenizerModel */ class BPE extends TokenizerModel { /** * Create a BPE instance. * @param {Object} config The configuration object for BPE. * @param {Object} config.vocab A mapping of tokens to ids. * @param {string[]|[string, string][]} config.merges An array of BPE merges as strings. * @param {string} config.unk_token The unknown token used for out of vocabulary words. * @param {string} config.end_of_word_suffix The suffix to place at the end of each word. * @param {string} [config.continuing_subword_suffix] The suffix to insert between words. 
* @param {boolean} [config.byte_fallback=false] Whether to use spm byte-fallback trick (defaults to False) * @param {boolean} [config.ignore_merges=false] Whether or not to match tokens with the vocab before using merges. */ constructor(config) { super(config); /** @type {Map<string, number>} */ this.tokens_to_ids = objectToMap(config.vocab); this.unk_token_id = this.tokens_to_ids.get(config.unk_token); this.unk_token = config.unk_token; this.vocab = new Array(this.tokens_to_ids.size); for (const [key, value] of this.tokens_to_ids) { this.vocab[value] = key; } // Tokenizers >= 0.20.0 serializes BPE merges as a [string, string][] instead of a string[], // which resolves the ambiguity for merges containing spaces. const use_new_merge_format = Array.isArray(config.merges[0]); /** @type {[string, string][]} */ this.merges = use_new_merge_format ? /** @type {[string, string][]} */(config.merges) : (/** @type {string[]} */(config.merges)).map(x => /** @type {[string, string]} */(x.split(' ', 2))); this.bpe_ranks = new Map(this.merges.map((x, i) => [JSON.stringify(x), i])); this.end_of_word_suffix = config.end_of_word_suffix; // NOTE: `continuing_subword_suffix` is custom (to support `BlenderbotSmallTokenizer`) this.continuing_subword_suffix = config.continuing_subword_suffix ?? null; this.byte_fallback = this.config.byte_fallback ?? false; if (this.byte_fallback) { this.text_encoder = new TextEncoder(); } this.ignore_merges = this.config.ignore_merges ?? false; /** * The maximum length we should cache in a model. * Strings that are too long have minimal chances to cache hit anyway */ this.max_length_to_cache = 256; /** * The default capacity for a `BPE`'s internal cache. */ this.cache_capacity = 10000; this.cache = new LRUCache(this.cache_capacity); } /** * Clears the cache. */ clear_cache() { this.cache.clear(); } /** * Apply Byte-Pair-Encoding (BPE) to a given token. Efficient heap-based priority * queue implementation adapted from https://github.com/belladoreai/llama-tokenizer-js. * @param {string} token The token to encode. * @returns {string[]} The BPE encoded tokens. */ bpe(token) { if (token.length === 0) { return []; } const cached = this.cache.get(token); if (cached !== undefined) { return cached; } const word = Array.from(token); if (this.end_of_word_suffix) { word[word.length - 1] += this.end_of_word_suffix; } let result = []; if (word.length > 1) { // Create a priority queue to store the nodes that will be merged. // The comparator function compares the scores of the nodes. const queue = new PriorityQueue((a, b) => a.score < b.score); // Construct a doubly-linked list of nodes that will be inserted into the priority queue, // starting with the individual characters. We also populate each node with a positional // bias to break ties in the priority queue. let startingNode = { token: word[0], bias: 0, prev: null, next: null, } let previousNode = startingNode for (let i = 1; i < word.length; ++i) { const currentNode = { bias: i / word.length, // Add fractional component to break ties token: word[i], prev: previousNode, next: null, } previousNode.next = currentNode this._add_node(queue, previousNode) previousNode = currentNode } while (!queue.isEmpty()) { // Get the next node with the highest priority const node = queue.pop(); // Check that this merge is still possible if (node.deleted || !node.next || node.next.deleted) continue; // Here, we mark the current node (left side of the merge) and the next node (right side of the merge) as deleted. 
// This is because they will both be replaced by a new node representing the merge result. node.deleted = true; node.next.deleted = true; // Next, we fix the node that comes before the current node (i.e., left side of the merge). if (node.prev) { // Make a shallow copy of the previous node const newPreviousNode = { ...node.prev }; // Mark the old previous node as deleted. This avoids erroneous merges later, // because there may still be references to this node in the priority queue. node.prev.deleted = true; node.prev = newPreviousNode; // Update the reference of the previous node, by pointing its previous node to this new previous node. if (newPreviousNode.prev) { newPreviousNode.prev.next = newPreviousNode; } else { // If the previous of the previous node does not exist, it means that // `newPreviousNode` must be the new `startingNode`. startingNode = newPreviousNode; } } // Create a new node which represents the result of the merge. const merged = { token: node.token + node.next.token, bias: node.bias, prev: node.prev, next: node.next.next, } // We now consider where we can add the new merged node to the priority queue: // 1. prev <-> merged if (merged.prev) { merged.prev.next = merged; this._add_node(queue, merged.prev); } else { // If `merged.prev` does not exist, then `merged` must be the new `startingNode`. startingNode = merged; } // 2. merged <-> next if (merged.next) { merged.next.prev = merged; this._add_node(queue, merged); } } // Traverse the linked list, starting from the `startingNode`, and collect the tokens. for (let currentNode = startingNode; currentNode !== null; currentNode = currentNode.next) { result.push(currentNode.token); } } else { result = word; } // Possibly append suffix if (this.continuing_subword_suffix) { // Do not append suffix to the last token for (let i = 0; i < result.length - 1; ++i) { result[i] += this.continuing_subword_suffix; } } if (token.length < this.max_length_to_cache) { // Save the result to the cache this.cache.put(token, result); } return result; } /** * Helper function to add a node to the priority queue. * @param {PriorityQueue} queue * @param {BPENode} node * @private */ _add_node(queue, node) { // `score` is a measure of the merge priority: lower means higher priority // We use the BPE rank as a measure of priority (i.e., the local of the merge in the merges list) // We also add a fractional component to the score to break ties (with the earlier character having higher priority) const rank = this.bpe_ranks.get(JSON.stringify([node.token, node.next.token])); if (rank !== undefined) { node.score = rank + node.bias; queue.push(node); } } /** * Encodes the input sequence of tokens using the BPE algorithm and returns the resulting subword tokens. * @param {string[]} tokens The input sequence of tokens to encode. * @returns {string[]} The resulting subword tokens after applying the BPE algorithm to the input sequence of tokens. */ encode(tokens) { const outputTokens = []; for (const token of tokens) { if (this.ignore_merges && this.tokens_to_ids.has(token)) { outputTokens.push(token); continue; } const bpe_token_list = this.bpe(token); for (const t of bpe_token_list) { if (this.tokens_to_ids.has(t)) { outputTokens.push(t); } else if (this.byte_fallback) { const byteTokens = Array.from(this.text_encoder.encode(t)) .map(x => `<0x${x.toString(16).toUpperCase().padStart(2, '0')}>`); if (byteTokens.every(x => this.tokens_to_ids.has(x))) { // Ensure the byte tokens are actually in the vocabulary, otherwise // we fall back to the unknown token. 
For more information, see // https://github.com/huggingface/transformers/issues/28096. outputTokens.push(...byteTokens); } else { outputTokens.push(this.unk_token); } } else { outputTokens.push(this.unk_token); } } } return outputTokens; } } /** * Legacy tokenizer class for tokenizers with only a vocabulary. */ class LegacyTokenizerModel extends TokenizerModel { /** * Create a LegacyTokenizerModel instance. * @param {Object} config The configuration object for LegacyTokenizerModel. * @param {Object} config.vocab A (possibly nested) mapping of tokens to ids. * @param {Object} moreConfig Additional configuration object for the LegacyTokenizerModel model. */ constructor(config, moreConfig) { super(config); /**@type {Map<string, number>} */ this.tokens_to_ids = objectToMap( moreConfig.target_lang ? config.vocab[moreConfig.target_lang] : config.vocab ); this.bos_token = moreConfig.bos_token; this.bos_token_id = this.tokens_to_ids.get(this.bos_token); this.eos_token = moreConfig.eos_token; this.eos_token_id = this.tokens_to_ids.get(this.eos_token); this.pad_token = moreConfig.pad_token; this.pad_token_id = this.tokens_to_ids.get(this.pad_token); this.unk_token = moreConfig.unk_token; this.unk_token_id = this.tokens_to_ids.get(this.unk_token); this.vocab = new Array(this.tokens_to_ids.size); for (const [key, value] of this.tokens_to_ids) { this.vocab[value] = key; } } encode(tokens) { return tokens; } } /** * A base class for text normalization. * @abstract */ class Normalizer extends Callable { /** * @param {Object} config The configuration object for the normalizer. */ constructor(config) { super(); this.config = config; } /** * Factory method for creating normalizers from config objects. * @static * @param {Object} config The configuration object for the normalizer. * @returns {Normalizer} A Normalizer object. * @throws {Error} If an unknown Normalizer type is specified in the config. */ static fromConfig(config) { if (config === null) return null; switch (config.type) { case 'BertNormalizer': return new BertNormalizer(config); case 'Precompiled': return new Precompiled(config); case 'Sequence': return new NormalizerSequence(config); case 'Replace': return new Replace(config); case 'NFC': return new NFC(config); case 'NFD': return new NFD(config); case 'NFKC': return new NFKC(config); case 'NFKD': return new NFKD(config); case 'Strip': return new StripNormalizer(config); case 'StripAccents': return new StripAccents(config); case 'Lowercase': return new Lowercase(config); case 'Prepend': return new Prepend(config); default: throw new Error(`Unknown Normalizer type: ${config.type}`); } } /** * Normalize the input text. * @abstract * @param {string} text The text to normalize. * @returns {string} The normalized text. * @throws {Error} If this method is not implemented in a subclass. */ normalize(text) { throw Error("normalize should be implemented in subclass.") } /** * Alias for {@link Normalizer#normalize}. * @param {string} text The text to normalize. * @returns {string} The normalized text. */ _call(text) { return this.normalize(text); } } /** * Replace normalizer that replaces occurrences of a pattern with a given string or regular expression. * @extends Normalizer */ class Replace extends Normalizer { /** * Normalize the input text by replacing the pattern with the content. * @param {string} text The input text to be normalized. * @returns {string} The normalized text after replacing the pattern with the content. 
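*
* **Example (illustrative; `Replace` is internal to this module and built via `Normalizer.fromConfig`):**
* the SentencePiece-style configuration that swaps spaces for the `▁` metasymbol.
* ```javascript
* const replace = Normalizer.fromConfig({
*   type: 'Replace',
*   pattern: { String: ' ' },
*   content: '\u2581', // '▁'
* });
* replace.normalize('Hello world'); // 'Hello▁world'
* ```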
*/ normalize(text) { const pattern = createPattern(this.config.pattern); return pattern === null ? text : text.replaceAll(pattern, this.config.content); } } /** * A normalizer that applies Unicode normalization to the input text. * @extends Normalizer * @abstract */ class UnicodeNormalizer extends Normalizer { /** * @type {string} The Unicode normalization form to apply. * Should be one of: 'NFC', 'NFD', 'NFKC', or 'NFKD'. */ form = undefined; /** * Normalize the input text by applying Unicode normalization. * @param {string} text The input text to be normalized. * @returns {string} The normalized text. */ normalize(text) { text = text.normalize(this.form) return text; } } /** * A normalizer that applies Unicode normalization form C (NFC) to the input text. * Canonical Decomposition, followed by Canonical Composition. * @extends UnicodeNormalizer */ class NFC extends UnicodeNormalizer { form = 'NFC'; } /** * A normalizer that applies Unicode normalization form D (NFD) to the input text. * Canonical Decomposition. * @extends UnicodeNormalizer */ class NFD extends UnicodeNormalizer { form = 'NFD'; } /** * A normalizer that applies Unicode normalization form KC (NFKC) to the input text. * Compatibility Decomposition, followed by Canonical Composition. * @extends UnicodeNormalizer */ class NFKC extends UnicodeNormalizer { form = 'NFKC'; } /** * A normalizer that applies Unicode normalization form KD (NFKD) to the input text. * Compatibility Decomposition. * @extends UnicodeNormalizer */ class NFKD extends UnicodeNormalizer { form = 'NFKD'; } /** * A normalizer that strips leading and/or trailing whitespace from the input text. */ class StripNormalizer extends Normalizer { /** * Strip leading and/or trailing whitespace from the input text. * @param {string} text The input text. * @returns {string} The normalized text. */ normalize(text) { if (this.config.strip_left && this.config.strip_right) { // Fast path to avoid an extra trim call text = text.trim(); } else { if (this.config.strip_left) { text = text.trimStart(); } if (this.config.strip_right) { text = text.trimEnd(); } } return text; } } /** * StripAccents normalizer removes all accents from the text. * @extends Normalizer */ class StripAccents extends Normalizer { /** * Remove all accents from the text. * @param {string} text The input text. * @returns {string} The normalized text without accents. */ normalize(text) { text = remove_accents(text); return text; } } /** * A Normalizer that lowercases the input string. * @extends Normalizer */ class Lowercase extends Normalizer { /** * Lowercases the input string. * @param {string} text The text to normalize. * @returns {string} The normalized text. */ normalize(text) { text = text.toLowerCase(); return text; } } /** * A Normalizer that prepends a string to the input string. * @extends Normalizer */ class Prepend extends Normalizer { /** * Prepends the input string. * @param {string} text The text to normalize. * @returns {string} The normalized text. */ normalize(text) { text = this.config.prepend + text; return text; } } /** * A Normalizer that applies a sequence of Normalizers. * @extends Normalizer */ class NormalizerSequence extends Normalizer { /** * Create a new instance of NormalizerSequence. * @param {Object} config The configuration object. * @param {Object[]} config.normalizers An array of Normalizer configuration objects. 
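*
* **Example (illustrative; built via the internal `Normalizer.fromConfig` factory):** chain NFD
* decomposition, accent stripping, and lowercasing, applied left to right.
* ```javascript
* const normalizer = Normalizer.fromConfig({
*   type: 'Sequence',
*   normalizers: [{ type: 'NFD' }, { type: 'StripAccents' }, { type: 'Lowercase' }],
* });
* normalizer.normalize('Héllo Wörld'); // 'hello world'
* ```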
*/ constructor(config) { super(config); this.normalizers = config.normalizers.map(x => Normalizer.fromConfig(x)); } /** * Apply a sequence of Normalizers to the input text. * @param {string} text The text to normalize. * @returns {string} The normalized text. */ normalize(text) { return this.normalizers.reduce((t, normalizer) => { return normalizer.normalize(t); }, text); } } /** * A class representing a normalizer used in BERT tokenization. * @extends Normalizer */ class BertNormalizer extends Normalizer { /** * Adds whitespace around any CJK (Chinese, Japanese, or Korean) character in the input text. * * @param {string} text The input text to tokenize. * @returns {string} The tokenized text with whitespace added around CJK characters. */ _tokenize_chinese_chars(text) { /* Adds whitespace around any CJK character. */ const output = []; for (let i = 0; i < text.length; ++i) { const char = text[i]; const cp = char.charCodeAt(0); if (is_chinese_char(cp)) { output.push(" "); output.push(char); output.push(" "); } else { output.push(char); } } return output.join(""); } /** * Strips accents from the given text. * @param {string} text The text to strip accents from. * @returns {string} The text with accents removed. */ stripAccents(text) { // "Mark, Nonspacing" (Mn) return text.normalize('NFD').replace(/\p{Mn}/gu, ''); } /** * Checks whether `char` is a control character. * @param {string} char The character to check. * @returns {boolean} Whether `char` is a control character. * @private */ _is_control(char) { switch (char) { case '\t': case '\n': case '\r': // These are technically control characters but we count them as whitespace characters. return false; default: // Check if unicode category starts with C: // Cc - Control // Cf - Format // Co - Private Use // Cs - Surrogate return /^\p{Cc}|\p{Cf}|\p{Co}|\p{Cs}$/u.test(char); } } /** * Performs invalid character removal and whitespace cleanup on text. * @param {string} text The text to clean. * @returns {string} The cleaned text. * @private */ _clean_text(text) { const output = []; for (const char of text) { const cp = char.charCodeAt(0); if (cp === 0 || cp === 0xFFFD || this._is_control(char)) { continue; } if (/^\s$/.test(char)) { // is whitespace output.push(" "); } else { output.push(char); } } return output.join(""); } /** * Normalizes the given text based on the configuration. * @param {string} text The text to normalize. * @returns {string} The normalized text. */ normalize(text) { if (this.config.clean_text) { text = this._clean_text(text); } if (this.config.handle_chinese_chars) { text = this._tokenize_chinese_chars(text); } if (this.config.lowercase) { text = text.toLowerCase(); if (this.config.strip_accents !== false) { text = this.stripAccents(text); } } else if (this.config.strip_accents) { text = this.stripAccents(text); } return text; } } /** * A callable class representing a pre-tokenizer used in tokenization. Subclasses * should implement the `pre_tokenize_text` method to define the specific pre-tokenization logic. * @extends Callable */ class PreTokenizer extends Callable { /** * Factory method that returns an instance of a subclass of `PreTokenizer` based on the provided configuration. * * @static * @param {Object} config A configuration object for the pre-tokenizer. * @returns {PreTokenizer} An instance of a subclass of `PreTokenizer`. * @throws {Error} If the provided configuration object does not correspond to any known pre-tokenizer. 
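*
* **Example (illustrative; the `WhitespaceSplit` behaviour is assumed to match the `whitespace_split`
* helper defined above):** build a pre-tokenizer from a config object and split a sentence.
* ```javascript
* const pre_tokenizer = PreTokenizer.fromConfig({ type: 'WhitespaceSplit' });
* pre_tokenizer.pre_tokenize('Hello, world!'); // ['Hello,', 'world!']
* ```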
*/ static fromConfig(config) { if (config === null) return null; switch (config.type) { case 'BertPreTokenizer': return new BertPreTokenizer(config); case 'Sequence': return new PreTokenizerSequence(config); case 'Whitespace': return new WhitespacePreTokenizer(config); case 'WhitespaceSplit': return new WhitespaceSplit(config); case 'Metaspace': return new MetaspacePreTokenizer(config); case 'ByteLevel': return new ByteLevelPreTokenizer(config); case 'Split': return new SplitPreTokenizer(config); case 'Punctuation': return new PunctuationPreTokenizer(config); case 'Digits': return new DigitsPreTokenizer(config); case 'Replace': return new ReplacePreTokenizer(config); default: throw new Error(`Unknown PreTokenizer type: ${config.type}`); } } /** * Method that should be implemented by subclasses to define the specific pre-tokenization logic. * * @abstract * @param {string} text The text to pre-tokenize. * @param {Object} [options] Additional options for the pre-tokenization logic. * @returns {string[]} The pre-tokenized text. * @throws {Error} If the method is not implemented in the subclass. */ pre_tokenize_text(text, options) { throw Error("pre_tokenize_text should be implemented in subclass.") } /** * Tokenizes the given text into pre-tokens. * @param {string|string[]} text The text or array of texts to pre-tokenize. * @param {Object} [options] Additional options for the pre-tokenization logic. * @returns {string[]} An array of pre-tokens. */ pre_tokenize(text, options) { return (Array.isArray(text) ? text.map(x => this.pre_tokenize_text(x, options)) : this.pre_tokenize_text(text, options) ).flat(); } /** * Alias for {@link PreTokenizer#pre_tokenize}. * @param {string|string[]} text The text or array of texts to pre-tokenize. * @param {Object} [options] Additional options for the pre-tokenization logic. * @returns {string[]} An array of pre-tokens. */ _call(text, options) { return this.pre_tokenize(text, options); } } /** * @extends PreTokenizer */ class BertPreTokenizer extends PreTokenizer { /** * A PreTokenizer that splits text into wordpieces using a basic tokenization scheme * similar to that used in the original implementation of BERT. * * @param {Object} config The configuration object. */ constructor(config) { super(); // Construct a pattern which matches the rust implementation: // https://github.com/huggingface/tokenizers/blob/b4fcc9ce6e4ad5806e82826f816acfdfdc4fcc67/tokenizers/src/pre_tokenizers/bert.rs#L11 // Equivalent to removing whitespace and splitting