UNPKG

@tensorflow/tfjs-layers

Version:

TensorFlow layers API in JavaScript

230 lines 32.7 kB
/**
 * @license
 * Copyright 2023 Google LLC.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 */
/* Original source: keras-nlp/byte_pair_tokenizer.py */
import { Tensor, tensor } from '@tensorflow/tfjs-core';
import { ValueError } from '../../errors';
import { matchAll } from './match_all_polyfill';
import { tensorArrTo2DArr, tensorToArr } from './utils';

/**
 * Builds the byte <-> unicode character table used by byte-level BPE.
 *
 * Bytes in the ranges '!'..'~', '¡'..'¬' and '®'..'ÿ' map to the character
 * with the same code. Every other byte (0..255) maps, in ascending byte
 * order, to the code points 256, 257, ... so that no byte is represented by
 * a whitespace or control character.
 *
 * @returns A pair `[bytes, chars]` where `bytes` is a `Uint8Array` of byte
 *     values and `chars[i]` is the single-character string for `bytes[i]`.
 */
export function bytesToUnicode() {
    const inclusiveRange = (start, end) => Array.from({ length: (end - start + 1) }, (v, k) => k + start);
    const bs = [
        ...inclusiveRange('!'.charCodeAt(0), '~'.charCodeAt(0)),
        ...inclusiveRange('¡'.charCodeAt(0), '¬'.charCodeAt(0)),
        ...inclusiveRange('®'.charCodeAt(0), 'ÿ'.charCodeAt(0))
    ];
    const cs = [...bs];
    // Bytes that already map to themselves; everything else gets remapped.
    const selfMapped = new Set(bs);
    let n = 0;
    // Removes mapping an int to a whitespace character
    for (let b = 0; b < 2 ** 8; b++) {
        if (!selfMapped.has(b)) {
            bs.push(b);
            cs.push(2 ** 8 + n);
            n++;
        }
    }
    const chars = cs.map(code => String.fromCharCode(code));
    // TODO(orderique): Verify same functionality.
    const bytes = Uint8Array.from(bs);
    return [bytes, chars];
}

/**
 * StaticHashTable includes a `lookup` function for multiple keys at once.
 *
 * The table is built once from parallel `keys` / `values` arrays; keys that
 * are not present resolve to `defaultValue`.
 */
export class StaticHashTable {
    /**
     * @param keys Keys of the table.
     * @param values Values of the table; `values[i]` belongs to `keys[i]`.
     * @param defaultValue Value returned for keys that are not in the table.
     * @throws ValueError if `keys` and `values` differ in length.
     */
    constructor(keys, values, defaultValue) {
        this.defaultValue = defaultValue;
        if (keys.length !== values.length) {
            throw new ValueError(`keys and values arrays must be same length.
        Instead got lengths ${keys.length} and ${values.length}.`);
        }
        const keyValPairs = [];
        for (let idx = 0; idx < keys.length; idx++) {
            const key = keys[idx];
            const val = values[idx];
            keyValPairs.push([key, val]);
        }
        this._map = new Map(keyValPairs);
    }

    /**
     * Returns the value stored for `key`, or `defaultValue` when the key is
     * absent.
     */
    get(key) {
        if (this._map.has(key)) {
            return this._map.get(key);
        }
        return this.defaultValue;
    }

    /**
     * Looks up every element of every tensor in `keys`.
     *
     * @param keys Tensors whose elements are used as keys.
     * @returns One tensor per input tensor, with the same shape, holding the
     *     looked-up values.
     */
    lookup(keys) {
        const values = keys.map(t => {
            const innerValues = [];
            for (const key of t.dataSync()) {
                innerValues.push(this.get(key));
            }
            return tensor(innerValues, t.shape);
        });
        return values;
    }
}

/**
 * Convenience factory for `StaticHashTable`.
 */
export function createStaticHashtable(keys, values, defaultVal) {
    return new StaticHashTable(keys, values, defaultVal);
}

/**
 * Cache that stores the encoded result of seen tokens.
 *
 * The cache key is a string tensor or an array of strings, and the value is
 * split tokens joined by whitespace. For example, "dragonfly" => "dragon fly"
 *
 * Examples:
 *
 * ```js
 * const cache = new BytePairTokenizerCache();
 * cache.insert(["butterfly", "dragonfly"], ["but ter fly", "dragon fly"]);
 * cache.lookup(["butterfly"]);
 * ```
 */
export class BytePairTokenizerCache {
    constructor() {
        this._cache = new Map();
    }

    /**
     * Returns the cached encoding for `key`, or the empty string when the key
     * has not been inserted.
     */
    get(key) {
        if (this._cache.has(key)) {
            return this._cache.get(key);
        }
        return '';
    }

    /**
     * Insert token <=> encoded outputs pairs.
     *
     * @param keys Tokens, as a tensor or an array of strings.
     * @param values Encoded outputs; `values[i]` is stored under `keys[i]`.
     * @returns This cache, to allow chaining.
     */
    insert(keys, values) {
        const arrKeys = keys instanceof Tensor ? keys.dataSync() : keys;
        for (const [idx, key] of arrKeys.entries()) {
            this._cache.set(key, values[idx]);
        }
        return this;
    }

    /**
     * Look up the encoded outputs of given tokens.
     *
     * @param keys Tokens, as a tensor or an array of strings.
     * @returns The cached encodings, with '' for tokens that were never
     *     inserted.
     */
    lookup(keys) {
        const arrKeys = keys instanceof Tensor ? keys.dataSync() : keys;
        return arrKeys.map(key => this.get(key));
    }
}

/**
 * Remove certain strings from input tensor.
 *
 * @param inputs Tensors of strings (converted with `tensorArrTo2DArr`,
 *     presumably to one string array per tensor - confirm in `./utils`).
 * @param stringToRemove Elements strictly equal to this string are dropped.
 * @returns One new tensor per input, without the removed elements.
 */
export function removeStringsFromInputs(inputs, stringToRemove) {
    const stringArrInputs = tensorArrTo2DArr(inputs);
    const filteredStrArrays = stringArrInputs
        .map(arr => arr.filter(s => s !== stringToRemove));
    const filteredTensors = filteredStrArrays.map(arr => tensor(arr));
    return filteredTensors;
}

/**
 * Create alternates for all special tokens that will be not split during
 * tokenization.
 *
 * Each alternate is the prefix 'ĵ' followed by the token with apostrophes,
 * whitespace and every non-letter / non-digit character removed, so the
 * alternate survives the splitting regex as a single piece.
 */
export function createAltsForUnsplittableTokens(unsplittableTokens) {
    const prefix = 'ĵ';
    // Trim out splitters.
    const replacePattern = /'|\s+|[^\p{L}\p{N}]+/gu;
    return unsplittableTokens.map(token => prefix + token.replace(replacePattern, ''));
}

// Typescript and TF handles special spaces differently, we need to
// manually handle special spaces during string split.
// Only `.source` of this regex is used, as a fragment inside character
// classes of the patterns below.
const SPECIAL_WHITESPACES = /\u00A0\u2009\u202f\u3000/;

// String splitting regex pattern.
// `pL` and `pN` are character-class *contents* (letters incl. the 'ĵ' alt
// prefix, and digits); they must always be used inside `[...]`.
const pL = 'a-zA-ZáàâäãåçéèêëíìîïñóòôöõúùûüýÿæœÁÀÂÄÃÅÇÉÈÊËÍÌÎÏÑÓÒÔÖÕÚÙÛÜÝŸÆŒĵ';
const pN = '0-9';

/**
 * First-pass split pattern: contractions, whitespace runs that end in a
 * "६" marker / control whitespace / special space, letter runs, digit runs
 * and runs of anything else (each optionally preceded by one space).
 */
export const SPLIT_PATTERN_1 = new RegExp(`'s|'t|'re|'ve|'m|'ll|'d` +
    `|[\\s${SPECIAL_WHITESPACES.source}]+` +
    `[\\n\\r\\t\\f६${SPECIAL_WHITESPACES.source}]| ?[${pL}]+|` +
    ` ?[${pN}]+| ?[^\\s${pL}${pN}${SPECIAL_WHITESPACES.source}]+`, 'gu');

// Second-pass pattern: a single trailing whitespace character or "६".
const SPLIT_PATTERN_2 = new RegExp(`[\\s६${SPECIAL_WHITESPACES.source}]$`);

/** Concatenates an array of arrays into a single flat array (one level). */
function flatten(inputs) {
    return inputs.reduce((accumulator, value) => accumulator.concat(value), []);
}

/** Escapes regex metacharacters so `input` can be matched literally. */
function escapeRegExp(input) {
    return input.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
}

/**
 * Splits strings on a delimiter, optionally keeping the delimiters.
 *
 * @param strs Strings to split. When the first element is itself an array,
 *     every inner array is split and its results are flattened, so the
 *     output keeps one entry per inner array.
 * @param delimRegexPattern Delimiter. A `RegExp` is matched globally (a 'g'
 *     flag is added when missing). A string is used as a literal separator,
 *     unless `keepDelimRegexPattern` is true, in which case it is compiled
 *     as a regular expression.
 * @param keepDelimRegexPattern When true, matched delimiters are kept as
 *     separate pieces in the output.
 * @returns For each input string (or inner array) the list of non-empty
 *     pieces.
 */
export function regexSplit(strs, delimRegexPattern, keepDelimRegexPattern = false) {
    if (strs[0] instanceof Array) {
        const mapped = strs.map(arr => regexSplit(arr, delimRegexPattern, keepDelimRegexPattern));
        return mapped.map(flatten);
    }
    if (!(delimRegexPattern instanceof RegExp)) {
        // A capturing group makes String.prototype.split emit the delimiters.
        const separator = keepDelimRegexPattern ?
            new RegExp(`(${delimRegexPattern})`) : delimRegexPattern;
        return strs.map(str => str.split(separator).filter(s => s));
    }
    const regexPattern = delimRegexPattern.flags.includes('g') ?
        delimRegexPattern
        : new RegExp(delimRegexPattern.source, delimRegexPattern.flags + 'g');
    return strs.map(str => {
        const matches = matchAll(str, regexPattern);
        const splitString = [];
        let currIdx = 0;
        for (const match of matches) {
            const matchEnd = match.index + match[0].length;
            // Text between the previous delimiter and this one.
            splitString.push(str.slice(currIdx, match.index));
            if (keepDelimRegexPattern) {
                splitString.push(str.slice(match.index, matchEnd));
            }
            currIdx = matchEnd;
        }
        if (currIdx !== str.length) {
            splitString.push(str.slice(currIdx, str.length));
        }
        return splitString.filter(s => s);
    });
}

/**
 * Pre-splits strings into the pieces that byte-pair encoding operates on.
 *
 * @param inputs Tensor of strings (read through `tensorToArr`).
 * @param unsplittableTokens Optional special tokens that must be kept whole.
 * @returns One string tensor per input string, holding its pieces.
 */
export function splitStringsForBpe(inputs, unsplittableTokens) {
    // We need to recreate the exact behavior of token presplitting in the
    // original gpt2 implementation which uses a lookahead. We are using an
    // alternative by inserting a special token "६" before leading space of
    // non-space characters and after the trailing space, e.g.,
    // " tf" will be "६ tf".
    // The backslash must be doubled inside the template literal and only the
    // regex *source* may be interpolated; the 'g' flag mirrors the global
    // replacement done by the reference implementation.
    const pattern1 = new RegExp(`( )([^\\s${SPECIAL_WHITESPACES.source}])`, 'g');
    // Kept verbatim from the reference implementation (a sequence, not a
    // character class) so that outputs stay in sync with it.
    const pattern2 = new RegExp(`(\\s${SPECIAL_WHITESPACES.source})$`);
    const inputsStr = tensorToArr(inputs).map(str => str.replace(pattern1, '६$1$2').replace(pattern2, '$1६'));

    const hasUnsplittable = Boolean(unsplittableTokens) && unsplittableTokens.length > 0;
    let alts;
    let rawTokens = inputsStr;
    if (hasUnsplittable) {
        alts = createAltsForUnsplittableTokens(unsplittableTokens);
        for (const [idx, token] of unsplittableTokens.entries()) {
            const alt = alts[idx];
            const escapedToken = escapeRegExp(token);
            const tokenPattern = new RegExp(escapedToken);
            // Isolate each occurrence of the token, then swap it for its
            // alternate so the general split below cannot break it apart.
            rawTokens = regexSplit(rawTokens, escapedToken, true);
            rawTokens = rawTokens.map(arr => arr.map(t => t.replace(tokenPattern, alt)));
        }
    }
    rawTokens = regexSplit(rawTokens, SPLIT_PATTERN_1, true);
    // Second pass splits out the last whitespace char or "६".
    rawTokens = regexSplit(rawTokens, SPLIT_PATTERN_2, true);
    if (hasUnsplittable) {
        // Replace special tokens alternate with originals.
        for (const [idx, token] of unsplittableTokens.entries()) {
            const altPattern = new RegExp(escapeRegExp(alts[idx]));
            // Function replacer: the token is inserted literally even if it
            // contains `$` sequences such as "$&" or "$1".
            rawTokens = rawTokens.map(arr => arr.map(t => t.replace(altPattern, () => token)));
        }
    }
    return removeStringsFromInputs(rawTokens.map(tokens => tensor(tokens)), '६');
}