UNPKG

@atjsh/llmlingua-2

Version:

JavaScript/TypeScript Implementation of LLMLingua-2

102 lines 3.6 kB
// SPDX-License-Identifier: MIT // Equivalent to Python's string.punctuation const PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; /** * Implementation of `GetPureTokenFunction` for "XLM-RoBERTa Large" model. * * @category Adaptors */ export const get_pure_tokens_xlm_roberta_large = (token) => { return token ? token.replace(/^▁/, "") : ""; }; /** * Implementation of `GetPureTokenFunction` for "BERT Base Multilingual Cased" model. * * @category Adaptors */ export const get_pure_tokens_bert_base_multilingual_cased = (token) => { return token ? token.replace(/^##/, "") : ""; }; /** * Implementation of `IsBeginOfNewWordFunction` for "XLM-RoBERTa Large" model. * * @category Adaptors */ export const is_begin_of_new_word_xlm_roberta_large = (token, force_tokens = [], token_map = {}) => { if (token && (PUNCTUATION.includes(token) || force_tokens.includes(token) || Object.values(token_map).includes(token))) { return true; } return token?.startsWith("▁") || false; }; /** * Implementation of `IsBeginOfNewWordFunction` for "BERT Base Multilingual Cased" model. * * @category Adaptors */ export const is_begin_of_new_word_bert_base_multilingual_cased = (token, force_tokens = [], token_map = {}) => { if (force_tokens.includes(token ? token.replace(/^##/, "") : "") || Object.values(token_map).includes(token ? token.replace(/^##/, "") : "")) { return true; } return !token?.startsWith("##"); }; /** * Implementation on `replace_added_token` function of original LLMLingua implementation. * @see [Original Implementation](https://github.com/microsoft/LLMLingua/blob/e4e172afb42d8ae3c0b6cb271a3f5d6a812846a0/llmlingua/utils.py#L102) * * @category Utils */ export function replace_added_token(token, token_map) { let t = token; for (const [ori, added] of Object.entries(token_map)) { t = t.replaceAll(added, ori); } return t; } /** * Calculate the **p-th percentile** of a numeric array. * * The function follows the “inclusive” linear-interpolation rule used by Excel’s * `PERCENTILE.INC` and NumPy’s default percentile implementation: * * 1. The input array is **copied and sorted** (ascending) so the original order * is preserved. * 2. An index `k = (n − 1) × (p / 100)` is computed, where `n` is the array’s * length. * 3. If `k` is an integer, the element at that index is the percentile. * Otherwise, the result is the linear interpolation between the two nearest * ranks (`⌊k⌋` and `⌈k⌉`). * * @param {number[]} arr – Source data. The function does **not** mutate it. * @param {number} p – Desired percentile (0 ≤ `p` ≤ 100). * @returns {number} The computed percentile value. If the array is empty, * the function returns `0`. * * @throws {RangeError} If `p` is outside the 0–100 range. * * @example * const data = [7, 15, 36, 39, 40, 41]; * percentile(data, 25); // → 15 (1st quartile) * percentile(data, 50); // → 37.5 (median with interpolation) * percentile(data, 90); // → 40.5 * * @category Utils */ export function percentile(arr, p) { if (arr.length === 0) return 0; const sortedArr = [...arr].sort((a, b) => a - b); const k = (sortedArr.length - 1) * (p / 100); const f = Math.floor(k); const c = Math.ceil(k); if (f === c) { return sortedArr[f]; } const d0 = sortedArr[f] * (c - k); const d1 = sortedArr[c] * (k - f); return d0 + d1; } //# sourceMappingURL=utils.js.map