UNPKG

@visulima/string

Version:

Functions for manipulating strings.

visulima.com/packages/string

visulima/visulima

635 lines (630 loc) • 22.3 kB

JavaScript

import { stripVTControlCharacters } from 'node:util';
import { j as RE_EMOJI, s as stripEmoji, k as RE_SEPARATORS, R as RE_FAST_ANSI, l as RE_CYRILLIC, m as RE_LATIN, n as RE_GREEK, o as RE_GREEK_LATIN_SPLIT, p as RE_KANJI, q as RE_HEBREW, r as RE_ARABIC, t as RE_HANGUL, u as RE_UZBEK_LATIN_MODIFIER, v as RE_KATAKANA, w as RE_HIRAGANA, x as RE_DEVANAGARI, y as RE_BENGALI, z as RE_GUJARATI, A as RE_GURMUKHI, B as RE_KANNADA, C as RE_TAMIL, D as RE_TELUGU, E as RE_MALAYALAM, F as RE_SINHALA, G as RE_THAI, H as RE_LAO, I as RE_TIBETAN, J as RE_MYANMAR, K as RE_ETHIOPIC, L as RE_KHMER, M as RE_ORIYA } from '../packem_shared/constants-CKNmLDBQ.mjs';
import LRUCache from '../packem_shared/LRUCache-udNErhWw.mjs';

/** Cache of compiled separator regexes, keyed by the serialized separator list. */
const regexCache = new LRUCache(1e3);

/**
 * Builds (or fetches from the cache) a global regex that matches any of the given
 * literal separators.
 *
 * @param {string[]} separators - Literal separator strings; regex metacharacters are escaped.
 * @returns {RegExp} A `g`-flagged alternation of the escaped separators.
 */
const getSeparatorsRegex = (separators) => {
    // JSON.stringify keeps the element boundaries, so ["a", "b"] and ["ab"] no longer
    // share a cache entry (a plain join("") made them collide).
    const key = JSON.stringify(separators);

    if (regexCache.has(key)) {
        return regexCache.get(key);
    }

    const pattern = separators.map((s) => s.replaceAll(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|");
    const regex = new RegExp(pattern, "g");

    regexCache.set(key, regex);

    return regex;
};

/**
 * Splits text into alternating non-emoji and emoji segments, in order of appearance.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Non-empty segments; every RE_EMOJI match is its own segment.
 */
const splitByEmoji = (text) => {
    const segments = [];
    let lastIndex = 0;
    let match;

    // The exec loop below is driven by RE_EMOJI.lastIndex, so always start from 0.
    RE_EMOJI.lastIndex = 0;

    while ((match = RE_EMOJI.exec(text)) !== null) {
        if (match.index > lastIndex) {
            segments.push(text.slice(lastIndex, match.index));
        }

        segments.push(match[0]);
        lastIndex = RE_EMOJI.lastIndex;
    }

    if (lastIndex < text.length) {
        segments.push(text.slice(lastIndex));
    }

    return segments.filter(Boolean);
};

/**
 * Tests whether text contains an RE_EMOJI match, independent of any `lastIndex`
 * left behind by earlier `test`/`exec` calls on the shared regex.
 *
 * @param {string} text - Text to probe.
 * @returns {boolean} True when RE_EMOJI matches somewhere in the text.
 */
const containsEmoji = (text) => {
    RE_EMOJI.lastIndex = 0;

    const found = RE_EMOJI.test(text);

    // Leave the shared regex in a clean state for the next user.
    RE_EMOJI.lastIndex = 0;

    return found;
};

// ASCII classification tables: entry is 1 when the code is A-Z / a-z / 0-9 respectively.
const isUpperCode = new Uint8Array(128);
const isLowerCode = new Uint8Array(128);
const isDigitCode = new Uint8Array(128);

for (let index = 0; index < 128; index++) {
    isUpperCode[index] = index >= 65 && index <= 90 ? 1 : 0;
    isLowerCode[index] = index >= 97 && index <= 122 ? 1 : 0;
    isDigitCode[index] = index >= 48 && index <= 57 ? 1 : 0;
}

/** @returns {boolean} True when `code` is an ASCII code flagged in `table`. */
const isAsciiClass = (table, code) => code !== undefined && code < 128 && table[code] === 1;

/** @returns {boolean} True when `char` equals its own upper-cased form in `locale`. */
const isUpperIn = (char, locale) => char === char.toLocaleUpperCase(locale);

/**
 * Returns the first non-empty acronym that occurs in `s` at `position`.
 *
 * @param {string} s - Text being scanned.
 * @param {number} position - UTF-16 offset into `s`.
 * @param {Iterable<string>} knownAcronyms - Candidate acronyms, tried in iteration order.
 * @returns {string | undefined} The matching acronym, or undefined.
 */
const findAcronymAt = (s, position, knownAcronyms) => {
    for (const acronym of knownAcronyms) {
        // An empty acronym matches everywhere without consuming input, which would
        // make the callers loop forever; skip it.
        if (acronym.length > 0 && s.startsWith(acronym, position)) {
            return acronym;
        }
    }

    return undefined;
};

/**
 * Splits a string wherever the script (as reported by `scriptDetectors`) changes,
 * and optionally on lower-to-upper case transitions.
 *
 * @param {string} s - Text to split.
 * @param {Record<string, (char: string) => boolean>} scriptDetectors - Script name to
 *   predicate; the first predicate (in key order) that accepts a character names its type,
 *   otherwise the type is "other".
 * @param {boolean} caseSensitive - Also split on lower -> upper transitions (needs `locale`).
 * @param {string} [locale] - Locale used for upper-case detection.
 * @param {Function} [customSplitLogic] - Optional replacement for the default rule, called as
 *   (previousType, currentType, previousIsUpper, isUpper, char, index, chars).
 * @returns {string[]} The segments, or `[s]` when nothing was split.
 */
const handleScriptTransitions = (s, scriptDetectors, caseSensitive, locale, customSplitLogic) => {
    if (s.length === 0) {
        return [];
    }

    // Hoisted: the detector list does not change while scanning.
    const detectors = Object.entries(scriptDetectors);
    const classify = (char) => {
        for (const [type, detector] of detectors) {
            if (detector(char)) {
                return type;
            }
        }

        return "other";
    };
    const trackCase = Boolean(caseSensitive && locale);
    const isUpperChar = (char) => trackCase && isUpperIn(char, locale);

    // Iterate by code point so astral characters are classified as a whole
    // (probing s[0] would hand the detectors a lone surrogate).
    const chars = [...s];
    let previousType = classify(chars[0]);

    // Text that does not start in a known script is returned untouched unless
    // case-based splitting was requested.
    if (previousType === "other" && !caseSensitive) {
        return [s];
    }

    const result = [];
    let currentSegment = chars[0];
    let previousIsUpper = isUpperChar(chars[0]);

    for (let index = 1; index < chars.length; index++) {
        const char = chars[index];
        const currentType = classify(char);
        const isUpper = isUpperChar(char);
        let shouldSplit = false;

        if (customSplitLogic) {
            shouldSplit = customSplitLogic(previousType, currentType, previousIsUpper, isUpper, char, index, chars);
        } else {
            const scriptChanged = previousType !== currentType && previousType !== "other" && currentType !== "other";
            const caseBoundary = caseSensitive && currentType !== "other" && !previousIsUpper && isUpper;

            shouldSplit = Boolean(scriptChanged || caseBoundary);
        }

        if (shouldSplit) {
            result.push(currentSegment);
            currentSegment = char;
        } else {
            currentSegment += char;
        }

        previousType = currentType;
        previousIsUpper = isUpper;
    }

    if (currentSegment && currentSegment.length > 0) {
        result.push(currentSegment);
    }

    return result.length > 0 ? result : [s];
};

/**
 * ASCII-only camelCase / PascalCase / digit-boundary splitter.
 *
 * @param {string} s - Text to split (already free of separators).
 * @param {Set<string>} [knownAcronyms] - Acronyms kept as single tokens.
 * @returns {string[]} Non-empty tokens. A string that equals its upper-cased form
 *   is returned as a single token.
 */
const splitCamelCaseFast = (s, knownAcronyms = new Set()) => {
    if (s.length === 0) {
        return [];
    }

    if (s.toUpperCase() === s) {
        return [s];
    }

    let start = 0;
    const tokens = [];
    const width = s.length;

    for (let index = 1; index < width; index++) {
        if (knownAcronyms.size > 0) {
            const acronym = findAcronymAt(s, start, knownAcronyms);

            if (acronym !== undefined) {
                tokens.push(acronym);
                start += acronym.length;
                // The loop increment moves `index` onto the first character after the acronym.
                index = start - 1;
                continue;
            }
        }

        const previousCode = s.codePointAt(index - 1);
        const currentCode = s.codePointAt(index);
        const previousIsUpper = isAsciiClass(isUpperCode, previousCode);
        const currentIsUpper = isAsciiClass(isUpperCode, currentCode);
        const previousIsLower = isAsciiClass(isLowerCode, previousCode);
        const previousIsDigit = isAsciiClass(isDigitCode, previousCode);
        const currentIsDigit = isAsciiClass(isDigitCode, currentCode);

        // lower -> Upper ("fooBar" -> "foo" | "Bar")
        if (previousIsLower && currentIsUpper) {
            tokens.push(s.slice(start, index));
            start = index;
            continue;
        }

        // digit <-> non-digit ("foo2bar" -> "foo" | "2" | "bar")
        if (previousIsDigit !== currentIsDigit) {
            tokens.push(s.slice(start, index));
            start = index;
            continue;
        }

        // Upper Upper lower ("XMLHttp" -> "XML" | "Http"), unless the run up to and
        // including the current character is itself a known acronym.
        if (index + 1 < width) {
            const nextIsLower = isAsciiClass(isLowerCode, s.codePointAt(index + 1));

            if (previousIsUpper && currentIsUpper && nextIsLower && !knownAcronyms.has(s.slice(start, index + 1))) {
                tokens.push(s.slice(start, index));
                start = index;
            }
        }
    }

    if (start < width) {
        tokens.push(s.slice(start));
    }

    return tokens.filter((token) => token !== "");
};

/**
 * German rule set: splits on case changes and peels the last letter off an
 * upper-case run when a lower-case letter follows ("HTTPServer" -> "HTTP" | "Server").
 */
const splitGermanCase = (s, locale) => {
    const upperCased = s.toLocaleUpperCase(locale);

    // Upper-case text that still contains "ß" (e.g. "STRAßE") is kept whole.
    if (s !== upperCased && s.replaceAll("ß", "SS") === upperCased) {
        return [s];
    }

    const chars = [...s];
    const result = [];
    let currentSegment = chars[0];
    let previousIsUpper = isUpperIn(chars[0], locale);
    let isInUpperSequence = previousIsUpper;
    let upperSequenceStart = previousIsUpper ? 0 : -1;

    for (let index = 1; index < chars.length; index++) {
        const char = chars[index];
        const isUpper = isUpperIn(char, locale);

        if (isUpper === previousIsUpper) {
            currentSegment += char;
        } else if (isUpper) {
            if (currentSegment.length > 0) {
                result.push(currentSegment);
                currentSegment = char;
            }

            isInUpperSequence = true;
            upperSequenceStart = index;
        } else {
            if (isInUpperSequence && index - upperSequenceStart > 1) {
                // The last upper-case letter of the run starts the next word.
                const lastUpperChar = chars[index - 1];
                // Slice by the character's UTF-16 length so a surrogate pair is not cut in half.
                const withoutLastUpper = currentSegment.slice(0, -lastUpperChar.length);

                if (withoutLastUpper.length > 0) {
                    result.push(withoutLastUpper);
                }

                currentSegment = lastUpperChar + char;
            } else {
                currentSegment += char;
            }

            isInUpperSequence = false;
            upperSequenceStart = -1;
        }

        previousIsUpper = isUpper;
    }

    if (currentSegment.length > 0) {
        result.push(currentSegment);
    }

    return result;
};

/** Classifies a character as "cyrillic", "latin" or "other" (Cyrillic is tested first). */
const classifyCyrillicLatin = (char) => {
    if (RE_CYRILLIC.test(char)) {
        return "cyrillic";
    }

    if (RE_LATIN.test(char)) {
        return "latin";
    }

    return "other";
};

/**
 * Rule set for Cyrillic-script locales: splits on Cyrillic <-> Latin changes and on
 * lower -> upper transitions inside one script, then glues a lone Latin letter onto
 * a following Cyrillic segment.
 */
const splitCyrillicLatin = (s, locale) => {
    if (!RE_CYRILLIC.test(s) && !RE_LATIN.test(s)) {
        return [s];
    }

    const chars = [...s];
    const segments = [];
    let currentSegment = chars[0];
    let previousType = classifyCyrillicLatin(chars[0]);
    let previousIsUpper = isUpperIn(chars[0], locale);

    for (let index = 1; index < chars.length; index++) {
        const char = chars[index];
        const currentType = classifyCyrillicLatin(char);
        const isUpper = isUpperIn(char, locale);
        const scriptChanged = previousType !== currentType && previousType !== "other" && currentType !== "other";
        const caseBoundary = currentType === previousType && !previousIsUpper && isUpper;

        if (scriptChanged || caseBoundary) {
            segments.push(currentSegment);
            currentSegment = char;
        } else {
            currentSegment += char;
        }

        previousType = currentType;
        previousIsUpper = isUpper;
    }

    if (currentSegment && currentSegment.length > 0) {
        segments.push(currentSegment);
    }

    const merged = [];

    for (let index = 0; index < segments.length; index++) {
        const segment = segments[index];
        const next = segments[index + 1];

        if (next !== undefined && segment.length === 1 && RE_LATIN.test(segment) && RE_CYRILLIC.test(next[0])) {
            merged.push(segment + next);
            index++; // the following segment has been consumed
        } else {
            merged.push(segment);
        }
    }

    return merged;
};

/**
 * Greek rule set: cuts the text into the runs matched by RE_GREEK_LATIN_SPLIT, then
 * splits runs that start with a Greek character on lower -> upper transitions.
 */
const splitGreekLatin = (s, locale) => {
    if (!RE_GREEK.test(s) && !RE_LATIN.test(s)) {
        return [s];
    }

    const parts = s.match(RE_GREEK_LATIN_SPLIT) ?? [s];

    if (parts.length === 1) {
        const [part] = parts;

        if (!part || !RE_GREEK.test(part[0]) || part.length === 1) {
            return [part || s];
        }
    }

    const result = [];

    for (const part of parts) {
        if (!part) {
            continue;
        }

        if (!RE_GREEK.test(part[0]) || part.length === 1) {
            result.push(part);
            continue;
        }

        let word = part[0];
        let previousIsUpper = isUpperIn(part[0], locale);

        for (let index = 1; index < part.length; index++) {
            const char = part[index];
            const isUpper = isUpperIn(char, locale);

            if (!previousIsUpper && isUpper) {
                result.push(word);
                word = char;
            } else {
                word += char;
            }

            previousIsUpper = isUpper;
        }

        if (word) {
            result.push(word);
        }
    }

    return result;
};

/** Single hiragana characters that are appended to the preceding segment instead of standing alone. */
const JAPANESE_PARTICLES = new Set(["と", "に", "へ", "を", "は", "が", "の", "で", "や", "も"]);

/** Script transitions that start a new segment in Japanese text. */
const shouldSplitJapanese = (previousType, currentType) => {
    switch (previousType) {
        case "hiragana": {
            return currentType === "katakana" || currentType === "latin";
        }
        case "katakana": {
            return currentType === "hiragana" || currentType === "latin";
        }
        case "kanji": {
            return currentType === "latin";
        }
        case "latin": {
            return currentType === "hiragana" || currentType === "katakana" || currentType === "kanji";
        }
        default: {
            return false;
        }
    }
};

/** Japanese rule set: script-transition split followed by particle re-attachment. */
const splitJapanese = (s, locale) => {
    // Key order matters: the first detector that accepts a character wins.
    const scriptDetectors = {
        hiragana: (char) => RE_HIRAGANA.test(char),
        kanji: (char) => RE_KANJI.test(char),
        katakana: (char) => RE_KATAKANA.test(char),
        latin: (char) => RE_LATIN.test(char),
    };
    const baseSegments = handleScriptTransitions(s, scriptDetectors, false, locale, shouldSplitJapanese);
    const result = [];

    for (const segment of baseSegments) {
        if (segment.length === 1 && JAPANESE_PARTICLES.has(segment) && result.length > 0) {
            result[result.length - 1] += segment;
        } else {
            result.push(segment);
        }
    }

    return result.length > 0 ? result : [s];
};

/** Korean rule set: splits on Hangul <-> Latin transitions only. */
const splitKorean = (s, locale) =>
    handleScriptTransitions(
        s,
        {
            hangul: (char) => RE_HANGUL.test(char),
            latin: (char) => RE_LATIN.test(char),
        },
        false,
        locale,
        (previousType, currentType) =>
            (previousType === "hangul" && currentType === "latin") || (previousType === "latin" && currentType === "hangul"),
    );

/**
 * Slovenian rule set: splits on lower -> upper transitions, and isolates one of
 * Č/Š/Ž/Đ (either case) when the character after it is upper-case.
 */
const splitSlovenian = (s, locale) => {
    const chars = [...s];
    const result = [];
    let currentSegment = chars[0];
    let previousIsUpper = isUpperIn(chars[0], locale);

    for (let index = 1; index < chars.length; index++) {
        const char = chars[index];
        const isUpper = isUpperIn(char, locale);
        const isSpecialChar = /[ČŠŽĐ]/i.test(char);
        const nextIsUpper = index < chars.length - 1 && isUpperIn(chars[index + 1], locale);

        if ((!previousIsUpper && isUpper) || (isSpecialChar && nextIsUpper)) {
            result.push(currentSegment);
            currentSegment = char;

            if (isSpecialChar && nextIsUpper) {
                result.push(currentSegment);
                currentSegment = "";
            }
        } else {
            currentSegment += char;
        }

        previousIsUpper = isUpper;
    }

    if (currentSegment && currentSegment.length > 0) {
        result.push(currentSegment);
    }

    return result;
};

/**
 * Uzbek rule set: lower -> upper splitting, except that a character matching
 * RE_UZBEK_LATIN_MODIFIER and the character right after it never start a new segment.
 */
const splitUzbek = (s, locale) => {
    if (!RE_CYRILLIC.test(s) && !RE_LATIN.test(s)) {
        return [s];
    }

    const chars = [...s];
    const result = [];
    let currentSegment = chars[0];
    let previousIsUpper = isUpperIn(chars[0], locale);

    for (let index = 1; index < chars.length; index++) {
        const char = chars[index];
        const isUpper = isUpperIn(char, locale);

        if (RE_UZBEK_LATIN_MODIFIER.test(char) || RE_UZBEK_LATIN_MODIFIER.test(chars[index - 1])) {
            // previousIsUpper is intentionally left as it was before the modifier.
            currentSegment += char;
            continue;
        }

        if (!previousIsUpper && isUpper) {
            result.push(currentSegment);
            currentSegment = char;
        } else {
            currentSegment += char;
        }

        previousIsUpper = isUpper;
    }

    if (currentSegment && currentSegment.length > 0) {
        result.push(currentSegment);
    }

    return result;
};

/**
 * Generic locale-aware rule set: known acronyms become their own tokens wherever
 * they occur, everything else is split on lower -> upper transitions.
 *
 * The scan tracks a single UTF-16 offset so a leading acronym is consumed exactly
 * once (previously the scan restarted at index 1 and re-emitted its characters, and
 * an input equal to the acronym threw a TypeError).
 */
const splitWithAcronyms = (s, locale, knownAcronyms) => {
    const result = [];
    let currentSegment = "";
    let previousIsUpper = false;
    let offset = 0;

    while (offset < s.length) {
        const acronym = findAcronymAt(s, offset, knownAcronyms);

        if (acronym !== undefined) {
            if (currentSegment !== "") {
                result.push(currentSegment);
            }

            result.push(acronym);
            currentSegment = "";
            offset += acronym.length;
            continue;
        }

        const char = String.fromCodePoint(s.codePointAt(offset));
        const isUpper = isUpperIn(char, locale);

        // Right after an acronym (or at the very start) there is no open segment to close.
        if (currentSegment !== "" && !previousIsUpper && isUpper) {
            result.push(currentSegment);
            currentSegment = char;
        } else {
            currentSegment += char;
        }

        previousIsUpper = isUpper;
        offset += char.length;
    }

    if (currentSegment !== "") {
        result.push(currentSegment);
    }

    return result;
};

/** Locale prefixes that use the Cyrillic/Latin rule set. */
const CYRILLIC_LOCALE_PREFIXES = ["uk", "ru", "bg", "sr", "mk", "be"];

/** Primary language subtags that use the right-to-left (Hebrew/Arabic script) rule set. */
const RTL_LANGUAGES = ["ar", "fa", "he", "ur"];

/** Primary language subtags that use the combined Indic / South-East-Asian / Ethiopic rule set. */
const INDIC_LANGUAGES = [
    "am", // Amharic
    "bn", // Bengali
    "gu", // Gujarati
    "hi", // Hindi
    "km", // Khmer
    "kn", // Kannada
    "lo", // Lao
    "ml", // Malayalam
    "mr", // Marathi
    "ne", // Nepali
    "or", // Oriya
    "pa", // Punjabi
    "si", // Sinhala
    "ta", // Tamil
    "te", // Telugu
    "th", // Thai
];

const isRtlChar = (ch) => RE_HEBREW.test(ch) || RE_ARABIC.test(ch);

const isIndicChar = (ch) =>
    RE_DEVANAGARI.test(ch) ||
    RE_BENGALI.test(ch) ||
    RE_GUJARATI.test(ch) ||
    RE_GURMUKHI.test(ch) ||
    RE_KANNADA.test(ch) ||
    RE_TAMIL.test(ch) ||
    RE_TELUGU.test(ch) ||
    RE_MALAYALAM.test(ch) ||
    RE_SINHALA.test(ch) ||
    RE_THAI.test(ch) ||
    RE_LAO.test(ch) ||
    RE_TIBETAN.test(ch) ||
    RE_MYANMAR.test(ch) ||
    RE_ETHIOPIC.test(ch) ||
    RE_KHMER.test(ch) ||
    RE_ORIYA.test(ch);

/**
 * Picks the rule set for `locale`. The order of the checks is significant: the first
 * matching branch wins.
 */
const splitByLocaleRules = (s, locale, knownAcronyms) => {
    if (locale.startsWith("de")) {
        return splitGermanCase(s, locale);
    }

    if (CYRILLIC_LOCALE_PREFIXES.some((prefix) => locale.startsWith(prefix))) {
        return splitCyrillicLatin(s, locale);
    }

    if (locale.startsWith("el")) {
        return splitGreekLatin(s, locale);
    }

    if (locale.startsWith("ja")) {
        return splitJapanese(s, locale);
    }

    if (locale.startsWith("ko")) {
        return splitKorean(s, locale);
    }

    if (locale.startsWith("sl")) {
        return splitSlovenian(s, locale);
    }

    if (locale.startsWith("zh")) {
        return handleScriptTransitions(
            s,
            {
                han: (char) => RE_KANJI.test(char),
                latin: (char) => RE_LATIN.test(char),
            },
            false,
            locale,
        );
    }

    const language = locale.split("-")[0];

    if (RTL_LANGUAGES.includes(language)) {
        return handleScriptTransitions(
            s,
            {
                latin: (char) => RE_LATIN.test(char),
                rtl: (char) => isRtlChar(char),
            },
            false,
            locale,
        );
    }

    if (INDIC_LANGUAGES.includes(language)) {
        return handleScriptTransitions(
            s,
            {
                indic: (char) => isIndicChar(char),
                latin: (char) => RE_LATIN.test(char),
            },
            false,
            locale,
        );
    }

    if (locale.startsWith("uz")) {
        return splitUzbek(s, locale);
    }

    return splitWithAcronyms(s, locale, knownAcronyms);
};

/**
 * Locale-aware camelCase splitter.
 *
 * @param {string} s - Text to split (already free of separators).
 * @param {string} locale - Locale tag; selects the rule set and is passed to `toLocaleUpperCase`.
 * @param {Iterable<string>} [knownAcronyms] - Acronyms kept as single tokens (generic rule set only).
 * @returns {string[]} Non-empty tokens, in order.
 */
const splitCamelCaseLocale = (s, locale, knownAcronyms = new Set()) => {
    if (s.length === 0) {
        return [];
    }

    // Some rule sets can emit an empty segment; drop those so the result matches
    // what splitCamelCaseFast guarantees.
    return splitByLocaleRules(s, locale, knownAcronyms).filter((token) => token !== "");
};

/**
 * Splits text while keeping ANSI escape sequences and emoji as their own tokens;
 * the remaining text is camelCase-split.
 *
 * @param {string} text - One separator-free part of the input.
 * @param {string} [locale] - Locale tag; only its lower-cased primary subtag is used here.
 * @param {Set<string>} knownAcronyms - Acronyms kept as single tokens.
 * @returns {string[]} Tokens in order of appearance.
 */
const processTextWithAnsiEmoji = (text, locale, knownAcronyms) => {
    const result = [];
    // NOTE(review): whether the ANSI sequences survive this split depends on
    // RE_FAST_ANSI having a capturing group — confirm in the constants module.
    const segments = RE_FAST_ANSI.test(text) ? text.split(RE_FAST_ANSI).filter(Boolean) : [text];

    for (const seg of segments) {
        if (RE_FAST_ANSI.test(seg)) {
            result.push(seg);
            continue;
        }

        // containsEmoji resets RE_EMOJI.lastIndex around every probe; a bare
        // RE_EMOJI.test() carried state from one string into the next and
        // misclassified adjacent emoji.
        const subs = containsEmoji(seg) ? splitByEmoji(seg).filter(Boolean) : [seg];

        for (const sub of subs) {
            if (containsEmoji(sub)) {
                result.push(sub);
            } else if (locale) {
                const normalizedLocale = locale.toLowerCase().split("-")[0];

                result.push(...splitCamelCaseLocale(sub, normalizedLocale, knownAcronyms));
            } else {
                result.push(...splitCamelCaseFast(sub, knownAcronyms));
            }
        }
    }

    return result;
};

/**
 * Splits a string into words on separators and case/script boundaries.
 *
 * @param {string} input - Text to split; any falsy or non-string value yields `[]`.
 * @param {object} [options]
 * @param {boolean} [options.handleAnsi=false] - Keep ANSI sequences/emoji as separate tokens.
 * @param {boolean} [options.handleEmoji=false] - Same processing path as `handleAnsi`.
 * @param {Iterable<string>} [options.knownAcronyms=[]] - Acronyms kept as single tokens.
 * @param {string} [options.locale] - Enables the locale-aware splitter.
 * @param {boolean} [options.normalize=false] - Turn all-upper-case tokens (except known
 *   acronyms) into "Capitalized" form.
 * @param {string[] | RegExp} [options.separators] - Custom separators; defaults to RE_SEPARATORS.
 * @param {boolean} [options.stripAnsi=false] - Remove ANSI control sequences first.
 * @param {boolean} [options.stripEmoji=false] - Remove emoji first.
 * @returns {string[]} The tokens, in order.
 */
const splitByCase = (input, options = {}) => {
    if (!input || typeof input !== "string") {
        return [];
    }

    const {
        handleAnsi = false,
        handleEmoji = false,
        knownAcronyms = [],
        locale,
        normalize = false,
        separators,
        stripAnsi: stripAnsiOption = false,
        stripEmoji: stripEmojiOption = false,
    } = options;

    // Longest acronyms first so the most specific one wins; empty or non-string
    // entries are dropped because they can never be meaningful matches.
    const acronymSet = new Set(
        [...knownAcronyms].filter((acronym) => typeof acronym === "string" && acronym.length > 0).sort((a, b) => b.length - a.length),
    );

    let cleanedInput = input;

    if (stripAnsiOption) {
        cleanedInput = stripVTControlCharacters(cleanedInput);
    }

    if (stripEmojiOption) {
        cleanedInput = stripEmoji(cleanedInput);
    }

    let separatorRegex = RE_SEPARATORS;

    if (Array.isArray(separators)) {
        separatorRegex = getSeparatorsRegex(separators);
    } else if (separators instanceof RegExp) {
        separatorRegex = separators;
    }

    const parts = cleanedInput.split(separatorRegex).filter(Boolean);
    let tokens = [];

    for (const part of parts) {
        if (handleAnsi || handleEmoji) {
            tokens.push(...processTextWithAnsiEmoji(part, locale, acronymSet));
        } else if (locale) {
            tokens.push(...splitCamelCaseLocale(part, locale, acronymSet));
        } else {
            tokens.push(...splitCamelCaseFast(part, acronymSet));
        }
    }

    if (normalize) {
        tokens = tokens.map((token) => {
            if (acronymSet.has(token)) {
                return token;
            }

            if (locale && token === token.toLocaleUpperCase(locale)) {
                return token[0] + token.slice(1).toLocaleLowerCase(locale);
            }

            if (token.toUpperCase() === token) {
                return token.slice(0, 1) + token.slice(1).toLowerCase();
            }

            return token;
        });
    }

    return tokens;
};

export { splitByCase };