@visulima/string
Version:
Functions for manipulating strings.
635 lines (630 loc) • 22.3 kB
JavaScript
import { stripVTControlCharacters } from 'node:util';
import { j as RE_EMOJI, s as stripEmoji, k as RE_SEPARATORS, R as RE_FAST_ANSI, l as RE_CYRILLIC, m as RE_LATIN, n as RE_GREEK, o as RE_GREEK_LATIN_SPLIT, p as RE_KANJI, q as RE_HEBREW, r as RE_ARABIC, t as RE_HANGUL, u as RE_UZBEK_LATIN_MODIFIER, v as RE_KATAKANA, w as RE_HIRAGANA, x as RE_DEVANAGARI, y as RE_BENGALI, z as RE_GUJARATI, A as RE_GURMUKHI, B as RE_KANNADA, C as RE_TAMIL, D as RE_TELUGU, E as RE_MALAYALAM, F as RE_SINHALA, G as RE_THAI, H as RE_LAO, I as RE_TIBETAN, J as RE_MYANMAR, K as RE_ETHIOPIC, L as RE_KHMER, M as RE_ORIYA } from '../packem_shared/constants-CKNmLDBQ.mjs';
import LRUCache from '../packem_shared/LRUCache-udNErhWw.mjs';
// Local alias of Object.defineProperty used by the naming helper below.
var __defProp$2 = Object.defineProperty;
/**
 * Defines `value` as the configurable `name` property of `target` and
 * returns `target` (the return value of Object.defineProperty).
 */
var __name$2 = (target, value) => {
  const nameDescriptor = { value, configurable: true };
  return __defProp$2(target, "name", nameDescriptor);
};
// Cache of compiled separator regexes, keyed by the serialized separator list.
// NOTE(review): the `1e3` argument is presumably the cache capacity — confirm against LRUCache.
const regexCache = new LRUCache(1e3);
/**
 * Returns a global RegExp that matches any one of the given separators
 * literally, reusing a cached instance when the same list was seen before.
 *
 * @param {string[]} separators - Literal separator strings; regex metacharacters are escaped.
 * @returns {RegExp} Alternation of the escaped separators, compiled with the `g` flag.
 */
const getSeparatorsRegex = /* @__PURE__ */ __name$2((separators) => {
  // JSON.stringify keeps element boundaries, so ["a", "b"] and ["ab"] get
  // distinct keys. A plain join("") mapped both to "ab" and handed back the
  // regex compiled for whichever list was cached first.
  const key = JSON.stringify(separators);
  if (regexCache.has(key)) {
    return regexCache.get(key);
  }
  // Escape every regex metacharacter so each separator is matched verbatim.
  const pattern = separators.map((s) => s.replaceAll(/[.*+?^${}()|[\]\\]/g, "\\$&")).join("|");
  const regex = new RegExp(pattern, "g");
  regexCache.set(key, regex);
  return regex;
}, "getSeparatorsRegex");
// Local alias of Object.defineProperty used by the naming helper below.
var __defProp$1 = Object.defineProperty;
/**
 * Gives `target` a configurable `name` property equal to `value`
 * and hands `target` back to the caller.
 */
var __name$1 = (target, value) => __defProp$1(
  target,
  "name",
  { configurable: true, value }
);
/**
 * Cuts `text` into an ordered list of pieces: every RE_EMOJI match becomes
 * its own piece, and the text between matches is kept as separate pieces.
 *
 * @param {string} text - Input to scan.
 * @returns {string[]} Non-empty pieces in their original order.
 */
const splitByEmoji = /* @__PURE__ */ __name$1((text) => {
  const pieces = [];
  let cursor = 0;
  // RE_EMOJI is shared and keeps its lastIndex between calls; rewind before scanning.
  RE_EMOJI.lastIndex = 0;
  for (let hit = RE_EMOJI.exec(text); hit !== null; hit = RE_EMOJI.exec(text)) {
    const hitStart = hit.index;
    if (hitStart > cursor) {
      // Plain text sitting between the previous match and this one.
      pieces.push(text.slice(cursor, hitStart));
    }
    pieces.push(hit[0]);
    cursor = RE_EMOJI.lastIndex;
  }
  if (cursor < text.length) {
    // Trailing text after the last match.
    pieces.push(text.slice(cursor));
  }
  return pieces.filter((piece) => Boolean(piece));
}, "splitByEmoji");
// Local alias of Object.defineProperty used by the naming helper below.
var __defProp = Object.defineProperty;
/**
 * Sets the `name` property of `target` to `value` (configurable) and
 * returns `target`, so a definition can be wrapped in place.
 */
var __name = (target, value) => {
  return __defProp(target, "name", { value, configurable: true });
};
// ASCII classification tables indexed by character code (0-127).
// An entry is 1 when the code belongs to the class and 0 otherwise.
const isUpperCode = new Uint8Array(128);
const isLowerCode = new Uint8Array(128);
const isDigitCode = new Uint8Array(128);
// Typed arrays start zero-filled, so only the matching code ranges need marking.
for (const [table, firstCode, lastCode] of [
  [isUpperCode, 65, 90], // "A".."Z"
  [isLowerCode, 97, 122], // "a".."z"
  [isDigitCode, 48, 57] // "0".."9"
]) {
  for (let code = firstCode; code <= lastCode; code++) {
    table[code] = 1;
  }
}
/**
 * Splits a string into segments wherever the writing script (and optionally
 * the letter case) changes between neighbouring characters.
 *
 * Characters are classified by the first detector in `scriptDetectors` that
 * accepts them; characters no detector accepts are typed "other".
 *
 * Bound with a plain `const`: the engine infers the function's `name` from the
 * binding, so no explicit name-setting wrapper is needed here.
 *
 * @param {string} s - Text to split.
 * @param {Record<string, (char: string) => boolean>} scriptDetectors - Script name -> predicate, tried in key order.
 * @param {boolean} caseSensitive - When true (and `locale` is set), a lower -> upper transition on a typed character also splits.
 * @param {string | undefined} locale - Locale passed to `toLocaleUpperCase` for case detection.
 * @param {Function} [customSplitLogic] - Optional replacement for the default rules; called as
 *   `(previousType, currentType, previousIsUpper, isUpper, char, index, chars)` and must return whether to split before `char`.
 * @returns {string[]} Segments in order; `[]` for empty input, `[s]` when the first character matches no
 *   detector and `caseSensitive` is false.
 */
const handleScriptTransitions = (s, scriptDetectors, caseSensitive, locale, customSplitLogic) => {
  if (s.length === 0) {
    return [];
  }
  // Work on code points so astral characters (surrogate pairs) reach the detectors whole.
  const chars = [...s];
  // The detector list never changes while scanning; build it once.
  const detectors = Object.entries(scriptDetectors);
  // Name of the first detector accepting `char`, or undefined when none does.
  const findType = (char) => {
    for (const [type, detector] of detectors) {
      if (detector(char)) {
        return type;
      }
    }
    return undefined;
  };
  // Case is only evaluated when requested and a locale is available.
  const isUpperChar = (char) => caseSensitive && locale ? char === char.toLocaleUpperCase(locale) : false;
  // The early-exit probe used to look at s[0] (a single UTF-16 unit), so a string
  // beginning with an astral character was never recognised and came back unsplit.
  const firstType = findType(chars[0]);
  if (firstType === undefined && !caseSensitive) {
    return [s];
  }
  const result = [];
  let currentSegment = chars[0];
  let previousType = firstType ?? "other";
  let previousIsUpper = isUpperChar(chars[0]);
  for (let index = 1; index < chars.length; index++) {
    const char = chars[index];
    const currentType = findType(char) ?? "other";
    const isUpper = isUpperChar(char);
    let shouldSplit = false;
    if (customSplitLogic) {
      shouldSplit = customSplitLogic(previousType, currentType, previousIsUpper, isUpper, char, index, chars);
    } else {
      // Split between two different known scripts ("other" never triggers a split).
      if (previousType !== currentType && previousType !== "other" && currentType !== "other") {
        shouldSplit = true;
      }
      // Split on lower -> upper inside typed text when case matters.
      if (caseSensitive && currentType !== "other" && !previousIsUpper && isUpper) {
        shouldSplit = true;
      }
    }
    if (shouldSplit) {
      result.push(currentSegment);
      currentSegment = char;
    } else {
      currentSegment += char;
    }
    previousType = currentType;
    // isUpper is always false when case is not tracked, so this keeps the old state semantics.
    previousIsUpper = isUpper;
  }
  // currentSegment always holds at least one character at this point.
  result.push(currentSegment);
  return result;
};
/**
 * Locale-independent camel-case splitter driven by the ASCII lookup tables
 * (`isUpperCode`, `isLowerCode`, `isDigitCode`). Non-ASCII characters never
 * trigger a split on their own.
 *
 * Split points:
 *  - a known acronym found at the start of the current token,
 *  - ASCII lower -> upper ("fooBar" -> "foo", "Bar"),
 *  - digit <-> non-digit,
 *  - the last capital of an upper-case run followed by a lower-case letter
 *    ("XMLHttp" -> "XML", "Http"), unless the run including that capital is a known acronym.
 *
 * Bound with a plain `const`: the engine infers the function's `name` from the
 * binding, so no explicit name-setting wrapper is needed here.
 *
 * @param {string} s - Text to split.
 * @param {Set<string>} [knownAcronyms] - Acronyms kept intact; tried in the set's iteration order.
 * @returns {string[]} Non-empty tokens; `[]` for empty input, `[s]` when `s` equals its upper-cased form.
 */
const splitCamelCaseFast = (s, knownAcronyms = /* @__PURE__ */ new Set()) => {
  if (s.length === 0) {
    return [];
  }
  if (s.toUpperCase() === s) {
    return [s];
  }
  let start = 0;
  const tokens = [];
  const width = s.length;
  for (let index = 1; index < width; index++) {
    if (knownAcronyms.size > 0) {
      let matchedAcronym = false;
      for (const acronym of knownAcronyms) {
        // An empty acronym matches everywhere without advancing `start`,
        // which used to spin this loop forever; ignore it.
        if (acronym.length > 0 && s.startsWith(acronym, start)) {
          tokens.push(acronym);
          start += acronym.length;
          // Resume scanning at the first character after the acronym.
          index = start - 1;
          matchedAcronym = true;
          break;
        }
      }
      if (matchedAcronym) {
        continue;
      }
    }
    const previousCode = s.codePointAt(index - 1);
    const currentCode = s.codePointAt(index);
    // Codes >= 128 short-circuit before the table lookup and count as "none of the above".
    const previousIsUpper = previousCode && previousCode < 128 && isUpperCode[previousCode];
    const currentIsUpper = currentCode && currentCode < 128 && isUpperCode[currentCode];
    const previousIsLower = previousCode && previousCode < 128 && isLowerCode[previousCode];
    const previousIsDigit = previousCode && previousCode < 128 && isDigitCode[previousCode];
    const currentIsDigit = currentCode && currentCode < 128 && isDigitCode[currentCode];
    if (previousIsLower && currentIsUpper) {
      tokens.push(s.slice(start, index));
      start = index;
      continue;
    }
    // Any digit <-> non-digit boundary splits. (A later branch that special-cased
    // "non-digit -> digit followed by an upper-case letter" could never run because
    // this test already covers that transition; it has been removed.)
    if (Boolean(previousIsDigit) !== Boolean(currentIsDigit)) {
      tokens.push(s.slice(start, index));
      start = index;
      continue;
    }
    // "ABCd": break before the last capital so it starts the next word.
    if (previousIsUpper && currentIsUpper && index + 1 < width) {
      const nextCode = s.codePointAt(index + 1);
      const nextIsLower = nextCode && nextCode < 128 && isLowerCode[nextCode];
      if (nextIsLower && !knownAcronyms.has(s.slice(start, index + 1))) {
        tokens.push(s.slice(start, index));
        start = index;
      }
    }
  }
  if (start < width) {
    tokens.push(s.slice(start));
  }
  return tokens.filter((token) => token !== "");
};
/**
 * Locale-aware camel-case / script splitter. The locale prefix selects one of
 * several strategies:
 *
 *  - `de`: case transitions, keeping upper-case runs together and treating
 *    text that is upper-case apart from "ß" as a single token.
 *  - `uk`, `ru`, `bg`, `sr`, `mk`, `be`: Cyrillic/Latin script changes plus
 *    lower -> upper inside one script; a single Latin letter is glued to a
 *    following Cyrillic segment.
 *  - `el`: Greek/Latin parts via RE_GREEK_LATIN_SPLIT, then lower -> upper
 *    inside Greek parts.
 *  - `ja`, `ko`: script transitions via `handleScriptTransitions`; for `ja`,
 *    single-character particles are appended to the preceding segment.
 *  - `sl`: lower -> upper, with special handling of Č/Š/Ž/Đ before a capital.
 *  - `zh`, RTL languages (`ar`, `fa`, `he`, `ur`) and the listed Indic /
 *    South-East-Asian languages: script transitions against Latin.
 *  - `uz`: lower -> upper, never splitting next to an Uzbek Latin modifier.
 *  - anything else: known acronyms are kept intact, otherwise lower -> upper.
 *
 * `knownAcronyms` is only consulted by the default strategy.
 *
 * Bound with a plain `const`: the engine infers the function's `name` from the
 * binding, so no explicit name-setting wrapper is needed here.
 *
 * @param {string} s - Text to split.
 * @param {string} locale - Locale tag; matched by prefix and passed to `toLocaleUpperCase`.
 * @param {Set<string>} [knownAcronyms] - Acronyms kept intact; tried in the set's iteration order.
 * @returns {string[]} Segments in order; `[]` for empty input.
 */
const splitCamelCaseLocale = (s, locale, knownAcronyms = /* @__PURE__ */ new Set()) => {
  if (s.length === 0) {
    return [];
  }
  // Evaluated for every locale, as before: besides feeding the German shortcut,
  // this is the first call that rejects an invalid locale tag (RangeError).
  const isUpperCase = s === s.toLocaleUpperCase(locale);
  // ---- German ----
  if (locale.startsWith("de")) {
    // Upper-case text that still contains "ß" stays one token.
    if (!isUpperCase && s.replaceAll("ß", "SS") === s.toLocaleUpperCase(locale)) {
      return [s];
    }
    const chars2 = [...s];
    const width_2 = chars2.length;
    const result2 = [];
    let currentSegment2 = chars2[0];
    let previousIsUpper2 = chars2[0] === chars2[0].toLocaleUpperCase(locale);
    let isInUpperSequence = previousIsUpper2;
    let upperSequenceStart = previousIsUpper2 ? 0 : -1;
    for (let index = 1; index < width_2; index++) {
      const char = chars2[index];
      const isUpper = char === char.toLocaleUpperCase(locale);
      if (isUpper === previousIsUpper2) {
        currentSegment2 += char;
      } else if (isUpper) {
        // lower -> upper: close the current word and start an upper-case run.
        if (currentSegment2 && currentSegment2.length > 0) {
          result2.push(currentSegment2);
          currentSegment2 = char;
        }
        isInUpperSequence = true;
        upperSequenceStart = index;
      } else {
        // upper -> lower: after a run of two or more capitals, the last capital
        // belongs to the new word ("XMLParser" -> "XML", "Parser").
        if (isInUpperSequence && index - upperSequenceStart > 1) {
          const lastUpperChar = chars2[index - 1];
          const withoutLastUpper = currentSegment2.slice(0, -1);
          if (withoutLastUpper && withoutLastUpper.length > 0) {
            result2.push(withoutLastUpper);
          }
          currentSegment2 = lastUpperChar + char;
        } else {
          currentSegment2 += char;
        }
        isInUpperSequence = false;
        upperSequenceStart = -1;
      }
      previousIsUpper2 = isUpper;
    }
    if (currentSegment2 && currentSegment2.length > 0) {
      result2.push(currentSegment2);
    }
    return result2;
  }
  // ---- Cyrillic-script languages ----
  if (locale.startsWith("uk") || locale.startsWith("ru") || locale.startsWith("bg") || locale.startsWith("sr") || locale.startsWith("mk") || locale.startsWith("be")) {
    if (!RE_CYRILLIC.test(s) && !RE_LATIN.test(s)) {
      return [s];
    }
    const chars2 = [...s];
    const width_2 = chars2.length;
    const result2 = [];
    let currentSegment2 = chars2[0];
    // Script type codes: 1 = Cyrillic, 2 = Latin, 0 = anything else.
    let previousType = RE_CYRILLIC.test(chars2[0]) ? 1 : RE_LATIN.test(chars2[0]) ? 2 : 0;
    let previousIsUpper2 = chars2[0] === chars2[0].toLocaleUpperCase(locale);
    for (let index = 1; index < width_2; index++) {
      const char = chars2[index];
      const currentType = RE_CYRILLIC.test(char) ? 1 : RE_LATIN.test(char) ? 2 : 0;
      const isUpper = char === char.toLocaleUpperCase(locale);
      // Split on a Cyrillic <-> Latin change, or on lower -> upper within one type.
      if (previousType !== currentType && (previousType === 1 || previousType === 2) && (currentType === 1 || currentType === 2) || currentType === previousType && !previousIsUpper2 && isUpper) {
        result2.push(currentSegment2);
        currentSegment2 = char;
      } else {
        currentSegment2 += char;
      }
      previousType = currentType;
      previousIsUpper2 = isUpper;
    }
    if (currentSegment2 && currentSegment2.length > 0) {
      result2.push(currentSegment2);
    }
    // Glue a lone Latin letter onto the Cyrillic segment that follows it.
    const finalResult = [];
    for (let index = 0; index < result2.length; index++) {
      if (index < result2.length - 1 && result2[index].length === 1 && RE_LATIN.test(result2[index]) && RE_CYRILLIC.test(result2[index + 1][0])) {
        finalResult.push(result2[index] + result2[index + 1]);
        index++;
      } else {
        finalResult.push(result2[index]);
      }
    }
    return finalResult;
  }
  // ---- Greek ----
  if (locale.startsWith("el")) {
    if (!RE_GREEK.test(s) && !RE_LATIN.test(s)) {
      return [s];
    }
    const parts = s.match(RE_GREEK_LATIN_SPLIT) ?? [s];
    const result2 = [];
    const width = parts.length;
    if (width === 1) {
      const part = parts[0];
      if (!part || !RE_GREEK.test(part[0]) || part.length === 1) {
        return [part || s];
      }
    }
    for (const part of parts) {
      if (!part) {
        continue;
      }
      // Non-Greek parts and single characters are kept whole.
      if (!RE_GREEK.test(part[0]) || part.length === 1) {
        result2.push(part);
        continue;
      }
      const partLength = part.length;
      let word = part[0];
      let previousIsUpper2 = part[0] === part[0].toLocaleUpperCase(locale);
      for (let index = 1; index < partLength; index++) {
        const char = part[index];
        const isUpper = char === char.toLocaleUpperCase(locale);
        if (!previousIsUpper2 && isUpper) {
          result2.push(word);
          word = char;
        } else {
          word += char;
        }
        previousIsUpper2 = isUpper;
      }
      if (word) {
        result2.push(word);
      }
    }
    return result2;
  }
  // ---- Japanese / Korean ----
  if (locale.startsWith("ja") || locale.startsWith("ko")) {
    const isJapanese = locale.startsWith("ja");
    const scriptDetectors = isJapanese ? {
      hiragana: /* @__PURE__ */ __name((char) => RE_HIRAGANA.test(char), "hiragana"),
      kanji: /* @__PURE__ */ __name((char) => RE_KANJI.test(char), "kanji"),
      katakana: /* @__PURE__ */ __name((char) => RE_KATAKANA.test(char), "katakana"),
      latin: /* @__PURE__ */ __name((char) => RE_LATIN.test(char), "latin")
    } : {
      hangul: /* @__PURE__ */ __name((char) => RE_HANGUL.test(char), "hangul"),
      latin: /* @__PURE__ */ __name((char) => RE_LATIN.test(char), "latin")
    };
    if (isJapanese) {
      // Only needed on the Japanese path, so it is built here rather than for Korean too.
      const particles = /* @__PURE__ */ new Set(["と", "に", "へ", "を", "は", "が", "の", "で", "や", "も"]);
      const baseSegments = handleScriptTransitions(
        s,
        scriptDetectors,
        false,
        locale,
        (previousType, currentType) => previousType === "hiragana" && currentType === "katakana" || // hiragana -> katakana
        previousType === "katakana" && currentType === "hiragana" || // katakana -> hiragana
        previousType === "hiragana" && currentType === "latin" || // hiragana -> latin
        previousType === "katakana" && currentType === "latin" || // katakana -> latin
        previousType === "kanji" && currentType === "latin" || // kanji -> latin
        previousType === "latin" && (currentType === "hiragana" || currentType === "katakana" || currentType === "kanji")
        // latin -> japanese
      );
      const result2 = [];
      for (const segment of baseSegments) {
        // A segment that is exactly one particle is attached to the previous segment.
        if (segment.length === 1 && particles.has(segment) && result2.length > 0) {
          result2[result2.length - 1] += segment;
        } else {
          result2.push(segment);
        }
      }
      return result2.length > 0 ? result2 : [s];
    }
    return handleScriptTransitions(
      s,
      scriptDetectors,
      false,
      locale,
      (previousType, currentType) => previousType === "hangul" && currentType === "latin" || // hangul -> latin
      previousType === "latin" && currentType === "hangul"
      // latin -> hangul
    );
  }
  // ---- Slovenian ----
  if (locale.startsWith("sl")) {
    // Hoisted out of the loop; no g/y flag, so `.test` is stateless.
    const specialCharPattern = /[ČŠŽĐ]/i;
    const chars2 = [...s];
    const width_2 = chars2.length;
    const result2 = [];
    let currentSegment2 = chars2[0];
    let previousIsUpper2 = chars2[0] === chars2[0].toLocaleUpperCase(locale);
    for (let index = 1; index < width_2; index++) {
      const char = chars2[index];
      const isUpper = char === char.toLocaleUpperCase(locale);
      const isSpecialChar = specialCharPattern.test(char);
      const nextIsUpper = index < width_2 - 1 && chars2[index + 1] === chars2[index + 1].toLocaleUpperCase(locale);
      if (!previousIsUpper2 && isUpper || isSpecialChar && nextIsUpper) {
        // The segment is empty right after a special character was emitted on
        // its own; skip it instead of producing an empty token.
        if (currentSegment2.length > 0) {
          result2.push(currentSegment2);
        }
        currentSegment2 = char;
        if (isSpecialChar && nextIsUpper) {
          // A special character followed by a capital stands alone.
          result2.push(currentSegment2);
          currentSegment2 = "";
        }
      } else {
        currentSegment2 += char;
      }
      previousIsUpper2 = isUpper;
    }
    if (currentSegment2 && currentSegment2.length > 0) {
      result2.push(currentSegment2);
    }
    return result2;
  }
  // ---- Chinese ----
  if (locale.startsWith("zh")) {
    return handleScriptTransitions(
      s,
      {
        han: /* @__PURE__ */ __name((char) => RE_KANJI.test(char), "han"),
        latin: /* @__PURE__ */ __name((char) => RE_LATIN.test(char), "latin")
      },
      false,
      locale
    );
  }
  // ---- Right-to-left scripts ----
  if (["ar", "fa", "he", "ur"].includes(locale.split("-")[0])) {
    const isRtlChar = /* @__PURE__ */ __name((ch) => RE_HEBREW.test(ch) || RE_ARABIC.test(ch), "isRtlChar");
    return handleScriptTransitions(
      s,
      {
        latin: /* @__PURE__ */ __name((char) => RE_LATIN.test(char), "latin"),
        rtl: /* @__PURE__ */ __name((char) => isRtlChar(char), "rtl")
      },
      false,
      locale
    );
  }
  // ---- Indic and South-East-Asian scripts ----
  if ([
    "am",
    // Amharic
    "bn",
    // Bengali
    "gu",
    // Gujarati
    "hi",
    // Hindi
    "km",
    // Khmer
    "kn",
    // Kannada
    "lo",
    // Lao
    "ml",
    // Malayalam
    "mr",
    // Marathi
    "ne",
    // Nepali
    "or",
    // Oriya
    "pa",
    // Punjabi
    "si",
    // Sinhala
    "ta",
    // Tamil
    "te",
    // Telugu
    "th"
    // Thai
  ].includes(locale.split("-")[0])) {
    const isIndicChar = /* @__PURE__ */ __name((ch) => RE_DEVANAGARI.test(ch) || RE_BENGALI.test(ch) || RE_GUJARATI.test(ch) || RE_GURMUKHI.test(ch) || RE_KANNADA.test(ch) || RE_TAMIL.test(ch) || RE_TELUGU.test(ch) || RE_MALAYALAM.test(ch) || RE_SINHALA.test(ch) || RE_THAI.test(ch) || RE_LAO.test(ch) || RE_TIBETAN.test(ch) || RE_MYANMAR.test(ch) || RE_ETHIOPIC.test(ch) || RE_KHMER.test(ch) || RE_ORIYA.test(ch), "isIndicChar");
    return handleScriptTransitions(
      s,
      {
        indic: /* @__PURE__ */ __name((char) => isIndicChar(char), "indic"),
        latin: /* @__PURE__ */ __name((char) => RE_LATIN.test(char), "latin")
      },
      false,
      locale
    );
  }
  // Three further branches used to sit here (exact "be"/"bg"/"ru"/"sr"/"uk",
  // exact "ar"/"fa"/"he", and a second "ko" prefix check). Every locale they
  // matched is already handled — and returned from — by an earlier branch, so
  // they could never run and have been removed.
  // ---- Uzbek ----
  if (locale.startsWith("uz")) {
    if (!RE_CYRILLIC.test(s) && !RE_LATIN.test(s)) {
      return [s];
    }
    const chars2 = [...s];
    const width_2 = chars2.length;
    const result2 = [];
    let currentSegment2 = chars2[0];
    let previousIsUpper2 = chars2[0] === chars2[0].toLocaleUpperCase(locale);
    for (let index = 1; index < width_2; index++) {
      const char = chars2[index];
      const isUpper = char === char.toLocaleUpperCase(locale);
      // Never split at, or directly after, an Uzbek Latin modifier character.
      if (RE_UZBEK_LATIN_MODIFIER.test(char) || RE_UZBEK_LATIN_MODIFIER.test(chars2[index - 1])) {
        currentSegment2 += char;
        continue;
      }
      if (!previousIsUpper2 && isUpper) {
        result2.push(currentSegment2);
        currentSegment2 = char;
      } else {
        currentSegment2 += char;
      }
      previousIsUpper2 = isUpper;
    }
    if (currentSegment2 && currentSegment2.length > 0) {
      result2.push(currentSegment2);
    }
    return result2;
  }
  // ---- Default: known acronyms, then lower -> upper transitions ----
  // The scan tracks a UTF-16 offset so `startsWith` and acronym lengths stay in
  // the same unit. Previously a leading acronym was emitted but the scan then
  // restarted at index 1 and re-consumed it ("HTMLParser" -> "HTML", "PTMLParser"),
  // an acronym equal to the whole input threw a TypeError, and an acronym in the
  // middle of the input left an empty token behind.
  const result = [];
  let currentSegment = "";
  let previousIsUpper = false;
  let position = 0;
  while (position < s.length) {
    let matchedAcronym;
    for (const acronym of knownAcronyms) {
      // Empty acronyms match everywhere without consuming input; ignore them.
      if (acronym.length > 0 && s.startsWith(acronym, position)) {
        matchedAcronym = acronym;
        break;
      }
    }
    if (matchedAcronym !== undefined) {
      if (currentSegment.length > 0) {
        result.push(currentSegment);
      }
      result.push(matchedAcronym);
      currentSegment = "";
      position += matchedAcronym.length;
      continue;
    }
    // Take one whole code point (two UTF-16 units for astral characters).
    const codePoint = s.codePointAt(position);
    const char = s.slice(position, position + (codePoint > 0xffff ? 2 : 1));
    const isUpper = char === char.toLocaleUpperCase(locale);
    if (currentSegment.length > 0 && !previousIsUpper && isUpper) {
      result.push(currentSegment);
      currentSegment = char;
    } else {
      currentSegment += char;
    }
    previousIsUpper = isUpper;
    position += char.length;
  }
  if (currentSegment.length > 0) {
    result.push(currentSegment);
  }
  return result;
};
/**
 * Splits `text` into tokens while keeping ANSI sequences and emoji as
 * standalone tokens; the remaining plain text is camel-case split.
 *
 * Pieces matching RE_FAST_ANSI or RE_EMOJI are pushed unchanged. Other pieces
 * go to `splitCamelCaseLocale` (with the lower-cased primary subtag of
 * `locale`) when a locale is given, otherwise to `splitCamelCaseFast`.
 *
 * @param {string} text - Text to process.
 * @param {string | undefined} locale - Optional locale tag.
 * @param {Set<string>} knownAcronyms - Acronyms forwarded to the camel-case splitters.
 * @returns {string[]} Tokens in order of appearance.
 */
const processTextWithAnsiEmoji = /* @__PURE__ */ __name((text, locale, knownAcronyms) => {
  // The regexes are shared module constants. A regex with the g/y flag makes
  // `.test()` resume from its stored lastIndex, so a test on one string can make
  // the next test on a different string miss (e.g. two adjacent emoji pieces).
  // Rewind before and after each probe; on a flagless regex this is a no-op.
  const matchesFresh = (regex, value) => {
    regex.lastIndex = 0;
    const found = regex.test(value);
    regex.lastIndex = 0;
    return found;
  };
  // Loop-invariant: lower-cased primary subtag, e.g. "de-DE" -> "de".
  const normalizedLocale = locale ? locale.toLowerCase().split("-")[0] : undefined;
  const result = [];
  const segments = matchesFresh(RE_FAST_ANSI, text) ? text.split(RE_FAST_ANSI).filter(Boolean) : [text];
  for (const seg of segments) {
    if (matchesFresh(RE_FAST_ANSI, seg)) {
      result.push(seg);
      continue;
    }
    const subs = matchesFresh(RE_EMOJI, seg) ? splitByEmoji(seg).filter(Boolean) : [seg];
    for (const sub of subs) {
      if (matchesFresh(RE_EMOJI, sub)) {
        result.push(sub);
      } else if (locale) {
        result.push(...splitCamelCaseLocale(sub, normalizedLocale, knownAcronyms));
      } else {
        result.push(...splitCamelCaseFast(sub, knownAcronyms));
      }
    }
  }
  return result;
}, "processTextWithAnsiEmoji");
/**
 * Splits a string into word tokens on separators, case transitions and
 * (with a locale) script transitions.
 *
 * @param {string} input - Text to split; non-strings and empty strings yield `[]`.
 * @param {object} [options]
 * @param {boolean} [options.handleAnsi=false] - Route parts through `processTextWithAnsiEmoji`.
 * @param {boolean} [options.handleEmoji=false] - Route parts through `processTextWithAnsiEmoji`.
 * @param {Iterable<string>} [options.knownAcronyms=[]] - Acronyms kept intact; tried longest first.
 * @param {string} [options.locale] - Locale tag enabling the locale-aware splitter.
 * @param {boolean} [options.normalize=false] - Lower-case everything after the first character of
 *   all-upper-case tokens that are not known acronyms.
 * @param {string[] | RegExp} [options.separators] - Custom separators; defaults to RE_SEPARATORS.
 * @param {boolean} [options.stripAnsi=false] - Remove ANSI control sequences before splitting.
 * @param {boolean} [options.stripEmoji=false] - Remove emoji before splitting.
 * @returns {string[]} Non-empty tokens in order of appearance.
 */
const splitByCase = /* @__PURE__ */ __name((input, options = {}) => {
  if (!input || typeof input !== "string") {
    return [];
  }
  const {
    handleAnsi = false,
    handleEmoji = false,
    knownAcronyms = [],
    locale,
    normalize = false,
    separators,
    stripAnsi: stripAnsiOption = false,
    stripEmoji: stripEmojiOption = false
  } = options;
  // Longest first, so the splitters (which take the first match in iteration
  // order) prefer the longest acronym at a given position.
  const acronymSet = new Set([...knownAcronyms].sort((a, b) => b.length - a.length));
  let cleanedInput = input;
  if (stripAnsiOption) {
    cleanedInput = stripVTControlCharacters(cleanedInput);
  }
  if (stripEmojiOption) {
    cleanedInput = stripEmoji(cleanedInput);
  }
  const separatorRegex = Array.isArray(separators) ? getSeparatorsRegex(separators) : separators instanceof RegExp ? separators : RE_SEPARATORS;
  const parts = cleanedInput.split(separatorRegex).filter(Boolean);
  const tokens = [];
  for (const part of parts) {
    let pieces;
    if (handleAnsi || handleEmoji) {
      pieces = processTextWithAnsiEmoji(part, locale, acronymSet);
    } else if (locale) {
      pieces = splitCamelCaseLocale(part, locale, acronymSet);
    } else {
      pieces = splitCamelCaseFast(part, acronymSet);
    }
    for (const piece of pieces) {
      // Drop empty tokens a splitter may hand back. Besides being noise, an
      // empty token used to reach the normalize step, where `token[0] + ...`
      // evaluates to the string "undefined".
      if (piece !== "") {
        tokens.push(piece);
      }
    }
  }
  if (!normalize) {
    return tokens;
  }
  return tokens.map((token) => {
    if (acronymSet.has(token)) {
      return token;
    }
    if (locale && token === token.toLocaleUpperCase(locale)) {
      return token[0] + token.slice(1).toLocaleLowerCase(locale);
    }
    if (token.toUpperCase() === token) {
      return token.slice(0, 1) + token.slice(1).toLowerCase();
    }
    return token;
  });
}, "splitByCase");
export { splitByCase };