@hugoalh/string-dissect
Version:
A module to dissect a string; safe with emojis, URLs, and words.
105 lines (104 loc) • 3.71 kB
JavaScript
import regexpANSI from "ansi-regex";
import { urlRegExp as regexpURL } from "@hugoalh/url-regexp";
const regexpEmojiExact = /^\p{Emoji}+$/v;
/**
 * Split a string with a prioritized list of regular expression matchers.
 *
 * The first matcher is applied to the whole string. Every match is yielded as a `{ type, value }` descriptor,
 * and the text between (and around) the matches is handed to the remaining matchers.
 * Text that is not claimed by any matcher is yielded as a plain string.
 * @param {{ regexp: RegExp; type: string; }[]} matchers Matchers in priority order; each `regexp` is given to `String.prototype.matchAll`, hence it must be global.
 * @param {string} item String to split.
 * @returns {Generator<string | { type: string; value: string; }>} Pieces of the string, in their original order.
 */
function* dissectorWithRegExp(matchers, item) {
  const [current, ...rest] = matchers;
  // Forward an unmatched span to the lower priority matchers, or emit it verbatim when none are left.
  function* passDown(text) {
    if (rest.length === 0) {
      yield text;
      return;
    }
    yield* dissectorWithRegExp(rest, text);
  }
  let consumed = 0;
  for (const found of item.matchAll(current.regexp)) {
    const { 0: hit, index: start } = found;
    if (start > consumed) {
      yield* passDown(item.slice(consumed, start));
    }
    yield { type: current.type, value: hit };
    consumed = start + hit.length;
  }
  // Whatever follows the last match (or the whole string when nothing matched).
  if (consumed < item.length) {
    yield* passDown(item.slice(consumed));
  }
}
/**
 * Dissector that breaks a string into typed segments; ANSI escape codes, emojis, URLs, and words are kept in one piece.
 */
export class StringDissector {
  #keepANSI;
  // The ANSI matcher is always present and always has the highest priority.
  #matchers = [{
    regexp: regexpANSI(),
    type: "ansi"
  }];
  #splitter;
  /**
   * Create a dissector.
   * @param {StringDissectorOptions} [options={}] Options.
   * - `locales`: Forwarded to `Intl.Segmenter`.
   * - `outputANSI` (default `true`): Whether to yield the segments of type `"ansi"`.
   * - `safeURLs` (default `true`): Whether to keep every URL as a single segment of type `"url"`.
   * - `safeWords` (default `true`): Whether to segment by word; otherwise segment by grapheme.
   */
  constructor(options = {}) {
    const {
      locales,
      outputANSI: keepANSI = true,
      safeURLs: guardURLs = true,
      safeWords: guardWords = true
    } = options;
    this.#keepANSI = keepANSI;
    if (guardURLs) {
      const urlMatcher = {
        regexp: regexpURL({
          auth: true
        }),
        type: "url"
      };
      this.#matchers.push(urlMatcher);
    }
    const granularity = guardWords ? "word" : "grapheme";
    this.#splitter = new Intl.Segmenter(locales, { granularity });
  }
  /**
   * Dissect the string.
   *
   * `indexStart` and `indexEnd` are offsets into `item`, measured with `String.prototype.length`;
   * they keep counting the ANSI segments even when those segments are not yielded.
   * @param {string} item String to dissect.
   * @returns {Generator<StringSegmentDescriptor>} Segment descriptors of the string, in their original order.
   */
  *dissect(item) {
    let offset = 0;
    for (const piece of dissectorWithRegExp(this.#matchers, item)) {
      if (typeof piece === "string") {
        // Plain text: let the segmenter split it, then classify every segment.
        for (const { isWordLike = false, segment: text } of this.#splitter.segment(piece)) {
          let kind = "character";
          if (regexpEmojiExact.test(text)) {
            kind = "emoji";
          } else if (isWordLike) {
            kind = "word";
          }
          const end = offset + text.length;
          yield {
            indexEnd: end,
            indexStart: offset,
            type: kind,
            value: text
          };
          offset = end;
        }
        continue;
      }
      // Segment already claimed by one of the regular expression matchers.
      const { type, value } = piece;
      const end = offset + value.length;
      if (this.#keepANSI || type !== "ansi") {
        yield {
          indexEnd: end,
          indexStart: offset,
          type,
          value
        };
      }
      offset = end;
    }
  }
}
export default StringDissector;
/**
 * Dissect the string with a single-use {@linkcode StringDissector}; safe with emojis, URLs, and words.
 * @param {string} item String to dissect.
 * @param {StringDissectorOptions} [options={}] Options, handed to the {@linkcode StringDissector} constructor as is.
 * @returns {Generator<StringSegmentDescriptor>} Segment descriptors of the string, in their original order.
 */
export function dissectString(item, options = {}) {
  const dissector = new StringDissector(options);
  return dissector.dissect(item);
}