UNPKG

@hugoalh/string-dissect

Version:

A module to dissect a string; safe with emojis, URLs, and words.

105 lines (104 loc) 3.71 kB
import regexpANSI from "ansi-regex"; import { urlRegExp as regexpURL } from "@hugoalh/url-regexp"; const regexpEmojiExact = /^\p{Emoji}+$/v; function* dissectorWithRegExp(matchers, item) { const [matcher, ...matchersRemain] = matchers; let cursor = 0; for (const match of item.matchAll(matcher.regexp)) { const segmentMatch = match[0]; const indexStart = match.index; if (cursor < indexStart) { const segmentNotMatch = item.slice(cursor, indexStart); if (matchersRemain.length > 0) { yield* dissectorWithRegExp(matchersRemain, segmentNotMatch); } else { yield segmentNotMatch; } } yield { type: matcher.type, value: segmentMatch }; cursor = indexStart + segmentMatch.length; } if (cursor < item.length) { const segmentNotMatch = item.slice(cursor, item.length); if (matchersRemain.length > 0) { yield* dissectorWithRegExp(matchersRemain, segmentNotMatch); } else { yield segmentNotMatch; } } } /** * String dissector to dissect the string; Safe with the emojis, URLs, and words. */ export class StringDissector { #outputANSI; #regexpMatchers = [{ regexp: regexpANSI(), type: "ansi" }]; #segmenter; /** * Initialize. * @param {StringDissectorOptions} [options={}] Options. */ constructor(options = {}) { const { locales, outputANSI = true, safeURLs = true, safeWords = true } = options; this.#outputANSI = outputANSI; if (safeURLs) { this.#regexpMatchers.push({ regexp: regexpURL({ auth: true }), type: "url" }); } this.#segmenter = new Intl.Segmenter(locales, { granularity: safeWords ? "word" : "grapheme" }); } /** * Dissect the string. * @param {string} item String that need to dissect. * @returns {Generator<StringSegmentDescriptor>} An iterable segment descriptors from the dissected string. 
*/ *dissect(item) { let cursor = 0; for (const segmentThroughRegExp of dissectorWithRegExp(this.#regexpMatchers, item)) { if (typeof segmentThroughRegExp !== "string") { const { type, value } = segmentThroughRegExp; if (!(!this.#outputANSI && type === "ansi")) { yield { indexEnd: cursor + value.length, indexStart: cursor, type, value }; } cursor += value.length; continue; } for (const { isWordLike = false, segment } of this.#segmenter.segment(segmentThroughRegExp)) { yield { indexEnd: cursor + segment.length, indexStart: cursor, type: regexpEmojiExact.test(segment) ? "emoji" : (isWordLike ? "word" : "character"), value: segment }; cursor += segment.length; } } } } export default StringDissector; /** * Dissect the string; Safe with the emojis, URLs, and words. * @param {string} item String that need to dissect. * @param {StringDissectorOptions} [options={}] Options. * @returns {Generator<StringSegmentDescriptor>} An iterable segment descriptors from the dissected string. */ export function dissectString(item, options = {}) { return new StringDissector(options).dissect(item); }