UNPKG

shoetest

Version:

Powerful string matching insensitive to diacritics, special characters, symbols and case

350 lines 14.3 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); const RandExp = require("randexp"); const regexp_1 = require("./regexp"); // Constants const DEFAULT_OPTIONS = { strict: true, diacritics: false, charCase: false, symbols: false, whitespaces: false, boundaries: true, begin: '', end: '', }; const UNICODE_NULL_CHAR = '\u0000'; // Load reference data const REFERENCE = require('../reference.json'); // Advanced Unicode-aware string matching library class Shoetest { constructor() { this.basic = {}; this.basicExtra = {}; this.special = {}; this.symbols = REFERENCE.symbolsRegExp; this.initializeCharacterIndices(); } /** * Initialize character mappings for basic and extra characters * * This method sets up the basic and extra character indices used for * fuzzy matching, including special character mappings. */ initializeCharacterIndices() { this.buildCharacterMappings(REFERENCE.chars, this.basic); this.basicExtra = this.cloneDeep(this.basic); this.buildCharacterMappings(REFERENCE.extra, this.basicExtra, true); } /** * Build character mappings for the given character set * * Creates regex patterns for character variations and stores them in the index. * For basic mappings, creates simple alternation patterns. For extra mappings, * combines existing patterns with new variants using non-capturing groups. * * @param chars - Character mapping object where keys are base characters and values are their variants * @param index - The character index to populate with regex patterns * @param isExtra - Whether this is building extra character mappings (combines with existing patterns) */ buildCharacterMappings(chars, index, isExtra = false) { var _a; for (const [character, variants] of Object.entries(chars)) { const [first, second, third] = character; // Ensure nested structure exists index[first] ?? (index[first] = {}); (_a = index[first])[second] ?? (_a[second] = {}); if (isExtra) { const existing = index[first][second][third]; const pattern = `[${(0, regexp_1.escape)(variants)}]`; index[first][second][third] = existing ? `(?:${existing}|${pattern})` : `(?:${character}|${pattern})`; } else { index[first][second][third] = `(?:${character}|[${variants}])`; // Map variants to base character for (const variant of variants) { this.special[variant] = character; } } } } /** * Deep clone an object using the best available method * * Uses structuredClone if available (modern browsers/Node.js), * otherwise falls back to JSON serialization for deep cloning. * * @param obj - The object to clone * @returns A deep copy of the input object */ cloneDeep(obj) { return typeof structuredClone !== 'undefined' ? structuredClone(obj) : JSON.parse(JSON.stringify(obj)); } /** * Remove diacritics and accents from text * * Converts accented and special characters to their basic equivalents * using the character mapping table. Returns undefined for falsy inputs. * * @param str - The input string to simplify * @returns The simplified string without diacritics, or undefined if input is falsy */ simplify(str) { if (!str) return undefined; return String(str) .split('') .map(char => this.special[char] || char) .join(''); } /** * Create a fuzzy matching regular expression * * Generates a regex pattern that matches the input string with various * character variations based on the provided options. Handles diacritics, * case sensitivity, symbols, and other fuzzy matching features. * * @param str - The input string to create a regex pattern for * @param options - Configuration options for the regex generation * @returns A RegExp object for fuzzy matching, or undefined if input is falsy */ getRegExp(str, options = {}) { if (!str) return undefined; const config = { ...DEFAULT_OPTIONS, ...options }; const index = config.strict ? this.basic : this.basicExtra; const processedStr = config.diacritics ? str : (this.simplify(str) ?? str); return this.buildRegexPattern(processedStr, config, index); } /** * Build the actual regex pattern from processed string and options * * Constructs the final regular expression by preprocessing the input string * and building the pattern with appropriate flags based on configuration. * * @param str - The input string to build a pattern for * @param options - Complete configuration options for pattern building * @param index - Character index to use for pattern generation * @returns A RegExp object with the generated pattern and appropriate flags */ buildRegexPattern(str, options, index) { const { processedStr, sb, sp } = this.preprocessString(str, options); const pattern = this.buildPattern(processedStr, options, index, sb, sp); const flags = options.charCase ? 'g' : 'gi'; return new RegExp(`(${options.begin}${pattern}${options.end})`, flags); } /** * Preprocess string based on symbol and boundary options * * Handles removal or transformation of symbols and boundaries based on * configuration options. Prepares separators for pattern building. * * @param str - The input string to preprocess * @param options - Complete configuration options for preprocessing * @returns Object containing processed string and boundary/symbol separators */ preprocessString(str, options) { let processedStr = str; let sb = ''; let sp = ''; if (!options.symbols && !options.boundaries) { sb = `[\\s${this.symbols}]`; processedStr = processedStr.replace(new RegExp(`${sb}+`, 'g'), ''); sb = `${sb}*`; } else if (!options.symbols) { sp = `[${this.symbols}]`; processedStr = processedStr.replace(new RegExp(`${sp}+`, 'g'), UNICODE_NULL_CHAR); sp = `${sp}*`; } else if (!options.boundaries) { sb = '\\s'; processedStr = processedStr.replace(new RegExp(`${sb}+`, 'g'), ''); sb = `${sb}*`; } return { processedStr, sb, sp }; } /** * Build the core pattern from processed string * * Constructs the main regex pattern by iterating through characters * and handling multi-character sequences, boundaries, and separators. * * @param str - The preprocessed input string * @param options - Complete configuration options for pattern building * @param index - Character index to use for pattern generation * @param sb - Boundary separator pattern * @param sp - Symbol separator pattern * @returns The constructed regex pattern string */ buildPattern(str, options, index, sb, sp) { let pattern = ''; let current = ''; let prevX = ''; let prevY = ''; let prevZ = ''; for (let i = 0; i < str.length; i++) { const chars = [str[i - 2], str[i - 1], str[i]]; const [cY, cZ, c] = chars; [prevX, prevY, prevZ] = [prevY, prevZ, current]; const re1 = this.getCharacterRegex(c, options, index); const re2 = this.getMultiCharRegex(cZ, c, undefined, index); const re3 = this.getMultiCharRegex(cY, cZ, c, index); if (!re2 && !re3) { pattern += current + (current ? sb : ''); current = re1 === UNICODE_NULL_CHAR ? sp : re1; [prevX, prevY, prevZ] = ['', '', '']; } else { current = `(?:${prevZ}${sb}${re1}`; if (re2) current += `|${prevY}${sb}${re2}`; if (re3) current += `|${prevX}${sb}${re3}`; current += ')'; } } return pattern + current; } /** * Get regex pattern for a single character * * Creates a regex pattern for an individual character, handling * whitespace, symbols, and character variations based on options. * * @param char - The character to create a pattern for * @param options - Complete configuration options for pattern building * @param index - Character index to use for pattern generation * @returns The regex pattern string for the character */ getCharacterRegex(char, options, index) { if (options.boundaries && /\s/.test(char)) { if (!options.whitespaces && !options.symbols) return `[\\s${this.symbols}]+`; if (!options.whitespaces) return '\\s+'; if (!options.symbols) return `[${this.symbols}]*${char}[${this.symbols}]*`; return char; } return char && index[char]?.['undefined']?.['undefined'] ? index[char]['undefined']['undefined'] : (0, regexp_1.escape)(char); } /** * Get regex pattern for multi-character sequences * * Attempts to find regex patterns for character sequences of 2 or 3 characters * in the character index. Used for handling ligatures and multi-character mappings. * * @param char1 - First character of the sequence * @param char2 - Second character of the sequence * @param char3 - Optional third character of the sequence * @param index - Character index to search for multi-character patterns * @returns The regex pattern string for the sequence, or null if not found */ getMultiCharRegex(char1, char2, char3, index) { if (!char1 || !char2) return null; if (char3 && index?.[char1]?.[char2]?.[char3]) { return index[char1][char2][char3]; } if (!char3 && index?.[char1]?.[char2]?.['undefined']) { return index[char1][char2]['undefined']; } return null; } /** * Check if pattern exists in target text(s) * * Tests whether the fuzzy pattern matches any of the provided texts. * Returns true if any match is found, false if no matches, undefined for invalid inputs. * * @param str - The pattern string to search for * @param texts - Single text string or array of strings to search in * @param options - Configuration options for fuzzy matching * @returns True if pattern matches any text, false if no matches, undefined if invalid input */ test(str, texts, options) { if (!str || !texts) return undefined; const regex = this.getRegExp(str, options); if (!regex) return false; return (Array.isArray(texts) ? texts : [texts]).some(text => typeof text === 'string' && regex.test(text)); } /** * Extract all matching substrings from text(s) * * Searches for all occurrences of the fuzzy pattern in the provided text(s) * and returns an array of matching substrings. Filters out non-string inputs. * * @param str - The pattern string to search for * @param texts - Single text string or array of strings to search in * @param options - Configuration options for fuzzy matching * @returns Array of matching substrings, empty array if no matches, undefined for invalid input */ match(str, texts, options) { if (!str || !texts) return undefined; const regex = this.getRegExp(str, options); if (!regex) return []; return (Array.isArray(texts) ? texts : [texts]) .filter((text) => typeof text === 'string') .flatMap(text => text.match(regex) ?? []); } /** * Replace pattern matches with new content * * Searches for all occurrences of the fuzzy pattern in the provided text(s) * and replaces them with the specified replacement string. Preserves original * input structure (string vs array). * * @param str - The pattern string to search for * @param newstr - The replacement string for matches * @param texts - Single text string or array of strings to search in * @param options - Configuration options for fuzzy matching * @returns Modified string/array with replacements, or undefined for invalid input */ replace(str, newstr, texts, options) { if (!str || !texts) return undefined; const regex = this.getRegExp(str, options); if (!regex) return texts; const results = (Array.isArray(texts) ? texts : [texts]).map(text => typeof text === 'string' ? text.replace(regex, newstr || '') : text); return results.length === 1 ? results[0] : results; } /** * Add random diacritics and character variations to text * * Generates a random variation of the input string by creating a fuzzy * regex pattern and using it to produce a randomized version with * character substitutions and diacritics. * * @param str - The input string to complexify * @returns A randomized version of the string with character variations, or undefined if input is falsy */ complexify(str) { if (!str) return undefined; const regex = this.getRegExp(str, { charCase: true, symbols: true, whitespaces: true, }); if (!regex) return str; const generator = new RandExp(regex); generator.defaultRange.add(0, 65535); return generator.gen(); } } const shoetestInstance = new Shoetest(); exports.default = shoetestInstance; //# sourceMappingURL=shoetest.js.map