/**
 * shoetest
 *
 * Powerful string matching insensitive to diacritics, special characters,
 * symbols and case.
 */
import RandExp = require('randexp');
import { escape } from './regexp';
// Type Definitions

/** Text to search in: either a list of strings or a single string. */
type TextInput = string[] | string;

/**
 * Three-level lookup table keyed by the first, second and third character of
 * a mapped sequence; each leaf holds a regex pattern fragment.
 */
type CharacterIndex = {
  [first: string]: {
    [second: string]: {
      [third: string]: string;
    };
  };
};

/** Lookup from a variant character to the base character(s) it stands for. */
type SpecialCharacters = { [variant: string]: string };

/** A result that is `undefined` when the operation had nothing to work on. */
type SearchResult<T> = undefined | T;
/**
 * Shape expected of the reference data that drives the matcher.
 */
interface ReferenceData {
// Sequence -> string of variant characters. Used to build the basic index and
// the variant -> base table that simplify() reads.
readonly chars: Record<string, string>;
// Sequence -> string of additional variant characters, layered on top of a
// copy of the basic index to form the index used for non-strict matching.
readonly extra: Record<string, string>;
// Regex source text that the class interpolates inside `[...]` character
// classes to represent symbol characters.
readonly symbolsRegExp: string;
}
/**
 * Options accepted by getRegExp() and by the methods that forward to it
 * (test, match, replace). Every field is optional; unset fields fall back to
 * DEFAULT_OPTIONS.
 */
interface ShoetestOptions {
// true: build patterns from the basic character index only.
// false: use the index that also contains the `extra` variants.
strict?: boolean;
// true: use the search string as given.
// false: pass the search string through simplify() before building the pattern.
diacritics?: boolean;
// true: the resulting RegExp gets flags 'g' (case-sensitive).
// false: it gets flags 'gi'.
charCase?: boolean;
// false: symbol characters in the search string are not matched literally;
// they are stripped or turned into optional symbol runs during preprocessing.
symbols?: boolean;
// Only consulted when `boundaries` is true. false: a whitespace character in
// the search string matches a run of whitespace (and symbols when `symbols`
// is also false) instead of that exact character.
whitespaces?: boolean;
// false: whitespace is removed from the search string and optional
// whitespace is allowed between the remaining characters.
boundaries?: boolean;
// Regex source inserted verbatim immediately before the generated pattern,
// inside the capturing group.
begin?: string;
// Regex source inserted verbatim immediately after the generated pattern,
// inside the capturing group.
end?: string;
}
// Constants
/**
 * Option values used for every field the caller leaves out; getRegExp() uses
 * this object as the base that caller-supplied options override.
 */
const DEFAULT_OPTIONS: Required<ShoetestOptions> = {
// Basic index only (no `extra` variants).
strict: true,
// Search string is simplified before matching.
diacritics: false,
// Case-insensitive ('gi' flags).
charCase: false,
// Symbols in the search string are not matched literally.
symbols: false,
// Whitespace in the search string matches any whitespace run.
whitespaces: false,
// Whitespace in the search string is kept as a separator.
boundaries: true,
// No extra regex source around the generated pattern.
begin: '',
end: '',
} as const;
// Placeholder character: one branch of preprocessString() substitutes it for
// each run of symbols in the search string, and buildPattern() replaces it
// with the optional-symbols pattern when it encounters it.
const UNICODE_NULL_CHAR = '\u0000';
// Load reference data
// Loaded synchronously through require() when this module is evaluated.
// NOTE(review): `as ReferenceData` is an unchecked assertion; the JSON's shape
// is not validated at runtime.
const REFERENCE = require('../reference.json') as ReferenceData;
/**
 * Fuzzy string matcher driven by the character tables in REFERENCE.
 *
 * On construction it builds two lookup indices (basic, and basic plus the
 * `extra` variants) that map a character sequence of up to three characters
 * to a regex fragment matching that sequence or any of its variants. The
 * public methods turn a search string into a RegExp built from those
 * fragments and apply it to one or more texts.
 */
class Shoetest {
  /**
   * Index key used at a level where the mapped sequence has no character.
   * It is the string form of `undefined`, which is what a missing character
   * coerces to when used as a property key.
   */
  private static readonly NO_CHAR = 'undefined';

  /** Index built from REFERENCE.chars only (used when `strict` is true). */
  private readonly basic: CharacterIndex = {};
  /** Copy of `basic` extended with REFERENCE.extra (used when `strict` is false). */
  private basicExtra: CharacterIndex = {};
  /** Variant character -> base sequence, read by simplify(). */
  private readonly special: SpecialCharacters = {};
  /** Regex source interpolated inside `[...]` to stand for symbol characters. */
  private readonly symbols: string;

  constructor() {
    this.symbols = REFERENCE.symbolsRegExp;
    this.initializeCharacterIndices();
  }

  /**
   * Populate `basic`, then derive `basicExtra` from a deep copy of it so that
   * adding the extra variants does not touch the basic index.
   */
  private initializeCharacterIndices(): void {
    this.buildCharacterMappings(REFERENCE.chars, this.basic);
    this.basicExtra = this.cloneDeep(this.basic);
    this.buildCharacterMappings(REFERENCE.extra, this.basicExtra, true);
  }

  /**
   * Fill `index` with one regex fragment per entry of `chars`.
   *
   * Each key is spread over three index levels by its first three characters;
   * levels without a character use NO_CHAR. In basic mode the fragment is
   * `(?:key|[variants])` and every variant is also recorded in `special`. In
   * extra mode the escaped variants are OR-ed onto the fragment already stored
   * at that position (or onto the key itself when nothing is stored yet).
   *
   * @param chars - Map from a sequence to the string of its variant characters
   * @param index - Index to populate (mutated in place)
   * @param isExtra - Whether to extend existing fragments instead of creating basic ones
   */
  private buildCharacterMappings(
    chars: Record<string, string>,
    index: CharacterIndex,
    isExtra = false,
  ): void {
    for (const [character, variants] of Object.entries(chars)) {
      const [
        first = Shoetest.NO_CHAR,
        second = Shoetest.NO_CHAR,
        third = Shoetest.NO_CHAR,
      ] = character;

      // Create the nested levels on demand.
      const secondLevel = (index[first] ??= {});
      const thirdLevel = (secondLevel[second] ??= {});

      if (isExtra) {
        const existing = thirdLevel[third];
        const pattern = `[${escape(variants)}]`;
        thirdLevel[third] = existing
          ? `(?:${existing}|${pattern})`
          : `(?:${character}|${pattern})`;
      } else {
        // NOTE(review): unlike the extra branch, variants are not escaped here;
        // this relies on REFERENCE.chars containing no character-class metacharacters.
        thirdLevel[third] = `(?:${character}|[${variants}])`;
        // Remember which base sequence each variant stands for.
        for (const variant of variants) {
          this.special[variant] = character;
        }
      }
    }
  }

  /**
   * Deep-copy `obj` with structuredClone when the runtime provides it,
   * otherwise through a JSON round trip.
   *
   * @param obj - The value to copy
   * @returns A deep copy of `obj`
   */
  private cloneDeep<T>(obj: T): T {
    return typeof structuredClone !== 'undefined'
      ? structuredClone(obj)
      : JSON.parse(JSON.stringify(obj));
  }

  /**
   * Merge caller options over DEFAULT_OPTIONS, one key at a time.
   *
   * A plain object spread would let an explicitly `undefined` (or null) option
   * replace its default, e.g. `begin: undefined` would end up as the text
   * "undefined" inside the pattern. Only values that are actually set override.
   *
   * @param options - Caller-supplied options (may be partial)
   * @returns A complete option set
   */
  private resolveOptions(options: ShoetestOptions): Required<ShoetestOptions> {
    const given: ShoetestOptions = options ?? {};
    return {
      strict: given.strict ?? DEFAULT_OPTIONS.strict,
      diacritics: given.diacritics ?? DEFAULT_OPTIONS.diacritics,
      charCase: given.charCase ?? DEFAULT_OPTIONS.charCase,
      symbols: given.symbols ?? DEFAULT_OPTIONS.symbols,
      whitespaces: given.whitespaces ?? DEFAULT_OPTIONS.whitespaces,
      boundaries: given.boundaries ?? DEFAULT_OPTIONS.boundaries,
      begin: given.begin ?? DEFAULT_OPTIONS.begin,
      end: given.end ?? DEFAULT_OPTIONS.end,
    };
  }

  /** Normalize a single text or a list of texts to a list. */
  private toArray(texts: TextInput): string[] {
    return Array.isArray(texts) ? texts : [texts];
  }

  /**
   * Replace every variant character in `str` with the base sequence recorded
   * for it in the basic mapping; other characters are kept as they are.
   *
   * @param str - The input string to simplify
   * @returns The simplified string, or undefined if `str` is falsy
   */
  simplify(str: string): SearchResult<string> {
    if (!str) return undefined;
    // Iterate by code point (not UTF-16 unit) so the lookup uses the same
    // units that were used when `special` was filled.
    return Array.from(String(str), char => this.special[char] || char).join('');
  }

  /**
   * Build the fuzzy-matching RegExp for `str`.
   *
   * @param str - The search string
   * @param options - Matching options; unset fields use DEFAULT_OPTIONS
   * @returns The RegExp, or undefined if `str` is falsy
   */
  getRegExp(str: string, options: ShoetestOptions = {}): SearchResult<RegExp> {
    if (!str) return undefined;
    const config = this.resolveOptions(options);
    const index = config.strict ? this.basic : this.basicExtra;
    const processedStr = config.diacritics ? str : (this.simplify(str) ?? str);
    return this.buildRegexPattern(processedStr, config, index);
  }

  /**
   * Preprocess `str`, build its pattern and wrap it in a RegExp.
   *
   * The pattern is placed in a single capturing group together with
   * `options.begin` / `options.end`, which are inserted verbatim. Flags are
   * 'g' when `charCase` is true and 'gi' otherwise.
   *
   * @param str - The (possibly simplified) search string
   * @param options - Complete option set
   * @param index - Character index to take fragments from
   * @returns The assembled RegExp
   */
  private buildRegexPattern(
    str: string,
    options: Required<ShoetestOptions>,
    index: CharacterIndex,
  ): RegExp {
    const { processedStr, sb, sp } = this.preprocessString(str, options);
    const pattern = this.buildPattern(processedStr, options, index, sb, sp);
    const flags = options.charCase ? 'g' : 'gi';
    return new RegExp(`(${options.begin}${pattern}${options.end})`, flags);
  }

  /**
   * Strip or mark the parts of `str` that should not be matched literally and
   * derive the separator patterns buildPattern() needs.
   *
   * - symbols off, boundaries off: whitespace and symbols are removed; `sb`
   *   allows any run of them between characters.
   * - symbols off only: each run of symbols becomes UNICODE_NULL_CHAR; `sp` is
   *   the optional-symbols pattern that later replaces it.
   * - boundaries off only: whitespace is removed; `sb` allows optional whitespace.
   * - otherwise: the string is untouched and both separators are empty.
   *
   * @param str - The search string
   * @param options - Complete option set
   * @returns The processed string plus the `sb` and `sp` separator patterns
   */
  private preprocessString(str: string, options: Required<ShoetestOptions>) {
    let processedStr = str;
    let sb = '';
    let sp = '';
    if (!options.symbols && !options.boundaries) {
      sb = `[\\s${this.symbols}]`;
      processedStr = processedStr.replace(new RegExp(`${sb}+`, 'g'), '');
      sb = `${sb}*`;
    } else if (!options.symbols) {
      sp = `[${this.symbols}]`;
      processedStr = processedStr.replace(new RegExp(`${sp}+`, 'g'), UNICODE_NULL_CHAR);
      sp = `${sp}*`;
    } else if (!options.boundaries) {
      sb = '\\s';
      processedStr = processedStr.replace(new RegExp(`${sb}+`, 'g'), '');
      sb = `${sb}*`;
    }
    return { processedStr, sb, sp };
  }

  /**
   * Assemble the pattern source for the preprocessed string.
   *
   * The string is walked one UTF-16 unit at a time. For each position the
   * single-character fragment is looked up, along with any two- and
   * three-character mapping that ends at this position. When such a mapping
   * exists, the pending fragment becomes an alternation between "previous
   * fragment + this character" and "fragment from two/three steps back + the
   * multi-character mapping". Otherwise the pending fragment is flushed to
   * the output and the history is cleared.
   *
   * @param str - The preprocessed search string
   * @param options - Complete option set
   * @param index - Character index to take fragments from
   * @param sb - Separator pattern inserted between consecutive fragments
   * @param sp - Pattern substituted for the symbol placeholder character
   * @returns The regex source for the whole string
   */
  private buildPattern(
    str: string,
    options: Required<ShoetestOptions>,
    index: CharacterIndex,
    sb: string,
    sp: string,
  ): string {
    let pattern = '';
    let current = '';
    // Values of `current` from three, two and one iterations ago.
    let prevX = '';
    let prevY = '';
    let prevZ = '';

    for (let i = 0; i < str.length; i++) {
      // Out-of-range indices yield undefined, which the lookups treat as absent.
      const [cY, cZ, c] = [str[i - 2], str[i - 1], str[i]];
      [prevX, prevY, prevZ] = [prevY, prevZ, current];

      const re1 = this.getCharacterRegex(c, options, index);
      const re2 = this.getMultiCharRegex(cZ, c, undefined, index);
      const re3 = this.getMultiCharRegex(cY, cZ, c, index);

      if (!re2 && !re3) {
        // No multi-character mapping ends here: commit what is pending.
        pattern += current + (current ? sb : '');
        current = re1 === UNICODE_NULL_CHAR ? sp : re1;
        [prevX, prevY, prevZ] = ['', '', ''];
      } else {
        current = `(?:${prevZ}${sb}${re1}`;
        if (re2) current += `|${prevY}${sb}${re2}`;
        if (re3) current += `|${prevX}${sb}${re3}`;
        current += ')';
      }
    }

    return pattern + current;
  }

  /**
   * Regex fragment for one character of the search string.
   *
   * When `boundaries` is on and the character is whitespace, the fragment
   * depends on `whitespaces` and `symbols` (any whitespace/symbol run, any
   * whitespace run, the exact character surrounded by optional symbols, or the
   * exact character). Any other character uses its single-character index
   * entry when one exists and its escaped literal form otherwise.
   *
   * @param char - The character
   * @param options - Complete option set
   * @param index - Character index to take fragments from
   * @returns The regex source for the character
   */
  private getCharacterRegex(
    char: string,
    options: Required<ShoetestOptions>,
    index: CharacterIndex,
  ): string {
    if (options.boundaries && /\s/.test(char)) {
      if (!options.whitespaces && !options.symbols) return `[\\s${this.symbols}]+`;
      if (!options.whitespaces) return '\\s+';
      if (!options.symbols) return `[${this.symbols}]*${char}[${this.symbols}]*`;
      return char;
    }
    const mapped = char
      ? index[char]?.[Shoetest.NO_CHAR]?.[Shoetest.NO_CHAR]
      : undefined;
    return mapped ? mapped : escape(char);
  }

  /**
   * Index entry for a two- or three-character sequence.
   *
   * @param char1 - First character of the sequence
   * @param char2 - Second character of the sequence
   * @param char3 - Third character; when absent the two-character entry is looked up
   * @param index - Character index to search
   * @returns The stored regex source, or null when either of the first two
   *          characters is missing or no entry exists
   */
  private getMultiCharRegex(
    char1: string,
    char2: string,
    char3?: string,
    index?: CharacterIndex,
  ): string | null {
    if (!char1 || !char2) return null;
    const thirdKey = char3 || Shoetest.NO_CHAR;
    return index?.[char1]?.[char2]?.[thirdKey] || null;
  }

  /**
   * Check whether the fuzzy pattern for `str` matches any of the texts.
   *
   * @param str - The search string
   * @param texts - One text or a list of texts; non-string entries are skipped
   * @param options - Matching options
   * @returns true if any text matches, false if none does, undefined when
   *          `str` or `texts` is falsy
   */
  test(
    str: string,
    texts: TextInput,
    options?: ShoetestOptions,
  ): SearchResult<boolean> {
    if (!str || !texts) return undefined;
    const regex = this.getRegExp(str, options);
    if (!regex) return false;
    // The regex is global, but a failed test() resets lastIndex to 0 and
    // some() stops at the first success, so no state leaks between texts.
    return this.toArray(texts).some(
      text => typeof text === 'string' && regex.test(text),
    );
  }

  /**
   * Collect every substring of the texts that the fuzzy pattern matches.
   *
   * @param str - The search string
   * @param texts - One text or a list of texts; non-string entries are skipped
   * @param options - Matching options
   * @returns All matches in order (possibly empty), or undefined when `str` or
   *          `texts` is falsy
   */
  match(
    str: string,
    texts: TextInput,
    options?: ShoetestOptions,
  ): SearchResult<string[]> {
    if (!str || !texts) return undefined;
    const regex = this.getRegExp(str, options);
    if (!regex) return [];
    return this.toArray(texts)
      .filter((text): text is string => typeof text === 'string')
      .flatMap(text => text.match(regex) ?? []);
  }

  /**
   * Replace every match of the fuzzy pattern in the texts.
   *
   * Non-string entries are passed through unchanged. Note on the result
   * shape: when exactly one text was processed the result is that single
   * string, even if the input was a one-element array; otherwise an array is
   * returned.
   *
   * @param str - The search string
   * @param newstr - Replacement passed to String.prototype.replace (a falsy
   *                 value is treated as the empty string)
   * @param texts - One text or a list of texts
   * @param options - Matching options
   * @returns The replaced text(s), or undefined when `str` or `texts` is falsy
   */
  replace(
    str: string,
    newstr: string,
    texts: TextInput,
    options?: ShoetestOptions,
  ): SearchResult<string | string[]> {
    if (!str || !texts) return undefined;
    const regex = this.getRegExp(str, options);
    if (!regex) return texts;
    const results = this.toArray(texts).map(text =>
      typeof text === 'string' ? text.replace(regex, newstr || '') : text,
    );
    return results.length === 1 ? results[0] : results;
  }

  /**
   * Produce a random string that the fuzzy pattern for `str` would match.
   *
   * The pattern is built case-sensitively with symbols and whitespace matched
   * literally, then handed to RandExp to generate one matching string.
   *
   * @param str - The input string
   * @returns A generated variation, or undefined if `str` is falsy
   */
  complexify(str: string): SearchResult<string> {
    if (!str) return undefined;
    const regex = this.getRegExp(str, {
      charCase: true,
      symbols: true,
      whitespaces: true,
    });
    if (!regex) return str;
    const generator = new RandExp(regex);
    generator.defaultRange.add(0, 65535);
    return generator.gen();
  }
}
// Shared module-level instance: created once when the module is evaluated (the
// constructor builds the character indices from REFERENCE) and exposed as the
// default export.
const shoetestInstance = new Shoetest();
export default shoetestInstance;