@orchidjs/unicode-variants
Version:
Unicode variant string matching
78 lines (77 loc) • 2.77 kB
TypeScript
import { escape_regex } from './regex.ts';
export type TUnicodeMap = {
[key: string]: string;
};
export type TUnicodeSets = {
[key: string]: Set<string>;
};
export type TCodePoints = [[number, number]];
export type TCodePointObj = {
folded: string;
composed: string;
code_point: number;
};
export type TSequencePart = {
start: number;
end: number;
length: number;
substr: string;
};
export declare const code_points: TCodePoints;
export declare let unicode_map: TUnicodeMap;
/**
* Initialize the unicode_map from the give code point ranges
*/
export declare const initialize: (_code_points?: TCodePoints) => void;
/**
* Helper method for normalize a string
* https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize
*/
export declare const normalize: (str: string, form?: string) => string;
/**
* Remove accents without reordering string
* calling str.normalize('NFKD') on \u{594}\u{595}\u{596} becomes \u{596}\u{594}\u{595}
* via https://github.com/krisk/Fuse/issues/133#issuecomment-318692703
*/
export declare const asciifold: (str: string) => string;
export declare const _asciifold: (str: string) => string;
/**
* Generate a list of unicode variants from the list of code points
*/
export declare function generator(code_points: TCodePoints): Generator<TCodePointObj>;
/**
* Generate a unicode map from the list of code points
*/
export declare const generateSets: (code_points: TCodePoints) => TUnicodeSets;
/**
* Generate a unicode map from the list of code points
* ae => (?:(?:ae|Æ|Ǽ|Ǣ)|(?:A|Ⓐ|A...)(?:E|ɛ|Ⓔ...))
*/
export declare const generateMap: (code_points: TCodePoints) => TUnicodeMap;
/**
* Map each element of an array from its folded value to all possible unicode matches
*/
export declare const mapSequence: (strings: string[], min_replacement?: number) => string;
/**
* Convert a short string and split it into all possible patterns
* Keep a pattern only if min_replacement is met
*
* 'abc'
* => [['abc'],['ab','c'],['a','bc'],['a','b','c']]
* => ['abc-pattern','ab-c-pattern'...]
*/
export declare const substringsToPattern: (str: string, min_replacement?: number) => string;
/**
* Expand a regular expression pattern to include unicode variants
* eg /a/ becomes /aⓐaẚàáâầấẫẩãāăằắẵẳȧǡäǟảåǻǎȁȃạậặḁąⱥɐɑAⒶAÀÁÂẦẤẪẨÃĀĂẰẮẴẲȦǠÄǞẢÅǺǍȀȂẠẬẶḀĄȺⱯ/
*
* Issue:
* ﺊﺋ [ 'ﺊ = \\u{fe8a}', 'ﺋ = \\u{fe8b}' ]
* becomes: ئئ [ 'ي = \\u{64a}', 'ٔ = \\u{654}', 'ي = \\u{64a}', 'ٔ = \\u{654}' ]
*
* İIJ = IIJ = ⅡJ
*
* 1/2/4
*/
export declare const getPattern: (str: string) => string | undefined;
export { escape_regex };