UNPKG

extract-zhongwen

Version:

Utility for extracting Chinese characters from a string

github.com/TheRobertLing/extract-zhongwen

TheRobertLing/extract-zhongwen

118 lines (117 loc) • 4.65 kB

JavaScript

/**
 * =============================================================
 *
 * UNICODE RANGES
 *
 * =============================================================
 *
 * The following were adapted from the following MIT-licensed project:
 * https://github.com/alsotang/is-chinese
 *
 * The main differences include:
 * - Excludes CJK Compatibility Range [0x3300, 0x33ff]
 * - Excludes CJK Compatibility Forms [0xfe30, 0xfe4f]
 * - Excludes Ideographic Description Characters [0x2ff0, 0x2fff],
 *   since they are just structural indicators
 *
 * All unicode ranges were sourced from:
 * https://www.unicode.org/charts/
 *
 */
const characterUnicodeRanges = [
  [0x4e00, 0x9fff], // CJK Unified Ideographs
  [0x3400, 0x4dbf], // CJK Extension A
  [0x20000, 0x2a6df], // CJK Extension B
  [0x2a700, 0x2b739], // CJK Extension C
  [0x2b740, 0x2b81d], // CJK Extension D
  [0x2b820, 0x2cea1], // CJK Extension E
  [0x2ceb0, 0x2ebe0], // CJK Extension F
  [0x30000, 0x3134a], // CJK Extension G
  [0x31350, 0x323af], // CJK Extension H
  [0x2ebf0, 0x2ee5d], // CJK Extension I
  [0xf900, 0xfad9], // CJK Compatibility Ideographs
  [0x2f800, 0x2fa1d], // CJK Compatibility Ideographs Supplement
  [0x2f00, 0x2fd5], // Kangxi Radicals
  [0x2e80, 0x2ed3], // CJK Radicals Supplement
  // CJK Strokes. Technically ends at 0x31ef, but 0x31ef is an
  // ideographic description character
  [0x31c0, 0x31e5],
];

/**
 * =============================================================
 *
 * FUNCTION DEFINITIONS
 *
 * =============================================================
 */

/**
 * Convert a list of inclusive code point ranges into the body of a
 * regular expression character class (without the surrounding brackets).
 * The output uses `\u{...}` escapes and therefore requires the `u` flag.
 *
 * @param {Array<[number, number]>} ranges - Inclusive [start, end] code points.
 * @returns {string} e.g. `\u{4e00}-\u{9fff}\u{3400}-\u{4dbf}`
 */
const unicodeToRegex = (ranges) =>
  ranges
    .map(([start, end]) => `\\u{${start.toString(16)}}-\\u{${end.toString(16)}}`)
    .join("");

/**
 * Turn a user supplied list of characters into text that is safe to embed
 * inside a regular expression character class.
 *
 * @param {string} str - Characters supplied by the caller.
 * @returns {string} The escaped character class body.
 */
const userListsToRegex = (str) =>
  str
    // Escape any reserved symbols/characters e.g * -> \*
    .replace(/[-\/\\^$*+?.()|[\]{}]/g, "\\$&")
    // Any whitespace in the list stands for "all whitespace". The result is
    // placed inside [...] where a quantifier would be a literal "+", so emit
    // a bare \s rather than \s+.
    .replace(/\s+/g, "\\s");

/**
 * Build the two regular expressions used by `extract`.
 *
 * @param {Array<[number, number]>} ranges - Code point ranges to keep.
 * @param {string} includeCharacters - Extra characters to keep.
 * @param {string} excludeCharacters - Characters to always remove.
 * @returns {{ whitelist: RegExp, blacklist: RegExp }} `whitelist` matches runs
 *   of characters that are NOT allowed (so they can be stripped); `blacklist`
 *   matches runs of explicitly excluded characters.
 */
const combineToRegex = (ranges, includeCharacters, excludeCharacters) => {
  // Convert Unicode Ranges to RegEx string
  const rangesRegEx = unicodeToRegex(ranges);

  // Normalize the user provided whitelist/blacklists
  const includeRegEx = userListsToRegex(includeCharacters);
  const excludeRegEx = userListsToRegex(excludeCharacters);

  const whitelistPattern = `[^${rangesRegEx}${includeRegEx}]+`;
  // "(?!)" never matches; it avoids the syntax error an empty class would cause
  const blacklistPattern = excludeRegEx ? `[${excludeRegEx}]+` : "(?!)";

  return {
    whitelist: new RegExp(whitelistPattern, "gu"),
    blacklist: new RegExp(blacklistPattern, "gu"),
  };
};

/**
 * Remove repeated characters from a string, keeping the first occurrence of
 * each. Works on code points, so surrogate pairs are never split.
 *
 * @param {string} str - Input string.
 * @returns {string} The string with later duplicates removed.
 */
const removeDuplicatesFromString = (str) => [...new Set(str)].join("");

/**
 * NFKC-normalize a string while leaving the given characters untouched.
 *
 * Runs of unprotected characters are normalized together and protected
 * characters are copied through verbatim. This stays correct when
 * normalization expands or contracts text, which a position-by-position
 * comparison of the original and normalized strings cannot guarantee.
 *
 * @param {string} input - String to normalize.
 * @param {Set<string>} protectedCharacters - Code points (as strings) to keep as-is.
 * @returns {string} The normalized string.
 */
const normalizePreservingCharacters = (input, protectedCharacters) => {
  let result = "";
  let pending = "";

  // for...of iterates by code point, so astral characters stay intact
  for (const char of input) {
    if (protectedCharacters.has(char)) {
      result += pending.normalize("NFKC") + char;
      pending = "";
    } else {
      pending += char;
    }
  }

  return result + pending.normalize("NFKC");
};

/**
 * Extract the Chinese characters contained in a string.
 *
 * @param {string} input - Text to extract from.
 * @param {object} [options]
 * @param {boolean} [options.normalizeUnicode=true] - Apply NFKC normalization
 *   first. Characters listed in `includeCharacters` / `excludeCharacters` are
 *   never normalized.
 * @param {boolean} [options.removeDuplicates=true] - Keep only the first
 *   occurrence of each character.
 * @param {string} [options.includeCharacters=""] - Additional characters to
 *   keep. Any whitespace in this list keeps all whitespace.
 * @param {string} [options.excludeCharacters=""] - Characters to remove even
 *   if they would otherwise be kept.
 * @returns {string} The extracted characters in their original order.
 */
const extract = (
  input,
  {
    normalizeUnicode = true,
    removeDuplicates = true,
    includeCharacters = "",
    excludeCharacters = "",
  } = {},
) => {
  const { whitelist, blacklist } = combineToRegex(
    characterUnicodeRanges,
    includeCharacters,
    excludeCharacters,
  );

  let output = input;

  if (normalizeUnicode) {
    // Spread (not split("")) so characters outside the BMP are stored as
    // whole code points instead of lone surrogate halves
    const protectedCharacters = new Set([
      ...includeCharacters,
      ...excludeCharacters,
    ]);
    output = normalizePreservingCharacters(input, protectedCharacters);
  }

  output = output.replace(whitelist, "").replace(blacklist, "");

  return removeDuplicates ? removeDuplicatesFromString(output) : output;
};

export { extract };