UNPKG

@vrcd-community/zhlint

Version:

A linting tool for Chinese language.

143 lines (142 loc) 4.93 kB
import { CharType } from './types.js';

/**
 * NOTE:
 * - U+FE41 PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET -> U+300C LEFT CORNER BRACKET, etc.
 * - U+2E3A TWO-EM DASH, U+2014 EM DASH x2
 * - U+2026 HORIZONTAL ELLIPSIS, U+22EF MIDLINE HORIZONTAL ELLIPSIS
 * - U+25CF BLACK CIRCLE (emphasis dots), U+2022 BULLET (emphasis dots), U+00B7 MIDDLE DOT (interpuncts),
 *   U+2027 HYPHENATION POINT, U+30FB KATAKANA MIDDLE DOT
 *
 * Decoration marks:
 * - emphasis dots: U+25CF BLACK CIRCLE, U+2022 BULLET
 * - book title marks: U+FE4F WAVY LOW LINE
 * - proper noun marks: U+FF3F FULLWIDTH LOW LINE
 *
 * Punctuation sets keyed by char type. Insertion order matters: the sets are
 * scanned in this order and the first one containing the character wins.
 *
 * Characters from the "Halfwidth and Fullwidth Forms" block (U+FF00-U+FFEF)
 * are written as `\u` escapes on purpose: they look almost identical to their
 * ASCII counterparts, and an ASCII look-alike in a fullwidth set would be
 * shadowed by the halfwidth set checked before it, leaving the real fullwidth
 * mark unclassified.
 */
const newCharTypeSet = {
  [CharType.HALFWIDTH_PAUSE_OR_STOP]: ',.;:?!',
  [CharType.FULLWIDTH_PAUSE_OR_STOP]: [
    // normal punctuation marks:
    // U+FF0C FULLWIDTH COMMA, U+3002 IDEOGRAPHIC FULL STOP, U+3001 IDEOGRAPHIC COMMA,
    // U+FF1B FULLWIDTH SEMICOLON, U+FF1A FULLWIDTH COLON,
    // U+FF1F FULLWIDTH QUESTION MARK, U+FF01 FULLWIDTH EXCLAMATION MARK
    '\uFF0C。、\uFF1B\uFF1A\uFF1F\uFF01',
    // special punctuation marks
    '⁈⁇‼⁉'
  ].join(''),
  [CharType.HALFWIDTH_QUOTATION]: '\'"',
  [CharType.FULLWIDTH_QUOTATION]: '‘’“”《》〈〉『』「」【】〖〗',
  [CharType.HALFWIDTH_BRACKET]: '()[]{}',
  // U+FF08/U+FF09 FULLWIDTH PARENTHESES, U+3014/U+3015 TORTOISE SHELL BRACKETS,
  // U+FF3B/U+FF3D FULLWIDTH SQUARE BRACKETS, U+FF5B/U+FF5D FULLWIDTH CURLY BRACKETS
  [CharType.FULLWIDTH_BRACKET]: '\uFF08\uFF09〔〕\uFF3B\uFF3D\uFF5B\uFF5D',
  [CharType.HALFWIDTH_OTHER_PUNCTUATION]: [
    // on-keyboard symbols
    '~-+*/\\%=&|`<>@#$^',
    // symbol of death
    '†‡'
  ].join(''),
  [CharType.FULLWIDTH_OTHER_PUNCTUATION]: [
    // U+2014 EM DASH, U+2E3A TWO-EM DASH
    '—⸺',
    // U+2026 HORIZONTAL ELLIPSIS, U+22EF MIDLINE HORIZONTAL ELLIPSIS
    '…⋯',
    // U+FF5E FULLWIDTH TILDE
    '\uFF5E',
    // U+25CF BLACK CIRCLE, U+2022 BULLET, U+00B7 MIDDLE DOT,
    // U+2027 HYPHENATION POINT, U+30FB KATAKANA MIDDLE DOT
    '●•·‧・'
  ].join('')
};

/**
 * Ordered `[pattern, charType]` pairs used once a character is known to be
 * neither whitespace nor one of the listed punctuation marks. The first
 * matching pattern decides the type, so the order below is significant.
 */
const charRangeTypes = [
  // 0-9
  [/[0-9]/, CharType.WESTERN_LETTER],
  // Basic Latin
  [/[\u0020-\u007F]/, CharType.WESTERN_LETTER],
  // Latin-1 Supplement
  [/[\u00A0-\u00FF]/, CharType.WESTERN_LETTER],
  // Latin Extended-A
  [/[\u0100-\u017F]/, CharType.WESTERN_LETTER],
  // Latin Extended-B
  [/[\u0180-\u024F]/, CharType.WESTERN_LETTER],
  // Greek and Coptic
  [/[\u0370-\u03FF]/, CharType.WESTERN_LETTER],
  // CJK Unified Ideographs
  [/[\u4E00-\u9FFF]/, CharType.CJK_CHAR],
  // CJK Unified Ideographs Extension A
  [/[\u3400-\u4DBF]/, CharType.CJK_CHAR],
  // CJK Unified Ideographs Extension B (as surrogate pairs)
  [/[\ud840-\ud868][\udc00-\udfff]|\ud869[\udc00-\uded6]/, CharType.CJK_CHAR],
  // CJK Unified Ideographs Extension C (as surrogate pairs)
  [
    /\ud869[\udf00-\udfff]|[\ud86a-\ud86c][\udc00-\udfff]|\ud86d[\udc00-\udf34]/,
    CharType.CJK_CHAR
  ],
  // CJK Unified Ideographs Extension D (as surrogate pairs)
  [/\ud86d[\udf40-\udfff]|\ud86e[\udc00-\udc1d]/, CharType.CJK_CHAR],
  // CJK Compatibility Ideographs
  [/[\uF900-\uFAFF]/, CharType.CJK_CHAR],
  // CJK Compatibility Forms
  [/[\uFE30-\uFE4F]/, CharType.CJK_CHAR],
  // CJK Radicals Supplement
  [/[\u2E80-\u2EFF]/, CharType.CJK_CHAR],
  // Private Use Area (part)
  [/[\uE815-\uE864]/, CharType.CJK_CHAR],
  // CJK Unified Ideographs Extension B (as code points)
  [/[\u{20000}-\u{2A6DF}]/u, CharType.CJK_CHAR],
  // CJK Compatibility Ideographs Supplement
  [/[\u{2F800}-\u{2FA1F}]/u, CharType.CJK_CHAR],
  // CJK Symbols and Punctuation
  [/[\u3000-\u303F]/, CharType.FULLWIDTH_OTHER_PUNCTUATION]
];

/**
 * Check whether the character is full-width or half-width,
 * content or punctuation, or empty, or space, or emoji etc.
 *
 * Checks run in this order: empty string, whitespace, the explicit
 * punctuation sets, then the Unicode range table. Anything that matches none
 * of them is reported as `CharType.UNKNOWN`.
 *
 * Refs:
 * - https://unicode.org/charts/
 * - https://jrgraphix.net/research/unicode.php
 * - https://mathiasbynens.be/notes/javascript-unicode
 * - https://stackoverflow.com/a/21113538
 * - https://www.w3.org/International/clreq/#categories_and_usage_of_punctuation_marks
 *
 * @param {string} char - The character to classify; a supplementary-plane
 *   character is passed as its two-code-unit surrogate pair.
 * @returns {CharType} The detected char type.
 */
export const checkCharType = (char) => {
  if (char === '') {
    return CharType.EMPTY;
  }

  // space
  if (char.match(/\s/) != null) {
    return CharType.SPACE;
  }

  // punctuation marks
  // NOTE(review): the value returned here is the object key as a string, so
  // this assumes CharType members are string-valued — confirm in types.js.
  for (const [charType, charSet] of Object.entries(newCharTypeSet)) {
    if (charSet.includes(char)) {
      return charType;
    }
  }

  // letters, ideographs and remaining CJK punctuation, by Unicode range
  for (const [pattern, charType] of charRangeTypes) {
    if (pattern.test(char)) {
      return charType;
    }
  }

  return CharType.UNKNOWN;
};