UNPKG

antlr-ng

Version:

Next generation ANTLR Tool

432 lines (431 loc) 19.1 kB
import { getCategory, isAlphabetic, isDigit, isLowerCase, isUpperCase, isWhiteSpace } from "unicode-properties";
import { UnicodeBlockConstants } from "../generated/UnicodeData.js";

// Bundler helper: pins a stable `name` property on a class or function.
const __defProp = Object.defineProperty;
const __name = (target, value) => __defProp(target, "name", { value, configurable: true });

/**
 * A collection of static constants and helpers for working with Unicode code points, with names
 * mirroring the Java `Character` API. Classification is delegated to the `unicode-properties` package.
 */
class Character {
    static {
        __name(this, "Character");
    }

    /** The number of bytes used to represent a char value in unsigned binary form. */
    static BYTES = 2;

    /** General category "Mc" in the Unicode specification. */
    static COMBINING_SPACING_MARK = 0;

    /** General category "Pc" in the Unicode specification. */
    static CONNECTOR_PUNCTUATION = 1;

    /** General category "Cc" in the Unicode specification. */
    static CONTROL = 2;

    /** General category "Sc" in the Unicode specification. */
    static CURRENCY_SYMBOL = 3;

    /** General category "Pd" in the Unicode specification. */
    static DASH_PUNCTUATION = 4;

    /** General category "Nd" in the Unicode specification. */
    static DECIMAL_DIGIT_NUMBER = 5;

    /** Weak bidirectional character type "AN" in the Unicode specification. */
    static DIRECTIONALITY_ARABIC_NUMBER = 6;

    /** Weak bidirectional character type "BN" in the Unicode specification. */
    static DIRECTIONALITY_BOUNDARY_NEUTRAL = 7;

    /** Weak bidirectional character type "CS" in the Unicode specification. */
    static DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 8;

    /** Weak bidirectional character type "EN" in the Unicode specification. */
    static DIRECTIONALITY_EUROPEAN_NUMBER = 9;

    /** Weak bidirectional character type "ES" in the Unicode specification. */
    static DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 10;

    /** Weak bidirectional character type "ET" in the Unicode specification. */
    static DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 11;

    /** Weak bidirectional character type "FSI" in the Unicode specification. */
    static DIRECTIONALITY_FIRST_STRONG_ISOLATE = 12;

    /** Strong bidirectional character type "L" in the Unicode specification. */
    static DIRECTIONALITY_LEFT_TO_RIGHT = 13;

    /** Strong bidirectional character type "LRE" in the Unicode specification. */
    static DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;

    /** Weak bidirectional character type "LRI" in the Unicode specification. */
    static DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE = 15;

    /** Strong bidirectional character type "LRO" in the Unicode specification. */
    static DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 16;

    /** Weak bidirectional character type "NSM" in the Unicode specification. */
    static DIRECTIONALITY_NONSPACING_MARK = 17;

    /** Weak bidirectional character type "ON" in the Unicode specification. */
    static DIRECTIONALITY_OTHER_NEUTRALS = 18;

    /** Strong bidirectional character type "B" in the Unicode specification. */
    static DIRECTIONALITY_PARAGRAPH_SEPARATOR = 19;

    /** Strong bidirectional character type "PDF" in the Unicode specification. */
    static DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 20;

    /** Weak bidirectional character type "PDI" in the Unicode specification. */
    static DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE = 21;

    /** Strong bidirectional character type "R" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT = 22;

    /** Strong bidirectional character type "AL" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 23;

    /** Strong bidirectional character type "RLE" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 24;

    /** Weak bidirectional character type "RLI" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE = 25;

    /** Strong bidirectional character type "RLO" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 26;

    /** Weak bidirectional character type "S" in the Unicode specification. */
    static DIRECTIONALITY_SEGMENT_SEPARATOR = 27;

    /** Undefined bidirectional character type. */
    static DIRECTIONALITY_UNDEFINED = 28;

    /** Neutral bidirectional character type "WS" in the Unicode specification. */
    static DIRECTIONALITY_WHITESPACE = 29;

    /** General category "Me" in the Unicode specification. */
    static ENCLOSING_MARK = 30;

    /** General category "Pe" in the Unicode specification. */
    static END_PUNCTUATION = 31;

    /** General category "Pf" in the Unicode specification. */
    static FINAL_QUOTE_PUNCTUATION = 32;

    /** General category "Cf" in the Unicode specification. */
    static FORMAT = 33;

    /** General category "Pi" in the Unicode specification. */
    static INITIAL_QUOTE_PUNCTUATION = 34;

    /** General category "Nl" in the Unicode specification. */
    static LETTER_NUMBER = 35;

    /** General category "Zl" in the Unicode specification. */
    static LINE_SEPARATOR = 36;

    /** General category "Ll" in the Unicode specification. */
    static LOWERCASE_LETTER = 37;

    /** General category "Sm" in the Unicode specification. */
    static MATH_SYMBOL = 38;

    /** The maximum value of a Unicode code point, constant U+10FFFF. */
    static MAX_CODE_POINT = 1114111;

    /** The maximum value of a Unicode high-surrogate code unit in the UTF-16 encoding, constant '\uDBFF'. */
    static MAX_HIGH_SURROGATE = 56319;

    /** The maximum value of a Unicode low-surrogate code unit in the UTF-16 encoding, constant '\uDFFF'. */
    static MAX_LOW_SURROGATE = 57343;

    /** The maximum radix available for conversion to and from strings. */
    static MAX_RADIX = 36;

    /** The maximum value of a Unicode surrogate code unit in the UTF-16 encoding, constant '\uDFFF'. */
    static MAX_SURROGATE = 57343;

    /** The maximum value of a Unicode code point in the Basic Multilingual Plane, constant U+FFFF. */
    static MAX_VALUE = 65535;

    /** The minimum value of a Unicode code point, constant U+0000. */
    static MIN_CODE_POINT = 0;

    /** The minimum value of a Unicode high-surrogate code unit in the UTF-16 encoding, constant '\uD800'. */
    static MIN_HIGH_SURROGATE = 55296;

    /** The minimum value of a Unicode low-surrogate code unit in the UTF-16 encoding, constant '\uDC00'. */
    static MIN_LOW_SURROGATE = 56320;

    /** The minimum radix available for conversion to and from strings. */
    static MIN_RADIX = 2;

    /** The minimum value of a Unicode supplementary code point, constant U+10000. */
    static MIN_SUPPLEMENTARY_CODE_POINT = 65536;

    /** The minimum value of a Unicode surrogate code unit in the UTF-16 encoding, constant '\uD800'. */
    static MIN_SURROGATE = 55296;

    /** The minimum value of a Unicode code point, constant U+0000. */
    static MIN_VALUE = 0;

    /** General category "Lm" in the Unicode specification. */
    static MODIFIER_LETTER = 39;

    /** General category "Sk" in the Unicode specification. */
    static MODIFIER_SYMBOL = 40;

    /** General category "Mn" in the Unicode specification. */
    static NON_SPACING_MARK = 41;

    /** General category "Lo" in the Unicode specification. */
    static OTHER_LETTER = 42;

    /** General category "No" in the Unicode specification. */
    static OTHER_NUMBER = 43;

    /** General category "Po" in the Unicode specification. */
    static OTHER_PUNCTUATION = 44;

    /** General category "So" in the Unicode specification. */
    static OTHER_SYMBOL = 45;

    /** General category "Zp" in the Unicode specification. */
    static PARAGRAPH_SEPARATOR = 46;

    /** General category "Co" in the Unicode specification. */
    static PRIVATE_USE = 47;

    /** The number of bits used to represent a char value in unsigned binary form, constant 16. */
    static SIZE = 16;

    /** General category "Zs" in the Unicode specification. */
    static SPACE_SEPARATOR = 48;

    /** General category "Ps" in the Unicode specification. */
    static START_PUNCTUATION = 49;

    /** General category "Cs" in the Unicode specification. */
    static SURROGATE = 50;

    /** General category "Lt" in the Unicode specification. */
    static TITLECASE_LETTER = 51;

    /** General category "Cn" in the Unicode specification. */
    static UNASSIGNED = 52;

    /** General category "Lu" in the Unicode specification. */
    static UPPERCASE_LETTER = 53;

    /** Lookup helpers on top of the generated Unicode block tables (`ranges` and `names`). */
    static UnicodeBlock = class extends UnicodeBlockConstants {
        static {
            __name(this, "UnicodeBlock");
        }

        /**
         * @param c The character/codepoint to check. For a string, its first code point is used.
         *
         * @returns the value representing the Unicode block containing the given character, or -1 if the
         *          character is not a member of a defined block (this includes an empty string).
         */
        static of(c) {
            const codePoint = typeof c === "string" ? c.codePointAt(0) : c;
            for (const [block, range] of this.ranges) {
                if (range[0] <= codePoint && codePoint <= range[1]) {
                    return block;
                }
            }

            return -1;
        }

        /**
         * @param name The name of the Unicode block.
         *
         * @returns the UnicodeBlock number with the given name or -1 if no Unicode block with that name could be
         *          found. Block names are determined by The Unicode Standard.
         *
         * This method accepts block names in the following forms:
         * 1. Canonical block names as defined by the Unicode Standard, e.g. "Basic Latin".
         * 2. Canonical block names with all spaces removed, e.g. "BasicLatin".
         * 3. The text representation of each constant UnicodeBlock identifier, e.g. "BASIC_LATIN". This form
         *    replaces all spaces and hyphens in the canonical name with underscores.
         *
         * Character case is ignored: the name is lower-cased and all spaces, underscores and hyphens are removed
         * before the lookup.
         * NOTE(review): assumes the keys of the inherited `names` map are stored in that normalized form -
         * confirm against the generated UnicodeData module.
         */
        static forName(name) {
            // Strip every separator character, not only those that are followed by a space.
            const normalized = name.toLowerCase().replace(/[ _-]/g, "");

            return this.names.get(normalized) ?? -1;
        }
    };

    /** Maps the two-letter general category abbreviations to the category constants of this class. */
    static categoryMapper = /* @__PURE__ */ new Map([
        ["Cc", Character.CONTROL],
        ["Cf", Character.FORMAT],
        ["Cn", Character.UNASSIGNED],
        ["Co", Character.PRIVATE_USE],
        ["Cs", Character.SURROGATE],
        ["Ll", Character.LOWERCASE_LETTER],
        ["Lm", Character.MODIFIER_LETTER],
        ["Lo", Character.OTHER_LETTER],
        ["Lt", Character.TITLECASE_LETTER],
        ["Lu", Character.UPPERCASE_LETTER],
        ["Mc", Character.COMBINING_SPACING_MARK],
        ["Me", Character.ENCLOSING_MARK],
        ["Mn", Character.NON_SPACING_MARK],
        ["Nd", Character.DECIMAL_DIGIT_NUMBER],
        ["Nl", Character.LETTER_NUMBER],
        ["No", Character.OTHER_NUMBER],
        ["Pc", Character.CONNECTOR_PUNCTUATION],
        ["Pd", Character.DASH_PUNCTUATION],
        ["Pe", Character.END_PUNCTUATION],
        ["Pf", Character.FINAL_QUOTE_PUNCTUATION],
        ["Pi", Character.INITIAL_QUOTE_PUNCTUATION],
        ["Po", Character.OTHER_PUNCTUATION],
        ["Ps", Character.START_PUNCTUATION],
        ["Sc", Character.CURRENCY_SYMBOL],
        ["Sk", Character.MODIFIER_SYMBOL],
        ["Sm", Character.MATH_SYMBOL],
        ["So", Character.OTHER_SYMBOL],
        ["Zl", Character.LINE_SEPARATOR],
        ["Zp", Character.PARAGRAPH_SEPARATOR],
        ["Zs", Character.SPACE_SEPARATOR]
    ]);

    /**
     * Returns a value indicating a character's general category.
     *
     * @param c The character to check.
     *
     * @returns The character's general category, or UNASSIGNED if the category reported for it is not in the
     *          category map.
     *
     * Note: In typescript we cannot differentiate between char and number (char is a type alias for number).
     *       That means there's only one method for the two Java getType methods.
     */
    static getType(c) {
        const category = getCategory(c);

        return Character.categoryMapper.get(category) ?? Character.UNASSIGNED;
    }

    /**
     * Returns the leading surrogate (a high surrogate code unit) of the surrogate pair representing the specified
     * supplementary character (Unicode code point) in the UTF-16 encoding. If the specified character is not a
     * supplementary character, an unspecified char is returned.
     *
     * @param codePoint The supplementary character (Unicode code point) for which to get the leading surrogate.
     *
     * @returns The leading surrogate code unit used to represent the character in the UTF-16 encoding.
     */
    static highSurrogate(codePoint) {
        return (codePoint >>> 10)
            + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10));
    }

    /**
     * @param codePoint The supplementary character (Unicode code point) for which to get the trailing surrogate.
     *
     * @returns the trailing surrogate (a low surrogate code unit) of the surrogate pair representing the specified
     *          supplementary character (Unicode code point) in the UTF-16 encoding. If the specified character is
     *          not a supplementary character, an unspecified char is returned.
     */
    static lowSurrogate(codePoint) {
        return (codePoint & 1023) + Character.MIN_LOW_SURROGATE;
    }

    /**
     * Determines if the specified character (Unicode code point) is an alphabet.
     *
     * @param codePoint the character (Unicode code point) to be tested.
     *
     * @returns true if the character is a Unicode alphabet character, false otherwise.
     */
    static isAlphabetic(codePoint) {
        return isAlphabetic(codePoint);
    }

    /**
     * Determines if the specified character (Unicode code point) is a digit.
     *
     * @param c The character to check.
     *
     * @returns True, if the character is a digit, otherwise false
     */
    static isDigit(c) {
        return isDigit(c);
    }

    /**
     * Determines if the given char value is a Unicode high-surrogate code unit (also known as leading-surrogate
     * code unit).
     *
     * @param ch The character to check.
     *
     * @returns True, if the character is a high surrogate, otherwise false.
     */
    static isHighSurrogate(ch) {
        return Character.MIN_HIGH_SURROGATE <= ch && ch <= Character.MAX_HIGH_SURROGATE;
    }

    /**
     * Determines if the specified character should be regarded as an ignorable character in an identifier.
     * That is the case for characters of the FORMAT ("Cf") category and for the control character ranges
     * U+0000..U+0008, U+000E..U+001B and U+007F..U+009F.
     *
     * @param c The character (Unicode code point) to check.
     *
     * @returns True, if the character is ignorable in an identifier, otherwise false.
     */
    static isIdentifierIgnorable(c) {
        if (this.getType(c) === Character.FORMAT) {
            return true;
        }

        // Each range needs both bounds joined with `&&`; an `||` here makes the test accept every value.
        return (c >= 0x00 && c <= 0x08) || (c >= 0x0e && c <= 0x1b) || (c >= 0x7f && c <= 0x9f);
    }

    /**
     * Determines if the specified character may be part of an identifier as other than the first character:
     * a letter, a digit, a letter number, a currency symbol, a combining or non-spacing mark, a connector
     * punctuation character, or a character accepted by {@link Character.isIdentifierIgnorable}.
     *
     * @param c The character (Unicode code point) to check.
     *
     * @returns True, if the character may be part of an identifier, otherwise false.
     */
    static isJavaIdentifierPart(c) {
        if (this.isLetter(c) || this.isDigit(c)) {
            return true;
        }

        const type = this.getType(c);

        return type === Character.LETTER_NUMBER
            || type === Character.CURRENCY_SYMBOL
            || type === Character.COMBINING_SPACING_MARK
            || type === Character.CONNECTOR_PUNCTUATION
            || type === Character.NON_SPACING_MARK
            || this.isIdentifierIgnorable(c);
    }

    /**
     * Determines if the specified character is permissible as the first character of an identifier:
     * a letter, a currency symbol, a letter number or a connector punctuation character.
     *
     * @param c The character (Unicode code point) to check.
     *
     * @returns True, if the character may start an identifier, otherwise false.
     */
    static isJavaIdentifierStart(c) {
        if (this.isLetter(c)) {
            return true;
        }

        const type = this.getType(c);

        return type === Character.CURRENCY_SYMBOL
            || type === Character.LETTER_NUMBER
            || type === Character.CONNECTOR_PUNCTUATION;
    }

    /**
     * Determines if the specified character is a letter, i.e. its general category is one of
     * Lu, Ll, Lt, Lm or Lo.
     *
     * @param c The character (Unicode code point) to check.
     *
     * @returns True, if the character is a letter, otherwise false.
     */
    static isLetter(c) {
        const type = this.getType(c);

        return type === Character.UPPERCASE_LETTER
            || type === Character.LOWERCASE_LETTER
            || type === Character.TITLECASE_LETTER
            || type === Character.MODIFIER_LETTER
            || type === Character.OTHER_LETTER;
    }

    /**
     * @param c The character (Unicode code point) to check.
     *
     * @returns True, if the character is a letter or a digit, otherwise false.
     */
    static isLetterOrDigit(c) {
        return this.isLetter(c) || this.isDigit(c);
    }

    /**
     * Determines if the specified character (Unicode code point) is a lowercase character.
     *
     * @param c The character to check.
     *
     * @returns True, if the character is a lowercase character, otherwise false.
     */
    static isLowerCase(c) {
        return isLowerCase(c);
    }

    /**
     * Determines if the given char value is a Unicode low-surrogate code unit (also known as trailing-surrogate
     * code unit).
     *
     * @param c The character to check.
     *
     * @returns True, if the character is a low surrogate, otherwise false.
     */
    static isLowSurrogate(c) {
        return Character.MIN_LOW_SURROGATE <= c && c <= Character.MAX_LOW_SURROGATE;
    }

    /**
     * Determines whether the specified character (Unicode code point) is in the supplementary character range.
     *
     * @param codePoint The character to check.
     *
     * @returns True, if the character is in the supplementary character range, otherwise false.
     */
    static isSupplementaryCodePoint(codePoint) {
        return codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT && codePoint <= Character.MAX_CODE_POINT;
    }

    /**
     * Determines if the specified character (Unicode code point) may be part of a Unicode identifier as other than
     * the first character.
     *
     * NOTE(review): this currently applies exactly the same test as {@link Character.isUnicodeIdentifierStart},
     * so digits, marks and connector punctuation are rejected - confirm whether that restriction is intended.
     *
     * @param c The character to check.
     *
     * @returns True, if the character may be part of a Unicode identifier, otherwise false.
     */
    static isUnicodeIdentifierPart(c) {
        return this.isUnicodeIdentifierStart(c);
    }

    /**
     * Determines if the specified character is permissible as the first character in a Unicode identifier.
     * A character may start a Unicode identifier if and only if one of the following is true:
     * - it is a letter (general category Lu, Ll, Lt, Lm or Lo)
     * - its general category is LETTER_NUMBER (Nl)
     *
     * @param c The character to check.
     *
     * @returns True, if the character is permissible as the first character in a Unicode identifier, otherwise false.
     */
    static isUnicodeIdentifierStart(c) {
        const type = this.getType(c);

        return type === Character.UPPERCASE_LETTER
            || type === Character.LOWERCASE_LETTER
            || type === Character.TITLECASE_LETTER
            || type === Character.MODIFIER_LETTER
            || type === Character.OTHER_LETTER // p{L}
            || type === Character.LETTER_NUMBER;
    }

    /**
     * Determines if the specified character (Unicode code point) is an uppercase character.
     *
     * @param c The character to check.
     *
     * @returns True, if the character is an uppercase character, otherwise false.
     */
    static isUpperCase(c) {
        return isUpperCase(c);
    }

    /**
     * Determines if the specified character is an ISO control character, i.e. lies in the range
     * U+0000..U+001F or U+007F..U+009F.
     *
     * @param c The character (Unicode code point) to check.
     *
     * @returns True, if the character is an ISO control character, otherwise false.
     */
    static isISOControl(c) {
        // The lower bound keeps negative (invalid) values from being reported as control characters.
        return (c >= 0 && c <= 0x1f) || (c >= 0x7f && c <= 0x9f);
    }

    /**
     * Determines if the specified character is white space, as reported by the `unicode-properties` package.
     *
     * @param c The character (Unicode code point) to check.
     *
     * @returns True, if the character is white space, otherwise false.
     */
    static isWhitespace(c) {
        return isWhiteSpace(c);
    }

    /**
     * Converts the specified surrogate pair to its supplementary code point value.
     *
     * @param high The leading surrogate.
     * @param low The trailing surrogate.
     *
     * @returns The computed Unicode codepoint.
     */
    static toCodePoint(high, low) {
        return (high << 10) + low
            + Character.MIN_SUPPLEMENTARY_CODE_POINT
            - (Character.MIN_HIGH_SURROGATE << 10)
            - Character.MIN_LOW_SURROGATE;
    }

    /**
     * @param c The Unicode code point to convert.
     *
     * @returns A string consisting of the given code point.
     */
    static toString(c) {
        return String.fromCodePoint(c);
    }

    /**
     * Converts a code point or a string to upper case.
     *
     * @param s Either a Unicode code point (number) or a string.
     *
     * @returns For a number: the first code point of the upper-cased character. For a string: the upper-cased
     *          string.
     */
    static toUpperCase(s) {
        if (typeof s === "number") {
            return String.fromCodePoint(s).toUpperCase().codePointAt(0);
        }

        return s.toUpperCase();
    }

    /**
     * Converts a code point or a string to lower case.
     *
     * @param s Either a Unicode code point (number) or a string.
     *
     * @returns For a number: the first code point of the lower-cased character. For a string: the lower-cased
     *          string.
     */
    static toLowerCase(s) {
        if (typeof s === "number") {
            return String.fromCodePoint(s).toLowerCase().codePointAt(0);
        }

        return s.toLowerCase();
    }

    /**
     * Determines the number of char values needed to represent the specified character (Unicode code point).
     * If the specified character is equal to or greater than 0x10000, then the method returns 2.
     * Otherwise, the method returns 1.
     *
     * @param codePoint The character (Unicode code point) to check.
     *
     * @returns The number of char values needed to represent the specified character.
     */
    static charCount(codePoint) {
        return codePoint >= 65536 ? 2 : 1;
    }
}

export { Character };