antlr-ng
Version:
Next generation ANTLR Tool
432 lines (431 loc) • 19.1 kB
JavaScript
// Local alias for Object.defineProperty, used by the naming helper below.
const __defProp = Object.defineProperty;

/**
 * (Re)defines the `name` property of `target` as a configurable data property holding `value`.
 *
 * @param target The object (typically a class or function) to name.
 * @param value The name to assign.
 *
 * @returns The `target` that was passed in (the result of Object.defineProperty).
 */
const __name = (target, value) => {
    const descriptor = { configurable: true, value };

    return __defProp(target, "name", descriptor);
};
import { getCategory, isAlphabetic, isDigit, isLowerCase, isUpperCase, isWhiteSpace } from "unicode-properties";
import { UnicodeBlockConstants } from "../generated/UnicodeData.js";
/**
 * A collection of static helpers and constants for working with Unicode code points, modelled after
 * Java's `java.lang.Character`. Category and property lookups are delegated to the `unicode-properties`
 * package; block lookups use the tables inherited from `UnicodeBlockConstants`.
 *
 * All static methods reference `Character` explicitly (instead of `this`), so they can safely be passed
 * around unbound, e.g. `codePoints.every(Character.isLetter)`.
 */
class Character {
    static {
        __name(this, "Character");
    }

    /** The number of bytes used to represent a char value in unsigned binary form. */
    static BYTES = 2;

    /** General category "Mc" in the Unicode specification. */
    static COMBINING_SPACING_MARK = 0;

    /** General category "Pc" in the Unicode specification. */
    static CONNECTOR_PUNCTUATION = 1;

    /** General category "Cc" in the Unicode specification. */
    static CONTROL = 2;

    /** General category "Sc" in the Unicode specification. */
    static CURRENCY_SYMBOL = 3;

    /** General category "Pd" in the Unicode specification. */
    static DASH_PUNCTUATION = 4;

    /** General category "Nd" in the Unicode specification. */
    static DECIMAL_DIGIT_NUMBER = 5;

    /** Weak bidirectional character type "AN" in the Unicode specification. */
    static DIRECTIONALITY_ARABIC_NUMBER = 6;

    /** Weak bidirectional character type "BN" in the Unicode specification. */
    static DIRECTIONALITY_BOUNDARY_NEUTRAL = 7;

    /** Weak bidirectional character type "CS" in the Unicode specification. */
    static DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 8;

    /** Weak bidirectional character type "EN" in the Unicode specification. */
    static DIRECTIONALITY_EUROPEAN_NUMBER = 9;

    /** Weak bidirectional character type "ES" in the Unicode specification. */
    static DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 10;

    /** Weak bidirectional character type "ET" in the Unicode specification. */
    static DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 11;

    /** Weak bidirectional character type "FSI" in the Unicode specification. */
    static DIRECTIONALITY_FIRST_STRONG_ISOLATE = 12;

    /** Strong bidirectional character type "L" in the Unicode specification. */
    static DIRECTIONALITY_LEFT_TO_RIGHT = 13;

    /** Strong bidirectional character type "LRE" in the Unicode specification. */
    static DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14;

    /** Weak bidirectional character type "LRI" in the Unicode specification. */
    static DIRECTIONALITY_LEFT_TO_RIGHT_ISOLATE = 15;

    /** Strong bidirectional character type "LRO" in the Unicode specification. */
    static DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 16;

    /** Weak bidirectional character type "NSM" in the Unicode specification. */
    static DIRECTIONALITY_NONSPACING_MARK = 17;

    /** Neutral bidirectional character type "ON" in the Unicode specification. */
    static DIRECTIONALITY_OTHER_NEUTRALS = 18;

    /** Neutral bidirectional character type "B" in the Unicode specification. */
    static DIRECTIONALITY_PARAGRAPH_SEPARATOR = 19;

    /** Weak bidirectional character type "PDF" in the Unicode specification. */
    static DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 20;

    /** Weak bidirectional character type "PDI" in the Unicode specification. */
    static DIRECTIONALITY_POP_DIRECTIONAL_ISOLATE = 21;

    /** Strong bidirectional character type "R" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT = 22;

    /** Strong bidirectional character type "AL" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 23;

    /** Strong bidirectional character type "RLE" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 24;

    /** Weak bidirectional character type "RLI" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT_ISOLATE = 25;

    /** Strong bidirectional character type "RLO" in the Unicode specification. */
    static DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 26;

    /** Neutral bidirectional character type "S" in the Unicode specification. */
    static DIRECTIONALITY_SEGMENT_SEPARATOR = 27;

    /** Undefined bidirectional character type. */
    static DIRECTIONALITY_UNDEFINED = 28;

    /** Neutral bidirectional character type "WS" in the Unicode specification. */
    static DIRECTIONALITY_WHITESPACE = 29;

    /** General category "Me" in the Unicode specification. */
    static ENCLOSING_MARK = 30;

    /** General category "Pe" in the Unicode specification. */
    static END_PUNCTUATION = 31;

    /** General category "Pf" in the Unicode specification. */
    static FINAL_QUOTE_PUNCTUATION = 32;

    /** General category "Cf" in the Unicode specification. */
    static FORMAT = 33;

    /** General category "Pi" in the Unicode specification. */
    static INITIAL_QUOTE_PUNCTUATION = 34;

    /** General category "Nl" in the Unicode specification. */
    static LETTER_NUMBER = 35;

    /** General category "Zl" in the Unicode specification. */
    static LINE_SEPARATOR = 36;

    /** General category "Ll" in the Unicode specification. */
    static LOWERCASE_LETTER = 37;

    /** General category "Sm" in the Unicode specification. */
    static MATH_SYMBOL = 38;

    /** The maximum value of a Unicode code point, constant U+10FFFF. */
    static MAX_CODE_POINT = 0x10FFFF;

    /** The maximum value of a Unicode high-surrogate code unit in the UTF-16 encoding, constant '\uDBFF'. */
    static MAX_HIGH_SURROGATE = 0xDBFF;

    /** The maximum value of a Unicode low-surrogate code unit in the UTF-16 encoding, constant '\uDFFF'. */
    static MAX_LOW_SURROGATE = 0xDFFF;

    /** The maximum radix available for conversion to and from strings. */
    static MAX_RADIX = 36;

    /** The maximum value of a Unicode surrogate code unit in the UTF-16 encoding, constant '\uDFFF'. */
    static MAX_SURROGATE = 0xDFFF;

    /** The maximum value of a Unicode code point in the Basic Multilingual Plane, constant U+FFFF. */
    static MAX_VALUE = 0xFFFF;

    /** The minimum value of a Unicode code point, constant U+0000. */
    static MIN_CODE_POINT = 0;

    /** The minimum value of a Unicode high-surrogate code unit in the UTF-16 encoding, constant '\uD800'. */
    static MIN_HIGH_SURROGATE = 0xD800;

    /** The minimum value of a Unicode low-surrogate code unit in the UTF-16 encoding, constant '\uDC00'. */
    static MIN_LOW_SURROGATE = 0xDC00;

    /** The minimum radix available for conversion to and from strings. */
    static MIN_RADIX = 2;

    /** The minimum value of a Unicode supplementary code point, constant U+10000. */
    static MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;

    /** The minimum value of a Unicode surrogate code unit in the UTF-16 encoding, constant '\uD800'. */
    static MIN_SURROGATE = 0xD800;

    /** The minimum value of a Unicode code point, constant U+0000. */
    static MIN_VALUE = 0;

    /** General category "Lm" in the Unicode specification. */
    static MODIFIER_LETTER = 39;

    /** General category "Sk" in the Unicode specification. */
    static MODIFIER_SYMBOL = 40;

    /** General category "Mn" in the Unicode specification. */
    static NON_SPACING_MARK = 41;

    /** General category "Lo" in the Unicode specification. */
    static OTHER_LETTER = 42;

    /** General category "No" in the Unicode specification. */
    static OTHER_NUMBER = 43;

    /** General category "Po" in the Unicode specification. */
    static OTHER_PUNCTUATION = 44;

    /** General category "So" in the Unicode specification. */
    static OTHER_SYMBOL = 45;

    /** General category "Zp" in the Unicode specification. */
    static PARAGRAPH_SEPARATOR = 46;

    /** General category "Co" in the Unicode specification. */
    static PRIVATE_USE = 47;

    /** The number of bits used to represent a char value in unsigned binary form, constant 16. */
    static SIZE = 16;

    /** General category "Zs" in the Unicode specification. */
    static SPACE_SEPARATOR = 48;

    /** General category "Ps" in the Unicode specification. */
    static START_PUNCTUATION = 49;

    /** General category "Cs" in the Unicode specification. */
    static SURROGATE = 50;

    /** General category "Lt" in the Unicode specification. */
    static TITLECASE_LETTER = 51;

    /** General category "Cn" in the Unicode specification. */
    static UNASSIGNED = 52;

    /** General category "Lu" in the Unicode specification. */
    static UPPERCASE_LETTER = 53;

    /**
     * Lookup helpers for Unicode blocks. The block identifiers as well as the `ranges` and `names` tables
     * used below are inherited from `UnicodeBlockConstants`.
     */
    static UnicodeBlock = class extends UnicodeBlockConstants {
        static {
            __name(this, "UnicodeBlock");
        }

        /**
         * @param c The character/codepoint to check. For a string only its first code point is considered.
         *
         * @returns the value representing the Unicode block containing the given character, or -1 if the
         *          character is not a member of a defined block (which includes the empty string).
         */
        static of(c) {
            const codePoint = typeof c === "string" ? c.codePointAt(0) : c;

            // Each entry maps a block id to an inclusive [first, last] code point range.
            for (const [block, range] of this.ranges) {
                if (range[0] <= codePoint && codePoint <= range[1]) {
                    return block;
                }
            }

            return -1;
        }

        /**
         * @param name The name of the Unicode block.
         *
         * @returns the UnicodeBlock number with the given name or -1 if no Unicode block with that name could be
         *          found. Block names are determined by The Unicode Standard.
         *
         * This method accepts block names in the following forms:
         * 1. Canonical block names as defined by the Unicode Standard. For example, the standard defines a
         *    "Basic Latin" block. Therefore, this method accepts "Basic Latin" as a valid block name.
         * 2. Canonical block names with all spaces removed. For example, "BasicLatin" is a valid block name for
         *    the "Basic Latin" block.
         * 3. The text representation of each constant UnicodeBlock identifier. For example, this method will
         *    return the BASIC_LATIN block if provided with the "BASIC_LATIN" name. This form replaces all spaces
         *    and hyphens in the canonical name with underscores.
         *
         * Character case is ignored for all of the valid block name forms: the name is lower-cased with
         * `String.prototype.toLowerCase` and all spaces, underscores and hyphens are removed before the lookup.
         */
        static forName(name) {
            const key = name.toLowerCase().replace(/[ _-]/g, "");

            return this.names.get(key) ?? -1;
        }
    };

    /** Maps the two-letter general category abbreviations to the category constants of this class. */
    static categoryMapper = /* @__PURE__ */ new Map([
        ["Cc", Character.CONTROL],
        ["Cf", Character.FORMAT],
        ["Cn", Character.UNASSIGNED],
        ["Co", Character.PRIVATE_USE],
        ["Cs", Character.SURROGATE],
        ["Ll", Character.LOWERCASE_LETTER],
        ["Lm", Character.MODIFIER_LETTER],
        ["Lo", Character.OTHER_LETTER],
        ["Lt", Character.TITLECASE_LETTER],
        ["Lu", Character.UPPERCASE_LETTER],
        ["Mc", Character.COMBINING_SPACING_MARK],
        ["Me", Character.ENCLOSING_MARK],
        ["Mn", Character.NON_SPACING_MARK],
        ["Nd", Character.DECIMAL_DIGIT_NUMBER],
        ["Nl", Character.LETTER_NUMBER],
        ["No", Character.OTHER_NUMBER],
        ["Pc", Character.CONNECTOR_PUNCTUATION],
        ["Pd", Character.DASH_PUNCTUATION],
        ["Pe", Character.END_PUNCTUATION],
        ["Pf", Character.FINAL_QUOTE_PUNCTUATION],
        ["Pi", Character.INITIAL_QUOTE_PUNCTUATION],
        ["Po", Character.OTHER_PUNCTUATION],
        ["Ps", Character.START_PUNCTUATION],
        ["Sc", Character.CURRENCY_SYMBOL],
        ["Sk", Character.MODIFIER_SYMBOL],
        ["Sm", Character.MATH_SYMBOL],
        ["So", Character.OTHER_SYMBOL],
        ["Zl", Character.LINE_SEPARATOR],
        ["Zp", Character.PARAGRAPH_SEPARATOR],
        ["Zs", Character.SPACE_SEPARATOR]
    ]);

    /**
     * Returns a value indicating a character's general category.
     *
     * @param c The character to check.
     *
     * @returns The character's general category constant, or {@link Character.UNASSIGNED} if the category
     *          reported for the character is not one of the known abbreviations.
     *
     * Note: In typescript we cannot differentiate between char and number (char is a type alias for number).
     * That means there's only one method for the two Java getType methods.
     */
    static getType(c) {
        const category = getCategory(c);

        return Character.categoryMapper.get(category) ?? Character.UNASSIGNED;
    }

    /**
     * Returns the leading surrogate (a high surrogate code unit) of the surrogate pair representing the specified
     * supplementary character (Unicode code point) in the UTF-16 encoding. If the specified character is not a
     * supplementary character, an unspecified char is returned.
     *
     * @param codePoint The supplementary character (Unicode code point) for which to get the leading surrogate.
     *
     * @returns The leading surrogate code unit used to represent the character in the UTF-16 encoding.
     */
    static highSurrogate(codePoint) {
        return (codePoint >>> 10) + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10));
    }

    /**
     * @param codePoint The supplementary character (Unicode code point) for which to get the trailing surrogate.
     *
     * @returns the trailing surrogate (a low surrogate code unit) of the surrogate pair representing the specified
     *          supplementary character (Unicode code point) in the UTF-16 encoding. If the specified character is
     *          not a supplementary character, an unspecified char is returned.
     */
    static lowSurrogate(codePoint) {
        // The low surrogate carries the lowest 10 bits of the code point.
        return (codePoint & 0x3FF) + Character.MIN_LOW_SURROGATE;
    }

    /**
     * Determines if the specified character (Unicode code point) is an alphabet.
     *
     * @param codePoint the character (Unicode code point) to be tested.
     *
     * @returns true if the character is a Unicode alphabet character, false otherwise.
     */
    static isAlphabetic(codePoint) {
        return isAlphabetic(codePoint);
    }

    /**
     * Determines if the specified character (Unicode code point) is a digit.
     *
     * @param c The character to check.
     *
     * @returns True, if the character is a digit, otherwise false
     */
    static isDigit(c) {
        return isDigit(c);
    }

    /**
     * Determines if the given char value is a Unicode high-surrogate code unit (also known as leading-surrogate
     * code unit).
     *
     * @param ch The character to check.
     *
     * @returns True, if the character is a high surrogate, otherwise false.
     */
    static isHighSurrogate(ch) {
        return Character.MIN_HIGH_SURROGATE <= ch && ch <= Character.MAX_HIGH_SURROGATE;
    }

    /**
     * Determines if the specified character should be regarded as an ignorable character in a Java or
     * Unicode identifier. That is the case for:
     * - ISO control characters that are not whitespace: U+0000..U+0008, U+000E..U+001B and U+007F..U+009F
     * - all characters of the general category "Cf" (FORMAT)
     *
     * @param c The character to check.
     *
     * @returns True, if the character is ignorable in an identifier, otherwise false.
     */
    static isIdentifierIgnorable(c) {
        if (Character.getType(c) === Character.FORMAT) {
            return true;
        }

        return (c >= 0x00 && c <= 0x08) || (c >= 0x0E && c <= 0x1B) || (c >= 0x7F && c <= 0x9F);
    }

    /**
     * Determines if the specified character may be part of a Java identifier as other than the first character.
     * That is the case for letters, digits, letter numbers (Nl), currency symbols (Sc), connector
     * punctuation (Pc), combining spacing marks (Mc), non-spacing marks (Mn) and identifier-ignorable characters.
     *
     * @param c The character to check.
     *
     * @returns True, if the character may be part of a Java identifier, otherwise false.
     */
    static isJavaIdentifierPart(c) {
        if (Character.isLetter(c) || Character.isDigit(c)) {
            return true;
        }

        const type = Character.getType(c);

        return type === Character.LETTER_NUMBER
            || type === Character.CURRENCY_SYMBOL
            || type === Character.COMBINING_SPACING_MARK
            || type === Character.CONNECTOR_PUNCTUATION
            || type === Character.NON_SPACING_MARK
            || Character.isIdentifierIgnorable(c);
    }

    /**
     * Determines if the specified character is permissible as the first character in a Java identifier.
     * That is the case for letters, letter numbers (Nl), currency symbols (Sc) and connector punctuation (Pc).
     *
     * @param c The character to check.
     *
     * @returns True, if the character may start a Java identifier, otherwise false.
     */
    static isJavaIdentifierStart(c) {
        if (Character.isLetter(c)) {
            return true;
        }

        const type = Character.getType(c);

        return type === Character.CURRENCY_SYMBOL
            || type === Character.LETTER_NUMBER
            || type === Character.CONNECTOR_PUNCTUATION;
    }

    /**
     * Determines if the specified character is a letter, i.e. its general category is one of
     * "Lu", "Ll", "Lt", "Lm" or "Lo".
     *
     * @param c The character to check.
     *
     * @returns True, if the character is a letter, otherwise false.
     */
    static isLetter(c) {
        const type = Character.getType(c);

        return type === Character.UPPERCASE_LETTER
            || type === Character.LOWERCASE_LETTER
            || type === Character.TITLECASE_LETTER
            || type === Character.MODIFIER_LETTER
            || type === Character.OTHER_LETTER;
    }

    /**
     * Determines if the specified character is a letter or a digit.
     *
     * @param c The character to check.
     *
     * @returns True, if {@link Character.isLetter} or {@link Character.isDigit} accepts the character.
     */
    static isLetterOrDigit(c) {
        return Character.isLetter(c) || Character.isDigit(c);
    }

    /**
     * Determines if the specified character (Unicode code point) is an lowercase character.
     *
     * @param c The character to check.
     *
     * @returns True, if the character is an lowercase character, otherwise false.
     */
    static isLowerCase(c) {
        return isLowerCase(c);
    }

    /**
     * Determines if the given char value is a Unicode low-surrogate code unit (also known as trailing-surrogate
     * code unit).
     *
     * @param c The character to check.
     *
     * @returns True, if the character is a low surrogate, otherwise false.
     */
    static isLowSurrogate(c) {
        return Character.MIN_LOW_SURROGATE <= c && c <= Character.MAX_LOW_SURROGATE;
    }

    /**
     * Determines whether the specified character (Unicode code point) is in the supplementary character range.
     *
     * @param codePoint The character to check.
     *
     * @returns True, if the character is in the supplementary character range, otherwise false.
     */
    static isSupplementaryCodePoint(codePoint) {
        return codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT && codePoint <= Character.MAX_CODE_POINT;
    }

    /**
     * Determines if the specified character (Unicode code point) may be part of a Unicode identifier as other than
     * the first character. Accepted are all identifier start characters plus non-spacing marks (Mn), combining
     * spacing marks (Mc), decimal digits (Nd) and connector punctuation (Pc).
     *
     * @param c The character to check.
     *
     * @returns True, if the character may be part of a Unicode identifier, otherwise false.
     */
    static isUnicodeIdentifierPart(c) {
        if (Character.isUnicodeIdentifierStart(c)) {
            return true;
        }

        const type = Character.getType(c);

        return type === Character.NON_SPACING_MARK || type === Character.COMBINING_SPACING_MARK // p{Mn}, p{Mc}
            || type === Character.DECIMAL_DIGIT_NUMBER // p{Nd}
            || type === Character.CONNECTOR_PUNCTUATION; // p{Pc}
    }

    /**
     * Determines if the specified character is permissible as the first character in a Unicode identifier.
     * A character may start a Unicode identifier if and only if it is a letter (general category "Lu", "Ll",
     * "Lt", "Lm" or "Lo") or a letter number (general category "Nl").
     *
     * @param c The character to check.
     *
     * @returns True, if the character is permissible as the first character in a Unicode identifier, otherwise false.
     */
    static isUnicodeIdentifierStart(c) {
        const type = Character.getType(c);

        return type === Character.UPPERCASE_LETTER
            || type === Character.LOWERCASE_LETTER
            || type === Character.TITLECASE_LETTER
            || type === Character.MODIFIER_LETTER
            || type === Character.OTHER_LETTER // p{L}
            || type === Character.LETTER_NUMBER; // p{Nl}
    }

    /**
     * Determines if the specified character (Unicode code point) is an uppercase character.
     *
     * @param c The character to check.
     *
     * @returns True, if the character is an uppercase character, otherwise false.
     */
    static isUpperCase(c) {
        return isUpperCase(c);
    }

    /**
     * Determines if the specified character is an ISO control character, i.e. its code is in the range
     * U+0000..U+001F or in the range U+007F..U+009F.
     *
     * @param c The character to check.
     *
     * @returns True, if the character is an ISO control character, otherwise false (also for negative values).
     */
    static isISOControl(c) {
        return (c >= 0x00 && c <= 0x1F) || (c >= 0x7F && c <= 0x9F);
    }

    /**
     * Determines if the specified character is white space.
     *
     * @param c The character to check.
     *
     * @returns The result of the `isWhiteSpace` check of the unicode-properties package.
     */
    static isWhitespace(c) {
        return isWhiteSpace(c);
    }

    /**
     * Converts the specified surrogate pair to its supplementary code point value. The surrogates are not
     * validated.
     *
     * @param high The leading surrogate.
     * @param low The trailing surrogate.
     *
     * @returns The computed Unicode codepoint.
     */
    static toCodePoint(high, low) {
        return (high << 10) + low + Character.MIN_SUPPLEMENTARY_CODE_POINT
            - (Character.MIN_HIGH_SURROGATE << 10) - Character.MIN_LOW_SURROGATE;
    }

    /**
     * @param c The code point to convert.
     *
     * @returns A string consisting of the given code point.
     */
    static toString(c) {
        return String.fromCodePoint(c);
    }

    /**
     * Converts a code point or a string to upper case.
     *
     * @param s Either a code point (number) or a string.
     *
     * @returns For a number: the first code point of the upper-cased one-character string (JavaScript may map
     *          a single character to several, in which case only the first one is kept). For a string: the
     *          upper-cased string.
     */
    static toUpperCase(s) {
        if (typeof s === "number") {
            return String.fromCodePoint(s).toUpperCase().codePointAt(0);
        }

        return s.toUpperCase();
    }

    /**
     * Converts a code point or a string to lower case.
     *
     * @param s Either a code point (number) or a string.
     *
     * @returns For a number: the first code point of the lower-cased one-character string (JavaScript may map
     *          a single character to several, in which case only the first one is kept). For a string: the
     *          lower-cased string.
     */
    static toLowerCase(s) {
        if (typeof s === "number") {
            return String.fromCodePoint(s).toLowerCase().codePointAt(0);
        }

        return s.toLowerCase();
    }

    /**
     * Determines the number of char values needed to represent the specified character (Unicode code point).
     * If the specified character is equal to or greater than 0x10000, then the method returns 2.
     * Otherwise, the method returns 1.
     *
     * @param codePoint The character (Unicode code point) to check.
     *
     * @returns The number of char values needed to represent the specified character.
     */
    static charCount(codePoint) {
        return codePoint >= Character.MIN_SUPPLEMENTARY_CODE_POINT ? 2 : 1;
    }
}
;
export {
Character
};