@keymanapp/common-types
Version:
Keyman Developer keyboard file types
374 lines (372 loc) • 11.9 kB
JavaScript
// Minified bootstrap snippet (presumably injected by Sentry build tooling, judging by the `_sentryDebugIds` name).
// It picks the first available global object (window, global, self, else a throwaway {}), captures the current
// stack trace string, and records this file's debug ID in `_sentryDebugIds` keyed by that stack string.
// Any failure is deliberately swallowed so the snippet can never break module loading.
!function(){try{var e="undefined"!=typeof window?window:"undefined"!=typeof global?global:"undefined"!=typeof self?self:{},n=(new Error).stack;n&&(e._sentryDebugIds=e._sentryDebugIds||{},e._sentryDebugIds[n]="a6b17902-2349-521f-a60d-874f5695b675")}catch(e){}}();
import { MATCH_HEX_ESCAPE, CONTAINS_QUAD_ESCAPE, MATCH_QUAD_ESCAPE } from './consts.js';
export { MATCH_HEX_ESCAPE, CONTAINS_QUAD_ESCAPE, MATCH_QUAD_ESCAPE };
/**
 * xml2js will not place single-entry objects into arrays. Easiest way to fix
 * this is to box them ourselves as needed. Ensures that o.x is an array.
 *
 * - property missing, `null` or `undefined`: becomes `[]`
 * - property holds any other non-array value: becomes a one-element array
 * - property already holds an array: left untouched
 *
 * Does nothing when `o` is not an object (this includes `null`).
 *
 * @param o Object with property to box (mutated in place)
 * @param x Name of element to box
 */
export function boxXmlArray(o, x) {
    // `typeof null` is 'object', so null needs its own guard before indexing into it.
    if (o === null || typeof o !== 'object' || Array.isArray(o[x])) {
        return;
    }
    const current = o[x];
    o[x] = (current === null || current === undefined) ? [] : [current];
}
/**
 * Error type thrown by the unescape helpers in this module when an escape
 * sequence cannot be converted. Adds nothing to Error beyond its identity,
 * so callers can tell it apart with `instanceof`.
 */
export class UnescapeError extends Error {}
/**
 * Turn one hexadecimal codepoint value into the character it denotes.
 * @param hex one codepoint in hex, such as '0127'
 * @returns a string containing that single codepoint
 * @throws RangeError (raised by String.fromCodePoint) when `hex` does not
 *         parse to a valid codepoint
 */
export function unescapeOne(hex) {
    return String.fromCodePoint(Number.parseInt(hex, 16));
}
/**
 * Unescape a single quad escape such as \u0127 or \U00000127.
 *
 * Surrogates are neither specially handled nor rejected, so
 * `\\uD838\\uDD09` happens to work while other combinations may not.
 *
 * @param s input string
 * @returns the input with its quad escape(s) replaced by the character(s)
 * @throws UnescapeError if `s` is falsy or does not match MATCH_QUAD_ESCAPE
 */
export function unescapeOneQuadString(s) {
    if (!(s && s.match(MATCH_QUAD_ESCAPE))) {
        throw new UnescapeError(`Not a quad escape: ${s}`);
    }
    // The callback receives the hex digits in one of two capture groups:
    // the first for the \u form, the second for the \U form.
    return s.replace(MATCH_QUAD_ESCAPE, (_whole, quad16, quad32) => unescapeOne(quad16 || quad32));
}
/**
 * Unescape every occurrence of a \u0127 style escape within a string.
 * @param s string that may contain quad escapes
 * @returns the string with each match of MATCH_QUAD_ESCAPE unescaped
 */
export function unescapeQuadString(s) {
    return s.replaceAll(MATCH_QUAD_ESCAPE, (escapeSeq) => unescapeOneQuadString(escapeSeq));
}
/**
 * Unescapes a string according to UTS#18§1.1, see <https://www.unicode.org/reports/tr18/#Hex_notation>
 *
 * Every match of MATCH_HEX_ESCAPE is replaced by the character(s) named by
 * its space-separated hex codepoints.
 *
 * @param s escaped string
 * @returns the unescaped string; a falsy `s` is returned unchanged
 * @throws UnescapeError when a codepoint is out of range (the original
 *         RangeError is attached as `cause`)
 */
export function unescapeString(s) {
    if (!s) {
        return s;
    }
    /**
     * process one regex match
     * @param _whole ignored
     * @param hexList the captured text such as '0127' or '22 22'
     * @returns the unescaped match
     */
    const expandMatch = (_whole, hexList) => hexList.split(' ').map(unescapeOne).join('');
    try {
        return s.replaceAll(MATCH_HEX_ESCAPE, expandMatch);
    }
    catch (err) {
        /* c8 ignore next 3 */
        if (!(err instanceof RangeError)) {
            throw err; // pass through some other error
        }
        throw new UnescapeError(`Out of range while unescaping '${s}': ${err.message}`, { cause: err });
    }
}
/**
 * Format an integer as exactly four lowercase hex digits: '0000' … 'ffff'.
 * @param n integer in [0x0000, 0xFFFF]
 * @returns zero-padded, 4-digit lowercase hex string
 * @throws RangeError if `n` is outside the range, or is not an integer
 *         (NaN and fractions would otherwise yield output like '0NaN' or '01.8')
 */
export function hexQuad(n) {
    if (n < 0x0000 || n > 0xFFFF) {
        throw new RangeError(`${n} not in [0x0000,0xFFFF]`);
    }
    // NaN compares false against both bounds, so it needs an explicit check.
    if (!Number.isInteger(n)) {
        throw new RangeError(`${n} is not an integer`);
    }
    return n.toString(16).padStart(4, '0');
}
/**
 * Format an integer as exactly eight lowercase hex digits: '00000000' … 'ffffffff'.
 * @param n integer in [0x00000000, 0xFFFFFFFF]
 * @returns zero-padded, 8-digit lowercase hex string
 * @throws RangeError if `n` is outside the range, or is not an integer
 *         (NaN and fractions would otherwise yield malformed output)
 */
export function hexOcts(n) {
    if (n < 0x0000 || n > 0xFFFFFFFF) {
        throw new RangeError(`${n} not in [0x00000000,0xFFFFFFFF]`);
    }
    // NaN compares false against both bounds, so it needs an explicit check.
    if (!Number.isInteger(n)) {
        throw new RangeError(`${n} is not an integer`);
    }
    return n.toString(16).padStart(8, '0');
}
/**
 * Escape the first codepoint of `ch` in hex form for use in a regex:
 * \uXXXX (via hexQuad) up to U+FFFF, \UXXXXXXXX (via hexOcts) above that.
 * @param ch string whose first codepoint is escaped
 * @returns the escape sequence
 */
export function escapeRegexChar(ch) {
    const cp = ch.codePointAt(0);
    return (cp <= 0xFFFF) ? `\\u${hexQuad(cp)}` : `\\U${hexOcts(cp)}`;
}
/**
 * chars that must be escaped: syntax, C0 + C1 controls
 *
 * The pattern is anchored at both ends around a single character class, so it
 * only matches a string consisting of exactly one listed character:
 * U+0000-U+001F, U+007F-U+009F, or one of the punctuation characters in the class.
 * It has no `g`/`y` flag, so `.test()` carries no state between calls.
 */
const REGEX_SYNTAX_CHAR = /^[\u0000-\u001F\u007F-\u009F{}\[\]\\?|.^$*()/+-]$/;
/**
 * Escape `ch` only when it has to be: either it matches REGEX_SYNTAX_CHAR,
 * or its first codepoint is rejected by isValidUnicode.
 * @param ch one character
 * @returns the escaped form, or `ch` itself when no escaping is needed
 */
function escapeRegexCharIfSyntax(ch) {
    const mustEscape = REGEX_SYNTAX_CHAR.test(ch) || !isValidUnicode(ch.codePointAt(0));
    return mustEscape ? escapeRegexChar(ch) : ch;
}
/**
 * Convert one hex codepoint into text suitable for embedding in a regex.
 * The codepoint is first unescaped to a character; that character is then
 * passed through escapeRegexCharIfSyntax, which re-escapes it only if needed.
 * @param hex one codepoint in hex, such as '0127'
 * @returns the literal character, or its escaped form
 */
function regexOne(hex) {
    const literal = unescapeOne(hex);
    // Iterate by codepoint so a character above U+FFFF is handled as one unit.
    let result = '';
    for (const ch of Array.from(literal)) {
        result += escapeRegexCharIfSyntax(ch);
    }
    return result;
}
/**
 * Escape a string (\uxxxx form) if there are any problematic codepoints.
 * Each UTF-16 code unit of `s` is passed to escapeRegexCharIfSyntax on its own.
 * @param s string to escape
 * @returns the string with problematic code units replaced by escapes
 */
export function escapeStringForRegex(s) {
    const units = s.split('');
    let escaped = '';
    for (const unit of units) {
        escaped += escapeRegexCharIfSyntax(unit);
    }
    return escaped;
}
/**
 * Unescapes UTS#18§1.1 hex notation (see <https://www.unicode.org/reports/tr18/#Hex_notation>)
 * into regex-ready text.
 *
 * Works like unescapeString, except each codepoint goes through regexOne, so
 * characters that must stay escaped in a regex come out in \u / \U form.
 *
 * @param s escaped string
 * @returns the converted string; a falsy `s` is returned unchanged
 * @throws UnescapeError when a codepoint is out of range (the original
 *         RangeError is attached as `cause`)
 */
export function unescapeStringToRegex(s) {
    if (!s) {
        return s;
    }
    /**
     * process one regex match
     * @param _whole ignored
     * @param hexList the captured text such as '0127' or '22 22'
     * @returns the converted match
     */
    const expandMatch = (_whole, hexList) => hexList.split(' ').map(regexOne).join('');
    try {
        return s.replaceAll(MATCH_HEX_ESCAPE, expandMatch);
    }
    catch (err) {
        /* c8 ignore next 3 */
        if (!(err instanceof RangeError)) {
            throw err; // pass through some other error
        }
        throw new UnescapeError(`Out of range while unescaping '${s}': ${err.message}`, { cause: err });
    }
}
/**
 * True if this string *could* be a UTF-32 single char, i.e. iterating it
 * yields exactly one codepoint.
 * @param value string to test
 */
export function isOneChar(value) {
    let codepoints = 0;
    for (const _cp of value) {
        codepoints += 1;
    }
    return codepoints === 1;
}
/**
 * Return the codepoint of a single-character string.
 * @param value string expected to hold exactly one codepoint (per isOneChar)
 * @returns the numeric codepoint
 * @throws Error if `value` is not a single character
 */
export function toOneChar(value) {
    if (isOneChar(value)) {
        return value.codePointAt(0);
    }
    throw new Error(`Not a single char: ${value}`);
}
/**
 * Build a short human-readable description of a codepoint.
 * If BadStringAnalyzer.getProblem flags it, the problem name is used as the
 * label; otherwise the label is the character itself in double quotes.
 * @param ch numeric codepoint
 * @returns for example 'PUA (U+E010)' or '"a" (U+61)'
 */
export function describeCodepoint(ch) {
    const problem = BadStringAnalyzer.getProblem(ch);
    const label = problem ?? `"${String.fromCodePoint(ch)}"`;
    const hex = Number(ch).toString(16).toUpperCase();
    return `${label} (U+${hex})`;
}
/**
 * Categories of problem that a string analysis can report.
 * Keys are the identifiers used in code; values are the display strings.
 */
export const BadStringType = {
    pua: "PUA",
    unassigned: "Unassigned",
    illegal: "Illegal",
    denormalized: "Denormalized",
};
// Following from kmx_xstring.h / .cpp
// Lead (first-of-pair) surrogate range, as tested by Uni_IsSurrogate1().
const Uni_LEAD_SURROGATE_START = 0xD800;
const Uni_LEAD_SURROGATE_END = 0xDBFF;
// Trail (second-of-pair) surrogate range, as tested by Uni_IsSurrogate2().
const Uni_TRAIL_SURROGATE_START = 0xDC00;
const Uni_TRAIL_SURROGATE_END = 0xDFFF;
// Combined surrogate span: first lead surrogate through last trail surrogate.
const Uni_SURROGATE_START = Uni_LEAD_SURROGATE_START;
const Uni_SURROGATE_END = Uni_TRAIL_SURROGATE_END;
// Noncharacter block U+FDD0..U+FDEF.
const Uni_FD_NONCHARACTER_START = 0xFDD0;
const Uni_FD_NONCHARACTER_END = 0xFDEF;
// Used as a bit mask by Uni_IsEndOfPlaneNonCharacter(): a value whose masked
// bits equal 0xFFFE ends in FFFE or FFFF.
const Uni_FFFE_NONCHARACTER = 0xFFFE;
// Bits above the low 16; isValidUnicode() compares these to detect a range
// whose endpoints lie on different planes.
const Uni_PLANE_MASK = 0x1F0000;
// Inclusive upper bound checked by Uni_InCodespace().
const Uni_MAX_CODEPOINT = 0x10FFFF;
// plane 0, 15, and 16 PUA
const Uni_PUA_00_START = 0xE000;
const Uni_PUA_00_END = 0xF8FF;
const Uni_PUA_15_START = 0x0F0000;
const Uni_PUA_15_END = 0x0FFFFD;
const Uni_PUA_16_START = 0x100000;
const Uni_PUA_16_END = 0x10FFFD;
/**
 * Tests whether a value is a lead surrogate.
 * @param ch numeric code unit / codepoint value
 * @returns true when ch lies in [Uni_LEAD_SURROGATE_START, Uni_LEAD_SURROGATE_END]
 */
export function Uni_IsSurrogate1(ch) {
    return Uni_LEAD_SURROGATE_START <= ch && ch <= Uni_LEAD_SURROGATE_END;
}
/**
 * Tests whether a value is a trail surrogate.
 * @param ch numeric code unit / codepoint value
 * @returns true when ch lies in [Uni_TRAIL_SURROGATE_START, Uni_TRAIL_SURROGATE_END]
 */
export function Uni_IsSurrogate2(ch) {
    return Uni_TRAIL_SURROGATE_START <= ch && ch <= Uni_TRAIL_SURROGATE_END;
}
/**
 * Tests whether a value is any surrogate, lead or trail.
 * @param ch numeric code unit / codepoint value
 * @returns true when either Uni_IsSurrogate1 or Uni_IsSurrogate2 accepts ch
 */
export function Uni_IsSurrogate(ch) {
    if (Uni_IsSurrogate1(ch)) {
        return true;
    }
    return Uni_IsSurrogate2(ch);
}
/**
 * True when the low 16 bits of ch are FFFE or FFFF.
 * @param ch numeric codepoint
 */
function Uni_IsEndOfPlaneNonCharacter(ch) {
    // The mask ignores bit 0 and everything above bit 15, so one comparison
    // covers both xxFFFE and xxFFFF.
    const masked = ch & Uni_FFFE_NONCHARACTER;
    return masked === Uni_FFFE_NONCHARACTER;
}
/**
 * True when ch is a noncharacter: either inside the
 * [Uni_FD_NONCHARACTER_START, Uni_FD_NONCHARACTER_END] block, or accepted by
 * Uni_IsEndOfPlaneNonCharacter.
 * @param ch numeric codepoint
 */
function Uni_IsNoncharacter(ch) {
    const inFdBlock = Uni_FD_NONCHARACTER_START <= ch && ch <= Uni_FD_NONCHARACTER_END;
    return inFdBlock || Uni_IsEndOfPlaneNonCharacter(ch);
}
/**
 * True when ch lies in [0, Uni_MAX_CODEPOINT].
 * @param ch numeric codepoint
 */
function Uni_InCodespace(ch) {
    return 0 <= ch && ch <= Uni_MAX_CODEPOINT;
}
/**
 * Validity check for a single codepoint: it must be in the codespace and be
 * neither a surrogate nor a noncharacter.
 * @param ch numeric codepoint
 */
function Uni_IsValid1(ch) {
    if (!Uni_InCodespace(ch)) {
        return false;
    }
    if (Uni_IsSurrogate(ch)) {
        return false;
    }
    return !Uni_IsNoncharacter(ch);
}
/**
 * Check that a codepoint, or an inclusive range of codepoints, is valid
 * Unicode: inside the codespace, and free of surrogates and noncharacters.
 *
 * @param start the codepoint, or the first codepoint of the range
 * @param end   optional last codepoint of the range; when omitted (or null)
 *              only `start` is checked. An explicit 0 is a real endpoint.
 * @returns true if the single codepoint, or every codepoint in
 *          [start, end], is valid; false for an inverted range
 */
export function isValidUnicode(start, end) {
    // Compare against null/undefined rather than using `!end`: U+0000 is a
    // legitimate endpoint, and treating it as "no end" let an inverted range
    // such as (5, 0) pass as a single-codepoint check.
    if (end === undefined || end === null) {
        // single char
        return Uni_IsValid1(start);
    }
    if (!Uni_IsValid1(end) || !Uni_IsValid1(start) || (end < start)) {
        // start or end out of range, or inverted range
        return false;
    }
    if ((start <= Uni_SURROGATE_END) && (end >= Uni_SURROGATE_START)) {
        // contains some of the surrogate range
        return false;
    }
    if ((start <= Uni_FD_NONCHARACTER_END) && (end >= Uni_FD_NONCHARACTER_START)) {
        // contains some of the noncharacter range
        return false;
    }
    if ((start & Uni_PLANE_MASK) !== (end & Uni_PLANE_MASK)) {
        // start and end are on different planes, meaning that the U+__FFFE/U+__FFFF noncharacters
        // are contained.
        // As a reminder, we already checked that start/end are themselves valid,
        // so we know that 'end' is not on a noncharacter at end of plane.
        return false;
    }
    return true;
}
/**
 * True when ch falls inside one of the three private-use ranges defined by
 * the Uni_PUA_00_*, Uni_PUA_15_* and Uni_PUA_16_* constants.
 * @param ch numeric codepoint
 */
export function isPUA(ch) {
    const ranges = [
        [Uni_PUA_00_START, Uni_PUA_00_END],
        [Uni_PUA_15_START, Uni_PUA_15_END],
        [Uni_PUA_16_START, Uni_PUA_16_END],
    ];
    return ranges.some(([first, last]) => ch >= first && ch <= last);
}
/**
 * Check whether a string is already in a normalized form.
 * @param s string to test
 * @returns true if s equals its NFC form or its NFD form (or is falsy);
 *          false if s is NEITHER NFC nor NFD
 */
export function isNormalized(s) {
    if (!s) {
        return true; // empty or null
    }
    return ["NFC", "NFD"].some((form) => s === s.normalize(form));
}
/**
 * Map from a problem type to the set of codepoints that exhibit it, with a
 * readable toString().
 */
class BadStringMap extends Map {
    /**
     * @returns "{}" when empty, otherwise comma-separated entries of the form
     *          `type: <described codepoints separated by spaces>`
     */
    toString() {
        if (this.size === 0) {
            return "{}";
        }
        const parts = [];
        for (const [type, codepoints] of this.entries()) {
            const described = Array.from(codepoints.values()).map(describeCodepoint).join(' ');
            parts.push(`${type}: ${described}`);
        }
        return parts.join(', ');
    }
}
/**
 * abstract class for analyzing and categorizing strings
 *
 * Subclasses supply `analyzeCodePoint(c, ch)`, which returns a problem type
 * for a codepoint, or a falsy value when there is nothing to report.
 */
export class StringAnalyzer {
    /** internal map: problem type -> Set of codepoints */
    m = new BadStringMap();
    /**
     * add a string for analysis
     * @param s string whose codepoints are each passed to analyzeCodePoint
     */
    add(s) {
        for (const c of s) {
            const ch = c.codePointAt(0);
            const problem = this.analyzeCodePoint(c, ch);
            if (problem) {
                this.addProblem(ch, problem);
            }
        }
    }
    /**
     * internal interface for the result of an analysis
     * @param ch numeric codepoint
     * @param type problem type to file it under
     */
    addProblem(ch, type) {
        const { m } = this;
        if (!m.has(type)) {
            m.set(type, new Set());
        }
        m.get(type).add(ch);
    }
    /**
     * get the results of the analysis
     * @returns the internal map, or null if no problems were recorded
     */
    analyze() {
        return this.m.size === 0 ? null : this.m;
    }
}
/** analyze a string looking for bad unicode */
export class BadStringAnalyzer extends StringAnalyzer {
    /**
     * analyze one codepoint
     * @param c the character (unused here)
     * @param ch its numeric codepoint
     */
    analyzeCodePoint(c, ch) {
        return BadStringAnalyzer.getProblem(ch);
    }
    /**
     * export analyzer function
     * @param ch numeric codepoint
     * @returns BadStringType.illegal if isValidUnicode rejects it,
     *          BadStringType.pua if isPUA accepts it, otherwise null
     */
    static getProblem(ch) {
        if (!isValidUnicode(ch)) {
            return BadStringType.illegal;
        }
        if (isPUA(ch)) {
            return BadStringType.pua;
        }
        // TODO-LDML: unassigned
        return null;
    }
}
/** Analyzer that checks if something isn't NFD */
export class NFDAnalyzer extends StringAnalyzer {
    /**
     * @param c the character to test
     * @param ch its numeric codepoint (unused here)
     * @returns BadStringType.denormalized when c differs from its NFD form,
     *          otherwise null
     */
    analyzeCodePoint(c, ch) {
        return c === c.normalize("NFD") ? null : BadStringType.denormalized;
    }
}
//# sourceMappingURL=util.js.map
//# debugId=a6b17902-2349-521f-a60d-874f5695b675