@keymanapp/common-types
Version:
Keyman Developer keyboard file types
511 lines (509 loc) • 19.1 kB
JavaScript
/**
* Utilities for transform and marker processing
*/
!function(){try{var e="undefined"!=typeof window?window:"undefined"!=typeof global?global:"undefined"!=typeof self?self:{},n=(new Error).stack;n&&(e._sentryDebugIds=e._sentryDebugIds||{},e._sentryDebugIds[n]="28718c34-6dab-55d0-af86-7052cfc2ee28")}catch(e){}}();
import { constants } from "@keymanapp/ldml-keyboard-constants";
import { MATCH_QUAD_ESCAPE } from "../util/consts.js";
import { isOneChar, unescapeOneQuadString, unescapeString, hexQuad } from "../util/util.js";
/**
* Helper function for extracting matched items
* @param str input string
* @param match global RegEx to use
* @returns array of matched values
*/
function matchArray(str, match) {
const refs = (str || '').matchAll(match);
return Array.from(refs).map(r => r[1]);
}
/**
* Common regex for an ID
*/
const COMMON_ID = /^[0-9A-Za-z_]{1,32}$/;
/**
* Class for helping with markers
*/
export class MarkerParser {
/**
* A marker id has the same constraint as a key id. TODO-LDML: Needs to be reflected in the spec
*/
static ID = COMMON_ID;
/**
* Special marker reference referring to any marker
*/
static ANY_MARKER = '\\m{.}';
/**
* id of the 'any' marker
*/
static ANY_MARKER_ID = '.';
/**
* Marker sentinel as a string - U+FFFF
*/
static SENTINEL = String.fromCodePoint(constants.uc_sentinel);
/** Marker sentinel as a regex match */
static SENTINEL_MATCH = '\\u' + hexQuad(constants.uc_sentinel);
/**
* Marker code as a string - U+0008
*/
static MARKER_CODE = String.fromCodePoint(constants.marker_code);
/** Marker code as a regex match */
static MARKER_CODE_MATCH = '\\u' + hexQuad(constants.marker_code);
/** Minimum ID (trailing code unit) */
static MIN_MARKER_INDEX = constants.marker_min_index;
/** Index meaning 'any marker' == `\m{.}` */
static ANY_MARKER_INDEX = constants.marker_any_index;
/** Maximum usable marker index */
static MAX_MARKER_INDEX = constants.marker_max_index;
/** Max count of markers */
static MAX_MARKER_COUNT = constants.marker_max_count;
static anyMarkerMatch() {
const start = hexQuad(MarkerParser.MIN_MARKER_INDEX);
const end = hexQuad(MarkerParser.MAX_MARKER_INDEX);
return `${MarkerParser.SENTINEL_MATCH}${MarkerParser.MARKER_CODE_MATCH}[\\u${start}-\\u${end}]`; // TODO-LDML: #9121 wrong escape format
}
/** Expression that matches any marker */
static ANY_MARKER_MATCH = MarkerParser.anyMarkerMatch();
/**
* Pattern for matching a marker reference, OR the special marker \m{.}
*/
static REFERENCE = /(?<!\\)(?:\\\\)*\\m{([0-9A-Za-z_]{1,32}|\.)}/g;
/**
* Pattern for matching a broken marker reference (assuming REFERENCE was not matched)
*/
static BROKEN_REFERENCE = /(?<!\\)(?:\\\\)*\\m{([^}\\{}]*)/g;
/**
* parse a string into marker references
* @param str input string such as "\m{a} … \m{.}"
* @returns `[]` or an array of all markers referenced
*/
static allReferences(str) {
if (!str) {
return [];
}
return matchArray(str, MarkerParser.REFERENCE);
}
/**
* parse a string for broken marker references
* @param str input string such as "\m{a} … \m{.}"
* @returns `[]` or an array of all broken markers referenced
*/
static allBrokenReferences(str) {
if (!str) {
return [];
}
// exclude valid markers
const strMinusGoodMarkers = str.replaceAll(this.REFERENCE, '');
return matchArray(strMinusGoodMarkers, MarkerParser.BROKEN_REFERENCE);
}
static markerCodeToString(n, forMatch) {
if (!forMatch) {
return String.fromCharCode(n);
}
else {
return `\\u${hexQuad(n)}`; // TODO-LDML: #9121 wrong escape format
}
}
/** @returns string for marker #n */
static markerOutput(n, forMatch) {
if (n < MarkerParser.MIN_MARKER_INDEX || n > MarkerParser.ANY_MARKER_INDEX) {
throw RangeError(`Internal Error: marker index out of range ${n}`);
}
if (forMatch) {
return MarkerParser.SENTINEL_MATCH + MarkerParser.MARKER_CODE_MATCH + MarkerParser.markerCodeToString(n, forMatch);
}
else {
return MarkerParser.SENTINEL + MarkerParser.MARKER_CODE + MarkerParser.markerCodeToString(n, forMatch);
}
}
/** @returns all marker strings as sentinel values */
static toSentinelString(s, markers, forMatch) {
if (!s)
return s;
return s.replaceAll(MarkerParser.REFERENCE, (sub, arg) => {
if (arg === MarkerParser.ANY_MARKER_ID) {
if (forMatch) {
return MarkerParser.ANY_MARKER_MATCH;
}
return MarkerParser.markerOutput(MarkerParser.ANY_MARKER_INDEX);
}
if (!markers) {
throw RangeError(`Internal Error: Could not find marker \\m{${arg}} (no markers defined)`);
}
const order = markers.getItemOrder(arg);
if (order === -1) {
throw RangeError(`Internal Error: Could not find marker \\m{${arg}}`);
}
else if (order > MarkerParser.MAX_MARKER_INDEX) {
throw RangeError(`Internal Error: marker \\m{${arg}} has out of range index ${order}`);
}
else {
return MarkerParser.markerOutput(order + 1, forMatch);
}
});
}
/**
* NFD a string, respecting markers.
* @param s input string
* @param forMatch true if regex, false if individual
* @returns the normalized string
*/
static nfd_markers(s, forMatch) {
const m = [];
return this.nfd_markers_segment(s, m, forMatch);
}
/**
* NFD a safe subset of a string, respecting markers
* @param s input string
* @param map output array of marker chars
* @param forMatch true if used for regexes
* @returns the updated string
*/
static nfd_markers_segment(s, map, forMatch) {
// remove (and parse) the markers first
const str_unmarked = MarkerParser.remove_markers(s, map, forMatch);
// then, NFD the normalized string
const str_unmarked_nfd = str_unmarked.normalize("NFD");
if (map.length == 0) {
// no markers, so we can safely return the normalized unmarked string
return str_unmarked_nfd;
}
else if (str_unmarked_nfd === str_unmarked) {
// normalization didn't shuffle anything, so it's entirely a no-op.
return s;
}
else {
// we had markers AND the normalization made a difference.
// add the markers back per the map, and return
return MarkerParser.add_back_markers(str_unmarked_nfd, map, forMatch);
}
}
/** return the string s but with a marker sequence before it */
static prepend_marker(s, marker, forMatch) {
if (forMatch && marker === constants.marker_any_index) {
return MarkerParser.ANY_MARKER_MATCH + s;
}
else {
return MarkerParser.markerOutput(marker, forMatch) + s;
}
}
/**
* Add back all markers in the map to the string
* @param s input string
* @param map output: the marker map
* @param forMatch if true, use regex format
*/
static add_back_markers(s, map, forMatch) {
// quick check: if no string, or no map: nothing to do
if (!s || !map?.length) {
return s;
}
/** output string */
let out = '';
/** for checking: the total number of markers expected, skipping end markers */
const max_markers = map.filter(({ end }) => !end).length;
/** for checking: the number of markers we've written */
let written_markers = 0;
/** we are going to mutate the map, so copy it */
const map2 = [...map]; // make a copy
// First, add back all 'MARKER_BEFORE_EOT' markers
while (map2.length && map2[map2.length - 1].ch === MARKER_BEFORE_EOT) {
// remove from list
const { marker, end } = map2.pop();
if (!end) {
out = MarkerParser.prepend_marker(out, marker, forMatch);
written_markers++;
}
}
// Then, take each codepoint (from back to front)
for (let p of [...s].reverse()) {
// reverse order code units, prepend to out
out = p + out;
for (let i = map2.length - 1; i >= 0; i--) {
const { ch, marker, processed, end } = map2[i];
if (ch === p && !processed) {
map2[i].processed = true; // mark as processed
if (end) {
break; // exit loop
}
else {
out = MarkerParser.prepend_marker(out, marker, forMatch);
written_markers++;
}
}
else if (map2[map2.length - 1]?.processed) {
// keep the list as short as possible
map2.pop();
}
}
}
// validate that we consumed all markers
if (written_markers !== max_markers) {
throw Error(`Internal Error: should have written ${max_markers} markers but only wrote ${written_markers}`);
}
return out;
}
/**
* Remove (and parse) markers from a string
* @param s input string
* @param map output map containing marker locations
* @param forMatch true if regex
* @returns the original string, without any markers
*/
static remove_markers(s, map, forMatch) {
/** accumulated output */
let out = '';
/** array of marker ids in order waiting to be added */
let last_markers = [];
/** input string, split into codepoint runs */
let a = [...s];
/** were any markers found? */
let had_markers = false;
/**
* subfunc: add all markers in the pending (last_markers) queue
* @param l string the marker is 'glued' to, or '' for end
*/
function add_pending_markers(l) {
// first char, or, marker-before-eot
const glueChars = (l === '') ? [MARKER_BEFORE_EOT] : [...(l.normalize("NFD"))];
const glue = glueChars[0];
// push the 'end' value
map.push({ ch: glue, end: true });
while (last_markers.length) {
const marker = last_markers[0];
last_markers = last_markers.slice(1); // pop from front
map.push({ ch: glue, marker });
}
// now, push the rest of the glue chars as an NFD sequence.
// For example, `\m{m}\u0344` will create the following stream:
// { ch: 0308, end: true}
// { ch: 0308, marker: 1}
// { ch: 0301, end: true} // added because of decomp
for (const ch of glueChars.slice(1)) {
map.push({ ch, end: true });
}
}
// iterate until the codepoint list is empty
while (a.length > 0) {
// does 'a' begin with a marker?
const p = MarkerParser.parse_next_marker(a.join(''), forMatch);
if (!p?.match) {
// no match
add_pending_markers(a[0]); // add any pending markers
out = out + a[0]; // add the non-marker text to the buffer
a = a.slice(1); // move forward 1 codepoint
}
else {
// found a marker
had_markers = true;
const { marker, match } = p;
if ((marker == constants.marker_any_index) ||
(marker >= constants.marker_min_index && marker <= constants.marker_max_index)) {
last_markers.push(marker);
}
else {
throw RangeError(`String contained out-of-range marker ${marker}: '${s}'`);
}
a = a.slice([...match].length); // move forward over matched marker
}
}
// add any remaining markers at the end of the string
add_pending_markers('');
if (!had_markers) {
// no markers were found. clear out the map.
map = [];
}
return out;
}
/**
* analyze the string to see if it begins with a marker
* @param s input string
* @param forMatch true if regex
* @returns parsed marker details
*/
static parse_next_marker(s, forMatch) {
if (!forMatch) {
// plain
const m = s.match(PARSE_SENTINEL_MARKER);
if (m) {
// full string matched
const match = m[0];
// extract the marker number
const marker = match.codePointAt(2);
return ({ match, marker });
}
}
else {
// regex
const m = s.match(PARSE_REGEX_MARKER);
if (m) {
// full string
const match = m[0];
// hex digit (if a single)
const single = m[1];
if (single) {
return ({ match, marker: Number.parseInt(single.substring(3), 16) });
}
else {
// it's a range, so it's an any match
return ({ match, marker: constants.marker_any_index });
}
}
}
return null;
}
}
;
/** special noncharacter value denoting end of string */
export const MARKER_BEFORE_EOT = '\ufffe';
/** matcher for a sentinel */
const PARSE_SENTINEL_MARKER = new RegExp(`^${MarkerParser.ANY_MARKER_MATCH}`);
/** matcher for a regex marker, either single or any */
const PARSE_REGEX_MARKER = /^\\uffff\\u0008(?:(\\u[0-9a-fA-F]{4})|(\[\\u[0-9a-fA-F]{4}-\\u[0-9a-fA-F]{4}\]))/;
;
;
/**
* Class for helping with markers
*/
export class VariableParser {
/**
* A marker id has the same constraint as a key id. TODO-LDML: Needs to be reflected in the spec
*/
static ID = COMMON_ID;
/**
* Pattern for matching a string reference `$(str)`
*/
static STRING_REFERENCE = /\${([0-9A-Za-z_]{1,32})}/g;
/**
* Pattern for matching a set reference `$[set]`
*/
static SET_REFERENCE = /\$\[([0-9A-Za-z_]{1,32})\]/g;
/**
* Pattern for matching a capture set reference `($[set])`
*/
static CAPTURE_SET_REFERENCE = /\(\$\[([0-9A-Za-z_]{1,32})\]\)/;
/**
* `$[1:variable]`
* This regex matches the whole string.
*/
static MAPPED_SET_REFERENCE = /^\$\[1:([0-9A-Za-z_]{1,32})\]$/;
/**
* parse a string into references
* @param str input string
* @returns `[]` or an array of all string references referenced
*/
static allStringReferences(str) {
return matchArray(str, VariableParser.STRING_REFERENCE);
}
/**
* parse a string into references
* @param str input string
* @returns `[]` or an array of all string references referenced
*/
static allSetReferences(str) {
return matchArray(str, VariableParser.SET_REFERENCE);
}
/**
* Split an input string into a proper set
* @param str input string
* @returns
*/
static setSplitter(str) {
const s = str?.trim();
if (!s)
return [];
return s.split(/\s+/);
}
}
/** for ElementParser.segment() */
export var ElementType;
(function (ElementType) {
ElementType["codepoint"] = ".";
ElementType["escaped"] = "\\";
ElementType["uset"] = "[";
ElementType["string"] = "*";
})(ElementType || (ElementType = {}));
;
/** one portion of a segmented element string */
export class ElementSegment {
segment;
type;
/**
* @param segment the string in the segment
* @param type type of segment. Will be calculated if not provided.
*/
constructor(segment, type) {
this.segment = segment;
if (type) {
this.type = type;
}
else if (ElementParser.MATCH_USET.test(segment)) {
this.type = ElementType.uset;
}
else if (ElementParser.MATCH_ESCAPED.test(segment)) {
this.type = ElementType.escaped;
}
else {
this.type = ElementType.codepoint;
}
}
/** unescaped format */
get unescaped() {
if (this.type !== ElementType.escaped) {
return this.segment;
}
else {
if (MATCH_QUAD_ESCAPE.test(this.segment)) {
return unescapeOneQuadString(this.segment);
}
else {
return unescapeString(this.segment);
}
}
}
}
;
/** Class for helping with Element strings (i.e. reorder) */
export class ElementParser {
/**
* Matches any complex UnicodeSet that would otherwise be misinterpreted
* by `MATCH_ELEMENT_SEGMENTS` due to nested `[]`'s.
* For example, `[[a-z]-[aeiou]]` could be
* mis-segmented into `[[a-z]`, `-`, `[aeiou]`, `]` */
static MATCH_NESTED_SQUARE_BRACKETS = /\[[^\]]*\[/;
/** Match (segment) UnicodeSets OR hex escapes OR single Unicode codepoints */
static MATCH_ELEMENT_SEGMENTS = /(?:\[[^\]]*\]|\\u[0-9a-fA-F]{4}|\\u\{[0-9a-fA-F]{1,6}\}|\\u\{(?:[0-9a-fA-F]{1,6})(?: [0-9a-fA-F]{1,6}){1,}\}|.)/gu;
/** Does it start with a UnicodeSet? Used to test the segments. */
static MATCH_USET = /^\[/;
/** Does it start with an escaped char? Used to test the segments. */
static MATCH_ESCAPED = /^\\u/;
/** Split a string into ElementSegments */
static segment(str) {
if (ElementParser.MATCH_NESTED_SQUARE_BRACKETS.test(str)) {
throw Error(`Unsupported: nested square brackets in element segment: ${str}`);
}
const list = [];
for (let m of str.match(ElementParser.MATCH_ELEMENT_SEGMENTS)) {
const e = new ElementSegment(m);
if (e.type === ElementType.escaped) {
// unescape
const { unescaped } = e;
if (isOneChar(unescaped)) {
list.push(e);
}
else {
// need to split the escaped segment, \u{41 42} -> \u{41}, \u{42}
for (let s of unescaped) {
list.push(new ElementSegment(`\\u{${s.codePointAt(0).toString(16)}}`));
}
}
}
else {
// all others
list.push(e);
}
}
return list;
}
}
;
//# sourceMappingURL=pattern-parser.js.map
//# debugId=28718c34-6dab-55d0-af86-7052cfc2ee28