apg-unicode
Version:
JavaScript APG parser of Unicode code point arrays
323 lines (306 loc) • 15.3 kB
JavaScript
// The utilities object is a collection of utility functions
// primarily for converting various array types to/from strings
// and assisting other components, especially Trace, in their tasks.
export { utilities };
// Private helper shared by the Trace display functions below.
// Maps a numeric value to the single character used to show it in a display line.
// * @param {number} value - The candidate code point.
// * @returns {string} - The character for `value` when it is an integer Unicode scalar value
// outside the C0 control range (0x00-0x1F) and not DEL (0x7F); a period, ".", otherwise.
// Surrogates (0xD800-0xDFFF), values above 0x10FFFF, negative, fractional and non-numeric values
// all yield the place holder, so `String.fromCodePoint()` is never handed a value it would reject.
const toDisplayChar = (value) => {
  const displayable =
    Number.isInteger(value) &&
    ((value >= 0x20 && value < 0x7f) || (value >= 0x80 && value < 0xd800) || (value >= 0xe000 && value <= 0x10ffff));
  return displayable ? String.fromCodePoint(value) : '.';
};
const utilities = {
  // Replaces ASCII control characters with their hexadecimal representation.
  // Primarily used by the Trace facility for display line generation.
  // Allows visual recognition of non-printing control characters.
  // * @param {string} str - a string that may contain control characters
  // * @returns {string} - the input string with each control character (0x00-0x1F, 0x7F)
  // replaced with the four characters \xHH (HH = two upper-case hexadecimal digits)
  exposeCtrlChars: (str) =>
    /* replace() returns the string unchanged when nothing matches, so no separate test is needed */
    str.replace(/[\x00-\x1F\x7F]/g, (ch) => `\\x${ch.charCodeAt(0).toString(16).padStart(2, '0').toUpperCase()}`),
  // Normalize the beginning and ending characters of a substring
  // with the same rules used by JavaScript's
  // [`String.prototype.slice()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/slice).
  // * @param {number} length - The string or array length.
  // * @param {number | undefined} start - The index, inclusive, of the first character in the substring.
  // * @param {number | undefined} end - The index, exclusive, of the last character in the substring.
  // * @returns {object} {indexStart, indexEnd} - The normalized start and end values described below.
  //
  // - If start >= length, start is normalized to length.
  // - If start < 0, the index is counted from the end of the string. More formally, in this case, the substring starts at max(start + length, 0).
  // - If start is omitted, undefined, not of type number, or NaN, it's treated as 0.
  // - If end is omitted, undefined, not of type number, NaN, or if end >= length, end is normalized to length.
  // - If end < 0, the index is counted from the end of the string. More formally, in this case, the substring ends at max(end + length, 0).
  // - If end < start after normalizing negative values (i.e., end represents a character that's before start),
  // end is normalized to start.
  // - Fractional values are truncated toward zero, as `slice()` does.
  sliceInterval: (length, start, end) => {
    /* `|| 0` folds the -0 that Math.trunc() can produce into +0 */
    const toIndex = (value, fallback) =>
      typeof value !== 'number' || Number.isNaN(value) ? fallback : Math.trunc(value) || 0;
    /* negative indexes count back from the end; everything is clamped to [0, length] */
    const clamp = (index) => (index < 0 ? Math.max(index + length, 0) : Math.min(index, length));
    const indexStart = clamp(toIndex(start, 0));
    const indexEnd = Math.max(clamp(toIndex(end, length)), indexStart);
    return { indexStart, indexEnd };
  },
  // Find the beginning index (inclusive) and the ending index (exclusive) of a phrase
  // from its beginning character index and phrase length.
  // Unlike the slice function, the phrase beginning and phrase length are natural for callback functions.
  // * @param {number} length - Length of the array the phrase is taken from.
  // * @param {number | undefined} beg - Index(default=0) of the first element, inclusive, of the sub-array.
  // * @param {number | undefined} len - Number of characters(default=to the end of the array) in the sub-array.
  // * @returns {Object} {charStart, charEnd} The normalized first and last character of the sub-array
  // * charStart and charEnd will satisfy:
  // * 0 <= charStart <= length
  // * 0 <= charEnd <= length
  // * charStart <= charEnd
  //
  // A `beg` that is not greater than zero (including NaN) is treated as 0.
  // A `len` that is undefined or NaN selects everything from charStart to the end;
  // any other `len` that is not greater than zero selects an empty phrase.
  phraseInterval: (length, beg, len) => {
    /* `beg > 0` is false for undefined, null, NaN, zero and negatives alike */
    const charStart = beg > 0 ? Math.min(beg, length) : 0;
    let charEnd;
    if (len === undefined || Number.isNaN(len)) {
      charEnd = length;
    } else if (len > 0) {
      charEnd = Math.min(charStart + len, length);
    } else {
      charEnd = charStart;
    }
    return { charStart, charEnd };
  },
  // Convert a string to a `Uint32Array` of code points.
  // Valid surrogate pairs are combined into one code point;
  // an unpaired surrogate is copied through as its 16-bit code unit value.
  // * @param {string} str
  // * @returns Uint32Array
  stringToCodePoints: (str) => {
    const unitCount = str.length;
    const buffer = new Uint32Array(unitCount); // worst-case size: one code point per code unit
    let count = 0;
    for (let i = 0; i < unitCount; i++) {
      const unit = str.charCodeAt(i);
      const isHighSurrogate = unit >= 0xd800 && unit <= 0xdbff;
      if (isHighSurrogate && i + 1 < unitCount) {
        const next = str.charCodeAt(i + 1);
        if (next >= 0xdc00 && next <= 0xdfff) {
          /* Valid surrogate pair - combine and skip the low surrogate */
          buffer[count++] = ((unit - 0xd800) << 10) + (next - 0xdc00) + 0x10000;
          i++;
          continue;
        }
      }
      /* BMP character or unpaired surrogate */
      buffer[count++] = unit;
    }
    /* copy only when surrogate pairs made the result shorter than the buffer */
    return count < unitCount ? buffer.slice(0, count) : buffer;
  },
  // Convert a string to an Array.
  // * @param {string} string - The string to convert.
  // * @returns Array of code points.
  stringToArray: (string) => Array.from(string, (ch) => ch.codePointAt(0)),
  // Convert a string to UTF-8 encoded bytes.
  // * @param {string} string - The string to convert.
  // * @returns - Uint8Array of UTF-8 encoded bytes.
  stringToUtf8: (string) => new TextEncoder().encode(string),
  // Convert a string to its UTF-16 code units.
  // * @param {string} string - The string to convert.
  // * @returns - Uint16Array of UTF-16 encoded 16-bit words.
  stringToUtf16: (string) => Uint16Array.from({ length: string.length }, (_, i) => string.charCodeAt(i)),
  // Convert a string to 32-bit code points.
  // * @param {string} string - The string to convert.
  // * @returns - Uint32Array of 32-bit Unicode code points
  stringToUtf32: (string) => Uint32Array.from(string, (ch) => ch.codePointAt(0)),
  // Decode UTF-8 encoded bytes to a JavaScript string.
  // * @param {Uint8Array} utf8 - The UTF-8 encoded bytes.
  // * @param {number | undefined} beg - The index of the first byte to decode.
  // * @param {number | undefined} len - The number of bytes to decode.
  // * @param {boolean | undefined} fatal - If `true` the function will throw an exception on invalid UTF-8 byte sequence.
  // * @returns - The decoded string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.<br>
  // NOTE(review): `TextDecoder` is used with its default `ignoreBOM` setting, which per the
  // Encoding Standard drops a leading byte order mark from the output - confirm that is intended.
  utf8ToString: (utf8, beg, len, fatal) => {
    const { charStart, charEnd } = utilities.phraseInterval(utf8.length, beg, len);
    const decoder = new TextDecoder('utf-8', { fatal: Boolean(fatal) });
    return decoder.decode(utf8.subarray(charStart, charEnd));
  },
  // Decode UTF-16 encoded 16-bit words to a JavaScript string.
  // * @param {Uint16Array} utf16 - The UTF-16 encoded words.
  // * @param {number | undefined} beg - The index of the first word to decode.
  // * @param {number | undefined} len - The number of words to decode.
  // * @param {boolean | undefined} fatal - If `true` the function will throw an exception on an invalid UTF-16 sequence.
  // * @returns - The decoded string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.<br>
  // NOTE(review): the 'utf-16' label selects the little-endian decoder and the words are handed
  // over as raw bytes, so this presumably relies on a little-endian host; a leading byte order
  // mark is dropped by the decoder's default settings - confirm both are acceptable.
  utf16ToString: (utf16, beg, len, fatal) => {
    const { charStart, charEnd } = utilities.phraseInterval(utf16.length, beg, len);
    const decoder = new TextDecoder('utf-16', { fatal: Boolean(fatal) });
    return decoder.decode(utf16.subarray(charStart, charEnd));
  },
  // Decode UTF-32 encoded 32-bit words to a JavaScript string.
  // * @param {Uint32Array} utf32 - The UTF-32 encoded double words.
  // * @param {number | undefined} beg - The index of the first double word to decode.
  // * @param {number | undefined} len - The number of double words to decode.
  // * @returns - The decoded string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.<br>
  // `String.fromCodePoint()` throws a `RangeError` for any value greater than 0x10FFFF.
  utf32ToString: (utf32, beg, len) => {
    const { charStart, charEnd } = utilities.phraseInterval(utf32.length, beg, len);
    const pieces = [];
    for (let i = charStart; i < charEnd; i++) {
      pieces.push(String.fromCodePoint(utf32[i]));
    }
    return pieces.join('');
  },
  // Convert an array (any type) to ASCII if in the printing ASCII range (0x20-0x7E), period or `.` otherwise.
  // * @param {array†} chars - The characters to display.
  // * @param {number | undefined} beg - The index of the first element to convert.
  // * @param {number | undefined} len - The number of elements to convert.
  // * @returns - The converted string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.<br>
  // † Uint8Array, Uint16Array, Uint32Array, Array
  arrayToAscii: (chars, beg, len) => {
    const { charStart, charEnd } = utilities.phraseInterval(chars.length, beg, len);
    const pieces = [];
    for (let i = charStart; i < charEnd; i++) {
      const c = chars[i];
      /* DEL (0x7F) is a control character, not a printing one */
      pieces.push(c >= 0x20 && c < 0x7f ? String.fromCharCode(c) : '.');
    }
    return pieces.join('');
  },
  // Convert an Array of numbers to a) string if a valid Unicode code point,
  // b) a period, ".", place holder if not. Used by Trace for line display.
  // C0 control characters (0x00-0x1F), DEL (0x7F), surrogates and non-integer values get the place holder.
  // * @param {Array} chars - The characters to display.
  // * @param {number | undefined} beg - The index of the first element to convert.
  // * @param {number | undefined} len - The number of elements to convert.
  // * @returns - The converted display string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.
  arrayToUnicode: (chars, beg, len) => {
    const { charStart, charEnd } = utilities.phraseInterval(chars.length, beg, len);
    const pieces = [];
    for (let i = charStart; i < charEnd; i++) {
      pieces.push(toDisplayChar(chars[i]));
    }
    return pieces.join('');
  },
  // Converts an Array of numbers to a hexdump-like display line used by Trace.
  // Each element is shown as 8 upper-case hexadecimal digits; any element that is not an
  // integer in the range 0-0xFFFFFFFF is shown as FFFFFFFF.
  // * @param {Array} arr - The Array of numbers to convert.
  // * @param {number | undefined} startIndex - The index, inclusive, of the first character to convert.
  // * @param {number | undefined} phraseLength - The number of characters to convert.<br>
  // **Note**: startIndex and phraseLength are normalized with `phraseInterval()`.
  // * @returns {string} - `<hex words> |<characters>|`
  // * @throws {Error} if `arr` is not an Array.
  hexLineArray: (arr, startIndex, phraseLength) => {
    if (!Array.isArray(arr)) {
      throw new Error('hexLineArray: input is not Array');
    }
    const { charStart, charEnd } = utilities.phraseInterval(arr.length, startIndex, phraseLength);
    const words = [];
    const display = [];
    /* indexed loop on purpose: array holes must be visited (and shown as FFFFFFFF / '.') */
    for (let i = charStart; i < charEnd; i++) {
      const value = arr[i];
      const inRange = Number.isInteger(value) && value >= 0 && value <= 0xffffffff;
      words.push((inRange ? value : 0xffffffff).toString(16).padStart(8, '0').toUpperCase());
      display.push(toDisplayChar(value));
    }
    return `${words.join(' ')} |${display.join('')}|`;
  },
  // Converts a Uint32Array of 32-bit double words to a hexdump-like display line used by Trace.
  // * @param {Uint32Array} uint32 - The array of double words to convert.
  // * @param {number | undefined} startIndex - The index, inclusive, of the first character to convert.
  // * @param {number | undefined} phraseLength - The number of characters to convert.<br>
  // **Note**: startIndex and phraseLength are normalized with `phraseInterval()`.
  // * @returns {string} - `<hex double words> |<characters>|`
  // * @throws {Error} if `uint32` is not a Uint32Array.
  hexLineDWords: (uint32, startIndex, phraseLength) => {
    if (!(uint32 instanceof Uint32Array)) {
      throw new Error('hexLineDWords: input is not Uint32Array');
    }
    const { charStart, charEnd } = utilities.phraseInterval(uint32.length, startIndex, phraseLength);
    const phrase = uint32.subarray(charStart, charEnd);
    /* Format each 4-byte word */
    const words = Array.from(phrase, (w) => w.toString(16).padStart(8, '0').toUpperCase()).join(' ');
    /* Character rendering */
    const display = Array.from(phrase, toDisplayChar).join('');
    return `${words} |${display}|`;
  },
  // Converts a Uint16Array of 16-bit words to a hexdump-like display line used by Trace.
  // * @param {Uint16Array} uint16 - The array of words to convert.
  // * @param {number | undefined} startIndex - The index, inclusive, of the first character to convert.
  // * @param {number | undefined} phraseLength - The number of characters to convert.<br>
  // **Note**: startIndex and phraseLength are normalized with `phraseInterval()`.
  // * @returns {string} - `<hex words> |<characters>|`
  // * @throws {Error} if `uint16` is not a Uint16Array.
  hexLineWords: (uint16, startIndex, phraseLength) => {
    if (!(uint16 instanceof Uint16Array)) {
      throw new Error('hexLineWords: input is not Uint16Array');
    }
    const { charStart, charEnd } = utilities.phraseInterval(uint16.length, startIndex, phraseLength);
    const phrase = uint16.subarray(charStart, charEnd);
    /* Format each 2-byte word */
    const words = Array.from(phrase, (w) => w.toString(16).padStart(4, '0').toUpperCase()).join(' ');
    /* Character rendering - each word is shown on its own, so surrogate words become '.' */
    const display = Array.from(phrase, toDisplayChar).join('');
    return `${words} |${display}|`;
  },
  // Converts a Uint8Array of 8-bit bytes to a hexdump-like display line used by Trace.
  // Unlike the other hexLine functions the hexadecimal digits are lower case.
  // * @param {Uint8Array} input - The array of bytes to convert.
  // * @param {number | undefined} startIndex - The index, inclusive, of the first character to convert.
  // * @param {number | undefined} phraseLength - The number of characters to convert.<br>
  // **Note**: startIndex and phraseLength are normalized with `phraseInterval()`.
  // * @returns {string} - `<hex bytes> |<ASCII>|`
  // * @throws {Error} if `input` is not a Uint8Array.
  hexLineBytes: (input, startIndex, phraseLength) => {
    if (!(input instanceof Uint8Array)) {
      throw new Error('hexLineBytes: input not instance of Uint8Array');
    }
    const { charStart, charEnd } = utilities.phraseInterval(input.length, startIndex, phraseLength);
    const phrase = input.subarray(charStart, charEnd); // a view, no copy
    const hex = Array.from(phrase, (b) => b.toString(16).padStart(2, '0')).join(' ');
    const ascii = Array.from(phrase, (b) => (b >= 0x20 && b <= 0x7e ? String.fromCharCode(b) : '.')).join('');
    return `${hex} |${ascii}|`;
  },
};