UNPKG

apg-unicode

Version:

JavaScript APG parser of Unicode code point arrays

github.com/ldthomas/apg-unicode

ldthomas/apg-unicode

323 lines (306 loc) • 15.3 kB

JavaScript

// The utilities object is a collection of utility functions
// primarily for converting various array types to/from strings
// and assisting other components, especially Trace, in their tasks.
export { utilities };

// Private helper shared by the display functions.
// Returns the character for a displayable Unicode code point, "." otherwise.
// Not displayable: non-integers, anything outside 0x20..0x10FFFF,
// DEL (0x7F) and the surrogate range 0xD800..0xDFFF.
// The integer test matters for plain Arrays: String.fromCodePoint() throws
// a RangeError for non-integer input.
const displayChar = (cp) => {
  const isDisplayable =
    Number.isInteger(cp) && cp >= 0x20 && cp <= 0x10ffff && cp !== 0x7f && !(cp >= 0xd800 && cp <= 0xdfff);
  return isDisplayable ? String.fromCodePoint(cp) : '.';
};

const utilities = {
  // Replaces ASCII control characters with their hexadecimal representation.
  // Primarily used by the Trace facility for display line generation.
  // Allows visual recognition of non-printing control characters.
  // * @param {string} str - a string that may contain control characters
  // * @returns {string} - the input string with control characters (0x00-0x1F, 0x7F) replaced with \\xHH
  exposeCtrlChars: (str) => {
    // Fast path: hand back the input untouched when there is nothing to replace.
    if (/[\x00-\x1F\x7F]/.test(str)) {
      return str.replace(/[\x00-\x1F\x7F]/g, (ch) => {
        const hex = ch.charCodeAt(0).toString(16).padStart(2, '0').toUpperCase();
        return `\\x${hex}`;
      });
    }
    return str;
  },

  // Normalize the beginning and ending characters of a substring
  // with the same rules used by JavaScript's
  // [`String.prototype.slice()`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/slice).
  // * @param {number} length - The string or array length.
  // * @param {number | undefined} start - The index, inclusive, of the first character in the substring.
  // * @param {number | undefined} end - The index, exclusive, of the last character in the substring.
  // * @returns {object} {indexStart, indexEnd} - The normalized start and end values described below.
  //
  // - Fractional values are truncated toward zero, as slice() does.
  // - If start >= length, start is normalized to length.
  // - If start < 0, the index is counted from the end of the string.
  //   More formally, in this case, the substring starts at max(start + length, 0).
  // - If start is omitted, undefined, not a number, or NaN, it's treated as 0.
  // - If end is omitted, undefined, not a number, NaN, or if end >= length, end is normalized to length.
  // - If end < 0, the index is counted from the end of the string.
  //   More formally, in this case, the substring ends at max(end + length, 0).
  // - If end < start after normalizing negative values (i.e., end represents a character that's before start),
  //   end is normalized to start.
  sliceInterval: (length, start, end) => {
    const isUsable = (value) => typeof value === 'number' && !Number.isNaN(value);
    // `|| 0` turns the -0 produced by truncating e.g. -0.5 into a plain 0.
    let indexStart = isUsable(start) ? Math.trunc(start) || 0 : 0;
    let indexEnd = isUsable(end) ? Math.trunc(end) || 0 : length;
    if (indexStart < 0) {
      indexStart = Math.max(indexStart + length, 0);
    } else if (indexStart > length) {
      indexStart = length;
    }
    if (indexEnd < 0) {
      indexEnd = Math.max(indexEnd + length, 0);
    } else if (indexEnd > length) {
      indexEnd = length;
    }
    if (indexEnd < indexStart) {
      indexEnd = indexStart;
    }
    return { indexStart, indexEnd };
  },

  // Find the beginning index (inclusive) and the ending index (exclusive) of a phrase
  // from its beginning character index and phrase length.
  // Unlike the slice function, the phrase beginning and phrase length are natural for callback functions.
  // * @param {number} length - Length of the array the phrase is taken from.
  // * @param {number | undefined} beg - Index(default=0) of the first element, inclusive, of the sub-array.
  // * @param {number | undefined} len - Number of characters(default=length) in the sub-array.
  // * @returns {Object} {charStart, charEnd} The normalized first and last character of the sub-array
  // * charStart and charEnd will satisfy:
  // * 0 <= charStart <= length
  // * 0 <= charEnd <= length
  // * charStart <= charEnd
  //
  // Fractional values are truncated toward zero and a value that converts to NaN
  // falls back to its default, so the guarantees above hold for any numeric input
  // (assuming length itself is a non-negative integer).
  phraseInterval: (length, beg, len) => {
    const first = beg === undefined ? 0 : Math.trunc(Number(beg));
    // `first > 0` is false for NaN, negatives and -0, all of which mean "start at 0".
    const charStart = first > 0 ? Math.min(first, length) : 0;
    let charEnd = length;
    if (len !== undefined) {
      const count = Math.trunc(Number(len));
      if (!Number.isNaN(count)) {
        charEnd = count <= 0 ? charStart : Math.min(charStart + count, length);
      }
    }
    return { charStart, charEnd };
  },

  // Convert a string to a `Uint32Array` of code points.
  // Valid surrogate pairs are combined into one code point;
  // unpaired surrogates are copied through as their 16-bit code unit value.
  // * @param {string} str
  // * @returns Uint32Array
  stringToCodePoints: (str) => {
    const buffer = new Uint32Array(str.length); // worst-case size
    let count = 0;
    for (let i = 0; i < str.length; i++) {
      const codeUnit = str.charCodeAt(i);
      /* Check for high surrogate */
      if (codeUnit >= 0xd800 && codeUnit <= 0xdbff && i + 1 < str.length) {
        const nextCodeUnit = str.charCodeAt(i + 1);
        if (nextCodeUnit >= 0xdc00 && nextCodeUnit <= 0xdfff) {
          /* Valid surrogate pair */
          buffer[count++] = ((codeUnit - 0xd800) << 10) + (nextCodeUnit - 0xdc00) + 0x10000;
          i++; /* Skip the low surrogate */
          continue;
        }
      }
      /* Not a surrogate pair — BMP character */
      buffer[count++] = codeUnit;
    }
    // Trim the unused tail when surrogate pairs made the result shorter than the string.
    return count < str.length ? buffer.slice(0, count) : buffer;
  },

  // Convert a string to an Array.
  // * @param {string} string - The string to convert.
  // * @returns Array of code points.
  stringToArray: (string) => Array.from(string, (ch) => ch.codePointAt(0)),

  // Convert a string to UTF-8 encoded code points.
  // * @param {string} string - The string to convert.
  // * @returns - Uint8Array of UTF-8 encoded bytes.
  stringToUtf8: (string) => new TextEncoder().encode(string),

  // Convert a string to UTF-16 encoded code points.
  // * @param {string} string - The string to convert.
  // * @returns - Uint16Array of UTF-16 encoded 16-bit words.
  stringToUtf16: (string) => {
    const uint16 = new Uint16Array(string.length);
    for (let i = 0; i < string.length; i++) {
      uint16[i] = string.charCodeAt(i);
    }
    return uint16;
  },

  // Convert a string to 32-bit code points.
  // * @param {string} string - The string to convert.
  // * @returns - Uint32Array of 32-bit Unicode code points
  stringToUtf32: (string) => Uint32Array.from(string, (ch) => ch.codePointAt(0)),

  // Decode UTF-8 encoded bytes to a JavaScript string.
  // * @param {Uint8Array} utf8 - The UTF-8 encoded bytes.
  // * @param {number | undefined} beg - The index of the first byte to decode.
  // * @param {number | undefined} len - The number of bytes to decode.
  // * @param {boolean | undefined} fatal - If truthy the function will throw an exception on an invalid UTF-8 byte sequence.
  // * @returns - The decoded string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.<br>
  // **Note**: decoding uses `TextDecoder` defaults, so a leading byte-order mark is not part of the result.
  utf8ToString: (utf8, beg, len, fatal) => {
    const { charStart, charEnd } = utilities.phraseInterval(utf8.length, beg, len);
    const decoder = new TextDecoder('utf-8', { fatal: Boolean(fatal) });
    return decoder.decode(utf8.subarray(charStart, charEnd));
  },

  // Decode UTF-16 encoded 16-bit words to a JavaScript string.
  // * @param {Uint16Array} utf16 - The UTF-16 encoded words.
  // * @param {number | undefined} beg - The index of the first word to decode.
  // * @param {number | undefined} len - The number of words to decode.
  // * @param {boolean | undefined} fatal - If truthy the function will throw an exception on an invalid UTF-16 sequence.
  // * @returns - The decoded string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.<br>
  // **Note**: decoding uses `TextDecoder` defaults, so a leading byte-order mark is not part of the result.
  // NOTE(review): the 'utf-16' label selects UTF-16LE while a Uint16Array is stored in
  // platform byte order — presumably only little-endian hosts are targeted; confirm.
  utf16ToString: (utf16, beg, len, fatal) => {
    const { charStart, charEnd } = utilities.phraseInterval(utf16.length, beg, len);
    const decoder = new TextDecoder('utf-16', { fatal: Boolean(fatal) });
    return decoder.decode(utf16.subarray(charStart, charEnd));
  },

  // Decode UTF-32 encoded 32-bit words to a JavaScript string.
  // * @param {Uint32Array} utf32 - The UTF-32 encoded double words.
  // * @param {number | undefined} beg - The index of the first double word to decode.
  // * @param {number | undefined} len - The number of double words to decode.
  // * @returns - The decoded string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.<br>
  // **Note**: `String.fromCodePoint()` throws a RangeError for values above 0x10FFFF.
  utf32ToString: (utf32, beg, len) => {
    const { charStart, charEnd } = utilities.phraseInterval(utf32.length, beg, len);
    const pieces = [];
    for (let i = charStart; i < charEnd; i++) {
      pieces.push(String.fromCodePoint(utf32[i]));
    }
    return pieces.join('');
  },

  // Convert an array (any type) to ASCII if in the printing ASCII range (0x20-0x7E),
  // period or `.` otherwise.
  // * @param {array&dagger;} chars - The characters to display.
  // * @param {number | undefined} beg - The index of the first element to convert.
  // * @param {number | undefined} len - The number of elements to convert.
  // * @returns - The converted string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.<br>
  // &dagger; Uint8Array, Uint16Array, Uint32Array, Array
  arrayToAscii: (chars, beg, len) => {
    const { charStart, charEnd } = utilities.phraseInterval(chars.length, beg, len);
    const pieces = [];
    for (let i = charStart; i < charEnd; i++) {
      const c = chars[i];
      // DEL (0x7F) is a control character, not printing ASCII.
      pieces.push(c >= 0x20 && c <= 0x7e ? String.fromCharCode(c) : '.');
    }
    return pieces.join('');
  },

  // Convert an Array of numbers to a) string if a valid Unicode code point,
  // b) a period, ".", place holder if not. Used by Trace for line display.
  // Control characters below 0x20, DEL (0x7F), surrogates and non-integers get the place holder.
  // * @param {Array} chars - The characters to display.
  // * @param {number | undefined} beg - The index of the first element to convert.
  // * @param {number | undefined} len - The number of elements to convert.
  // * @returns - The converted display string;<br>
  // **Note**: beg and len are normalized with `phraseInterval()`.
  arrayToUnicode: (chars, beg, len) => {
    const { charStart, charEnd } = utilities.phraseInterval(chars.length, beg, len);
    const pieces = [];
    for (let i = charStart; i < charEnd; i++) {
      pieces.push(displayChar(chars[i]));
    }
    return pieces.join('');
  },

  // Converts an Array of numbers to a hexdump-like display line used by Trace.
  // Each element is shown as 8 upper-case hex digits; an element that is not an
  // integer in 0..0xFFFFFFFF is shown as FFFFFFFF.
  // * @param {Array} arr - The Array of numbers to convert.
  // * @param {number | undefined} startIndex - The index, inclusive, of the first character to convert.
  // * @param {number | undefined} phraseLength - The number of characters to convert.<br>
  // * @returns {string} - `"<hex words> |<characters>|"`
  // * @throws {Error} if arr is not an Array<br>
  // **Note**: startIndex and phraseLength are normalized with `phraseInterval()`.
  hexLineArray: (arr, startIndex, phraseLength) => {
    if (!Array.isArray(arr)) {
      throw new Error('hexLineArray: input is not Array');
    }
    const { charStart, charEnd } = utilities.phraseInterval(arr.length, startIndex, phraseLength);
    const words = [];
    const chars = [];
    for (let i = charStart; i < charEnd; i++) {
      const value = arr[i];
      const word = Number.isInteger(value) && value >= 0 && value <= 0xffffffff ? value : 0xffffffff;
      words.push(word.toString(16).padStart(8, '0').toUpperCase());
      chars.push(displayChar(value));
    }
    return `${words.join(' ')} |${chars.join('')}|`;
  },

  // Converts a Uint32Array of 32-bit double words to a hexdump-like display line used by Trace.
  // * @param {Uint32Array} uint32 - The array of double words to convert.
  // * @param {number | undefined} startIndex - The index, inclusive, of the first character to convert.
  // * @param {number | undefined} phraseLength - The number of characters to convert.<br>
  // * @returns {string} - `"<hex double words> |<characters>|"`, hex in upper case
  // * @throws {Error} if uint32 is not a Uint32Array<br>
  // **Note**: startIndex and phraseLength are normalized with `phraseInterval()`.
  hexLineDWords: (uint32, startIndex, phraseLength) => {
    if (!(uint32 instanceof Uint32Array)) {
      throw new Error('hexLineDWords: input is not Uint32Array');
    }
    const { charStart, charEnd } = utilities.phraseInterval(uint32.length, startIndex, phraseLength);
    const slice = uint32.subarray(charStart, charEnd);
    /* Format each 4-byte word */
    const words = Array.from(slice, (w) => w.toString(16).padStart(8, '0').toUpperCase()).join(' ');
    /* Character rendering */
    const ascii = Array.from(slice, (w) => displayChar(w)).join('');
    return `${words} |${ascii}|`;
  },

  // Converts a Uint16Array of 16-bit words to a hexdump-like display line used by Trace.
  // * @param {Uint16Array} uint16 - The array of words to convert.
  // * @param {number | undefined} startIndex - The index, inclusive, of the first character to convert.
  // * @param {number | undefined} phraseLength - The number of characters to convert.<br>
  // * @returns {string} - `"<hex words> |<characters>|"`, hex in upper case
  // * @throws {Error} if uint16 is not a Uint16Array<br>
  // **Note**: startIndex and phraseLength are normalized with `phraseInterval()`.
  hexLineWords: (uint16, startIndex, phraseLength) => {
    if (!(uint16 instanceof Uint16Array)) {
      throw new Error('hexLineWords: input is not Uint16Array');
    }
    const { charStart, charEnd } = utilities.phraseInterval(uint16.length, startIndex, phraseLength);
    const slice = uint16.subarray(charStart, charEnd);
    /* Format each 2-byte word */
    const words = Array.from(slice, (w) => w.toString(16).padStart(4, '0').toUpperCase()).join(' ');
    /* Character rendering */
    const ascii = Array.from(slice, (w) => displayChar(w)).join('');
    return `${words} |${ascii}|`;
  },

  // Converts a Uint8Array of 8-bit bytes to a hexdump-like display line used by Trace.
  // Unlike the word and double-word variants, the hex digits are emitted in lower case.
  // * @param {Uint8Array} input - The array of bytes to convert.
  // * @param {number | undefined} startIndex - The index, inclusive, of the first character to convert.
  // * @param {number | undefined} phraseLength - The number of characters to convert.<br>
  // * @returns {string} - `"<hex bytes> |<printing ASCII or .>|"`
  // * @throws {Error} if input is not a Uint8Array<br>
  // **Note**: startIndex and phraseLength are normalized with `phraseInterval()`.
  hexLineBytes: (input, startIndex, phraseLength) => {
    if (!(input instanceof Uint8Array)) {
      throw new Error('hexLineBytes: input not instance of Uint8Array');
    }
    const { charStart, charEnd } = utilities.phraseInterval(input.length, startIndex, phraseLength);
    const slice = input.subarray(charStart, charEnd); // a view, no copy
    const hex = Array.from(slice, (b) => b.toString(16).padStart(2, '0')).join(' ');
    const ascii = Array.from(slice, (b) => (b >= 0x20 && b <= 0x7e ? String.fromCharCode(b) : '.')).join('');
    return `${hex} |${ascii}|`;
  },
};