UNPKG

mutf-8

Version:

An encoder and decoder for Modified UTF-8, the encoding used by the Java platform in places such as the class file format and object serialization.

253 lines (252 loc) 8.25 kB
// SPDX-License-Identifier: MIT
/**
 * The decoder for Modified UTF-8.
 *
 * @example
 * ```ts
 * const src = new Uint8Array([
 *   0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xe4, 0xb8,
 *   0x96, 0xe7, 0x95, 0x8c, 0x21,
 * ]);
 * const decoder = new MUtf8Decoder();
 * const text = decoder.decode(src);
 * // Hello 世界!
 * ```
 *
 * @see {@link https://encoding.spec.whatwg.org/#interface-textdecoder}
 */
export class MUtf8Decoder {
  #fatal;
  #ignoreBOM;
  /** Bytes of an incomplete sequence held back by a `stream: true` call. */
  #leavings;
  /** `true` once a running stream has consumed at least one byte. */
  #midStream = false;

  /**
   * @returns Always `"mutf-8"`.
   */
  get encoding() {
    return "mutf-8";
  }

  /**
   * @returns `true` if error mode is fatal, otherwise `false`.
   */
  get fatal() {
    return this.#fatal;
  }

  /**
   * @returns `true` if a leading BOM is kept in the output as U+FEFF,
   *   `false` if it is stripped.
   */
  get ignoreBOM() {
    return this.#ignoreBOM;
  }

  /**
   * @param label - The label of the decoder. Must be `"mutf-8"` or `"mutf8"` (case-insensitive).
   * @param options - The options for the decoder (`fatal`, `ignoreBOM`; both default to `false`).
   * @throws `RangeError` If the `label` is an invalid value.
   */
  constructor(label = "mutf-8", options = {}) {
    const normalizedLabel = label.toLowerCase();
    if (normalizedLabel !== "mutf-8" && normalizedLabel !== "mutf8") {
      throw new RangeError(`MUtf8Decoder.constructor: '${label}' is not supported.`);
    }
    this.#fatal = options.fatal ?? false;
    this.#ignoreBOM = options.ignoreBOM ?? false;
  }

  /**
   * Decodes the specified bytes into a string.
   *
   * With `options.stream` set to `true`, an incomplete trailing sequence is
   * held back and prepended to the input of the next call.
   *
   * @param input - The bytes to be decoded.
   * @param options - The options for decoding (`stream`, default `false`).
   * @returns The resultant string after decoding.
   * @throws `TypeError` If {@link fatal} is `true` and the `input` contains invalid bytes.
   */
  decode(input, options = {}) {
    const stream = options.stream ?? false;
    const buf = this.#toBinary(input);
    const length = buf.length;
    const result = [];
    // A BOM is only meaningful as the very first three bytes of a stream; the
    // same bytes at the start of a later chunk are an ordinary U+FEFF.
    const stripBOM = !this.#ignoreBOM && !this.#midStream;
    let p = 0;
    try {
      while (p < length) {
        const b1 = buf[p++];
        if (!(b1 & 0x80) && b1 !== 0) {
          // U+0001-007F
          result.push(String.fromCharCode(b1));
        } else if ((b1 & 0xe0) === 0xc0) {
          // U+0000, U+0080-07FF
          if (length <= p) {
            if (stream) {
              this.#leavings = buf.slice(p - 1);
              break;
            }
            result.push(this.#handleError());
            continue;
          }
          const b2 = buf[p++];
          if ((b2 & 0xc0) !== 0x80) {
            result.push(this.#handleError());
            // Re-examine the offending byte as the start of a new sequence.
            p--;
            continue;
          }
          result.push(String.fromCharCode(((b1 & 0x1f) << 6) | (b2 & 0x3f)));
        } else if ((b1 & 0xf0) === 0xe0) {
          // U+0800-FFFF
          if (length <= p + 1) {
            if (stream) {
              this.#leavings = buf.slice(p - 1);
              break;
            }
            result.push(this.#handleError());
            continue;
          }
          const b2 = buf[p++];
          if ((b2 & 0xc0) !== 0x80) {
            result.push(this.#handleError());
            p--;
            continue;
          }
          const b3 = buf[p++];
          if ((b3 & 0xc0) !== 0x80) {
            result.push(this.#handleError());
            p -= 2;
            continue;
          }
          if (stripBOM && p === 3 && b1 === 0xef && b2 === 0xbb && b3 === 0xbf) {
            // skip BOM `EF BB BF`
            continue;
          }
          result.push(String.fromCharCode(((b1 & 0x0f) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3f)));
        } else {
          result.push(this.#handleError());
        }
      }
      return result.join("");
    } finally {
      if (stream) {
        // Bytes that were only held back do not count as consumed, so a BOM
        // split across the first chunks is still recognised.
        const pending = this.#leavings?.length ?? 0;
        if (length > pending) {
          this.#midStream = true;
        }
      } else {
        // A non-streaming call ends the stream; the next call starts afresh.
        this.#midStream = false;
      }
    }
  }

  /**
   * Normalises the input to a `Uint8Array` and prepends any bytes held back by
   * a previous streaming call.
   */
  #toBinary(input) {
    let bin;
    if (input instanceof Uint8Array) {
      bin = input;
    } else if (ArrayBuffer.isView(input)) {
      // Honour the view's window; the backing buffer may be larger.
      bin = new Uint8Array(input.buffer, input.byteOffset, input.byteLength);
    } else {
      bin = new Uint8Array("buffer" in input ? input.buffer : input);
    }
    if (!this.#leavings) {
      return bin;
    }
    const combined = new Uint8Array(this.#leavings.length + bin.length);
    combined.set(this.#leavings, 0);
    combined.set(bin, this.#leavings.length);
    this.#leavings = undefined;
    return combined;
  }

  /**
   * Throws in fatal mode; otherwise returns the replacement character U+FFFD.
   */
  #handleError() {
    if (this.fatal) {
      throw new TypeError("MUtf8Decoder.decode: Decoding failed.");
    }
    return "\ufffd";
  }
}

/**
 * The encoder for Modified UTF-8.
 *
 * @example
 * ```ts
 * const encoder = new MUtf8Encoder();
 * const code = encoder.encode("Hello 世界!");
 * // Uint8Array [
 * //   0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0xe4, 0xb8,
 * //   0x96, 0xe7, 0x95, 0x8c, 0x21,
 * // ]
 * ```
 *
 * @see {@link https://encoding.spec.whatwg.org/#interface-textencoder}
 */
export class MUtf8Encoder {
  /**
   * @returns Always `"mutf-8"`.
   */
  get encoding() {
    return "mutf-8";
  }

  /**
   * Encodes the specified string in Modified UTF-8.
   *
   * U+0000 is written as the two bytes `C0 80`, and each code point above
   * U+FFFF is written as two three-byte sequences (six bytes in total).
   *
   * @param input - The string to be encoded.
   * @returns The resultant bytes.
   */
  encode(input = "") {
    const bin = [];
    // Iterating a string yields whole code points, so `c` is never empty.
    for (const c of input) {
      const code = c.codePointAt(0);
      if (0x0001 <= code && code <= 0x007f) {
        bin.push(code);
      } else if (code <= 0x07ff) {
        bin.push(0xc0 | (code >>> 6));
        bin.push(0x80 | (0x3f & code));
      } else if (code <= 0xffff) {
        bin.push(0xe0 | (code >>> 12));
        bin.push(0x80 | (0x3f & (code >>> 6)));
        bin.push(0x80 | (0x3f & code));
      } else {
        bin.push(0xed);
        bin.push(0xa0 | ((code >>> 16) - 1));
        bin.push(0x80 | (0x3f & (code >>> 10)));
        bin.push(0xed);
        bin.push(0xb0 | (0x0f & (code >>> 6)));
        bin.push(0x80 | (0x3f & code));
      }
    }
    return new Uint8Array(bin);
  }

  /**
   * Encodes the specified string in Modified UTF-8 and stores the result in the specified array.
   *
   * Encoding stops before the first code point that does not fit entirely in
   * the remaining space of `destination`.
   *
   * @param source - The string to be encoded.
   * @param destination - The array to store the encoded bytes.
   * @returns The progress of the encoding operation: `read` is the number of
   *   UTF-16 code units consumed, `written` the number of bytes stored.
   */
  encodeInto(source, destination) {
    const destLen = destination.length;
    let i = 0;
    let read = 0;
    for (const c of source) {
      const code = c.codePointAt(0);
      if (0x0001 <= code && code <= 0x007f) {
        if (destLen <= i) break;
        destination[i++] = code;
      } else if (code <= 0x07ff) {
        if (destLen <= i + 1) break;
        destination[i++] = 0xc0 | (code >>> 6);
        destination[i++] = 0x80 | (0x3f & code);
      } else if (code <= 0xffff) {
        if (destLen <= i + 2) break;
        destination[i++] = 0xe0 | (code >>> 12);
        destination[i++] = 0x80 | (0x3f & (code >>> 6));
        destination[i++] = 0x80 | (0x3f & code);
      } else {
        if (destLen <= i + 5) break;
        destination[i++] = 0xed;
        destination[i++] = 0xa0 | ((code >>> 16) - 1);
        destination[i++] = 0x80 | (0x3f & (code >>> 10));
        destination[i++] = 0xed;
        destination[i++] = 0xb0 | (0x0f & (code >>> 6));
        destination[i++] = 0x80 | (0x3f & code);
        // A code point above U+FFFF occupies two UTF-16 code units.
        read++;
      }
      read++;
    }
    return { read, written: i };
  }
}