functionalscript
Version:
FunctionalScript is a purely functional subset of JavaScript
199 lines (198 loc) • 7.04 kB
JavaScript
import { flat, flatMap, stateScan } from "../../types/list/module.f.js";
/**
* Error mask constant used to represent invalid code points or encoding errors in UTF-8.
*/
const errorMask = 0b1000_0000_0000_0000_0000_0000_0000_0000;
/**
* Converts a Unicode code point to a sequence of UTF-8 bytes.
* @param input The Unicode code point to be converted. Valid range:
* - 0x0000 to 0x007F for 1-byte sequences.
* - 0x0080 to 0x07FF for 2-byte sequences.
* - 0x0800 to 0xFFFF for 3-byte sequences.
* - 0x10000 to 0x10FFFF for 4-byte sequences.
* @returns A readonly array of UTF-8 bytes representing the input code point.
* - Returns `[errorMask]` if the input does not match valid UTF-8 encoding rules.
*/
const codePointToUtf8 = (input) => {
if (input >= 0x0000 && input <= 0x007f) {
return [input & 0b01111_1111];
}
if (input >= 0x0080 && input <= 0x07ff) {
return [input >> 6 | 0b1100_0000, input & 0b0011_1111 | 0b1000_0000];
}
if (input >= 0x0800 && input <= 0xffff) {
return [
input >> 12 | 0b1110_0000,
input >> 6 & 0b0011_1111 | 0b1000_0000,
input & 0b0011_1111 | 0b1000_0000,
];
}
if (input >= 0x10000 && input <= 0x10ffff) {
return [
input >> 18 | 0b1111_0000,
input >> 12 & 0b0011_1111 | 0b1000_0000,
input >> 6 & 0b0011_1111 | 0b1000_0000,
input & 0b0011_1111 | 0b1000_0000,
];
}
if ((input & errorMask) !== 0) {
if ((input & 0b1000_0000_0000_0000) !== 0) {
return [
input >> 12 & 0b0000_0111 | 0b1111_0000,
input >> 6 & 0b0011_1111 | 0b1000_0000,
input & 0b0011_1111 | 0b1000_0000,
];
}
if ((input & 0b0000_0100_0000_0000) !== 0) {
return [
input >> 6 & 0b0000_1111 | 0b1110_0000,
input & 0b0011_1111 | 0b1000_0000,
];
}
if ((input & 0b0000_0010_0000_0000) !== 0) {
return [
input >> 6 & 0b0000_0111 | 0b1111_0000,
input & 0b0011_1111 | 0b1000_0000,
];
}
if ((input & 0b0000_0000_1000_0000) !== 0)
return [input & 0b1111_1111];
}
return [errorMask];
};
/**
* Maps a list of Unicode code points to a stream of UTF-8 bytes.
*
* @param input - A list of Unicode code points to be converted.
* @returns A thunk that lazily produces a sequence of UTF-8 bytes.
*/
export const fromCodePointList = flatMap(codePointToUtf8);
/**
* Converts a non-empty UTF-8 decoding state to an error code.
*
* @param state - A non-empty UTF-8 decoding state.
* @returns An I32 error code derived from the invalid UTF-8 state.
*/
const utf8StateToError = (state) => {
let x;
switch (state.length) {
case 1: {
[x] = state;
break;
}
case 2: {
const [s0, s1] = state;
x = s0 < 0b1111_0000
? ((s0 & 0b0000_1111) << 6) + (s1 & 0b0011_1111) + 0b0000_0100_0000_0000
: ((s0 & 0b0000_0111) << 6) + (s1 & 0b0011_1111) +
0b0000_0010_0000_0000;
break;
}
case 3: {
const [s0, s1, s2] = state;
x = ((s0 & 0b0000_0111) << 12) + ((s1 & 0b0011_1111) << 6) +
(s2 & 0b0011_1111) + 0b1000_0000_0000_0000;
break;
}
default:
throw 'invalid state';
}
return x | errorMask;
};
/**
* Decodes a byte into a Unicode code point, using a given UTF-8 state.
*
* @param state - The current UTF-8 decoding state.
* @param byte - A single byte to decode.
* @returns A tuple containing:
* - A list of decoded Unicode code points or error codes.
* - The updated UTF-8 state.
*/
const utf8ByteToCodePointOp = (state) => (byte) => {
if (byte < 0x00 || byte > 0xff) {
return [[errorMask], state];
}
if (state === null) {
if (byte < 0b1000_0000)
return [[byte], null];
if (byte >= 0b1100_0010 && byte <= 0b1111_0100)
return [[], [byte]];
return [[byte | errorMask], null];
}
if (byte >= 0b1000_0000 && byte < 0b1100_0000) {
switch (state.length) {
case 1: {
const [s0] = state;
if (s0 < 0b1110_0000) {
return [[((s0 & 0b0001_1111) << 6) + (byte & 0b0011_1111)], null];
}
if (s0 < 0b1111_1000)
return [[], [s0, byte]];
break;
}
case 2: {
const [s0, s1] = state;
if (s0 < 0b1111_0000) {
return [[
((s0 & 0b0000_1111) << 12) + ((s1 & 0b0011_1111) << 6) +
(byte & 0b0011_1111),
], null];
}
if (s0 < 0b1111_1000)
return [[], [s0, s1, byte]];
break;
}
case 3: {
const [s0, s1, s2] = state;
return [[
((s0 & 0b0000_0111) << 18) + ((s1 & 0b0011_1111) << 12) +
((s2 & 0b0011_1111) << 6) + (byte & 0b0011_1111),
], null];
}
}
}
const error = utf8StateToError(state);
if (byte < 0b1000_0000)
return [[error, byte], null];
if (byte >= 0b1100_0010 && byte <= 0b1111_0100)
return [[error], [byte]];
return [[error, byte | errorMask], null];
};
/**
* Handles the end-of-file (EOF) case for UTF-8 decoding.
*
* @param state - The current UTF-8 decoding state.
* @returns A tuple containing:
* - A list of decoded Unicode code points or error codes.
* - The reset UTF-8 state (`null`).
*/
const utf8EofToCodePointOp = (state) => [
state === null ? null : [utf8StateToError(state)],
null,
];
/**
* Combines UTF-8 byte and EOF handling into a single decoding operation.
*
* @param state - The current UTF-8 decoding state.
* @param input - The next byte or EOF indicator.
* @returns A tuple containing:
* - A list of decoded Unicode code points or error codes.
* - The updated UTF-8 state.
*/
const utf8ByteOrEofToCodePointOp = (state) => (input) => input === null ? utf8EofToCodePointOp(state) : utf8ByteToCodePointOp(state)(input);
/**
* A constant representing the end-of-file (EOF) marker for UTF-8 decoding.
*
* @remarks
* This is used as a sentinel value in decoding operations to signify the
* termination of input. The list contains a single `null` value, which
* represents the EOF condition.
*/
const eofList = [null];
/**
* Converts a list of UTF-8 bytes into a list of Unicode code points.
*
* @param input - A list of UTF-8 bytes.
* @returns A list of Unicode code points or error codes.
*/
export const toCodePointList = (input) => flat(stateScan(utf8ByteOrEofToCodePointOp)(null)(flat([input, eofList])));