functionalscript
Version:
FunctionalScript is a purely functional subset of JavaScript
353 lines (352 loc) β’ 14.8 kB
JavaScript
import { map, flat, stateScan, reduce, flatMap, empty, } from "../../types/list/module.f.js";
import { concat } from "../../types/function/operator/module.f.js";
import { contains } from "../../types/range/module.f.js";
import { fn } from "../../types/function/module.f.js";
/**
* Ranges of code points for the lower (Low) and higher (High) parts of the BMP (Basic Multilingual Plane) plane.
*/
const lowBmp = contains([0x0000, 0xd7ff]);
const highBmp = contains([0xe000, 0xffff]);
/**
* Checks whether the code point is in the BMP range.
* BMP is the main multi-plane Unicode plane that covers code points 0x0000 - 0xFFFF, except for the range of surrogates.
*/
const isBmpCodePoint = (codePoint) => lowBmp(codePoint) || highBmp(codePoint);
/**
* Checks whether the 16-bit word (U16) is a surrogate of the high part.
* Range: 0xD800 - 0xDBFF.
*/
const isHighSurrogate = contains([0xd800, 0xdbff]);
/**
* Checks whether the 16-bit word (U16) is a substitute for the low part.
* Range: 0xDC00 β 0xDFFF.
*/
const isLowSurrogate = contains([0xdc00, 0xdfff]);
/**
* Mask of mistakes. Used to indicate invalid code points or coding errors
*/
const errorMask = 0b1000_0000_0000_0000_0000_0000_0000_0000;
/**
* Checks whether the code point belongs to the additional (Supplementary) plane of Unicode.
* Additional planes include code points from 0x010000 to 0x10FFFF.
*/
const isSupplementaryPlane = contains([0x01_0000, 0x10_ffff]);
/**
* Converts a Unicode code point to its corresponding UTF-16 representation.
*
* This function handles:
* 1. Code points in the Basic Multilingual Plane (BMP) [0x0000β0xFFFF],
* which map directly to a single 16-bit value (U16).
* 2. Supplementary Plane code points [0x10000β0x10FFFF],
* which are represented as surrogate pairs in UTF-16.
*
* @param codePoint - A valid Unicode code point.
* @returns A list of 16-bit unsigned integers (U16) representing the UTF-16 encoding
* of the input code point. If the code point is in the BMP range, a single U16
* value is returned. For code points in the supplementary planes, two U16
* values (a high and a low surrogate) are returned.
* @example
*
* ```ts
* const exampleCodePoints: List<CodePoint> = [
* 0x0041, // 'A' (BMP, single U16)
* 0x1F600, // π (Emoji, supplementary plane, surrogate pair)
* 0xD7FF, // Last code point in the low BMP range
* 0xE000, // First code point in the high BMP range
* 0x10FFFF, // Maximum valid code point in Unicode
* 0x110000, // Invalid code point (outside Unicode range)
* ]
* exampleCodePoints.forEach((codePoint) => {
* const utf16Result = codePointToUtf16(codePoint)
* console.log(`Code Point: U+${codePoint.toString(16).toUpperCase()}`)
* console.log(`UTF-16: ${utf16Result.map(u16 => u16.toString(16).toUpperCase()).join(' ')}`)
* })
* ```
*/
const codePointToUtf16 = (codePoint) => {
if (isBmpCodePoint(codePoint)) {
return [codePoint];
}
if (isSupplementaryPlane(codePoint)) {
const n = codePoint - 0x1_0000;
const high = (n >> 10) + 0xd800;
const low = (n & 0b0011_1111_1111) + 0xdc00;
return [high, low];
}
return [codePoint & 0xffff];
};
/**
* Converts a UTF-16 sequence to its corresponding Unicode code points.
*
* This function handles:
* 1. Single U16 values in the Basic Multilingual Plane (BMP) [0x0000β0xFFFF].
* 2. Surrogate pairs representing code points in the Supplementary Plane [0x10000β0x10FFFF].
* 3. Invalid input sequences by applying an error mask to the resulting code point.
*
* @param utf16 - A list of UTF-16 code units (U16) to convert.
* @returns A list of Unicode code points. Each code point corresponds to one or more U16
* values in the input. Invalid sequences are marked with the `errorMask`.
* @example
*
* ```ts
* const exampleUtf16: List<U16> = [
* 0x0041, // 'A' (BMP, single U16)
* 0xD83D, 0xDE00, // π (Emoji, surrogate pair)
* 0xD800, // Unpaired high surrogate
* 0xDC00, // Unpaired low surrogate
* ]
*
* const codePoints = toCodePointList(exampleUtf16)
* codePoints.forEach((codePoint) => {
* if (codePoint & errorMask) {
* console.log(`Invalid sequence detected: ${codePoint.toString(16).toUpperCase()}`)
* } else {
* console.log(`Code Point: U+${codePoint.toString(16).toUpperCase()}`)
* }
* })
* ```
*/
export const fromCodePointList = flatMap(codePointToUtf16);
/**
* Validates whether a given 16-bit unsigned integer (U16) falls within the valid range for UTF-16 code units.
*
* UTF-16 uses 16-bit code units to encode characters. The valid range for these code units is [0x0000, 0xFFFF].
* This function is used to verify that a number is within this range.
*
* @param i - A 16-bit unsigned integer (U16) to validate.
* @returns A boolean value indicating whether the input is a valid UTF-16 code unit.
*
* @example
*
* ```ts
* const validU16 = u16(0x0041) // true: U+0041 ('A')
* const invalidU16 = u16(0x110000) // false: Value is outside the valid range
* const edgeCaseLow = u16(0x0000) // true: Minimum valid value for UTF-16
* const edgeCaseHigh = u16(0xFFFF) // true: Maximum valid value for UTF-16
* ```
*/
const u16 = contains([0x0000, 0xFFFF]);
/**
* A stateful operation that converts a UTF-16 word (U16) to a list of Unicode code points (CodePoint),
* while maintaining the state of surrogate pair decoding.
*
* This operation processes UTF-16 code units and decodes them into Unicode code points. It handles:
* 1. BMP code points (single U16).
* 2. Surrogate pairs (two U16s representing a single code point in the supplementary planes).
* 3. Invalid or malformed code units.
*
* It also manages the internal state, which is necessary to handle surrogate pairs. If the state is null,
* it expects a valid BMP code point or a high surrogate. If the state is not null, it processes a low surrogate
* and combines the pair into a single supplementary code point.
*
* @param state - The current state of the UTF-16 decoding operation.
* This can be either `null` (no state) or the previous high surrogate value.
* @param word - The current UTF-16 word (U16) to process.
* @returns A tuple where the first element is a list of decoded Unicode code points (`CodePoint`), and
* the second element is the updated state. If the word is invalid, an error code `0xffffffff` is returned.
*
* @example
*
* ```ts
* const state: Utf16State = null;
* const word: U16 = 0xD83D; // High surrogate for π emoji
* const [decodedCodePoints, newState] = utf16ByteToCodePointOp(state)(word);
* ```
*
* @example
*
* ```ts
* const state: Utf16State = 0xD83D; // High surrogate already stored
* const word: U16 = 0xDC00; // Low surrogate for π emoji
* const [decodedCodePoints, newState] = utf16ByteToCodePointOp(state)(word);
* ```
*/
const utf16ByteToCodePointOp = state => word => {
if (!u16(word)) {
return [[0xffffffff], state];
}
if (state === null) {
if (isBmpCodePoint(word)) {
return [[word], null];
}
if (isHighSurrogate(word)) {
return [[], word];
}
return [[word | errorMask], null];
}
if (isLowSurrogate(word)) {
const high = state - 0xd800;
const low = word - 0xdc00;
return [[(high << 10) + low + 0x10000], null];
}
if (isBmpCodePoint(word)) {
return [[state | errorMask, word], null];
}
if (isHighSurrogate(word)) {
return [[state | errorMask], word];
}
return [[state | errorMask, word | errorMask], null];
};
/**
* Handles the EOF (end-of-file) condition during UTF-16 decoding.
*
* If there is no pending state (`state === null`), it simply returns an empty list
* of code points, indicating no further input to process. If there is a pending state,
* it is treated as an unpaired surrogate, and the `errorMask` is applied to flag
* the invalid sequence.
*
* @param state - The current UTF-16 decoding state. This can be:
* - `null`: No pending surrogate to process.
* - A high surrogate (0xD800β0xDBFF) left from an earlier input, waiting for a low surrogate.
* @returns A tuple:
* - The first element is a list of code points. If thereβs a pending state, it is returned
* with the `errorMask` applied.
* - The second element is the next state, which will always be `null` because EOF means no
* further processing.
*
* @example
*
* ```js
* const eofState = utf16EofToCodePointOp(0xD800) // Unpaired high surrogate
* const validState = utf16EofToCodePointOp(null) // No pending state
* ```
*/
const utf16EofToCodePointOp = (state) => [state === null ? empty : [state | errorMask], null];
/**
* A stateful scan operation that processes UTF-16 input (word or EOF).
* This function determines whether to handle a UTF-16 word or an end-of-file (EOF)
* signal during decoding:
* 1. If the input is `null` (EOF), it calls `utf16EofToCodePointOp` to process
* any remaining state.
* 2. If the input is a valid UTF-16 word, it calls `utf16ByteToCodePointOp` to
* process the word and update the state accordingly.
* @param state - The current state in the UTF-16 decoding process:
* - `null`: No pending surrogate.
* - A high surrogate waiting for a low surrogate.
* @param input - The current UTF-16 word to process, or `null` to signal EOF.
* @returns A tuple:
* - A list of decoded code points (if any).
* - The updated decoding state.
*
* @example
*
* ```ts
* // Example 1: Process a valid UTF-16 word
* const input1 = 0x0041 // 'A' (BMP code point)
* const result1 = utf16ByteOrEofToCodePointOp(null)(input1)
* console.log(result1) // [[0x0041], null]
* // Example 2: Process a high surrogate, followed by EOF
* const input2 = 0xD83D // High surrogate
* const result2 = utf16ByteOrEofToCodePointOp(null)(input2)
* console.log(result2) // [[], 0xD83D] (waiting for a low surrogate)
* const eofResult = utf16ByteOrEofToCodePointOp(0xD83D)(null)
* console.log(eofResult) // [[0xD83D | errorMask], null] (unpaired high surrogate)
* // Example 3: Handle EOF with no pending state
* const eofResult2 = utf16ByteOrEofToCodePointOp(null)(null)
* ```
*/
const utf16ByteOrEofToCodePointOp = state => input => input === null ? utf16EofToCodePointOp(state) : utf16ByteToCodePointOp(state)(input);
/**
* Represents an end-of-file (EOF) indicator in a list of UTF-16 code units.
*
* This list contains a single element, `null`, which is used to signal the end
* of input during UTF-16 decoding operations.
* @example
*
* ```ts
* const input = [...utf16Data, ...eofList]
* // Ensures the EOF is handled during processing.
* ```
*/
const eofList = [null];
/**
* Converts a list of UTF-16 code units to a list of Unicode code points (CodePoint).
* This function processes each UTF-16 code unit, decoding them into their corresponding Unicode code points.
* The input list of `U16` values may represent characters in the Basic Multilingual Plane (BMP) or supplementary planes,
* with surrogate pairs handled correctly. The function also handles EOF (`null`).
* @param input - A list of UTF-16 code units (`U16`), possibly containing surrogate pairs.
* @returns A list of Unicode code points (`CodePoint`), one for each valid code unit or surrogate pair.
*
* @example
*
* ```ts
* const utf16List: List<U16> = [0x0041, 0xD83D, 0xDE00] // 'A' and π (surrogate pair)
* const codePoints = toCodePointList(utf16List)
* ```
*/
export const toCodePointList = (input) => flat(stateScan(utf16ByteOrEofToCodePointOp)(null)(flat([input, eofList])));
/**
* Converts a string to a list of UTF-16 code units (U16).
*
* This function processes each character in the input string and converts it to its corresponding UTF-16 code unit(s).
* Characters in the Basic Multilingual Plane (BMP) will produce a single `U16`, while supplementary plane characters
* (those requiring surrogate pairs) will produce two `U16` values.
* @param s - The input string to convert to UTF-16 code units.
* @returns A list of UTF-16 code units (`U16`) representing the string.
*
* @example
*
* ```js
* const inputString = "Hello, π"
* const utf16List = stringToList(inputString)
* ```
*/
export const stringToList = (s) => {
const at = (i) => {
const first = s.charCodeAt(i);
return isNaN(first) ? empty : { first, tail: () => at(i + 1) };
};
return at(0);
};
/**
* Converts a string to a list of Unicode code points (CodePoint).
* This function first converts the string to a list of UTF-16 code units (U16) using `stringToList`,
* then it converts the UTF-16 code units to Unicode code points using `toCodePointList`. This is useful for handling
* Unicode characters, including supplementary characters represented by surrogate pairs in UTF-16.
*
* @param input - The input string to convert.
* @returns A list of Unicode code points (`CodePoint`) corresponding to the characters in the string.
*
* @example
*
* ```js
* const inputString = "Hello, π"
* const codePoints = stringToCodePointList(inputString)
* ```
*/
export const stringToCodePointList = (input) => toCodePointList(stringToList(input));
/**
* Converts a list of UTF-16 code units (U16) to a string.
* This function takes a list of `U16` values (UTF-16 code units) and reconstructs the original string by mapping
* each code unit back to its character using `String.fromCharCode`. The resulting characters are concatenated
* to form the final string.
*
* @param input - A list of UTF-16 code units (`U16`).
* @returns A string representing the characters encoded by the input UTF-16 code units.
*
* @example
*
* ```ts
* const utf16List: List<U16> = [0x0041, 0x0042, 0x0043] // 'ABC'
* const outputString = listToString(utf16List)
* ```
*/
export const listToString = fn(map(String.fromCharCode))
.then(reduce(concat)(''))
.result;
/**
* Converts a list of Unicode code points (CodePoint) to a string.
* This function first converts the list of Unicode code points to a list of UTF-16 code units using `fromCodePointList`,
* then it uses `listToString` to reconstruct the string from the UTF-16 code units.
*
* @param input - A list of Unicode code points (`CodePoint`).
* @returns A string representing the characters encoded by the input code points.
*
* @example
*
* ```ts
* const codePoints: List<CodePoint> = [0x48, 0x65, 0x6C, 0x6C, 0x6F]
* const outputString = codePointListToString(codePoints)
* ```
*/
export const codePointListToString = (input) => listToString(fromCodePointList(input));