UNPKG

wcwidth-o1

Version:

Unicode text width calculation (wcwidth/wcswidth) for Node.js and TypeScript, optimized to O(1), based on Markus Kuhn's reference.

203 lines (202 loc) 7.19 kB
"use strict"; /* * Markus Kuhn -- 2007-05-26 (Unicode 5.0) * * Permission to use, copy, modify, and distribute this software * for any purpose and without fee is hereby granted. The author * disclaims all warranties with regard to this software. * * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c * * ---------------------------------------------------------------------------- * * Dawson Huang -- 11 Sep 2025 * Translated wcwidth implementation from C to TypeScript. * * Dawson Huang -- 13 Sep 2025 * - Optimized wcwidth and wcswidth implementation by adopting bitset lookup. * - Extended combining ranges to full Unicode 15.1 coverage. * * Dawson Huang -- 16 Sep 2025 * Extended combining and ambiguous ranges to full Unicode 17.0 coverage. */ Object.defineProperty(exports, "__esModule", { value: true }); exports.wcwidth = wcwidth; exports.wcswidth = wcswidth; exports.wcwidthCjk = wcwidthCjk; exports.wcswidthCjk = wcswidthCjk; const combining_1 = require("./combining"); const ambiguous_1 = require("./ambiguous"); /* The following two functions define the column width of an ISO 10646 * character as follows: * * - The null character (U+0000) has a column width of 0. * * - Other C0/C1 control characters and DEL will lead to a return * value of -1. * * - Non-spacing and enclosing combining characters (general * category code Mn or Me in the Unicode database) have a * column width of 0. * * - SOFT HYPHEN (U+00AD) has a column width of 1. * * - Other format characters (general category code Cf in the Unicode * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. * * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) * have a column width of 0. * * - Spacing characters in the East Asian Wide (W) or East Asian * Full-width (F) category as defined in Unicode Technical * Report #11 have a column width of 2. * * - All remaining characters (including all printable * ISO 8859-1 and WGL4 characters, Unicode control characters, * etc.) have a column width of 1. * * This implementation assumes that JavaScript strings are valid UTF-16 * and code points are obtained via `String.prototype.codePointAt`. */ /** * Determines the display width of a Unicode code point. * * - Returns `0`, `1`, or `2` based on Unicode width rules. * - Returns `-1` for control characters or invalid cases. * * @param char - A single-character string to measure. * @returns The display width of `ucs` in terminal columns. */ function wcwidth(char) { const ucs = char.codePointAt(0); // invalid character if (ucs === undefined) return -1; // test for 8-bit control characters if (ucs === 0) return 0; if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) return -1; // bitset lookup for non-spacing characters const isCombining = (ucs) => ucs <= 0x10ffff && (((0, combining_1.combining)(ucs >> 5) >>> (ucs & 31)) & 1) !== 0; if (isCombining(ucs)) return 0; // if we arrive here, ucs is not a combining or C0/C1 control character // ucs is still in combining if (ucs < 0x1100) return 1; // Hangul Jamo init. consonants if (ucs <= 0x115f || ucs === 0x2329 || ucs === 0x232a) return 2; // Hangul Filler if (ucs === 0x3164) return 1; // CJK ... Yi if (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs !== 0x303f) return 2; return ( // Hangul Syllables (0xac00 <= ucs && ucs <= 0xd7a3) || // CJK Compatibility Ideographs (0xf900 <= ucs && ucs <= 0xfaff) || // Vertical forms (0xfe10 <= ucs && ucs <= 0xfe19) || // CJK Compatibility Forms (0xfe30 <= ucs && ucs <= 0xfe6f) || // Fullwidth Forms (0xff00 <= ucs && ucs <= 0xff60) || (0xffe0 <= ucs && ucs <= 0xffe6) || (0x20000 <= ucs && ucs <= 0x2fffd) || (0x30000 <= ucs && ucs <= 0x3fffd)) ? 2 : 1; } /** * Computes the display width of a Unicode string up to `n` characters. * * - Iterates code points and sums their widths using `wcwidth`. * - Returns `-1` if any code point has undefined width. * * @param str - Input string to evaluate. * @param n - Max characters to process (defaults to full length). * @returns The total display width in terminal columns. */ function wcswidth(str, n) { if (!str) return 0; const chars = Array.from(str); let width = 0; for (let i = 0; i < chars.length; i++) { if (n !== undefined && i >= n) break; const w = wcwidth(chars[i]); if (w < 0) return -1; width += w; } return width; } /* * The following functions are the same as wcwidth() and * wcswidth(), except that spacing characters in the East Asian * Ambiguous (A) category as defined in Unicode Technical Report #11 * have a column width of 2. This variant might be useful for users of * CJK legacy encodings who want to migrate to UCS without changing * the traditional terminal character-width behaviour. It is not * otherwise recommended for general use. */ /** * Computes the column width of a Unicode code point (CJK variant). * * - Useful for CJK legacy environments; not recommended for general use. * - Same rules as {@link wcwidth}, except East Asian Ambiguous (A) characters * are treated as width 2 (per Unicode TR#11). * - Returns `0`, `1`, or `2` depending on the character. * - Returns `-1` for control or undefined characters. * * @param char - A single-character string to measure. * @returns Column width in terminal cells, or -1 on error. */ function wcwidthCjk(char) { const ucs = char.codePointAt(0); // invalid character if (ucs === undefined) return -1; // bitset lookup for non-spacing characters const isAmbiguous = (ucs) => ucs <= 0x10ffff && (((0, ambiguous_1.ambiguous)(ucs >> 5) >>> (ucs & 31)) & 1) !== 0; if (isAmbiguous(ucs)) return 2; return wcwidth(char); } /** * Computes the display width of a Unicode string (CJK variant). * * - Useful for CJK legacy environments; not recommended for general use. * - Same rules as {@link wcswidth}, except East Asian Ambiguous (A) characters * are treated as width 2 (per Unicode TR#11). * - Returns `-1` if any character in the string is invalid or a control code. * * @param str - Input string to evaluate. * @param n - Max characters to process (defaults to full length). * @returns Total display width in terminal cells, or -1 on error. */ function wcswidthCjk(str, n) { if (!str) return 0; const chars = Array.from(str); let width = 0; for (let i = 0; i < chars.length; i++) { if (n !== undefined && i >= n) break; const w = wcwidthCjk(chars[i]); if (w < 0) return -1; width += w; } return width; } exports.default = wcwidth; module.exports = wcwidth; module.exports.default = wcwidth; module.exports.wcwidth = wcwidth; module.exports.wcswidth = wcswidth; module.exports.wcwidthCjk = wcwidthCjk; module.exports.wcswidthCjk = wcswidthCjk;