UNPKG

string-kit

Version:

A string manipulation toolbox, featuring a string formatter (inspired by sprintf), a variable inspector (output featuring ANSI colors and HTML) and various escape functions (shell argument, regexp, html, etc).

361 lines (269 loc) 11.3 kB
/*
	String Kit

	Copyright (c) 2014 - 2021 Cédric Ronvel

	The MIT License (MIT)

	Permission is hereby granted, free of charge, to any person obtaining a copy
	of this software and associated documentation files (the "Software"), to deal
	in the Software without restriction, including without limitation the rights
	to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	copies of the Software, and to permit persons to whom the Software is
	furnished to do so, subject to the following conditions:

	The above copyright notice and this permission notice shall be included in all
	copies or substantial portions of the Software.

	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	SOFTWARE.
*/

"use strict" ;



/*
	Javascript does not use UTF-8 but UCS-2.
	The purpose of this module is to process correctly strings containing UTF-8 characters that take more than 2 bytes.

	Since the punycode module is deprecated in Node.js v8.x, this is an adaptation of punycode.ucs2.x as found on Aug 16th 2017 at:
	https://github.com/bestiejs/punycode.js/blob/master/punycode.js.

	2021 note -- Modern Javascript is way more unicode friendly since many years, e.g. `Array.from( string )` and `for ( char of string )` are unicode aware.
	Some methods here are now useless, but have been modernized to use the correct ES features.
*/



// Create the module and export it
const unicode = {} ;
module.exports = unicode ;



// Encode an array of unicode codepoints into a string.
unicode.encode = array => String.fromCodePoint( ... array ) ;



// Decode a string into an array of unicode codepoints.
// The 2nd argument of Array.from() is a map function, it avoids creating intermediate array.
unicode.decode = str => Array.from( str , c => c.codePointAt( 0 ) ) ;



// DEPRECATED: This function is totally useless now, with modern JS.
unicode.firstCodePoint = str => str.codePointAt( 0 ) ;



// Extract only the first char (a full code point, not just the first UTF-16 unit).
// Return undefined for an empty string.
unicode.firstChar = str => str.length ? String.fromCodePoint( str.codePointAt( 0 ) ) : undefined ;



// DEPRECATED: This function is totally useless now, with modern JS.
unicode.toArray = str => Array.from( str ) ;



// Decode a string into an array of Cell (used by Terminal-kit).
// Wide chars have an additional filler cell, so position is correct.
// Each cell is built with `new Cell( char , width , ... extraCellArgs )`, where width is:
//   * the computed width for a regular char (1 for the tab char itself),
//   * -1 for an anti-filler cell following a wide char,
//   * -2 for a filler cell padding a tab up to the next tab-stop.
// A new line produces no cell, it only resets the line position used to compute tab-stops.
unicode.toCells = ( Cell , str , tabWidth = 4 , linePosition = 0 , ... extraCellArgs ) => {
	const output = [] ;
	let position = linePosition ;

	for ( const char of str ) {
		const code = char.codePointAt( 0 ) ;

		if ( code === 0x0a ) {	// New line
			position = 0 ;
		}
		else if ( code === 0x09 ) {	// Tab
			// Depends upon the next tab-stop
			let fillSize = tabWidth - ( position % tabWidth ) - 1 ;
			output.push( new Cell( '\t' , 1 , ... extraCellArgs ) ) ;
			position += 1 + fillSize ;

			// Add filler cells.
			// The `> 0` guard prevents an endless loop when fillSize is negative or fractional (odd tabWidth values).
			while ( fillSize -- > 0 ) {
				output.push( new Cell( ' ' , -2 , ... extraCellArgs ) ) ;
			}
		}
		else {
			let width = unicode.codePointWidth( code ) ;
			output.push( new Cell( char , width , ... extraCellArgs ) ) ;
			position += width ;

			// Add an anti-filler cell (a cell with 0 width, following a wide char)
			while ( -- width > 0 ) {
				output.push( new Cell( ' ' , -1 , ... extraCellArgs ) ) ;
			}
		}
	}

	return output ;
} ;



// Rebuild a string from an array of cells: concatenate the `.char` of every cell whose `.filler` is falsy.
unicode.fromCells = ( cells ) => {
	let str = '' ;

	for ( const cell of cells ) {
		if ( ! cell.filler ) { str += cell.char ; }
	}

	return str ;
} ;



// Get the length of a unicode string (number of code points)
// Mostly an adaptation of .decode(), not factorized for performance's sake (used by Terminal-kit)
// /!\ Use Array.from().length instead??? Not using it is potentially faster, but it needs benchmark to be sure.
unicode.length = str => {
	// for ... of is unicode-aware
	let length = 0 ;

	for ( const char of str ) { length ++ ; }	/* eslint-disable-line no-unused-vars */

	return length ;
} ;



// Return a string that does not exceed the character limit (counted in code points).
// A negative limit yields an empty string, a fractional limit is effectively rounded down.
unicode.truncateLength = unicode.truncate = ( str , limit ) => {
	let position = 0 ,
		length = 0 ;

	for ( const char of str ) {
		// Same form of test as .truncateWidth(): stop as soon as one more char would exceed the limit
		if ( length + 1 > limit ) { return str.slice( 0 , position ) ; }
		length ++ ;
		// position is an UTF-16 index, so it advances by 2 for astral chars
		position += char.length ;
	}

	// The string remains unchanged
	return str ;
} ;



// Return the width of a string in a terminal/monospace font
unicode.width = str => {
	// for ... of is unicode-aware
	let count = 0 ;

	for ( const char of str ) {
		count += unicode.codePointWidth( char.codePointAt( 0 ) ) ;
	}

	return count ;
} ;



// Return the width of an array of string in a terminal/monospace font.
// Only the first `limit` elements are counted (all of them if limit is undefined or greater than the array length).
// Each element counts for 2 if it is full-width, 1 otherwise.
unicode.arrayWidth = ( array , limit ) => {
	// Never read past the end of the array: .isFullWidth() would throw on undefined
	const end = limit === undefined || limit > array.length ? array.length : limit ;
	let count = 0 ;

	for ( let index = 0 ; index < end ; index ++ ) {
		count += unicode.isFullWidth( array[ index ] ) ? 2 : 1 ;
	}

	return count ;
} ;



// Userland may use this, it is more efficient than .truncateWidth() + .width(),
// and BTW even more than testing .width() then .truncateWidth() + .width()
let lastTruncateWidth = 0 ;
unicode.getLastTruncateWidth = () => lastTruncateWidth ;



// Return a string that does not exceed the width limit (taking wide-char into considerations).
// Side effect: the width of the returned string is stored and can be retrieved with .getLastTruncateWidth().
unicode.widthLimit =	// DEPRECATED
unicode.truncateWidth = ( str , limit ) => {
	let position = 0 ;

	// Module global:
	lastTruncateWidth = 0 ;

	for ( const char of str ) {
		const charWidth = unicode.codePointWidth( char.codePointAt( 0 ) ) ;

		if ( lastTruncateWidth + charWidth > limit ) {
			return str.slice( 0 , position ) ;
		}

		lastTruncateWidth += charWidth ;
		position += char.length ;
	}

	// The string remains unchanged
	return str ;
} ;



/*
	** PROBABLY DEPRECATED **

	Check if a UCS2 char is a surrogate pair.

	Returns:
		0: single char
		1: leading surrogate
		-1: trailing surrogate

	Note: it does not check input, to gain perfs.
*/
unicode.surrogatePair = char => {
	const code = char.charCodeAt( 0 ) ;

	if ( code < 0xd800 || code >= 0xe000 ) { return 0 ; }
	if ( code < 0xdc00 ) { return 1 ; }
	return -1 ;
} ;



// Check if a character is a full-width char or not
unicode.isFullWidth = char => unicode.isFullWidthCodePoint( char.codePointAt( 0 ) ) ;

// Return the width of a char, leaner than .width() for one char
unicode.charWidth = char => unicode.codePointWidth( char.codePointAt( 0 ) ) ;



/*
	Build the Emoji width lookup.
	The ranges file (./json-data/unicode-emoji-width-ranges.json) is produced by a Terminal-Kit script
	([terminal-kit]/utilities/build-emoji-width-lookup.js), that writes each emoji and check the cursor location.

	Each range entry is read as { s , e , w }: every code point from s to e (inclusive) is mapped to the width w.
*/
const emojiWidthLookup = new Map() ;

for ( const range of require( './json-data/unicode-emoji-width-ranges.json' ) ) {
	for ( let i = range.s ; i <= range.e ; i ++ ) {
		emojiWidthLookup.set( i , range.w ) ;
	}
}



/*
	Return the width of a codepoint in a terminal/monospace font:
		0: emoji modifiers and zero-width (combining) diacritics
		2: full-width chars, and emoji (unless the lookup says otherwise)
		1: anything else
*/
unicode.codePointWidth = code => {
	// Emoji: use the measured width when there is one, assume wide otherwise
	if ( unicode.isEmojiCodePoint( code ) ) {
		return emojiWidthLookup.get( code ) ?? 2 ;
	}

	// This MUST be tested before the full-width ranges: the combining dakuten/handakuten (0x3099, 0x309a)
	// are located inside a full-width range, but they combine with the previous char and take no room.
	if (
		unicode.isEmojiModifierCodePoint( code )
		|| unicode.isZeroWidthDiacriticCodePoint( code )
	) {
		return 0 ;
	}

	// Code points are derived from:
	// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
	if ( code >= 0x1100 && (
		code <= 0x115f ||	// Hangul Jamo
		code === 0x2329 ||	// LEFT-POINTING ANGLE BRACKET
		code === 0x232a ||	// RIGHT-POINTING ANGLE BRACKET
		// CJK Radicals Supplement .. Enclosed CJK Letters and Months
		( 0x2e80 <= code && code <= 0x3247 && code !== 0x303f ) ||
		// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
		( 0x3250 <= code && code <= 0x4dbf ) ||
		// CJK Unified Ideographs .. Yi Radicals
		( 0x4e00 <= code && code <= 0xa4c6 ) ||
		// Hangul Jamo Extended-A
		( 0xa960 <= code && code <= 0xa97c ) ||
		// Hangul Syllables
		( 0xac00 <= code && code <= 0xd7a3 ) ||
		// CJK Compatibility Ideographs
		( 0xf900 <= code && code <= 0xfaff ) ||
		// Vertical Forms
		( 0xfe10 <= code && code <= 0xfe19 ) ||
		// CJK Compatibility Forms .. Small Form Variants
		( 0xfe30 <= code && code <= 0xfe6b ) ||
		// Halfwidth and Fullwidth Forms
		( 0xff01 <= code && code <= 0xff60 ) ||
		( 0xffe0 <= code && code <= 0xffe6 ) ||
		// Kana Supplement
		( 0x1b000 <= code && code <= 0x1b001 ) ||
		// Enclosed Ideographic Supplement
		( 0x1f200 <= code && code <= 0x1f251 ) ||
		// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
		( 0x20000 <= code && code <= 0x3fffd )
	) ) {
		return 2 ;
	}

	return 1 ;
} ;



// For a true/false type of result
unicode.isFullWidthCodePoint = code => unicode.codePointWidth( code ) === 2 ;



// Convert normal ASCII chars to their full-width counterpart.
// Only printable ASCII from '!' (33) to '~' (126) is shifted, everything else (space included) is left as is.
unicode.toFullWidth = str => {
	return String.fromCodePoint( ... Array.from( str , char => {
		const code = char.codePointAt( 0 ) ;
		return code >= 33 && code <= 126  ?  0xff00 + code - 0x20  :  code ;
	} ) ) ;
} ;



// Check if a character is a diacritic with zero-width or not
unicode.isZeroWidthDiacritic = char => unicode.isZeroWidthDiacriticCodePoint( char.codePointAt( 0 ) ) ;

// Some doc found here: https://en.wikipedia.org/wiki/Combining_character
// Diacritics and other characters that combines with previous one (zero-width)
unicode.isZeroWidthDiacriticCodePoint = code =>
	// Combining Diacritical Marks
	( 0x300 <= code && code <= 0x36f ) ||
	// Combining Diacritical Marks Extended
	( 0x1ab0 <= code && code <= 0x1aff ) ||
	// Combining Diacritical Marks Supplement
	( 0x1dc0 <= code && code <= 0x1dff ) ||
	// Combining Diacritical Marks for Symbols
	( 0x20d0 <= code && code <= 0x20ff ) ||
	// Combining Half Marks
	( 0xfe20 <= code && code <= 0xfe2f ) ||
	// Dakuten and handakuten (japanese)
	code === 0x3099 || code === 0x309a ||
	// Devanagari
	( 0x900 <= code && code <= 0x903 ) ||
	( 0x93a <= code && code <= 0x957 && code !== 0x93d && code !== 0x950 ) ||
	code === 0x962 || code === 0x963 ||
	// Thai
	code === 0xe31 ||
	( 0xe34 <= code && code <= 0xe3a ) ||
	( 0xe47 <= code && code <= 0xe4e ) ;



// Check if a character is an emoji or not
unicode.isEmoji = char => unicode.isEmojiCodePoint( char.codePointAt( 0 ) ) ;

// Some doc found here: https://stackoverflow.com/questions/30470079/emoji-value-range
unicode.isEmojiCodePoint = code =>
	// Miscellaneous symbols
	( 0x2600 <= code && code <= 0x26ff ) ||
	// Dingbats
	( 0x2700 <= code && code <= 0x27bf ) ||
	// Emoji
	( 0x1f000 <= code && code <= 0x1f1ff ) ||
	( 0x1f300 <= code && code <= 0x1f3fa ) ||
	( 0x1f400 <= code && code <= 0x1faff ) ;



// Emoji modifier
unicode.isEmojiModifier = char => unicode.isEmojiModifierCodePoint( char.codePointAt( 0 ) ) ;

unicode.isEmojiModifierCodePoint = code =>
	( 0x1f3fb <= code && code <= 0x1f3ff ) ||	// (Fitzpatrick): https://en.wikipedia.org/wiki/Miscellaneous_Symbols_and_Pictographs#Emoji_modifiers
	code === 0xfe0f ;	// VARIATION SELECTOR-16 [VS16] {emoji variation selector}