UNPKG

phpjs

Version:

php.js offers community-built PHP functions in JavaScript.

105 lines (99 loc) 3.77 kB
function str_word_count(str, format, charlist) {
  // From: http://phpjs.org/functions
  // +   original by: Ole Vrijenhoek
  // +   bugfixed by: Kevin van Zonneveld (http://kevin.vanzonneveld.net)
  // +   bugfixed by: Brett Zamir (http://brett-zamir.me)
  // +   input by: Bug?
  // +   bugfixed by: Brett Zamir (http://brett-zamir.me)
  // +   improved by: Brett Zamir (http://brett-zamir.me)
  // -    depends on: ctype_alpha (optional: when `this.ctype_alpha` is not a
  //                  function, only ASCII letters A-Z / a-z count as alphabetic)
  // *     example 1: str_word_count("Hello fri3nd, you're\r\n looking good today!", 1);
  // *     returns 1: ['Hello', 'fri', 'nd', "you're", 'looking', 'good', 'today']
  // *     example 2: str_word_count("Hello fri3nd, you're\r\n looking good today!", 2);
  // *     returns 2: {0: 'Hello', 6: 'fri', 10: 'nd', 14: "you're", 23: 'looking', 31: 'good', 36: 'today'}
  // *     example 3: str_word_count("Hello fri3nd, you're\r\n looking good today!", 1, '\u00e0\u00e1\u00e3\u00e73');
  // *     returns 3: ['Hello', 'fri3nd', "you're", 'looking', 'good', 'today']
  //
  // Counts or extracts the words in `str`.
  //
  // A word is a run of characters each of which is alphabetic, or listed in
  // `charlist`, or an apostrophe that is not the first character of `str`, or
  // a hyphen that is neither the first nor the last character of `str`.
  //
  // Parameters:
  //   str      - the string to scan.
  //   format   - falsy: return the number of words;
  //              1: return an array of the words;
  //              2: return an object mapping each word's start offset
  //                 (in UTF-16 code units) to the word.
  //   charlist - optional string of extra characters to treat as word characters.
  //
  // Throws (as plain strings, kept for compatibility with existing callers):
  //   'You have supplied an incorrect format' for any other `format`;
  //   a surrogate-related message when `str` or `charlist` contains an
  //   unpaired surrogate.
  var len = str.length,
    cl = charlist && charlist.length,
    self = this,
    chr = '',
    tmpStr = '',
    i = 0,
    c = '',
    wArr = [],
    wC = 0,
    assoc = {},
    aC = 0,
    reg = '',
    match = false;

  // `this` is undefined for a plain call in strict mode / ES modules, so the
  // dependency is probed before use instead of being dereferenced blindly.
  var hasCtypeAlpha = self !== undefined && self !== null &&
    typeof self.ctype_alpha === 'function';

  // BEGIN STATIC
  // Escapes regular-expression metacharacters so a character can be used literally.
  var _preg_quote = function(str) {
    return (str + '').replace(/([\\\.\+\*\?\[\^\]\$\(\)\{\}\=\!<>\|\:])/g, '\\$1');
  };

  // Returns the full character starting at index i (both halves of a surrogate
  // pair when i is on a high surrogate), or false when i is on the low half of
  // a pair that was already returned. Used for rare cases of non-BMP characters.
  var _getWholeChar = function(str, i) {
    var code = str.charCodeAt(i);
    if (code < 0xD800 || code > 0xDFFF) {
      return str.charAt(i);
    }
    if (0xD800 <= code && code <= 0xDBFF) {
      // High surrogate (could change last hex to 0xDB7F to treat high private
      // surrogates as single characters)
      if (str.length <= (i + 1)) {
        throw 'High surrogate without following low surrogate';
      }
      var next = str.charCodeAt(i + 1);
      if (0xDC00 > next || next > 0xDFFF) {
        throw 'High surrogate without following low surrogate';
      }
      return str.charAt(i) + str.charAt(i + 1);
    }
    // Low surrogate (0xDC00 <= code && code <= 0xDFFF)
    if (i === 0) {
      throw 'Low surrogate without preceding high surrogate';
    }
    var prev = str.charCodeAt(i - 1);
    if (0xD800 > prev || prev > 0xDBFF) {
      // (could change last hex to 0xDB7F to treat high private surrogates as
      // single characters)
      throw 'Low surrogate without preceding high surrogate';
    }
    // We can pass over low surrogates now as the second component in a pair
    // which we have already processed
    return false;
  };

  // Alphabetic test: delegates to this.ctype_alpha when available, otherwise
  // falls back to ASCII letters.
  var _isAlpha = function(ch) {
    if (hasCtypeAlpha) {
      return self.ctype_alpha(ch);
    }
    return /^[A-Za-z]+$/.test(ch);
  };

  // Records the pending word, if any. Guarding on a non-empty buffer keeps
  // empty strings from being counted as words.
  var _flushWord = function() {
    if (tmpStr === '') {
      return;
    }
    if (format !== 2) {
      wArr[wArr.length] = tmpStr;
    } else {
      assoc[aC] = tmpStr;
    }
    tmpStr = '';
    wC++;
  };
  // END STATIC

  if (cl) {
    // Build an anchored alternation, ^(a|b|c)$, matching exactly one charlist character.
    reg = '^(' + _preg_quote(_getWholeChar(charlist, 0));
    for (i = 1; i < cl; i++) {
      if ((chr = _getWholeChar(charlist, i)) === false) {
        continue;
      }
      reg += '|' + _preg_quote(chr);
    }
    reg += ')$';
    reg = new RegExp(reg);
  }

  for (i = 0; i < len; i++) {
    if ((c = _getWholeChar(str, i)) === false) {
      continue;
    }
    match = _isAlpha(c) ||
      (reg && c.search(reg) !== -1) ||
      // No hyphen at beginning or end unless allowed in charlist (or locale)
      ((i !== 0 && i !== len - 1) && c === '-') ||
      // No apostrophe at beginning unless allowed in charlist (or locale)
      (i !== 0 && c === "'");
    if (match) {
      if (tmpStr === '') {
        aC = i; // remember where the current word starts
      }
      tmpStr = tmpStr + c;
    } else {
      _flushWord();
    }
  }
  // Flush after the loop rather than at i === len - 1: the final index may be
  // the low half of a surrogate pair, which the loop skips.
  _flushWord();

  if (!format) {
    return wC;
  } else if (format === 1) {
    return wArr;
  } else if (format === 2) {
    return assoc;
  }
  throw 'You have supplied an incorrect format';
}