// phpjs
// Version:
// 105 lines (99 loc) • 3.77 kB
// JavaScript
function str_word_count(str, format, charlist) {
  // From: http://phpjs.org/functions
  // +   original by: Ole Vrijenhoek
  // +   bugfixed by: Kevin van Zonneveld (http://kevin.vanzonneveld.net)
  // +   bugfixed by: Brett Zamir (http://brett-zamir.me)
  // +   input by: Bug?
  // +   bugfixed by: Brett Zamir (http://brett-zamir.me)
  // +   improved by: Brett Zamir (http://brett-zamir.me)
  // -    depends on: ctype_alpha (used when available on `this`; otherwise
  //                  an ASCII-letter check is used instead)
  // *     example 1: str_word_count("Hello fri3nd, you're\r\n looking good today!", 1);
  // *     returns 1: ['Hello', 'fri', 'nd', "you're", 'looking', 'good', 'today']
  // *     example 2: str_word_count("Hello fri3nd, you're\r\n looking good today!", 2);
  // *     returns 2: {0: 'Hello', 6: 'fri', 10: 'nd', 14: "you're", 23: 'looking', 31: 'good', 36: 'today'}
  // *     example 3: str_word_count("Hello fri3nd, you're\r\n looking good today!", 1, '\u00e0\u00e1\u00e3\u00e73');
  // *     returns 3: ['Hello', 'fri3nd', "you're", 'looking', 'good', 'today']
  //
  // Counts or extracts the words of `str`. A word is a maximal run of
  // characters that are alphabetic, listed in `charlist`, a hyphen that is
  // neither the first nor the last character of `str`, or an apostrophe that
  // is not the first character of `str`.
  //
  // str      - the string to scan.
  // format   - falsy (omitted, 0, ...): return the number of words;
  //            1: return an array of the words;
  //            2: return an object mapping each word's start offset
  //               (in UTF-16 code units) to the word.
  // charlist - optional string of additional characters to treat as word
  //            characters.
  //
  // Throws a string (kept as a string, not an Error, so existing callers
  // that compare the thrown value keep working) when `format` is not one of
  // the values above or when `str`/`charlist` contains an unpaired surrogate.
  var len = str.length,
    cl = charlist && charlist.length,
    self = this, // receiver that may provide ctype_alpha
    chr = '',
    pattern = '',
    reg = null,
    word = '',
    wordStart = 0,
    i = 0,
    c = '',
    words = [],
    count = 0,
    assoc = {},
    match = false;

  // Escapes regular-expression metacharacters so a character can be used
  // literally inside a pattern.
  var _preg_quote = function (s) {
    return (s + '').replace(/([\\\.\+\*\?\[\^\]\$\(\)\{\}\=\!<>\|\:])/g, '\\$1');
  };

  // Returns the full character starting at index `idx`: a single BMP code
  // unit, or both halves of a surrogate pair when `idx` is on the high
  // surrogate. Returns false when `idx` is on the low surrogate of a valid
  // pair (already consumed with its high surrogate). Throws on unpaired
  // surrogates.
  var _getWholeChar = function (s, idx) {
    var code = s.charCodeAt(idx),
      other;
    if (code < 0xD800 || code > 0xDFFF) {
      return s.charAt(idx);
    }
    if (code <= 0xDBFF) { // High surrogate
      if (s.length <= idx + 1) {
        throw 'High surrogate without following low surrogate';
      }
      other = s.charCodeAt(idx + 1);
      if (other < 0xDC00 || other > 0xDFFF) {
        throw 'High surrogate without following low surrogate';
      }
      return s.charAt(idx) + s.charAt(idx + 1);
    }
    // Low surrogate (0xDC00 <= code && code <= 0xDFFF)
    if (idx === 0) {
      throw 'Low surrogate without preceding high surrogate';
    }
    other = s.charCodeAt(idx - 1);
    if (other < 0xD800 || other > 0xDBFF) {
      throw 'Low surrogate without preceding high surrogate';
    }
    return false;
  };

  // Alphabetic test: delegate to ctype_alpha on the receiver when present
  // (called with the same receiver, as before); otherwise fall back to
  // ASCII letters so the function also works when called standalone.
  var _isAlpha = function (ch) {
    if (self && typeof self.ctype_alpha === 'function') {
      return self.ctype_alpha(ch);
    }
    return /^[A-Za-z]+$/.test(ch);
  };

  // Records the word collected so far and resets the accumulator.
  var _flush = function () {
    if (format !== 2) {
      words[words.length] = word;
    } else {
      assoc[wordStart] = word;
    }
    word = '';
    count++;
  };

  if (cl) {
    // Build an anchored alternation that matches exactly one charlist entry.
    pattern = '^(' + _preg_quote(_getWholeChar(charlist, 0));
    for (i = 1; i < cl; i++) {
      chr = _getWholeChar(charlist, i);
      if (chr === false) {
        continue; // second half of a pair that was already added
      }
      pattern += '|' + _preg_quote(chr);
    }
    pattern += ')$';
    reg = new RegExp(pattern);
  }

  for (i = 0; i < len; i++) {
    c = _getWholeChar(str, i);
    if (c === false) {
      continue; // second half of a pair that was already handled
    }
    match = _isAlpha(c) ||
      (reg !== null && reg.test(c)) ||
      (c === '-' && i !== 0 && i !== len - 1) || // No hyphen at beginning or end unless allowed in charlist (or locale)
      (c === "'" && i !== 0); // No apostrophe at beginning unless allowed in charlist (or locale)
    if (match) {
      if (word === '') {
        wordStart = i;
      }
      word += c;
    } else if (word !== '') {
      _flush();
    }
  }
  // Flush after the loop rather than on `i === len - 1`: that check recorded
  // an empty word when the string ended in several non-word characters and
  // was skipped entirely when the string ended in a surrogate pair.
  if (word !== '') {
    _flush();
  }

  if (!format) {
    return count;
  } else if (format === 1) {
    return words;
  } else if (format === 2) {
    return assoc;
  }
  throw 'You have supplied an incorrect format';
}