anylang
Version:
A translator's kit that uses the free APIs of Google Translate, Yandex, Bing, ChatGPT, and other LLMs
71 lines (69 loc) • 8.5 kB
JavaScript
// CommonJS interop prologue (transpiler output).
// Defines the `__esModule` flag on `exports`; this is the conventional marker
// consumers use to recognise a module that was transpiled from ES module syntax.
Object.defineProperty(exports, "__esModule", {
value: true
});
// Pre-declare the named export as `undefined`; the real value is assigned
// after `splitLongText` has been defined further down in this file.
exports.splitLongText = void 0;
/**
* This code taken from https://github.com/zlargon/google-tts/blob/42bae63cf406c3cf20521e0cf36cbc5d9b9dce31/src/splitLongText.ts
* This code is under MIT license (2016 Leon Huang)
*/
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/Trim
const SPACE_REGEX = '\\s\\uFEFF\\xA0';
// https://remarkablemark.org/blog/2019/09/28/javascript-remove-punctuation/
// NOTE: this string is a regex character-class *body* (it is spliced between
// `[` and `]`), so `\\]` is an escaped `]` and `,-.` is a range that also
// covers `-`.
const DEFAULT_PUNCTUATION_REGEX = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~';
/**
 * Split a long text into a list of short texts, each at most `maxLength`
 * UTF-16 code units long, cutting only after a whitespace or punctuation
 * character so that words are never cut in the middle.
 *
 * The break character stays at the end of the chunk it terminates, so
 * joining the returned chunks with '' reproduces the original text.
 *
 * Time Complexity: O(n)
 *
 * @param {string} text
 * @param {object?} option
 * @param {number?} option.maxLength maximum length of every chunk, must be >= 1; default is 200
 * @param {string?} option.splitPunct extra characters (taken literally) that
 *   are allowed as split points in addition to whitespace and ASCII
 *   punctuation; default is ''
 * @returns {string[]} short text list (`['']` for an empty text)
 * @throws {RangeError} when `maxLength` is not a number >= 1
 * @throws {Error} when a run of `maxLength` characters contains no split point
 */
const splitLongText = (text, {
  maxLength = 200,
  splitPunct = ''
} = {}) => {
  // A maxLength below 1 (or NaN) can never make progress: the original loop
  // kept pushing empty chunks forever when the text started with a break
  // character. The negated comparison also rejects NaN.
  if (!(maxLength >= 1)) {
    throw new RangeError('The option "maxLength" must be a number >= 1, got: ' + String(maxLength));
  }

  // splitPunct is inserted into a character class, so the characters that are
  // special inside `[...]` must be escaped. Unescaped, a `]` closed the class
  // early (the regex then never matched a single character) and a `\` or a
  // badly placed `-` produced an invalid regular expression.
  const escapedSplitPunct = String(splitPunct).replace(/[\\\]^-]/g, '\\$&');

  // Built once per call instead of once per tested character. The regex has
  // no `g`/`y` flag, so `test` is stateless and safe to reuse.
  const breakCharRegex = new RegExp('[' + SPACE_REGEX + DEFAULT_PUNCTUATION_REGEX + escapedSplitPunct + ']');

  // charAt returns '' for out-of-range indexes, which never matches.
  const isSpaceOrPunct = (s, i) => breakCharRegex.test(s.charAt(i));

  // Scan backwards through [left, right] for the closest split point.
  const lastIndexOfSpaceOrPunct = (s, left, right) => {
    for (let i = right; i >= left; i--) {
      if (isSpaceOrPunct(s, i)) return i;
    }
    return -1; // not found
  };

  const result = [];
  // `end` is inclusive.
  const addResult = (s, from, end) => {
    result.push(s.slice(from, end + 1));
  };

  let start = 0;
  for (;;) {
    // the remainder fits into a single chunk
    if (text.length - start <= maxLength) {
      addResult(text, start, text.length - 1);
      break; // end of text
    }

    // A full-size chunk is fine when it ends on a break character, or when
    // the character right after it is one (no word is cut in the middle).
    let end = start + maxLength - 1;
    if (isSpaceOrPunct(text, end) || isSpaceOrPunct(text, end + 1)) {
      addResult(text, start, end);
      start = end + 1;
      continue;
    }

    // otherwise shrink the chunk back to the last split point inside it
    end = lastIndexOfSpaceOrPunct(text, start, end);
    if (end === -1) {
      const str = text.slice(start, start + maxLength);
      throw new Error('The word is too long to split into a short text:' + `\n${str} ...` + '\n\nTry the option "splitPunct" to split the text by punctuation.');
    }

    // add result
    addResult(text, start, end);
    start = end + 1;
  }
  return result;
};
// Publish the function as the named CommonJS export `splitLongText`.
exports.splitLongText = splitLongText;
//# sourceMappingURL=data:application/json;charset=utf8;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoidXRpbHMvdGV4dC9zcGxpdExvbmdUZXh0LmpzIiwibmFtZXMiOlsiU1BBQ0VfUkVHRVgiLCJERUZBVUxUX1BVTkNUVUFUSU9OX1JFR0VYIiwic3BsaXRMb25nVGV4dCIsInRleHQiLCJtYXhMZW5ndGgiLCJzcGxpdFB1bmN0IiwiaXNTcGFjZU9yUHVuY3QiLCJzIiwiaSIsInJlZ2V4IiwiUmVnRXhwIiwidGVzdCIsImNoYXJBdCIsImxhc3RJbmRleE9mU3BhY2VPclB1bmN0IiwibGVmdCIsInJpZ2h0IiwicmVzdWx0IiwiYWRkUmVzdWx0Iiwic3RhcnQiLCJlbmQiLCJwdXNoIiwic2xpY2UiLCJsZW5ndGgiLCJzdHIiLCJFcnJvciIsImV4cG9ydHMiXSwic291cmNlcyI6WyJ1dGlscy90ZXh0L3NwbGl0TG9uZ1RleHQudHMiXSwic291cmNlc0NvbnRlbnQiOlsiLyoqXG4gKiBUaGlzIGNvZGUgdGFrZW4gZnJvbSBodHRwczovL2dpdGh1Yi5jb20vemxhcmdvbi9nb29nbGUtdHRzL2Jsb2IvNDJiYWU2M2NmNDA2YzNjZjIwNTIxZTBjZjM2Y2JjNWQ5YjlkY2UzMS9zcmMvc3BsaXRMb25nVGV4dC50c1xuICogVGhpcyBjb2RlIGlzIHVuZGVyIE1JVCBsaWNlbnNlICgyMDE2IExlb24gSHVhbmcpXG4gKi9cblxuLy8gaHR0cHM6Ly9kZXZlbG9wZXIubW96aWxsYS5vcmcvZW4tVVMvZG9jcy9XZWIvSmF2YVNjcmlwdC9SZWZlcmVuY2UvR2xvYmFsX09iamVjdHMvU3RyaW5nL1RyaW1cbmNvbnN0IFNQQUNFX1JFR0VYID0gJ1xcXFxzXFxcXHVGRUZGXFxcXHhBMCc7XG5cbi8vIGh0dHBzOi8vcmVtYXJrYWJsZW1hcmsub3JnL2Jsb2cvMjAxOS8wOS8yOC9qYXZhc2NyaXB0LXJlbW92ZS1wdW5jdHVhdGlvbi9cbmNvbnN0IERFRkFVTFRfUFVOQ1RVQVRJT05fUkVHRVggPSAnIVwiIyQlJlxcJygpKissLS4vOjs8PT4/QFtcXFxcXV5fYHt8fX4nO1xuXG5pbnRlcmZhY2UgT3B0aW9uIHtcblx0bWF4TGVuZ3RoPzogbnVtYmVyO1xuXHRzcGxpdFB1bmN0Pzogc3RyaW5nO1xufVxuXG4vKipcbiAqIHNwbGl0IHRoZSBsb25nIHRleHQgdG8gc2hvcnQgdGV4dHNcbiAqIFRpbWUgQ29tcGxleGl0eTogTyhuKVxuICpcbiAqIEBwYXJhbSB7c3RyaW5nfSAgdGV4dFxuICogQHBhcmFtIHtvYmplY3Q/fSBvcHRpb25cbiAqIEBwYXJhbSB7bnVtYmVyP30gb3B0aW9uLm1heExlbmd0aCAgZGVmYXVsdCBpcyAyMDBcbiAqIEBwYXJhbSB7c3RyaW5nP30gb3B0aW9uLnNwbGl0UHVuY3QgZGVmYXVsdCBpcyAnJ1xuICogQHJldHVybnMge3N0cmluZ1tdfSBzaG9ydCB0ZXh0IGxpc3RcbiAqL1xuZXhwb3J0IGNvbnN0IHNwbGl0TG9uZ1RleHQgPSAoXG5cdHRleHQ6IHN0cmluZyxcblx0eyBtYXhMZW5ndGggPSAyMDAsIHNwbGl0UHVuY3QgPSAnJyB9OiBPcHRpb24gPSB7fSxcbik6IHN0cmluZ1tdID0+IHtcblx0Y29uc3QgaXNTcGFjZU9yUHVuY3QgPSAoczogc3RyaW5nLCBpOiBudW1iZXIpID0+IHtcblx0XHRjb25zdCByZWdleCA9IG5ldyBSZWdFeHAoXG5cdFx0X
HQnWycgKyBTUEFDRV9SRUdFWCArIERFRkFVTFRfUFVOQ1RVQVRJT05fUkVHRVggKyBzcGxpdFB1bmN0ICsgJ10nLFxuXHRcdCk7XG5cdFx0cmV0dXJuIHJlZ2V4LnRlc3Qocy5jaGFyQXQoaSkpO1xuXHR9O1xuXG5cdGNvbnN0IGxhc3RJbmRleE9mU3BhY2VPclB1bmN0ID0gKHM6IHN0cmluZywgbGVmdDogbnVtYmVyLCByaWdodDogbnVtYmVyKTogbnVtYmVyID0+IHtcblx0XHRmb3IgKGxldCBpID0gcmlnaHQ7IGkgPj0gbGVmdDsgaS0tKSB7XG5cdFx0XHRpZiAoaXNTcGFjZU9yUHVuY3QocywgaSkpIHJldHVybiBpO1xuXHRcdH1cblx0XHRyZXR1cm4gLTE7IC8vIG5vdCBmb3VuZFxuXHR9O1xuXG5cdGNvbnN0IHJlc3VsdDogc3RyaW5nW10gPSBbXTtcblx0Y29uc3QgYWRkUmVzdWx0ID0gKHRleHQ6IHN0cmluZywgc3RhcnQ6IG51bWJlciwgZW5kOiBudW1iZXIpID0+IHtcblx0XHRyZXN1bHQucHVzaCh0ZXh0LnNsaWNlKHN0YXJ0LCBlbmQgKyAxKSk7XG5cdH07XG5cblx0bGV0IHN0YXJ0ID0gMDtcblx0Zm9yICg7Oykge1xuXHRcdC8vIGNoZWNrIHRleHQncyBsZW5ndGhcblx0XHRpZiAodGV4dC5sZW5ndGggLSBzdGFydCA8PSBtYXhMZW5ndGgpIHtcblx0XHRcdGFkZFJlc3VsdCh0ZXh0LCBzdGFydCwgdGV4dC5sZW5ndGggLSAxKTtcblx0XHRcdGJyZWFrOyAvLyBlbmQgb2YgdGV4dFxuXHRcdH1cblxuXHRcdC8vIGNoZWNrIHdoZXRoZXIgdGhlIHdvcmQgaXMgY3V0IGluIHRoZSBtaWRkbGUuXG5cdFx0bGV0IGVuZCA9IHN0YXJ0ICsgbWF4TGVuZ3RoIC0gMTtcblx0XHRpZiAoaXNTcGFjZU9yUHVuY3QodGV4dCwgZW5kKSB8fCBpc1NwYWNlT3JQdW5jdCh0ZXh0LCBlbmQgKyAxKSkge1xuXHRcdFx0YWRkUmVzdWx0KHRleHQsIHN0YXJ0LCBlbmQpO1xuXHRcdFx0c3RhcnQgPSBlbmQgKyAxO1xuXHRcdFx0Y29udGludWU7XG5cdFx0fVxuXG5cdFx0Ly8gZmluZCBsYXN0IGluZGV4IG9mIHNwYWNlXG5cdFx0ZW5kID0gbGFzdEluZGV4T2ZTcGFjZU9yUHVuY3QodGV4dCwgc3RhcnQsIGVuZCk7XG5cdFx0aWYgKGVuZCA9PT0gLTEpIHtcblx0XHRcdGNvbnN0IHN0ciA9IHRleHQuc2xpY2Uoc3RhcnQsIHN0YXJ0ICsgbWF4TGVuZ3RoKTtcblx0XHRcdHRocm93IG5ldyBFcnJvcihcblx0XHRcdFx0J1RoZSB3b3JkIGlzIHRvbyBsb25nIHRvIHNwbGl0IGludG8gYSBzaG9ydCB0ZXh0OicgK1xuXHRcdFx0XHRcdGBcXG4ke3N0cn0gLi4uYCArXG5cdFx0XHRcdFx0J1xcblxcblRyeSB0aGUgb3B0aW9uIFwic3BsaXRQdW5jdFwiIHRvIHNwbGl0IHRoZSB0ZXh0IGJ5IHB1bmN0dWF0aW9uLicsXG5cdFx0XHQpO1xuXHRcdH1cblxuXHRcdC8vIGFkZCByZXN1bHRcblx0XHRhZGRSZXN1bHQodGV4dCwgc3RhcnQsIGVuZCk7XG5cdFx0c3RhcnQgPSBlbmQgKyAxO1xuXHR9XG5cblx0cmV0dXJuIHJlc3VsdDtcbn07XG4iXSwibWFwcGluZ3MiOiI7Ozs7OztBQUFBOzs7O0FBS0E7QUFDQSxNQUFNQSxXQUFXLEdBQUcsaUJBQWlCO0FBRXJDO0FBQ0EsTUFBT
UMseUJBQXlCLEdBQUcsb0NBQW9DO0FBT3RFOzs7Ozs7Ozs7O0FBVU8sTUFBTUMsYUFBYSxHQUFHQSxDQUM1QkMsSUFBWSxFQUNaO0VBQUVDLFNBQVMsR0FBRyxHQUFHO0VBQUVDLFVBQVUsR0FBRztBQUFFLElBQWEsRUFBRSxLQUNwQztFQUNiLE1BQU1DLGNBQWMsR0FBR0EsQ0FBQ0MsQ0FBUyxFQUFFQyxDQUFTLEtBQUk7SUFDL0MsTUFBTUMsS0FBSyxHQUFHLElBQUlDLE1BQU0sQ0FDdkIsR0FBRyxHQUFHVixXQUFXLEdBQUdDLHlCQUF5QixHQUFHSSxVQUFVLEdBQUcsR0FBRyxDQUNoRTtJQUNELE9BQU9JLEtBQUssQ0FBQ0UsSUFBSSxDQUFDSixDQUFDLENBQUNLLE1BQU0sQ0FBQ0osQ0FBQyxDQUFDLENBQUM7RUFDL0IsQ0FBQztFQUVELE1BQU1LLHVCQUF1QixHQUFHQSxDQUFDTixDQUFTLEVBQUVPLElBQVksRUFBRUMsS0FBYSxLQUFZO0lBQ2xGLEtBQUssSUFBSVAsQ0FBQyxHQUFHTyxLQUFLLEVBQUVQLENBQUMsSUFBSU0sSUFBSSxFQUFFTixDQUFDLEVBQUUsRUFBRTtNQUNuQyxJQUFJRixjQUFjLENBQUNDLENBQUMsRUFBRUMsQ0FBQyxDQUFDLEVBQUUsT0FBT0EsQ0FBQztJQUNuQztJQUNBLE9BQU8sQ0FBQyxDQUFDLENBQUMsQ0FBQztFQUNaLENBQUM7RUFFRCxNQUFNUSxNQUFNLEdBQWEsRUFBRTtFQUMzQixNQUFNQyxTQUFTLEdBQUdBLENBQUNkLElBQVksRUFBRWUsS0FBYSxFQUFFQyxHQUFXLEtBQUk7SUFDOURILE1BQU0sQ0FBQ0ksSUFBSSxDQUFDakIsSUFBSSxDQUFDa0IsS0FBSyxDQUFDSCxLQUFLLEVBQUVDLEdBQUcsR0FBRyxDQUFDLENBQUMsQ0FBQztFQUN4QyxDQUFDO0VBRUQsSUFBSUQsS0FBSyxHQUFHLENBQUM7RUFDYixTQUFTO0lBQ1I7SUFDQSxJQUFJZixJQUFJLENBQUNtQixNQUFNLEdBQUdKLEtBQUssSUFBSWQsU0FBUyxFQUFFO01BQ3JDYSxTQUFTLENBQUNkLElBQUksRUFBRWUsS0FBSyxFQUFFZixJQUFJLENBQUNtQixNQUFNLEdBQUcsQ0FBQyxDQUFDO01BQ3ZDLE1BQU0sQ0FBQztJQUNSO0lBRUE7SUFDQSxJQUFJSCxHQUFHLEdBQUdELEtBQUssR0FBR2QsU0FBUyxHQUFHLENBQUM7SUFDL0IsSUFBSUUsY0FBYyxDQUFDSCxJQUFJLEVBQUVnQixHQUFHLENBQUMsSUFBSWIsY0FBYyxDQUFDSCxJQUFJLEVBQUVnQixHQUFHLEdBQUcsQ0FBQyxDQUFDLEVBQUU7TUFDL0RGLFNBQVMsQ0FBQ2QsSUFBSSxFQUFFZSxLQUFLLEVBQUVDLEdBQUcsQ0FBQztNQUMzQkQsS0FBSyxHQUFHQyxHQUFHLEdBQUcsQ0FBQztNQUNmO0lBQ0Q7SUFFQTtJQUNBQSxHQUFHLEdBQUdOLHVCQUF1QixDQUFDVixJQUFJLEVBQUVlLEtBQUssRUFBRUMsR0FBRyxDQUFDO0lBQy9DLElBQUlBLEdBQUcsS0FBSyxDQUFDLENBQUMsRUFBRTtNQUNmLE1BQU1JLEdBQUcsR0FBR3BCLElBQUksQ0FBQ2tCLEtBQUssQ0FBQ0gsS0FBSyxFQUFFQSxLQUFLLEdBQUdkLFNBQVMsQ0FBQztNQUNoRCxNQUFNLElBQUlvQixLQUFLLENBQ2Qsa0RBQWtELEdBQ2pELEtBQUtELEdBQUcsTUFBTSxHQUNkLG1FQUFtRSxDQUNwRTtJQUNGO0lBRUE7SUFDQU4sU0FBU
yxDQUFDZCxJQUFJLEVBQUVlLEtBQUssRUFBRUMsR0FBRyxDQUFDO0lBQzNCRCxLQUFLLEdBQUdDLEdBQUcsR0FBRyxDQUFDO0VBQ2hCO0VBRUEsT0FBT0gsTUFBTTtBQUNkLENBQUM7QUFBQ1MsT0FBQSxDQUFBdkIsYUFBQSxHQUFBQSxhQUFBIiwiaWdub3JlTGlzdCI6W119
;