// anylang
// Version:
// A translator's kit that uses the free APIs of Google Translate, Yandex, Bing, ChatGPT, and other LLMs
// 65 lines (63 loc) • 8.33 kB
// JavaScript
/**
* This code is taken from https://github.com/zlargon/google-tts/blob/42bae63cf406c3cf20521e0cf36cbc5d9b9dce31/src/splitLongText.ts
* This code is under MIT license (2016 Leon Huang)
*/
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/Trim
const SPACE_REGEX = '\\s\\uFEFF\\xA0';
// https://remarkablemark.org/blog/2019/09/28/javascript-remove-punctuation/
const DEFAULT_PUNCTUATION_REGEX = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~';
// The built-in separator class never changes, so it is compiled once here
// instead of once per tested character. It has no `g`/`y` flag, so `.test()`
// keeps no state between calls.
const DEFAULT_SEPARATOR_REGEX = new RegExp('[' + SPACE_REGEX + DEFAULT_PUNCTUATION_REGEX + ']');
/**
 * Split a long text into a list of short texts.
 *
 * Each chunk holds at most `maxLength` UTF-16 code units and, whenever the
 * text has to be cut, ends on (or right before) a whitespace or punctuation
 * character so that words are not cut in the middle. Separators stay in the
 * output: joining the returned chunks yields the original text.
 *
 * @param {string} text text to split
 * @param {object?} option
 * @param {number?} option.maxLength maximum chunk length, default is 200.
 * Must be a number >= 1 (fractions are rounded down); `Infinity` disables splitting.
 * @param {string?} option.splitPunct extra characters to split on, default is ''.
 * Every character is matched literally (no regex syntax is interpreted).
 * @returns {string[]} short text list (`['']` for an empty text)
 * @throws {RangeError} when `maxLength` is not a number >= 1
 * @throws {Error} when a run of `maxLength` characters contains no separator
 */
export const splitLongText = (text, { maxLength = 200, splitPunct = '' } = {}) => {
	// A limit below 1 (or NaN) can never make progress: the scan would either
	// report a bogus "word too long" error or keep pushing empty chunks forever.
	if (typeof maxLength !== 'number' || !(maxLength >= 1)) {
		throw new RangeError('The option "maxLength" must be a number >= 1, got: ' + String(maxLength));
	}
	const limit = Math.floor(maxLength);

	// Extra separators are compared literally. Interpolating them into a regex
	// character class would let `]`, `\` or `-` change or break the pattern.
	// split('') yields UTF-16 code units, the same unit charAt() reads below.
	const extraSeparators = new Set(String(splitPunct).split(''));

	const isSpaceOrPunct = (index) => {
		// charAt() returns '' when out of range, which matches neither check.
		const ch = text.charAt(index);
		return DEFAULT_SEPARATOR_REGEX.test(ch) || extraSeparators.has(ch);
	};

	// Scan backwards over [left, right] for the closest separator.
	const lastIndexOfSpaceOrPunct = (left, right) => {
		for (let i = right; i >= left; i--) {
			if (isSpaceOrPunct(i)) {
				return i;
			}
		}
		return -1; // not found
	};

	const result = [];
	let start = 0;
	// Keep cutting while the remainder does not fit into a single chunk.
	while (text.length - start > limit) {
		// Last index the current chunk may occupy.
		let end = start + limit - 1;

		// The cut is clean when the chunk ends on a separator or the next chunk
		// starts with one; otherwise back up to the last separator in range.
		if (!isSpaceOrPunct(end) && !isSpaceOrPunct(end + 1)) {
			end = lastIndexOfSpaceOrPunct(start, end);
			if (end === -1) {
				const str = text.slice(start, start + limit);
				throw new Error('The word is too long to split into a short text:' +
					`\n${str} ...` +
					'\n\nTry the option "splitPunct" to split the text by punctuation.');
			}
		}

		result.push(text.slice(start, end + 1));
		start = end + 1;
	}

	// Remainder (possibly the whole text, possibly empty).
	result.push(text.slice(start));
	return result;
};
//# sourceMappingURL=data:application/json;charset=utf8;base64,eyJ2ZXJzaW9uIjozLCJzb3VyY2VzIjpbInV0aWxzL3RleHQvc3BsaXRMb25nVGV4dC50cyJdLCJuYW1lcyI6W10sIm1hcHBpbmdzIjoiQUFBQTs7O0dBR0c7QUFFSCwrRkFBK0Y7QUFDL0YsTUFBTSxXQUFXLEdBQUcsaUJBQWlCLENBQUM7QUFFdEMsNEVBQTRFO0FBQzVFLE1BQU0seUJBQXlCLEdBQUcsb0NBQW9DLENBQUM7QUFPdkU7Ozs7Ozs7OztHQVNHO0FBQ0gsTUFBTSxDQUFDLE1BQU0sYUFBYSxHQUFHLENBQzVCLElBQVksRUFDWixFQUFFLFNBQVMsR0FBRyxHQUFHLEVBQUUsVUFBVSxHQUFHLEVBQUUsS0FBYSxFQUFFLEVBQ3RDLEVBQUU7SUFDYixNQUFNLGNBQWMsR0FBRyxDQUFDLENBQVMsRUFBRSxDQUFTLEVBQUUsRUFBRTtRQUMvQyxNQUFNLEtBQUssR0FBRyxJQUFJLE1BQU0sQ0FDdkIsR0FBRyxHQUFHLFdBQVcsR0FBRyx5QkFBeUIsR0FBRyxVQUFVLEdBQUcsR0FBRyxDQUNoRSxDQUFDO1FBQ0YsT0FBTyxLQUFLLENBQUMsSUFBSSxDQUFDLENBQUMsQ0FBQyxNQUFNLENBQUMsQ0FBQyxDQUFDLENBQUMsQ0FBQztJQUNoQyxDQUFDLENBQUM7SUFFRixNQUFNLHVCQUF1QixHQUFHLENBQUMsQ0FBUyxFQUFFLElBQVksRUFBRSxLQUFhLEVBQVUsRUFBRTtRQUNsRixLQUFLLElBQUksQ0FBQyxHQUFHLEtBQUssRUFBRSxDQUFDLElBQUksSUFBSSxFQUFFLENBQUMsRUFBRSxFQUFFLENBQUM7WUFDcEMsSUFBSSxjQUFjLENBQUMsQ0FBQyxFQUFFLENBQUMsQ0FBQztnQkFBRSxPQUFPLENBQUMsQ0FBQztRQUNwQyxDQUFDO1FBQ0QsT0FBTyxDQUFDLENBQUMsQ0FBQyxDQUFDLFlBQVk7SUFDeEIsQ0FBQyxDQUFDO0lBRUYsTUFBTSxNQUFNLEdBQWEsRUFBRSxDQUFDO0lBQzVCLE1BQU0sU0FBUyxHQUFHLENBQUMsSUFBWSxFQUFFLEtBQWEsRUFBRSxHQUFXLEVBQUUsRUFBRTtRQUM5RCxNQUFNLENBQUMsSUFBSSxDQUFDLElBQUksQ0FBQyxLQUFLLENBQUMsS0FBSyxFQUFFLEdBQUcsR0FBRyxDQUFDLENBQUMsQ0FBQyxDQUFDO0lBQ3pDLENBQUMsQ0FBQztJQUVGLElBQUksS0FBSyxHQUFHLENBQUMsQ0FBQztJQUNkLFNBQVMsQ0FBQztRQUNULHNCQUFzQjtRQUN0QixJQUFJLElBQUksQ0FBQyxNQUFNLEdBQUcsS0FBSyxJQUFJLFNBQVMsRUFBRSxDQUFDO1lBQ3RDLFNBQVMsQ0FBQyxJQUFJLEVBQUUsS0FBSyxFQUFFLElBQUksQ0FBQyxNQUFNLEdBQUcsQ0FBQyxDQUFDLENBQUM7WUFDeEMsTUFBTSxDQUFDLGNBQWM7UUFDdEIsQ0FBQztRQUVELCtDQUErQztRQUMvQyxJQUFJLEdBQUcsR0FBRyxLQUFLLEdBQUcsU0FBUyxHQUFHLENBQUMsQ0FBQztRQUNoQyxJQUFJLGNBQWMsQ0FBQyxJQUFJLEVBQUUsR0FBRyxDQUFDLElBQUksY0FBYyxDQUFDLElBQUksRUFBRSxHQUFHLEdBQUcsQ0FBQyxDQUFDLEVBQUUsQ0FBQztZQUNoRSxTQUFTLENBQUMsSUFBSSxFQUFFLEtBQUssRUFBRSxHQUFHLENBQUMsQ0FBQztZQUM1QixLQUFLLEdBQUcsR0FBRyxHQUFHLENBQUMsQ
0FBQztZQUNoQixTQUFTO1FBQ1YsQ0FBQztRQUVELDJCQUEyQjtRQUMzQixHQUFHLEdBQUcsdUJBQXVCLENBQUMsSUFBSSxFQUFFLEtBQUssRUFBRSxHQUFHLENBQUMsQ0FBQztRQUNoRCxJQUFJLEdBQUcsS0FBSyxDQUFDLENBQUMsRUFBRSxDQUFDO1lBQ2hCLE1BQU0sR0FBRyxHQUFHLElBQUksQ0FBQyxLQUFLLENBQUMsS0FBSyxFQUFFLEtBQUssR0FBRyxTQUFTLENBQUMsQ0FBQztZQUNqRCxNQUFNLElBQUksS0FBSyxDQUNkLGtEQUFrRDtnQkFDakQsS0FBSyxHQUFHLE1BQU07Z0JBQ2QsbUVBQW1FLENBQ3BFLENBQUM7UUFDSCxDQUFDO1FBRUQsYUFBYTtRQUNiLFNBQVMsQ0FBQyxJQUFJLEVBQUUsS0FBSyxFQUFFLEdBQUcsQ0FBQyxDQUFDO1FBQzVCLEtBQUssR0FBRyxHQUFHLEdBQUcsQ0FBQyxDQUFDO0lBQ2pCLENBQUM7SUFFRCxPQUFPLE1BQU0sQ0FBQztBQUNmLENBQUMsQ0FBQyIsImZpbGUiOiJ1dGlscy90ZXh0L3NwbGl0TG9uZ1RleHQuanMiLCJzb3VyY2VzQ29udGVudCI6WyIvKipcbiAqIFRoaXMgY29kZSB0YWtlbiBmcm9tIGh0dHBzOi8vZ2l0aHViLmNvbS96bGFyZ29uL2dvb2dsZS10dHMvYmxvYi80MmJhZTYzY2Y0MDZjM2NmMjA1MjFlMGNmMzZjYmM1ZDliOWRjZTMxL3NyYy9zcGxpdExvbmdUZXh0LnRzXG4gKiBUaGlzIGNvZGUgaXMgdW5kZXIgTUlUIGxpY2Vuc2UgKDIwMTYgTGVvbiBIdWFuZylcbiAqL1xuXG4vLyBodHRwczovL2RldmVsb3Blci5tb3ppbGxhLm9yZy9lbi1VUy9kb2NzL1dlYi9KYXZhU2NyaXB0L1JlZmVyZW5jZS9HbG9iYWxfT2JqZWN0cy9TdHJpbmcvVHJpbVxuY29uc3QgU1BBQ0VfUkVHRVggPSAnXFxcXHNcXFxcdUZFRkZcXFxceEEwJztcblxuLy8gaHR0cHM6Ly9yZW1hcmthYmxlbWFyay5vcmcvYmxvZy8yMDE5LzA5LzI4L2phdmFzY3JpcHQtcmVtb3ZlLXB1bmN0dWF0aW9uL1xuY29uc3QgREVGQVVMVF9QVU5DVFVBVElPTl9SRUdFWCA9ICchXCIjJCUmXFwnKCkqKywtLi86Ozw9Pj9AW1xcXFxdXl9ge3x9fic7XG5cbmludGVyZmFjZSBPcHRpb24ge1xuXHRtYXhMZW5ndGg/OiBudW1iZXI7XG5cdHNwbGl0UHVuY3Q/OiBzdHJpbmc7XG59XG5cbi8qKlxuICogc3BsaXQgdGhlIGxvbmcgdGV4dCB0byBzaG9ydCB0ZXh0c1xuICogVGltZSBDb21wbGV4aXR5OiBPKG4pXG4gKlxuICogQHBhcmFtIHtzdHJpbmd9ICB0ZXh0XG4gKiBAcGFyYW0ge29iamVjdD99IG9wdGlvblxuICogQHBhcmFtIHtudW1iZXI/fSBvcHRpb24ubWF4TGVuZ3RoICBkZWZhdWx0IGlzIDIwMFxuICogQHBhcmFtIHtzdHJpbmc/fSBvcHRpb24uc3BsaXRQdW5jdCBkZWZhdWx0IGlzICcnXG4gKiBAcmV0dXJucyB7c3RyaW5nW119IHNob3J0IHRleHQgbGlzdFxuICovXG5leHBvcnQgY29uc3Qgc3BsaXRMb25nVGV4dCA9IChcblx0dGV4dDogc3RyaW5nLFxuXHR7IG1heExlbmd0aCA9IDIwMCwgc3BsaXRQdW5jdCA9ICcnIH06IE9wdGlvbiA9IHt9LFxuKTogc3RyaW5nW10gPT4ge1xuXHRjb25zdCBpc1NwYWNlT3JQdW5jdCA9I
ChzOiBzdHJpbmcsIGk6IG51bWJlcikgPT4ge1xuXHRcdGNvbnN0IHJlZ2V4ID0gbmV3IFJlZ0V4cChcblx0XHRcdCdbJyArIFNQQUNFX1JFR0VYICsgREVGQVVMVF9QVU5DVFVBVElPTl9SRUdFWCArIHNwbGl0UHVuY3QgKyAnXScsXG5cdFx0KTtcblx0XHRyZXR1cm4gcmVnZXgudGVzdChzLmNoYXJBdChpKSk7XG5cdH07XG5cblx0Y29uc3QgbGFzdEluZGV4T2ZTcGFjZU9yUHVuY3QgPSAoczogc3RyaW5nLCBsZWZ0OiBudW1iZXIsIHJpZ2h0OiBudW1iZXIpOiBudW1iZXIgPT4ge1xuXHRcdGZvciAobGV0IGkgPSByaWdodDsgaSA+PSBsZWZ0OyBpLS0pIHtcblx0XHRcdGlmIChpc1NwYWNlT3JQdW5jdChzLCBpKSkgcmV0dXJuIGk7XG5cdFx0fVxuXHRcdHJldHVybiAtMTsgLy8gbm90IGZvdW5kXG5cdH07XG5cblx0Y29uc3QgcmVzdWx0OiBzdHJpbmdbXSA9IFtdO1xuXHRjb25zdCBhZGRSZXN1bHQgPSAodGV4dDogc3RyaW5nLCBzdGFydDogbnVtYmVyLCBlbmQ6IG51bWJlcikgPT4ge1xuXHRcdHJlc3VsdC5wdXNoKHRleHQuc2xpY2Uoc3RhcnQsIGVuZCArIDEpKTtcblx0fTtcblxuXHRsZXQgc3RhcnQgPSAwO1xuXHRmb3IgKDs7KSB7XG5cdFx0Ly8gY2hlY2sgdGV4dCdzIGxlbmd0aFxuXHRcdGlmICh0ZXh0Lmxlbmd0aCAtIHN0YXJ0IDw9IG1heExlbmd0aCkge1xuXHRcdFx0YWRkUmVzdWx0KHRleHQsIHN0YXJ0LCB0ZXh0Lmxlbmd0aCAtIDEpO1xuXHRcdFx0YnJlYWs7IC8vIGVuZCBvZiB0ZXh0XG5cdFx0fVxuXG5cdFx0Ly8gY2hlY2sgd2hldGhlciB0aGUgd29yZCBpcyBjdXQgaW4gdGhlIG1pZGRsZS5cblx0XHRsZXQgZW5kID0gc3RhcnQgKyBtYXhMZW5ndGggLSAxO1xuXHRcdGlmIChpc1NwYWNlT3JQdW5jdCh0ZXh0LCBlbmQpIHx8IGlzU3BhY2VPclB1bmN0KHRleHQsIGVuZCArIDEpKSB7XG5cdFx0XHRhZGRSZXN1bHQodGV4dCwgc3RhcnQsIGVuZCk7XG5cdFx0XHRzdGFydCA9IGVuZCArIDE7XG5cdFx0XHRjb250aW51ZTtcblx0XHR9XG5cblx0XHQvLyBmaW5kIGxhc3QgaW5kZXggb2Ygc3BhY2Vcblx0XHRlbmQgPSBsYXN0SW5kZXhPZlNwYWNlT3JQdW5jdCh0ZXh0LCBzdGFydCwgZW5kKTtcblx0XHRpZiAoZW5kID09PSAtMSkge1xuXHRcdFx0Y29uc3Qgc3RyID0gdGV4dC5zbGljZShzdGFydCwgc3RhcnQgKyBtYXhMZW5ndGgpO1xuXHRcdFx0dGhyb3cgbmV3IEVycm9yKFxuXHRcdFx0XHQnVGhlIHdvcmQgaXMgdG9vIGxvbmcgdG8gc3BsaXQgaW50byBhIHNob3J0IHRleHQ6JyArXG5cdFx0XHRcdFx0YFxcbiR7c3RyfSAuLi5gICtcblx0XHRcdFx0XHQnXFxuXFxuVHJ5IHRoZSBvcHRpb24gXCJzcGxpdFB1bmN0XCIgdG8gc3BsaXQgdGhlIHRleHQgYnkgcHVuY3R1YXRpb24uJyxcblx0XHRcdCk7XG5cdFx0fVxuXG5cdFx0Ly8gYWRkIHJlc3VsdFxuXHRcdGFkZFJlc3VsdCh0ZXh0LCBzdGFydCwgZW5kKTtcblx0XHRzdGFydCA9IGVuZCArIDE7XG5cdH1cblxuXHRyZXR1cm4gcmVzdWx0O1xufTtcbiJdfQ==