/*
 * lexed
 * Version:
 * English word and sentence tokenizer, for natural language processing.
 * 42 lines (41 loc) • 2.27 kB
 * JavaScript
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
const abbreviations_1 = require("./abbreviations");
const escape_regex_1 = require("./escape_regex");
/**
 * Splits a text into sentences.
 *
 * Works in three passes:
 *   1. Split on sentence terminators (… . ? ! newline) and glue pieces that
 *      end in a known abbreviation or a single-letter initial onto the piece
 *      that follows them.
 *   2. Move a closing quote/bracket that starts the next sentence (followed
 *      by a space) back onto the sentence it closes.
 *   3. Merge a sentence consisting of a single closing quote/bracket into
 *      the sentence before it.
 *
 * @param {string} str - Text to tokenize.
 * @returns {string[]} Trimmed, non-empty sentences in input order.
 */
function default_1(str) {
    // Closing quote/bracket alternation used to inspect sentence starts.
    // NOTE(review): the extracted source ended this list with an unmatched
    // ")" (a syntax error). It is restored here as U+FF09 FULLWIDTH RIGHT
    // PARENTHESIS, presumably what was lost - confirm against upstream.
    // NOTE(review): "\\|`" is a single two-character alternative (pipe then
    // backtick); kept as-is to avoid changing tokenization results.
    const closerAlternation = "'|\"|]|}|\\)|>|\\/|\\|`|\"|\\*|”|“|«|»|”|”|」|«|﹂|’|⟧|›|⸥|】|⁆|﴿|}|〞|⦆|〉|》|\uFF09";
    // Alternation used when searching the original text. It is what the
    // original template literals evaluate to.
    // NOTE(review): it contains an empty alternative ("||"), so the adjacency
    // checks below also succeed when no closer sits between the sentence and
    // the following whitespace. Looks unintended; kept for compatibility.
    const closerAlternationInText = "'|\"|]|\\)|}|>|\\/||\\`|\"|\\*|”|“|«|»|”|”|」|«|﹂|’|⟧|›|⸥|】|⁆|﴿|}|〞|⦆|〉|》|\uFF09";
    const startsWithCloserAndSpace = new RegExp("^(" + closerAlternation + ") ");
    const isLoneCloser = new RegExp("^(" + closerAlternation + ")$");
    // Built once per call instead of once per piece: it does not depend on
    // the piece being examined.
    const endsWithAbbreviation = new RegExp("(^| |\\(|\\[|\\{)(" + abbreviations_1.default.map(escape_regex_1.default).join("|") + ")[.!?] ?$", "i");
    // A single letter (optionally dotted) after a space, pipe or dot.
    const endsWithInitial = /[ |\.][A-Za-z]\.?$/;
    const quoteMarks = ["'", '"', "`"];

    // The capture groups make split() keep the matched sentences; the
    // filter drops the empty strings and undefined captures it also emits.
    const pieces = str.split(/(\S.+?[….\?!\n])(?=\s+|$|"(\s|$)|'(\s|$)|\))/).filter(x => x);
    const sentences = [];

    // Pass 1: assemble sentences, joining across abbreviations/initials.
    pieces.forEach((piece, index) => {
        const single = piece.trim();
        if (endsWithAbbreviation.test(single) || endsWithInitial.test(single)) {
            const hasNext = index < pieces.length - 1;
            if (hasNext && !/^[A-Za-z]\s/.test(pieces[index + 1])) {
                // Not a real sentence end: carry this piece into the next one,
                // which is visited (with the merged value) on the next turn.
                pieces[index + 1] = single + ' ' + pieces[index + 1].trim();
            }
            else {
                sentences.push(single);
            }
        }
        else if (quoteMarks.indexOf(single) !== -1) {
            if (sentences.length > 0) {
                sentences[sentences.length - 1] = sentences[sentences.length - 1] + single;
            }
            else {
                // No sentence to attach to yet; keep the quote rather than
                // writing "undefined" + quote to index -1 and losing it.
                sentences.push(single);
            }
        }
        else if (single) {
            sentences.push(single);
        }
    });

    // Pass 2: a closer plus space at the start of the next sentence belongs
    // to the current one, provided the original text has them adjacent.
    sentences.forEach((item, index) => {
        const next = sentences[index + 1];
        if (next &&
            startsWithCloserAndSpace.test(next) &&
            new RegExp(escape_regex_1.default(item) + "(" + closerAlternationInText + ") ").test(str)) {
            sentences[index] = item + next.charAt(0);
            // Drop the moved closer and the space after it.
            sentences[index + 1] = next.slice(2);
        }
    });

    // Pass 3: a sentence that is only a closer is appended to its predecessor
    // when the original text shows it followed by whitespace.
    sentences.forEach((item, index) => {
        const previous = sentences[index - 1];
        if (item.length === 1 &&
            previous &&
            isLoneCloser.test(item) &&
            new RegExp(escape_regex_1.default(previous) + "(" + closerAlternationInText + ")\\s").test(str)) {
            sentences[index - 1] = previous + item;
            sentences[index] = "";
        }
    });

    // Remove the slots emptied by pass 3.
    return sentences.filter(x => x);
}
exports.default = default_1;