UNPKG

lexed

Version:

English word and sentence tokenizer, for natural language processing.

github.com/finnlp/lexed

42 lines (41 loc) • 2.27 kB

JavaScript

"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const abbreviations_1 = require("./abbreviations");
const escape_regex_1 = require("./escape_regex");

// Splits raw text into candidate segments: a run starting at a non-space
// character and ending at a terminator (… . ? ! or newline) that is followed
// by whitespace, end of input, a quote, or a closing parenthesis. Because the
// pattern has capture groups, String.prototype.split also returns the captured
// pieces; empty/undefined ones are filtered out by the caller.
const SEGMENT_SPLITTER = /(\S.+?[….\?!\n])(?=\s+|$|"(\s|$)|'(\s|$)|\))/;

// Segment ends with a single letter (optionally followed by a dot) that is
// preceded by a space, a pipe, or a dot — treated like an abbreviation/initial.
const ENDS_WITH_SINGLE_LETTER = /[ |\.][A-Za-z]\.?$/;

// Next segment starts with one letter followed by whitespace.
const STARTS_WITH_LETTER_THEN_SPACE = /^[A-Za-z]\s/;

// Segments that consist of nothing but one of these are glued to the
// previous sentence instead of becoming a sentence of their own.
const LONE_QUOTES = ["'", '"', "`"];

// Sentence begins with a closing mark followed by a space.
const STARTS_WITH_CLOSER_AND_SPACE = /^('|"|]|}|\)|>|\/|\|`|"|\*|”|“|«|»|”|”|」|«|﹂|’|⟧|›|⸥|】|⁆|﴿|｝|〞|｠|〉|》|）) /;

// Sentence is exactly one closing mark.
const IS_SINGLE_CLOSER = /^('|"|]|}|\)|>|\/|\|`|"|\*|”|“|«|»|”|”|」|«|﹂|’|⟧|›|⸥|】|⁆|﴿|｝|〞|｠|〉|》|）)$/;

// Regex source for "a closing mark", appended after an escaped sentence to
// check how that sentence is followed in the original text.
// NOTE(review): in the two literal patterns above `\|` followed by a backtick
// matches the two-character sequence "|`", whereas in this template literal
// `\|` cooks to a bare `|`, which appears to leave an empty alternative in the
// group. The sources are reproduced unchanged; confirm the intended pattern
// before touching them.
const CLOSER_GROUP_SOURCE = `('|"|]|\\\)|}|>|\\\/|\|\\\`|"|\\\*|”|“|«|»|”|”|」|«|﹂|’|⟧|›|⸥|】|⁆|﴿|｝|〞|｠|〉|》|）)`;

/**
 * Splits a text into sentences.
 *
 * Works in three passes:
 *  1. Split on sentence terminators, re-joining segments that end in a known
 *     abbreviation (or a single letter) with the segment that follows.
 *  2. Move a closing mark that starts a sentence back onto the previous
 *     sentence when the original text shows it following that sentence.
 *  3. Fold a sentence that is only a single closing mark into the previous
 *     sentence under the same kind of check.
 *
 * @param {string} str - Text to split; `split` is called on it directly.
 * @returns {string[]} The non-empty sentences, in order.
 */
function default_1(str) {
    // Built once per call (not per segment): it only depends on the
    // abbreviation list, which does not change while we iterate.
    const endsWithAbbreviation = new RegExp("(^| |\\\(|\\\[|\{)(" + abbreviations_1.default.map(escape_regex_1.default).join("|") + ")[\.!\?] ?$", "i");

    const segments = str.split(SEGMENT_SPLITTER).filter(x => x);
    let sentences = [];

    // Pass 1: turn segments into sentences.
    for (let index = 0; index < segments.length; index++) {
        // Read at visit time: an earlier iteration may have prepended text.
        const single = segments[index].trim();
        if (endsWithAbbreviation.test(single) || ENDS_WITH_SINGLE_LETTER.test(single)) {
            const hasNext = index < segments.length - 1;
            if (hasNext && !STARTS_WITH_LETTER_THEN_SPACE.test(segments[index + 1])) {
                // The terminator belonged to an abbreviation, not a sentence
                // end: carry this segment over into the next one.
                segments[index + 1] = single + ' ' + segments[index + 1].trim();
            }
            else {
                sentences.push(single);
            }
        }
        else if (LONE_QUOTES.indexOf(single) !== -1) {
            if (sentences.length > 0) {
                sentences[sentences.length - 1] = sentences[sentences.length - 1] + single;
            }
            else {
                // No previous sentence to attach to. Previously this wrote to
                // index -1 (as "undefined" + quote) and the quote was lost.
                sentences.push(single);
            }
        }
        else if (single) {
            sentences.push(single);
        }
    }

    // Pass 2: a sentence starting with "<closer> " gives that closer back to
    // the previous sentence if the original text matches the check below.
    for (let index = 0; index < sentences.length - 1; index++) {
        const current = sentences[index];
        const next = sentences[index + 1];
        if (next &&
            STARTS_WITH_CLOSER_AND_SPACE.test(next) &&
            new RegExp(`${escape_regex_1.default(current)}${CLOSER_GROUP_SOURCE} `).test(str)) {
            sentences[index] = current + next.charAt(0);
            // Drop the closer and the space after it.
            sentences[index + 1] = next.slice(2);
        }
    }

    // Pass 3: a sentence that is just one closer is merged into the previous
    // sentence; the emptied slot is removed by the final filter.
    for (let index = 1; index < sentences.length; index++) {
        const item = sentences[index];
        const previous = sentences[index - 1];
        if (item.length === 1 &&
            previous &&
            IS_SINGLE_CLOSER.test(item) &&
            new RegExp(`${escape_regex_1.default(previous)}${CLOSER_GROUP_SOURCE}\\s`).test(str)) {
            sentences[index - 1] = previous + item;
            sentences[index] = "";
        }
    }

    sentences = sentences.filter(x => x);
    return sentences;
}
exports.default = default_1;