UNPKG

wtf_wikipedia

Version:

parse wikiscript into json

github.com/spencermountain/wtf_wikipedia

spencermountain/wtf_wikipedia

117 lines (109 loc) • 3.52 kB

JavaScript

// Split text into sentences, using regex.
// Rule-based sentence boundary segmentation: chop the given text into its
// proper sentences, while ignoring periods/question-marks/exclamation-marks
// that are used in acronyms, abbreviations, numbers, etc.
// @spencermountain 2015 MIT
const literalAbbreviations = require('./_abbreviations')

// the abbreviation list, plus one extra regex alternative.
// NOTE(review): the purpose of the '[^]][^]]' alternative is not evident from
// this file - confirm the intent before touching it.
const abbreviations = literalAbbreviations.concat('[^]][^]]')

// chunk ends with a listed abbreviation + punctuation (+ optional space)
const abbrev_reg = new RegExp("(^| |')(" + abbreviations.join('|') + `)[.!?] ?$`, 'i')
// chunk ends with a space/period/apostrophe, a single letter, at most one more
// character, and optional trailing spaces
const acronym_reg = /[ .'][A-Z].? *?$/i
// chunk ends with three-or-more periods followed by space(s)
const elipses_reg = /\.{3,} +?$/
// chunk ends with ' c.' and one whitespace character
const circa_reg = / c\.\s$/
// chunk contains at least one unicode letter
const hasWord = /\p{Letter}/iu

/**
 * Turn a nested array into one array (one level deep).
 * @param {Array} arr - list whose array items get spread into the result
 * @returns {Array} a new, flattened list
 */
const flatten = function (arr) {
  return arr.reduce((all, a) => all.concat(a), [])
}

/**
 * A first, greedy pass at splitting text into candidate pieces.
 * @param {string} text
 * @returns {string[]} pieces, including the empty/whitespace strings that
 *   `String.prototype.split` leaves between captured matches
 */
const naiive_split = function (text) {
  //first, split by newline - and drop the pieces without any visible character
  const lines = text.split(/(\n+)/).filter((line) => /\S/.test(line))
  //then split each line by period, question-mark, and exclamation-mark
  const pieces = lines.map((line) => line.split(/(\S.+?[.!?]"?)(?=\s|$)/g)) //\u3002
  return flatten(pieces)
}

/**
 * Does this chunk close everything that it opens?
 * Returns false when a period looks like it sits within a wikipedia link
 * (more '[[' than ']]'), or when a shorter chunk has an odd number of quotes.
 * @param {string} [str]
 * @returns {boolean}
 */
const isBalanced = function (str) {
  const input = str || ''
  // splitting yields (occurrences + 1) parts, so the lengths compare the counts
  const openCount = input.split(/\[\[/).length
  const closeCount = input.split(/\]\]/).length
  if (openCount > closeCount) {
    return false
  }
  //make sure quotes are closed too - only enforced under 900 characters
  const quoteMarks = input.match(/"/g)
  const hasOddQuotes = quoteMarks !== null && quoteMarks.length % 2 !== 0
  return !(hasOddQuotes && input.length < 900)
}

/**
 * Split a text into an array of sentences.
 * @param {string} text
 * @returns {string[]} the sentences; an empty array when the input is not a
 *   non-blank string, or `[text]` if no sentence was found
 */
const sentence_parser = function (text) {
  //ensure it 'smells like' a sentence
  if (!text || typeof text !== 'string' || text.trim().length === 0) {
    return []
  }

  const sentences = []
  const chunks = []

  // The splitter regex was previously updated to fix quoted punctuation marks.
  // todo: look for side effects in this regex replacement:
  const splits = naiive_split(text)

  //first do a greedy-split, and filter-out the empty pieces
  let leading = '' //whitespace waiting to be prepended to the next piece
  splits.forEach((raw, idx) => {
    const piece = leading + raw
    leading = ''
    if (!piece) {
      return
    }
    //this is meaningful whitespace
    if (!/\S/.test(piece)) {
      //add it to the last one
      if (chunks[chunks.length - 1]) {
        chunks[chunks.length - 1] += piece
        return
      }
      //or add it to the next one
      if (splits[idx + 1]) {
        leading = piece
        return
      }
    }
    chunks.push(piece)
  })

  //detection of non-sentence chunks
  const nonSentenceEndings = [abbrev_reg, acronym_reg, elipses_reg, circa_reg]
  const isSentence = function (hmm) {
    if (nonSentenceEndings.some((reg) => reg.test(hmm))) {
      return false
    }
    //too short? - it has no letters at all
    if (hasWord.test(hmm) === false) {
      return false
    }
    return isBalanced(hmm)
  }

  //loop through these chunks, and join the non-sentence chunks back together..
  let pending = '' //non-sentence text carried into the following chunk
  chunks.forEach((chunk, idx) => {
    const candidate = pending + chunk
    //should this chunk be combined with the next one?
    if (chunks[idx + 1] && !isSentence(candidate)) {
      pending = candidate
    } else if (candidate.length > 0) {
      //this chunk is a proper sentence..
      sentences.push(candidate)
      pending = ''
    }
  })

  //if we never got a sentence, return the given text
  if (sentences.length === 0) {
    return [text]
  }
  return sentences
}

module.exports = sentence_parser