UNPKG

compromise

Version:

modest natural language processing

github.com/spencermountain/compromise

spencermountain/compromise

147 lines (138 loc) • 4.21 kB

JavaScript

import splice from './_splice.js' import apostropheD from './apostrophe-d.js' import apostropheT from './apostrophe-t.js' import french from './french.js' import numberRange from './number-range.js' import numberUnit from './number-unit.js' const byApostrophe = /'/ const numDash = /^[0-9][^-–—]*[-–—].*?[0-9]/ // run tagger on our new implicit terms const reTag = function (terms, view, start, len) { let tmp = view.update() tmp.document = [terms] // offer to re-tag neighbours, too let end = start + len if (start > 0) { start -= 1 } if (terms[end]) { end += 1 } tmp.ptrs = [[0, start, end]] } const byEnd = { // ain't t: (terms, i) => apostropheT(terms, i), // how'd d: (terms, i) => apostropheD(terms, i), } const byStart = { // j'aime j: (terms, i) => french.preJ(terms, i), // l'amour l: (terms, i) => french.preL(terms, i), // d'amerique d: (terms, i) => french.preD(terms, i), } // pull-apart known contractions from model const knownOnes = function (list, term, before, after) { for (let i = 0; i < list.length; i += 1) { let o = list[i] // look for word-word match (cannot-> [can, not]) if (o.word === term.normal) { return o.out } // look for after-match ('re -> [_, are]) else if (after !== null && after === o.after) { return [before].concat(o.out) } // look for before-match (l' -> [le, _]) else if (before !== null && before === o.before && after && after.length > 2) { return o.out.concat(after) // return [o.out, after] //typeof o.out === 'string' ? [o.out, after] : o.out(terms, i) } } return null } const toDocs = function (words, view) { let doc = view.fromText(words.join(' ')) doc.compute(['id', 'alias']) return doc.docs[0] } // there's is usually [there, is] // but can be 'there has' for 'there has (..) been' const thereHas = function (terms, i) { for (let k = i + 1; k < 5; k += 1) { if (!terms[k]) { break } if (terms[k].normal === 'been') { return ['there', 'has'] } } return ['there', 'is'] } //really easy ones const contractions = view => { let { world, document } = view const { model, methods } = world let list = model.one.contractions || [] // let units = new Set(model.one.units || []) // each sentence document.forEach((terms, n) => { // loop through terms backwards for (let i = terms.length - 1; i >= 0; i -= 1) { let before = null let after = null if (byApostrophe.test(terms[i].normal) === true) { let res = terms[i].normal.split(byApostrophe) before = res[0] after = res[1] } // any known-ones, like 'dunno'? let words = knownOnes(list, terms[i], before, after) // ['foo', 's'] if (!words && byEnd.hasOwnProperty(after)) { words = byEnd[after](terms, i, world) } // ['j', 'aime'] if (!words && byStart.hasOwnProperty(before)) { words = byStart[before](terms, i) } // 'there is' vs 'there has' if (before === 'there' && after === 's') { words = thereHas(terms, i) } // actually insert the new terms if (words) { words = toDocs(words, view) splice(document, [n, i], words) reTag(document[n], view, i, words.length) continue } // '44-2' has special care if (numDash.test(terms[i].normal)) { words = numberRange(terms, i) if (words) { words = toDocs(words, view) splice(document, [n, i], words) methods.one.setTag(words, 'NumberRange', world) //add custom tag // is it a time-range, like '5-9pm' if (words[2] && words[2].tags.has('Time')) { methods.one.setTag([words[0]], 'Time', world, null, 'time-range') } reTag(document[n], view, i, words.length) } continue } // split-apart '4km' words = numberUnit(terms, i, world) if (words) { words = toDocs(words, view) splice(document, [n, i], words) methods.one.setTag([words[1]], 'Unit', world, null, 'contraction-unit') } } }) } export default contractions