UNPKG

compromise

Version:

modest natural language processing

github.com/spencermountain/compromise

spencermountain/compromise

2,079 lines (1,546 loc) • 334 kB

JavaScript

/**
 * `typeof` wrapper that reports "symbol" for Symbol values even when
 * `typeof Symbol.iterator` is not "symbol". It replaces itself with the
 * selected implementation on first call.
 */
function _typeof(obj) {
  if (typeof Symbol === "function" && typeof Symbol.iterator === "symbol") {
    _typeof = function (obj) {
      return typeof obj;
    };
  } else {
    _typeof = function (obj) {
      return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj;
    };
  }

  return _typeof(obj);
}

/** Throw a TypeError when a constructor is invoked without `new`. */
function _classCallCheck(instance, Constructor) {
  if (!(instance instanceof Constructor)) {
    throw new TypeError("Cannot call a class as a function");
  }
}

/** Define each descriptor in `props` on `target` (configurable, non-enumerable by default). */
function _defineProperties(target, props) {
  for (var i = 0; i < props.length; i++) {
    var descriptor = props[i];
    descriptor.enumerable = descriptor.enumerable || false;
    descriptor.configurable = true;
    if ("value" in descriptor) descriptor.writable = true;
    Object.defineProperty(target, descriptor.key, descriptor);
  }
}

/** Attach prototype and static members to `Constructor`, then return it. */
function _createClass(Constructor, protoProps, staticProps) {
  if (protoProps) _defineProperties(Constructor.prototype, protoProps);
  if (staticProps) _defineProperties(Constructor, staticProps);
  return Constructor;
}

/** Wire up prototype inheritance between `subClass` and `superClass`. */
function _inherits(subClass, superClass) {
  if (typeof superClass !== "function" && superClass !== null) {
    throw new TypeError("Super expression must either be null or a function");
  }

  subClass.prototype = Object.create(superClass && superClass.prototype, {
    constructor: {
      value: subClass,
      writable: true,
      configurable: true
    }
  });
  if (superClass) _setPrototypeOf(subClass, superClass);
}

/** Self-replacing prototype getter; falls back to `__proto__` when `Object.setPrototypeOf` is missing. */
function _getPrototypeOf(o) {
  _getPrototypeOf = Object.setPrototypeOf ? Object.getPrototypeOf : function _getPrototypeOf(o) {
    return o.__proto__ || Object.getPrototypeOf(o);
  };
  return _getPrototypeOf(o);
}

/** Self-replacing prototype setter; falls back to assigning `__proto__`. */
function _setPrototypeOf(o, p) {
  _setPrototypeOf = Object.setPrototypeOf || function _setPrototypeOf(o, p) {
    o.__proto__ = p;
    return o;
  };

  return _setPrototypeOf(o, p);
}

/** Return `self`, or throw a ReferenceError when it is `undefined`. */
function _assertThisInitialized(self) {
  if (self === void 0) {
    throw new ReferenceError("this hasn't been initialised - super() hasn't been called");
  }

  return self;
}

/** Prefer an object/function returned by the super call; otherwise return the checked `self`. */
function _possibleConstructorReturn(self, call) {
  if (call && (typeof call === "object" || typeof call === "function")) {
    return call;
  }

  return _assertThisInitialized(self);
}

//this is a not-well-thought-out way to reduce our dependence on `object===object` stuff
var chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'.split('');

/**
 * Generate an id for a term: the given string, a dash, then 7 random
 * alphanumeric characters.
 * @param {string} [str] - id prefix; any falsy value becomes '_'
 * @returns {string} e.g. 'cat-aB3dE9x'
 */
function makeId(str) {
  str = str || '_';
  var text = str + '-';

  // Math.random() based: ids are only probabilistically unique
  for (var i = 0; i < 7; i++) {
    text += chars[Math.floor(Math.random() * chars.length)];
  }

  return text;
}

var _id = makeId;

//a hugely-ignorant, and widely subjective transliteration of latin, cyrillic, greek unicode characters to english ascii.
//approximate visual (not semantic or phonetic) relationship between unicode and ascii characters
//http://en.wikipedia.org/wiki/List_of_Unicode_characters
//https://docs.google.com/spreadsheet/ccc?key=0Ah46z755j7cVdFRDM1A2YVpwa1ZYWlpJM2pQZ003M0E
// Each key is the ascii replacement; each value lists the characters that map to it.
var compact = {
  '!': '¡',
  '?': '¿Ɂ',
  '"': '“”"❝❞',
  "'": '‘‛❛❜',
  '-': '—–',
  a: 'ªÀÁÂÃÄÅàáâãäåĀāĂăĄąǍǎǞǟǠǡǺǻȀȁȂȃȦȧȺΆΑΔΛάαλАадѦѧӐӑӒӓƛɅæ',
  b: 'ßþƀƁƂƃƄƅɃΒβϐϦБВЪЬвъьѢѣҌҍ',
  c: '¢©ÇçĆćĈĉĊċČčƆƇƈȻȼͻͼͽϲϹϽϾСсєҀҁҪҫ',
  d: 'ÐĎďĐđƉƊȡƋƌǷ',
  e: 'ÈÉÊËèéêëĒēĔĕĖėĘęĚěƎƏƐǝȄȅȆȇȨȩɆɇΈΕΞΣέεξϱϵ϶ЀЁЕЭеѐёҼҽҾҿӖӗӘәӚӛӬӭ',
  f: 'ƑƒϜϝӺӻҒғſ',
  g: 'ĜĝĞğĠġĢģƓǤǥǦǧǴǵ',
  h: 'ĤĥĦħƕǶȞȟΉΗЂЊЋНнђћҢңҤҥҺһӉӊ',
  I: 'ÌÍÎÏ',
  i: 'ìíîïĨĩĪīĬĭĮįİıƖƗȈȉȊȋΊΐΪίιϊІЇії',
  j: 'ĴĵǰȷɈɉϳЈј',
  k: 'ĶķĸƘƙǨǩΚκЌЖКжкќҚқҜҝҞҟҠҡ',
  l: 'ĹĺĻļĽľĿŀŁłƚƪǀǏǐȴȽΙӀӏ',
  m: 'ΜϺϻМмӍӎ',
  n: 'ÑñŃńŅņŇňŉŊŋƝƞǸǹȠȵΝΠήηϞЍИЙЛПийлпѝҊҋӅӆӢӣӤӥπ',
  o: 'ÒÓÔÕÖØðòóôõöøŌōŎŏŐőƟƠơǑǒǪǫǬǭǾǿȌȍȎȏȪȫȬȭȮȯȰȱΌΘΟθοσόϕϘϙϬϭϴОФоѲѳӦӧӨөӪӫ',
  p: 'ƤƿΡρϷϸϼРрҎҏÞ',
  q: 'Ɋɋ',
  r: 'ŔŕŖŗŘřƦȐȑȒȓɌɍЃГЯгяѓҐґ',
  s: 'ŚśŜŝŞşŠšƧƨȘșȿЅѕ',
  t: 'ŢţŤťŦŧƫƬƭƮȚțȶȾΓΤτϮТт',
  u: 'µÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųƯưƱƲǓǔǕǖǗǘǙǚǛǜȔȕȖȗɄΰμυϋύ',
  v: 'νѴѵѶѷ',
  w: 'ŴŵƜωώϖϢϣШЩшщѡѿ',
  x: '×ΧχϗϰХхҲҳӼӽӾӿ',
  y: 'ÝýÿŶŷŸƳƴȲȳɎɏΎΥΫγψϒϓϔЎУучўѰѱҮүҰұӮӯӰӱӲӳ',
  z: 'ŹźŻżŽžƩƵƶȤȥɀΖζ'
};

//decompress data into a single lookup: unicode character -> ascii replacement
var unicode = {};
Object.keys(compact).forEach(function (k) {
  compact[k].split('').forEach(function (s) {
    unicode[s] = k;
  });
});

/**
 * Replace every character that has an entry in the lookup with its ascii
 * stand-in; all other characters pass through unchanged.
 * @param {string} str
 * @returns {string}
 */
var killUnicode = function killUnicode(str) {
  var chars = str.split('');
  chars.forEach(function (s, i) {
    if (unicode[s]) {
      chars[i] = unicode[s];
    }
  });
  return chars.join('');
};

var unicode_1 = killUnicode;
// console.log(killUnicode('bjŏȒk—Ɏó'));

var periodAcronym = /([A-Z]\.)+[A-Z]?,?$/;
var oneLetterAcronym = /^[A-Z]\.,?$/;
var noPeriodAcronym = /[A-Z]{2,}('s|,)?$/;
var lowerCaseAcronym = /([a-z]\.){2,}[a-z]\.?$/;

/**
 * Does this string look like an acronym, by any of the four patterns above?
 * @param {string} str
 * @returns {boolean}
 */
var isAcronym = function isAcronym(str) {
  //like N.D.A
  if (periodAcronym.test(str) === true) {
    return true;
  }

  //like c.e.o
  if (lowerCaseAcronym.test(str) === true) {
    return true;
  }

  //like 'F.'
  if (oneLetterAcronym.test(str) === true) {
    return true;
  }

  //like NDA
  if (noPeriodAcronym.test(str) === true) {
    return true;
  }

  return false;
};

var isAcronym_1 = isAcronym;
var hasSlash = /[a-z\u00C0-\u00FF] ?\/ ?[a-z\u00C0-\u00FF]/;

/**
 * some basic operations on a string to reduce noise:
 * lowercase + trim, ascii transliteration, quote/dash/ellipsis coercion,
 * edge punctuation removal, acronym compaction and comma-free numbers.
 * @param {string} [str] - falsy input is treated as ''
 * @returns {string} the normalized form; when normalizing would leave nothing,
 *   the lowercased + trimmed input is used instead
 */
var clean = function clean(str) {
  str = str || '';
  str = str.toLowerCase();
  str = str.trim();
  var original = str;

  //(very) rough ASCII transliteration - bjŏrk -> bjork
  str = unicode_1(str);

  //rough handling of slashes - 'see/saw' keeps only the part before the slash
  if (hasSlash.test(str) === true) {
    str = str.replace(/\/.*/, '');
  }

  //#tags, @mentions
  str = str.replace(/^[#@]/, '');

  //punctuation
  str = str.replace(/[,;.!?]+$/, '');

  // coerce single curly quotes
  str = str.replace(/[\u0027\u0060\u00B4\u2018\u2019\u201A\u201B\u2032\u2035\u2039\u203A]+/g, "'");

  // coerce double curly quotes
  str = str.replace(/[\u0022\u00AB\u00BB\u201C\u201D\u201E\u201F\u2033\u2034\u2036\u2037\u2E42\u301D\u301E\u301F\uFF02]+/g, '"');

  //coerce Unicode ellipses
  str = str.replace(/\u2026/g, '...');

  //en-dash
  str = str.replace(/\u2013/g, '-');

  //lookin'->looking (make it easier for conjugation)
  str = str.replace(/([aeiou][ktrp])in$/, '$1ing');

  //turn re-enactment to reenactment
  if (/^(re|un)-?[^aeiou]./.test(str) === true) {
    str = str.replace('-', '');
  }

  //strip leading & trailing grammatical punctuation
  if (/^[:;]/.test(str) === false) {
    str = str.replace(/\.{3,}$/g, '');
    str = str.replace(/[",\.!:;\?\)]+$/g, '');
    str = str.replace(/^['"\(]+/g, '');
  }

  //do this again..
  str = str.trim();

  //oh shucks, nothing left - fall back to the lowercased, trimmed input
  if (str === '') {
    str = original;
  }

  //compact acronyms
  if (isAcronym_1(str)) {
    str = str.replace(/\./g, '');
  }

  //nice-numbers
  str = str.replace(/([0-9]),([0-9])/g, '$1$2');
  return str;
};

var clean_1 = clean;
// console.log(normalize('Dr.
V Cooper')); /** reduced is one step further than clean */ var reduced = function reduced(str) { // remove apostrophes str = str.replace(/['’]s$/, ''); str = str.replace(/s['’]$/, 's'); return str; }; var reduce = reduced; //all punctuation marks, from https://en.wikipedia.org/wiki/Punctuation //we have slightly different rules for start/end - like #hashtags. var startings = /^[ \n\t\.’'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·\&*\•^†‡°¡¿※№÷×ºª%‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201C|\u2018|\u201F|\u201B|\u201E|\u2E42|\u201A|\u00AB|\u2039|\u2035|\u2036|\u2037|\u301D|\u0060|\u301F]+/; var endings = /[ \n\t\.’'\[\](){}⟨⟩:,،、‒–—―…!.‹›«»‐\-?‘’;\/⁄·\&*@\•^†‡°¡¿※#№÷×ºª‰+−=‱¶′″‴§~|‖¦©℗®℠™¤₳฿\u0022|\uFF02|\u0027|\u201D|\u2019|\u201D|\u2019|\u201D|\u201D|\u2019|\u00BB|\u203A|\u2032|\u2033|\u2034|\u301E|\u00B4|\u301E]+$/; //money = ₵¢₡₢$₫₯֏₠€ƒ₣₲₴₭₺₾ℳ₥₦₧₱₰£៛₽₹₨₪৳₸₮₩¥ var hasSlash$1 = /\//; var hasApostrophe = /['’]/; var hasAcronym = /^[a-z]\.([a-z]\.)+/i; var minusNumber = /^[-+\.][0-9]/; /** turn given text into a parsed-up object * seperate the 'meat' of the word from the whitespace+punctuation */ var parseTerm = function parseTerm(str) { var original = str; var pre = ''; var post = ''; str = str.replace(startings, function (found) { pre = found; // support '-40' if ((pre === '-' || pre === '+' || pre === '.') && minusNumber.test(str)) { pre = ''; return found; } return ''; }); str = str.replace(endings, function (found) { post = found; // keep s-apostrophe - "flanders'" or "chillin'" if (hasApostrophe.test(found) && /[sn]['’]$/.test(original) && hasApostrophe.test(pre) === false) { post = post.replace(hasApostrophe, ''); return "'"; } //keep end-period in acronym if (hasAcronym.test(str) === true) { post = post.replace(/\./, ''); return '.'; } return ''; }); //we went too far.. if (str === '') { // do a very mild parse, and hope for the best. 
original = original.replace(/ *$/, function (after) { post = after || ''; return ''; }); str = original; pre = ''; post = post; } // create the various forms of our text, var clean = clean_1(str); var parsed = { text: str, clean: clean, reduced: reduce(clean), pre: pre, post: post }; // support aliases for slashes if (hasSlash$1.test(str)) { str.split(hasSlash$1).forEach(function (word) { parsed.alias = parsed.alias || {}; parsed.alias[word.trim()] = true; }); } return parsed; }; var parse = parseTerm; function createCommonjsModule(fn, module) { return module = { exports: {} }, fn(module, module.exports), module.exports; } var _01Case = createCommonjsModule(function (module, exports) { var titleCase = /^[A-Z][a-z'\u00C0-\u00FF]/; var upperCase = /^[A-Z]+s?$/; /** convert all text to uppercase */ exports.toUpperCase = function () { this.text = this.text.toUpperCase(); return this; }; /** convert all text to lowercase */ exports.toLowerCase = function () { this.text = this.text.toLowerCase(); return this; }; /** only set the first letter to uppercase * leave any existing uppercase alone */ exports.toTitleCase = function () { this.text = this.text.replace(/^ *[a-z\u00C0-\u00FF]/, function (x) { return x.toUpperCase(); }); //support unicode? 
return this; }; /** if all letters are uppercase */ exports.isUpperCase = function () { return upperCase.test(this.text); }; /** if the first letter is uppercase, and the rest are lowercase */ exports.isTitleCase = function () { return titleCase.test(this.text); }; exports.titleCase = exports.isTitleCase; }); var _01Case_1 = _01Case.toUpperCase; var _01Case_2 = _01Case.toLowerCase; var _01Case_3 = _01Case.toTitleCase; var _01Case_4 = _01Case.isUpperCase; var _01Case_5 = _01Case.isTitleCase; var _01Case_6 = _01Case.titleCase; var _02Punctuation = createCommonjsModule(function (module, exports) { // these methods are called with '@hasComma' in the match syntax // various unicode quotation-mark formats var startQuote = /(\u0022|\uFF02|\u0027|\u201C|\u2018|\u201F|\u201B|\u201E|\u2E42|\u201A|\u00AB|\u2039|\u2035|\u2036|\u2037|\u301D|\u0060|\u301F)/; var endQuote = /(\u0022|\uFF02|\u0027|\u201D|\u2019|\u201D|\u2019|\u201D|\u201D|\u2019|\u00BB|\u203A|\u2032|\u2033|\u2034|\u301E|\u00B4|\u301E)/; /** search the term's 'post' punctuation */ exports.hasPost = function (punct) { return this.post.indexOf(punct) !== -1; }; /** search the term's 'pre' punctuation */ exports.hasPre = function (punct) { return this.pre.indexOf(punct) !== -1; }; /** does it have a quotation symbol? */ exports.hasQuote = function () { return startQuote.test(this.pre) || endQuote.test(this.post); }; exports.hasQuotation = exports.hasQuote; /** does it have a comma? */ exports.hasComma = function () { return this.hasPost(','); }; /** does it end in a period? */ exports.hasPeriod = function () { return this.hasPost('.') === true && this.hasPost('...') === false; }; /** does it end in an exclamation */ exports.hasExclamation = function () { return this.hasPost('!'); }; /** does it end with a question mark? */ exports.hasQuestionMark = function () { return this.hasPost('?') || this.hasPost('¿'); }; /** is there a ... at the end? 
*/ exports.hasEllipses = function () { return this.hasPost('..') || this.hasPost('…') || this.hasPre('..') || this.hasPre('…'); }; /** is there a semicolon after this word? */ exports.hasSemicolon = function () { return this.hasPost(';'); }; /** is there a slash '/' in this word? */ exports.hasSlash = function () { return /\//.test(this.text); }; /** a hyphen connects two words like-this */ exports.hasHyphen = function () { var hyphen = /(-|–|—)/; return hyphen.test(this.post) || hyphen.test(this.pre); }; /** a dash separates words - like that */ exports.hasDash = function () { var hyphen = / (-|–|—) /; return hyphen.test(this.post) || hyphen.test(this.pre); }; /** is it multiple words combinded */ exports.hasContraction = function () { return Boolean(this.implicit); }; /** try to sensibly put this punctuation mark into the term */ exports.addPunctuation = function (punct) { // dont add doubles if (punct === ',' || punct === ';') { this.post = this.post.replace(punct, ''); } this.post = punct + this.post; return this; }; }); var _02Punctuation_1 = _02Punctuation.hasPost; var _02Punctuation_2 = _02Punctuation.hasPre; var _02Punctuation_3 = _02Punctuation.hasQuote; var _02Punctuation_4 = _02Punctuation.hasQuotation; var _02Punctuation_5 = _02Punctuation.hasComma; var _02Punctuation_6 = _02Punctuation.hasPeriod; var _02Punctuation_7 = _02Punctuation.hasExclamation; var _02Punctuation_8 = _02Punctuation.hasQuestionMark; var _02Punctuation_9 = _02Punctuation.hasEllipses; var _02Punctuation_10 = _02Punctuation.hasSemicolon; var _02Punctuation_11 = _02Punctuation.hasSlash; var _02Punctuation_12 = _02Punctuation.hasHyphen; var _02Punctuation_13 = _02Punctuation.hasDash; var _02Punctuation_14 = _02Punctuation.hasContraction; var _02Punctuation_15 = _02Punctuation.addPunctuation; //declare it up here var wrapMatch = function wrapMatch() {}; /** ignore optional/greedy logic, straight-up term match*/ var doesMatch = function doesMatch(t, reg, index, length) { // support id 
matches if (reg.id === t.id) { return true; } // support '.' if (reg.anything === true) { return true; } // support '^' (in parentheses) if (reg.start === true && index !== 0) { return false; } // support '$' (in parentheses) if (reg.end === true && index !== length - 1) { return false; } //support a text match if (reg.word !== undefined) { //match contractions if (t.implicit !== null && t.implicit === reg.word) { return true; } // term aliases for slashes and things if (t.alias !== undefined && t.alias.hasOwnProperty(reg.word)) { return true; } // support ~ match if (reg.soft === true && reg.word === t.root) { return true; } //match either .clean or .text return reg.word === t.clean || reg.word === t.text || reg.word === t.reduced; } //support #Tag if (reg.tag !== undefined) { return t.tags[reg.tag] === true; } //support @method if (reg.method !== undefined) { if (typeof t[reg.method] === 'function' && t[reg.method]() === true) { return true; } return false; } //support /reg/ if (reg.regex !== undefined) { return reg.regex.test(t.clean); } //support (one|two) if (reg.choices !== undefined) { // try to support && operator if (reg.operator === 'and') { // must match them all return reg.choices.every(function (r) { return wrapMatch(t, r, index, length); }); } // or must match one return reg.choices.some(function (r) { return wrapMatch(t, r, index, length); }); } return false; }; // wrap result for !negative match logic wrapMatch = function wrapMatch(t, reg, index, length) { var result = doesMatch(t, reg, index, length); if (reg.negative === true) { return !result; } return result; }; var _doesMatch = wrapMatch; var boring = {}; /** check a match object against this term */ var doesMatch_1 = function doesMatch_1(reg, index, length) { return _doesMatch(this, reg, index, length); }; /** does this term look like an acronym? */ var isAcronym_1$1 = function isAcronym_1$1() { return isAcronym_1(this.text); }; /** is this term implied by a contraction? 
*/ var isImplicit = function isImplicit() { return this.text === '' && Boolean(this.implicit); }; /** does the term have at least one good tag? */ var isKnown = function isKnown() { return Object.keys(this.tags).some(function (t) { return boring[t] !== true; }); }; /** cache the root property of the term */ var setRoot = function setRoot(world) { var transform = world.transforms; var str = this.implicit || this.clean; if (this.tags.Plural) { str = transform.toSingular(str, world); } if (this.tags.Verb && !this.tags.Negative && !this.tags.Infinitive) { var tense = null; if (this.tags.PastTense) { tense = 'PastTense'; } else if (this.tags.Gerund) { tense = 'Gerund'; } else if (this.tags.PresentTense) { tense = 'PresentTense'; } else if (this.tags.Participle) { tense = 'Participle'; } else if (this.tags.Actor) { tense = 'Actor'; } str = transform.toInfinitive(str, world, tense); } this.root = str; }; var _03Misc = { doesMatch: doesMatch_1, isAcronym: isAcronym_1$1, isImplicit: isImplicit, isKnown: isKnown, setRoot: setRoot }; var hasSpace = /[\s-]/; var isUpperCase = /^[A-Z-]+$/; // const titleCase = str => { // return str.charAt(0).toUpperCase() + str.substr(1) // } /** return various text formats of this term */ var textOut = function textOut(options, showPre, showPost) { options = options || {}; var word = this.text; var before = this.pre; var after = this.post; // -word- if (options.reduced === true) { word = this.reduced || ''; } if (options.root === true) { word = this.root || ''; } if (options.implicit === true && this.implicit) { word = this.implicit || ''; } if (options.normal === true) { word = this.clean || this.text || ''; } if (options.root === true) { word = this.root || this.reduced || ''; } if (options.unicode === true) { word = unicode_1(word); } // cleanup case if (options.titlecase === true) { if (this.tags.ProperNoun && !this.titleCase()) ; else if (this.tags.Acronym) { word = word.toUpperCase(); //uppercase acronyms } else if 
(isUpperCase.test(word) && !this.tags.Acronym) { // lowercase everything else word = word.toLowerCase(); } } if (options.lowercase === true) { word = word.toLowerCase(); } // remove the '.'s from 'F.B.I.' (safely) if (options.acronyms === true && this.tags.Acronym) { word = word.replace(/\./g, ''); } // -before/after- if (options.whitespace === true || options.root === true) { before = ''; after = ' '; if ((hasSpace.test(this.post) === false || options.last) && !this.implicit) { after = ''; } } if (options.punctuation === true && !options.root) { //normalized end punctuation if (this.hasPost('.') === true) { after = '.' + after; } else if (this.hasPost('?') === true) { after = '?' + after; } else if (this.hasPost('!') === true) { after = '!' + after; } else if (this.hasPost(',') === true) { after = ',' + after; } else if (this.hasEllipses() === true) { after = '...' + after; } } if (showPre !== true) { before = ''; } if (showPost !== true) { // let keep = after.match(/\)/) || '' after = ''; //keep //after.replace(/[ .?!,]+/, '') } // remove the '.' from 'Mrs.' 
(safely) if (options.abbreviations === true && this.tags.Abbreviation) { after = after.replace(/^\./, ''); } return before + word + after; }; var _04Text = { textOut: textOut }; var boringTags = { Auxiliary: 1, Possessive: 1 }; /** a subjective ranking of tags kinda tfidf-based */ var rankTags = function rankTags(term, world) { var tags = Object.keys(term.tags); var tagSet = world.tags; tags = tags.sort(function (a, b) { //bury the tags we dont want if (boringTags[b] || !tagSet[b]) { return -1; } // unknown tags are interesting if (!tagSet[b]) { return 1; } if (!tagSet[a]) { return 0; } // then sort by #of parent tags (most-specific tags first) if (tagSet[a].lineage.length > tagSet[b].lineage.length) { return 1; } if (tagSet[a].isA.length > tagSet[b].isA.length) { return -1; } return 0; }); return tags; }; var _bestTag = rankTags; var jsonDefault = { text: true, tags: true, implicit: true, clean: false, id: false, index: false, offset: false, whitespace: false, bestTag: false }; /** return various metadata for this term */ var json = function json(options, world) { options = options || {}; options = Object.assign({}, jsonDefault, options); var result = {}; // default on if (options.text) { result.text = this.text; } if (options.normal) { result.normal = this.normal; } if (options.tags) { result.tags = Object.keys(this.tags); } // default off if (options.clean) { result.clean = this.clean; } if (options.id || options.offset) { result.id = this.id; } if (options.implicit && this.implicit !== null) { result.implicit = this.implicit; } if (options.whitespace) { result.pre = this.pre; result.post = this.post; } if (options.bestTag) { result.bestTag = _bestTag(this, world)[0]; } return result; }; var _05Json = { json: json }; var methods = Object.assign({}, _01Case, _02Punctuation, _03Misc, _04Text, _05Json); function isClientSide() { return typeof window !== 'undefined' && window.document; } /** add spaces at the end */ var padEnd = function padEnd(str, width) { str = 
str.toString(); while (str.length < width) { str += ' '; } return str; }; /** output for verbose-mode */ var logTag = function logTag(t, tag, reason) { if (isClientSide()) { console.log('%c' + padEnd(t.clean, 3) + ' + ' + tag + ' ', 'color: #6accb2;'); return; } //server-side var log = '\x1b[33m' + padEnd(t.clean, 15) + '\x1b[0m + \x1b[32m' + tag + '\x1b[0m '; if (reason) { log = padEnd(log, 35) + ' ' + reason + ''; } console.log(log); }; /** output for verbose mode */ var logUntag = function logUntag(t, tag, reason) { if (isClientSide()) { console.log('%c' + padEnd(t.clean, 3) + ' - ' + tag + ' ', 'color: #AB5850;'); return; } //server-side var log = '\x1b[33m' + padEnd(t.clean, 3) + ' \x1b[31m - #' + tag + '\x1b[0m '; if (reason) { log = padEnd(log, 35) + ' ' + reason; } console.log(log); }; var isArray = function isArray(arr) { return Object.prototype.toString.call(arr) === '[object Array]'; }; var titleCase = function titleCase(str) { return str.charAt(0).toUpperCase() + str.substr(1); }; var fns = { logTag: logTag, logUntag: logUntag, isArray: isArray, titleCase: titleCase }; /** add a tag, and its descendents, to a term */ var addTag = function addTag(t, tag, reason, world) { var tagset = world.tags; //support '.' or '-' notation for skipping the tag if (tag === '' || tag === '.' || tag === '-') { return; } if (tag[0] === '#') { tag = tag.replace(/^#/, ''); } tag = fns.titleCase(tag); //if we already got this one if (t.tags[tag] === true) { return; } // log it? var isVerbose = world.isVerbose(); if (isVerbose === true) { fns.logTag(t, tag, reason); } //add tag t.tags[tag] = true; //whee! //check tagset for any additional things to do... 
if (tagset.hasOwnProperty(tag) === true) { //add parent Tags tagset[tag].isA.forEach(function (down) { t.tags[down] = true; if (isVerbose === true) { fns.logTag(t, '→ ' + down); } }); //remove any contrary tags t.unTag(tagset[tag].notA, '←', world); } }; /** support an array of tags */ var addTags = function addTags(term, tags, reason, world) { if (typeof tags !== 'string') { for (var i = 0; i < tags.length; i++) { addTag(term, tags[i], reason, world); } // tags.forEach(tag => addTag(term, tag, reason, world)) } else { addTag(term, tags, reason, world); } }; var add = addTags; /** remove this tag, and its descentents from the term */ var unTag = function unTag(t, tag, reason, world) { var isVerbose = world.isVerbose(); //support '*' for removing all tags if (tag === '*') { t.tags = {}; return t; } // remove the tag if (t.tags[tag] === true) { delete t.tags[tag]; //log in verbose-mode if (isVerbose === true) { fns.logUntag(t, tag, reason); } } //delete downstream tags too var tagset = world.tags; if (tagset[tag]) { var lineage = tagset[tag].lineage; for (var i = 0; i < lineage.length; i++) { if (t.tags[lineage[i]] === true) { delete t.tags[lineage[i]]; if (isVerbose === true) { fns.logUntag(t, ' - ' + lineage[i]); } } } } return t; }; //handle an array of tags var untagAll = function untagAll(term, tags, reason, world) { if (typeof tags !== 'string' && tags) { for (var i = 0; i < tags.length; i++) { unTag(term, tags[i], reason, world); } return; } unTag(term, tags, reason, world); }; var unTag_1 = untagAll; var canBe = function canBe(term, tag, world) { var tagset = world.tags; // cleanup tag if (tag[0] === '#') { tag = tag.replace(/^#/, ''); } //fail-fast if (tagset[tag] === undefined) { return true; } //loop through tag's contradictory tags var enemies = tagset[tag].notA || []; for (var i = 0; i < enemies.length; i++) { if (term.tags[enemies[i]] === true) { return false; } } if (tagset[tag].isA !== undefined) { return canBe(term, tagset[tag].isA, world); 
//recursive } return true; }; var canBe_1 = canBe; /** add a tag or tags, and their descendents to this term * @param {string | string[]} tags - a tag or tags * @param {string?} [reason] a clue for debugging */ var tag_1 = function tag_1(tags, reason, world) { add(this, tags, reason, world); return this; }; /** only tag this term if it's consistent with it's current tags */ var tagSafe = function tagSafe(tags, reason, world) { if (canBe_1(this, tags, world)) { add(this, tags, reason, world); } return this; }; /** remove a tag or tags, and their descendents from this term * @param {string | string[]} tags - a tag or tags * @param {string?} [reason] a clue for debugging */ var unTag_1$1 = function unTag_1$1(tags, reason, world) { unTag_1(this, tags, reason, world); return this; }; /** is this tag consistent with the word's current tags? * @param {string | string[]} tags - a tag or tags * @returns {boolean} */ var canBe_1$1 = function canBe_1$1(tags, world) { return canBe_1(this, tags, world); }; var tag = { tag: tag_1, tagSafe: tagSafe, unTag: unTag_1$1, canBe: canBe_1$1 }; var Term = /*#__PURE__*/ function () { function Term() { var text = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ''; _classCallCheck(this, Term); text = String(text); var obj = parse(text); // the various forms of our text this.text = obj.text || ''; this.clean = obj.clean; this.reduced = obj.reduced; this.root = null; this.implicit = null; this.pre = obj.pre || ''; this.post = obj.post || ''; this.tags = {}; this.prev = null; this.next = null; this.id = _id(obj.clean); this.isA = 'Term'; // easier than .constructor... 
// support alternative matches if (obj.alias) { this.alias = obj.alias; } } /** set the text of the Term to something else*/ _createClass(Term, [{ key: "set", value: function set(str) { var obj = parse(str); this.text = obj.text; this.clean = obj.clean; return this; } }]); return Term; }(); /** create a deep-copy of this term */ Term.prototype.clone = function () { var term = new Term(this.text); term.pre = this.pre; term.post = this.post; term.tags = Object.assign({}, this.tags); //use the old id, so it can be matched with .match(doc) // term.id = this.id return term; }; Object.assign(Term.prototype, methods); Object.assign(Term.prototype, tag); var Term_1 = Term; /** return a flat array of Term objects */ var terms = function terms(n) { if (this.length === 0) { return []; } // use cache, if it exists if (this.cache.terms) { if (n !== undefined) { return this.cache.terms[n]; } return this.cache.terms; } var terms = [this.pool.get(this.start)]; for (var i = 0; i < this.length - 1; i += 1) { var id = terms[terms.length - 1].next; if (id === null) { // throw new Error('linked-list broken') console.error("Compromise error: Linked list broken in phrase '" + this.start + "'"); break; } var term = this.pool.get(id); terms.push(term); //return this one? if (n !== undefined && n === i) { return terms[n]; } } // this.cache.terms = terms if (n !== undefined) { return terms[n]; } return terms; }; /** return a shallow or deep copy of this phrase */ var clone = function clone(isShallow) { var _this = this; if (isShallow) { return this.buildFrom(this.start, this.length); } //how do we clone part of the pool? var terms = this.terms(); var newTerms = terms.map(function (t) { return t.clone(); }); //connect these new ids up newTerms.forEach(function (t, i) { //add it to the pool.. 
_this.pool.add(t); if (newTerms[i + 1]) { t.next = newTerms[i + 1].id; } if (newTerms[i - 1]) { t.prev = newTerms[i - 1].id; } }); return this.buildFrom(newTerms[0].id, newTerms.length); }; /** return last term object */ var lastTerm = function lastTerm() { var terms = this.terms(); return terms[terms.length - 1]; }; /** quick lookup for a term id */ var hasId = function hasId(wantId) { if (this.length === 0 || !wantId) { return false; } if (this.start === wantId) { return true; } // use cache, if available if (this.cache.terms) { var _terms = this.cache.terms; for (var i = 0; i < _terms.length; i++) { if (_terms[i].id === wantId) { return true; } } return false; } // otherwise, go through each term var lastId = this.start; for (var _i = 0; _i < this.length - 1; _i += 1) { var term = this.pool.get(lastId); if (term === undefined) { console.error("Compromise error: Linked list broken. Missing term '".concat(lastId, "' in phrase '").concat(this.start, "'\n")); // throw new Error('linked List error') return false; } if (term.next === wantId) { return true; } lastId = term.next; } return false; }; /** how many seperate, non-empty words is it? */ var wordCount = function wordCount() { return this.terms().filter(function (t) { return t.text !== ''; }).length; }; var _01Utils = { terms: terms, clone: clone, lastTerm: lastTerm, hasId: hasId, wordCount: wordCount }; var trimEnd = function trimEnd(str) { return str.replace(/ +$/, ''); }; /** produce output in the given format */ var text = function text() { var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {}; var isFirst = arguments.length > 1 ? arguments[1] : undefined; var isLast = arguments.length > 2 ? 
arguments[2] : undefined; if (typeof options === 'string') { if (options === 'normal') { options = { whitespace: true, unicode: true, lowercase: true, punctuation: true, acronyms: true, abbreviations: true, implicit: true, normal: true }; } else if (options === 'clean') { options = { titlecase: false, lowercase: true, punctuation: true, whitespace: true, unicode: true, implicit: true }; } else if (options === 'reduced') { options = { titlecase: false, lowercase: true, punctuation: false, //FIXME: reversed? whitespace: true, unicode: true, implicit: true, reduced: true }; } else if (options === 'root') { options = { titlecase: false, lowercase: true, punctuation: true, whitespace: true, unicode: true, implicit: true, root: true }; } else { options = {}; } } var terms = this.terms(); //this this phrase a complete sentence? var isFull = false; if (terms[0] && terms[0].prev === null && terms[terms.length - 1].next === null) { isFull = true; } var text = terms.reduce(function (str, t, i) { options.last = isLast && i === terms.length - 1; var showPre = true; var showPost = true; if (isFull === false) { // dont show beginning whitespace if (i === 0 && isFirst) { showPre = false; } // dont show end-whitespace if (i === terms.length - 1 && isLast) { showPost = false; } } var txt = t.textOut(options, showPre, showPost); // if (options.titlecase && i === 0) { // txt = titleCase(txt) // } return str + txt; }, ''); //full-phrases show punctuation, but not whitespace if (isFull === true && isLast) { text = trimEnd(text); } if (options.trim === true) { text = text.trim(); } return text; }; var _02Text = { text: text }; /** remove start and end whitespace */ var trim = function trim() { var terms = this.terms(); if (terms.length > 0) { //trim starting terms[0].pre = terms[0].pre.replace(/^\s+/, ''); //trim ending var lastTerm = terms[terms.length - 1]; lastTerm.post = lastTerm.post.replace(/\s+$/, ''); } return this; }; var _03Change = { trim: trim }; var endOfSentence = 
/[.?!]\s*$/; // replacing a 'word.' with a 'word!' var combinePost = function combinePost(before, after) { //only transfer the whitespace if (endOfSentence.test(after)) { var whitespace = before.match(/\s*$/); return after + whitespace; } return before; }; //add whitespace to the start of the second bit var addWhitespace = function addWhitespace(beforeTerms, newTerms) { // add any existing pre-whitespace to beginning newTerms[0].pre = beforeTerms[0].pre; var lastTerm = beforeTerms[beforeTerms.length - 1]; //add any existing punctuation to end of our new terms var newTerm = newTerms[newTerms.length - 1]; newTerm.post = combinePost(lastTerm.post, newTerm.post); // remove existing punctuation lastTerm.post = ''; //before ←[space] - after if (lastTerm.post === '') { lastTerm.post += ' '; } }; //insert this segment into the linked-list var stitchIn = function stitchIn(beforeTerms, newTerms, pool) { var lastBefore = beforeTerms[beforeTerms.length - 1]; var lastNew = newTerms[newTerms.length - 1]; var afterId = lastBefore.next; //connect ours in (main → newPhrase) lastBefore.next = newTerms[0].id; //stich the end in (newPhrase → after) lastNew.next = afterId; //do it backwards, too if (afterId) { // newPhrase ← after var afterTerm = pool.get(afterId); afterTerm.prev = lastNew.id; } // before ← newPhrase var beforeId = beforeTerms[0].id; if (beforeId) { var newTerm = newTerms[0]; newTerm.prev = beforeId; } }; // avoid stretching a phrase twice. var unique = function unique(list) { return list.filter(function (o, i) { return list.indexOf(o) === i; }); }; //append one phrase onto another. var appendPhrase = function appendPhrase(before, newPhrase, doc) { var beforeTerms = before.cache.terms || before.terms(); var newTerms = newPhrase.cache.terms || newPhrase.terms(); //spruce-up the whitespace issues addWhitespace(beforeTerms, newTerms); //insert this segment into the linked-list stitchIn(beforeTerms, newTerms, before.pool); // stretch! 
var hasSpace$1 = / /;

/**
 * Make sure the inserted text is separated from what follows it.
 * '[new] [◻old]' -or- '[old] [◻new] [old]'
 * Appends one space to the last new term's `post` unless it already has one.
 * @param {Array} newTerms - terms being inserted
 */
var addWhitespace$1 = function addWhitespace(newTerms) {
  var tail = newTerms[newTerms.length - 1];
  if (!hasSpace$1.test(tail.post)) {
    tail.post += ' ';
  }
};

/**
 * Splice `newTerms` into the linked-list directly in front of `main`.
 * @param {Object} main - phrase being prepended to (`start`, `pool`, `terms(n)`)
 * @param {Object} newPhrase - phrase being inserted (only `start` is read)
 * @param {Array} newTerms - the terms of `newPhrase`
 */
var stitchIn$1 = function stitchIn(main, newPhrase, newTerms) {
  var tail = newTerms[newTerms.length - 1];
  // [newPhrase] → [main]
  tail.next = main.start;
  // [before] → [newPhrase], when something precedes main
  var pool = main.pool;
  var oldStart = pool.get(main.start);
  if (oldStart.prev) {
    pool.get(oldStart.prev).next = newPhrase.start;
  }
  //do it backwards, too
  var mainHead = main.terms(0);
  // before ← newPhrase
  newTerms[0].prev = mainHead.prev;
  // newPhrase ← main
  mainHead.prev = tail.id;
};

// keep only the first occurrence of each entry
var unique$1 = function unique(list) {
  var firsts = [];
  for (var i = 0; i < list.length; i += 1) {
    if (list.indexOf(list[i]) === i) {
      firsts.push(list[i]);
    }
  }
  return firsts;
};

/**
 * Prepend one phrase onto another.
 * Stitches the new terms in before `original`, then lengthens `original` and
 * every phrase in `doc` and its parents that contains either the old start or
 * the new start; phrases that began at the old start now begin at the new one.
 * @param {Object} original - phrase to prepend to
 * @param {Object} newPhrase - phrase to insert
 * @param {Object} doc - document owning `original`; must expose `list` and `parents()`
 * @returns {Object} `original`
 */
var joinPhrase = function joinPhrase(original, newPhrase, doc) {
  var starterId = original.start;
  var newTerms = newPhrase.terms();
  //spruce-up the whitespace issues
  addWhitespace$1(newTerms);
  //insert this segment into the linked-list
  stitchIn$1(original, newPhrase, newTerms);

  //collect every phrase that has to grow
  var docs = [doc].concat(doc.parents());
  var toStretch = [original];
  for (var i = 0; i < docs.length; i += 1) {
    var list = docs[i].list;
    for (var j = 0; j < list.length; j += 1) {
      var candidate = list[j];
      if (candidate.hasId(starterId) || candidate.hasId(newPhrase.start)) {
        toStretch.push(candidate);
      }
    }
  }
  // don't double-count
  toStretch = unique$1(toStretch);
  // stretch these phrases
  for (var k = 0; k < toStretch.length; k += 1) {
    var phrase = toStretch[k];
    phrase.length += newPhrase.length;
    // change the start too, if necessary
    if (phrase.start === starterId) {
      phrase.start = newPhrase.start;
    }
  }
  return original;
};

var prepend = joinPhrase;
/**
 * Decrease the length of the phrase containing `id` in `doc` and in each of
 * its parents, then drop emptied phrases from `doc.list`.
 * Only the first phrase per document whose `hasId(id)` is true is shrunk.
 * @param {Object} doc - document exposing `list` and `parents()`
 * @param {string} id - id of the first term being removed
 * @param {number} deleteLength - how many terms are being removed
 * @param {Object} after - the term following the removed run; its `id`
 *   becomes the new `start` of any phrase that started at `id`
 */
var shrinkAll = function shrinkAll(doc, id, deleteLength, after) {
  // copy, so the array handed back by parents() is never modified
  var docs = doc.parents().concat([doc]);
  docs.forEach(function (d) {
    //find our phrase to shrink
    var phrase = d.list.find(function (p) {
      return p.hasId(id);
    });
    if (!phrase) {
      return;
    }
    phrase.length -= deleteLength;
    // does it start with this soon-removed word?
    if (phrase.start === id) {
      phrase.start = after.id;
    }
  });
  // cleanup empty phrase objects
  doc.list = doc.list.filter(function (p) {
    if (!p.start || !p.length) {
      return false;
    }
    return true;
  });
};

/**
 * Wrap the linked-list around these terms so they don't appear any more.
 * The terms themselves are left in the pool.
 * @param {Object} phrase - phrase to remove (`cache.terms` or `terms()`, `start`, `length`)
 * @param {Object} doc - owning document; `doc.pool()` must return a lookup with `get(id)`
 */
var deletePhrase = function deletePhrase(phrase, doc) {
  var pool = doc.pool();
  var terms = phrase.cache.terms || phrase.terms();
  // nothing to unlink
  if (terms.length === 0) {
    return;
  }
  //grab both sides of the chain (either may be missing at a document edge)
  var prev = pool.get(terms[0].prev) || null;
  var after = pool.get(terms[terms.length - 1].next) || null;
  // removing the second half of a contraction: the term before it gets its
  // `implicit` text applied via set(), plus a trailing space
  if (prev && terms[0].implicit && prev.implicit) {
    prev.set(prev.implicit);
    prev.post += ' ';
  }
  //first, change phrase lengths
  shrinkAll(doc, phrase.start, phrase.length, after || {});
  // connect [prev]->[after]; a missing neighbour is recorded as null so that
  // `=== null` edge checks keep working
  if (prev) {
    prev.next = after ? after.id : null;
  }
  // connect [prev]<-[after]
  if (after) {
    after.prev = prev ? prev.id : null;
  }
};

var _delete = deletePhrase;
/** put this text at the end */
var append_1 = function append_1(newPhrase, doc) {
  var self = this;
  append(self, newPhrase, doc);
  return self;
};

/** add this text to the beginning */
var prepend_1 = function prepend_1(newPhrase, doc) {
  var self = this;
  prepend(self, newPhrase, doc);
  return self;
};

/** unlink this phrase's terms from the document */
var delete_1 = function delete_1(doc) {
  var self = this;
  _delete(self, doc);
  return self;
};

/**
 * Swap this phrase's terms for `newPhrase`.
 * stich-in newPhrase, stretch 'doc' + parents, then remove the old terms.
 * Returns nothing.
 */
var replace = function replace(newPhrase, doc) {
  // remember how many terms we had before growing
  var firstLength = this.length;
  //add it to the end
  append(this, newPhrase, doc);
  //delete original terms: a phrase from our start, cut back to the old length
  var original = this.buildFrom(this.start, this.length);
  original.length = firstLength;
  _delete(original, doc);
};

/**
 * Turn this phrase object into 3 phrase objects
 * @param {Object} p - phrase to split around (`start` id and `length` are read)
 * @returns {Object} `{ before, match, after }`; a section with no terms is
 *   `null`, and all three are `null` when `p.start` is not one of our terms
 */
var splitOn = function splitOn(p) {
  var all = this.terms();
  var parts = {
    before: null,
    match: null,
    after: null
  };
  // locate the first term of p inside this phrase
  var at = -1;
  for (var i = 0; i < all.length; i += 1) {
    if (all[i].id === p.start) {
      at = i;
      break;
    }
  }
  if (at === -1) {
    return parts;
  }
  //cut the term list into its three sections
  var stop = at + p.length;
  var head = all.slice(0, at);
  var hit = all.slice(at, stop);
  var tail = all.slice(stop, all.length);
  //make each non-empty section into a phrase-object
  if (head.length > 0) {
    parts.before = this.buildFrom(head[0].id, head.length);
  }
  if (hit.length > 0) {
    parts.match = this.buildFrom(hit[0].id, hit.length);
  }
  if (tail.length > 0) {
    parts.after = this.buildFrom(tail[0].id, tail.length, this.pool);
  }
  return parts;
};

var _04Insert = {
  append: append_1,
  prepend: prepend_1,
  "delete": delete_1,
  replace: replace,
  splitOn: splitOn
};
/**
 * return json metadata for this phrase
 * @param {Object} [options] - which fields to produce: `text`, `normal`,
 *   `clean`, `reduced`, `root` (each filled from `this.text(...)`), `trim`
 *   (trim every text field that was produced) and `terms` (`true` or an
 *   options object forwarded to each term's `json()`)
 * @param {Object} [world] - forwarded untouched to each term's `json()`
 * @returns {Object} the requested fields
 */
var json$1 = function json() {
  var options = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : {};
  var world = arguments.length > 1 ? arguments[1] : undefined;
  var res = {};
  // text data
  if (options.text) {
    res.text = this.text();
  }
  if (options.normal) {
    res.normal = this.text('normal');
  }
  if (options.clean) {
    res.clean = this.text('clean');
  }
  if (options.reduced) {
    res.reduced = this.text('reduced');
  }
  if (options.root) {
    res.root = this.text('root');
  }
  // trim applies to every text variant, not only some of them
  if (options.trim) {
    ['text', 'normal', 'clean', 'reduced', 'root'].forEach(function (key) {
      if (res[key]) {
        res[key] = res[key].trim();
      }
    });
  }
  // terms data
  if (options.terms) {
    // `terms: true` means "default term options"; use a local so the
    // caller's options object is left as it was passed in
    var termOptions = options.terms === true ? {} : options.terms;
    res.terms = this.terms().map(function (t) {
      return t.json(termOptions, world);
    });
  }
  return res;
};

var _05Json$1 = {
  json: json$1
};

/**
 * match any terms after this phrase
 * @param {*} [regs] - match input handed to `match()`; defaults to '.*'
 * @returns {*} the result of `match()` on the following terms, or `[]` when
 *   nothing follows this phrase
 */
var lookAhead = function lookAhead(regs) {
  // if empty match string, return everything after
  if (!regs) {
    regs = '.*';
  }
  var pool = this.pool;
  var all = this.terms();
  if (all.length === 0) {
    return [];
  }
  // get a list of all terms following our end, nearest first
  var terms = [];
  var term = pool.get(all[all.length - 1].next);
  while (term) {
    terms.push(term);
    term = pool.get(term.next);
  }
  if (terms.length === 0) {
    return [];
  }
  // got the terms, make a phrase from them
  var p = this.buildFrom(terms[0].id, terms.length);
  return p.match(regs);
};

/**
 * match any terms before this phrase
 * @param {*} [regs] - match input handed to `match()`; defaults to '.*'
 * @returns {*} the result of `match()` on the preceding terms, or `[]` when
 *   nothing precedes this phrase
 */
var lookBehind = function lookBehind(regs) {
  // if empty match string, return everything before
  if (!regs) {
    regs = '.*';
  }
  var pool = this.pool;
  // get a list of all terms preceding our start, nearest first
  var terms = [];
  var first = pool.get(this.start);
  var term = first ? pool.get(first.prev) : undefined;
  while (term) {
    terms.push(term);
    term = pool.get(term.prev);
  }
  if (terms.length === 0) {
    return [];
  }
  // got the terms, make a phrase from them; the last one collected is the
  // earliest in reading order
  var p = this.buildFrom(terms[terms.length - 1].id, terms.length);
  return p.match(regs);
};

var _06Lookahead = {
  lookAhead: lookAhead,
  lookBehind: lookBehind
};
/**
 * try to avoid doing the match
 * Cheap checks that can rule a match out before it is attempted.
 * @param {Object} p - phrase; `p.cache.words`, when defined, is treated as a
 *   lookup keyed by word
 * @param {Array} regs - parsed match tokens
 * @returns {boolean} true when the match cannot succeed (or `regs` is empty)
 */
var failFast = function failFast(p, regs) {
  if (regs.length === 0) {
    return true;
  }
  // borrowed from Object.prototype so the check still works when the cache
  // has no prototype, or holds a word that shadows `hasOwnProperty`
  var hasOwn = Object.prototype.hasOwnProperty;
  for (var i = 0; i < regs.length; i += 1) {
    var reg = regs[i];
    //logical quick-ones
    if (reg.optional !== true && reg.negative !== true) {
      //start/end impossibilites: a start-anchor cannot follow another token
      if (reg.start === true && i > 0) {
        return true;
      }
      // a required word that this phrase does not contain (has almost no effect)
      if (p.cache.words !== undefined && reg.word !== undefined && hasOwn.call(p.cache.words, reg.word) !== true) {
        return true;
      }
    }
    //this is not possible
    if (reg.anything === true && reg.negative === true) {
      return true;
    }
  }
  return false;
};

var _02FailFast = failFast;

// i formally apologize for how complicated this is.

/**
 * found a match? it's greedy? keep going!
 * Scans forward from `t` and returns the index where the scan stops:
 *  - the index of a term matching `until` (the next reg), or
 *  - the index at which `reg.max` terms have been counted, or
 *  - the index of the first term that fails `reg`, or
 *  - `terms.length` when the scan runs off the end.
 * Returns null when a term fails `reg` while the count (that term included)
 * is still below `reg.min`.
 * @param {Array} terms - terms being matched
 * @param {number} t - index to start scanning at
 * @param {Object} reg - the greedy token
 * @param {Object} [until] - the following token, if any
 * @param {number} index - added to `t` to form the index passed to `doesMatch`
 * @param {number} length - passed through to `doesMatch`
 * @returns {?number}
 */
var getGreedy = function getGreedy(terms, t, reg, until, index, length) {
  var start = t;
  for (; t < terms.length; t += 1) {
    //stop for next-reg match
    if (until && terms[t].doesMatch(until, index + t, length)) {
      return t;
    }
    var count = t - start + 1;
    // is it max-length now?
    if (reg.max !== undefined && count === reg.max) {
      return t;
    }
    //stop here
    if (terms[t].doesMatch(reg, index + t, length) === false) {
      // is it too short?
      if (reg.min !== undefined && count < reg.min) {
        return null;
      }
      return t;
    }
  }
  return t;
};

/**
 * 'unspecific greedy' is a weird situation.
 * Find where an "anything, greedy" token should stop.
 * @param {Array} terms - terms being matched
 * @param {number} t - index to start scanning at
 * @param {Object} [nextReg] - the token that must match next
 * @param {number} index - added to `t` to form the index passed to `doesMatch`
 * @param {number} length - passed through to `doesMatch`
 * @returns {?number} `terms.length` when there is no next token, the index of
 *   the first term matching `nextReg`, or null when none matches
 */
var greedyTo = function greedyTo(terms, t, nextReg, index, length) {
  //if there's no next one, just go off the end!
  if (!nextReg) {
    return terms.length;
  }
  //otherwise, we're looking for the next one
  for (; t < terms.length; t += 1) {
    if (terms[t].doesMatch(nextReg, index + t, length) === true) {
      return t;
    }
  }
  //guess it doesn't exist, then.
  return null;
};
return null; }; /** tries to match a sequence of terms, starting from here */ var tryHere = function tryHere(terms, regs, index, length) { var captures = []; var t = 0; // we must satisfy each rule in 'regs' for (var r = 0; r < regs.length; r += 1) { var reg = regs[r]; //should we fail here? if (!terms[t]) { //are all remaining regs optional? var hasNeeds = regs.slice(r).some(function (remain) { return !remain.optional; }); if (hasNeeds === false) { break; } // have unmet needs return false; } //support 'unspecific greedy' .* properly if (reg.anything === true && reg.greedy === true) { var skipto = greedyTo(terms, t, regs[r + 1], reg, index); // ensure it's long enough if (reg.min !== undefined && skipto - t < reg.min) { return false; } // reduce it back, if it's too long if (reg.max !== undefined && skipto - t > reg.max) { t = t + reg.max; continue; } //TODO: support [*] properly if (skipto === null) { return false; //couldn't find it } t = skipto; continue; } //if it looks like a match, continue //we have a special case where an end-anchored greedy match may need to //start matching before the actual end; we do this by (temporarily!) //removing the "end" property from the matching token... since this is //very situation-specific, we *only* do this when we really need to. if (reg.anything === true || reg.end === true && reg.greedy === true && index + t < length - 1 && terms[t].doesMatch(Object.assign({}, reg, { end: false }), index + t, length) === true || terms[t].doesMatch(reg, index + t, length) === true) { var startAt = t; // okay, it was a match, but if it optional too, // we should check the next reg too, to skip it? if (reg.optional && regs[r + 1]) { // does the next reg match it too? if (terms[t].doesMatch(regs[r + 1], index + t, length) === true) { // but does the next reg match the next term?? // only skip if it doesn't if (!terms[t + 1] || terms[t + 1].doesMatch(regs[r + 1], index + t, length) === false) { r += 1; } } } //advance to the next term! 
t += 1; //check any ending '$' flags if (reg.end === true) { //if this isn't the last term, refuse the match if (t !== terms.length && reg.greedy !== true) { return false; } } //try keep it going! if (reg.greedy === true) { // for greedy checking, we no longer care about the reg.start // value, and leaving it can cause failures for anchored greedy // matches. ditto for end-greedy matches: we need an earlier non- // ending match to succceed until we get to the actual end. t = getGreedy(terms, t, Object.assign({}, reg, { start: false, end: false }), regs[r + 1], index, length); if (t === null) { return false; //greedy was too short