/**
*
* @module jfseb.fdevstart.analyze
* @file erbase
* @copyright (c) 2016 Gerd Forstmann
*
* Basic domain based entity recognition
*/
;
Object.defineProperty(exports, "__esModule", { value: true });
var WordMatch = require("./inputFilter");
var debug = require("debug");
var debuglog = debug('erbase');
var debuglogV = debug('erbase');
var perflog = debug('perf');
var fdevsta_monmove_1 = require("fdevsta_monmove");
var ERError = require("./ererror");
var AnyObject = Object;
function mockDebug(o) {
debuglog = o;
debuglogV = o;
perflog = o;
}
exports.mockDebug = mockDebug;
var utils = require("abot_utils");
var Sentence = require("./sentence");
/**
 * Given a string, break it down into components,
 * e.g. [['A', 'B'], ['A B']],
 *
 * then categorize the words, returning for instance
 *
 * [ [ { category: 'systemId',    word: 'A' },
 *     { category: 'otherthing',  word: 'A' }
 *   ],
 *   // result of B
 *   [ { category: 'systemId',    word: 'B' },
 *     { category: 'otherthing',  word: 'B' },
 *     { category: 'anothertype', word: 'B' }
 *   ]
 * ]
 */
function tokenizeString(sString, rules, words) {
var cnt = 0;
var fac = 1;
var tokens = fdevsta_monmove_1.BreakDown.tokenizeString(sString);
if (debuglog.enabled) {
debuglog("here breakdown" + JSON.stringify(tokens));
}
//console.log(JSON.stringify(u));
words = words || {};
perflog('this many known words: ' + Object.keys(words).length);
var res = [];
var cntRec = {};
var categorizedSentence = [];
var hasRecombined = false;
tokens.tokens.forEach(function (token, index) {
var seenIt = WordMatch.categorizeAWordWithOffsets(token, rules, sString, words, cntRec);
/* cannot have this, or need to add all fragment words "UI2 Integration" if(seenIt.length === 0) {
return false;
}
*/
hasRecombined = hasRecombined || !seenIt.every(function (res) { return !res.rule.range; });
debuglog(debuglog.enabled ? (" categorized " + token + "/" + index + " to " + JSON.stringify(seenIt))
: "-");
debuglog(debuglog.enabled ? (" categorized " + token + "/" + index + " to " +
seenIt.map(function (it, idx) { return " " + it.rule.wordType + " " + idx + " " + it.rule.bitindex + " " + it.rule.matchedString + "/" + it.rule.category + " "; }).join("\n"))
: "-");
categorizedSentence[index] = seenIt;
cnt = cnt + seenIt.length;
fac = fac * seenIt.length;
});
// have seen the plain categorization,
debuglog(" sentences " + tokens.tokens.length + " matches " + cnt + " fac: " + fac);
if (debuglog.enabled && tokens.tokens.length) {
debuglog("first match " + JSON.stringify(tokens, undefined, 2));
}
debuglog(debuglog.enabled ? " prior RangeRule " + JSON.stringify(categorizedSentence) + " " : '-');
if (hasRecombined) {
evaluateRangeRulesToPosition(tokens.tokens, tokens.fusable, categorizedSentence);
}
debuglog(debuglog.enabled ? " after RangeRule " + JSON.stringify(categorizedSentence) + " " : '-');
perflog(" sentences " + tokens.tokens.length + " / " + res.length + " matches " + cnt + " fac: " + fac + " rec : " + JSON.stringify(cntRec, undefined, 2));
return {
fusable: tokens.fusable,
tokens: tokens.tokens,
categorizedWords: categorizedSentence
};
}
exports.tokenizeString = tokenizeString;
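/* Illustrative usage sketch (not executed here). The `rules` object is assumed to be
 * the match rule set built elsewhere (see ./inputFilter); the tokens and categories
 * below are hypothetical:
 *
 *   var res = tokenizeString('start UI2 Integration', rules, {});
 *   // res.tokens           -> ['start', 'UI2', 'Integration']
 *   // res.fusable          -> fusability flags as produced by BreakDown.tokenizeString
 *   // res.categorizedWords -> one array of categorized match records per token
 */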
function isSameRes(present, res) {
if (!((present.rule.matchedString === res.rule.matchedString)
&& (present.rule.category === res.rule.category)
&& (present.span === res.span)
&& (present.rule.bitindex === res.rule.bitindex))) {
return 0;
}
if (present._ranking < res._ranking) {
return -1;
}
return +1;
}
exports.isSameRes = isSameRes;
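/* Return value contract of isSameRes, illustrated:
 *    0 -> `present` and `res` describe different matches (matchedString, category,
 *         span or bitindex differ);
 *   -1 -> same match, but `present` has the lower _ranking (caller replaces it);
 *   +1 -> same match, `present` ranks at least as high (caller keeps it).
 */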
function mergeIgnoreOrAppend(result, res) {
var insertindex = -1;
var foundNothing = result.every(function (present, index) {
var r = isSameRes(present, res);
if (r < 0) {
//console.log("overwriting worse \n" + JSON.stringify(res) + '\n' + JSON.stringify(present)+ '\n');
result[index] = res;
return false;
}
else if (r > 0) {
//console.log('skipping present');
return false;
}
return true;
});
if (foundNothing) {
//debuglog('pushing');
result.push(res);
}
}
exports.mergeIgnoreOrAppend = mergeIgnoreOrAppend;
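/* Illustrative behaviour (rankings hypothetical):
 *   var result = [ matchWithRanking09 ];
 *   mergeIgnoreOrAppend(result, sameMatchWithRanking08);  // ignored, 0.8 < 0.9
 *   mergeIgnoreOrAppend(result, sameMatchWithRanking095); // replaces the 0.9 entry
 *   mergeIgnoreOrAppend(result, unrelatedMatch);          // appended as a new entry
 */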
function evaluateRangeRulesToPosition(tokens, fusable, categorizedWords) {
debuglog(debuglog.enabled ? ("evaluateRangeRulesToPosition... " + JSON.stringify(categorizedWords)) : '-');
categorizedWords.forEach(function (wordlist, index) {
wordlist.forEach(function (word) {
if (word.rule.range) {
//console.log(` got targetindex for RangeRules evaluation : ${targetIndex} ${index} ${fusable.join(" ")}`);
var targetIndex = fdevsta_monmove_1.BreakDown.isCombinableRangeReturnIndex(word.rule.range, fusable, index);
//console.log(` got targetindex for RangeRules evaluation : ${targetIndex}`);
if (targetIndex >= 0) {
var combinedWord = fdevsta_monmove_1.BreakDown.combineTokens(word.rule.range, index, tokens);
debuglog(debuglog.enabled ? (" test \"" + combinedWord + "\" against \"" + word.rule.range.rule.lowercaseword + "\" " + JSON.stringify(word.rule.range.rule)) : '-');
var res = WordMatch.categorizeWordWithOffsetWithRankCutoffSingle(combinedWord, word.rule.range.rule);
debuglog(debuglog.enabled ? (" got res : " + JSON.stringify(res)) : '-');
if (res) {
res.span = word.rule.range.high - word.rule.range.low + 1;
categorizedWords[targetIndex] = categorizedWords[targetIndex].slice(0); // avoid invalidation of seenit
debuglog("pushed sth at " + targetIndex);
mergeIgnoreOrAppend(categorizedWords[targetIndex], res);
// categorizedWords[targetIndex].push(res); // check that this does not invalidate seenit!
}
}
}
});
});
// filter all range rules !
categorizedWords.forEach(function (wordlist, index) {
categorizedWords[index] = wordlist.filter(function (word) { return !word.rule.range; });
});
}
exports.evaluateRangeRulesToPosition = evaluateRangeRulesToPosition;
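/* Example (hypothetical rule set): for tokens ['UI2', 'Integration'] where a range
 * rule attached to 'UI2' covers the following token and the two positions are fusable,
 * the combined word 'UI2 Integration' is re-categorized; a successful match gets
 * span = 2 and is merged into the target position, and all raw range-rule entries
 * are filtered out afterwards.
 */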
var clone = utils.cloneDeep;
function copyVecMembers(u) {
var i = 0;
for (i = 0; i < u.length; ++i) {
u[i] = clone(u[i]);
}
return u;
}
// when expanding we could replicate either the head or the tail;
// we replicate the tail, as it is smaller (e.g. for [a, b, c]).
function isSpanVec(vec, index) {
var effectivelen = vec.reduce(function (prev, mem) { return prev += mem.span ? mem.span : 1; }, 0);
return effectivelen > index;
}
exports.isSpanVec = isSpanVec;
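/* Example: for vec = [ { span: 3 } ] the effective length is 3, so
 * isSpanVec(vec, 1) and isSpanVec(vec, 2) are true (those token positions are
 * already covered by the span) while isSpanVec(vec, 3) is false.
 */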
/**
 * expand an array [[a1,a2], [b1,b2], [c]]
 * into all combinations,
 *
 * with the special property that if e.g. a1 has a span of three, the
 * variants of the token positions covered by that span are skipped
 */
function expandTokenMatchesToSentences(tokens, tokenMatches) {
var a = [];
var wordMatches = [];
debuglogV(debuglogV.enabled ? JSON.stringify(tokenMatches) : '-');
tokenMatches.forEach(function (aWordMatches, wordIndex) {
wordMatches[wordIndex] = [];
aWordMatches.forEach(function (oWordVariant, wordVariantIndex) {
wordMatches[wordIndex][wordVariantIndex] = oWordVariant;
});
});
debuglog(debuglog.enabled ? JSON.stringify(tokenMatches) : '-');
var result = {
errors: [],
tokens: tokens,
sentences: []
};
var nvecs = [];
var res = [[]];
// var nvecs = [];
var rvec = [];
for (var tokenIndex = 0; tokenIndex < tokenMatches.length; ++tokenIndex) {
//vecs is the vector of all so far seen variants up to k length.
var nextBase = [];
//independent of existence of matches on level k, we retain all vectors which are covered by a span
// we skip extending them below
for (var u = 0; u < res.length; ++u) {
if (isSpanVec(res[u], tokenIndex)) {
nextBase.push(res[u]);
}
}
var lenMatches = tokenMatches[tokenIndex].length;
if (nextBase.length === 0 && lenMatches === 0) {
// the word at index I cannot be understood
//if (result.errors.length === 0) {
result.errors.push(ERError.makeError_NO_KNOWN_WORD(tokenIndex, tokens));
//}
}
for (var l = 0; l < lenMatches; ++l) {
//debuglog("vecs now" + JSON.stringify(vecs));
var nvecs = []; //vecs.slice(); // copy the vec[i] base vector;
//debuglog("vecs copied now" + JSON.stringify(nvecs));
for (var u = 0; u < res.length; ++u) {
if (!isSpanVec(res[u], tokenIndex)) {
// for each so far constructed result (of length k) in res
nvecs.push(res[u].slice()); // make a copy of each vector
nvecs[nvecs.length - 1] = copyVecMembers(nvecs[nvecs.length - 1]);
// debuglog("copied vecs["+ u+"]" + JSON.stringify(vecs[u]));
nvecs[nvecs.length - 1].push(clone(tokenMatches[tokenIndex][l])); // push the lth variant
// debuglog("now nvecs " + nvecs.length + " " + JSON.stringify(nvecs));
}
}
// debuglog(" at " + k + ":" + l + " nextbase >" + JSON.stringify(nextBase))
// debuglog(" append " + k + ":" + l + " nvecs >" + JSON.stringify(nvecs))
nextBase = nextBase.concat(nvecs);
// debuglog(" result " + k + ":" + l + " nvecs >" + JSON.stringify(nextBase))
} //constru
// debuglog("now at " + k + ":" + l + " >" + JSON.stringify(nextBase))
res = nextBase;
}
debuglogV(debuglogV.enabled ? ("APPENDING TO RES" + 0 + ":" + l + " >" + JSON.stringify(nextBase)) : '-');
result.sentences = res;
return result;
}
exports.expandTokenMatchesToSentences = expandTokenMatchesToSentences;
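/* The expansion above, illustrated with hypothetical matches:
 *   [[a1, a2], [b1], [c1, c2]] expands to
 *   [ [a1, b1, c1], [a1, b1, c2], [a2, b1, c1], [a2, b1, c2] ];
 * if a1 instead carries span = 3, [a1] alone already covers all three token
 * positions and is kept without being extended by the b/c variants.
 */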
/**
 * expand an array [[a1,a2], [b1,b2], [c]]
 * into all combinations,
 *
 * with the special property that if e.g. a1 has a span of three, the
 * variants of the token positions covered by that span are skipped;
 * in addition, sentences whose words do not share a common bit in
 * bitSentenceAnd are filtered out
 */
function expandTokenMatchesToSentences2(tokens, tokenMatches) {
var a = [];
var wordMatches = [];
debuglogV(debuglogV.enabled ? JSON.stringify(tokenMatches) : '-');
tokenMatches.forEach(function (aWordMatches, wordIndex) {
wordMatches[wordIndex] = [];
aWordMatches.forEach(function (oWordVariant, wordVariantIndex) {
wordMatches[wordIndex][wordVariantIndex] = oWordVariant;
});
});
debuglog(debuglog.enabled ? JSON.stringify(tokenMatches) : '-');
var result = {
errors: [],
tokens: tokens,
sentences: []
};
var nvecs = [];
var res = [[]];
// var nvecs = [];
var rvec = [];
for (var tokenIndex = 0; tokenIndex < tokenMatches.length; ++tokenIndex) {
//vecs is the vector of all so far seen variants up to k length.
var nextBase = [];
//independent of existence of matches on level k, we retain all vectors which are covered by a span
// we skip extending them below
for (var u = 0; u < res.length; ++u) {
if (isSpanVec(res[u], tokenIndex)) {
nextBase.push(res[u]);
}
}
var lenMatches = tokenMatches[tokenIndex].length;
if (nextBase.length === 0 && lenMatches === 0) {
// the word at index I cannot be understood
//if (result.errors.length === 0) {
result.errors.push(ERError.makeError_NO_KNOWN_WORD(tokenIndex, tokens));
//}
}
for (var l = 0; l < lenMatches; ++l) {
//debuglog("vecs now" + JSON.stringify(vecs));
var nvecs = []; //vecs.slice(); // copy the vec[i] base vector;
//debuglog("vecs copied now" + JSON.stringify(nvecs));
for (var u = 0; u < res.length; ++u) {
if (!isSpanVec(res[u], tokenIndex)) {
// for each so far constructed result (of length k) in res
nvecs.push(res[u].slice()); // make a copy of each vector
nvecs[nvecs.length - 1] = copyVecMembers(nvecs[nvecs.length - 1]);
// debuglog("copied vecs["+ u+"]" + JSON.stringify(vecs[u]));
nvecs[nvecs.length - 1].push(clone(tokenMatches[tokenIndex][l])); // push the lth variant
// debuglog("now nvecs " + nvecs.length + " " + JSON.stringify(nvecs));
}
}
// debuglog(" at " + k + ":" + l + " nextbase >" + JSON.stringify(nextBase))
// debuglog(" append " + k + ":" + l + " nvecs >" + JSON.stringify(nvecs))
nextBase = nextBase.concat(nvecs);
// debuglog(" result " + k + ":" + l + " nvecs >" + JSON.stringify(nextBase))
} //constru
// debuglog("now at " + k + ":" + l + " >" + JSON.stringify(nextBase))
res = nextBase;
}
debuglogV(debuglogV.enabled ? ("APPENDING TO RES" + 0 + ":" + l + " >" + JSON.stringify(nextBase)) : '-');
res = res.filter(function (sentence, index) {
var full = 0xFFFFFFFF;
//console.log(`sentence ${index} \n`)
return sentence.every(function (word, index2) {
full = full & word.rule.bitSentenceAnd;
// console.log(` word ${index2} ${full} ${word.matchedString} ${tokens[index2]} \n`);
return full !== 0;
});
});
result.sentences = res;
return result;
}
exports.expandTokenMatchesToSentences2 = expandTokenMatchesToSentences2;
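/* The additional filter above AND-s the bitSentenceAnd masks of all words in a
 * candidate sentence and drops the sentence once the running mask becomes 0, i.e.
 * once the words no longer share a common domain bit. Example (masks hypothetical):
 * 0x6 & 0x4 = 0x4 keeps the sentence; a further word with mask 0x1 yields 0 and
 * the sentence is discarded.
 */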
function processString(query, rules, words) {
words = words || {};
if (!process.env.ABOT_NO_TEST1) {
return processString2(query, rules, words);
}
var tokenStruct = tokenizeString(query, rules, words);
evaluateRangeRulesToPosition(tokenStruct.tokens, tokenStruct.fusable, tokenStruct.categorizedWords);
if (debuglog.enabled) {
debuglog("After matched " + JSON.stringify(tokenStruct.categorizedWords));
}
var aSentences = expandTokenMatchesToSentences(tokenStruct.tokens, tokenStruct.categorizedWords);
if (debuglog.enabled) {
debuglog("after expand" + aSentences.sentences.map(function (oSentence) {
return Sentence.rankingProduct(oSentence) + ":" + Sentence.dumpNice(oSentence); //JSON.stringify(oSentence);
}).join("\n"));
}
aSentences.sentences = WordMatch.reinForce(aSentences.sentences);
if (debuglog.enabled) {
debuglog("after reinforce" + aSentences.sentences.map(function (oSentence) {
return Sentence.rankingProduct(oSentence) + ":" + JSON.stringify(oSentence);
}).join("\n"));
}
return aSentences;
}
exports.processString = processString;
function processString2(query, rules, words) {
words = words || {};
var tokenStruct = tokenizeString(query, rules, words);
evaluateRangeRulesToPosition(tokenStruct.tokens, tokenStruct.fusable, tokenStruct.categorizedWords);
if (debuglog.enabled) {
debuglog("After matched " + JSON.stringify(tokenStruct.categorizedWords));
}
var aSentences = expandTokenMatchesToSentences2(tokenStruct.tokens, tokenStruct.categorizedWords);
if (debuglog.enabled) {
debuglog("after expand" + aSentences.sentences.map(function (oSentence) {
return Sentence.rankingProduct(oSentence) + ":" + Sentence.dumpNice(oSentence); //JSON.stringify(oSentence);
}).join("\n"));
}
aSentences.sentences = WordMatch.reinForce(aSentences.sentences);
if (debuglog.enabled) {
debuglog("after reinforce" + aSentences.sentences.map(function (oSentence) {
return Sentence.rankingProduct(oSentence) + ":" + JSON.stringify(oSentence);
}).join("\n"));
}
return aSentences;
}
exports.processString2 = processString2;
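/* Illustrative usage sketch; `rules` stands for the rule set built by the calling
 * code (see ./inputFilter) and the query is arbitrary:
 *
 *   var erbase = require('./erbase');
 *   var res = erbase.processString('start UI2 in uv2', rules, {});
 *   // res.sentences -> ranked candidate sentences (arrays of categorized words)
 *   // res.errors    -> e.g. NO_KNOWN_WORD entries for tokens that matched nothing
 *   // (with ABOT_NO_TEST1 unset, processString delegates to processString2)
 */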
function simplifySentence(res) {
return res.map(function (r) {
return r.map(function (word) { return word.string + '=>' + word.matchedString + '/' + word.category + (word.span ? '/' + word.span : ''); });
});
}
exports.simplifySentence = simplifySentence;
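/* Example output shape (values hypothetical):
 *   [ [ 'uv2=>UV2/systemId', 'start=>start/verb' ] ]
 * i.e. '<input word>=><matched string>/<category>', with '/<span>' appended for
 * multi-token matches.
 */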
//# sourceMappingURL=erbase.js.map