UNPKG

@botonic/plugin-contentful

Version:

## What Does This Plugin Do?

287 lines 11.8 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.getMatchLength = exports.SimilarWordFinder = exports.SimilarWordResult = exports.WordsDistance = void 0;
const src_1 = require("@nlpjs/ner/src");
const src_2 = require("@nlpjs/similarity/src");
const keywords_1 = require("./keywords");
const normalizer_1 = require("./normalizer");
const tokens_1 = require("./tokens");
/**
 * Escapes every character that has a special meaning in a regular expression,
 * so that the text can be embedded literally inside a `RegExp` pattern.
 *
 * @param {string} text literal text
 * @returns {string} the text with regex metacharacters backslash-escaped
 */
function escapeRegExp(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
/**
 * Computes the distance between two words.
 *
 * The requested algorithm is stored in `algorithm`, but `distance` always
 * delegates to `leven` from `@nlpjs/similarity`.
 */
class WordsDistance {
    /**
     * @param algorithm algorithm identifier (defaults to 0, LEVENSHTEIN)
     */
    constructor(algorithm = 0 /* WordSimilarityAlgorithm.LEVENSHTEIN */) {
        this.algorithm = algorithm;
    }
    /**
     * @param left first word
     * @param right second word
     * @return the value returned by `leven(left, right)`
     */
    distance(left, right) {
        return (0, src_2.leven)(left, right);
    }
}
exports.WordsDistance = WordsDistance;
/**
 * A match of a keyword of a given candidate, together with the matched text
 * and its distance to the keyword.
 */
class SimilarWordResult {
    constructor(candidate, keyword, match, distance) {
        this.candidate = candidate;
        this.keyword = keyword;
        this.match = match;
        this.distance = distance;
    }
    /**
     * Orders results by ascending distance; on equal distance the result with
     * the longer match goes first.
     *
     * @return < 0 if this is better than other
     */
    compare(other) {
        if (this.distance === other.distance) {
            return other.match.length - this.match.length;
        }
        return this.distance - other.distance;
    }
}
exports.SimilarWordResult = SimilarWordResult;
/**
 * Internal result of the finders: the keyword, the matched text and the
 * distance between them (not yet bound to a candidate).
 */
class PartialMatch {
    constructor(keyword, match, distance) {
        this.keyword = keyword;
        this.match = match;
        this.distance = distance;
    }
}
// Sentinel distance used to flag that a keyword is too different to be a match.
const TOO_DISTANT = -1;
/**
 * It does not normalize case, ie. uppercase will be considered different than lowercase
 */
class SimilarWordFinder {
    /**
     * @param wordsAreStemmed see {@link StemmedExtraDistance}
     * @param minMatchLength min number of characters that must match so that we tolerate non-identical matches
     */
    constructor(wordsAreStemmed, minMatchLength = 3) {
        this.wordsAreStemmed = wordsAreStemmed;
        this.minMatchLength = minMatchLength;
        this.candidates = [];
    }
    /**
     *
     * @param candidate may contain several words (eg. "buenos días")
     */
    addCandidate(candidate) {
        this.candidates.push(candidate);
    }
    /**
     * Instantiates the finder strategy which implements the given match type.
     *
     * @param matchType one of the `MatchType` values handled below
     * @throws Error if the match type is not one of the handled values
     */
    createFinder(matchType) {
        switch (matchType) {
            case keywords_1.MatchType.ONLY_KEYWORDS_FOUND:
                return new FindIfOnlyWordsFromKeyword(this.wordsAreStemmed, this.minMatchLength);
            case keywords_1.MatchType.KEYWORDS_AND_OTHERS_FOUND:
                return new FindSubstring(this.wordsAreStemmed, this.minMatchLength);
            case keywords_1.MatchType.ALL_WORDS_IN_KEYWORDS_MIXED_UP:
                return new FindMixedUp(this.wordsAreStemmed, this.minMatchLength);
            default:
                throw new Error(`Unexpected matchType ${String(matchType)}`);
        }
    }
    /**
     * Searches the keywords of every registered candidate in the utterance.
     *
     * @param matchType selects the finder strategy, see `createFinder`
     * @param utterance the normalized utterance to search in
     * @param maxDistance max distance passed to the finder strategy
     * @return at most one result per candidate, see `getLongestResultPerCandidate`
     */
    find(matchType, utterance, maxDistance) {
        const finder = this.createFinder(matchType);
        const results = [];
        for (const candidate of this.candidates) {
            const matches = finder
                .find(candidate.keywords, utterance, maxDistance)
                .map(m => new SimilarWordResult(candidate.owner, m.keyword, m.match, m.distance));
            results.push(...matches);
        }
        return this.getLongestResultPerCandidate(results);
    }
    /**
     * Keeps only the best result (according to `SimilarWordResult.compare`)
     * of each candidate.
     *
     * @param results sorted in place
     * @return one result per distinct candidate, in the reverse of the sort
     * order (ie. the best overall result is the last element)
     */
    getLongestResultPerCandidate(results) {
        const sorted = results.sort((a, b) => a.compare(b));
        // avoid duplicates: the first occurrence of a candidate is its best result
        const seen = new Set();
        const uniq = [];
        for (const result of sorted) {
            if (!seen.has(result.candidate)) {
                seen.add(result.candidate);
                uniq.push(result);
            }
        }
        return uniq.reverse();
    }
}
exports.SimilarWordFinder = SimilarWordFinder;
/**
 * Base class of the finder strategies, with the distance helpers they share.
 */
class CandidateFinder {
    constructor(wordsAreStemmed, minMatchLength = 3) {
        this.wordsAreStemmed = wordsAreStemmed;
        this.minMatchLength = minMatchLength;
        this.similar = new src_1.ExtractorEnum();
        this.stemmedDecorator = new StemmedExtraDistance(wordsAreStemmed);
    }
    /**
     * Distance between the utterance text and the keyword's match string.
     *
     * @param utterance the normalized utterance (its `raw` text is used for verification)
     * @param utteranceText the utterance text to compare, see `utteranceText`
     * @param keyword keyword whose `matchString` is compared
     * @param maxDistance max tolerated distance (the stemmed extra distance may be added)
     * @return the distance, or TOO_DISTANT if it's not considered a match
     */
    getDistanceCore(utterance, utteranceText, keyword, maxDistance) {
        const kwMatchString = keyword.matchString;
        // texts not longer than minMatchLength must match exactly
        if (utteranceText.length <= this.minMatchLength) {
            return utteranceText === kwMatchString ? 0 : TOO_DISTANT;
        }
        const distance = (0, src_2.leven)(utteranceText, kwMatchString);
        if (distance > maxDistance + this.stemmedDecorator.extraDistance(kwMatchString)) {
            return TOO_DISTANT;
        }
        if (getMatchLength(utteranceText.length, kwMatchString.length, distance) < this.minMatchLength) {
            return TOO_DISTANT;
        }
        // the extra distance is only tolerated if all the keyword words appear in the utterance
        if (distance > maxDistance && !this.stemmedDecorator.verify(utterance.raw, utteranceText, keyword)) {
            return TOO_DISTANT;
        }
        return distance;
    }
    /**
     * Chooses which representation of the utterance must be compared with the keyword.
     *
     * @return the raw utterance if the keyword only has stop words; otherwise
     * the joined words or the joined stems (see comment below)
     */
    utteranceText(utterance, keyword) {
        if (keyword.hasOnlyStopWords) {
            return utterance.raw;
        }
        // If it was not stemmed (maybe because it was on a black list), we don't want to stem the matching utterance
        // in case it contains the full keyword but with a typo
        return keyword.raw === keyword.matchString
            ? normalizer_1.Word.joinedTokens(utterance.words, false)
            : utterance.stems.join(' ');
    }
}
/**
 * Matches the whole utterance text against each keyword.
 */
class FindIfOnlyWordsFromKeyword extends CandidateFinder {
    /**
     * @return a match for each keyword whose distance is not TOO_DISTANT
     */
    find(keywords, utterance, maxDistance) {
        return keywords
            .map(keyword => this.getDistance(utterance, keyword, maxDistance))
            .filter(match => match.distance !== TOO_DISTANT);
    }
    /**
     * @return the tokens match when it exists and its distance is not greater
     * than the stemmed one; otherwise the stemmed match (whose distance may be
     * TOO_DISTANT)
     */
    getDistance(utterance, keyword, maxDistance) {
        const utteranceText = this.utteranceText(utterance, keyword);
        const stemmedDistance = this.getDistanceCore(utterance, utteranceText, keyword, maxDistance);
        const stemmedMatch = new PartialMatch(keyword, utteranceText, stemmedDistance);
        // give priority to unstemmed match because it will involve more matching character
        const tokensMatch = this.getTokensMatch(utterance, keyword, maxDistance);
        if (tokensMatch && tokensMatch.distance <= stemmedDistance) {
            return tokensMatch;
        }
        return stemmedMatch;
    }
    /**
     * Compares the joined tokens of the utterance and of the keyword.
     *
     * @return a match, or undefined if their lengths differ in more than maxDistance
     */
    getTokensMatch(utterance, keyword, maxDistance) {
        const withStopWords = keyword.hasOnlyStopWords;
        const utteranceTokens = utterance.joinedTokens(withStopWords);
        const keywordTokens = keyword.joinedTokens(withStopWords);
        if (Math.abs(utteranceTokens.length - keywordTokens.length) <= maxDistance) {
            const tokensDistance = (0, src_2.leven)(utteranceTokens, keywordTokens);
            return new PartialMatch(keyword, utteranceTokens, tokensDistance);
        }
        return undefined;
    }
}
/**
 * Searches each keyword as a substring of the utterance text.
 */
class FindSubstring extends CandidateFinder {
    /**
     * @return the matches of the keywords which were found
     */
    find(keywords, utterance, maxDistance) {
        return keywords
            .map(keyword => this.findKeyword(keyword, utterance, maxDistance))
            .filter(m => !!m);
    }
    /**
     * @return the best match of the keyword within the utterance text, or
     * undefined if there's no good enough match
     */
    findKeyword(keyword, utterance, maxDistance) {
        const utteranceText = this.utteranceText(utterance, keyword);
        const wordPositions = this.similar.getWordPositions(utteranceText);
        if (keyword.matchString.length < this.minMatchLength) {
            // Too short to tolerate typos: it must appear as is between word boundaries.
            // The keyword is escaped because it's literal text, not a pattern
            // (otherwise a keyword such as "?" would throw a SyntaxError)
            if (new RegExp(`\\b${escapeRegExp(keyword.matchString)}\\b`).test(utteranceText)) {
                return new PartialMatch(keyword, keyword.matchString, 0);
            }
            return undefined;
        }
        const extra = this.stemmedDecorator.extraDistance(keyword.matchString);
        const minAccuracy = (keyword.matchString.length - (maxDistance + extra)) /
            keyword.matchString.length;
        let substrings = this.similar.getBestSubstringList(utteranceText, keyword.matchString, wordPositions, minAccuracy);
        substrings = substrings.filter((bs) => getMatchLength(bs.len, keyword.matchString.length, bs.levenshtein) >=
            this.minMatchLength);
        if (substrings.length === 0) {
            return undefined;
        }
        // highest accuracy first
        const bestSubstr = substrings.sort((s1, s2) => s2.accuracy - s1.accuracy)[0];
        const match = utteranceText.slice(bestSubstr.start, bestSubstr.end + 1);
        // converts the accuracy ratio back to a number of characters
        const distance = keyword.matchString.length -
            bestSubstr.accuracy * keyword.matchString.length;
        if (distance > maxDistance && !this.stemmedDecorator.verify(match, match, keyword)) {
            return undefined;
        }
        return new PartialMatch(keyword, match, distance);
    }
}
/**
 * Searches each word of the keyword independently as a substring of the
 * utterance, so that the words may appear in any order.
 */
class FindMixedUp extends CandidateFinder {
    constructor(wordsAreStemmed, minMatchLength = 3) {
        // the base constructor already stores wordsAreStemmed and minMatchLength
        super(wordsAreStemmed, minMatchLength);
        this.substring = new FindSubstring(wordsAreStemmed, minMatchLength);
    }
    /**
     * @return for each keyword of which all the words were found, a match
     * which joins the matches of the words with spaces and adds up their distances
     */
    find(keywords, utterance, maxDistance) {
        const matches = [];
        for (const keyword of keywords) {
            // undefined as soon as one of the keyword words is not found
            let submatches = [];
            for (const subkw of keyword.splitInWords()) {
                const match = this.substring.findKeyword(subkw, utterance, maxDistance);
                if (!match) {
                    submatches = undefined;
                    break;
                }
                submatches.push(match);
            }
            // in case the space between the words in the keyword is missing
            if ((!submatches || submatches.length === 0) && keyword.raw.includes(' ')) {
                const wordsWithoutSpace = this.substring.findKeyword(keyword, utterance, maxDistance);
                if (wordsWithoutSpace) {
                    submatches = [];
                    submatches.push(wordsWithoutSpace);
                }
            }
            // NOTE(review): if splitInWords() returned no words and the keyword has no space,
            // submatches is an empty array and an empty match with distance 0 is reported.
            // Confirm whether keywords without words can reach this point.
            if (submatches) {
                const match = submatches.reduce((m1, m2) => new PartialMatch(keyword, m1.match + (m1.match ? ' ' : '') + m2.match, m1.distance + m2.distance), new PartialMatch(keyword, '', 0));
                matches.push(match);
            }
        }
        return matches;
    }
}
/**
 * When keywords contain multiple words and they're stemmed, allow extra distance
 * in case utterance missed a space eg 'goodmorning'
 */
class StemmedExtraDistance {
    constructor(wordsAreStemmed) {
        this.wordsAreStemmed = wordsAreStemmed;
    }
    /**
     * @param keyword the keyword text, with its words separated by spaces
     * @return 3 for each word after the first one if words are stemmed and the
     * keyword has several words and more than 5 characters; otherwise 0
     */
    extraDistance(keyword) {
        if (!this.wordsAreStemmed) {
            return 0;
        }
        const wordsInKeyword = (0, tokens_1.countOccurrences)(keyword, ' ') + 1;
        if (wordsInKeyword > 1 && keyword.length > 5) {
            // in case needle is missing a space, the first word could not be stemmed.
            // So we need to ignore the suffix
            return 3 * (wordsInKeyword - 1);
        }
        return 0;
    }
    /**
     * @return true if words are not stemmed, or if every word of the keyword's
     * match string is contained in the raw or in the normalized utterance
     */
    verify(utteranceRaw, utteranceNormalized, keyword) {
        if (!this.wordsAreStemmed) {
            return true;
        }
        const words = keyword.matchString.split(' ');
        for (const word of words) {
            // checking also raw because if utterance missing a space, maybe utterance
            // is more aggressively stemmed than the keyword
            if (!utteranceRaw.includes(word) && !utteranceNormalized.includes(word)) {
                return false;
            }
        }
        return true;
    }
}
/**
 * Length of the match between an utterance and a keyword, computed as the
 * shortest of both lengths, minus the distance, plus the difference of lengths.
 *
 * @param utteranceLen length of the utterance text
 * @param keywordLen length of the keyword text
 * @param distance distance between both texts
 */
function getMatchLength(utteranceLen, keywordLen, distance) {
    const difLen = Math.abs(utteranceLen - keywordLen);
    return Math.min(utteranceLen, keywordLen) - distance + difLen;
}
exports.getMatchLength = getMatchLength;
//# sourceMappingURL=similar-words.js.map