@botonic/plugin-contentful
Version:
## What Does This Plugin Do?
287 lines • 11.8 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.getMatchLength = exports.SimilarWordFinder = exports.SimilarWordResult = exports.WordsDistance = void 0;
const src_1 = require("@nlpjs/ner/src");
const src_2 = require("@nlpjs/similarity/src");
const keywords_1 = require("./keywords");
const normalizer_1 = require("./normalizer");
const tokens_1 = require("./tokens");
/**
 * Computes the edit distance between two words.
 */
class WordsDistance {
    /**
     * @param algorithm identifier of the similarity algorithm. It is only stored
     * on the instance: distance() always delegates to leven, whatever its value
     */
    constructor(algorithm = 0 /* WordSimilarityAlgorithm.LEVENSHTEIN */) {
        Object.assign(this, { algorithm });
    }
    /**
     * @return whatever leven (from "@nlpjs/similarity/src") returns for the pair
     */
    distance(left, right) {
        const { leven } = src_2;
        return leven(left, right);
    }
}
exports.WordsDistance = WordsDistance;
/**
 * A keyword match attributed to the candidate that owns the keyword.
 */
class SimilarWordResult {
    /**
     * @param candidate owner of the matched keyword
     * @param keyword the keyword that matched
     * @param match the text that matched the keyword
     * @param distance distance between match and keyword (lower is better)
     */
    constructor(candidate, keyword, match, distance) {
        Object.assign(this, { candidate, keyword, match, distance });
    }
    /**
     * Orders results by ascending distance; on equal distance the result with
     * the longer match goes first.
     *
     * @return < 0 if this is better than other
     */
    compare(other) {
        const sameDistance = this.distance == other.distance;
        return sameDistance
            ? other.match.length - this.match.length
            : this.distance - other.distance;
    }
}
exports.SimilarWordResult = SimilarWordResult;
/**
 * A keyword match that has not yet been attributed to a candidate.
 */
class PartialMatch {
    /**
     * @param keyword the keyword that was searched for
     * @param match the text that matched the keyword
     * @param distance distance between match and keyword
     */
    constructor(keyword, match, distance) {
        Object.assign(this, { keyword, match, distance });
    }
}
// Sentinel distance returned by CandidateFinder.getDistanceCore when a keyword
// is rejected; FindIfOnlyWordsFromKeyword.find drops matches that carry it.
const TOO_DISTANT = -1;
/**
 * Searches an utterance for the keywords of a set of registered candidates.
 *
 * It does not normalize case, ie. uppercase will be considered different than lowercase
 */
class SimilarWordFinder {
    /**
     * @param wordsAreStemmed see {@link StemmedExtraDistance}
     * @param minMatchLength min number of characters that must match so that we tolerate non-identical matches
     */
    constructor(wordsAreStemmed, minMatchLength = 3) {
        this.wordsAreStemmed = wordsAreStemmed;
        this.minMatchLength = minMatchLength;
        this.candidates = [];
    }
    /**
     * Registers a candidate to be considered by later calls to find().
     *
     * @param candidate may contain several words (eg. "buenos días")
     */
    addCandidate(candidate) {
        this.candidates.push(candidate);
    }
    /**
     * Picks the finder strategy that implements the requested match type.
     *
     * @throws Error when matchType is not one of the supported values
     */
    createFinder(matchType) {
        const finderArgs = () => [this.wordsAreStemmed, this.minMatchLength];
        if (matchType === keywords_1.MatchType.ONLY_KEYWORDS_FOUND) {
            return new FindIfOnlyWordsFromKeyword(...finderArgs());
        }
        if (matchType === keywords_1.MatchType.KEYWORDS_AND_OTHERS_FOUND) {
            return new FindSubstring(...finderArgs());
        }
        if (matchType === keywords_1.MatchType.ALL_WORDS_IN_KEYWORDS_MIXED_UP) {
            return new FindMixedUp(...finderArgs());
        }
        throw new Error(`Unexpected matchType ${String(matchType)}`);
    }
    /**
     * Runs the finder for matchType against every registered candidate.
     *
     * @return at most one result per candidate, see getLongestResultPerCandidate
     */
    find(matchType, utterance, maxDistance) {
        const strategy = this.createFinder(matchType);
        const collected = [];
        for (const candidate of this.candidates) {
            const partials = strategy.find(candidate.keywords, utterance, maxDistance);
            for (const partial of partials) {
                collected.push(new SimilarWordResult(candidate.owner, partial.keyword, partial.match, partial.distance));
            }
        }
        return this.getLongestResultPerCandidate(collected);
    }
    /**
     * Keeps only the best result of each candidate (lowest distance, then
     * longest match, as decided by compare()).
     *
     * Note that results is sorted in place, and that the returned array lists
     * the kept results from worst to best.
     */
    getLongestResultPerCandidate(results) {
        const ranked = results.sort((a, b) => a.compare(b));
        // a result is kept only if no better-ranked result has the same candidate
        const isBestOfItsCandidate = (result, position) => {
            for (let earlier = 0; earlier < position; earlier++) {
                if (ranked[earlier].candidate === result.candidate) {
                    return false;
                }
            }
            return true;
        };
        return ranked.filter(isBestOfItsCandidate).reverse();
    }
}
exports.SimilarWordFinder = SimilarWordFinder;
/**
 * Base class of the finder strategies: holds the shared settings and the
 * distance computation between an utterance text and a keyword.
 */
class CandidateFinder {
    /**
     * @param wordsAreStemmed forwarded to StemmedExtraDistance
     * @param minMatchLength min number of matching characters required to
     * tolerate a non-identical match
     */
    constructor(wordsAreStemmed, minMatchLength = 3) {
        this.wordsAreStemmed = wordsAreStemmed;
        this.minMatchLength = minMatchLength;
        this.similar = new src_1.ExtractorEnum();
        this.stemmedDecorator = new StemmedExtraDistance(wordsAreStemmed);
    }
    /**
     * @param utterance only its raw text is read here, for the stemmed verification
     * @param utteranceText the text actually compared against the keyword
     * @return the leven distance between utteranceText and keyword.matchString,
     * or TOO_DISTANT when the match is rejected
     */
    getDistanceCore(utterance, utteranceText, keyword, maxDistance) {
        const target = keyword.matchString;
        // texts this short are only accepted when they are equal to the keyword
        if (utteranceText.length <= this.minMatchLength) {
            if (utteranceText == target) {
                return 0;
            }
            return TOO_DISTANT;
        }
        const editDistance = (0, src_2.leven)(utteranceText, target);
        const tolerated = maxDistance + this.stemmedDecorator.extraDistance(target);
        if (editDistance > tolerated) {
            return TOO_DISTANT;
        }
        const matchedChars = getMatchLength(utteranceText.length, target.length, editDistance);
        if (matchedChars < this.minMatchLength) {
            return TOO_DISTANT;
        }
        // distances that are only tolerated thanks to the extra distance need
        // to be confirmed by the stemmed decorator
        const reliesOnExtraDistance = editDistance > maxDistance;
        if (reliesOnExtraDistance &&
            !this.stemmedDecorator.verify(utterance.raw, utteranceText, keyword)) {
            return TOO_DISTANT;
        }
        return editDistance;
    }
    /**
     * Chooses which representation of the utterance is compared against the keyword.
     *
     * @return the raw utterance when the keyword has only stop words, otherwise
     * the joined words or the joined stems of the utterance (see below)
     */
    utteranceText(utterance, keyword) {
        if (keyword.hasOnlyStopWords) {
            return utterance.raw;
        }
        const keywordWasStemmed = keyword.raw != keyword.matchString;
        if (keywordWasStemmed) {
            return utterance.stems.join(' ');
        }
        // If it was not stemmed (maybe because it was on a black list), we don't want to stem the matching utterance
        // in case it contains the full keyword but with a typo
        return normalizer_1.Word.joinedTokens(utterance.words, false);
    }
}
/**
 * Compares the whole utterance against each keyword.
 */
class FindIfOnlyWordsFromKeyword extends CandidateFinder {
    /**
     * @return the matches of the keywords whose distance is not TOO_DISTANT
     */
    find(keywords, utterance, maxDistance) {
        return keywords.reduce((accepted, keyword) => {
            const candidate = this.getDistance(utterance, keyword, maxDistance);
            if (candidate.distance != TOO_DISTANT) {
                accepted.push(candidate);
            }
            return accepted;
        }, []);
    }
    /**
     * Computes both the distance given by getDistanceCore and the distance
     * between the joined tokens, and picks one of the two matches.
     *
     * @return the tokens match when it exists and its distance is not greater
     * than the getDistanceCore distance; otherwise the getDistanceCore match
     * (whose distance may be TOO_DISTANT)
     */
    getDistance(utterance, keyword, maxDistance) {
        const text = this.utteranceText(utterance, keyword);
        const coreDistance = this.getDistanceCore(utterance, text, keyword, maxDistance);
        // give priority to unstemmed match because it will involve more matching character
        const viaTokens = this.getTokensMatch(utterance, keyword, maxDistance);
        const preferTokens = viaTokens && viaTokens.distance <= coreDistance;
        return preferTokens
            ? viaTokens
            : new PartialMatch(keyword, text, coreDistance);
    }
    /**
     * @return the match between the joined tokens of utterance and keyword, or
     * undefined when their lengths differ by more than maxDistance
     */
    getTokensMatch(utterance, keyword, maxDistance) {
        const includeStopWords = keyword.hasOnlyStopWords;
        const utteranceJoined = utterance.joinedTokens(includeStopWords);
        const keywordJoined = keyword.joinedTokens(includeStopWords);
        const lengthGap = Math.abs(utteranceJoined.length - keywordJoined.length);
        const lengthsAreClose = lengthGap <= maxDistance;
        if (!lengthsAreClose) {
            return undefined;
        }
        const joinedDistance = (0, src_2.leven)(utteranceJoined, keywordJoined);
        return new PartialMatch(keyword, utteranceJoined, joinedDistance);
    }
}
/**
 * Looks for each keyword as an approximate substring of the utterance, so the
 * utterance may contain other text around the keyword.
 */
class FindSubstring extends CandidateFinder {
    /**
     * @return one PartialMatch per keyword that was found in the utterance
     */
    find(keywords, utterance, maxDistance) {
        return keywords
            .map(keyword => this.findKeyword(keyword, utterance, maxDistance))
            .filter(m => !!m);
    }
    /**
     * @param keyword its matchString is the text searched for
     * @param maxDistance max distance tolerated between keyword and substring
     * (the stemmed decorator may grant some extra distance)
     * @return the best match of the keyword within the utterance, or undefined
     * when there is no acceptable match
     */
    findKeyword(keyword, utterance, maxDistance) {
        const utteranceText = this.utteranceText(utterance, keyword);
        const needle = keyword.matchString;
        if (needle.length < this.minMatchLength) {
            // Keywords this short only match exactly and between word boundaries.
            // The keyword is escaped because it is literal text, not a pattern:
            // otherwise "c+" would match "cc" and "(" would throw a SyntaxError.
            const literal = needle.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
            if (new RegExp(`\\b${literal}\\b`).test(utteranceText)) {
                return new PartialMatch(keyword, needle, 0);
            }
            return undefined;
        }
        // only needed for the approximate search, so not computed for short keywords
        const wordPositions = this.similar.getWordPositions(utteranceText);
        const extra = this.stemmedDecorator.extraDistance(needle);
        // translate the tolerated distance into the min accuracy asked to the extractor
        const minAccuracy = (needle.length - (maxDistance + extra)) / needle.length;
        const substrings = this.similar
            .getBestSubstringList(utteranceText, needle, wordPositions, minAccuracy)
            .filter((bs) => getMatchLength(bs.len, needle.length, bs.levenshtein) >=
            this.minMatchLength);
        if (substrings.length === 0) {
            return undefined;
        }
        const bestSubstr = substrings.sort((s1, s2) => s2.accuracy - s1.accuracy)[0];
        // bestSubstr.end is inclusive
        const match = utteranceText.slice(bestSubstr.start, bestSubstr.end + 1);
        const distance = needle.length - bestSubstr.accuracy * needle.length;
        // matches only tolerated thanks to the extra distance must be verified
        if (distance > maxDistance &&
            !this.stemmedDecorator.verify(match, match, keyword)) {
            return undefined;
        }
        return new PartialMatch(keyword, match, distance);
    }
}
/**
 * Finds keywords whose words all appear in the utterance, each word being
 * searched for independently with FindSubstring.
 */
class FindMixedUp extends CandidateFinder {
    constructor(wordsAreStemmed, minMatchLength = 3) {
        super(wordsAreStemmed, minMatchLength);
        this.wordsAreStemmed = wordsAreStemmed;
        this.minMatchLength = minMatchLength;
        this.substring = new FindSubstring(wordsAreStemmed, minMatchLength);
    }
    /**
     * @return one PartialMatch per matching keyword. Its match is the
     * space-separated matches of the keyword words and its distance is the sum
     * of their distances
     */
    find(keywords, utterance, maxDistance) {
        // matches of every word of the keyword, or undefined as soon as one is missing
        const matchEveryWord = (keyword) => {
            const parts = [];
            for (const word of keyword.splitInWords()) {
                const hit = this.substring.findKeyword(word, utterance, maxDistance);
                if (!hit) {
                    return undefined;
                }
                parts.push(hit);
            }
            return parts;
        };
        const found = [];
        for (const keyword of keywords) {
            let parts = matchEveryWord(keyword);
            const nothingMatched = !parts || parts.length == 0;
            // in case the space between the words in the keyword is missing
            if (nothingMatched && keyword.raw.includes(' ')) {
                const wordsWithoutSpace = this.substring.findKeyword(keyword, utterance, maxDistance);
                if (wordsWithoutSpace) {
                    parts = [wordsWithoutSpace];
                }
            }
            if (!parts) {
                continue;
            }
            // NOTE(review): if splitInWords() can return no words, parts is empty
            // here and an empty match with distance 0 is reported — confirm intended
            let joinedMatch = '';
            let totalDistance = 0;
            for (const part of parts) {
                joinedMatch = joinedMatch + (joinedMatch ? ' ' : '') + part.match;
                totalDistance = totalDistance + part.distance;
            }
            found.push(new PartialMatch(keyword, joinedMatch, totalDistance));
        }
        return found;
    }
}
/**
 * When keywords contain multiple words and they're stemmed, allow extra distance
 * in case utterance missed a space eg 'goodmorning'
 */
class StemmedExtraDistance {
    /**
     * @param wordsAreStemmed when falsy, no extra distance is ever granted and
     * verify() always succeeds
     */
    constructor(wordsAreStemmed) {
        this.wordsAreStemmed = wordsAreStemmed;
    }
    /**
     * @param keyword text of the keyword, with its words separated by spaces
     * @return 3 per space of the keyword when words are stemmed and the keyword
     * has several words and more than 5 characters; 0 otherwise
     */
    extraDistance(keyword) {
        if (this.wordsAreStemmed) {
            const wordCount = (0, tokens_1.countOccurrences)(keyword, ' ') + 1;
            const isLongMultiWord = wordCount > 1 && keyword.length > 5;
            if (isLongMultiWord) {
                // in case needle is missing a space, the first word could not be stemmed.
                // So we need to ignore the suffix
                return 3 * (wordCount - 1);
            }
        }
        return 0;
    }
    /**
     * @return true when words are not stemmed, or when every word of
     * keyword.matchString is contained in the raw or in the normalized utterance
     */
    verify(utteranceRaw, utteranceNormalized, keyword) {
        if (!this.wordsAreStemmed) {
            return true;
        }
        // checking also raw because if utterance missing a space, maybe utterance
        // is more aggressively stemmed than the keyword
        return keyword.matchString
            .split(' ')
            .every(word => utteranceRaw.includes(word) || utteranceNormalized.includes(word));
    }
}
/**
 * Estimates how many characters of utterance and keyword match.
 *
 * @param utteranceLen length of the utterance text
 * @param keywordLen length of the keyword text
 * @param distance distance between both texts
 * @return the shorter length minus the distance, plus the difference between both lengths
 */
function getMatchLength(utteranceLen, keywordLen, distance) {
    const shorterLen = Math.min(utteranceLen, keywordLen);
    const lengthGap = Math.abs(utteranceLen - keywordLen);
    const matchedOfShorter = shorterLen - distance;
    return matchedOfShorter + lengthGap;
}
exports.getMatchLength = getMatchLength;
//# sourceMappingURL=similar-words.js.map