node-nlp
Version:
Library for NLU (Natural Language Understanding) done in Node.js
436 lines (407 loc) • 15.7 kB
HTML
<html lang="en">
<head>
<meta charset="utf-8">
<title>JSDoc: Source: util/similar-search.js</title>
<script src="scripts/prettify/prettify.js"> </script>
<script src="scripts/prettify/lang-css.js"> </script>
<!--[if lt IE 9]>
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
<link type="text/css" rel="stylesheet" href="styles/prettify-tomorrow.css">
<link type="text/css" rel="stylesheet" href="styles/jsdoc-default.css">
</head>
<body>
<div id="main">
<h1 class="page-title">Source: util/similar-search.js</h1>
<section>
<article>
<pre class="prettyprint source linenums"><code>/*
* Copyright (c) AXA Shared Services Spain S.A.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Class for checking similarity between strings, or search the more similar
* substring inside an string.
*/
class SimilarSearch {
/**
* Constructor of the class. Does the basic initializations.
*/
constructor(settings) {
this.settings = settings || {};
this.alphanumeric =
this.settings.alphanumeric ||
new RegExp(/[a-zA-ZÁÉÍÓÚáéíóúÀÈÌÒÙàèìòùÄËÏÖÜäëïöüÂÊÎÔÛâêîôû0-9]/);
this.collator =
this.settings.collator ||
Intl.Collator('generic', { sensitivity: 'base' });
this.useCollator =
this.settings.useCollator === undefined
? false
: this.settings.useCollator;
this.normalize =
this.settings.normalize === undefined ? false : this.settings.normalize;
}
/**
* Calculates the levenshtein distance between two strings.
* @param {String} str1 First String.
* @param {String} str2 Second String.
* @returns {Number} Levenshtein distance.
*/
getSimilarity(str1, str2) {
if (this.normalize) {
/* eslint-disable */
str1 = str1
.normalize("NFD")
.replace(/[\u0300-\u036f]/g, "")
.toLowerCase();
str2 = str2
.normalize("NFD")
.replace(/[\u0300-\u036f]/g, "")
.toLowerCase();
/* eslint-enable */
}
if (str1 === str2) {
return 0;
}
const str1len = str1.length;
const str2len = str2.length;
if (str1len === 0) return str2len;
if (str2len === 0) return str1len;
const prevRow = [];
const str2Char = [];
const str2CharAt = [];
let curCol;
let nextCol;
let tmp;
if (this.useCollator) {
for (let i = 0; i < str2len; i += 1) {
prevRow[i] = i;
str2CharAt[i] = str2.charAt(i);
}
} else {
for (let i = 0; i < str2len; i += 1) {
prevRow[i] = i;
str2Char[i] = str2.charCodeAt(i);
}
}
prevRow[str2len] = str2len;
let strCmp;
let j;
for (let i = 0; i < str1len; i += 1) {
nextCol = i + 1;
for (j = 0; j < str2len; j += 1) {
curCol = nextCol;
if (this.useCollator) {
strCmp = this.collator.compare(str1.charAt(i), str2CharAt[j]) === 0;
} else {
strCmp = str1.charCodeAt(i) === str2Char[j];
}
nextCol = prevRow[j] + (strCmp ? 0 : 1);
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
prevRow[j] = curCol;
}
prevRow[j] = nextCol;
}
return nextCol;
}
/**
* Indicates if a character is alphanumeric.
* @param {Character} c Character.
* @returns {Boolean} True if the character is alphanumeric, false otherwise.
*/
isAlphanumeric(c) {
return this.alphanumeric.test(c);
}
/**
* Given an string, iterates over it and return the start position, end position
* and length of each of the words, without tokenizing the string.
* @param {String} str String to be processed.
* @returns {Object[]} Array of positions of the words, with the start index,
* end index, and length.
*/
getWordPositions(str) {
const strlen = str.length;
const result = [];
let lastIndex = 0;
let currentIndex = 0;
let atWhiteSpace = true;
while (currentIndex < strlen) {
if (!this.isAlphanumeric(str.charAt(currentIndex))) {
if (!atWhiteSpace) {
result.push({
start: lastIndex,
end: currentIndex - 1,
len: currentIndex - lastIndex,
});
atWhiteSpace = true;
}
} else if (atWhiteSpace) {
lastIndex = currentIndex;
atWhiteSpace = false;
}
currentIndex += 1;
}
if (!atWhiteSpace) {
result.push({
start: lastIndex,
end: currentIndex - 1,
len: currentIndex - lastIndex,
});
}
return result;
}
/**
* Given two strings, search best occurence of the second inside the first,
* that is, the consecutive words of the first string that have less
* levenshtein distance with the second one.
* @param {String} str1 First string.
* @param {String} str2 Second string.
* @param {Object[]} words1 Array of positions of the words of the first string.
* If not provided this will be built.
* @returns {Object} Best occurence, expressed as the index of the first character,
* index of the last character, levenshtein distance and accuracy.
*/
getBestSubstring(str1, str2, words1) {
const str1len = str1.length;
const str2len = str2.length;
if (str1len <= str2len) {
const result = {
start: 0,
end: str1len - 1,
len: str1len,
levenshtein: this.getSimilarity(str1, str2),
};
result.accuracy = (str2len - result.levenshtein) / str2len;
return result;
}
const wordPositions = words1 || this.getWordPositions(str1);
const wordPositionsLen = wordPositions.length;
const best = {
start: 0,
end: 0,
len: 0,
levenshtein: undefined,
accuracy: 0,
};
for (let i = 0; i < wordPositionsLen; i += 1) {
for (let j = i; j < wordPositionsLen; j += 1) {
const str3 = str1.substring(
wordPositions[i].start,
wordPositions[j].end + 1
);
const levenshtein = this.getSimilarity(str3, str2);
if (best.levenshtein === undefined || levenshtein < best.levenshtein) {
best.levenshtein = levenshtein;
best.start = wordPositions[i].start;
best.end = wordPositions[j].end;
best.len = best.end - best.start + 1;
}
}
}
best.accuracy = (str2len - best.levenshtein) / str2len;
return best;
}
/**
* Given two strings, search all the occurences of the second inside the first,
* where the accuracy is at least as good as the threshold.
* @param {String} str1 First string.
* @param {String} str2 Second string.
* @param {Object[]} words1 Array of positions of the words of the first string.
* If not provided this will be built.
* @returns {Object[]} List of occurences.
*/
getBestSubstringList(str1, str2, words1, threshold = 1) {
const str1len = str1.length;
const str2len = str2.length;
const result = [];
if (str1len <= str2len) {
const levenshtein = this.getSimilarity(str1, str2);
const accuracy = (str2len - levenshtein) / str2len;
if (accuracy >= threshold) {
result.push({
start: 0,
end: str1len - 1,
len: str1len,
levenshtein,
accuracy,
});
}
return result;
}
const wordPositions = words1 || this.getWordPositions(str1);
const wordPositionsLen = wordPositions.length;
for (let i = 0; i < wordPositionsLen; i += 1) {
for (let j = i; j < wordPositionsLen; j += 1) {
const str3 = str1.substring(
wordPositions[i].start,
wordPositions[j].end + 1
);
const levenshtein = this.getSimilarity(str3, str2);
const accuracy = (str2len - levenshtein) / str2len;
if (accuracy >= threshold) {
result.push({
start: wordPositions[i].start,
end: wordPositions[j].end,
len: wordPositions[j].end - wordPositions[i].start + 1,
levenshtein,
accuracy,
});
}
}
}
return result;
}
reduceEdges(edges, useMaxLength = true) {
for (let i = 0, l = edges.length; i < l; i += 1) {
const edge = edges[i];
if (!edge.discarded) {
for (let j = i + 1; j < l; j += 1) {
const other = edges[j];
if (!other.discarded) {
if (other.start <= edge.end && other.end >= edge.start) {
if (other.accuracy < edge.accuracy) {
other.discarded = true;
} else if (other.accuracy > edge.accuracy) {
edge.discarded = true;
} else if (
(useMaxLength ||
other.entity === edge.entity ||
other.entity === 'number') &&
other.len <= edge.len
) {
other.discarded = true;
} else if (
(useMaxLength ||
other.entity === edge.entity ||
edge.entity === 'number') &&
other.len > edge.len
) {
edge.discarded = true;
}
}
}
}
}
}
const result = [];
for (let i = 0, l = edges.length; i < l; i += 1) {
if (!edges[i].discarded) {
result.push(edges[i]);
}
}
return result;
}
getEdgesFromEntity(
str,
entity,
language,
entityName,
threshold = 1,
srcWordPositions
) {
const wordPositions = srcWordPositions || this.getWordPositions(str);
const locale = entity.getLocaleRules
? entity.getLocaleRules(language)
: entity[language];
const result = [];
if (!locale) {
return result;
}
const optionKeys = Object.keys(locale);
for (let i = 0, li = optionKeys.length; i < li; i += 1) {
const optionName = optionKeys[i];
const texts = locale[optionName];
for (let j = 0, lj = texts.length; j < lj; j += 1) {
const current = this.getBestSubstringList(
str,
texts[j],
wordPositions,
threshold
);
for (let k = 0, lk = current.length; k < lk; k += 1) {
const item = current[k];
item.option = optionName;
item.sourceText = texts[j];
item.entity = entityName || entity.name;
item.utteranceText = str.substring(item.start, item.end + 1);
result.push(item);
}
}
}
return this.reduceEdges(result);
}
/**
* Given an utterance and an array of entities with options, search the
* best option for each entity and return the results.
* @param {String} str Utterance to retrieve entities.
* @param {Object[]} entities Entities Array.
* @param {String} locale Locale for the search.
* @param {String[]} whitelist Whitelist of entity names for the search.
*/
getEdgesFromEntities(str, entities, language, whitelist, threshold = 1) {
const result = [];
const wordPositions = this.getWordPositions(str);
const entityKeys = Object.keys(entities);
for (let i = 0, l = entityKeys.length; i < l; i += 1) {
const entityName = entityKeys[i];
if (!whitelist || whitelist.indexOf(entityName) !== -1) {
const edges = this.getEdgesFromEntity(
str,
entities[entityName],
language,
entityName,
threshold,
wordPositions
);
edges.forEach(srcEdge => {
const edge = srcEdge;
result.push(edge);
});
}
}
return this.reduceEdges(result);
}
}
module.exports = SimilarSearch;
</code></pre>
</article>
</section>
</div>
<nav>
<h2><a href="index.html">Home</a></h2><h3>Classes</h3><ul><li><a href="BinaryNeuralNetworkClassifier.html">BinaryNeuralNetworkClassifier</a></li><li><a href="Classifier.html">Classifier</a></li><li><a href="ConversationContext.html">ConversationContext</a></li><li><a href="DutchStemmer.html">DutchStemmer</a></li><li><a href="EnglishStemmer.html">EnglishStemmer</a></li><li><a href="EnumNamedEntity.html">EnumNamedEntity</a></li><li><a href="Evaluator.html">Evaluator</a></li><li><a href="HungarianStemmer.html">HungarianStemmer</a></li><li><a href="ItalianStemmer.html">ItalianStemmer</a></li><li><a href="Language.html">Language</a></li><li><a href="LogisticRegressionClassifier.html">LogisticRegressionClassifier</a></li><li><a href="Matrix.html">Matrix</a></li><li><a href="MemoryConversationContext.html">MemoryConversationContext</a></li><li><a href="NamedEntity.html">NamedEntity</a></li><li><a href="NerManager.html">NerManager</a></li><li><a href="NlgManager.html">NlgManager</a></li><li><a href="NlpClassifier.html">NlpClassifier</a></li><li><a href="NlpManager.html">NlpManager</a></li><li><a href="NorwegianStemmer.html">NorwegianStemmer</a></li><li><a href="PortugueseStemmer.html">PortugueseStemmer</a></li><li><a href="Recognizer.html">Recognizer</a></li><li><a href="RegexNamedEntity.html">RegexNamedEntity</a></li><li><a href="RomanianStemmer.html">RomanianStemmer</a></li><li><a href="RussianStemmer.html">RussianStemmer</a></li><li><a href="SentimentAnalyzer.html">SentimentAnalyzer</a></li><li><a href="SentimentManager.html">SentimentManager</a></li><li><a href="SimilarSearch.html">SimilarSearch</a></li><li><a href="SlotManager.html">SlotManager</a></li><li><a href="StemmerJa.html">StemmerJa</a></li><li><a href="SwedishStemmer.html">SwedishStemmer</a></li><li><a href="Tokenizer.html">Tokenizer</a></li><li><a href="TrimNamedEntity.html">TrimNamedEntity</a></li><li><a href="TurkishStemmer.html">TurkishStemmer</a></li><li><a href="Vector.html">Vector</a></li><li><a href="XTable.html">XTable</a></li></ul><h3>Global</h3><ul><li><a href="global.html#endsinArr">endsinArr</a></li><li><a href="global.html#prelude">prelude</a></li><li><a href="global.html#regions">regions</a></li><li><a href="global.html#stem">stem</a></li><li><a href="global.html#stopwords">stopwords</a></li></ul>
</nav>
<br class="clear">
<footer>
Documentation generated by <a href="https://github.com/jsdoc3/jsdoc">JSDoc 3.5.5</a> on Sat Oct 13 2018 19:14:51 GMT+0200 (CEST)
</footer>
<script> prettyPrint(); </script>
<script src="scripts/linenumber.js"> </script>
</body>
</html>