@trevor.linton/similarto
Version:
Compares a string against an array of strings and picks the most similar one.
70 lines (63 loc) • 3.38 kB
JavaScript
const natural = require('natural')
const assert = require('assert')
const compromise = require('compromise')
const debug = require('debug')('similarto')
/**
 * Looks up `text` in WordNet and collects the words associated with it.
 *
 * The associations are every synonym of every returned entry (excluding
 * `text` itself), followed by the singularized nouns found in the definition
 * of the entry with the lowest `synsetOffset`, split into individual words.
 *
 * @param {string} text - The word to look up.
 * @returns {Promise<string[]>} The associated words, or an empty array when
 *   the lookup returns no entries. Rejects if processing the result throws.
 */
function wordnet(text) {
  return new Promise((resolve, reject) => {
    const wn = new natural.WordNet();
    wn.lookup(text, (result) => {
      // An exception thrown from this callback after the executor has returned
      // would not reject the promise on its own; it would leave the promise
      // pending forever. Route any failure through reject() instead.
      try {
        if (result.length === 0) {
          debug(`Found no associations for: ${text}`);
          resolve([]);
          return;
        }

        // How do we determine which result we want ?
        // How do we know if we should use the definition to help find synonyms?
        // For now: use the definition of the entry with the lowest synsetOffset
        // (on equal offsets the later entry is kept).
        const primary = result.reduce(
          (acc, v) => (acc.synsetOffset < v.synsetOffset ? acc : v),
          result[0]
        );
        const def = primary.def;

        // Synonyms of every entry, flattened, without the looked-up word itself.
        const entrySynonyms = result
          .map((entry) => entry.synonyms)
          .reduce((acc, list) => acc.concat(list), [])
          .filter((word) => word !== text);

        // Singular nouns from the chosen definition, broken into single words.
        const definitionWords = compromise(def)
          .nouns()
          .toSingular()
          .out('array')
          .map((phrase) => phrase.split(' '))
          .reduce((acc, words) => acc.concat(words), []);

        const synonyms = entrySynonyms.concat(definitionWords);
        debug(`Found associations for: ${text} => ${synonyms.join(',')}`);
        resolve(synonyms);
      } catch (err) {
        reject(err);
      }
    });
  });
}
module.exports = async function(compare, against) {
assert.ok(Array.isArray(against), 'An array of strings was not passed as the second argument')
assert.ok(typeof compare === 'string', 'The first argument was not a string.')
assert.ok(against.filter((x) => typeof x !== 'string').length === 0, 'The second argument did not contain an array of strings but an array of mixed bags.')
assert.ok(against.length !== 0, 'The second argument did not contain any values.')
assert.ok(compare !== '', 'The first argument was a blank string.')
assert.ok(against.filter((x) => x === '').length === 0, 'The second argument contained blank strings.')
// get subset of nouns and verbs
let compareObj = compromise(compare)
let compareTerms = compareObj.nouns().toSingular().out('array')
.concat(compareObj.verbs().out('array'))
.concat(compareObj.adjectives().out('array'))
.map((x) => x.split(' '))
.reduce((a, v) => a.concat(v), [])
.filter((x) => x.length > 2);
let againstTerms = against.map((x) => {
let xObj = compromise(x)
return xObj.nouns().toSingular().out('array')
.concat(xObj.verbs().out('array'))
.concat(xObj.adjectives().out('array'))
.map((x) => x.split(' '))
.reduce((a, v) => a.concat(v), [])
.filter((x) => x.length > 2)
});
// expand all terms with synonyms
compareTerms = [...new Set((await Promise.all(compareTerms.map(async (x) => [x].concat(await wordnet(x))))).reduce((acc, v) => acc.concat(v), []))].filter((x) => x.length > 2);
againstTerms = await Promise.all(againstTerms.map(async (y) => {
return [...new Set((await Promise.all(y.map(async (x) => [x].concat(await wordnet(x))))).reduce((acc, v) => acc.concat(v), []) )]
.filter((x) => x.length > 2)
}));
// run distance (maximum association between any two words then over the whole term the average association)
let distances = againstTerms.map((terms) => {
return terms
.map((y) => compareTerms.map((x) => natural.JaroWinklerDistance(x, y, undefined, true)).reduce((acc, x) => Math.max(acc, x), 0))
.reduce((acc, x) => acc + x, 0) / terms.length;
}).map((x) => x || 0);
debug('distances', distances)
return distances.reduce((a, value, index) => a.value > value ? a : {value, index, text:against[index]}, {value:0, index:0, text:against[0]});
}