fuzzystringmatch
Version:
A small library that creates an in-memory index for fast, fuzzy lookup of search terms
118 lines (93 loc) • 4.72 kB
JavaScript
"use strict"
var util = require('util')
var dedupe = require('dedupe')
var hr = require('hirestime')
var debug = require('debug')
var ResultEntry = require('./ResultEntry')
var prepareTerm = require('./tools/prepareTerm')
var splitter = require('./tools/splitter')
var defaultSplitterConfig = require('./tools/defaultSplitterConfig')
/**
 * Looks up search terms in the chunk index provided by a digester.
 *
 * A term is split into chunks; every indexed subject is scored by how many of
 * those chunks it shares with the term, and the best scoring subjects are
 * returned wrapped in ResultEntry objects.
 */
class Matcher {
  /**
   * @param {Object} digester - must provide getChunks() and getIndex(); the
   *   returned lookups are queried with .get() (chunk text -> chunk, and
   *   chunk -> collection of subjects exposing forEach() and size).
   * @param {Object} [configuration] - optional settings; `splitter` is handed
   *   to the splitter tool. Defaults to {splitter: defaultSplitterConfig}.
   */
  constructor (digester, configuration) {
    this._chunks = digester.getChunks()
    this._index = digester.getIndex()
    this._configuration = configuration || {splitter: defaultSplitterConfig}
    this._debug = debug('fuzzystringmatch:matcher')
    this._debugSelect = debug('fuzzystringmatch:matcher:select')
    this._debugGroup = debug('fuzzystringmatch:matcher:group')
    this._debugCummulate = debug('fuzzystringmatch:matcher:cumulate')
  }

  /**
   * Finds the indexed subjects that share the most chunks with `term`.
   *
   * @param {string} term - the search term; passed through prepareTerm()
   *   before it is split into chunks.
   * @param {number} [overallCount=150] - soft limit: collecting stops once at
   *   least this many entries have been gathered. The cluster that crosses
   *   the limit is still included in full, so more entries may be returned.
   *   Any falsy value (including 0) selects the default.
   * @returns {ResultEntry[]} entries ordered from the highest to the lowest
   *   number of matching chunks, taken from at most three clusters.
   */
  match (term, overallCount) {
    const softLimit = overallCount || 150
    // TODO: pick the cluster threshold from configuration
    const maxClusters = 3

    const getTimeOverall = hr()
    const resultMap = new Map()
    const preparedTerm = prepareTerm(term)

    const getTimeSelect = hr()
    // a configuration object without a `splitter` entry must not leave the
    // splitter without settings, so fall back to the defaults here
    const splitterConfig = this._configuration.splitter || defaultSplitterConfig
    const splitted = splitter(preparedTerm, splitterConfig)
    const splittedDeduped = dedupe(splitted)
    const totalChunks = splittedDeduped.length

    if (splitted.length !== totalChunks) {
      this._debugSelect('%d instead of %d chunks', totalChunks, splitted.length)
    }

    // creating a map that describes how often a chunk match has been discovered for a certain subject
    // searching for: fork, chunks: fo, or, rk
    // index: spork, chunks: sp, po, or, rk
    // matching chunks: or, rk
    // resultMap: spork => 2 (two chunk matches for index entry 'spork')
    splittedDeduped.forEach((chunkText) => {
      const chunk = this._chunks.get(chunkText)
      if (!chunk) return

      const chunkResult = this._index.get(chunk)
      if (!chunkResult) return

      const getChunkTime = hr()
      chunkResult.forEach((subResultEntry) => {
        resultMap.set(subResultEntry, (resultMap.get(subResultEntry) || 0) + 1)
      })
      this._debugSelect('chunkresult for "%s": %d entries, %dms', chunkText, chunkResult.size, getChunkTime())
    })
    this._debugSelect('select took %dms', getTimeSelect())

    const getTimeGroup = hr()
    // creating a sparse list that holds the matched subjects indexed by the number of matched chunks
    // this makes sorting the whole result set unnecessary as we can simply start iterating at the
    // highest number of matched chunks
    // for the example above: result[2] = [spork], lower indexes stay empty
    const result = []
    let numberSubjectsUsed = 0
    let numberSubjectsDropped = 0
    // subjects matching fewer than half (rounded down) of the term's chunks are discarded
    const discardThreshold = Math.floor(totalChunks * 0.5)
    const debugCollection = {}

    resultMap.forEach((numberOfMatchingChunks, subject) => {
      if (numberOfMatchingChunks < discardThreshold) {
        numberSubjectsDropped++
        return
      }

      debugCollection[numberOfMatchingChunks] = (debugCollection[numberOfMatchingChunks] || 0) + 1

      if (!result[numberOfMatchingChunks]) {
        result[numberOfMatchingChunks] = []
      }
      result[numberOfMatchingChunks].push(subject)
      numberSubjectsUsed++
    })
    this._debugGroup('filter took %dms', getTimeGroup())
    this._debugGroup('%d subjects dropped, %d subjects used', numberSubjectsDropped, numberSubjectsUsed)
    this._debugGroup('%s', util.inspect(debugCollection))

    // iterate the clustered subjects starting with the cluster that has the highest chunk match score
    let numberRelevantClusters = 0
    const getTimeCummulate = hr()
    const cummulatedResult = []

    for (let matchCount = result.length - 1; matchCount >= 0; matchCount--) {
      const cluster = result[matchCount]
      if (!cluster) continue

      numberRelevantClusters++
      cluster.forEach((subject) => {
        cummulatedResult.push(new ResultEntry(subject, matchCount, totalChunks))
      })

      // stop once enough entries are collected or the best clusters are exhausted
      if (cummulatedResult.length >= softLimit || numberRelevantClusters >= maxClusters) break
    }
    this._debugCummulate('cumulate took %dms', getTimeCummulate())
    this._debug('overall took %dms', getTimeOverall())

    return cummulatedResult
  }
}
module.exports = Matcher