levenshtein-search
Version:
A Javascript library for fuzzy substring searches.
387 lines (340 loc) • 10.9 kB
JavaScript
const { searchExact, reverse } = require('./utils')
function isEditDistanceNoGreaterThan (a, b, maxDist) {
if (a.length > b.length) {
[a, b] = [b, a]
}
const lenDelta = b.length - a.length
if (lenDelta > maxDist) {
return false
}
if (maxDist === 0) {
return a === b
}
let firstDiffIdx
for (firstDiffIdx = 0; firstDiffIdx < a.length; firstDiffIdx++) {
if (a[firstDiffIdx] !== b[firstDiffIdx]) break
}
if (firstDiffIdx === a.length) {
return lenDelta <= maxDist
}
let lastDiffIdx
for (lastDiffIdx = a.length - 1; lastDiffIdx >= 0; lastDiffIdx--) {
if (a[lastDiffIdx] !== b[lastDiffIdx + lenDelta]) break
}
a = a.slice(firstDiffIdx, lastDiffIdx + 1)
b = b.slice(firstDiffIdx, lastDiffIdx + 1 + lenDelta)
const [dist, length] = _expand(a, b, maxDist)
return dist + (b.length - length) <= maxDist
}
function editDistance (a, b) {
if (a.length > b.length) {
[a, b] = [b, a]
}
const scores = new Array(a.length + 1)
for (let i = 0; i <= a.length; i++) {
scores[i] = i
}
let _prevScore
let prevScore
for (let i = 0; i < b.length; i++) {
scores[0] = i + 1
prevScore = i
for (let k = 0; k < a.length; k++) {
_prevScore = scores[k + 1]
scores[k + 1] = Math.min(
prevScore + +(a[k] !== b[i]),
scores[k] + 1,
scores[k + 1] + 1
)
prevScore = _prevScore
}
}
return scores[a.length]
}
function makeChar2needleIdx (needle, maxDist) {
const res = {}
for (let i = Math.min(needle.length - 1, maxDist); i >= 0; i--) {
res[needle[i]] = i
}
return res
}
function * fuzzySearch (needle, haystack, maxDist) {
if (needle.length > haystack.length + maxDist) return
const ngramLen = Math.floor(needle.length / (maxDist + 1))
if (maxDist === 0) {
for (const index of searchExact(needle, haystack)) {
yield {
start: index,
end: index + needle.length,
dist: 0
}
}
} else if (ngramLen >= 10) {
yield * fuzzySearchNgrams(needle, haystack, maxDist)
} else {
yield * fuzzySearchCandidates(needle, haystack, maxDist)
}
}
function _expand (needle, haystack, maxDist) {
maxDist = +maxDist
let firstDiff
for (firstDiff = 0; firstDiff < Math.min(needle.length, haystack.length); firstDiff++) {
if (needle.charCodeAt(firstDiff) !== haystack.charCodeAt(firstDiff)) break
}
if (firstDiff) {
needle = needle.slice(firstDiff)
haystack = haystack.slice(firstDiff)
}
if (!needle) {
return [0, firstDiff]
} else if (!haystack) {
if (needle.length <= maxDist) {
return [needle.length, firstDiff]
} else {
return [null, null]
}
}
if (maxDist === 0) return [null, null]
let scores = new Array(needle.length + 1)
for (let i = 0; i <= maxDist; i++) {
scores[i] = i
}
let newScores = new Array(needle.length + 1)
let minScore = null
let minScoreIdx = null
let maxGoodScore = maxDist
let firstGoodScoreIdx = 0
let lastGoodScoreIdx = needle.length - 1
for (let haystackIdx = 0; haystackIdx < haystack.length; haystackIdx++) {
const char = haystack.charCodeAt(haystackIdx)
const needleIdxStart = Math.max(0, firstGoodScoreIdx - 1)
const needleIdxLimit = Math.min(
haystackIdx + maxDist,
needle.length - 1,
lastGoodScoreIdx
)
newScores[0] = scores[0] + 1
firstGoodScoreIdx = newScores[0] <= maxGoodScore ? 0 : null
lastGoodScoreIdx = newScores[0] <= maxGoodScore ? 0 : -1
let needleIdx
for (needleIdx = needleIdxStart; needleIdx < needleIdxLimit; needleIdx++) {
const score = newScores[needleIdx + 1] = Math.min(
scores[needleIdx] + +(char !== needle.charCodeAt(needleIdx)),
scores[needleIdx + 1] + 1,
newScores[needleIdx] + 1
)
if (score <= maxGoodScore) {
if (firstGoodScoreIdx === null) firstGoodScoreIdx = needleIdx + 1
lastGoodScoreIdx = Math.max(
lastGoodScoreIdx,
needleIdx + 1 + (maxGoodScore - score)
)
}
}
const lastScore = newScores[needleIdx + 1] = Math.min(
scores[needleIdx] + +(char !== needle.charCodeAt(needleIdx)),
newScores[needleIdx] + 1
)
if (lastScore <= maxGoodScore) {
if (firstGoodScoreIdx === null) firstGoodScoreIdx = needleIdx + 1
lastGoodScoreIdx = needleIdx + 1
}
if (
needleIdx === needle.length - 1 &&
(minScore === null || lastScore <= minScore)
) {
minScore = lastScore
minScoreIdx = haystackIdx
if (minScore < maxGoodScore) maxGoodScore = minScore
}
[scores, newScores] = [newScores, scores]
if (firstGoodScoreIdx === null) break
}
if (minScore !== null && minScore <= maxDist) {
return [minScore, minScoreIdx + 1 + firstDiff]
} else {
return [null, null]
}
}
function * fuzzySearchNgrams (needle, haystack, maxDist) {
// use n-gram search
const ngramLen = Math.floor(needle.length / (maxDist + 1))
const needleLen = needle.length
const haystackLen = haystack.length
for (let ngramStartIdx = 0;
ngramStartIdx <= needle.length - ngramLen;
ngramStartIdx += ngramLen
) {
const ngram = needle.slice(ngramStartIdx, ngramStartIdx + ngramLen)
const ngramEnd = ngramStartIdx + ngramLen
const needleBeforeReversed = reverse(needle.slice(0, ngramStartIdx))
const needleAfter = needle.slice(ngramEnd)
const startIdx = Math.max(0, ngramStartIdx - maxDist)
const endIdx = Math.min(haystackLen, haystackLen - needleLen + ngramEnd + maxDist)
for (const haystackMatchIdx of searchExact(ngram, haystack, startIdx, endIdx)) {
// try to expand left
const [distRight, rightExpandSize] = _expand(
needleAfter,
haystack.slice(
haystackMatchIdx + ngramLen,
haystackMatchIdx - ngramStartIdx + needleLen + maxDist
),
maxDist
)
if (distRight === null) continue
const [distLeft, leftExpandSize] = _expand(
needleBeforeReversed,
reverse(haystack.slice(
Math.max(0, haystackMatchIdx - ngramStartIdx - (maxDist - distRight)),
haystackMatchIdx
)),
maxDist - distRight
)
if (distLeft === null) continue
yield {
start: haystackMatchIdx - leftExpandSize,
end: haystackMatchIdx + ngramLen + rightExpandSize,
dist: distLeft + distRight
}
}
}
}
function * fuzzySearchCandidates (needle, haystack, maxDist) {
const debugFlag = false
if (debugFlag) console.log(`fuzzySearchCandidates(${needle}, ${haystack}, ${maxDist})`)
// prepare some often used things in advance
const needleLen = needle.length
const haystackLen = haystack.length
if (needleLen > haystackLen + maxDist) return
const char2needleIdx = makeChar2needleIdx(needle, maxDist)
let prevCandidates = [] // candidates from the last iteration
let candidates = [] // new candidates from the current iteration
// iterate over the chars in the haystack, updating the candidates for each
for (let i = 0; i < haystack.length; i++) {
const haystackChar = haystack[i]
prevCandidates = candidates
candidates = []
const needleIdx = char2needleIdx[haystackChar]
if (needleIdx !== undefined) {
if (needleIdx + 1 === needleLen) {
if (debugFlag) {
console.log(`yield ${{
start: i,
end: i + 1,
dist: needleIdx
}}`)
}
yield {
start: i,
end: i + 1,
dist: needleIdx
}
} else {
candidates.push({
startIdx: i,
needleIdx: needleIdx + 1,
dist: needleIdx
})
}
}
for (const candidate of prevCandidates) {
// if this sequence char is the candidate's next expected char
if (needle[candidate.needleIdx] === haystackChar) {
// if reached the end of the needle, return a match
if (candidate.needleIdx + 1 === needleLen) {
if (debugFlag) {
console.log(`yield ${{
start: candidate.startIdx,
end: i + 1,
dist: candidate.dist
}}`)
}
yield {
start: candidate.startIdx,
end: i + 1,
dist: candidate.dist
}
} else {
// otherwise, update the candidate's needleIdx and keep it
candidates.push({
startIdx: candidate.startIdx,
needleIdx: candidate.needleIdx + 1,
dist: candidate.dist
})
}
} else {
if (candidate.dist === maxDist) continue
candidates.push({
startIdx: candidate.startIdx,
needleIdx: candidate.needleIdx,
dist: candidate.dist + 1
})
for (let nSkipped = 1; nSkipped <= maxDist - candidate.dist; nSkipped++) {
if (candidate.needleIdx + nSkipped === needleLen) {
if (debugFlag) {
console.log(`yield ${{
start: candidate.startIdx,
end: i + 1,
dist: candidate.dist + nSkipped
}}`)
}
yield {
start: candidate.startIdx,
end: i + 1,
dist: candidate.dist + nSkipped
}
break
} else if (needle[candidate.needleIdx + nSkipped] === haystackChar) {
if (candidate.needleIdx + nSkipped + 1 === needleLen) {
if (debugFlag) {
console.log(`yield ${{
start: candidate.startIdx,
end: i + 1,
dist: candidate.dist + nSkipped
}}`)
}
yield {
start: candidate.startIdx,
end: i + 1,
dist: candidate.dist + nSkipped
}
} else {
candidates.push({
startIdx: candidate.startIdx,
needleIdx: candidate.needleIdx + 1 + nSkipped,
dist: candidate.dist + nSkipped
})
}
break
}
}
if (i + 1 < haystackLen && candidate.needleIdx + 1 < needleLen) {
candidates.push({
startIdx: candidate.startIdx,
needleIdx: candidate.needleIdx + 1,
dist: candidate.dist + 1
})
}
}
}
if (debugFlag) console.log(candidates)
}
for (const candidate of candidates) {
candidate.dist += needle.length - candidate.needleIdx
if (candidate.dist <= maxDist) {
yield {
start: candidate.startIdx,
end: haystack.length,
dist: candidate.dist
}
}
}
}
module.exports = {
_expand,
editDistance,
fuzzySearch,
fuzzySearchNgrams,
fuzzySearchCandidates,
isEditDistanceNoGreaterThan
}