UNPKG

@readium/navigator-html-injectables

Version:

An embeddable solution for connecting frames of HTML publications with a Readium Navigator

362 lines (316 loc) 11.2 kB
/**
 * Implementation of Myers' online approximate string matching algorithm [1],
 * with additional optimizations suggested by [2].
 *
 * This has O((k/w) * n) expected-time where `n` is the length of the
 * text, `k` is the maximum number of errors allowed (always <= the pattern
 * length) and `w` is the word size. Because JS only supports bitwise operations
 * on 32 bit integers, `w` is 32.
 *
 * As far as I am aware, there aren't any online algorithms which are
 * significantly better for a wide range of input parameters. The problem can be
 * solved faster using "filter then verify" approaches which first filter out
 * regions of the text that cannot match using a "cheap" check and then verify
 * the remaining potential matches. The verify step requires an algorithm such
 * as this one however.
 *
 * The algorithm's approach is essentially to optimize the classic dynamic
 * programming solution to the problem by computing columns of the matrix in
 * word-sized chunks (ie. dealing with 32 chars of the pattern at a time) and
 * avoiding calculating regions of the matrix where the minimum error count is
 * guaranteed to exceed the input threshold.
 *
 * The paper consists of two parts, the first describes the core algorithm for
 * matching patterns <= the size of a word (implemented by `advanceBlock` here).
 * The second uses the core algorithm as part of a larger block-based algorithm
 * to handle longer patterns.
 *
 * [1] G. Myers, “A Fast Bit-Vector Algorithm for Approximate String Matching
 *     Based on Dynamic Programming,” vol. 46, no. 3, pp. 395–415, 1999.
 *
 * [2] Šošić, M. (2014). An SIMD dynamic programming C/C++ library (Doctoral
 *     dissertation, Fakultet Elektrotehnike i računarstva, Sveučilište u Zagrebu).
 */

/**
 * Represents a match returned by a call to `search`.
 */
export interface Match {
  /** Start offset of match in text. */
  start: number;
  /** End offset of match in text. */
  end: number;
  /**
   * The number of differences (insertions, deletions or substitutions) between
   * the pattern and the approximate match in the text.
   */
  errors: number;
}

/**
 * Reverse a string by UTF-16 code unit.
 *
 * The matcher itself compares UTF-16 code units (`charCodeAt`), so reversing
 * at the same granularity keeps offsets consistent between the forward and
 * backward passes.
 */
function reverse(s: string): string {
  return s.split("").reverse().join("");
}

/**
 * Given the ends of approximate matches for `pattern` in `text`, find
 * the start of the matches.
 *
 * @param text - The text that was searched
 * @param pattern - The pattern that was searched for
 * @param matches - Matches whose `end` and `errors` are already known
 * @return Matches with the `start` property set.
 */
function findMatchStarts(
  text: string,
  pattern: string,
  matches: Match[]
): Match[] {
  const patRev = reverse(pattern);

  return matches.map((m) => {
    // Find start of each match by reversing the pattern and matching segment
    // of text and searching for an approx match with the same number of
    // errors.
    const minStart = Math.max(0, m.end - pattern.length - m.errors);
    const textRev = reverse(text.slice(minStart, m.end));

    // If there are multiple possible start points, choose the one that
    // maximizes the length of the match (ie. the smallest start offset).
    let start = m.end;
    for (const rm of findMatchEnds(textRev, patRev, m.errors)) {
      start = Math.min(start, m.end - rm.end);
    }

    return {
      start,
      end: m.end,
      errors: m.errors,
    };
  });
}

/**
 * Internal context used when calculating blocks of a column.
 */
interface Context {
  /**
   * Bit-arrays of positive vertical deltas.
   *
   * ie. bit `i` of `P[b]` is set if the vertical delta for the i'th row in
   * the b'th block is positive.
   */
  P: Uint32Array;
  /** Bit-arrays of negative vertical deltas. */
  M: Uint32Array;
  /** Bit masks with a single bit set indicating the last row in each block. */
  lastRowMask: Uint32Array;
}

/**
 * Return 1 if a number is non-zero or zero otherwise, without using
 * conditional operators.
 *
 * This should get inlined into `advanceBlock` below by the JIT.
 *
 * Adapted from https://stackoverflow.com/a/3912218/434243
 */
function oneIfNotZero(n: number): number {
  return ((n | -n) >> 31) & 1;
}

/**
 * Block calculation step of the algorithm.
 *
 * From Fig 8. on p. 408 of [1], additionally optimized to replace conditional
 * checks with bitwise operations as per Section 4.2.3 of [2].
 *
 * Updates `ctx.P[b]` and `ctx.M[b]` in place.
 *
 * @param ctx - The pattern context object
 * @param peq - Per-block bitmasks of the pattern positions that equal the
 *   current text character
 * @param b - The block level
 * @param hIn - Horizontal input delta ∈ {1,0,-1}
 * @return Horizontal output delta ∈ {1,0,-1}
 */
function advanceBlock(
  ctx: Context,
  peq: Uint32Array,
  b: number,
  hIn: number
): number {
  let pV = ctx.P[b];
  let mV = ctx.M[b];
  const hInIsNegative = hIn >>> 31; // 1 if hIn < 0 or 0 otherwise.
  const eq = peq[b] | hInIsNegative;

  // Step 1: Compute horizontal deltas.
  const xV = eq | mV;
  const xH = (((eq & pV) + pV) ^ pV) | eq;

  let pH = mV | ~(xH | pV);
  let mH = pV & xH;

  // Step 2: Update score (value of last row of this block).
  const hOut =
    oneIfNotZero(pH & ctx.lastRowMask[b]) -
    oneIfNotZero(mH & ctx.lastRowMask[b]);

  // Step 3: Update vertical deltas for use when processing next char.
  pH <<= 1;
  mH <<= 1;

  mH |= hInIsNegative;
  pH |= oneIfNotZero(hIn) - hInIsNegative; // set pH[0] if hIn > 0

  pV = mH | ~(xV | pH);
  mV = pH & xV;

  ctx.P[b] = pV;
  ctx.M[b] = mV;

  return hOut;
}

/**
 * Find the ends and error counts for matches of `pattern` in `text`.
 *
 * Only the matches with the lowest error count are reported. Other matches
 * with error counts <= maxErrors are discarded.
 *
 * This is the block-based search algorithm from Fig. 9 on p.410 of [1].
 *
 * @param text - The text to search
 * @param pattern - The pattern to look for; an empty pattern yields no matches
 * @param maxErrors - Maximum number of errors allowed in a match
 * @return Matches with `end` and `errors` set; `start` is the placeholder -1.
 */
function findMatchEnds(
  text: string,
  pattern: string,
  maxErrors: number
): Match[] {
  if (pattern.length === 0) {
    return [];
  }

  // Clamp error count so we can rely on the `maxErrors` and `pattern.length`
  // rows being in the same block below.
  maxErrors = Math.min(maxErrors, pattern.length);

  const matches: Match[] = [];

  // Word size.
  const w = 32;

  // Index of maximum block level.
  const bMax = Math.ceil(pattern.length / w) - 1;

  // Context used across block calculations.
  const ctx: Context = {
    P: new Uint32Array(bMax + 1),
    M: new Uint32Array(bMax + 1),
    lastRowMask: new Uint32Array(bMax + 1),
  };
  ctx.lastRowMask.fill(1 << 31);
  ctx.lastRowMask[bMax] = 1 << ((pattern.length - 1) % w);

  // Dummy "peq" array for chars in the text which do not occur in the pattern.
  const emptyPeq = new Uint32Array(bMax + 1);

  // Map of UTF-16 character code to bit vector indicating positions in the
  // pattern that equal that character.
  const peq = new Map<number, Uint32Array>();

  // Version of `peq` that only stores mappings for small characters. This
  // allows faster lookups when iterating through the text because a simple
  // array lookup can be done instead of a hash table lookup.
  const asciiPeq: Uint32Array[] = [];
  for (let i = 0; i < 256; i++) {
    asciiPeq.push(emptyPeq);
  }

  // Calculate `peq` - a map of character values to bitmasks indicating
  // positions of that character within the pattern, where each bit represents
  // a position in the pattern. Done in a single pass over the pattern: the
  // bit for pattern index `idx` lives in block `floor(idx / w)` at bit
  // `idx % w`. Bits for indexes beyond the end of the pattern stay unset.
  for (let idx = 0; idx < pattern.length; idx += 1) {
    const val = pattern.charCodeAt(idx);
    let patternCharPeq = peq.get(val);
    if (patternCharPeq === undefined) {
      patternCharPeq = new Uint32Array(bMax + 1);
      peq.set(val, patternCharPeq);
      if (val < asciiPeq.length) {
        asciiPeq[val] = patternCharPeq;
      }
    }
    patternCharPeq[Math.floor(idx / w)] |= 1 << (idx % w);
  }

  // Index of last-active block level in the column.
  let y = Math.max(0, Math.ceil(maxErrors / w) - 1);

  // Initialize maximum error count at bottom of each block.
  const score = new Uint32Array(bMax + 1);
  for (let b = 0; b <= y; b += 1) {
    score[b] = (b + 1) * w;
  }
  score[bMax] = pattern.length;

  // Initialize vertical deltas for each block.
  for (let b = 0; b <= y; b += 1) {
    ctx.P[b] = ~0;
    ctx.M[b] = 0;
  }

  // Process each char of the text, computing the error count for `w` chars of
  // the pattern at a time.
  for (let j = 0; j < text.length; j += 1) {
    // Lookup the bitmask representing the positions of the current char from
    // the text within the pattern.
    const charCode = text.charCodeAt(j);
    let charPeq: Uint32Array;

    if (charCode < asciiPeq.length) {
      // Fast array lookup.
      charPeq = asciiPeq[charCode];
    } else {
      // Slower hash table lookup.
      const mapped = peq.get(charCode);
      charPeq = mapped === undefined ? emptyPeq : mapped;
    }

    // Calculate error count for blocks that we definitely have to process for
    // this column.
    let carry = 0;
    for (let b = 0; b <= y; b += 1) {
      carry = advanceBlock(ctx, charPeq, b, carry);
      score[b] += carry;
    }

    // Check if we also need to compute an additional block, or if we can reduce
    // the number of blocks processed for the next column.
    if (
      score[y] - carry <= maxErrors &&
      y < bMax &&
      (charPeq[y + 1] & 1 || carry < 0)
    ) {
      // Error count for bottom block is under threshold, increase the number of
      // blocks processed for this column & next by 1.
      y += 1;

      ctx.P[y] = ~0;
      ctx.M[y] = 0;

      let maxBlockScore;
      if (y === bMax) {
        const remainder = pattern.length % w;
        maxBlockScore = remainder === 0 ? w : remainder;
      } else {
        maxBlockScore = w;
      }

      score[y] =
        score[y - 1] +
        maxBlockScore -
        carry +
        advanceBlock(ctx, charPeq, y, carry);
    } else {
      // Error count for bottom block exceeds threshold, reduce the number of
      // blocks processed for the next column.
      while (y > 0 && score[y] >= maxErrors + w) {
        y -= 1;
      }
    }

    // If error count is under threshold, report a match.
    if (y === bMax && score[y] <= maxErrors) {
      if (score[y] < maxErrors) {
        // Discard any earlier, worse matches.
        matches.splice(0, matches.length);
      }

      matches.push({
        start: -1,
        end: j + 1,
        errors: score[y],
      });

      // Because `search` only reports the matches with the lowest error count,
      // we can "ratchet down" the max error threshold whenever a match is
      // encountered and thereby save a small amount of work for the remainder
      // of the text.
      maxErrors = score[y];
    }
  }

  return matches;
}

/**
 * Search for the closest matches for `pattern` in `text`.
 *
 * Returns all matches that have the lowest number of errors, or an empty
 * array if no match was found with `maxErrors` or fewer errors.
 *
 * @param text - The text to search
 * @param pattern - The pattern to look for
 * @param maxErrors - Maximum number of errors (insertions, deletions or
 *   substitutions) allowed in a match
 */
export default function search(
  text: string,
  pattern: string,
  maxErrors: number
): Match[] {
  const matches = findMatchEnds(text, pattern, maxErrors);
  return findMatchStarts(text, pattern, matches);
}