UNPKG

@a-s8h/liblevenshtein

Version:

Various utilities regarding Levenshtein transducers.

702 lines (664 loc) 22.5 kB
global = if typeof exports is 'object' exports else if typeof window is 'object' window else this global['levenshtein'] ||= {} if typeof require is 'function' {levenshtein: {MaxHeap}} = require '../collection/max-heap' {levenshtein: {Transducer}} = require './transducer' {levenshtein: {Dawg}} = require '../collection/dawg' else MaxHeap = global['levenshtein']['MaxHeap'] Transducer = global['levenshtein']['Transducer'] Dawg = global['levenshtein']['Dawg'] fields = # Dictionary of terms '_dictionary': new Dawg([]) # Search algorithm to use '_algorithm': 'standard' # Sort the candidates as they are discovered '_sort_candidates': true # If sort_candidates, then sort them in a case-insensitive fashion '_case_insensitive_sort': true # Include the distance from the query term for each spelling candidate '_include_distance': true # Maximum number of spelling candidates to return '_maximum_candidates': Infinity # Customer comparator for the max-heap (optional). This should be an arity-2 # function that accepts two pairs of ["term", distance] values. '_custom_comparator': null # Custom transform for spelling candidates (optional). This should be an # arity-1 function that accepts a pair of ["term", distance] values. '_custom_transform': null # Maximum number of spelling errors that are tollerated. This can be # overridden with the second parameter to Transducer.transduce(term, n) '_default_edit_distance': Infinity class Builder constructor: (source, attributes) -> if source instanceof Builder for own field of fields this[field] = source[field] for own attribute, value of attributes this['_' + attribute] = value # The distance of each position in a state can be defined as follows: # # distance = w - i + e # # For every accepting position, it must be the case that w - i <= n - e. It # follows directly that the distance of every accepted position must be no # more than n: # # (w - i <= n - e) <=> (w - i + e <= n) <=> (distance <= n) # # The Levenshtein distance between any two terms is defined as the minimum # edit distance between the two terms. Therefore, iterate over each position # in an accepting state, and take the minimum distance among all its accepting # positions as the corresponding Levenshtein distance. _minimum_distance: () -> if @['_algorithm'] is 'standard' (state, w) -> minimum = Infinity for [i,e] in state distance = w - i + e minimum = distance if distance < minimum minimum else (state, w) -> minimum = Infinity for [i,e,x] in state distance = w - i + e minimum = distance if x isnt 1 and distance < minimum minimum _comparator: () -> if typeof @['_custom_comparator'] is 'function' @['_custom_comparator'] else if @['_sort_candidates'] # Sort by minimum distance from the query term. comparator = (a,b) -> a[1] - b[1] # Sort in a case-insensitive manner. comparator = do (comparator) -> (a,b) -> comparator(a,b) || a[0].toLowerCase().localeCompare(b[0].toLowerCase()) # If the terms are the same, case-insensitive, then compare them in a # case-sensitive manner. unless @['_case_insensitive_sort'] comparator = do (comparator) -> (a,b) -> comparator(a,b) || a[0].localeCompare(b[0]) comparator else () -> 0 #-> If we don't want to sort the matches, make all terms equal _transform: (comparator) -> transform = if typeof @['_custom_transform'] is 'function' @['_custom_transform'] else if @['_include_distance'] is false (candidate) -> candidate[0] (matches) => if isFinite @['_maximum_candidates'] matches['sort']() #-> sorts in reverse matches = matches['heap'] else if @['_sort_candidates'] heap = matches matches = [] matches.push heap['pop']() while heap['peek']() isnt null if typeof transform is 'function' i = -1; while (++i) < matches.length matches[i] = transform(matches[i]) matches _initial_state: () -> if @['_algorithm'] is 'standard' [[0,0]] else [[0,0,0]] # Accepts a state vector and sorts its elements in ascending order. _sort_for_transition: () -> comparator = (a,b) -> a[0] - b[0] || a[1] - b[1] if @['_algorithm'] in ['transposition', 'merge_and_split'] comparator = do (comparator) -> (a,b) -> comparator(a,b) || a[2] - b[2] (state) -> state.sort(comparator) _index_of: (vector, k, i) -> j = 0 while j < k return j if vector[i + j] j += 1 return -1 # Accepts a maximum edit distance and returns a transition function that maps # a position, state vector and table offset of the current state to its next # state. _transition_for_position: () -> switch @['_algorithm'] when 'standard' then (n) => ([i,e], vector, offset) => h = i - offset; w = vector.length if e < n if h <= w - 2 a = n - e + 1; b = w - h k = if a < b then a else b j = @_index_of(vector, k, h) if j == 0 [ [(i + 1), e] ] else if j > 0 [ [i, (e + 1)] [(i + 1), (e + 1)] [(i + j + 1), (e + j)] ] else [ [i, (e + 1)] [(i + 1), (e + 1)] ] else if h == w - 1 if vector[h] [ [(i + 1), e] ] else [ [i, (e + 1)] [(i + 1), (e + 1)] ] else # h == w [ [i, (e + 1)] ] else if e == n if h <= w - 1 if vector[h] [ [(i + 1), n] ] else null else null else null when 'transposition' then (n) => ([i,e,t], vector, offset) => h = i - offset; w = vector.length if e == 0 < n if h <= w - 2 a = n - e + 1; b = w - h k = if a < b then a else b j = @_index_of(vector, k, h) if j == 0 [ [(i + 1), 0, 0] ] else if j == 1 [ [i, 1, 0] [i, 1, 1] # t-position [(i + 1), 1, 0] [(i + 2), 1, 0] # was [(i + j + 1), j, 0], but j=1 ] else if j > 1 [ [i, 1, 0] [(i + 1), 1, 0] [(i + j + 1), j, 0] ] else [ [i, 1, 0] [(i + 1), 1, 0] ] else if h == w - 1 if vector[h] [ [(i + 1), 0, 0] ] else [ [i, 1, 0] [(i + 1), 1, 0] ] else # h == w [ [i, 1, 0] ] else if 1 <= e < n if h <= w - 2 if t is 0 # [i,e] is not a t-position a = n - e + 1; b = w - h k = if a < b then a else b j = @_index_of(vector, k, h) if j == 0 [ [(i + 1), e, 0] ] else if j == 1 [ [i, (e + 1), 0] [i, (e + 1), 1] # t-position [(i + 1), (e + 1), 0] [(i + 2), (e + 1), 0] # was [(i + j + 1), (e + j), 0], but j=1 ] else if j > 1 [ [i, (e + 1), 0] [(i + 1), (e + 1), 0] [(i + j + 1), (e + j), 0] ] else [ [i, (e + 1), 0] [(i + 1), (e + 1), 0] ] else if vector[h] [ [(i + 2), e, 0] ] else null else if h == w - 1 if vector[h] [ [(i + 1), e, 0] ] else [ [i, (e + 1), 0] [(i + 1), (e + 1), 0] ] else # h == w [ [i, (e + 1), 0] ] else if h <= w - 1 and t is 0 if vector[h] [ [(i + 1), n, 0] ] else null else if h <= w - 2 and t is 1 # [i,e] is a t-position if vector[h] [ [(i + 2), n, 0] ] else null else # h == w null when 'merge_and_split' then (n) => ([i,e,s], vector, offset) => h = i - offset; w = vector.length if e == 0 < n if h <= w - 2 if vector[h] [ [(i + 1), e, 0] ] else [ [i, (e + 1), 0] [i, (e + 1), 1] # s-position [(i + 1), (e + 1), 0] [(i + 2), (e + 1), 0] ] else if h == w - 1 if vector[h] [ [(i + 1), e, 0] ] else [ [i, (e + 1), 0] [i, (e + 1), 1] # s-position [(i + 1), (e + 1), 0] ] else # h == w [ [i, (e + 1), 0] ] else if e < n if h <= w - 2 if s is 0 if vector[h] [ [(i + 1), e, 0] ] else [ [i, (e + 1), 0] [i, (e + 1), 1] # s-position [(i + 1), (e + 1), 0] [(i + 2), (e + 1), 0] ] else # [i,e] is an s-position [ [(i + 1), e, 0] ] else if h == w - 1 if s is 0 if vector[h] [ [(i + 1), e, 0] ] else [ [i, (e + 1), 0] [i, (e + 1), 1] # s-position [(i + 1), (e + 1), 0] ] else # [i,e] is an s-position [ [(i + 1), e, 0] ] else # h == w [ [i, (e + 1), 0] ] else if h <= w - 1 if s is 0 if vector[h] [ [(i + 1), n, 0] ] else null else # [i,e] is an s-position [ [(i + 1), e, 0] ] else # h == w null # Given two positions [i,e] and [j,f], for [i,e] to subsume [j,f], it must be # the case that e < f. Therefore, I can remove a redundant check for (e < f) # within the subsumes method by finding the first index that contains a # position having an error greater than the current one (assuming that the # positions are sorted in ascending order, according to error). _bisect_error_right: (state, e, l) -> u = state.length while l < u i = (l + u) >> 1 if e < state[i][1] u = i else l = i + 1 return l # Removes all subsumed positions from a state _unsubsume: () => subsumes = @_subsumes() bisect_error_right = @_bisect_error_right switch @['_algorithm'] when 'standard' (state) -> m = 0 while x = state[m] [i,e] = x; n = bisect_error_right(state, e, m) while y = state[n] [j,f] = y if subsumes(i,e, j,f) state.splice(n,1) else n += 1 m += 1 return when 'transposition' (state) -> m = 0 while x = state[m] [i,e,s] = x; n = bisect_error_right(state, e, m) while y = state[n] [j,f,t] = y if subsumes(i,e,s, j,f,t, n) state.splice(n,1) else n += 1 m += 1 return when 'merge_and_split' (state) -> m = 0 while x = state[m] [i,e,s] = x; n = bisect_error_right(state, e, m) while y = state[n] [j,f,t] = y if subsumes(i,e,s, j,f,t, n) state.splice(n,1) else n += 1 m += 1 return # NOTE: See my comment above bisect_error_right(state,e,l) and how I am using # it in _unsubsume for why I am not checking (e < f) below. _subsumes: () -> switch @['_algorithm'] when 'standard' then (i,e, j,f) -> #(e < f) && Math.abs(j - i) <= (f - e) ((i < j) && (j - i) || (i - j)) <= (f - e) when 'transposition' then (i,e,s, j,f,t, n) -> if s is 1 if t is 1 #(e < f) && (i == j) (i == j) else #(e < f == n) && (i == j) (f == n) && (i == j) else if t is 1 # We have two cases: # # Case 1: (j < i) => (j - i) = - (i - j) # => |j - (i - 1)| = |j - i + 1| # = |-(i - j) + 1| # = |-(i - j - 1)| # = i - j - 1 # # Case 1 holds, because i and j are integers, and j < i implies i is at # least 1 unit greater than j, further implying that i - j - 1 is # non-negative. # # Case 2: (j >= i) => |j - (i - 1)| = |j - i + 1| = j - i + 1 # # Case 2 holds for the same reason case 1 does, in that j - i >= 0, and # adding 1 to the difference will only strengthen its non-negativity. # #Math.abs(j - (i - 1)) <= (f - e); (if (j < i) then (i - j - 1) else (j - i + 1)) <= (f - e) else #(e < f) && Math.abs(j - i) <= (f - e) ((i < j) && (j - i) || (i - j)) <= (f - e) when 'merge_and_split' then(i,e,s, j,f,t) -> if s is 1 and t is 0 false else #(e < f) && Math.abs(j - i) <= (f - e) ((i < j) && (j - i) || (i - j)) <= (f - e) _bisect_left: () -> if @['_algorithm'] (state, position) -> [i,e] = position; l = 0; u = state.length while l < u k = (l + u) >> 1 p = state[k] if (e - p[1] || i - p[0]) > 0 l = k + 1 else u = k return l else (state, position) -> [i,e,x] = position; l = 0; u = state.length while l < u k = (l + u) >> 1 p = state[k] if (e - p[1] || i - p[0] || x - p[2]) > 0 l = k + 1 else u = k return l # Merges the positions of next_state into state_prime, in a # subsumption-friendly manner. _merge_for_subsumption: () -> bisect_left = @_bisect_left() if @['_algorithm'] is 'standard' (state_prime, next_state) -> # Order according to error first, then boundary (both ascending). # While sorting the elements, remove any duplicates. for position in next_state i = bisect_left(state_prime, position) if curr = state_prime[i] if curr[0] != position[0] || curr[1] != position[1] state_prime.splice(i, 0, position) else state_prime.push(position) return else (state_prime, next_state) -> # Order according to error first, then boundary (both ascending). # While sorting the elements, remove any duplicates. for position in next_state i = bisect_left(state_prime, position) if curr = state_prime[i] if curr[0] != position[0] || curr[1] != position[1] || curr[2] != position[2] state_prime.splice(i, 0, position) else state_prime.push(position) return _transition_for_state: () -> merge_for_subsumption = @_merge_for_subsumption() unsubsume = @_unsubsume() transition_for_position = @_transition_for_position() sort_for_transition = @_sort_for_transition() (n) -> transition = transition_for_position(n) (state, vector) => offset = state[0][0]; state_prime = [] for position in state next_state = transition(position, vector, offset) continue unless next_state merge_for_subsumption(state_prime, next_state) unsubsume(state_prime) if state_prime.length > 0 sort_for_transition(state_prime) state_prime else null _characteristic_vector: () -> (x, term, k, i) -> vector = []; j = 0 while j < k vector.push(x is term[i + j]) j += 1 vector _push: (compare) -> maximum_candidates = @['_maximum_candidates'] if isFinite maximum_candidates (candidates, candidate) -> if candidates.length is maximum_candidates # We are maintaining a max-heap so that the element furthest from the # query term will be on the top. If the new candidate is closer to # the query term then it should replace the old one. if compare(candidate, candidates['peek']()) < 0 candidates['pop']() candidates.push(candidate) else candidates.push(candidate) candidates else (candidates, candidate) -> candidates.push(candidate) candidates 'build': () -> comparator = @_comparator() new Transducer({ 'minimum_distance': @_minimum_distance() 'build_matches': do => if isFinite @['_maximum_candidates'] () -> new MaxHeap(comparator) else if @['_sort_candidates'] () -> new MaxHeap (a,b) -> - comparator(a,b) else () -> [] 'transition_for_state': @_transition_for_state() 'characteristic_vector': @_characteristic_vector() 'edges': (dawg_node) -> dawg_node['edges'] 'is_final': (dawg_node) -> dawg_node['is_final'] 'root': do (dawg = @['_dictionary']) -> () -> dawg['root'] 'initial_state': do (initial_state=@_initial_state()) -> () => initial_state 'push': @_push(comparator) 'default_edit_distance': () => @['default_edit_distance']() 'transform': @_transform(comparator) }) # Aliases Builder::transducer to Builder::build, for those who prefer the # syntax, builder.transducer(), over builder.build() Builder::['transducer'] = Builder::['build'] # Initialize the default, property values for own property, value of fields Builder::[property] = value # Performs no operation noop = () -> return # Identity function: returns whatever you give it identity = (x) -> x def_property = def_properties = (properties, params; property, i) -> [validate, translate] = [params['validate'], params['translate']] if typeof properties is 'string' properties = [properties] unless properties instanceof Array throw new Error('Expected "properties" to be of type Array') if validate isnt `undefined` and typeof validate isnt 'function' throw new Error('Expected "validate" to be of type Function') if translate isnt `undefined` and typeof translate isnt 'function' throw new Error('Expected "translate" to be of type Function') validate ||= noop translate ||= identity for property, i in properties if typeof property isnt 'string' throw new Error( "Expected property at index #{i} of properties to be of type String") do (property) -> field = '_' + property Builder::[property] = (value, opts...) -> if value is `undefined` @[field] else validate(value, opts, property) value = translate(value, opts, property) attributes = {} attributes[property] = value new Builder(this, attributes) true def_property 'dictionary', 'validate': (dictionary) -> unless dictionary instanceof Array or dictionary instanceof Dawg throw new Error('dictionary must be either an Array or Dawg') 'translate': (dictionary, [sorted]) -> if dictionary instanceof Array dictionary.sort() unless sorted is true dictionary = new Dawg(dictionary) dictionary def_property 'algorithm', 'validate': (algorithm) -> unless algorithm in ['standard', 'transposition', 'merge_and_split'] throw new Error( 'algorithm must be standard, transposition, or merge_and_split') def_properties ['sort_candidates', 'case_insensitive_sort', 'include_distance'], 'validate': (value, _, property) -> unless typeof value is 'boolean' throw new Error("Expected type of \"#{property}\" to be boolean") def_properties ['maximum_candidates', 'default_edit_distance'], 'validate': (value, _, property) -> unless typeof value is 'number' and 0 <= value throw new Error("Expected \"#{property}\" to be a non-negative number") def_properties ['custom_comparator', 'custom_transform'], 'validate': (value, _, property) -> unless typeof value is 'function' throw new Error("Expected \"#{property}\" to be a function") global['levenshtein']['Builder'] = Builder