// motion
// Version:
// motion - moving development forward
// 294 lines (265 loc) • 12 kB
// JavaScript
// Algorithm: Find all the possible ways the abbr could match the string, then using the various weighting options, return the greatest score of all of them.
// Tests:
// + A string that matches exactly, including the case, has a similar rating of 1.0
// + A string that matches exactly, except for the case, has a similar rating of 0.9
// + A string that has all characters in common, but none are in the same order, has a similar rating of 0.4
// + A string that has zero common characters has a similar rating of 0.0
// To create these scores, each character must be given a weight toward its own score, based on the proportional sizes of the given and test strings.
// The given string must be given a base score as well, based on how many words, capital letters, etc, it has.
// Each bonus is given to a character based on the total potential score of that bonus in the given string.
// All of these are score MODIFIERS - each test character starts with a base score of 1, each applicable boost adds to (or subtracts from) its overall score.
// Scoring weights read by String.prototype.score. Every value is a MODIFIER that is
// added to (or subtracted from) the base value of 1 that each matched character earns.
String.scoring_options = {
  // Extra boost for matching the very first character; stacks with the acronym boost.
  firstChar: 0.5,
  // Boost for matching the start of a word, e.g. 'qbf' or 'QBF' for 'The Quick Brown Fox'.
  acronym: 2.2,
  // Boost for matching a capital letter in the string, whatever the case of the query
  // character. Thus 'ABC'.score('ABC') > 'abc'.score('abc').
  capitalLetter: 0.8,
  // Boost when the query character matches with the same case.
  caseMatch: 0.3,
  // Boost when two consecutive query characters match two consecutive string characters.
  consecutiveChars: 0.5,
  // Penalty factor applied when a match is not adjacent to the previous match.
  nonConsecutiveChars: -0.2,
  // Penalty when a query character does not occur in the string at all.
  missingMatch: -6
};

// Handling of a query character that does match, but out of order relative to the rest.
// `score` is subtracted proportionally to the ACTUAL point value of each abbreviation
// character, and `multiplier` scales down every other boost on that character. The intent
// is that including a character stays worthwhile even when it is out of place.
String.scoring_options.outOfOrder = {
  // Removes the base value of having the character in there - only bonuses count now.
  score: -1.5,
  // Shrinks the other boosts by a LOT; any boost at all still keeps the score positive.
  multiplier: 0.1
};
/**
 * Removes the element at `index` from this array (in place) and returns it.
 * Returns undefined when nothing was removed.
 */
Array.prototype.remove = function (index) {
  var removed = this.splice(index, 1);
  return removed[0];
};
// Fallback for engines that lack Array.prototype.includes. The native method is never
// replaced: overwriting it would drop NaN matching and the optional fromIndex argument
// for every other piece of code running on the page.
if (!Array.prototype.includes) {
  /**
   * Tells whether `value` occurs in this array.
   * @param {*} value - Value to look for (NaN is considered equal to NaN).
   * @param {number} [fromIndex=0] - Index to start from; negative values count from the end.
   * @returns {boolean} true when the value was found.
   */
  Array.prototype.includes = function (value, fromIndex) {
    var len = this.length;
    var start = Number(fromIndex) || 0;
    start = start < 0 ? Math.max(len + Math.ceil(start), 0) : Math.floor(start);
    for (var i = start; i < len; i++) {
      // The second test handles NaN, the only value that is not equal to itself.
      if (this[i] === value || (value !== value && this[i] !== this[i])) return true;
    }
    return false;
  };
}
/**
 * Counts the elements for which `how` returns a truthy value.
 * `how` receives no arguments; the element under test is bound as `this`.
 */
Array.prototype.count_how = function (how) {
  var size = this.length;
  var total = 0;
  var idx = 0;
  while (idx < size) {
    if (how.call(this[idx])) {
      total += 1;
    }
    idx += 1;
  }
  return total;
};
/**
 * Invokes `cb` once per index, in order. The element is bound as `this` and the
 * index is passed as the only argument. Returns the array itself so calls can be chained.
 */
Array.prototype.each = function (cb) {
  var size = this.length;
  var idx = 0;
  while (idx < size) {
    cb.call(this[idx], idx);
    idx += 1;
  }
  return this;
};
/**
 * Builds a new array from the return values of `cb`, called once per index.
 *
 * This definition replaces the built-in Array.prototype.map, so it has to serve both
 * calling conventions:
 *  - legacy callers in this file's style read the element from `this`;
 *  - standard callers read `(element, index, array)` from the arguments and may pass
 *    an explicit `thisArg`.
 * When no `thisArg` is given, the element is bound as `this`.
 *
 * Unlike the built-in, every index from 0 to length-1 is visited (holes included) and
 * the result is always a dense array.
 *
 * @param {Function} cb - Mapping callback.
 * @param {*} [thisArg] - Value to use as `this` inside `cb`; defaults to the element.
 * @returns {Array} The mapped values, in order.
 */
Array.prototype.map = function (cb, thisArg) {
  var hasThisArg = arguments.length > 1;
  var len = this.length;
  var a = [];
  for (var i = 0; i < len; i++) {
    a.push(cb.call(hasThisArg ? thisArg : this[i], this[i], i, this));
  }
  return a;
};
/**
 * Returns the element that ranks highest. The ranking itself is delegated to
 * highest_i(how), which yields the index of the winning element.
 */
Array.prototype.highest = function (how) {
  var winner = this.highest_i(how);
  return this[winner];
};
/**
 * Returns the index of the highest-ranking element.
 *
 * @param {Function} [how] - Optional ranking function, called as how(element) exactly
 *   once per element. When omitted the elements themselves are compared.
 * @returns {number} Index of the first element with the greatest rank; 0 for an empty array.
 */
Array.prototype.highest_i = function (how) {
  var len = this.length;
  if (len === 0) return 0;
  // Seed the running best with the RANK of the first element. Seeding it with the raw
  // element would compare ranks against an unranked value, which is never exceeded when
  // the elements are strings or objects.
  var best_i = 0;
  var best_s = how ? how(this[0]) : this[0];
  var i, s;
  for (i = 1; i < len; i++) {
    s = how ? how(this[i]) : this[i];
    if (s > best_s) {
      best_s = s;
      best_i = i;
    }
  }
  return best_i;
};
/**
 * Returns the first character of the string, or '' when the string is empty.
 */
String.prototype.first = function () {
  return this.charAt(0);
};
/**
 * Counts the positions in this string at which `regexp` finds a match.
 *
 * After each hit the search resumes one character past the START of that hit, so
 * overlapping matches are counted ('aaa' contains /aa/ twice).
 *
 * @param {RegExp|string} regexp - Pattern handed to String.prototype.search.
 * @returns {number} Number of matches found.
 */
String.prototype.count_match = function (regexp) {
  var rest = '' + this;
  var count = 0;
  var pos = rest.search(regexp);
  while (pos > -1) {
    count += 1;
    // A match at (or past) the end of the remaining text can only be zero-width, e.g.
    // /x*/ or /$/. Slicing cannot shorten the text any further, so searching again
    // would find the same match forever.
    if (pos >= rest.length) break;
    rest = rest.slice(pos + 1);
    pos = rest.search(regexp);
  }
  return count;
};
var MatchTree = function(parent, orig_string, string, abbr, positions){
this.ancestry = function(){
return this.parent ? this.parent.ancestry().concat([this]) : [];
};
this.paths = function(){
var paths=[],abbr_so_far;
if(this.next_matches){
var i,mlen=this.next_matches.length;
for(i=0;i<mlen;i++){
paths = paths.concat(this.next_matches[i].paths());
}
}else{
paths.push(this.ancestry());
}
return paths;
};
// Constructor
if(parent.constructor===String){
this.parent = null;
this.original = ''+parent;
this.positions = [];
this.position = 0;
abbr = orig_string;
string = parent;
orig_string = string;
}else{
this.parent = parent;
this.original = parent.original; // pass the string down each level
this.positions = positions;
this.position = parent.original.length - string.length;
// In this case, the abbr that is sent matches on the first character. Here we analyze that match before matching further.
var match_chr = string.first();
this.chr = abbr.first();
// Add bonuses
this.match_info = [this.chr + this.position];
if(match_chr.toLowerCase()!==this.chr.toLowerCase()){
this.match_info.push('missingMatch');
this.position=parent.position;
}else{
if(this.position<this.parent.position)this.match_info.push('outOfOrder');
if(this.position===0)this.match_info.push('firstChar');
if(this.position===0 || this.original.slice(this.position-1,this.position)==' ')this.match_info.push('acronym');
if(match_chr.toUpperCase()===match_chr)this.match_info.push('capitalLetter');
if(this.chr===match_chr)this.match_info.push('caseMatch');
if(this.position!==0 && this.parent.position===this.position-1)this.match_info.push('consecutiveChars');
else this.match_info.push('nonConsecutiveChars');
}
// Shift over to the next abbreviation character
string=''+this.original;
abbr = abbr.slice(1);
}
// If there is any more abbr to match, then create children
if(abbr.length>0){
this.next_matches = [];
// Create a new MatchTree for each possible match, and save them in this.matches
var chr=abbr[0],pos=string.toLowerCase().indexOf(chr.toLowerCase()),pp;
while(pos>-1){
pp=this.original.length-string.length+pos;
// Can't match the same character twice.
if(!this.positions.includes(pp))
this.next_matches.push(new MatchTree(this, orig_string, string.slice(pos), abbr, this.positions.concat([pp])));
string=string.slice(pos+1);
pos=string.toLowerCase().indexOf(chr.toLowerCase());
}
if(this.next_matches.length===0){
this.next_matches.push(new MatchTree(this, orig_string, string, abbr, this.positions));
}
}
};
// Cache of computed scores, addressed as CachedScores[string][abbr]. Both levels are
// prototype-less objects so that strings such as 'constructor' or '__proto__' are ordinary
// keys instead of colliding with inherited members.
// The cache is keyed by the two strings only; it does not notice later changes to
// String.scoring_options.
var CachedScores = Object.create(null);
/**
 * Rates how well `abbr` abbreviates this string.
 *
 * Every possible alignment of the abbreviation's characters with the string is built
 * (see MatchTree), each alignment is scored with the weights in String.scoring_options,
 * and the best alignment's score is returned as a proportion of the string's potential
 * score.
 *
 * Missing weights in String.scoring_options are filled in with defaults, on the shared
 * options object itself.
 *
 * @param {string} abbr - The abbreviation / query to rate.
 * @returns {number} 1.0 for an exact match, 0 for an empty (or null/undefined) abbr,
 *   otherwise best score / potential score. The result is not clamped; penalties can
 *   make it negative.
 */
String.prototype.score = function(abbr){
  // Cheat: if exact match, go ahead and just immediately return the highest score possible.
  // Loose equality is deliberate: in non-strict code `this` is a String wrapper object.
  if(this==abbr) return 1.0;
  // Nothing to match against: there is no match path to score.
  if(abbr==null || abbr.length===0) return 0;
  // Use cached version if we've already scored this string for this abbr!
  // (`in` rather than truthiness, so a cached score of 0 is reused as well.)
  var cached = CachedScores[this];
  if(cached && (abbr in cached)) return cached[abbr];

  // Set up the scoring options. These are all adding to or subtracting from a regular fact-of-match = 1
  var options = String.scoring_options || {};
  var defaults = {
    // when the first character of the string matches the first character of the query
    // (just adds on to the acronym boost)
    firstChar: 0.5,
    // such as 'qbf' or 'QBF' when referring to 'The Quick Brown Fox'
    acronym: 1,
    // match boost for matching capital letters, whether query was capital or not. Thus, 'ABC'.score('ABC') > 'abc'.score('abc')
    capitalLetter: 0.2,
    // when the query character matches case-sensitive
    caseMatch: 0.2,
    // when two consecutive characters of the query match two consecutive characters of the string
    consecutiveChars: 0.2,
    // when a query character is missing from the string
    missingMatch: -5
  };
  // Only fill in weights that are absent; an explicitly configured 0 is respected.
  var key;
  for(key in defaults){
    if(Object.prototype.hasOwnProperty.call(defaults,key) && options[key]==null) options[key] = defaults[key];
  }
  // when a query character matches in the string, but is not in order with the rest of the string
  if(options.outOfOrder==null) options.outOfOrder = {
    // Subtracts this proportionally based on the ACTUAL point value of each character in the abbreviation,
    // and then reduces the other boosts by the multiplier value. That way we're SURE it's always positively
    // valuable to include a character even if it's out of place.
    score: -0.95, // subtracts most of the value of having the character in there
    multiplier: 0.1 // reduces the effect of the other boosts by quite a bit - although if there are several boosts, the score will still amount to at least something
  };
  // Other ideas for weights:
  // +bump related to the number of matching characters there are in the string that follow the last match
  // +bump for how many characters in the string match the test character
  // +bump for close proximity
  // +bump related to the size of the abbreviation compared to the size of the test string
  // +bump for a higher number of matching consecutive characters

  // Find all possible match paths
  var match_tree = new MatchTree(this, abbr);
  // Secondly, determine a potential score on the base string for each bonus type.
  var potential_scores = {
    // how many /\s\w/ in the string (include the first character)?
    words: this.count_match(/\s\w/)+1,
    // how many capital letters in the string?
    capitals: this.count_match(/[A-Z]/),
    // how many characters in the string
    length: this.length
  };
  var potential_score =
    options.firstChar +
    potential_scores.length +
    potential_scores.length * (0.0 + options.caseMatch) +
    (potential_scores.length-1) * (0.0 + options.consecutiveChars) +
    potential_scores.words * (0.0 + options.acronym) +
    potential_scores.capitals * (0.0 + options.capitalLetter);
  // Thirdly, give scores to each matched character, proportional to the potential score for each bonus type.
  // proportion of the square roots of the lengths?
  // var proportion = Math.sqrt(abbr.length) / Math.sqrt(this.length);
  var proportion = abbr.length / this.length;
  var score_per_character = potential_score * proportion / abbr.length;

  // Now, score each match path
  var paths = match_tree.paths();
  var scores = [];
  var p, n, m, path, node, info, score, multiplier;
  for(p=0;p<paths.length;p++){
    path = paths[p];
    score = 0;
    // Per-path trace of [match_info, running score, ...] pairs.
    path.match_infos = [];
    for(n=0;n<path.length;n++){
      node = path[n];
      // Base value of one abbreviation character.
      score += 1;
      // Out-of-order matches keep only a fraction of their other boosts.
      multiplier = node.match_info.includes('outOfOrder') ? options.outOfOrder.multiplier : 1;
      for(m=0;m<node.match_info.length;m++){
        info = node.match_info[m];
        // Skips the leading chr+position label and any weight that is unset or 0.
        if(!options[info]) continue;
        if(info==='outOfOrder'){
          score += options.outOfOrder.score * (score_per_character-1);
        }else if(info==='nonConsecutiveChars'){
          // This score gets worse depending on the proximity of the two characters' match locations
          score += ((node.position - node.parent.position) / abbr.length) * (options.nonConsecutiveChars * 2);
        }else{
          score += options[info] * multiplier;
        }
      }
      path.match_infos.push(node.match_info, score);
    }
    // Extra: first-abbr-character-is-on-acronym boost - 3/4 of firstChar boost
    if(path[0].match_info.includes('acronym') && !path[0].match_info.includes('firstChar'))
      score += (options.firstChar * 3/4);
    scores.push(score);
  }

  // Last, return the proportion of the best score to the base potential score.
  var highest = scores[0];
  for(p=1;p<scores.length;p++){
    if(scores[p]>highest) highest = scores[p];
  }
  var final_score = highest===0 ? 0 : highest/potential_score;
  // Cache the resulting score!
  if(!CachedScores[this]) CachedScores[this] = Object.create(null);
  CachedScores[this][abbr] = final_score;
  return final_score;
};
/**
 * Returns the index chosen by highest_i when every element is ranked by its
 * score(abbr) result.
 */
Array.prototype.best_score_index = function (abbr) {
  var scoreAgainstAbbr = function (candidate) {
    return candidate.score(abbr);
  };
  return this.highest_i(scoreAgainstAbbr);
};