# zxcvbn -- realistic password strength estimation
# Version: (not given)
# Listing: 610 lines (566 loc), 20.9 kB, text/coffeescript
frequency_lists = require('./frequency_lists')
adjacency_graphs = require('./adjacency_graphs')
scoring = require('./scoring')
# Turns an ordered word list into a lookup of word -> rank, where the first
# word has rank 1. A word that appears more than once keeps the rank of its
# last occurrence.
build_ranked_dict = (ordered_list) ->
  ranks = {}
  for word, position in ordered_list
    ranks[word] = position + 1 # ranks are 1-based, positions are 0-based
  ranks
# one ranked dictionary per entry of frequency_lists, stored under the same key.
RANKED_DICTIONARIES = {}
for list_name, word_list of frequency_lists
  RANKED_DICTIONARIES[list_name] = build_ranked_dict(word_list)
# adjacency graphs walked by spatial_match, keyed by layout name.
# spatial_match_helper only applies its initial shifted-character check to the
# 'qwerty' and 'dvorak' entries.
GRAPHS =
  qwerty: adjacency_graphs.qwerty
  dvorak: adjacency_graphs.dvorak
  keypad: adjacency_graphs.keypad
  mac_keypad: adjacency_graphs.mac_keypad
# default substitution table for l33t_match: each plain letter maps to the
# characters that may stand in for it.
# some substitutes are ambiguous: '1' and '|' appear under both i and l,
# and '7' appears under both l and t.
L33T_TABLE =
  a: ['4', '@']
  b: ['8']
  c: ['(', '{', '[', '<']
  e: ['3']
  g: ['6', '9']
  i: ['1', '!', '|']
  l: ['1', '|', '7']
  o: ['0']
  s: ['$', '5']
  t: ['+', '7']
  x: ['%']
  z: ['2']
# named regexes run by regex_match. every entry must carry the `g` flag:
# regex_match resets lastIndex and then calls exec() repeatedly until it
# returns null, which only terminates for global regexes.
REGEXEN =
  # four-digit years from 1900 through 2029. the previous pattern stopped at
  # 2019, so years in the 2020s were never reported as recent years.
  recent_year: /19\d\d|200\d|201\d|202\d/g
# inclusive bounds on what map_ints_to_dmy accepts as a four-digit year.
DATE_MAX_YEAR = 2050
DATE_MIN_YEAR = 1000
# for a separator-free date token of each length (4 to 8 digits), the ways
# date_match cuts it into three integers. an entry [k, l] means the pieces are
# token[0...k], token[k...l] and token[l...].
DATE_SPLITS =
  4:[ # for length-4 strings, eg 1191 or 9111, two ways to split:
    [1, 2] # 1 1 91 (2nd split starts at index 1, 3rd at index 2)
    [2, 3] # 91 1 1
    ]
  5:[
    [1, 3] # 1 11 91
    [2, 3] # 11 1 91
    ]
  6:[
    [1, 2] # 1 1 1991
    [2, 4] # 11 11 91
    [4, 5] # 1991 1 1
    ]
  7:[
    [1, 3] # 1 11 1991
    [2, 3] # 11 1 1991
    [4, 5] # 1991 1 11
    [4, 6] # 1991 11 1
    ]
  8:[
    [2, 4] # 11 11 1991
    [4, 6] # 1991 11 11
    ]
matching =
  # true when `obj` exposes no enumerable keys at all.
  empty: (obj) ->
    for key of obj
      return false
    true

  # appends every element of `lst2` to `lst` in place; returns the new length.
  extend: (lst, lst2) -> lst.push.apply(lst, lst2)

  # replaces each character of `string` that has a truthy entry in `chr_map`;
  # all other characters are kept as they are.
  translate: (string, chr_map) ->
    swap = (chr) -> chr_map[chr] or chr
    string.split('').map(swap).join('')

  # modulo that stays non-negative for negative `n`.
  mod: (n, m) ->
    remainder = n % m
    (remainder + m) % m

  # sorts `matches` in place by start index i, then by end index j, and
  # returns the same array.
  sorted: (matches) ->
    by_start_then_end = (left, right) ->
      start_gap = left.i - right.i
      if start_gap then start_gap else left.j - right.j
    matches.sort by_start_then_end
# ------------------------------------------------------------------------------
# omnimatch -- combine everything ----------------------------------------------
# ------------------------------------------------------------------------------
# runs every matcher against `password`, pools their matches into one list and
# returns that list ordered by start index, then end index.
omnimatch: (password) ->
  pipeline = [
    @dictionary_match
    @reverse_dictionary_match
    @l33t_match
    @spatial_match
    @repeat_match
    @sequence_match
    @regex_match
    @date_match
  ]
  collected = []
  pipeline.forEach (run_matcher) =>
    # matchers are plain properties, so bind `this` explicitly on each call.
    @extend collected, run_matcher.call(this, password)
  @sorted collected
#-------------------------------------------------------------------------------
# dictionary match (common passwords, english, last names, etc) ----------------
#-------------------------------------------------------------------------------
# finds every substring of `password` (compared in lower case) that is a word
# in one of the ranked dictionaries.
#
# password:             string to search.
# _ranked_dictionaries: {dictionary_name: {word: rank}}; overridable for unit
#                       testing purposes.
#
# returns a list of match objects sorted by i, then j. `token` keeps the
# password's original casing, `matched_word` is the lower-cased dictionary word.
dictionary_match: (password, _ranked_dictionaries = RANKED_DICTIONARIES) ->
  matches = []
  len = password.length
  password_lower = password.toLowerCase()
  # own-property test instead of `of`: `of` also sees keys inherited from
  # Object.prototype, so substrings like 'constructor' or '__proto__' would
  # match every plain-object dictionary and produce a non-numeric rank.
  has_own = Object::hasOwnProperty
  for dictionary_name, ranked_dict of _ranked_dictionaries
    for i in [0...len]
      for j in [i...len]
        word = password_lower[i..j]
        continue unless has_own.call ranked_dict, word
        matches.push
          pattern: 'dictionary'
          i: i
          j: j
          token: password[i..j]
          matched_word: word
          rank: ranked_dict[word]
          dictionary_name: dictionary_name
          reversed: false
          l33t: false
  @sorted matches
# dictionary match against the reversed password. the returned matches carry
# reversed: true, a token read in the original direction, and i/j expressed as
# indices into the original (un-reversed) password.
reverse_dictionary_match: (password, _ranked_dictionaries = RANKED_DICTIONARIES) ->
  flip = (text) -> text.split('').reverse().join('')
  last_index = password.length - 1
  found = @dictionary_match flip(password), _ranked_dictionaries
  for hit in found
    # the hit was located in the reversed string; mirror it back.
    {i: rev_start, j: rev_end} = hit
    hit.token = flip hit.token
    hit.reversed = true
    hit.i = last_index - rev_end
    hit.j = last_index - rev_start
  @sorted found
# ranks a copy of `ordered_list` and installs it as the 'user_inputs'
# dictionary, replacing any previous one.
set_user_input_dictionary: (ordered_list) ->
  snapshot = ordered_list.slice()
  RANKED_DICTIONARIES['user_inputs'] = build_ranked_dict(snapshot)
#-------------------------------------------------------------------------------
# dictionary match with common l33t substitutions ------------------------------
#-------------------------------------------------------------------------------
# returns a pruned copy of `table` that keeps, per letter, only the substitute
# characters that actually occur in `password`. letters left with no
# substitutes are dropped.
relevant_l33t_subtable: (password, table) ->
  seen = {}
  seen[chr] = true for chr in password.split('')
  pruned = {}
  for letter, subs of table
    usable = subs.filter (sub) -> sub of seen
    pruned[letter] = usable if usable.length > 0
  pruned
# returns the list of possible 1337 replacement dictionaries for a given password
#
# table: {letter: [l33t_chr, ...]}, normally the output of relevant_l33t_subtable.
#
# returns a list of {l33t_chr: letter} dicts. within one dict every l33t
# character maps to exactly one letter; when a character could stand for
# several letters, each alternative ends up in a separate dict.
enumerate_l33t_subs: (table) ->
  keys = (k for k of table)
  # each sub is an association list of [l33t_chr, letter] pairs
  subs = [[]]

  # drops subs that hold the same set of pairs as an earlier one, regardless
  # of pair order.
  dedup = (candidates) ->
    deduped = []
    members = {}
    for sub in candidates
      # destructure each pair. (`for k, v in sub` would bind k to the whole
      # pair and v to its index, which makes the label order-dependent and
      # lets reordered duplicates through.)
      assoc = ([k, v] for [k, v] in sub)
      assoc.sort()
      label = (k + ',' + v for [k, v] in assoc).join('-')
      unless label of members
        members[label] = true
        deduped.push sub
    deduped

  # consumes one letter per call, extending every sub built so far with each
  # of that letter's l33t characters.
  helper = (keys) ->
    return if not keys.length
    first_key = keys[0]
    rest_keys = keys[1..]
    next_subs = []
    for l33t_chr in table[first_key]
      for sub in subs
        # is this l33t character already mapped to another letter in sub?
        dup_l33t_index = -1
        for i in [0...sub.length]
          if sub[i][0] == l33t_chr
            dup_l33t_index = i
            break
        if dup_l33t_index == -1
          sub_extension = sub.concat [[l33t_chr, first_key]]
          next_subs.push sub_extension
        else
          # keep the existing mapping and also add a variant where the
          # character maps to the current letter instead.
          sub_alternative = sub.slice(0)
          sub_alternative.splice(dup_l33t_index, 1)
          sub_alternative.push [l33t_chr, first_key]
          next_subs.push sub
          next_subs.push sub_alternative
    subs = dedup next_subs
    helper(rest_keys)

  helper(keys)
  sub_dicts = [] # convert from assoc lists to dicts
  for sub in subs
    sub_dict = {}
    for [l33t_chr, chr] in sub
      sub_dict[l33t_chr] = chr
    sub_dicts.push sub_dict
  sub_dicts
# dictionary match after undoing l33t substitutions. for every candidate
# substitution dict the password is translated and dictionary-matched; only
# matches whose original token really contains a substituted character are
# kept. each kept match gets l33t: true, the original token, the subset of
# substitutions in use (`sub`) and a printable form of it (`sub_display`).
l33t_match: (password, _ranked_dictionaries = RANKED_DICTIONARIES, _l33t_table = L33T_TABLE) ->
  found = []
  candidate_subs = @enumerate_l33t_subs @relevant_l33t_subtable(password, _l33t_table)
  for sub in candidate_subs
    break if @empty sub # corner case: password has no relevant subs.
    translated = @translate password, sub
    for hit in @dictionary_match(translated, _ranked_dictionaries)
      original_token = password[hit.i..hit.j]
      # a token equal to the matched word contains no substitution at all.
      continue if original_token.toLowerCase() == hit.matched_word
      used = {} # the mappings of sub that occur in this token
      for l33t_chr, letter of sub
        used[l33t_chr] = letter unless original_token.indexOf(l33t_chr) == -1
      hit.l33t = true
      hit.token = original_token
      hit.sub = used
      hit.sub_display = ("#{k} -> #{v}" for k, v of used).join(', ')
      found.push hit
  # single-character l33t matches are noise: '1' would match 'i' and '4' would
  # match 'a', both very common English words with low dictionary rank.
  multi_char = found.filter (hit) -> hit.token.length > 1
  @sorted multi_char
# ------------------------------------------------------------------------------
# spatial match (qwerty/dvorak/keypad) -----------------------------------------
# ------------------------------------------------------------------------------
# collects the spatial matches found on every graph in `_graphs` and returns
# them ordered by start index, then end index.
spatial_match: (password, _graphs = GRAPHS) ->
  collected = []
  for layout, adjacency of _graphs
    per_layout = @spatial_match_helper password, adjacency, layout
    @extend collected, per_layout
  @sorted collected
# characters counted as shifted when they open a pattern on the 'qwerty' or
# 'dvorak' graph (see spatial_match_helper).
SHIFTED_RX: /[~!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:"ZXCVBNM<>?]/
# finds runs of three or more characters in `password` where each character is
# adjacent to the previous one according to `graph`.
#
# password:   string to scan.
# graph:      lookup from a character to its list of adjacency entries; an
#             entry may be falsy, otherwise it is searched with indexOf.
# graph_name: stored on each match; also decides whether the first character
#             of a run is tested against SHIFTED_RX.
#
# returns the matches in scan order (not passed through @sorted). each match
# records the number of direction changes (`turns`) and of shifted characters
# (`shifted_count`) in the run.
spatial_match_helper: (password, graph, graph_name) ->
  matches = []
  i = 0
  while i < password.length - 1
    j = i + 1
    last_direction = null
    turns = 0
    if graph_name in ['qwerty', 'dvorak'] and @SHIFTED_RX.exec(password.charAt(i))
      # initial character is shifted
      shifted_count = 1
    else
      shifted_count = 0
    loop
      prev_char = password.charAt(j-1)
      found = false
      found_direction = -1
      cur_direction = -1
      # characters missing from the graph have no neighbours
      adjacents = graph[prev_char] or []
      # consider growing pattern by one character if j hasn't gone over the edge.
      if j < password.length
        cur_char = password.charAt(j)
        for adj in adjacents
          # the position of an entry within adjacents identifies the direction
          cur_direction += 1
          if adj and adj.indexOf(cur_char) != -1
            found = true
            found_direction = cur_direction
            if adj.indexOf(cur_char) == 1
              # index 1 in the adjacency means the key is shifted,
              # 0 means unshifted: A vs a, % vs 5, etc.
              # for example, 'q' is adjacent to the entry '2@'.
              # @ is shifted w/ index 1, 2 is unshifted.
              shifted_count += 1
            if last_direction != found_direction
              # adding a turn is correct even in the initial case when last_direction is null:
              # every spatial pattern starts with a turn.
              turns += 1
              last_direction = found_direction
            break
      # if the current pattern continued, extend j and try to grow again
      if found
        j += 1
      # otherwise push the pattern discovered so far, if any...
      else
        if j - i > 2 # don't consider length 1 or 2 chains.
          matches.push
            pattern: 'spatial'
            i: i
            j: j-1
            token: password[i...j]
            graph: graph_name
            turns: turns
            shifted_count: shifted_count
        # ...and then start a new search for the rest of the password.
        i = j
        break
  matches
#-------------------------------------------------------------------------------
# repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------
#-------------------------------------------------------------------------------
# finds stretches of `password` made of one base string repeated two or more
# times. the base string is itself matched and scored recursively, and the
# outcome is stored on the match as base_matches / base_guesses.
repeat_match: (password) ->
  found = []
  greedy_rx = /(.+)\1+/g
  lazy_rx = /(.+?)\1+/g
  anchored_lazy_rx = /^(.+?)\1+$/
  cursor = 0
  while cursor < password.length
    # both searches start from the same position
    greedy_rx.lastIndex = cursor
    lazy_rx.lastIndex = cursor
    greedy_hit = greedy_rx.exec password
    lazy_hit = lazy_rx.exec password
    break unless greedy_hit?
    greedy_wins = greedy_hit[0].length > lazy_hit[0].length
    if greedy_wins
      # e.g. 'aabaab': greedy covers [aabaab, aab] while lazy only gets [aa, a].
      winner = greedy_hit
      # the greedy unit may itself repeat (aabaab within aabaabaabaab), so an
      # anchored lazy match over the whole hit yields the shortest unit.
      unit = anchored_lazy_rx.exec(winner[0])[1]
    else
      # e.g. 'aaaaa': lazy covers [aaaaa, a] while greedy only gets [aaaa, aa].
      winner = lazy_hit
      unit = winner[1]
    first = winner.index
    last = first + winner[0].length - 1
    # recursively match and score the base string
    analysis = scoring.most_guessable_match_sequence unit, @omnimatch(unit)
    found.push
      pattern: 'repeat'
      i: first
      j: last
      token: winner[0]
      base_token: unit
      base_guesses: analysis.guesses
      base_matches: analysis.sequence
      repeat_count: winner[0].length / unit.length
    cursor = last + 1
  found
# largest absolute codepoint step that sequence_match still reports.
MAX_DELTA: 5
# finds runs of characters whose unicode codepoints change by a constant step.
# steps other than 1 are allowed (9753), and so are alphabets beyond latin,
# such as Greek and Cyrillic.
#
# worked example, input 'abcdb975zy':
#
# password: a   b   c   d   b    9   7   5   z   y
# index:    0   1   2   3   4    5   6   7   8   9
# delta:      1   1   1  -2  -41  -2  -2  69   1
#
# expected result:
# [(i, j, delta), ...] = [(0, 3, 1), (5, 7, -2), (8, 9, 1)]
sequence_match: (password) ->
  return [] if password.length == 1
  found = []

  # records the run password[start..stop] if it qualifies as a sequence.
  record = (start, stop, step) =>
    size = Math.abs step
    # two-character runs only count when the step is exactly 1
    return unless stop - start > 1 or size == 1
    return unless 0 < size <= @MAX_DELTA
    run = password[start..stop]
    [kind, space] = switch
      when /^[a-z]+$/.test(run) then ['lower', 26]
      when /^[A-Z]+$/.test(run) then ['upper', 26]
      when /^\d+$/.test(run) then ['digits', 10]
      # conservatively stick with roman alphabet size.
      # (this could be improved)
      else ['unicode', 26]
    found.push
      pattern: 'sequence'
      i: start
      j: stop
      token: run
      sequence_name: kind
      sequence_space: space
      ascending: step > 0

  run_start = 0
  prev_step = null
  for k in [1...password.length]
    step = password.charCodeAt(k) - password.charCodeAt(k - 1)
    prev_step ?= step
    continue if step == prev_step
    # the step changed: close the current run at k - 1; that character also
    # opens the next run.
    run_end = k - 1
    record run_start, run_end, prev_step
    run_start = run_end
    prev_step = step
  record run_start, password.length - 1, prev_step
  found
#-------------------------------------------------------------------------------
# regex matching ---------------------------------------------------------------
#-------------------------------------------------------------------------------
# reports every hit of every regex in `_regexen` against `password`, ordered by
# start index, then end index. each match carries the regex's name and the raw
# exec() result.
regex_match: (password, _regexen = REGEXEN) ->
  found = []
  for regex_name, rx of _regexen
    rx.lastIndex = 0 # keeps regex_match stateless
    loop
      hit = rx.exec password
      break unless hit
      first = hit.index
      found.push
        pattern: 'regex'
        token: hit[0]
        i: first
        j: first + hit[0].length - 1
        regex_name: regex_name
        regex_match: hit
  @sorted found
#-------------------------------------------------------------------------------
# date matching ----------------------------------------------------------------
#-------------------------------------------------------------------------------
# finds substrings of `password` that can be read as a day/month/year date.
# returns 'date' matches sorted by i then j, each with token, separator
# ('' when the date has none), year, month and day.
date_match: (password) ->
  # a "date" is recognized as:
  #   any 3-tuple that starts or ends with a 2- or 4-digit year,
  #   with 2 or 0 separator chars (1.1.91 or 1191),
  #   maybe zero-padded (01-01-91 vs 1-1-91),
  #   a month between 1 and 12,
  #   a day between 1 and 31.
  #
  # note: this isn't true date parsing in that "feb 31st" is allowed,
  # this doesn't check for leap years, etc.
  #
  # recipe:
  # start with regex to find maybe-dates, then attempt to map the integers
  # onto month-day-year to filter the maybe-dates into dates.
  # finally, remove matches that are substrings of other matches to reduce noise.
  #
  # note: instead of using a lazy or greedy regex to find many dates over the full string,
  # this uses a ^...$ regex against every substring of the password -- less performant but leads
  # to every possible date match.
  matches = []
  maybe_date_no_separator = /^\d{4,8}$/
  maybe_date_with_separator = ///
    ^
    ( \d{1,4} ) # day, month, year
    ( [\s/\\_.-] ) # separator
    ( \d{1,2} ) # day, month
    \2 # same separator
    ( \d{1,4} ) # day, month, year
    $
  ///

  # dates without separators are between length 4 '1191' and 8 '11111991'
  for i in [0..password.length - 4]
    for j in [i + 3..i + 7]
      break if j >= password.length
      token = password[i..j]
      continue unless maybe_date_no_separator.exec token
      # try every way of cutting the digits into three integers
      candidates = []
      for [k,l] in DATE_SPLITS[token.length]
        dmy = @map_ints_to_dmy [
          parseInt token[0...k]
          parseInt token[k...l]
          parseInt token[l...]
        ]
        candidates.push dmy if dmy?
      continue unless candidates.length > 0
      # at this point: different possible dmy mappings for the same i,j substring.
      # match the candidate date that likely takes the fewest guesses: the one whose
      # year is closest to scoring.REFERENCE_YEAR.
      #
      # ie, considering '111504', prefer 11-15-04 to 1-1-1504
      # (interpreting '04' as 2004)
      best_candidate = candidates[0]
      metric = (candidate) -> Math.abs candidate.year - scoring.REFERENCE_YEAR
      min_distance = metric candidates[0]
      for candidate in candidates[1..]
        distance = metric candidate
        # strict comparison: on a tie the earlier candidate is kept
        if distance < min_distance
          [best_candidate, min_distance] = [candidate, distance]
      matches.push
        pattern: 'date'
        token: token
        i: i
        j: j
        separator: ''
        year: best_candidate.year
        month: best_candidate.month
        day: best_candidate.day

  # dates with separators are between length 6 '1/1/91' and 10 '11/11/1991'
  for i in [0..password.length - 6]
    for j in [i + 5..i + 9]
      break if j >= password.length
      token = password[i..j]
      rx_match = maybe_date_with_separator.exec token
      continue unless rx_match?
      # groups 1, 3 and 4 hold the three integers; group 2 is the separator
      dmy = @map_ints_to_dmy [
        parseInt rx_match[1]
        parseInt rx_match[3]
        parseInt rx_match[4]
      ]
      continue unless dmy?
      matches.push
        pattern: 'date'
        token: token
        i: i
        j: j
        separator: rx_match[2]
        year: dmy.year
        month: dmy.month
        day: dmy.day

  # matches now contains all valid date strings in a way that is tricky to capture
  # with regexes only. while thorough, it will contain some unintuitive noise:
  #
  # '2015_06_04', in addition to matching 2015_06_04, will also contain
  # 5(!) other date matches: 15_06_04, 5_06_04, ..., even 2015 (matched as 5/1/2020)
  #
  # to reduce noise, remove date matches whose i..j span lies inside the span of
  # another date match
  @sorted matches.filter (match) ->
    is_submatch = false
    for other_match in matches
      continue if match is other_match
      if other_match.i <= match.i and other_match.j >= match.j
        is_submatch = true
        break
    not is_submatch
# tries to read a 3-tuple of integers as a date and returns {year, month, day},
# or undefined when no reading works. the tuple is discarded if:
#   the middle int is over 31 or not positive (a year never sits in the middle),
#   any int is over the max allowable year,
#   any int is over two digits but under the min allowable year,
#   2 or more ints are over 31, the max allowable day,
#   2 or more ints are not positive,
#   all ints are over 12, the max allowable month.
map_ints_to_dmy: (ints) ->
  middle = ints[1]
  return if middle > 31 or middle <= 0
  for n in ints
    return if 99 < n < DATE_MIN_YEAR or n > DATE_MAX_YEAR
  above_31 = ints.filter((value) -> value > 31).length
  above_12 = ints.filter((value) -> value > 12).length
  below_1 = ints.filter((value) -> value <= 0).length
  return if above_31 >= 2 or above_12 == 3 or below_1 >= 2

  # each reading is [year, [the two remaining ints]]; year-last is tried first.
  year_last = [ints[2], ints[0..1]]
  year_first = [ints[0], ints[1..2]]
  readings = [year_last, year_first]

  # a four-digit year settles the question: either the other two ints form a
  # day and month, or the tuple is not a date.
  for [year, day_month] in readings
    continue unless DATE_MIN_YEAR <= year <= DATE_MAX_YEAR
    dm = @map_ints_to_dm day_month
    return unless dm?
    return {
      year: year
      month: dm.month
      day: dm.day
    }

  # no four-digit year: a two-digit year is the most flexible int to match, so
  # take the first reading whose other two ints form a day and month.
  for [year, day_month] in readings
    dm = @map_ints_to_dm day_month
    continue unless dm?
    return {
      year: @two_to_four_digit_year year
      month: dm.month
      day: dm.day
    }
  return
# reads a pair of ints as {day, month}, trying the given order first and the
# reversed order second. returns undefined when neither order gives a day in
# 1..31 together with a month in 1..12.
map_ints_to_dm: (ints) ->
  is_day = (n) -> 1 <= n <= 31
  is_month = (n) -> 1 <= n <= 12
  orderings = [ints, ints.slice().reverse()]
  for ordering in orderings
    [day, month] = ordering
    continue unless is_day(day) and is_month(month)
    return {
      day: day
      month: month
    }
  return
# expands a two-digit year into a four-digit one: 51..99 become 19xx and
# everything up to 50 becomes 20xx. values above 99 are returned unchanged.
two_to_four_digit_year: (year) ->
  return year if year > 99
  # 87 -> 1987, 15 -> 2015
  century = if year > 50 then 1900 else 2000
  year + century
module.exports = matching