guv

#     guv - Scaling governor of cloud workers
#     (c) 2015 The Grid
#     guv may be freely distributed under the MIT license

debug = require('debug')('guv:heroku')
Heroku = require 'heroku-client'
child = require 'child_process'
async = require 'async'
statistics = require 'simple-statistics'

# When true, setWorkers() reports success without calling the Heroku API
exports.dryrun = false

# Look up the API token for the Heroku client.
# getWorkers() reads it from the global '*' section, while setWorkers() historically
# read it from the top level of the object it was given. Both locations are accepted,
# top level first, so existing callers keep getting the same token.
apiToken = (config) ->
  return config.apikey or config['*']?.apikey or process.env['HEROKU_API_KEY']

# List the current formation of every app referenced in @config.
#
# config:   role name -> role settings (each with an .app), plus the global '*' section
# callback: (err, workers). Each worker is the object returned by the Heroku client,
#           with .app replaced by the app name and .type renamed to .role
exports.getWorkers = (config, callback) ->
  mainConfig = config['*']
  return callback new Error "Missing global configuration (*)" if not mainConfig

  options =
    token: mainConfig.apikey or process.env['HEROKU_API_KEY']
  heroku = new Heroku options

  getFormation = (appname, cb) ->
    heroku.apps(appname).formation().list cb

  # Several roles can live in the same app, only query each app once
  appnames = []
  for name, role of config
    continue if name == '*'
    app = role.app
    exists = app in appnames
    appnames.push app if not exists

  async.map appnames, getFormation, (err, response) ->
    debug 'getworkers returned', err, response
    return callback err if err

    workers = []
    for res in response
      for dyno in res
        # expose all the data (note: this is the same object, not a copy)
        w = dyno
        # but rename to our expectations
        w.app = dyno.app.name
        w.role = dyno.type
        delete w.type
        workers.push w

    return callback null, workers

# Scale the given workers, issuing one batch update per app.
#
# config:   object holding the API key, either as .apikey or under the '*' section
# workers:  list of { app, role, quantity }
# callback: (err, results). Called with (null) and no results when exports.dryrun is set
exports.setWorkers = (config, workers, callback) ->
  options =
    token: apiToken config
  heroku = new Heroku options

  # sort workers into belonging app/formation
  formations = {}
  for w in workers
    formations[w.app] = [] if not formations[w.app]
    formations[w.app].push { process: w.role, quantity: w.quantity }

  scaleFormation = (appname, cb) ->
    formation = formations[appname]
    heroku.apps(appname).formation().batchUpdate { updates: formation }, cb

  return callback null if exports.dryrun

  debug 'scaling', workers
  appnames = Object.keys formations
  async.map appnames, scaleFormation, (err, res) ->
    debug 'scaled returned', err, res
    return callback err, res

# Collect every match of @regexp (which should carry the 'g' flag) in @str.
# Each entry is an array [fullmatch, group1, ...] with an extra .index property.
matchAll = (regexp, str) ->
  matches = []
  # String.replace is only used as an iterator here, its result is discarded
  str.replace regexp, () ->
    arr = ([]).slice.call arguments, 0
    # the last two callback arguments are the match offset and the whole input string
    extras = arr.splice -2
    arr.index = extras[0]
    matches.push arr
  return matches

startsWith = (str, prefix) ->
  return str.indexOf(prefix) == 0

# Parse the dyno counts out of a scale message.
#
# input format:
#   Scale to guv=1, measuremedia=1, solveslow=10, web=3 by team+gridbot@thegrid.io
# Returns { dynoname: count }. A message that does not have this shape gives {}
parseScaleTo = (str) ->
  re = /Scale to (.*) by.*/
  match = re.exec str
  # e.g. a 'Scale to ...' line without the trailing ' by <user>' part
  return {} if not match
  dynostr = match[1]
  dynos = {}
  for d in dynostr.split ', '
    [name, number] = d.split('=')
    dynos[name] = parseInt(number, 10)
  return dynos

# Log messages starting with any of these are known noise, and produce no event
ignoredPrefixes = [
  'info'
  'warn'
  'err!'
  # debug messages
  'at=info'
  'sock=client' # Heroku router message
  'Error:' # JS exception
  '{"v"' # NewRelic event thing
  'source=HEROKU_POSTGRESQL' # NewRelic event thing
  # app specific. FIXME: make general
  'Measurement task'
  'New job'
  'Received measurement'
  'running: update'
  'done'
]

isIgnoredLogline = (info) ->
  for prefix in ignoredPrefixes
    return true if startsWith info, prefix
  return info.indexOf('noflo-runtime-msgflo:error') != -1

# Extract dyno lifecycle events from the text of a log.
#
# logdata: log contents, lines shaped like '<timestamp> <target>[<dyno>]: <message>'
# started: function(message) -> truthy when the message shows that the process has started
# Returns a list of { type, time, dyno, msg } in log order ('scale-to' also has .requested)
eventsFromLog = (logdata, started) ->
  events = []
  # TODO: allow to output a cleaned/minimized logfile. Especially for tests

  # timestamp, target (app|heroku), action
  re = /^(.*?) (\w+)\[(.*)\]: (.*)$/mg
  matches = matchAll re, logdata
  for m in matches
    [_full, timestamp, target, dyno, info] = m
    timestamp = new Date(timestamp)

    # known things to ignore. Checked first, so they win over the event patterns below
    continue if isIgnoredLogline info

    # events we care about
    if startsWith info, 'Scale to'
      # note: affects multiple dynos, each can go up, down or no change
      scaleTo = parseScaleTo info
      for name, number of scaleTo
        events.push { type: 'scale-to', time: timestamp, requested: number, dyno: name, msg: info }
      # Should we synthesize per-dyno events from it? requires context...
    else if startsWith info, 'State changed from up to down'
      events.push { type: 'up->down', time: timestamp, dyno: dyno, msg: info }
    else if startsWith info, 'State changed from starting to up'
      events.push { type: 'starting->up', time: timestamp, dyno: dyno, msg: info }
    else if startsWith info, 'Starting process with command'
      events.push { type: 'process-starting', time: timestamp, dyno: dyno, msg: info }
    else if started info
      events.push { type: 'process-started', time: timestamp, dyno: dyno, msg: info }
    else if startsWith info, 'Process exited with status'
      events.push { type: 'process-exited', time: timestamp, dyno: dyno, msg: info }
    else if startsWith info, 'Stopping all processes with SIGTERM'
      events.push { type: 'process-stopping', time: timestamp, dyno: dyno, msg: info }
    # anything else is an unknown logline, silently skipped
    #debug 'unknown-logline', info

  return events

# Make sure @state has all the fields used by applyEvent() and calculateStats()
ensureState = (state) ->
  state.lasttransition = {} if not state.lasttransition # 'dyno.N' -> Event of last transition
  state.dynostate = {} if not state.dynostate # 'dyno.N' -> DynoState
  state.startups = [] if not state.startups
  state.shutdowns = [] if not state.shutdowns
  state.scaleups = [] if not state.scaleups
  state.uptimes = [] if not state.uptimes
  state.requestedWorkers = {} if not state.requestedWorkers # 'dyno' -> Number
  return state

# Describe the period between the previous transition of a dyno and @event.
# duration is in milliseconds. Requires state.lasttransition[event.dyno] to be set
transitionRecord = (state, event) ->
  s =
    dyno: event.dyno
    start: state.lasttransition[event.dyno]
    end: event
  s.duration = s.end.time.getTime() - s.start.time.getTime()
  return s

# Basically a finite state machine, one per dyno.
# Updates @state in place with the effect of @event
applyEvent = (state, event) ->
  # DynoState: requested | starting | started | stopping | exited
  ensureState state

  # Note, invalid transitions can happen initially because we don't generally know initial state
  if event.dyno
    # Dyno-specific events
    #console.log event.dyno, event.type
    switch event.type
      when 'scale-to'
        # here event.dyno is the role name, without the '.N' suffix
        old = state.requestedWorkers[event.dyno]
        newValue = event.requested
        #console.log 'scale:', event.dyno, newValue, old, state.requestedWorkers
        # The first scale-to seen for a role has old == undefined, making both
        # comparisons false: it only records the baseline, no dynos are added
        if newValue > old
          # TODO: validate that number of running matches expected
          lastNotExited = 0
          for dynoname, dynostate of state.dynostate
            if startsWith dynoname, "#{event.dyno}."
              [dynorole, dynonr] = dynoname.split '.'
              dynonr = parseInt dynonr, 10
              #console.log 's', dynoname, dynostate
              if dynostate != 'exited' and dynostate != 'requested' and dynonr > lastNotExited
                lastNotExited = dynonr
          # new dynos are numbered after the highest one which is starting/started/stopping
          firstNew = lastNotExited+1
          lastNew = firstNew+(newValue-old)-1
          for i in [firstNew..lastNew]
            name = "#{event.dyno}.#{i}"
            #console.log 'adding', name
            state.dynostate[name] = 'requested'
            state.lasttransition[name] = event
        # scaling down (newValue < old) and no change need no bookkeeping here
        #console.log 'less', event.dyno, old, newValue
        state.requestedWorkers[event.dyno] = newValue

      when 'process-starting'
        if state.lasttransition[event.dyno] and state.dynostate[event.dyno] == 'requested'
          state.scaleups.push transitionRecord(state, event)
          state.dynostate[event.dyno] = 'starting'
          state.lasttransition[event.dyno] = event
        else
          debug 'invalid transition', event.type, state.dynostate[event.dyno]

      when 'process-started'
        if state.lasttransition[event.dyno] and state.dynostate[event.dyno] == 'starting'
          state.startups.push transitionRecord(state, event)
          state.dynostate[event.dyno] = 'started'
          state.lasttransition[event.dyno] = event
        else
          debug 'invalid transition', event.type, state.dynostate[event.dyno]

      when 'starting->up' then null
      when 'up->down' then null

      when 'process-stopping'
        if state.dynostate[event.dyno] == 'started'
          state.uptimes.push transitionRecord(state, event)
          state.dynostate[event.dyno] = 'stopping'
          state.lasttransition[event.dyno] = event
        else
          debug 'invalid transition', event.type, state.dynostate[event.dyno]

      when 'process-exited'
        if state.dynostate[event.dyno] == 'stopping' and state.lasttransition[event.dyno]
          state.shutdowns.push transitionRecord(state, event)
          state.dynostate[event.dyno] = 'exited'
          state.lasttransition[event.dyno] = event
        else
          debug 'invalid transition', event.type, state.dynostate[event.dyno]
  # FIXME: handle events without a dyno. Maybe outside/before

# Summarize the transition records collected in @state.
# Durations are converted from milliseconds to seconds.
# Returns median scaleup/uptime, mean startup/shutdown, each with a _stddev and
# _length (sample count), plus utilization: the uptime share of the total time
calculateStats = (state) ->
  starts = state.startups.map (s) -> s.duration/1000
  stops = state.shutdowns.map (s) -> s.duration/1000
  scaleups = state.scaleups.map (s) -> s.duration/1000
  uptimes = state.uptimes.map (s) -> s.duration/1000

  results =
    scaleup: statistics.median(scaleups)
    scaleup_stddev: statistics.standard_deviation(scaleups)
    scaleup_length: scaleups.length
    uptime: statistics.median(uptimes)
    uptime_stddev: statistics.standard_deviation(uptimes)
    uptime_length: uptimes.length
    startup: statistics.mean(starts)
    startup_stddev: statistics.standard_deviation(starts)
    startup_length: starts.length
    shutdown: statistics.mean(stops)
    shutdown_stddev: statistics.standard_deviation(stops)
    shutdown_length: stops.length

  wasted = results.scaleup + results.startup + results.shutdown # assumes Heroku charges for all these steps
  results.utilization = results.uptime / (results.uptime + wasted)
  return results

# Build a filter predicate for transition records, keeping those whose
# dyno ('role.N') belongs to one of @roles
hasDynos = (roles) ->
  return (s) ->
    hasRoles = roles? and roles.length
    # empty array means include everything
    return true if not hasRoles
    [role, number] = s.dyno.split '.'
    include = role in roles
    return include

# Read the log at @filename and calculate stats for the dyno transitions found in it.
#
# started:  function(message) -> truthy when the message shows that the process has started
# dynos:    list of role names to include. Empty or missing means all
# callback: (err, results), with results as returned by calculateStats()
analyzeStartups = (filename, started, dynos, callback) ->
  fs = require 'fs'
  # Initialized up front, so a log without any events gives empty lists
  # for the filtering below instead of a TypeError on undefined
  state = ensureState {}
  fs.readFile filename, {encoding: 'utf-8'}, (err, contents) ->
    return callback err if err
    events = eventsFromLog contents, started
    #results = events.map (e) -> "#{e.dyno or ''} #{e.type}"
    for e in events
      applyEvent state, e
    state.startups = state.startups.filter hasDynos(dynos)
    state.shutdowns = state.shutdowns.filter hasDynos(dynos)
    state.scaleups = state.scaleups.filter hasDynos(dynos)
    state.uptimes = state.uptimes.filter hasDynos(dynos)
    results = calculateStats state
    return callback null, results

# Option handler which accumulates the values of a repeated commandline option
collectOption = (value, array) ->
  array.push value
  return array

# TODO: calculate whole delay from scaling to up by default, and scaling down to down
# TODO: allow to separate between (module) loading time, and startup time
# TODO: add a guv-update-jobstats tool, would modify 'boot' and 'shutdown' values in config
# TODO: add tool for calculating scaling 'waste'. Ratio of time spent processing vs startup+shutdown
# MAYBE: allow specifying subsets in time?
# MAYBE: allow ignoring certain dynos?

# Commandline entrypoint: analyze the logfile given as argument, and print the stats
exports.startuptime_main = () ->
  program = require 'commander'

  filename = null
  program
    .arguments('<heroku.log>')
    .option('--started <regexp>', 'Regular expression matching output sent by process when started', String, 'noflo-runtime-msgflo started')
    .option('--role <rolename>', 'Calculate stats for a subset of roles. Can be specified multiple times', collectOption, [])
    .action((f, env) -> filename = f)
    .parse(process.argv)

  program.started = new RegExp program.started
  started = (info) ->
    return program.started.test info
  roles = program.role

  analyzeStartups filename, started, roles, (err, res) ->
    throw err if err
    console.log res