q-exp
Version:
Reinforcement learning (Q-Learning) library
409 lines (332 loc) • 9.86 kB
JavaScript
;
/**
* Simple Q-learning library for JavaScript ninja
* @author StarColon Projects
*/
// Namespace object that collects every function of the library;
// it is what this module exports.
var ql = {}
var fs = require('fs');
// NOTE: from here on `Promise` refers to bluebird, not the native Promise.
var Promise = require('bluebird');
var _ = require('underscore');
// Presumably loaded for the `.red` / `.cyan` / `.green` / `.yellow` string
// accessors used in the log messages below -- confirm against the `colors` docs.
var colors = require('colors');
var State = require('./state.js');
var Generalizer = require('./generaliz.js');
var config = require('./package.json');
// When truthy, the library functions log their progress to the console.
ql.isVerbose = true;
// NOTE(review): this only assigns a property on the bluebird constructor.
// Bluebird documents long stack traces as a method call / config option,
// so this line presumably does not enable them -- confirm.
Promise.longStackTraces = true;
/**
 * Build a fresh agent record around a predefined action set.
 * @param {String} name agent name (also used as the file name by save/load)
 * @param {Array} actionset list of actions (strings) the agent can take
 * @param {Number} alpha learning rate; any falsy value falls back to 0.5
 * @returns {Promise} resolves to the new agent, which starts with an empty
 *   policy, an empty history and no bound functions
 */
ql.newAgent = function(name, actionset, alpha){
  return Promise.resolve({
    name: name,
    actionset: actionset,
    func: {},
    policy: {},
    alpha: alpha ? alpha : 0.5,
    history: []
  });
}
// Returns an agent transformer that installs `stateGenerator` on the agent.
// The generator is later called with (state, action) and must return
// the subsequent state.
ql.bindStateGenerator = function(stateGenerator){
  return (agent) => {
    Object.assign(agent.func, {stateGenerator: stateGenerator});
    return agent;
  };
}
// Returns an agent transformer that stores `rewardOfState` as the agent's
// reward measure (kept under `agent.func.rewardOfState`).
ql.bindRewardMeasure = function(rewardOfState){
  const attach = function(agent){
    const funcs = agent.func;
    funcs.rewardOfState = rewardOfState;
    return agent;
  };
  return attach;
}
// Returns an agent transformer that stores `actionCost` on the agent.
// The measure is called with (state, action) when a reward is predicted.
ql.bindActionCostMeasure = function(actionCost){
  return function(agent){
    agent.func['actionCost'] = actionCost;
    return agent;
  };
}
// Returns an agent transformer that stores the printer `p` under
// `agent.func.statePrint`; it is invoked with the new state after a step
// when verbose mode is on.
ql.bindStatePrinter = function(p){
  return (agent) => {
    const funcs = agent.func;
    funcs.statePrint = p;
    return agent;
  };
}
// Empties the agent's history in place (the array object itself is kept)
// and resolves to the same agent.
ql.clearHistory = function(agent){
  const trail = agent.history;
  trail.splice(0);
  return Promise.resolve(agent);
}
/**
 * Save the learned policy to a physical file.
 * The name of the agent is used as the file name: `<path>/<name>.agent`.
 * @param {String} path directory in which the agent file is written
 * @returns {Function} agent transformer returning a Promise that resolves
 *   to the agent once the policy has been written, or rejects with the
 *   error reported while serialising or writing
 */
ql.save = function(path){
  return function(agent){
    return new Promise(function(resolve, reject){
      // fs.writeFile needs a completion callback; waiting for it also
      // guarantees the file exists by the time the chain continues.
      fs.writeFile(`${path}/${agent.name}.agent`, JSON.stringify(agent.policy), function(err){
        if (err) return reject(err);
        resolve(agent);
      });
    });
  }
}
/**
 * Save the learned policy as a specified file.
 * The `.agent` extension is appended to the given path.
 * @param {String} fullpath target file path without the `.agent` extension
 * @returns {Function} agent transformer returning a Promise that resolves
 *   to the agent once the policy has been written, or rejects with the
 *   error reported while serialising or writing
 */
ql.saveAs = function(fullpath){
  return function(agent){
    return new Promise(function(resolve, reject){
      // fs.writeFile needs a completion callback; waiting for it also
      // guarantees the file exists by the time the chain continues.
      fs.writeFile(`${fullpath}.agent`, JSON.stringify(agent.policy), function(err){
        if (err) return reject(err);
        resolve(agent);
      });
    });
  }
}
/**
 * Load the policy from a physical file named `<path>/<name>.agent`.
 * Loading is best-effort: when the file cannot be read or does not contain
 * valid JSON, the problem is logged and the agent is passed on unchanged.
 * @param {String} path directory that holds the agent file
 * @returns {Function} agent transformer returning a Promise that always
 *   resolves to the agent
 */
ql.load = function(path){
  return function(agent){
    return new Promise((done) => {
      fs.readFile(`${path}/${agent.name}.agent`, function(err, content){
        if (err) {
          console.error('Unable to load agent'.red);
          console.error(err);
          return done(agent);
        }
        var policy;
        try {
          policy = JSON.parse(content);
        }
        catch (parseError) {
          // A throw inside this callback would escape the promise entirely
          // and leave it pending, so treat a corrupt file like a missing one.
          console.error('Unable to load agent'.red);
          console.error(parseError);
          return done(agent);
        }
        agent.policy = policy;
        ql.isVerbose && console.log('AGENT LOADED'.cyan);
        ql.isVerbose && console.log(agent.policy)
        done(agent)
      })
    })
  }
}
/**
 * Print the learned policy: for every known state, the state key followed
 * by the action stored first in its list together with that action's reward.
 * Nothing is printed when the policy is empty.
 * @param {Object} agent
 * @returns {Object} the same agent
 */
ql.revealBrain = function(agent){
  const knownStates = Object.keys(agent.policy);
  if (knownStates.length != 0){
    console.log('[BRAIN SCAN]'.green)
    for (const state of knownStates){
      console.log(state);
      const best = agent.policy[state][0];
      console.log(` most probable action: ${best.action} (${best.reward})`)
    }
  }
  return agent;
}
/**
 * Record an observed reward for `action` taken in `state`.
 * @param {State} state the state the action was taken in (keyed by `state.hash`)
 * @param {String} action
 * @param {Number} rewardAddUp reward value to store for the action; it is
 *   stored as given and replaces any previously stored reward
 * @returns {Function} agent transformer returning a Promise of the agent
 */
ql.__updatePolicy = function(state,action,rewardAddUp){
  return function(agent){
    const key = state.hash;
    let entries;
    if (agent.policy.hasOwnProperty(key)){
      // Known state: copy every entry, overwriting the matching action
      entries = agent.policy[key].map((entry) =>
        entry.action==action
          ? {action: action, reward: rewardAddUp}
          : {action: entry.action, reward: entry.reward}
      );
    }
    else{
      // New state: one entry per available action, all zero
      // except the action just observed
      entries = agent.actionset.map((candidate) => (
        {action: candidate, reward: candidate==action ? rewardAddUp : 0}
      ));
    }
    // Keep the list ordered so the highest reward comes first
    agent.policy[key] = _.sortBy(entries, (entry) => -entry.reward);
    return Promise.resolve(agent)
  }
}
/**
 * Look up the reward of a state through the agent's bound reward measure.
 * Note that the measure receives `state.hash`, not the state object.
 * @param {State} state
 * @returns {Function} agent => reward
 */
ql.__rewardOf = function(state){
  return (agent) => agent.func.rewardOfState(state.hash);
}
/**
 * Determine (predict) the reward we would get
 * when performing a particular action on a state.
 *
 * Resolution order:
 *  1. a negative action cost is returned as-is;
 *  2. if the state is in the policy, the stored reward of the action
 *     (or the action cost when the action is not recorded for that state);
 *  3. otherwise, if a generalised model is present on the agent, the dot
 *     product of the action's parameters with `[1, ...state.state]`;
 *  4. otherwise the action cost.
 *
 * @param {State} state
 * @param {String} action
 * @returns {Function} agent => predicted reward
 */
ql.__q = function(state,action){
  return function(agent){
    var cost = agent.func.actionCost(state,action);
    if (cost<0)
      return cost;

    // Do we have the state registered in the policy?
    if (agent.policy.hasOwnProperty(state.hash)){
      var known = agent.policy[state.hash].filter((a) => a.action==action);
      // State memorised but not this action: fall back to the plain cost
      return known.length==0 ? cost : known[0].reward;
    }

    // Estimate from the generalised model when one has been fitted
    if (agent.ϴ){
      ql.isVerbose && console.log('Recall policy from generalisation'.cyan);
      var actionIndex = agent.actionset.indexOf(action);
      // Leading 1 pairs with the first (bias) parameter
      var features = [1].concat(state.state);
      return agent.ϴ[actionIndex].reduce(
        (sum,weight,i) => sum + weight*features[i],
        0
      );
    }

    // We don't know anything about the current state
    return cost;
  }
}
/**
 * Score every action of the agent for the given state.
 * A predicted reward equal to zero gets a random amount in [0, 1) added
 * so such actions are not always ranked in the same order.
 * @param {State} state
 * @returns {Function} agent => list of {action, reward}, highest reward first
 */
ql.__exploreNext = function(state){
  return function(agent){
    const scored = agent.actionset.map((candidate) => {
      const predicted = ql.__q(state,candidate)(agent);
      const reward = predicted==0 ? predicted + Math.random() : predicted;
      return {action: candidate, reward: reward};
    });
    return _.sortBy(scored,(entry) => -entry.reward);
  }
}
/**
 * Start a new learning course of the agent: wipe its history,
 * set the initial state, then take the first step.
 * @param {State} initState initial state
 * @returns {Function} agent transformer returning a Promise of the agent
 */
ql.start = function(initState){
  return function(agent){
    if (ql.isVerbose){
      console.log('Starting...'.cyan);
    }
    const cleared = ql.clearHistory(agent);
    const assignInitial = ql.setState(initState);
    return cleared.then(assignInitial).then(ql.step);
  }
}
/**
 * Append the agent's current state to its history as a perception
 * entry, i.e. one whose action is null.
 * @returns {Object} the same agent
 */
ql.perceiveState = function(agent){
  const perceived = {action: null, state: agent.state};
  agent.history.push(perceived);
  return agent;
}
/**
 * Set the current state of the agent.
 * The state is also appended to the history as an entry with a null action.
 * @param {State} state
 * @returns {Function} agent => the same agent
 */
ql.setState = function(state){
  return (agent) => {
    const entry = {action: null, state: state};
    agent.state = state;
    agent.history.push(entry);
    return agent;
  };
}
// Returns the state currently stored on the agent.
ql.getState = function(agent){
  const current = agent.state;
  return current;
}
/**
 * Learn from the most recent step, which introduced a new state.
 * Must be called after `ql.step` followed by `ql.setState`, so that the
 * last two history entries are: the move taken (action set), then the
 * perceived state (action null).
 *
 * The value stored for the move is alpha * (reward of the perceived state
 * - reward of the state the move was taken from).
 *
 * @param {Object} agent
 * @returns {Object|Promise} the agent on success; a rejected Promise
 *   carrying a message string when the history is not in the expected shape
 */
ql.learn = function(agent){
  const trail = agent.history;
  const count = trail.length;
  if (count<2){
    return Promise.reject('Agent has not yet made any steps.');
  }

  const move = trail[count-2];
  const perceived = trail[count-1];
  if (perceived.action!=null){
    return Promise.reject('Agent needs to update the current state after a move.');
  }
  if (move.action == null){
    return Promise.reject('Agent should perceive the current state after its recent move.')
  }

  const rewardBefore = agent.func.rewardOfState(move.state);
  const rewardAfter = agent.func.rewardOfState(perceived.state);
  const delta = agent.alpha * (rewardAfter - rewardBefore);

  if (ql.isVerbose){
    console.log(agent.name + ' learning new policy ...'.cyan + delta.toFixed(2));
  }
  // Update the policy with what this move taught us
  ql.__updatePolicy(move.state, move.action, delta)(agent);
  return agent;
}
/**
 * Let the agent choose the next best action.
 * The actions are ranked for the current state, the top-ranked one is
 * recorded in the history together with the state it was taken from, and
 * the agent's state is replaced by the output of the bound state generator.
 *
 * @param {Object} agent
 * @returns {Object|Promise} the agent on success; a rejected Promise
 *   carrying a message string when no current state has been set or when
 *   there is no action to choose from
 */
ql.step = function(agent){
  ql.isVerbose && console.log('STEP BEGINS'.green);
  if (!agent.state){
    return Promise.reject('Assign the current state first with `ql.setState`');
  }

  // Explore the next states
  var nexts = ql.__exploreNext(agent.state)(agent);
  ql.isVerbose && console.log('generated actions:'.yellow);
  ql.isVerbose && console.log(nexts);

  // Nothing to pick from (e.g. an empty action set)
  if (nexts.length==0){
    return Promise.reject('Agent has no actions to choose from.');
  }

  // Greedily pick the best action (the list is ordered, best first)
  var chosen = nexts[0];

  // Register the chosen action
  agent.history.push({action: chosen.action, state: agent.state});
  ql.isVerbose && console.log(agent.name + ' chose action :'.green + chosen.action);

  // Generate the state after an action is taken
  agent.state = agent.func.stateGenerator(agent.state, chosen.action);

  // Print the state
  ql.isVerbose && agent.func.statePrint && agent.func.statePrint(agent.state);
  return agent;
}
/**
 * Generalise the J* space
 * based on the learned state-reward terrain.
 *
 * For every action of the agent, the (state, reward) samples found in the
 * policy are collected and handed to `Generalizer.fit`; the fitted
 * parameters are stored on the agent as `agent.ϴ`, indexed like
 * `agent.actionset`.
 *
 * @param {String} method method of generalisation to exploit ('GD' when omitted)
 * @returns {Function} agent transformer returning a Promise of the agent
 */
ql.generalize = function(method){
  const maxIters = 100;
  const alpha = 0.0001; // Keep it tiny for finest adjustment
  method = method || 'GD'; // Gradient descent by default
  return function(agent){
    // One sample bucket per action, in `actionset` order
    var Qa = agent.actionset.map(function(){
      return {states:[], rewards:[]}
    });

    // Step through the policy and collect the rewards of each action
    Object.keys(agent.policy).forEach((hash) => {
      var state = State.fromHash(hash);
      agent.policy[hash].forEach((a) => {
        // `a` = {action: .... , reward: .....}
        // Policy entries are kept sorted by reward, so their position says
        // nothing about which action they belong to: look the action up.
        var actionIndex = agent.actionset.indexOf(a.action);
        if (actionIndex<0)
          return; // Action unknown to this agent, nothing to fit it to
        Qa[actionIndex].states.push(state);
        Qa[actionIndex].rewards.push(a.reward);
      })
    })

    // By each action space, fit the parameterised ϴ
    // and let the agent memorise it for later use
    agent.ϴ = Qa.map((samples) =>
      Generalizer.fit(samples.states,samples.rewards,maxIters,alpha,method)
    );
    return Promise.resolve(agent)
  }
}
// Expose the whole `ql` namespace as the module's public API.
module.exports = ql;