scalar-autograd
Scalar-based reverse-mode automatic differentiation in TypeScript.
"use strict";
// Optimizers.ts
Object.defineProperty(exports, "__esModule", { value: true });
exports.AdamW = exports.Adam = exports.SGD = exports.Optimizer = void 0;
/**
 * Abstract base class for all optimizers.
 * Ensures only parameters with requiresGrad set are optimized.
 */
class Optimizer {
    trainables;
    learningRate;
    /**
     * Constructs an Optimizer.
     * @param trainables Array of Value parameters to optimize.
     * @param learningRate Learning rate for updates.
     */
    constructor(trainables, learningRate) {
        this.trainables = trainables.filter(v => v.requiresGrad);
        this.learningRate = learningRate;
    }
    /**
     * Resets the gradients of all trainables to zero.
     */
    zeroGrad() {
        for (const v of this.trainables)
            v.grad = 0;
    }
    /**
     * Rescales gradients so their global L2 norm does not exceed maxNorm
     * (a training-stability measure, not a regularizer).
     * @param maxNorm Maximum allowed global gradient norm.
     */
    clipGradients(maxNorm) {
        const totalNorm = Math.sqrt(this.trainables.reduce((sum, v) => sum + v.grad * v.grad, 0));
        if (totalNorm > maxNorm) {
            const scale = maxNorm / (totalNorm + 1e-6);
            for (const v of this.trainables)
                v.grad *= scale;
        }
    }
}
exports.Optimizer = Optimizer;
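// Worked example for clipGradients(maxNorm): with two trainables whose grads are
// 3 and 4, the global L2 norm is sqrt(3*3 + 4*4) = 5. Calling clipGradients(1)
// rescales both grads by 1 / (5 + 1e-6), leaving roughly 0.6 and 0.8, whose norm
// is back at (approximately) the requested maximum of 1.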
/**
 * Stochastic Gradient Descent (SGD) optimizer.
 * Accepts weightDecay and gradientClip for API consistency (both are ignored).
 */
class SGD extends Optimizer {
    weightDecay;
    gradientClip;
    /**
     * Constructs an SGD optimizer.
     * @param trainables Array of Value parameters to optimize.
     * @param opts Optional parameters (learningRate, weightDecay, gradientClip).
     */
    constructor(trainables, opts = {}) {
        super(trainables, opts.learningRate ?? 1e-2);
        this.weightDecay = opts.weightDecay ?? 0;
        this.gradientClip = opts.gradientClip ?? 0;
    }
    /**
     * Performs a parameter update using standard SGD.
     */
    step() {
        // Intentionally ignoring weightDecay/gradientClip for SGD
        for (const v of this.trainables) {
            v.data -= this.learningRate * v.grad;
        }
    }
    /**
     * No-op: SGD keeps no per-parameter state to reset.
     */
    resetStateFor(trainable) {
    }
}
exports.SGD = SGD;
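// Training-loop sketch for SGD. Hedged: how Value objects are created and how a
// reverse pass is triggered (loss.backward() below) are assumptions based on the
// package description ("scalar-based reverse-mode automatic differentiation"),
// not verified signatures; forward() is a hypothetical user-supplied function.
//
//   const opt = new SGD(trainables, { learningRate: 0.1 });  // trainables: Value[]
//   for (let epoch = 0; epoch < 100; epoch++) {
//       opt.zeroGrad();                    // clear stale gradients
//       const loss = forward(trainables);  // hypothetical forward pass returning a Value
//       loss.backward();                   // assumed reverse-mode call that fills .grad
//       opt.clipGradients(1.0);            // optional: cap the global L2 gradient norm
//       opt.step();                        // v.data -= learningRate * v.grad
//   }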
/**
 * Adam optimizer. Weight decay is applied as classic L2 regularization (added to
 * the gradient before the moment updates); gradientClip bounds each parameter's
 * update to [-gradientClip, gradientClip].
 */
class Adam extends Optimizer {
    beta1;
    beta2;
    epsilon;
    weightDecay;
    gradientClip;
    m = new Map();
    v = new Map();
    stepCount = 0;
    /**
     * Constructs an Adam optimizer.
     * @param trainables Array of Value parameters to optimize.
     * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon).
     */
    constructor(trainables, opts = {}) {
        super(trainables, opts.learningRate ?? 0.001);
        this.beta1 = opts.beta1 ?? 0.9;
        this.beta2 = opts.beta2 ?? 0.999;
        this.epsilon = opts.epsilon ?? 1e-8;
        this.weightDecay = opts.weightDecay ?? 0;
        this.gradientClip = opts.gradientClip ?? 0;
        for (const v of this.trainables) {
            this.m.set(v, 0);
            this.v.set(v, 0);
        }
    }
    /**
     * Performs a parameter update using Adam optimization.
     */
    step() {
        this.stepCount++;
        for (const v of this.trainables) {
            let grad = v.grad;
            // Coupled (L2) weight decay: folded into the gradient.
            if (this.weightDecay > 0)
                grad += this.weightDecay * v.data;
            let m = this.m.get(v);
            let vVal = this.v.get(v);
            // Exponential moving averages of the gradient and its square.
            m = this.beta1 * m + (1 - this.beta1) * grad;
            vVal = this.beta2 * vVal + (1 - this.beta2) * grad * grad;
            // Bias correction compensates for the zero-initialized moments in early steps.
            const mHat = m / (1 - Math.pow(this.beta1, this.stepCount));
            const vHat = vVal / (1 - Math.pow(this.beta2, this.stepCount));
            let update = mHat / (Math.sqrt(vHat) + this.epsilon);
            // Clips the per-parameter update, not the raw gradient.
            if (this.gradientClip > 0) {
                update = Math.max(-this.gradientClip, Math.min(update, this.gradientClip));
            }
            v.data -= this.learningRate * update;
            this.m.set(v, m);
            this.v.set(v, vVal);
        }
    }
    /**
     * Resets the first/second moment estimates for a single trainable.
     */
    resetStateFor(trainable) {
        this.m.set(trainable, 0);
        this.v.set(trainable, 0);
    }
}
exports.Adam = Adam;
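// Usage sketch for Adam, grounded in the defaults above (learningRate 0.001,
// beta1 0.9, beta2 0.999, epsilon 1e-8, weightDecay 0, gradientClip 0). The
// gradient-producing calls are assumed, as in the SGD sketch above.
//
//   const opt = new Adam(trainables, { learningRate: 3e-4, weightDecay: 1e-4, gradientClip: 1.0 });
//   opt.zeroGrad();
//   // ...forward pass and backward pass populate each trainable's .grad...
//   opt.step();  // weightDecay acts as L2 regularization; each update is clipped to ±1.0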
/**
 * AdamW optimizer. Same options as Adam, but weight decay is decoupled from the
 * gradient-based update and applied directly to the weights; gradientClip bounds
 * each parameter's update to [-gradientClip, gradientClip].
 */
class AdamW extends Optimizer {
    beta1;
    beta2;
    epsilon;
    weightDecay;
    gradientClip;
    m = new Map();
    v = new Map();
    stepCount = 0;
    /**
     * Constructs an AdamW optimizer.
     * @param trainables Array of Value parameters to optimize.
     * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon).
     */
    constructor(trainables, opts = {}) {
        super(trainables, opts.learningRate ?? 0.001);
        this.beta1 = opts.beta1 ?? 0.9;
        this.beta2 = opts.beta2 ?? 0.999;
        this.epsilon = opts.epsilon ?? 1e-8;
        this.weightDecay = opts.weightDecay ?? 0.01;
        this.gradientClip = opts.gradientClip ?? 0;
        for (const v of this.trainables) {
            this.m.set(v, 0);
            this.v.set(v, 0);
        }
    }
    /**
     * Performs a parameter update using AdamW optimization (decoupled weight decay).
     */
    step() {
        this.stepCount++;
        for (const v of this.trainables) {
            let grad = v.grad;
            let m = this.m.get(v);
            let vVal = this.v.get(v);
            // Exponential moving averages of the gradient and its square.
            m = this.beta1 * m + (1 - this.beta1) * grad;
            vVal = this.beta2 * vVal + (1 - this.beta2) * grad * grad;
            // Bias correction compensates for the zero-initialized moments in early steps.
            const mHat = m / (1 - Math.pow(this.beta1, this.stepCount));
            const vHat = vVal / (1 - Math.pow(this.beta2, this.stepCount));
            let update = mHat / (Math.sqrt(vHat) + this.epsilon);
            if (this.gradientClip > 0) {
                update = Math.max(-this.gradientClip, Math.min(update, this.gradientClip));
            }
            // Weight decay is decoupled as in the AdamW paper: applied directly to the
            // weights rather than mixed into the gradient and moment estimates.
            v.data -= this.learningRate * update + this.learningRate * this.weightDecay * v.data;
            this.m.set(v, m);
            this.v.set(v, vVal);
        }
    }
    /**
     * Resets the first/second moment estimates for a single trainable.
     */
    resetStateFor(trainable) {
        this.m.set(trainable, 0);
        this.v.set(trainable, 0);
    }
}
exports.AdamW = AdamW;
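// Adam vs. AdamW in this file: Adam folds weightDecay into the gradient before
// the moment updates (classic L2 regularization), while AdamW subtracts
// learningRate * weightDecay * v.data directly from the weights, so the decay is
// not rescaled by the adaptive moments. Switching is a one-line change; note the
// different default weightDecay (0 for Adam, 0.01 for AdamW):
//
//   const opt = new AdamW(trainables, { learningRate: 1e-3, weightDecay: 0.01 });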