scalar-autograd

Scalar-based reverse-mode automatic differentiation in TypeScript.

"use strict"; // Optimizers.ts Object.defineProperty(exports, "__esModule", { value: true }); exports.AdamW = exports.Adam = exports.SGD = exports.Optimizer = void 0; /** * Abstract base class for all optimizers. * Ensures only requiresGrad parameters are optimized. */ class Optimizer { trainables; learningRate; /** * Constructs an Optimizer. * @param trainables Array of Value parameters to optimize. * @param learningRate Learning rate for updates. */ constructor(trainables, learningRate) { this.trainables = trainables.filter(v => v.requiresGrad); this.learningRate = learningRate; } /** * Sets grads of all trainables to zero. */ zeroGrad() { for (const v of this.trainables) v.grad = 0; } /** * Clips global norm of gradients as regularization. * @param maxNorm Maximum allowed norm for gradients. */ clipGradients(maxNorm) { const totalNorm = Math.sqrt(this.trainables.reduce((sum, v) => sum + v.grad * v.grad, 0)); if (totalNorm > maxNorm) { const scale = maxNorm / (totalNorm + 1e-6); for (const v of this.trainables) v.grad *= scale; } } } exports.Optimizer = Optimizer; /** * Stochastic Gradient Descent (SGD) optimizer. Accepts weightDecay and gradientClip for API consistency (ignored). */ class SGD extends Optimizer { weightDecay; gradientClip; /** * Constructs an SGD optimizer. * @param trainables Array of Value parameters to optimize. * @param opts Optional parameters (learningRate, weightDecay, gradientClip). */ constructor(trainables, opts = {}) { super(trainables, opts.learningRate ?? 1e-2); this.weightDecay = opts.weightDecay ?? 0; this.gradientClip = opts.gradientClip ?? 0; } /** * Performs a parameter update using standard SGD. */ step() { // Intentionally ignoring weightDecay/gradientClip for SGD for (const v of this.trainables) { v.data -= this.learningRate * v.grad; } } resetStateFor(trainable) { } } exports.SGD = SGD; /** * Adam optimizer, supports decoupled weight decay and gradient clipping. */ class Adam extends Optimizer { beta1; beta2; epsilon; weightDecay; gradientClip; m = new Map(); v = new Map(); stepCount = 0; /** * Constructs an Adam optimizer. * @param trainables Array of Value parameters to optimize. * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon). */ constructor(trainables, opts = {}) { super(trainables, opts.learningRate ?? 0.001); this.beta1 = opts.beta1 ?? 0.9; this.beta2 = opts.beta2 ?? 0.999; this.epsilon = opts.epsilon ?? 1e-8; this.weightDecay = opts.weightDecay ?? 0; this.gradientClip = opts.gradientClip ?? 0; for (const v of this.trainables) { this.m.set(v, 0); this.v.set(v, 0); } } /** * Performs a parameter update using Adam optimization. */ step() { this.stepCount++; for (const v of this.trainables) { let grad = v.grad; if (this.weightDecay > 0) grad += this.weightDecay * v.data; let m = this.m.get(v); let vVal = this.v.get(v); m = this.beta1 * m + (1 - this.beta1) * grad; vVal = this.beta2 * vVal + (1 - this.beta2) * grad * grad; const mHat = m / (1 - Math.pow(this.beta1, this.stepCount)); const vHat = vVal / (1 - Math.pow(this.beta2, this.stepCount)); let update = mHat / (Math.sqrt(vHat) + this.epsilon); if (this.gradientClip > 0) { update = Math.max(-this.gradientClip, Math.min(update, this.gradientClip)); } v.data -= this.learningRate * update; this.m.set(v, m); this.v.set(v, vVal); } } resetStateFor(trainable) { this.m.set(trainable, 0); this.v.set(trainable, 0); } } exports.Adam = Adam; /** * AdamW optimizer, supports decoupled weight decay and gradient clipping (same options as Adam). 
 */
class AdamW extends Optimizer {
    beta1;
    beta2;
    epsilon;
    weightDecay;
    gradientClip;
    m = new Map();
    v = new Map();
    stepCount = 0;
    /**
     * Constructs an AdamW optimizer.
     * @param trainables Array of Value parameters to optimize.
     * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon).
     */
    constructor(trainables, opts = {}) {
        super(trainables, opts.learningRate ?? 0.001);
        this.beta1 = opts.beta1 ?? 0.9;
        this.beta2 = opts.beta2 ?? 0.999;
        this.epsilon = opts.epsilon ?? 1e-8;
        this.weightDecay = opts.weightDecay ?? 0.01;
        this.gradientClip = opts.gradientClip ?? 0;
        for (const v of this.trainables) {
            this.m.set(v, 0);
            this.v.set(v, 0);
        }
    }
    /**
     * Performs a parameter update using AdamW optimization (decoupled weight decay).
     */
    step() {
        this.stepCount++;
        for (const v of this.trainables) {
            let grad = v.grad;
            let m = this.m.get(v);
            let vVal = this.v.get(v);
            m = this.beta1 * m + (1 - this.beta1) * grad;
            vVal = this.beta2 * vVal + (1 - this.beta2) * grad * grad;
            const mHat = m / (1 - Math.pow(this.beta1, this.stepCount));
            const vHat = vVal / (1 - Math.pow(this.beta2, this.stepCount));
            let update = mHat / (Math.sqrt(vHat) + this.epsilon);
            if (this.gradientClip > 0) {
                update = Math.max(-this.gradientClip, Math.min(update, this.gradientClip));
            }
            // Weight decay is decoupled as in AdamW paper:
            v.data -= this.learningRate * update + this.learningRate * this.weightDecay * v.data;
            this.m.set(v, m);
            this.v.set(v, vVal);
        }
    }
    resetStateFor(trainable) {
        this.m.set(trainable, 0);
        this.v.set(trainable, 0);
    }
}
exports.AdamW = AdamW;
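
Usage sketch (not part of the package source). The optimizers above only read and write the data, grad, and requiresGrad fields of each trainable, so the example below drives one Adam step with plain objects standing in for the library's Value class; the require path and the real Value/backward() API are assumptions, not confirmed by this file.

// Hedged usage sketch, assuming the optimizers are re-exported from the package root.
const { Adam } = require("scalar-autograd");

const w = { data: 2.0, grad: 0, requiresGrad: true };
const b = { data: 0.5, grad: 0, requiresGrad: true };

const opt = new Adam([w, b], { learningRate: 0.01, gradientClip: 1.0 });

// One training step on the loss L = (w * x + b - y)^2 with x = 1, y = 3.
// Gradients are written by hand here; with the real library they would be
// produced by reverse-mode backward() over a Value graph.
opt.zeroGrad();
const residual = (w.data * 1 + b.data) - 3; // prediction error
w.grad = 2 * residual * 1; // dL/dw
b.grad = 2 * residual;     // dL/db

opt.clipGradients(5.0); // rescales all grads only if their global norm exceeds 5
opt.step();             // bias-corrected Adam update of w.data and b.data

Note how weight decay differs between the two adaptive optimizers above: Adam folds weightDecay * data into the gradient before computing the moments (classic L2 regularization), while AdamW subtracts learningRate * weightDecay * data directly from the parameter after the adaptive update (decoupled weight decay). Aside from AdamW's default weightDecay of 0.01 versus Adam's 0, this is the only behavioral difference between the two classes.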