herta
Version:
Advanced mathematics framework for scientific, engineering, and financial applications
801 lines (650 loc) • 23.9 kB
JavaScript
/**
* Language Model Mathematics module for herta.js
* Provides mathematical tools for analyzing, designing, and evaluating
* large language models and transformer architectures
*/
const matrix = require('../core/matrix');
const statistics = require('../core/statistics');
const machineLearning = require('./machineLearning');
// Namespace object for this module: every helper below is attached to it as a
// property, and it is what the module ultimately exports.
const languageModelMath = {};
/**
 * Compute scaled dot-product attention weights for a set of queries and keys.
 * The raw scores Q · K^T are multiplied by the scaling factor and every row is
 * then passed through machineLearning.softmax.
 * @param {Array} queries - Query vectors Q
 * @param {Array} keys - Key vectors K
 * @param {number} scalingFactor - Optional multiplier for the raw scores; when
 *   null, 1/sqrt(d_k) is used, with d_k read from the length of the first key
 * @returns {Array} - Attention weights, one softmax-normalized row per score row
 */
languageModelMath.selfAttention = function (queries, keys, scalingFactor = null) {
  // Fall back to 1/sqrt(d_k) only when the caller supplied no factor at all.
  const scale = scalingFactor === null ? 1 / Math.sqrt(keys[0].length) : scalingFactor;

  // Raw scores first, then the scale (matrix.multiply is also used for the scalar step).
  const rawScores = matrix.multiply(queries, matrix.transpose(keys));
  const scaledScores = matrix.multiply(rawScores, scale);

  // Normalize each row of scores independently.
  return scaledScores.map((scoreRow) => machineLearning.softmax(scoreRow));
};
/**
 * Combine value vectors using precomputed attention weights.
 * @param {Array} attentionWeights - Attention weights matrix
 * @param {Array} values - Value vectors V
 * @returns {Array} - Result of matrix.multiply(attentionWeights, values)
 */
languageModelMath.applyAttention = function (attentionWeights, values) {
  // attention_weights @ V
  const weightedValues = matrix.multiply(attentionWeights, values);
  return weightedValues;
};
/**
 * Run attention once per head and join the per-head results row by row.
 * Each head projects the inputs with its own wq / wk / wv matrices, computes
 * attention through this.selfAttention and this.applyAttention, and projects
 * the attended values with its wo matrix.
 * @param {Array} queries - Query vectors Q
 * @param {Array} keys - Key vectors K
 * @param {Array} values - Value vectors V
 * @param {Array} weights - One { wq, wk, wv, wo } projection object per head
 * @returns {Array} - One row per row of the first head's output; each row is
 *   every head's corresponding row laid end to end
 */
languageModelMath.multiHeadAttention = function (queries, keys, values, weights) {
  const perHeadOutputs = [];

  for (const headWeights of weights) {
    const { wq, wk, wv, wo } = headWeights;

    // Head-specific projections of the shared inputs.
    const projectedQueries = matrix.multiply(queries, wq);
    const projectedKeys = matrix.multiply(keys, wk);
    const projectedValues = matrix.multiply(values, wv);

    // Attend, then apply this head's output projection.
    const headAttention = this.selfAttention(projectedQueries, projectedKeys);
    const attended = this.applyAttention(headAttention, projectedValues);
    perHeadOutputs.push(matrix.multiply(attended, wo));
  }

  // Row-wise concatenation: row r of the result is row r of every head, in head order.
  const rowCount = perHeadOutputs[0].length;
  const combined = [];
  for (let row = 0; row < rowCount; row += 1) {
    const joined = [];
    for (const headOutput of perHeadOutputs) {
      joined.push(...headOutput[row]);
    }
    combined.push(joined);
  }
  return combined;
};
/**
 * Normalize every row of a 2-D input to zero mean and (approximately) unit
 * variance. The variance is the population variance (divided by the row
 * length) and epsilon is added to it before taking the square root.
 * @param {Array} x - Input rows to normalize
 * @param {number} epsilon - Small constant for numerical stability
 * @returns {Array} - New array of normalized rows; the input is not modified
 */
languageModelMath.layerNorm = function (x, epsilon = 1e-5) {
  return x.map((row) => {
    const size = row.length;

    let total = 0;
    for (const value of row) {
      total += value;
    }
    const mean = total / size;

    let squaredDeviation = 0;
    for (const value of row) {
      squaredDeviation += (value - mean) ** 2;
    }

    // One shared denominator per row: sqrt(variance + epsilon).
    const denominator = Math.sqrt(squaredDeviation / size + epsilon);
    return row.map((value) => (value - mean) / denominator);
  });
};
/**
 * Add a positional signal to every token embedding.
 * With 'sinusoidal', even feature i receives sin(pos / 10000^(i / dim)) and the
 * following odd feature receives the cosine of the same angle. With 'learned',
 * row pos of learnedPositions is added to embedding pos.
 * @param {Array} embeddings - Token embeddings, one row per position
 * @param {string} method - Encoding method ('sinusoidal' or 'learned')
 * @param {Array} learnedPositions - Learned position embeddings (if method is 'learned')
 * @returns {Array} - New rows holding embedding + position signal
 * @throws {Error} If the method is unknown, if 'learned' is requested without
 *   learnedPositions, or if there are fewer learned rows than embeddings
 */
languageModelMath.positionEncoding = function (embeddings, method = 'sinusoidal', learnedPositions = null) {
  const tokenCount = embeddings.length;
  const width = embeddings[0].length;
  const addVectors = (base, offsets) => base.map((value, idx) => value + offsets[idx]);

  if (method === 'sinusoidal') {
    const encoded = [];
    for (let pos = 0; pos < tokenCount; pos += 1) {
      const signal = new Array(width).fill(0);
      for (let even = 0; even < width; even += 2) {
        // Sine and cosine of a feature pair share one angle.
        const angle = pos * (1 / 10000 ** (even / width));
        signal[even] = Math.sin(angle);
        if (even + 1 < width) {
          signal[even + 1] = Math.cos(angle);
        }
      }
      encoded.push(addVectors(embeddings[pos], signal));
    }
    return encoded;
  }

  if (method === 'learned' && learnedPositions) {
    const encoded = [];
    for (let pos = 0; pos < tokenCount; pos += 1) {
      if (pos >= learnedPositions.length) {
        throw new Error('Not enough learned position vectors');
      }
      encoded.push(addVectors(embeddings[pos], learnedPositions[pos]));
    }
    return encoded;
  }

  throw new Error('Invalid position encoding method or missing learned positions');
};
/**
 * Build a square lower-triangular mask for decoder self-attention.
 * Entry [i][j] is 1 when j <= i (the current or an earlier position) and 0
 * otherwise.
 * @param {number} sequenceLength - Length of the sequence
 * @returns {Array} - sequenceLength x sequenceLength matrix of 0s and 1s
 */
languageModelMath.createCausalMask = function (sequenceLength) {
  const rows = [];
  for (let current = 0; current < sequenceLength; current += 1) {
    // Start fully masked, then open positions 0..current inclusive.
    rows.push(new Array(sequenceLength).fill(0).fill(1, 0, current + 1));
  }
  return rows;
};
/**
 * Replace masked-out attention scores with a fixed value.
 * A score is replaced only where the mask entry is exactly 0; every other
 * score is copied through unchanged.
 * @param {Array} scores - Attention scores
 * @param {Array} mask - Attention mask (1 for keep, 0 for mask)
 * @param {number} maskValue - Value to replace masked positions with
 * @returns {Array} - New score matrix; the inputs are not modified
 */
languageModelMath.applyAttentionMask = function (scores, mask, maskValue = -1e9) {
  return scores.map((scoreRow, rowIdx) =>
    scoreRow.map((score, colIdx) => (mask[rowIdx][colIdx] === 0 ? maskValue : score))
  );
};
/**
 * Compute perplexity as exp of the mean negative log-probability.
 * @param {Array} logProbs - Log probabilities of the true tokens
 * @returns {number} - Perplexity score
 */
languageModelMath.perplexity = function (logProbs) {
  let logProbTotal = 0;
  for (const logProb of logProbs) {
    logProbTotal += logProb;
  }
  return Math.exp(-logProbTotal / logProbs.length);
};
/**
 * Mean negative log-likelihood of the target tokens.
 * Each row of logits is converted with machineLearning.softmax, and 1e-10 is
 * added to the target probability before the logarithm so it never sees 0.
 * @param {Array} logits - Model logits (unnormalized predictions), one row per target
 * @param {Array} targets - Target token indices
 * @returns {number} - Cross-entropy loss averaged over the targets
 */
languageModelMath.crossEntropyLoss = function (logits, targets) {
  const summedLoss = targets.reduce((running, targetToken, position) => {
    const distribution = machineLearning.softmax(logits[position]);
    // Subtracting the log-probability accumulates the negative log-likelihood.
    return running - Math.log(distribution[targetToken] + 1e-10);
  }, 0);
  return summedLoss / targets.length;
};
/**
 * Sample a token index from the k most probable tokens.
 * Logits are converted with machineLearning.softmax, the k highest
 * probabilities are renormalized to sum to 1, and one index is drawn with a
 * single Math.random() call.
 * @param {Array} logits - Unnormalized logits for next token
 * @param {number} k - Number of top tokens to sample from
 * @returns {number} - Sampled token index
 */
languageModelMath.topKSampling = function (logits, k) {
  // Rank (index, probability) pairs from most to least probable, keep the first k.
  const shortlist = machineLearning.softmax(logits)
    .map((probability, tokenId) => [tokenId, probability])
    .sort((left, right) => right[1] - left[1])
    .slice(0, k);

  let shortlistMass = 0;
  for (const [, probability] of shortlist) {
    shortlistMass += probability;
  }

  // Inverse-CDF draw over the renormalized shortlist.
  const draw = Math.random();
  let running = 0;
  for (const [tokenId, probability] of shortlist) {
    running += probability / shortlistMass;
    if (draw <= running) {
      return tokenId;
    }
  }

  // Rounding can leave the running total just below the draw; use the top token.
  return shortlist[0][0];
};
/**
 * Sample a token index with nucleus (top-p) sampling.
 * Tokens are ranked by probability and collected until their cumulative
 * probability reaches p; the collected set is renormalized and one index is
 * drawn with a single Math.random() call.
 * @param {Array} logits - Unnormalized logits for next token
 * @param {number} p - Probability threshold (typically 0.9)
 * @returns {number} - Sampled token index
 */
languageModelMath.nucleusSampling = function (logits, p = 0.9) {
  const ranked = machineLearning.softmax(logits)
    .map((probability, tokenId) => [tokenId, probability])
    .sort((left, right) => right[1] - left[1]);

  // Grow the nucleus until it holds at least p of the probability mass.
  // The running mass doubles as the normalizer for the draw below.
  const nucleus = [];
  let nucleusMass = 0;
  for (const entry of ranked) {
    nucleus.push(entry);
    nucleusMass += entry[1];
    if (nucleusMass >= p) {
      break;
    }
  }

  // Inverse-CDF draw over the renormalized nucleus.
  const draw = Math.random();
  let running = 0;
  for (const [tokenId, probability] of nucleus) {
    running += probability / nucleusMass;
    if (draw <= running) {
      return tokenId;
    }
  }

  // Rounding fallback: the most probable token (undefined if nothing was ranked).
  return nucleus.length > 0 ? nucleus[0][0] : undefined;
};
/**
 * Beam search over token sequences.
 * Every step extends each unfinished beam with its beamWidth most probable
 * next tokens, scores a sequence by the sum of log(probability + 1e-10), and
 * keeps the beamWidth best candidates. A beam is finished once it emits
 * endTokenId; the search stops after maxLength steps or when every kept beam
 * is finished.
 * @param {Function} scoreNextToken - Function that returns logits for the next token given a sequence
 * @param {Array} initialTokens - Initial token sequence
 * @param {number} beamWidth - Beam width
 * @param {number} maxLength - Maximum number of expansion steps
 * @param {number} endTokenId - Token ID that indicates the end of a sequence
 * @returns {Array} - Token sequence of the highest-scoring beam
 */
languageModelMath.beamSearch = function (
  scoreNextToken,
  initialTokens,
  beamWidth = 3,
  maxLength = 20,
  endTokenId = 0
) {
  // Produce the beamWidth best one-token extensions of a single beam.
  const expand = (beam) => {
    const logits = scoreNextToken(beam.tokens);
    return machineLearning.softmax(logits)
      .map((probability, tokenId) => [tokenId, Math.log(probability + 1e-10)])
      .sort((left, right) => right[1] - left[1])
      .slice(0, beamWidth)
      .map(([tokenId, logProb]) => ({
        tokens: [...beam.tokens, tokenId],
        score: beam.score + logProb,
        finished: tokenId === endTokenId
      }));
  };

  let frontier = [{ tokens: [...initialTokens], score: 0, finished: false }];
  let step = 0;

  while (step < maxLength) {
    const pool = [];
    for (const beam of frontier) {
      if (beam.finished) {
        // Finished beams compete unchanged with the new extensions.
        pool.push(beam);
      } else {
        pool.push(...expand(beam));
      }
    }

    pool.sort((left, right) => right.score - left.score);
    frontier = pool.slice(0, beamWidth);

    if (frontier.every((beam) => beam.finished)) {
      break;
    }
    step += 1;
  }

  return frontier[0].tokens;
};
/**
 * Calculate BLEU score for machine translation evaluation.
 *
 * Sentences are tokenized on whitespace (empty tokens are discarded). For each
 * n-gram order from 1 to maxNGramOrder the modified precision is computed by
 * clipping every candidate n-gram count to the largest count of that n-gram in
 * any single reference. The score is the brevity penalty times the geometric
 * mean of the precisions; a precision of 0 is replaced by 1e-10 so the
 * logarithm stays finite.
 *
 * N-gram counts are kept in Maps rather than plain objects so that tokens such
 * as "constructor" or "toString" cannot collide with inherited object members.
 *
 * @param {string} candidate - Candidate translation
 * @param {Array} references - Reference translations (at least one)
 * @param {number} maxNGramOrder - Maximum n-gram order to consider
 * @returns {number} - BLEU score; 0 when the candidate contains no tokens
 * @throws {Error} If references is empty
 */
languageModelMath.bleuScore = function (candidate, references, maxNGramOrder = 4) {
  // Whitespace tokenizer that drops the empty strings split() yields for
  // leading/trailing whitespace or an empty input.
  const tokenize = (text) => text.split(/\s+/).filter((token) => token !== '');

  // Count every n-gram of the given order in a token list.
  const countNGrams = (tokens, order) => {
    const counts = new Map();
    for (let start = 0; start + order <= tokens.length; start++) {
      const ngram = tokens.slice(start, start + order).join(' ');
      counts.set(ngram, (counts.get(ngram) || 0) + 1);
    }
    return counts;
  };

  if (references.length === 0) {
    throw new Error('At least one reference translation is required');
  }

  const candidateTokens = tokenize(candidate);
  const referenceTokens = references.map((ref) => tokenize(ref));
  const candidateLength = candidateTokens.length;

  // Nothing to match; also avoids dividing by a zero length in the brevity penalty.
  if (candidateLength === 0) {
    return 0;
  }

  // Sum of log-precisions, each already divided by the number of orders.
  let avgLogPrecision = 0;
  for (let n = 1; n <= maxNGramOrder; n++) {
    const candidateCounts = countNGrams(candidateTokens, n);

    // Largest count of each n-gram across the references (the clipping ceiling).
    const maxRefCounts = new Map();
    for (const refTokens of referenceTokens) {
      for (const [ngram, count] of countNGrams(refTokens, n)) {
        maxRefCounts.set(ngram, Math.max(maxRefCounts.get(ngram) || 0, count));
      }
    }

    let clippedCount = 0;
    let totalCount = 0;
    for (const [ngram, count] of candidateCounts) {
      clippedCount += Math.min(count, maxRefCounts.get(ngram) || 0);
      totalCount += count;
    }

    const precision = totalCount === 0 ? 0 : clippedCount / totalCount;
    avgLogPrecision += Math.log(precision || 1e-10) / maxNGramOrder;
  }

  // Reference length closest to the candidate length (first one wins ties).
  let closestRefLength = referenceTokens[0].length;
  let closestDiff = Math.abs(candidateLength - closestRefLength);
  for (let i = 1; i < referenceTokens.length; i++) {
    const diff = Math.abs(candidateLength - referenceTokens[i].length);
    if (diff < closestDiff) {
      closestDiff = diff;
      closestRefLength = referenceTokens[i].length;
    }
  }

  // Brevity penalty: only candidates shorter than the closest reference are penalized.
  const bp = candidateLength >= closestRefLength ? 1 : Math.exp(1 - closestRefLength / candidateLength);

  return bp * Math.exp(avgLogPrecision);
};
/**
 * Precompute 2x2 rotation matrices for rotary positional embeddings (RoPE).
 * For position pos and even feature index i the rotation angle is
 * pos / base^(i / dim); one matrix is produced for every even i below dim.
 * @param {number} dim - Embedding dimension
 * @param {number} maxSequenceLength - Maximum sequence length
 * @param {number} base - Base for frequency calculation
 * @returns {Array} - For each position, a list of 2x2 rotation matrices
 *   [[cos, -sin], [sin, cos]], one per feature pair
 */
languageModelMath.createRotaryEmbedding = function (dim, maxSequenceLength, base = 10000) {
  // The divisor depends only on the feature pair, so compute it once up front.
  const divisors = [];
  for (let feature = 0; feature < dim; feature += 2) {
    divisors.push(base ** (feature / dim));
  }

  const table = [];
  for (let pos = 0; pos < maxSequenceLength; pos += 1) {
    table.push(divisors.map((divisor) => {
      const angle = pos / divisor;
      const cosine = Math.cos(angle);
      const sine = Math.sin(angle);
      return [
        [cosine, -sine],
        [sine, cosine]
      ];
    }));
  }
  return table;
};
/**
 * Rotate consecutive feature pairs of every row with precomputed 2x2 matrices.
 * Row i uses the rotations at index i modulo the number of positions, and
 * feature pair k uses rotation k modulo the number of rotations available for
 * that position. When a row has an odd number of features, the unpaired last
 * feature is left at 0 in the output.
 * @param {Array} x - Input tensor (queries or keys), one row per position
 * @param {Array} rotaryEmbeddings - Rotation matrices per position
 * @returns {Array} - New rows with the rotations applied
 */
languageModelMath.applyRotaryEmbedding = function (x, rotaryEmbeddings) {
  return x.map((vector, rowIdx) => {
    const rotations = rotaryEmbeddings[rowIdx % rotaryEmbeddings.length];
    const rotated = new Array(vector.length).fill(0);

    // Walk complete pairs only; a trailing unpaired feature keeps its 0 fill.
    for (let pair = 0; 2 * pair + 1 < vector.length; pair += 1) {
      const [[r00, r01], [r10, r11]] = rotations[pair % rotations.length];
      const first = vector[2 * pair];
      const second = vector[2 * pair + 1];
      rotated[2 * pair] = r00 * first + r01 * second;
      rotated[2 * pair + 1] = r10 * first + r11 * second;
    }
    return rotated;
  });
};
/**
 * Block-wise ("flash"-style) scaled dot-product attention.
 *
 * Keys and values are consumed blockSize rows at a time. For every query the
 * function keeps a running maximum score, a running sum of exponentials and an
 * unnormalized output accumulator. Whenever a new block raises the running
 * maximum, the sum and the accumulator gathered so far are rescaled by
 * exp(oldMax - newMax) before the block is added, so the final result equals
 * softmax(Q K^T / sqrt(d)) V regardless of the block size.
 *
 * This is a simplified illustration of the tiling idea only; scores are
 * obtained with matrix.dot one query/key pair at a time.
 *
 * @param {Array} queries - Query vectors (one row per query)
 * @param {Array} keys - Key vectors (one row per key)
 * @param {Array} values - Value vectors (one row per key)
 * @param {number} blockSize - Number of keys processed per block (positive integer)
 * @returns {Array} - One output row per query, as wide as a value vector;
 *   an empty array when there are no queries
 * @throws {Error} If blockSize is not a positive integer
 */
languageModelMath.flashAttention = function (queries, keys, values, blockSize = 32) {
  if (!Number.isInteger(blockSize) || blockSize < 1) {
    // A zero step would never advance and a fractional one would index between rows.
    throw new Error('blockSize must be a positive integer');
  }

  const queryCount = queries.length;
  if (queryCount === 0) {
    return [];
  }

  const keyCount = keys.length;
  const headDim = queries[0].length;
  const valueDim = keyCount > 0 ? values[0].length : headDim;
  const scaling = 1 / Math.sqrt(headDim);

  // Per-query online-softmax state.
  const accumulators = queries.map(() => new Array(valueDim).fill(0));
  const runningMax = new Array(queryCount).fill(-Infinity);
  const runningSum = new Array(queryCount).fill(0);

  for (let blockStart = 0; blockStart < keyCount; blockStart += blockSize) {
    const blockEnd = Math.min(blockStart + blockSize, keyCount);

    for (let i = 0; i < queryCount; i++) {
      // Score the block once, remembering its maximum for numerical stability.
      const blockScores = [];
      let blockMax = -Infinity;
      for (let j = blockStart; j < blockEnd; j++) {
        const score = matrix.dot(queries[i], keys[j]) * scaling;
        blockScores.push(score);
        blockMax = Math.max(blockMax, score);
      }

      // Every score is -Infinity: the block carries no weight for this query.
      if (blockMax === -Infinity) {
        continue;
      }

      const newMax = Math.max(runningMax[i], blockMax);
      // Bring what was accumulated under the old maximum onto the new scale.
      const correction = runningMax[i] === -Infinity ? 0 : Math.exp(runningMax[i] - newMax);
      const accumulator = accumulators[i];
      for (let d = 0; d < valueDim; d++) {
        accumulator[d] *= correction;
      }

      let blockSum = 0;
      for (let j = blockStart; j < blockEnd; j++) {
        const weight = Math.exp(blockScores[j - blockStart] - newMax);
        blockSum += weight;
        for (let d = 0; d < valueDim; d++) {
          accumulator[d] += weight * values[j][d];
        }
      }

      runningSum[i] = runningSum[i] * correction + blockSum;
      runningMax[i] = newMax;
    }
  }

  // Divide by the softmax denominator; rows that saw no usable key stay at 0.
  return accumulators.map((accumulator, i) => (
    runningSum[i] > 0 ? accumulator.map((value) => value / runningSum[i]) : accumulator
  ));
};
/**
 * Learn Byte Pair Encoding (BPE) merges from a text.
 *
 * The text starts as a sequence of single characters (Unicode code points, so
 * characters outside the Basic Multilingual Plane stay intact). Up to numMerges
 * times, the most frequent adjacent token pair is merged into a new token; the
 * earliest-seen pair wins ties. Learning stops early when no pair is left.
 *
 * Pairs are tracked as structured entries rather than delimiter-joined strings,
 * so text containing any character (including commas) is handled correctly.
 *
 * @param {string} text - Input text
 * @param {number} numMerges - Number of merge operations to perform
 * @returns {Object} - { vocabulary, mergeOperations, encode }:
 *   vocabulary is the list of initial characters followed by merged tokens,
 *   mergeOperations is a list of [first, second, merged] triples in learning
 *   order, and encode(text) tokenizes a new string by replaying those merges
 */
languageModelMath.bytePairEncoding = function (text, numMerges) {
  // Replace each left-to-right, non-overlapping (first, second) occurrence with merged.
  const applyMerge = (sequence, first, second, merged) => {
    const result = [];
    for (let i = 0; i < sequence.length; i++) {
      if (i < sequence.length - 1 && sequence[i] === first && sequence[i + 1] === second) {
        result.push(merged);
        i++; // Skip the second half of the pair
      } else {
        result.push(sequence[i]);
      }
    }
    return result;
  };

  // Return the most frequent adjacent pair as { first, second, count }, or null.
  const findMostFrequentPair = (sequence) => {
    const pairs = new Map();
    for (let i = 0; i < sequence.length - 1; i++) {
      const first = sequence[i];
      const second = sequence[i + 1];
      // Length-prefixing the first token keeps the key unambiguous for any content.
      const key = `${first.length}:${first}${second}`;
      let entry = pairs.get(key);
      if (entry === undefined) {
        entry = { first, second, count: 0 };
        pairs.set(key, entry);
      }
      entry.count += 1;
    }

    let best = null;
    for (const entry of pairs.values()) {
      // Strict comparison keeps the earliest-seen pair on ties.
      if (best === null || entry.count > best.count) {
        best = entry;
      }
    }
    return best;
  };

  // Initialize with character-level tokens (code-point aware).
  let tokens = Array.from(text);
  const vocabulary = new Set(tokens);
  const mergeOperations = [];

  for (let merge = 0; merge < numMerges; merge++) {
    const bestPair = findMostFrequentPair(tokens);
    if (bestPair === null) break;

    const { first, second } = bestPair;
    const newToken = first + second;
    vocabulary.add(newToken);
    mergeOperations.push([first, second, newToken]);
    tokens = applyMerge(tokens, first, second, newToken);
  }

  return {
    vocabulary: Array.from(vocabulary),
    mergeOperations,
    encode(input) {
      // Replay the learned merges, in learning order, on the new text.
      let encoded = Array.from(input);
      for (const [first, second, merged] of mergeOperations) {
        encoded = applyMerge(encoded, first, second, merged);
      }
      return encoded;
    }
  };
};
/**
 * Kullback-Leibler divergence D(p || q) using the natural logarithm.
 * Terms with p[i] <= 0 contribute nothing, and every q[i] is raised to at
 * least 1e-10 before dividing so the logarithm stays finite.
 * @param {Array} p - First probability distribution
 * @param {Array} q - Second probability distribution
 * @returns {number} - KL divergence
 * @throws {Error} If the two arrays differ in length
 */
languageModelMath.klDivergence = function (p, q) {
  if (p.length !== q.length) {
    throw new Error('Distributions must have the same length');
  }
  return p.reduce((total, pi, idx) => {
    if (pi > 0) {
      const flooredQ = Math.max(q[idx], 1e-10);
      return total + pi * Math.log(pi / flooredQ);
    }
    return total;
  }, 0);
};
/**
 * Calculate metrics for language model evaluation from hard predictions.
 *
 * Because each prediction is a single token ID, it is treated as a one-hot
 * distribution: the true token has probability 1 on a hit and 0 on a miss,
 * with 0 floored at 1e-10 before the logarithm. A hit therefore adds nothing
 * to the cross-entropy and a miss adds -log(1e-10). Hits are decided with
 * strict equality, the same test used for accuracy.
 *
 * @param {Array} predictions - Model predictions (token IDs)
 * @param {Array} targets - Target token IDs
 * @param {number} vocabSize - Size of the vocabulary; accepted for interface
 *   compatibility but not needed, since only hit/miss matters for one-hot input
 * @returns {Object} - { accuracy, perplexity, crossEntropy }, where
 *   crossEntropy is the mean per-token loss and perplexity is its exponential;
 *   all three are NaN when predictions is empty
 */
languageModelMath.evaluateLanguageModel = function (predictions, targets, vocabSize) {
  // Loss contributed by one wrong prediction.
  const missPenalty = -Math.log(1e-10);
  const total = predictions.length;

  // One pass yields both the hit count and the summed loss, with no
  // vocabulary-sized allocations per token.
  let correct = 0;
  let crossEntropy = 0;
  for (let i = 0; i < total; i++) {
    if (predictions[i] === targets[i]) {
      correct++;
    } else {
      crossEntropy += missPenalty;
    }
  }

  const accuracy = correct / total;
  const avgCrossEntropy = crossEntropy / total;
  const perplexity = Math.exp(avgCrossEntropy);

  return {
    accuracy,
    perplexity,
    crossEntropy: avgCrossEntropy
  };
};
// Expose the populated namespace object as this CommonJS module's export.
module.exports = languageModelMath;