@shumai/shumai
Version:
A fast, network-connected, differentiable tensor library for TypeScript (and JavaScript). Built with bun + flashlight for software engineers and researchers alike.
209 lines (208 loc) • 10.9 kB
TypeScript
import type { Tensor } from '../tensor';
import { Module } from './module';
/**
* A module that generates the sinusoidal positional encoding for a Transformer of a given input dimension:
*
* $$ \mathrm{PE}_{i, 2z} = \sin \left( \frac{i}{10000^{2z/d}} \right) $$
*
* $$ \mathrm{PE}_{i, 2z + 1} = \cos \left( \frac{i}{10000^{2z/d}} \right) $$
*
* where $i$ is the sequence position, $2z$ and $2z+1$ are the (even and odd) dimension indices of the input embedding, and $d$ is the dimensionality of the input embedding. The constant 10000 is exposed as {@link TransformerPositionalEncoding.ENCODING_BASE}.
*
* The multiplicative factors $\frac{1}{10000^{2z/d}}$ are precomputed during object creation as they are constant for all $i$.
*
* The full PE is initially precomputed for all $i$ up to 256 (or `initSequenceLength` given in the constructor). If the module is called with a sequence length larger than what has already been computed, the additional PE values are also calculated and then stored.
*/
export declare class TransformerPositionalEncoding extends Module {
/**
* The default `initSequenceLength` if none is supplied in the constructor.
*/
static readonly DEFAULT_SEQUENCE_LENGTH = 256;
/**
* The base of the exponent in the positional encoding.
*/
static readonly ENCODING_BASE = 10000;
// Private state. Member types are erased in this declaration output, so the notes below are
// inferred from the class description and member names only.
// NOTE(review): presumably `dim` is the constructor argument, `sequenceLength` is the length the
// cached encoding currently covers, `encodingFactors` holds the precomputed multiplicative
// factors, and `encoding` holds the stored PE values — confirm against the implementation.
private dim;
private sequenceLength;
private encodingFactors;
private encoding;
/**
* @param dim - Number of dimensions of each input embedding
* @param initSequenceLength - Initial sequence length that the positional encoding should be computed for, or {@link TransformerPositionalEncoding.DEFAULT_SEQUENCE_LENGTH} if not specified
*/
constructor(dim: number, initSequenceLength?: number);
/**
* Calculate positional encodings at a given range of sequence positions.
*
* The returned shape `[end - start, dim]` implies a half-open range, i.e. `start` is included and `end` is excluded.
* NOTE(review): whether the result is also stored in the cache is not visible from this declaration — confirm against the implementation.
*
* @param start - Start of the range to calculate
* @param end - End of the range to calculate
*
* @returns a Tensor of calculated positional encodings with shape `[end - start, dim]`
*/
calculate(start: number, end: number): Tensor;
/**
* Return the positional encodings for the first `sequenceLength` positions.
*
* @param sequenceLength - Length of the sequence for which the positional encoding should be calculated
* @returns a Tensor of positional encodings with shape `[sequenceLength, dim]`, using precomputed values if available
*/
forward(sequenceLength: number): Tensor;
}
/**
* Scaled dot-product attention mechanism as described by Vaswani et al. The private `scaleFactor` is computed during object creation as $\frac{1}{\sqrt{d}}$, where $d$ is the dimensionality of the inputs.
*/
export declare class TransformerDotProductAttention extends Module {
// Private state; member types are erased in this declaration output.
// NOTE(review): `dim` is presumably the constructor argument $d$ — confirm against the implementation.
private dim;
private scaleFactor;
/**
* @param dim - Number of dimensions of the inputs
*/
constructor(dim: number);
/**
* Scaling step, exposed as `protected` so that subclasses can call or override it.
*
* NOTE(review): presumably applies `scaleFactor` ($\frac{1}{\sqrt{d}}$) to `tensor`; the implementation is not visible in this declaration file — confirm.
*
* @param tensor - Tensor to be scaled
* @returns The scaled Tensor
*/
protected scale(tensor: Tensor): Tensor;
/**
* Apply scaled dot-product attention to the given queries, keys and values.
*
* @param queries - Tensor of query embeddings, shape `[..., queryTokens, dim]`
* @param keys - Tensor of key embeddings, shape `[..., keyTokens, dim]`
* @param values - Tensor of value embeddings each corresponding to a key, shape `[..., keyTokens, dim]`
* @param mask - Optional Tensor mask of shape `[queryTokens, keyTokens]` where a 1 in position $(i, j)$ indicates that the $i$th query should not attend to the $j$th key
* @returns A Tensor of shape `[..., queryTokens, dim]`
*/
forward(queries: Tensor, keys: Tensor, values: Tensor, mask?: Tensor): Tensor;
}
/**
* Multi-head attention mechanism as described by Vaswani et al. The input Tensors are linearly embedded before being passed to {@link TransformerDotProductAttention | scaled dot-product attentions}.
*/
export declare class TransformerMultiheadAttention extends Module {
// Private state; member types are erased in this declaration output.
// NOTE(review): presumably `queryEmbed`/`keyEmbed`/`valueEmbed` are the linear embeddings applied
// to the inputs, `attention` is the scaled dot-product attention, and `concatEmbed` maps the
// combined heads back to `dim` — confirm against the implementation.
private dim;
private heads;
private attentionDim;
private queryEmbed;
private keyEmbed;
private valueEmbed;
private attention;
private concatEmbed;
/**
* @param dim - Number of dimensions of the input embeddings
* @param heads - Number of heads for the multi-head attention
* @param attentionDim - Number of dimensions of the further embeddings which are passed to the scaled dot-product attention mechanisms, or `dim` if not specified
*/
constructor(dim: number, heads: number, attentionDim?: number);
/**
* Apply multi-head attention to the given queries, keys and values.
*
* @param queries - Tensor of query vectors, shape `[..., queryTokens, dim]`
* @param keys - Tensor of key vectors, shape `[..., keyTokens, dim]`
* @param values - Tensor of value vectors each corresponding to a key, shape `[..., keyTokens, dim]`
* @param mask - Optional Tensor mask of shape `[queryTokens, keyTokens]` for the {@link TransformerDotProductAttention}; see its `forward` documentation for the mask convention
* @returns A Tensor of shape `[..., queryTokens, dim]`
*/
forward(queries: Tensor, keys: Tensor, values: Tensor, mask?: Tensor): Tensor;
}
/**
* A layer of the Transformer encoder, as described by Vaswani et al, consisting of a {@link TransformerMultiheadAttention | multi-head attention} layer and a fully-connected feed forward network. Both of these use residual connections and are normalised with {@link LayerNorm}.
*/
export declare class TransformerEncoderLayer extends Module {
// Private state; member types are erased in this declaration output.
// NOTE(review): presumably `mha`/`mhaNorm` are the multi-head attention and its normalisation,
// and `ff`/`ffNorm` are the feed forward network and its normalisation — confirm against the
// implementation.
private dim;
private heads;
private attentionDim;
private feedForwardDim;
private mha;
private mhaNorm;
private ff;
private ffNorm;
/**
* @param dim - Number of dimensions of the input embeddings
* @param heads - Number of heads in the multi-head attention mechanism
* @param attentionDim - Number of dimensions of the embeddings which are passed to the scaled dot-product attention mechanisms, or `dim` if not specified
* @param feedForwardDim - Number of dimensions in the hidden layer of the feed forward network, or `dim` if not specified
*/
constructor(dim: number, heads: number, attentionDim?: number, feedForwardDim?: number);
/**
* Pass the input through this encoder layer; the output has the same shape as the input.
*
* @param input - Input Tensor of shape `[..., tokens, dim]`
* @returns A Tensor of shape `[..., tokens, dim]`
*/
forward(input: Tensor): Tensor;
}
/**
* Transformer encoder as described by Vaswani et al containing an arbitrary number of {@link TransformerEncoderLayer | TransformerEncoderLayers}.
*
* This module includes the {@link TransformerPositionalEncoding | positional encoding}, but does not include any initial embedding of an input sequence into vectors (which should have been separately done by e.g. word2vec).
*/
export declare class TransformerEncoder extends Module {
// Private state; member types are erased in this declaration output.
// NOTE(review): presumably `positional` is the positional encoding module and `layers` holds the
// `depth` encoder layers — confirm against the implementation.
private dim;
private heads;
private depth;
private attentionDim;
private feedForwardDim;
private positional;
private layers;
/**
* @param dim - Number of dimensions of the input embeddings
* @param heads - Number of heads in each multi-head attention mechanism
* @param depth - Number of encoder layers
* @param attentionDim - Number of dimensions of the embeddings which are passed to the scaled dot-product attention mechanisms, or `dim` if not specified
* @param feedForwardDim - Number of dimensions in the hidden layer of each feed forward network, or `dim` if not specified
* @param initSequenceLength - Initial sequence length that the positional encoding should be computed for, or {@link TransformerPositionalEncoding.DEFAULT_SEQUENCE_LENGTH} if not specified
*/
constructor(dim: number, heads: number, depth: number, attentionDim?: number, feedForwardDim?: number, initSequenceLength?: number);
/**
* Encode a sequence of already-embedded tokens; the output has the same shape as the input.
*
* @param input - Input Tensor of shape `[..., tokens, dim]`
* @returns A Tensor of shape `[..., tokens, dim]`
*/
forward(input: Tensor): Tensor;
}
/**
* A layer of the Transformer decoder, as described by Vaswani et al, consisting of a masked {@link TransformerMultiheadAttention | multi-head} self-attention layer, an unmasked {@link TransformerMultiheadAttention | multi-head} cross-attention layer and a fully-connected feed forward network. All of these use residual connections and are normalised with {@link LayerNorm}.
*/
export declare class TransformerDecoderLayer extends Module {
// Private state; member types are erased in this declaration output.
// NOTE(review): presumably each `*Norm` member is the normalisation paired with the sub-layer of
// the same name (`maskedSelfAttention`, `crossAttention`, `ff`) — confirm against the
// implementation.
private dim;
private heads;
private attentionDim;
private feedForwardDim;
private maskedSelfAttention;
private maskedSelfAttentionNorm;
private crossAttention;
private crossAttentionNorm;
private ff;
private ffNorm;
/**
* @param dim - Number of dimensions of the input embeddings
* @param heads - Number of heads in each multi-head attention mechanism
* @param attentionDim - Number of dimensions of the embeddings which are passed to the scaled dot-product attention mechanisms, or `dim` if not specified
* @param feedForwardDim - Number of dimensions in the hidden layer of the feed forward network, or `dim` if not specified
*/
constructor(dim: number, heads: number, attentionDim?: number, feedForwardDim?: number);
/**
* Generate a self-attention mask for a sequence of the given length.
*
* The mask follows the convention documented on {@link TransformerDotProductAttention}, where a 1 in position $(i, j)$ means that the $i$th query should not attend to the $j$th key; the 1s after position $i$ in row $i$ therefore hide later positions from the $i$th token.
* NOTE(review): "up to $i$" is presumably inclusive of $i$ (so a token can attend to itself), and this is presumably the mask used by the masked self-attention layer — confirm against the implementation.
*
* @param sequenceLength - Length of sequence for which the mask should be generated
* @returns A Tensor mask of shape `[sequenceLength, sequenceLength]` where row $i$ should have 0s in positions up to $i$ and 1s everywhere else
*/
static getSelfAttentionMask(sequenceLength: number): Tensor;
/**
* Pass the input through this decoder layer, attending to the encoder output in the cross-attention layer.
*
* @param input - Tensor from the previous decoder layer, shape `[..., tokens, dim]`
* @param encoderOutput - Tensor output by the encoder, shape `[..., encoderTokens, dim]`
* @returns A Tensor of shape `[..., tokens, dim]`
*/
forward(input: Tensor, encoderOutput: Tensor): Tensor;
}
/**
* Transformer decoder as described by Vaswani et al containing an arbitrary number of {@link TransformerDecoderLayer | TransformerDecoderLayers}.
*
* NOTE(review): the constructor accepts an `initSequenceLength` for a {@link TransformerPositionalEncoding | positional encoding} and the class declares a private `positional` member, so this module presumably applies the positional encoding to its input itself — confirm against the implementation.
*/
export declare class TransformerDecoder extends Module {
// Private state; member types are erased in this declaration output.
// NOTE(review): presumably `positional` is the positional encoding module and `layers` holds the
// `depth` decoder layers — confirm against the implementation.
private dim;
private heads;
private depth;
private attentionDim;
private feedForwardDim;
private positional;
private layers;
/**
* @param dim - Number of dimensions of the input embeddings
* @param heads - Number of heads in each multi-head attention mechanism
* @param depth - Number of decoder layers
* @param attentionDim - Number of dimensions of the embeddings which are passed to the scaled dot-product attention mechanisms, or `dim` if not specified
* @param feedForwardDim - Number of dimensions in the hidden layer of each feed forward network, or `dim` if not specified
* @param initSequenceLength - Initial sequence length that the positional encoding should be computed for, or {@link TransformerPositionalEncoding.DEFAULT_SEQUENCE_LENGTH} if not specified
*/
constructor(dim: number, heads: number, depth: number, attentionDim?: number, feedForwardDim?: number, initSequenceLength?: number);
/**
* Decode a sequence of already-embedded tokens given the encoder output; the output has the same shape as `input`.
*
* @param input - Input Tensor of shape `[..., tokens, dim]`
* @param encoderOutput - Tensor output by the encoder, shape `[..., encoderTokens, dim]`
* @returns A Tensor of shape `[..., tokens, dim]`
*/
forward(input: Tensor, encoderOutput: Tensor): Tensor;
}