/*
 * @modelx/model
 * Version:
 * Deep Learning Classification, LSTM Time Series, Regression and Multi-Layered Perceptrons with Tensorflow
 * 538 lines (524 loc) • 23.2 kB
 * text/typescript
 */
import { TensorScriptOptions, TensorScriptProperties, Matrix, Vector, TensorScriptLayers, NestedArray, InputTextArray, PredictionOptions, Shape, TensorScriptLSTMModelContext, LambdaLayer, DenseLayer, asyncForEach, Features, Corpus} from './model_interface';
import * as Tensorflow from '@tensorflow/tfjs-node';
import { BaseNeuralNetwork, } from './base_neural_network';
import range from 'lodash.range';
import TSNE from 'tsne-js';
//https://towardsdatascience.com/understanding-feature-engineering-part-4-deep-learning-methods-for-text-data-96c44370bbfa
/** Dictionary from a string label to a weight `Matrix`. */
export type LabeledWeight = Record<string, Matrix>;
/** Lookup from a numeric feature id to the feature (string or number) it stands for. */
export type IdToFeature = Record<number, string | number>;
/** Lookup from a feature label to its numeric id. */
export type FeatureToId = Record<string, number>;
/**
* Uses a corpus to generate features from an embedding layer with Tensorflow
* @class FeatureEmbedding
* @extends {BaseNeuralNetwork}
*/
export class FeatureEmbedding extends BaseNeuralNetwork {
/** Layer descriptors recorded by generateLayers; compileModel falls back to them when no layers are passed. */
layers?: TensorScriptLayers;
/** Feature -> id lookup; assigned by train (from getFeatureDataSet) or by importEmbeddings. */
featureToId?: FeatureToId;
/** Id -> feature lookup; assigned alongside featureToId and read by labelWeights. */
idToFeature?: IdToFeature;
/** Rows of feature ids that generateBatch / getContextPairs iterate over during training. */
featureIds?: Matrix;
/** Vocabulary size; used as the embedding input dimension and the softmax width in generateLayers. */
numberOfFeatures?: number;
/** Loss reported by the last training step; assigned at the end of train. */
loss?: number;
/** Set to true by importEmbeddings; train then treats a non-empty x_matrix as featureIds. */
importedEmbeddings?: boolean;
// settings: TensorScriptOptions;
/**
 * Builds the vocabulary lookups for a corpus and encodes the corpus as rows of feature ids.
 *
 * Id 0 is reserved for the padding token. When the method is bound to an instance whose
 * `settings.PAD` is set, that value is used as the padding token; otherwise the `PAD`
 * argument is used. The same token is written to both lookups.
 *
 * @param inputMatrixFeatures - rows of features to index
 * @param PAD - padding token used when no bound `settings.PAD` is available
 * @param initialIdToFeature - existing id -> feature entries to keep
 * @param initialFeatureToId - existing feature -> id entries to keep
 * @returns `featureToId`, `idToFeature`, the encoded `featureIds` and `numberOfFeatures`
 *          (the number of keys in `featureToId`)
 */
static async getFeatureDataSet(this: any, { inputMatrixFeatures, PAD = 'PAD', initialIdToFeature, initialFeatureToId, }: {
  inputMatrixFeatures: Corpus;
  PAD?: string;
  initialIdToFeature?: IdToFeature;
  initialFeatureToId?: FeatureToId;
}) {
  // Own-property test: a truthiness test would treat id 0 as "missing" and would treat
  // inherited keys such as "constructor" or "toString" as already indexed.
  const hasOwn = (target: object, key: unknown): boolean =>
    Object.prototype.hasOwnProperty.call(target, key as PropertyKey);
  // Resolve the padding token once so both lookups agree on what id 0 means.
  const padToken: string = this && this.settings && this.settings.PAD
    ? this.settings.PAD
    : PAD;
  let featIndex = initialFeatureToId ? Object.keys(initialFeatureToId).length : 1;
  const idToFeature: IdToFeature = {
    0: padToken,
    ...initialIdToFeature,
  };
  const featureToId: FeatureToId = {
    [padToken]: 0,
    ...initialFeatureToId,
  };
  inputMatrixFeatures.forEach((inputFeatureArray) => {
    inputFeatureArray.forEach((inputFeature) => {
      if (hasOwn(featureToId, inputFeature)) return;
      // Skip every id that is already taken, not just the first one.
      while (hasOwn(idToFeature, featIndex)) featIndex++;
      featureToId[inputFeature] = featIndex;
      //@ts-ignore
      idToFeature[featIndex] = inputFeature;
      featIndex++;
    });
  });
  //@ts-ignore
  const featureIds = inputMatrixFeatures.map(inputFeatureArray =>
    inputFeatureArray.map(inputFeature => featureToId[inputFeature])
  );
  const numberOfFeatures = Object.keys(featureToId).length;
  return {
    featureToId, //word2id
    idToFeature, //id2word
    featureIds, //wids
    numberOfFeatures, //vocab_size
  };
}
/**
 * Overlays `merger` onto a copy of `base` and returns the result; neither input is modified.
 *
 * @param base - template array; its length is the output length when `truncate` is true
 * @param merger - values written over the template
 * @param append - when true the values are aligned to the end of `base`, otherwise to the start
 * @param truncate - when true the result is cut to `base.length` (keeping the tail when
 *                   `append` is true, the head otherwise)
 * @returns the merged array
 */
static getMergedArray(base:Vector = [], merger:Vector= [], append=false, truncate=true) {
  const merged = new Array().concat(base);
  if (append) merged.splice(base.length - merger.length, merger.length, ...merger);
  else merged.splice(0, merger.length, ...merger);
  if (!truncate) return merged;
  if (append) {
    // slice(-0) is slice(0) and would return the whole array, so an empty base is handled explicitly.
    return base.length ? merged.slice(-1 * base.length) : new Array();
  }
  return merged.slice(0, base.length);
}
/**
 * Builds (context, target) training pairs from rows of feature ids.
 *
 * For every non-zero id in a row one pair is produced:
 * - `x`: the row with that position removed, right-aligned into a zero vector of length
 *   `windowSize * 2` (only the last entries are kept when the row is longer);
 * - `y`: a vector of length `numberOfFeatures` with a 1 at the target id.
 *
 * NOTE(review): the context is the whole row minus the target rather than a window centred on
 * the target - confirm this is intended for long rows.
 *
 * @param inputMatrix - rows of feature ids; id 0 is skipped as a target
 * @param numberOfFeatures - length of each target vector
 * @param window_size - half the context length; a bound instance's `settings.windowSize`
 *                      takes precedence when set
 * @param tf - Tensorflow module to use; falls back to the bound instance's `tf`, then to
 *             the module imported by this file
 * @returns the context length, the two zero templates and the `x` / `y` matrices
 */
static async getContextPairs(this:any, { inputMatrix, numberOfFeatures, window_size = 2, tf, }: { inputMatrix: Matrix; numberOfFeatures: number; window_size?: number; tf?: any;}) {
  // Honour the explicitly supplied backend before falling back.
  const tensorflow = tf || (this && this.tf ? this.tf : Tensorflow);
  const context_length = (this && this.settings && this.settings.windowSize ? this.settings.windowSize : window_size) * 2;
  const xZeros = tensorflow.zeros([context_length]);
  const yZeros = tensorflow.zeros([numberOfFeatures]);
  const [emptyXVector, emptyYVector] = await Promise.all([
    xZeros.array(),
    yZeros.array(),
  ]);
  // The tensors were only needed for their values; release them explicitly.
  xZeros.dispose();
  yZeros.dispose();
  const x:Matrix = [];
  const y:Matrix = [];
  inputMatrix.forEach((inputVector:Vector) => {
    inputVector.forEach((word:number, index:number) => {
      if (word === 0) return;
      const output = new Array().concat(emptyYVector);
      const inputMerger = new Array().concat(inputVector);
      inputMerger.splice(index, 1);
      const input = FeatureEmbedding.getMergedArray(emptyXVector, inputMerger, true);
      output[word] = 1;
      x.push(input);
      y.push(output);
    });
  });
  return {
    context_length,
    emptyXVector,
    emptyYVector,
    x,
    y,
  };
}
/** Instance handle to the static getMergedArray helper. */
getMergedArray: typeof FeatureEmbedding.getMergedArray;
/** Static getFeatureDataSet bound to this instance, so it can read `this.settings`. */
getFeatureDataSet: typeof FeatureEmbedding.getFeatureDataSet;
/** Static getContextPairs bound to this instance, so it can read `this.settings` and `this.tf`. */
getContextPairs: typeof FeatureEmbedding.getContextPairs;
/**
 * Creates the model with the defaults below, shallowly overridden by `options`.
 * @param options - settings that replace the defaults key by key
 * @param properties - forwarded unchanged to the base class constructor
 */
constructor(options:TensorScriptOptions = {}, properties?:TensorScriptProperties) {
  const defaults = {
    layers: [],
    type: 'cbow',
    compile: {
      loss: 'categoricalCrossentropy',
      optimizer: 'rmsprop',
    },
    fit: {
      epochs: 15,
      batchSize: 1,
    },
    embedSize: 50,
    windowSize: 2,
    PAD: 'PAD',
    streamInputMatrix: true,
    initialLayerInitializerType: 'randomNormal',
    initialLayerInitializerOptions: { seed: 1 },
  };
  super({ ...defaults, ...options }, properties);
  this.type = 'FeatureEmbedding';
  // Expose the static helpers on the instance; two of them are bound so `this` is the instance.
  this.getMergedArray = FeatureEmbedding.getMergedArray;
  this.getFeatureDataSet = FeatureEmbedding.getFeatureDataSet.bind(this);
  this.getContextPairs = FeatureEmbedding.getContextPairs.bind(this);
}
/**
 * Adds the network layers to `this.model`: an embedding layer, a flatten layer and a
 * softmax dense layer with `numberOfFeatures` units. Also records `this.yShape` and
 * `this.layers`.
 * @override
 * @param x_matrix - not read by this implementation
 * @param y_matrix - not read by this implementation
 * @param layers - when the first entry has `weights`, they replace the model's first weight tensor
 * @throws {ReferenceError} when `numberOfFeatures` or `settings.embedSize` is missing
 */
generateLayers(this: FeatureEmbedding, x_matrix: Matrix, y_matrix: Matrix, layers?: TensorScriptLayers) {
  if (!this.numberOfFeatures) throw ReferenceError(`${this.settings.name} model is missing numberOfFeatures`);
  if (!this.settings.embedSize) throw ReferenceError(`${this.settings.name} model is missing embedSize`);
  const vocabularySize = this.numberOfFeatures;
  const embedSize = this.settings.embedSize;
  const embeddingShape:Vector = [vocabularySize, embedSize,];
  this.yShape = embeddingShape;

  const initializerType = this.settings.initialLayerInitializerType;
  const embeddingConfig = {
    units: vocabularySize,
    inputDim: vocabularySize,
    outputDim: embedSize,
    inputLength: (this.settings.windowSize || 2) * 2,
    embeddingsInitializer: initializerType
      ? this.tf.initializers[initializerType](this.settings.initialLayerInitializerOptions)
      : undefined,
  };
  // Descriptor kept in `this.layers` only; no lambda layer is added to the model.
  const lambdaConfig = {
    lambdaFunction: 'result = tf.mean(input,1,true)',
    lambdaOutputShape: [vocabularySize, embedSize]
  };
  const softmaxConfig = { units: vocabularySize, activation: 'softmax', };

  const denseLayers: TensorScriptLayers = [];
  denseLayers.push(embeddingConfig);
  denseLayers.push(lambdaConfig);
  denseLayers.push(softmaxConfig);
  this.layers = denseLayers;

  this.model.add(this.tf.layers.embedding(denseLayers[0]));
  this.model.add(this.tf.layers.flatten());
  this.model.add(this.tf.layers.dense(denseLayers[2]));

  // Swap in supplied embedding weights, leaving the other weight tensors as created.
  const suppliedWeights = layers && layers.length ? layers[0].weights : undefined;
  if (suppliedWeights) {
    const modelWeights = this.model.getWeights();
    modelWeights[0] = suppliedWeights;
    this.model.setWeights(modelWeights);
  }
}
/**
 * Trains the model on each row of `x_input_matrix` / `y_output_matrix` in turn, one
 * `model.trainOnBatch` call per row, firing the configured `settings.fit.callbacks`.
 *
 * @param x_input_matrix - input rows
 * @param y_output_matrix - target rows, matched to the inputs by index
 * @param epoch - epoch number passed through to the callbacks
 * @param trainingLoss - loss reported to the "begin" callbacks
 * @param inputVectorIndex - position of the source row, used for the completion percentage
 * @param inputVectorLength - total number of source rows, used for the completion percentage
 * @returns `{ loss }` from the last row trained, or `Infinity` when there were no rows
 */
async trainOnBatch({ x_input_matrix, y_output_matrix, epoch, trainingLoss, inputVectorIndex, inputVectorLength,}: { x_input_matrix: Matrix, y_output_matrix: Matrix, epoch: number, trainingLoss: number, inputVectorIndex?:number, inputVectorLength?:number, }) {
  let loss = Infinity;
  if (this.settings.fit?.callbacks?.onEpochBegin) this.settings.fit?.callbacks?.onEpochBegin(epoch, { loss:trainingLoss });
  await asyncForEach(x_input_matrix, async (x_input:Vector, xIndex:number) => {
    if (this.settings.fit?.callbacks?.onBatchBegin) this.settings.fit?.callbacks?.onBatchBegin(xIndex, { loss:trainingLoss, });
    const y_output = y_output_matrix[xIndex];
    const xs = this.tf.tensor(x_input, this.getInputShape([x_input]));
    try {
      const ys = this.tf.tensor(y_output, this.getInputShape([y_output]));
      try {
        loss = await this.model.trainOnBatch(xs, ys);
        if (this.settings.fit?.callbacks?.onYield) this.settings.fit?.callbacks?.onYield(epoch, xIndex, { loss,
          inputVectorIndex,
          inputVectorLength,
          // `??` keeps a legitimate index of 0 instead of falling back to xIndex.
          completion: `${( 100 * ( ((inputVectorIndex ?? xIndex)+1) / (inputVectorLength||x_input_matrix.length) ) ).toFixed(2)}%` });
        if (this.settings.fit?.callbacks?.onBatchEnd) this.settings.fit?.callbacks?.onBatchEnd(xIndex, { loss });
      } finally {
        ys.dispose();
      }
    } finally {
      // Release the tensors even when training or a callback throws.
      xs.dispose();
    }
  });
  if (this.settings.fit?.callbacks?.onEpochEnd) this.settings.fit?.callbacks?.onEpochEnd(epoch, { loss });
  return {
    loss
  };
}
/**
 * Streams through `this.featureIds`, building one (context, target) pair per non-zero id
 * and training on it immediately via trainOnBatch.
 *
 * When `settings.checkInputMatrix` is set, ids that are >= `numberOfFeatures` (as the target
 * or anywhere in the context) are reported with `console.warn` and skipped.
 *
 * NOTE(review): the context is the whole row minus the target, right-aligned and cut to
 * `windowSize * 2` entries, not a window centred on the target - confirm this is intended.
 *
 * @param epoch - epoch number passed through to trainOnBatch
 * @returns `{ loss }` from the last pair trained, or `Infinity` when nothing was trained
 * @throws {ReferenceError} when `featureIds` or `numberOfFeatures` is missing
 */
async generateBatch({ epoch }: { epoch: number }) {
  if (!this.featureIds) throw ReferenceError(`${this.settings.name} model is missing featureIds`);
  if (!this.numberOfFeatures) throw ReferenceError(`${this.settings.name} model is missing numberOfFeatures`);
  const preTransformedMatrix: Matrix = this.featureIds;
  const numberOfFeatures = this.numberOfFeatures;
  const checkInputMatrix = Boolean(this.settings.checkInputMatrix);
  const context_length = (this && this.settings && this.settings.windowSize ? this.settings.windowSize : 2) * 2;
  const xZeros = this.tf.zeros([context_length]);
  const yZeros = this.tf.zeros([numberOfFeatures]);
  const [emptyXVector, emptyYVector] = await Promise.all([
    xZeros.array(),
    yZeros.array(),
  ]);
  // The tensors were only needed for their values; release them explicitly.
  xZeros.dispose();
  yZeros.dispose();
  let trainingLoss = Infinity;
  await asyncForEach(preTransformedMatrix, async (inputVector: Vector, inputVectorIndex: number) => {
    await asyncForEach(inputVector, async (word:number, index:number) => {
      if (checkInputMatrix && word >= numberOfFeatures) {
        console.warn('invalid word in corpus', {
          trainingLoss,
          word,
          'this.idToFeature[word]': this.idToFeature&& this.idToFeature[word],
          'this.numberOfFeatures': this.numberOfFeatures,
        });
        return;
      }
      // Id 0 is never used as a training target.
      if (word == 0) return;
      const output = new Array().concat(emptyYVector);
      const inputMerger = new Array().concat(inputVector);
      inputMerger.splice(index, 1);
      const input = FeatureEmbedding.getMergedArray(emptyXVector, inputMerger,true);
      output[word] = 1;
      if (checkInputMatrix && input.some((wordInput:number) => wordInput >= numberOfFeatures)) {
        console.warn('Input matrix contains unknown weight', { input, 'this.numberOfFeatures': this.numberOfFeatures });
        return;
      }
      const modelStatus = await this.trainOnBatch({
        x_input_matrix: [input],
        y_output_matrix: [output],
        epoch,
        trainingLoss,
        inputVectorIndex,
        inputVectorLength: preTransformedMatrix.length,
      });
      trainingLoss = modelStatus.loss;
    });
  });
  return {
    loss: trainingLoss,
  };
}
/**
 * Returns the vocabulary lookups together with the embedding weights keyed by feature.
 * @returns `featureToId`, `idToFeature`, `featureIds`, `numberOfFeatures` and `labeledWeights`
 * @throws {ReferenceError} when the model has not been trained
 */
async exportEmbeddings() {
  if (this.trained !== true) throw new ReferenceError('The model has to be trained before embeddings can be exported');
  const labeledWeights = this.labelWeights(await this.predict());
  const { featureToId, idToFeature, featureIds, numberOfFeatures } = this;
  return { featureToId, idToFeature, featureIds, numberOfFeatures, labeledWeights, };
}
/**
 * Rebuilds and compiles the model from previously exported embeddings.
 *
 * The supplied `labeledWeights` object is not modified; a shallow copy is used internally.
 *
 * @param featureToId - feature -> id lookup belonging to the weights
 * @param idToFeature - id -> feature lookup belonging to the weights
 * @param featureIds - encoded corpus to store on the instance
 * @param numberOfFeatures - vocabulary size; replaced (with a warning) by the number of
 *                           weight rows when the two disagree
 * @param labeledWeights - embedding vector per feature label
 * @param addNewWeights - when true, features without a usable weight get a random vector
 *                        drawn uniformly from [-1, 1)
 * @param fixImportedWeights - when true, nested `[[...]]` weights are flattened
 * @param inputMatrixFeatures - extra corpus used to extend the lookups (only when
 *                              `addNewWeights` is true)
 * @throws {RangeError} when the first weight is missing or its length differs from
 *                      `settings.embedSize`
 */
async importEmbeddings({ featureToId, idToFeature, featureIds, numberOfFeatures, labeledWeights, addNewWeights = true, inputMatrixFeatures, fixImportedWeights = false, }: {
  featureToId?: FeatureToId;
  idToFeature?: IdToFeature;
  featureIds?: Matrix,
  numberOfFeatures?: number;
  labeledWeights: LabeledWeights;
  addNewWeights?: boolean;
  fixImportedWeights?: boolean;
  inputMatrixFeatures?: Corpus;
}) {
  // Collapses a nested [[...]] weight one level; anything else passes through untouched.
  const flattenWeight = (weight: any): Vector => {
    if (!Array.isArray(weight) || !Array.isArray(weight[0])) return weight;
    return typeof weight.flat === 'function'
      ? weight.flat()
      : weight.reduce((acc: number[], val: number[]) => acc.concat(val), []);
  };
  // Work on a copy so the caller's object is left as it was passed in.
  const weightsByLabel: LabeledWeights = { ...labeledWeights };
  this.model = undefined;
  if (addNewWeights) {
    if (inputMatrixFeatures) {
      const updatedModelProperties = await this.getFeatureDataSet({ inputMatrixFeatures, initialIdToFeature: idToFeature, initialFeatureToId: featureToId, });
      featureToId = updatedModelProperties.featureToId;
      idToFeature = updatedModelProperties.idToFeature;
      featureIds = updatedModelProperties.featureIds;
      numberOfFeatures = updatedModelProperties.numberOfFeatures;
    }
    if (featureToId) {
      await asyncForEach(Object.keys(featureToId), async (weightLabel: string) => {
        const existingWeight = weightsByLabel[weightLabel];
        if (existingWeight && existingWeight.length) return;
        const randomWeight = this.tf.randomUniform([1, this.settings.embedSize], -1, 1);
        try {
          weightsByLabel[weightLabel] = flattenWeight(await randomWeight.array());
        } finally {
          // Only the values are kept; release the tensor.
          randomWeight.dispose();
        }
      });
    }
  }
  if (fixImportedWeights) {
    Object.keys(weightsByLabel).forEach(weightLabel => {
      weightsByLabel[weightLabel] = flattenWeight(weightsByLabel[weightLabel]);
    });
  }
  const firstLabeledWeight = Object.keys(weightsByLabel)[0];
  if (!firstLabeledWeight || !weightsByLabel[firstLabeledWeight] || weightsByLabel[firstLabeledWeight].length !== this.settings.embedSize) throw new RangeError(`imported weights (${weightsByLabel[firstLabeledWeight]?weightsByLabel[firstLabeledWeight].length:'firstLabeledWeight:undefined'}) must have the same embedding size as model (${this.settings.embedSize})`);
  // Row i of the embedding matrix has to hold the vector of feature id i. Object key order
  // is not id order (integer-like labels are enumerated first), so order the rows by id
  // whenever the lookup covers exactly the labels present; otherwise keep key order.
  let orderedLabels = Object.keys(weightsByLabel);
  if (idToFeature) {
    const idLookup = idToFeature;
    const sortedIds = Object.keys(idLookup).map(Number).sort((a, b) => a - b);
    const labelsById = sortedIds.map(id => String(idLookup[id]));
    const idsAreContiguous = sortedIds.every((id, position) => id === position);
    const coversAllWeights = labelsById.length === orderedLabels.length
      && new Set(labelsById).size === labelsById.length
      && labelsById.every(label => Object.prototype.hasOwnProperty.call(weightsByLabel, label));
    if (idsAreContiguous && coversAllWeights) orderedLabels = labelsById;
  }
  const trainedWeights = this.tf.variable(this.tf.tensor(orderedLabels.map(label => weightsByLabel[label])));
  this.featureToId = featureToId;
  this.idToFeature = idToFeature;
  this.featureIds = featureIds;
  this.numberOfFeatures = numberOfFeatures;
  if (trainedWeights.shape[0] !== this.numberOfFeatures) {
    console.warn('INVALID NUMBER OF this.numberOfFeatures', {
      'trainedWeights.shape[0]': trainedWeights.shape[0],
      'this.numberOfFeatures': this.numberOfFeatures,
    });
    this.numberOfFeatures = trainedWeights.shape[0];
  }
  this.compileModel({ layers: [{ weights: trainedWeights }] });
  this.importedEmbeddings = true;
}
/**
 * Replaces `this.model` with a fresh sequential model, adds the layers through
 * generateLayers, compiles it with `settings.compile` and marks the instance as compiled.
 * @param layers - layer overrides handed to generateLayers; defaults to `this.layers`
 */
compileModel({ layers, }: { layers?: DenseLayer[] } = {}) {
  const layerOverrides = layers || this.layers;
  this.model = undefined;
  this.model = this.tf.sequential();
  //@ts-ignore
  this.generateLayers([], [], layerOverrides);
  this.model.compile(this.settings.compile);
  this.compiled = true;
}
/**
 * Trains the embedding model for `settings.fit.epochs` epochs.
 *
 * When any of the vocabulary fields is missing, they are built from `x_matrix` with
 * getFeatureDataSet. Otherwise, if embeddings were imported and `x_matrix` is non-empty,
 * `x_matrix` replaces `this.featureIds` as-is.
 *
 * With `settings.streamInputMatrix` each epoch streams pairs through generateBatch;
 * otherwise all context pairs are built once and reused for every epoch.
 *
 * @param x_matrix - corpus rows (or rows of ids after an import, see above)
 * @param y_matrix - not read by this implementation
 * @param layers - forwarded to compileModel when the model is not compiled yet
 * @returns the trained `this.model`
 * @throws {ReferenceError} in non-streaming mode when `numberOfFeatures` or `featureIds` is missing
 */
async train(x_matrix: Matrix, y_matrix:Matrix, layers?: DenseLayer[]) {
  if (!this.featureToId || !this.idToFeature || !this.featureIds || !this.numberOfFeatures) {
    const featureEmbedDataSet = await this.getFeatureDataSet({ inputMatrixFeatures: x_matrix, });
    this.featureToId = featureEmbedDataSet.featureToId;
    this.idToFeature = featureEmbedDataSet.idToFeature;
    this.featureIds = featureEmbedDataSet.featureIds;
    this.numberOfFeatures = featureEmbedDataSet.numberOfFeatures;
  } else if (this.importedEmbeddings && x_matrix && x_matrix.length) this.featureIds = x_matrix;
  if (this.compiled === false) this.compileModel({layers});
  let loss = Infinity;
  // The pairs depend only on featureIds, numberOfFeatures and the window size, none of which
  // change between epochs, so they are built on first use and reused afterwards.
  let contextPairs: { x: Matrix; y: Matrix } | undefined;
  if (this.settings.fit?.callbacks?.onTrainBegin) this.settings.fit?.callbacks?.onTrainBegin({ loss });
  await asyncForEach(range(0, this.settings.fit?.epochs), async (epoch: number) => {
    if (this.settings.streamInputMatrix) {
      const modelStatus = await this.generateBatch({epoch,});
      loss = modelStatus.loss;
      return;
    }
    if (!this.numberOfFeatures) throw ReferenceError(`${this.settings.name} model is missing numberOfFeatures`);
    if (!this.featureIds) throw ReferenceError(`${this.settings.name} model is missing featureIds`);
    const pairs = contextPairs
      || (contextPairs = await this.getContextPairs({ tf:this.tf, numberOfFeatures: this.numberOfFeatures, inputMatrix: this.featureIds }));
    const modelStatus = await this.trainOnBatch({ x_input_matrix: pairs.x, y_output_matrix: pairs.y, epoch, trainingLoss:loss, });
    loss = modelStatus.loss;
  });
  if (this.settings.fit?.callbacks?.onTrainEnd) this.settings.fit?.callbacks?.onTrainEnd({ loss });
  this.loss = loss;
  this.trained = true;
  return this.model;
}
// async calculate(x_matrix: Matrix | Vector | InputTextArray) {
/** Resolves with the first entry of the model's weight list. */
async calculate() {
  const [firstWeights] = this.model.getWeights();
  return firstWeights;
}
// async predict(input_matrix: any[], options: PredictionOptions | undefined) {
/**
 * Reads the weights returned by calculate.
 * @param options - pass `json: false` to get the tensor's flat `data()` buffer; any other
 *                  value returns the nested array passed through `this.reshape`
 * @throws {Error} when `yShape` has not been set (nested-array path only)
 */
async predict(options: PredictionOptions = {}) {
  const weightTensor = await this.calculate();
  if (options.json === false) return await weightTensor.data();
  const nestedValues = await weightTensor.array();
  if (!this.yShape) throw new Error('Model is missing yShape');
  return this.reshape(nestedValues, weightTensor.shape);
}
/**
 * Keys each weight row by the feature that `this.idToFeature` lists for the row's index.
 * Returns an empty object when `idToFeature` is not set.
 * @example
 * // with idToFeature = { 0: 'car', 1: 'boat' }
 * const weights = [
 *   [1.5, 1, 4, 1.6],
 *   [4.3, 3.2, 5.5, 6.5],
 * ];
 * FeatureEmbeddingInstance.labelWeights(weights); //=>
 * // { car: [1.5, 1, 4, 1.6], boat: [4.3, 3.2, 5.5, 6.5] }
 */
labelWeights(weights: Matrix) {
  const labeled: { [index: string]: Vector;} = {};
  weights.forEach((weight:Vector, index:number) => {
    if (this.idToFeature) labeled[this.idToFeature[index]] = weight;
  });
  return labeled;
}
/**
 * Runs tSNE over the weights and returns the model's output.
 * @param weights - rows to reduce
 * @param options - merged over the default TSNE settings below; `options.type` is also
 *                  passed to `init` (the type is 'dense' when no options are given)
 * @example
 * const weights = [
 *   [1.5, 1, 4, 1.6],
 *   [4.3, 3.2, 5.5, 6.5],
 * ];
 * FeatureEmbeddingInstance.reduceWeights(weights); //=>
 * // [ [1, 2], [2, 3] ]
 */
async reduceWeights(weights: Matrix, options?: any) {
  const tsneSettings = {
    dim: 2,
    perplexity: 30.0,
    earlyExaggeration: 4.0,
    learningRate: 100.0,
    nIter: 1000,
    metric: 'euclidean',
    ...options
  };
  const dataType = options ? options.type : 'dense';
  const reducer = new TSNE(tsneSettings);
  reducer.init({ data: weights, type: dataType });
  reducer.run();
  return reducer.getOutput();
}
/**
 * Ranks the other features by similarity to each requested feature.
 *
 * Every comparison reports both `tf.metrics.cosineProximity` and `tf.metrics.meanSquaredError`;
 * results are sorted ascending by the chosen metric. The padding entry (`settings.PAD`) and
 * the requested feature itself are excluded. A caller-supplied `labeledWeights` object is
 * not modified.
 *
 * @param weights - weight rows, labeled through labelWeights when `options.labeledWeights` is absent
 * @param options - `features` to look up, `limit` per feature (default 5), optional
 *                  `labeledWeights`, and `metric` ('distance' by default, otherwise proximity)
 * @returns for each requested feature, up to `limit` entries of
 *          `{ comparedFeature, proximity, distance }`
 * @throws {ReferenceError} when a requested feature has no weights
 * @example
 * //labeledWeights = { car: [1,2,3], tesla: [1,2,2], boat: [0,-1,3] }
 * FeatureEmbeddingInstance.findSimilarFeatures(weights, { features: ['car'], limit: 2 }) //=>
 * {
 *   car: [
 *     { comparedFeature: 'tesla', proximity: -0.5087087154388428, distance: 0.03015853278338909 },
 *     { comparedFeature: 'boat', proximity: -0.3032159209251404, distance: 0.036241017282009125 },
 *   ]
 * }
 */
async findSimilarFeatures(weights: Matrix, options: SimilarFeatureOptions = {}) {
  const tf:typeof Tensorflow = this.tf;
  const { features = [], limit=5, labeledWeights, metric='distance' } = options;
  // Copy before removing the padding entry so the caller's object stays intact.
  const labeledFeatureWeights: LabeledWeights = { ...(labeledWeights || this.labelWeights(weights)) };
  if(this.settings && this.settings.PAD) delete labeledFeatureWeights[this.settings.PAD];
  const candidateFeatures = Object.keys(labeledFeatureWeights);
  return features.reduce((result:SimilarFeatures,feature: string) => {
    const featureWeights = labeledFeatureWeights[feature];
    if(!featureWeights) throw new ReferenceError(`Invalid feature: ${feature}`);
    const sims = candidateFeatures
      // Drop the queried feature by name; relying on it sorting first breaks on ties.
      .filter(searchFeature => searchFeature !== feature)
      .map(searchFeature => {
        const prox = tf.tidy(() => {
          const featureTensor = tf.tensor(featureWeights);
          const comparedTensor = tf.tensor(labeledFeatureWeights[searchFeature]);
          const proximity = tf.metrics.cosineProximity(featureTensor, comparedTensor);
          const distance = tf.metrics.meanSquaredError(featureTensor, comparedTensor);
          return [
            proximity.asScalar().dataSync()[0],
            distance.asScalar().dataSync()[0],
          ];
        });
        return {
          comparedFeature: searchFeature,
          proximity: prox[0],
          distance: prox[1],
        }
      })
      .sort((a, b) => (metric === 'distance')
        ? a.distance - b.distance
        : a.proximity - b.proximity);
    result[feature]= sims.slice(0, limit);
    return result;
  }, {});
}
}
/** Dictionary from a feature label to its weight `Vector`. */
export type LabeledWeights = Record<string, Vector>;
/** Names of the two scores findSimilarFeatures can sort by. */
export enum SimilarityMetric {
/** Sort by the `distance` score. */
DISTANCE = 'distance',
/** Sort by the `proximity` score. */
PROXIMITY = 'proximity',
}
/** Options accepted by findSimilarFeatures. */
export type SimilarFeatureOptions = {
/** Features to find neighbours for; findSimilarFeatures defaults this to an empty list. */
features?: string[];
/** Maximum number of results per feature; findSimilarFeatures defaults this to 5. */
limit?: number;
/** Pre-labeled weights to use instead of labeling the `weights` argument. */
labeledWeights?: LabeledWeights;
/** Score to sort by; findSimilarFeatures defaults this to 'distance'. */
metric?: SimilarityMetric;
}
/** Result of findSimilarFeatures: the ranked matches for each requested feature. */
export type SimilarFeatures = Record<string, SimilarFeature[]>;
/** One comparison result produced by findSimilarFeatures. */
export type SimilarFeature = {
/** Label of the feature that was compared against the requested one. */
comparedFeature: string;
/** Value of `tf.metrics.meanSquaredError` between the two weight vectors. */
distance: number;
/** Value of `tf.metrics.cosineProximity` between the two weight vectors. */
proximity: number;
}