/*
 * @modelx/model
 * Version:
 * Deep Learning Classification, LSTM Time Series, Regression and Multi-Layered Perceptrons with Tensorflow
 * 511 lines (510 loc) • 25 kB
 * JavaScript
 */
import { asyncForEach } from './model_interface';
import * as Tensorflow from '@tensorflow/tfjs-node';
import { BaseNeuralNetwork, } from './base_neural_network';
import range from 'lodash.range';
import TSNE from 'tsne-js';
/**
 * Calls an optional training hook stored on `settings.fit.callbacks`.
 * Missing `fit`, `callbacks` or hook entries are skipped; the hook's return
 * value is ignored (it is not awaited).
 * @param {Object} settings - model settings that may contain `fit.callbacks`
 * @param {string} hook - callback name, e.g. `onEpochBegin`
 * @param {...*} args - arguments forwarded to the hook
 */
function invokeFitCallback(settings, hook, ...args) {
  const callbacks = settings && settings.fit ? settings.fit.callbacks : undefined;
  if (callbacks && callbacks[hook]) {
    callbacks[hook](...args);
  }
}

/**
 * Builds a plain array of zeros. Used instead of `tf.zeros(...).array()` so no
 * tensor is allocated (and left undisposed) just to obtain a template vector.
 * @param {number} length - number of elements
 * @returns {Array<number>}
 */
function createZeroVector(length) {
  return Array.from({ length }, () => 0);
}

/**
 * Own-property test that is not fooled by falsy values (id `0`) or by names
 * inherited from `Object.prototype` (e.g. a feature called `constructor`).
 * @param {Object} object
 * @param {string} key
 * @returns {boolean}
 */
function hasOwnKey(object, key) {
  return Object.prototype.hasOwnProperty.call(object, key);
}

/**
 * Builds one training pair for the feature at `targetIndex`:
 * the input is every other id of the vector merged onto the tail of
 * `emptyXVector`, the output is a one-hot copy of `emptyYVector`.
 * @param {Array<number>} featureVector - ids of one corpus entry
 * @param {number} targetIndex - position of the target feature
 * @param {Array<number>} emptyXVector - zero template for the context input
 * @param {Array<number>} emptyYVector - zero template for the one-hot output
 * @returns {{input: Array<number>, output: Array<number>}}
 */
function buildContextPair(featureVector, targetIndex, emptyXVector, emptyYVector) {
  const context = [].concat(featureVector);
  context.splice(targetIndex, 1);
  const input = FeatureEmbedding.getMergedArray(emptyXVector, context, true);
  const output = [].concat(emptyYVector);
  output[featureVector[targetIndex]] = 1;
  return { input, output };
}

/**
 * Uses a corpus to generate features from an embedding layer with Tensorflow.
 * @class FeatureEmbedding
 * @extends {BaseNeuralNetwork}
 */
export class FeatureEmbedding extends BaseNeuralNetwork {
  /**
   * @param {Object} [options={}] - settings merged over the defaults below
   * @param {Object} [properties] - forwarded unchanged to BaseNeuralNetwork
   */
  constructor(options = {}, properties) {
    const config = {
      layers: [],
      type: 'cbow',
      compile: {
        loss: 'categoricalCrossentropy',
        optimizer: 'rmsprop',
      },
      fit: {
        epochs: 15,
        batchSize: 1,
      },
      embedSize: 50,
      windowSize: 2,
      PAD: 'PAD',
      streamInputMatrix: true,
      initialLayerInitializerType: 'randomNormal',
      initialLayerInitializerOptions: { seed: 1 },
      ...options,
    };
    super(config, properties);
    this.type = 'FeatureEmbedding';
    // featureToId, idToFeature, featureIds and numberOfFeatures are filled in
    // later by train() or importEmbeddings().
    this.getMergedArray = FeatureEmbedding.getMergedArray;
    // Instance versions are bound so they can read this.settings.
    this.getFeatureDataSet = FeatureEmbedding.getFeatureDataSet.bind(this);
    this.getContextPairs = FeatureEmbedding.getContextPairs.bind(this);
  }

  /**
   * Assigns a numeric id to every distinct feature of the corpus.
   * Id 0 is reserved for the padding token (`this.settings.PAD` when bound to
   * an instance, otherwise the `PAD` argument); the same token is used for
   * both lookup tables so they always agree.
   * @param {Object} params
   * @param {Array<Array<string>>} params.inputMatrixFeatures - corpus, one array of features per entry
   * @param {string} [params.PAD='PAD'] - padding token used when no instance setting exists
   * @param {Object} [params.initialIdToFeature] - existing id -> feature map to extend
   * @param {Object} [params.initialFeatureToId] - existing feature -> id map to extend
   * @returns {Promise<{featureToId: Object, idToFeature: Object, featureIds: Array<Array<number>>, numberOfFeatures: number}>}
   */
  static async getFeatureDataSet({ inputMatrixFeatures, PAD = 'PAD', initialIdToFeature, initialFeatureToId, }) {
    const padToken = this && this.settings && this.settings.PAD ? this.settings.PAD : PAD;
    let nextId = initialFeatureToId ? Object.keys(initialFeatureToId).length : 1;
    const idToFeature = {
      0: padToken,
      ...initialIdToFeature,
    };
    const featureToId = {
      [padToken]: 0,
      ...initialFeatureToId,
    };
    inputMatrixFeatures.forEach((featureVector) => {
      featureVector.forEach((feature) => {
        // An own-property test keeps the padding token on id 0 (a truthiness
        // test would treat id 0 as "unseen") and ignores inherited names.
        if (hasOwnKey(featureToId, feature)) {
          return;
        }
        // Skip every id that an imported map already occupies.
        while (idToFeature[nextId] !== undefined) {
          nextId++;
        }
        featureToId[feature] = nextId;
        idToFeature[nextId] = feature;
        nextId++;
      });
    });
    const featureIds = inputMatrixFeatures.map((featureVector) => featureVector.map((feature) => featureToId[feature]));
    const numberOfFeatures = Object.keys(featureToId).length;
    return {
      featureToId,
      idToFeature,
      featureIds,
      numberOfFeatures,
    };
  }

  /**
   * Overlays `merger` onto a copy of `base`.
   * @param {Array} [base=[]] - template array (never mutated)
   * @param {Array} [merger=[]] - values written over the template
   * @param {boolean} [append=false] - write at the tail instead of the head
   * @param {boolean} [truncate=true] - cut the result back to `base.length`
   *   (keeping the tail when appending, the head otherwise)
   * @returns {Array} the merged copy
   */
  static getMergedArray(base = [], merger = [], append = false, truncate = true) {
    const merged = [].concat(base);
    if (append) {
      merged.splice(base.length - merger.length, merger.length, ...merger);
    } else {
      merged.splice(0, merger.length, ...merger);
    }
    if (!truncate) {
      return merged;
    }
    // slice(-0) would return the whole array, so compute the start explicitly.
    return append
      ? merged.slice(merged.length - base.length)
      : merged.slice(0, base.length);
  }

  /**
   * Expands a matrix of feature ids into (context, one-hot target) pairs, one
   * pair per non-padding id.
   * @param {Object} params
   * @param {Array<Array<number>>} params.inputMatrix - feature ids per corpus entry
   * @param {number} params.numberOfFeatures - length of each one-hot output
   * @param {number} [params.window_size=2] - used when no instance `settings.windowSize` exists
   * @param {Object} [params.tf] - accepted for backward compatibility; the zero
   *   templates are plain arrays, so no tensorflow instance is needed
   * @returns {Promise<{context_length: number, emptyXVector: Array<number>, emptyYVector: Array<number>, x: Array<Array<number>>, y: Array<Array<number>>}>}
   */
  static async getContextPairs({ inputMatrix, numberOfFeatures, window_size = 2, tf, }) {
    const windowSize = this && this.settings && this.settings.windowSize ? this.settings.windowSize : window_size;
    const context_length = windowSize * 2;
    const emptyXVector = createZeroVector(context_length);
    const emptyYVector = createZeroVector(numberOfFeatures);
    const x = [];
    const y = [];
    inputMatrix.forEach((inputVector) => {
      inputVector.forEach((word, index) => {
        // Loose comparison kept on purpose: id 0 (padding) is skipped even if
        // it arrives as the string '0'.
        if (word != 0) {
          const { input, output } = buildContextPair(inputVector, index, emptyXVector, emptyYVector);
          x.push(input);
          y.push(output);
        }
      });
    });
    return {
      context_length,
      emptyXVector,
      emptyYVector,
      x,
      y,
    };
  }

  /**
   * Adds the embedding, flatten and softmax layers to `this.model`.
   * @override
   * @param {Array<Array<number>>} x_matrix - unused; kept for the base signature
   * @param {Array<Array<number>>} y_matrix - unused; kept for the base signature
   * @param {Array<Object>} layers - when `layers[0].weights` is set it replaces
   *   the first weight tensor of the freshly built model
   * @throws {ReferenceError} when `numberOfFeatures` or `settings.embedSize` is missing
   */
  generateLayers(x_matrix, y_matrix, layers) {
    if (!this.numberOfFeatures) {
      throw new ReferenceError(`${this.settings.name} model is missing numberOfFeatures`);
    }
    if (!this.settings.embedSize) {
      throw new ReferenceError(`${this.settings.name} model is missing embedSize`);
    }
    this.yShape = [this.numberOfFeatures, this.settings.embedSize];
    const { initialLayerInitializerType, initialLayerInitializerOptions } = this.settings;
    const embeddingLayer = {
      units: this.numberOfFeatures,
      inputDim: this.numberOfFeatures,
      outputDim: this.settings.embedSize,
      inputLength: (this.settings.windowSize || 2) * 2,
      embeddingsInitializer: initialLayerInitializerType
        ? this.tf.initializers[initialLayerInitializerType](initialLayerInitializerOptions)
        : undefined,
    };
    // Recorded in this.layers but not added to the model (see flatten below).
    const lambdaLayer = {
      lambdaFunction: 'result = tf.mean(input,1,true)',
      lambdaOutputShape: [this.numberOfFeatures, this.settings.embedSize],
    };
    const outputLayer = { units: this.numberOfFeatures, activation: 'softmax' };
    this.layers = [embeddingLayer, lambdaLayer, outputLayer];
    this.model.add(this.tf.layers.embedding(embeddingLayer));
    this.model.add(this.tf.layers.flatten());
    this.model.add(this.tf.layers.dense(outputLayer));
    if (layers && layers.length && layers[0].weights) {
      const modelWeights = this.model.getWeights();
      modelWeights[0] = layers[0].weights;
      this.model.setWeights(modelWeights);
    }
  }

  /**
   * Trains the model on each row of `x_input_matrix` in turn.
   * Fires `onEpochBegin`/`onEpochEnd` once per call and `onBatchBegin`,
   * `onYield` and `onBatchEnd` once per row.
   * @param {Object} params
   * @param {Array<Array<number>>} params.x_input_matrix - context inputs
   * @param {Array<Array<number>>} params.y_output_matrix - one-hot targets, aligned with the inputs
   * @param {number} params.epoch - current epoch, forwarded to callbacks
   * @param {number} params.trainingLoss - loss so far, forwarded to the "begin" callbacks
   * @param {number} [params.inputVectorIndex] - position of the source vector, for progress reporting
   * @param {number} [params.inputVectorLength] - number of source vectors, for progress reporting
   * @returns {Promise<{loss: number}>} loss of the last trained row (`Infinity` when there were no rows)
   */
  async trainOnBatch({ x_input_matrix, y_output_matrix, epoch, trainingLoss, inputVectorIndex, inputVectorLength, }) {
    let loss = Infinity;
    invokeFitCallback(this.settings, 'onEpochBegin', epoch, { loss: trainingLoss });
    await asyncForEach(x_input_matrix, async (x_input, xIndex) => {
      invokeFitCallback(this.settings, 'onBatchBegin', xIndex, { loss: trainingLoss, });
      const y_output = y_output_matrix[xIndex];
      const xs = this.tf.tensor(x_input, this.getInputShape([x_input]));
      let ys;
      try {
        ys = this.tf.tensor(y_output, this.getInputShape([y_output]));
        loss = await this.model.trainOnBatch(xs, ys);
      } finally {
        // Release the tensors even when training throws.
        xs.dispose();
        if (ys) {
          ys.dispose();
        }
      }
      const progressIndex = inputVectorIndex != null ? inputVectorIndex : xIndex;
      const progressTotal = inputVectorLength || x_input_matrix.length;
      invokeFitCallback(this.settings, 'onYield', epoch, xIndex, {
        loss,
        inputVectorIndex,
        inputVectorLength,
        completion: `${(100 * ((progressIndex + 1) / progressTotal)).toFixed(2)}%`,
      });
      invokeFitCallback(this.settings, 'onBatchEnd', xIndex, { loss });
    });
    invokeFitCallback(this.settings, 'onEpochEnd', epoch, { loss });
    return {
      loss,
    };
  }

  /**
   * Streams the corpus through the model: every non-padding id of
   * `this.featureIds` becomes a single-row call to trainOnBatch, so the full
   * set of context pairs is never held in memory.
   * With `settings.checkInputMatrix`, ids outside `numberOfFeatures` are
   * reported with console.warn and skipped.
   * @param {Object} params
   * @param {number} params.epoch - current epoch, forwarded to trainOnBatch
   * @returns {Promise<{loss: number}>} loss of the last trained pair
   * @throws {ReferenceError} when `featureIds` or `numberOfFeatures` is missing
   */
  async generateBatch({ epoch }) {
    if (!this.featureIds) {
      throw new ReferenceError(`${this.settings.name} model is missing featureIds`);
    }
    if (!this.numberOfFeatures) {
      throw new ReferenceError(`${this.settings.name} model is missing numberOfFeatures`);
    }
    const featureMatrix = this.featureIds;
    const featureCount = this.numberOfFeatures;
    const shouldValidate = Boolean(this.settings.checkInputMatrix) && featureCount > 0;
    const context_length = (this.settings && this.settings.windowSize ? this.settings.windowSize : 2) * 2;
    const emptyXVector = createZeroVector(context_length);
    const emptyYVector = createZeroVector(featureCount);
    let trainingLoss = Infinity;
    await asyncForEach(featureMatrix, async (inputVector, inputVectorIndex) => {
      await asyncForEach(inputVector, async (word, index) => {
        if (shouldValidate && word >= featureCount) {
          console.warn('invalid word in corpus', {
            trainingLoss,
            word,
            'this.idToFeature[word]': this.idToFeature && this.idToFeature[word],
            'this.numberOfFeatures': this.numberOfFeatures,
          });
          return;
        }
        // Loose comparison kept on purpose: padding (id 0) is never a target.
        if (word == 0) {
          return;
        }
        const { input, output } = buildContextPair(inputVector, index, emptyXVector, emptyYVector);
        if (shouldValidate && input.some((wordInput) => wordInput >= featureCount)) {
          console.warn('Input matrix contains unknown weight', { input, 'this.numberOfFeatures': this.numberOfFeatures });
          return;
        }
        const modelStatus = await this.trainOnBatch({
          x_input_matrix: [input],
          y_output_matrix: [output],
          epoch,
          trainingLoss,
          inputVectorIndex,
          inputVectorLength: featureMatrix.length,
        });
        trainingLoss = modelStatus.loss;
      });
    });
    return {
      loss: trainingLoss,
    };
  }

  /**
   * Returns the trained embedding weights keyed by feature together with the
   * lookup tables needed to import them again.
   * @returns {Promise<{featureToId: Object, idToFeature: Object, featureIds: Array, numberOfFeatures: number, labeledWeights: Object}>}
   * @throws {ReferenceError} when the model has not been trained
   */
  async exportEmbeddings() {
    if (this.trained !== true) {
      throw new ReferenceError('The model has to be trained before embeddings can be exported');
    }
    const weights = await this.predict();
    const labeledWeights = this.labelWeights(weights);
    return {
      featureToId: this.featureToId,
      idToFeature: this.idToFeature,
      featureIds: this.featureIds,
      numberOfFeatures: this.numberOfFeatures,
      labeledWeights,
    };
  }

  /**
   * Rebuilds the model from previously exported embeddings.
   * The caller's `labeledWeights` object is copied, never modified, and the
   * current model is only replaced once the import has been validated.
   * @param {Object} params
   * @param {Object} params.featureToId - feature -> id map
   * @param {Object} params.idToFeature - id -> feature map
   * @param {Array<Array<number>>} params.featureIds - corpus as ids
   * @param {number} params.numberOfFeatures - vocabulary size
   * @param {Object} params.labeledWeights - feature -> embedding vector
   * @param {boolean} [params.addNewWeights=true] - give features without a vector
   *   a random one in [-1, 1); with `inputMatrixFeatures` the maps are first
   *   extended with the new corpus
   * @param {Array<Array<string>>} [params.inputMatrixFeatures] - additional corpus
   * @param {boolean} [params.fixImportedWeights=false] - flatten vectors that were stored nested
   * @throws {RangeError} when the first vector's length differs from `settings.embedSize`
   */
  async importEmbeddings({ featureToId, idToFeature, featureIds, numberOfFeatures, labeledWeights, addNewWeights = true, inputMatrixFeatures, fixImportedWeights = false, }) {
    const embedSize = this.settings.embedSize;
    // Shallow copy keeps key order (and therefore row order) intact.
    const weightsByLabel = { ...labeledWeights };
    const flatten = (nested) => (typeof nested.flat === 'function'
      ? nested.flat()
      : nested.reduce((acc, val) => acc.concat(val), []));
    let importedFeatureToId = featureToId;
    let importedIdToFeature = idToFeature;
    let importedFeatureIds = featureIds;
    let importedNumberOfFeatures = numberOfFeatures;
    if (addNewWeights) {
      if (inputMatrixFeatures) {
        const updated = await this.getFeatureDataSet({
          inputMatrixFeatures,
          initialIdToFeature: idToFeature,
          initialFeatureToId: featureToId,
        });
        importedFeatureToId = updated.featureToId;
        importedIdToFeature = updated.idToFeature;
        importedFeatureIds = updated.featureIds;
        importedNumberOfFeatures = updated.numberOfFeatures;
      }
      if (importedFeatureToId) {
        await asyncForEach(Object.keys(importedFeatureToId), async (weightLabel) => {
          const existing = hasOwnKey(weightsByLabel, weightLabel) ? weightsByLabel[weightLabel] : undefined;
          if (existing && existing.length) {
            return;
          }
          const randomWeights = this.tf.randomUniform([embedSize], -1, 1);
          try {
            weightsByLabel[weightLabel] = await randomWeights.array();
          } finally {
            randomWeights.dispose();
          }
        });
      }
    }
    if (fixImportedWeights) {
      Object.keys(weightsByLabel).forEach((weightLabel) => {
        const vector = weightsByLabel[weightLabel];
        if (Array.isArray(vector) && Array.isArray(vector[0])) {
          weightsByLabel[weightLabel] = flatten(vector);
        }
      });
    }
    const firstLabel = Object.keys(weightsByLabel)[0];
    const firstWeights = firstLabel ? weightsByLabel[firstLabel] : undefined;
    if (!firstWeights || firstWeights.length !== embedSize) {
      throw new RangeError(`imported weights (${firstWeights ? firstWeights.length : 'firstLabeledWeight:undefined'}) must have the same embedding size as model (${embedSize})`);
    }
    const trainedWeights = this.tf.variable(this.tf.tensor(Object.values(weightsByLabel)));
    this.featureToId = importedFeatureToId;
    this.idToFeature = importedIdToFeature;
    this.featureIds = importedFeatureIds;
    this.numberOfFeatures = importedNumberOfFeatures;
    if (trainedWeights.shape[0] !== this.numberOfFeatures) {
      console.warn('INVALID NUMBER OF this.numberOfFeatures', {
        'trainedWeights.shape[0]': trainedWeights.shape[0],
        'this.numberOfFeatures': this.numberOfFeatures,
      });
      // The weight matrix decides the embedding's input dimension.
      this.numberOfFeatures = trainedWeights.shape[0];
    }
    this.compileModel({ layers: [{ weights: trainedWeights }] });
    this.importedEmbeddings = true;
  }

  /**
   * Replaces `this.model` with a new sequential model, adds the layers and
   * compiles it with `settings.compile`.
   * @param {Object} [params={}]
   * @param {Array<Object>} [params.layers] - passed to generateLayers; defaults to `this.layers`
   */
  compileModel({ layers, } = {}) {
    this.model = this.tf.sequential();
    this.generateLayers([], [], layers || this.layers);
    this.model.compile(this.settings.compile);
    this.compiled = true;
  }

  /**
   * Trains the embedding for `settings.fit.epochs` epochs.
   * Builds the feature lookup tables from `x_matrix` when they do not exist
   * yet; after importEmbeddings a non-empty `x_matrix` replaces `featureIds`.
   * With `settings.streamInputMatrix` pairs are generated and trained one at a
   * time, otherwise all context pairs are built once and reused every epoch.
   * @param {Array<Array<string>>} x_matrix - corpus, one array of features per entry
   * @param {*} y_matrix - unused
   * @param {Array<Object>} [layers] - forwarded to compileModel when the model is not compiled yet
   * @returns {Promise<Object>} the trained tensorflow model
   */
  async train(x_matrix, y_matrix, layers) {
    if (!this.featureToId || !this.idToFeature || !this.featureIds || !this.numberOfFeatures) {
      const dataSet = await this.getFeatureDataSet({ inputMatrixFeatures: x_matrix, });
      this.featureToId = dataSet.featureToId;
      this.idToFeature = dataSet.idToFeature;
      this.featureIds = dataSet.featureIds;
      this.numberOfFeatures = dataSet.numberOfFeatures;
    } else if (this.importedEmbeddings && x_matrix && x_matrix.length) {
      this.featureIds = x_matrix;
    }
    if (this.compiled === false) {
      this.compileModel({ layers });
    }
    let loss = Infinity;
    invokeFitCallback(this.settings, 'onTrainBegin', { loss });
    const epochs = this.settings.fit ? this.settings.fit.epochs : undefined;
    let contextPairs;
    await asyncForEach(range(0, epochs), async (epoch) => {
      if (this.settings.streamInputMatrix) {
        const streamStatus = await this.generateBatch({ epoch, });
        loss = streamStatus.loss;
        return;
      }
      if (!this.numberOfFeatures) {
        throw new ReferenceError(`${this.settings.name} model is missing numberOfFeatures`);
      }
      if (!this.featureIds) {
        throw new ReferenceError(`${this.settings.name} model is missing featureIds`);
      }
      // The pairs depend only on featureIds, so build them once.
      if (!contextPairs) {
        contextPairs = await this.getContextPairs({
          tf: this.tf,
          numberOfFeatures: this.numberOfFeatures,
          inputMatrix: this.featureIds,
        });
      }
      const batchStatus = await this.trainOnBatch({
        x_input_matrix: contextPairs.x,
        y_output_matrix: contextPairs.y,
        epoch,
        trainingLoss: loss,
      });
      loss = batchStatus.loss;
    });
    invokeFitCallback(this.settings, 'onTrainEnd', { loss });
    this.loss = loss;
    this.trained = true;
    return this.model;
  }

  /**
   * @returns {Promise<Object>} the first weight tensor of the model (the embedding matrix)
   */
  async calculate() {
    return this.model.getWeights()[0];
  }

  /**
   * Reads the embedding weights out of the model.
   * @param {Object} [options={}]
   * @param {boolean} [options.json] - `false` returns the flat typed-array data;
   *   anything else returns the nested array passed through `this.reshape`
   * @returns {Promise<*>} the embedding weights
   * @throws {Error} when `yShape` has not been set (nested-array mode only)
   */
  async predict(options = {}) {
    const predictions = await this.calculate();
    if (options.json === false) {
      return await predictions.data();
    }
    const nested = await predictions.array();
    if (!this.yShape) {
      throw new Error('Model is missing yShape');
    }
    return this.reshape(nested, predictions.shape);
  }

  /**
   * Converts a matrix of layer weights into an object keyed by feature, using
   * the row index as feature id. Returns `{}` when `idToFeature` is not set.
   * @example
   * const weights = [
   *   [1.5,1,4,1.6,3.5],
   *   [4.3,3.2,5.5,6.5]
   * ]
   * FeatureEmbeddingInstance.labelWeights(weights) //=>
   * {
   *   car:[1.5,1,4,1.6,3.5],
   *   boat:[4.3,3.2,5.5,6.5]
   * }
   * @param {Array<Array<number>>} weights - one row per feature id
   * @returns {Object} feature -> weight vector
   */
  labelWeights(weights) {
    return weights.reduce((result, weight, index) => {
      if (this.idToFeature) {
        result[this.idToFeature[index]] = weight;
      }
      return result;
    }, {});
  }

  /**
   * Uses tSNE to reduce dimensionality of features.
   * @example
   * const weights = [
   *   [1.5,1,4,1.6,3.5],
   *   [4.3,3.2,5.5,6.5]
   * ]
   * FeatureEmbeddingInstance.reduceWeights(weights) //=>
   * [
   *   [1,2],
   *   [2,3],
   * ]
   * @param {Array<Array<number>>} weights - data handed to tSNE
   * @param {Object} [options] - overrides for the TSNE constructor defaults;
   *   `options.type` selects the input type and defaults to 'dense'
   * @returns {Promise<Array<Array<number>>>} the tSNE output
   */
  async reduceWeights(weights, options) {
    const model = new TSNE({
      dim: 2,
      perplexity: 30.0,
      earlyExaggeration: 4.0,
      learningRate: 100.0,
      nIter: 1000,
      metric: 'euclidean',
      ...options,
    });
    model.init({
      data: weights,
      // Fall back to 'dense' whenever no type is given, not only when
      // options itself is missing.
      type: options && options.type ? options.type : 'dense',
    });
    model.run();
    return model.getOutput();
  }

  /**
   * Ranks the other features by similarity to each requested feature, using
   * tf.metrics.meanSquaredError ('distance') or tf.metrics.cosineProximity
   * ('proximity'). Results are sorted ascending by the chosen metric; the
   * requested feature itself and the padding token are never part of a result.
   * NOTE(review): ascending order assumes cosineProximity is more negative for
   * more similar vectors — confirm against the tfjs documentation.
   * @example
   * //weights = [ [1,2,3,], [1,2,2], [0,-1,3] ]
   * //labeledWeights = { car:[1,2,3,], tesla:[1,2,2], boat:[0,-1,3] }
   * FeatureEmbeddingInstance.findSimilarFeatures(weights,{features:['car'], limit:2,}) //=>
   * {
   *   car:[
   *     { comparedFeature: 'tesla', proximity: -0.5087087154388428, distance: 0.03015853278338909 },
   *     { comparedFeature: 'boat', proximity: -0.3032159209251404, distance: 0.036241017282009125 },
   *   ]
   * }
   * @param {Array<Array<number>>} weights - weight matrix, labeled with labelWeights when `options.labeledWeights` is absent
   * @param {Object} [options={}]
   * @param {Array<string>} [options.features=[]] - features to look up
   * @param {number} [options.limit=5] - maximum matches per feature
   * @param {Object} [options.labeledWeights] - feature -> vector map (not modified)
   * @param {string} [options.metric='distance'] - 'distance' or 'proximity'
   * @returns {Promise<Object>} feature -> array of `{comparedFeature, proximity, distance}`
   * @throws {ReferenceError} when a requested feature has no weights
   */
  async findSimilarFeatures(weights, options = {}) {
    const tf = this.tf;
    const { features = [], limit = 5, labeledWeights, metric = 'distance' } = options;
    // Work on a copy so removing the padding token never touches caller data.
    const labeledFeatureWeights = { ...(labeledWeights || this.labelWeights(weights)) };
    if (this.settings && this.settings.PAD) {
      delete labeledFeatureWeights[this.settings.PAD];
    }
    const candidates = Object.keys(labeledFeatureWeights);
    const rankBy = metric === 'distance' ? 'distance' : 'proximity';
    return features.reduce((result, feature) => {
      const featureWeights = labeledFeatureWeights[feature];
      if (!featureWeights) {
        throw new ReferenceError(`Invalid feature: ${feature}`);
      }
      result[feature] = candidates
        // Exclude the feature explicitly instead of assuming it sorts first,
        // which fails when another feature has an identical vector.
        .filter((searchFeature) => searchFeature !== String(feature))
        .map((searchFeature) => {
          const [proximity, distance] = tf.tidy(() => {
            const target = tf.tensor(featureWeights);
            const candidate = tf.tensor(labeledFeatureWeights[searchFeature]);
            return [
              tf.metrics.cosineProximity(target, candidate).asScalar().dataSync()[0],
              tf.metrics.meanSquaredError(target, candidate).asScalar().dataSync()[0],
            ];
          });
          return {
            comparedFeature: searchFeature,
            proximity,
            distance,
          };
        })
        .sort((a, b) => a[rankBy] - b[rankBy])
        .slice(0, limit);
      return result;
    }, {});
  }
}
/**
 * String constants for the two ranking metrics, 'distance' and 'proximity'.
 * @enum {string}
 */
export const SimilarityMetric = {
  DISTANCE: 'distance',
  PROXIMITY: 'proximity',
};