nmr-learning
Version:
Learn a database of chemical shift and coupling constant assignments
240 lines (212 loc) • 7.82 kB
JavaScript
const FS = require('fs');
const path = require('path');
const OCLE = require('openchemlib-extended');
const predictor = require('nmr-predictor');
const autoassigner = require('../../nmr-auto-assignment/src/index');
// const cheminfo = require('./preprocess/cheminfo');
// const maybridge = require('./preprocess/maybridge');
const compilePredictionTable = require('./compilePredictionTable');
const stats = require('./stats');
/**
 * Read a file addressed relative to this module's directory and return its
 * content as text.
 *
 * @param {string} filename - Path joined onto `__dirname` (a leading `/` is
 *   treated as a separator by `path.join`, not as the filesystem root).
 * @returns {string} The file content, decoded by `Buffer#toString()`.
 */
function loadFile(filename) {
  const absolutePath = path.join(__dirname, filename);
  const rawContent = FS.readFileSync(absolutePath);
  return rawContent.toString();
}
// Parsed JSON content of ../data/histogram_0_15ppm.json, read synchronously
// when this module is evaluated (throws if the file is missing or invalid).
// NOTE(review): `prior` is not referenced anywhere else in the visible code —
// confirm whether it is still needed before removing it.
const prior = JSON.parse(loadFile('/../data/histogram_0_15ppm.json'));
/**
 * Remove, in place, every molecule that must not be used for training:
 * entries flagged `general.ocl.hasLabile` and entries whose `general.ocl.id`
 * equals the `diaID` of any test-set entry.
 *
 * The previous implementation stopped at the first hit per test molecule and
 * dataset, so a labile entry could "consume" the removal and leave a real
 * train/test overlap in the training data. Every entry is now checked.
 *
 * @param {Array<Array<object>>} datasets - Training datasets; each array is mutated.
 * @param {Array<object>} testSet - Test entries; only `diaID` is read.
 * @returns {number} Total number of entries removed across all datasets.
 */
function removeExcludedMolecules(datasets, testSet) {
  const testIds = new Set(testSet.map((entry) => entry.diaID));
  let removed = 0;
  for (const dataset of datasets) {
    // Walk backwards so that splicing does not shift the entries still to visit.
    for (let j = dataset.length - 1; j >= 0; j--) {
      const ocl = dataset[j].general.ocl;
      if (ocl.hasLabile || testIds.has(ocl.id)) {
        dataset.splice(j, 1);
        removed++;
      }
    }
  }
  return removed;
}

/**
 * Reduce a list of candidate solutions to a single one.
 *
 * The first solution is kept; every position of its `assignment` array whose
 * value differs in at least one other solution is overwritten with '*'.
 * The first solution's `assignment` array is modified in place.
 *
 * @param {Array<object>} solutions - Candidate solutions (may be empty).
 * @returns {object|null} The first solution after merging, or null when the list is empty.
 */
function collapseAmbiguousAssignments(solutions) {
  if (solutions.length === 0) {
    return null;
  }
  const best = solutions[0];
  if (solutions.length > 1) {
    const assignment = best.assignment;
    for (let atom = 0; atom < assignment.length; atom++) {
      const signalId = assignment[atom];
      if (signalId === '*') {
        continue;
      }
      for (let k = 1; k < solutions.length; k++) {
        if (signalId !== solutions[k].assignment[atom]) {
          assignment[atom] = '*';
          break;
        }
      }
    }
  }
  return best;
}

/**
 * Run the iterative learning loop.
 *
 * Steps, in order:
 *  1. Load the test set, the training datasets and the starting prediction table.
 *  2. Drop from the training datasets the labile molecules and the molecules
 *     that also appear in the test set.
 *  3. For each iteration: auto-assign every training molecule with the current
 *     table, store the (merged) assignment on the sample, rebuild the table,
 *     write it to ../data/h_<iteration>.json and log the error statistics
 *     returned by `stats.cmp2asg` for the test set.
 *
 * Errors thrown inside the iteration loop are logged and end the loop; errors
 * thrown while loading the input files reject the returned promise.
 *
 * @returns {Promise<void>} Resolves once the loop has finished and 'Done' was logged.
 */
async function start() {
  const maxIterations = 20; // The loop runs while iteration < maxIterations
  const ignoreLabile = true; // Set the use of labile protons during training
  const learningRatio = 0.8; // A number between 0 and 1

  const testSet = JSON.parse(loadFile('/../data/assigned298.json'));
  const dataset1 = JSON.parse(
    FS.readFileSync(
      '/home/acastillo/Documents/data/procjson/cheminfo443_y.json'
    ).toString()
  );
  const dataset2 = JSON.parse(
    FS.readFileSync(
      '/home/acastillo/Documents/data/procjson/maybridge_y.json'
    ).toString()
  );
  const dataset3 = [];
  const datasets = [dataset1, dataset2, dataset3];

  // Starting prediction table. NOTE(review): presumably the table written by
  // iteration 13 of a previous run, since the loop below resumes at 14 — confirm.
  let fastDB = JSON.parse(loadFile('/../data/h_13.json'));

  console.log(`Cheminfo All: ${dataset1.length}`);
  console.log(`MayBridge All: ${dataset2.length}`);
  console.log(`Other All: ${dataset3.length}`);

  // Keep the test molecules (and the labile ones) out of the training data.
  const removed = removeExcludedMolecules(datasets, testSet);
  const trainDataset = [].concat(...datasets);

  console.log(`Cheminfo Final: ${dataset1.length}`);
  console.log(`MayBridge Final: ${dataset2.length}`);
  console.log(`Other Final: ${dataset3.length}`);
  console.log(
    `Overlaped molecules: ${removed}. They were removed from training datasets`
  );

  // Run the learning process. After each iteration the system has seen every
  // single molecule once. Another stop criterion (convergence) is still to be
  // implemented; see the disabled check at the end of the loop.
  let prevError = 0;
  let prevCont = 0;
  let iteration = 14;
  let convergence = false;
  try {
    while (iteration < maxIterations && !convergence) {
      const iterationStart = Date.now();
      // NOTE(review): nothing in this script ever increments this counter, so
      // the "New entries" log line below always reports 0 — confirm intent.
      const count = 0;
      predictor.setDb(fastDB, 'proton', 'proton');

      // A fresh options object is built for every molecule, as before.
      const results = await Promise.all(
        trainDataset.map((molecule) =>
          autoassigner(molecule, {
            minScore: 1,
            unassigned: 1,
            maxSolutions: 2500,
            timeout: 2000,
            errorCS: -0.01,
            predictor: predictor,
            condensed: true,
            OCLE: OCLE,
            levels: [5],
            use: 'median',
            ignoreLabile: ignoreLabile,
            learningRatio: learningRatio,
            iteration: iteration
          })
        )
      );

      for (let i = 0; i < results.length; i++) {
        const result = results[i];
        // A missing solution list is treated as "no solutions" instead of
        // throwing on `.length`.
        const solutions = result.getAssignments() || [];
        if (
          result.timeoutTerminated ||
          result.nSolutions > solutions.length
        ) {
          console.log(`${i} Too much solutions`);
          continue;
        }
        // Only save the last state: positions on which the solutions disagree
        // are stored as '*'.
        result.setAssignmentOnSample(
          trainDataset[i],
          collapseAmbiguousAssignments(solutions)
        );
      }

      // Create the fast prediction table. It contains the prediction at the
      // last iteration.
      fastDB = compilePredictionTable(trainDataset, { iteration, OCLE }).H;
      predictor.setDb(fastDB, 'proton', 'proton');
      FS.writeFileSync(
        `${__dirname}/../data/h_${iteration}.json`,
        JSON.stringify(fastDB)
      );
      // Number of keys stored under table entries 1 to 5.
      console.log(
        [1, 2, 3, 4, 5]
          .map((level) => Object.keys(fastDB[level]).length)
          .join(' ')
      );

      const trainingEnd = Date.now();
      console.log(`Iteration ${iteration}`);
      console.log(`Time ${trainingEnd - iterationStart}`);
      console.log(`New entries in the db: ${count}`);

      // Evaluate the error on the test set.
      const histParams = { from: 0, to: 1, nBins: 30 };
      const error = await stats.cmp2asg(testSet, predictor, {
        db: fastDB,
        dataset: testSet,
        ignoreLabile: ignoreLabile,
        histParams: histParams,
        levels: [5, 4, 3, 2],
        use: 'median',
        OCLE: OCLE
      });
      const compareEnd = Date.now();
      console.log(
        `Error: ${error.error} count: ${error.count} min: ${error.min} max: ${
          error.max
        }`
      );

      // One CSV-like line per histogram bin: x, y, y/count, running sum of y/count.
      const bins = error.hist;
      let sumHist = 0;
      for (let k = 0; k < bins.length; k++) {
        const fraction = bins[k].y / error.count;
        sumHist += fraction;
        console.log(`${bins[k].x},${bins[k].y},${fraction},${sumHist}`);
      }
      console.log(`Time comparing ${compareEnd - trainingEnd}`);

      // Convergence detection is disabled on purpose (kept as in the original
      // script); the comparison now uses the numeric error value rather than
      // the whole result object.
      if (prevCont === count && prevError <= error.error) {
        // convergence = true;
      }
      prevCont = count;
      prevError = error.error;
      iteration++;
    }
  } catch (e) {
    // Best effort: report the failure and fall through to the final message.
    console.log(e);
  }
  console.log('Done');
}
// Kick off the training. start() only guards its iteration loop, so failures
// raised earlier (e.g. while reading or parsing the input files) reject the
// returned promise; report them here and signal the failure in the exit code
// instead of leaving the rejection unhandled.
start().catch((e) => {
  console.error(e);
  process.exitCode = 1;
});