UNPKG

sdf-parser

Version:
305 lines (286 loc) 9.39 kB
'use strict';

var ensureString = require('ensure-string');
var dynamicTyping = require('dynamic-typing');

/**
 * Compute the [start, end) offsets of every entry of `string`, where entries
 * are separated by `substring` followed by the rest of that line up to `eol`.
 * @param {string} string - Full text to split
 * @param {string} substring - Delimiter marking the end of an entry (here eol + '$$$$')
 * @param {string} eol - End of line sequence terminating the delimiter line
 * @returns {Array<[number, number]>} Pairs of offsets usable with `string.slice(start, end)`
 */
function getEntriesBoundaries(string, substring, eol) {
  const res = [];
  let previous = 0;
  let next = 0;
  while (next !== -1) {
    next = string.indexOf(substring, previous);
    if (next !== -1) {
      res.push([previous, next]);
      // the next entry starts after the eol that terminates the delimiter line
      const nextMatch = string.indexOf(eol, next + substring.length);
      if (nextMatch === -1) {
        // the delimiter is the last line of the text: nothing left to scan
        next = -1;
      } else {
        previous = nextMatch + eol.length;
        next = previous;
      }
    } else {
      // no more delimiter: the remaining text is the last entry
      res.push([previous, string.length]);
    }
  }
  return res;
}

/**
 * Tell whether a field value is compatible with a numeric column.
 * Strings must coerce to a finite number and must not start with a zero
 * followed by a digit (e.g. '007'). Finite numbers (possibly returned by a
 * modifier) are accepted; any other type is rejected.
 * @param {*} value - Field value, possibly already transformed by a modifier
 * @returns {boolean}
 */
function isNumericCandidate(value) {
  if (typeof value === 'number') return Number.isFinite(value);
  // calling string methods on a non-string would throw
  if (typeof value !== 'string') return false;
  return Number.isFinite(+value) && !/^0[0-9]/.test(value);
}

/**
 * Parse the molfile and the properties with > < labels >
 * @param {string} sdfPart - Text of one entry (without the '$$$$' line)
 * @param {object} labels - Shared per-label information, updated in place
 * @param {string[]} currentLabels - Receives the labels found in this entry
 * @param {object} options - Normalized options as prepared by `parse`
 * @returns {object|undefined} The molecule, or undefined if the molfile part
 * is 5 characters or shorter
 */
function getMolecule$1(sdfPart, labels, currentLabels, options) {
  const parts = sdfPart.split(`${options.eol}>`);
  if (parts.length === 0 || parts[0].length <= 5) return;
  const molecule = {};
  molecule.molfile = parts[0] + options.eol;
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(options.eol);
    // the label is the text between the first '<' and the first '>' of the header line
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].slice(from + 1, to);
    currentLabels.push(label);
    if (!labels[label]) {
      labels[label] = {
        counter: 0,
        isNumeric: options.dynamicTyping,
        keep: false,
      };
      if (
        (!options.exclude || !options.exclude.includes(label)) &&
        (!options.include || options.include.includes(label))
      ) {
        labels[label].keep = true;
        if (options.modifiers[label]) {
          labels[label].modifier = options.modifiers[label];
        }
        if (options.forEach[label]) {
          labels[label].forEach = options.forEach[label];
        }
      }
    }
    if (labels[label].keep) {
      // the last line of the block is skipped; the other lines are joined with eol
      for (let k = 1; k < lines.length - 1; k++) {
        if (molecule[label]) {
          molecule[label] += options.eol + lines[k];
        } else {
          molecule[label] = lines[k];
        }
      }
      if (labels[label].modifier) {
        const modifiedValue = labels[label].modifier(molecule[label]);
        if (modifiedValue === undefined || modifiedValue === null) {
          delete molecule[label];
        } else {
          molecule[label] = modifiedValue;
        }
      }
      // one non numeric value is enough to make the whole label non numeric
      if (labels[label].isNumeric && !isNumericCandidate(molecule[label])) {
        labels[label].isNumeric = false;
      }
    }
  }
  return molecule;
}

/**
 * Parse a SDF file
 * @param {string|ArrayBuffer|Uint8Array} sdf - SDF file to parse
 * @param {object} [options={}]
 * @param {string[]} [options.include] - List of fields to include
 * @param {string[]} [options.exclude] - List of fields to exclude
 * @param {Function} [options.filter] - Callback allowing to filter the molecules
 * @param {boolean} [options.dynamicTyping] - Dynamically type the data
 * @param {object} [options.modifiers] - Object containing callbacks to apply on some specific fields
 * @param {boolean} [options.mixedEOL=false] - Set to true if you know there is a mixture between \r\n and \n
 * @param {string} [options.eol] - Specify the end of line character. Default will be the one found in the file
 * @returns {object} - Object containing the molecules, the labels and the statistics
 * @throws {TypeError} If `sdf` cannot be converted to a string
 */
function parse(sdf, options = {}) {
  options = { ...options };
  if (options.modifiers === undefined) options.modifiers = {};
  if (options.forEach === undefined) options.forEach = {};
  if (options.dynamicTyping === undefined) options.dynamicTyping = true;

  sdf = ensureString.ensureString(sdf);
  if (typeof sdf !== 'string') {
    throw new TypeError('Parameter "sdf" must be a string');
  }

  if (options.eol === undefined) {
    options.eol = '\n';
    if (options.mixedEOL) {
      sdf = sdf.replaceAll('\r\n', '\n');
      sdf = sdf.replaceAll('\r', '\n');
    } else {
      // we will find the delimiter in order to be much faster and not use regular expression
      // substring search is required: a Set built from a string only holds
      // single characters and could never contain the two characters '\r\n'
      const header = sdf.slice(0, 1000);
      if (header.includes('\r\n')) {
        options.eol = '\r\n';
      } else if (header.includes('\r')) {
        options.eol = '\r';
      }
    }
  }

  const entriesBoundaries = getEntriesBoundaries(
    sdf,
    `${options.eol}$$$$`,
    options.eol,
  );
  const molecules = [];
  const labels = {};

  const start = Date.now();

  for (let i = 0; i < entriesBoundaries.length; i++) {
    const sdfPart = sdf.slice(...entriesBoundaries[i]);
    // entries shorter than 40 characters are ignored
    if (sdfPart.length < 40) continue;
    const currentLabels = [];
    const molecule = getMolecule$1(sdfPart, labels, currentLabels, options);
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      molecules.push(molecule);
      // only now we can increase the counter
      for (let j = 0; j < currentLabels.length; j++) {
        labels[currentLabels[j]].counter++;
      }
    }
  }

  // all numeric fields should be converted to numbers
  for (const label in labels) {
    const currentLabel = labels[label];
    if (currentLabel.isNumeric) {
      currentLabel.minValue = Infinity;
      currentLabel.maxValue = -Infinity;
      for (let j = 0; j < molecules.length; j++) {
        if (molecules[j][label]) {
          const value = Number.parseFloat(molecules[j][label]);
          molecules[j][label] = value;
          if (value > currentLabel.maxValue) {
            currentLabel.maxValue = value;
          }
          if (value < currentLabel.minValue) {
            currentLabel.minValue = value;
          }
        }
      }
    }
  }

  // we check that a label is in all the records
  for (const key in labels) {
    labels[key].always = labels[key].counter === molecules.length;
  }

  const statistics = [];
  for (const key in labels) {
    const statistic = labels[key];
    statistic.label = key;
    statistics.push(statistic);
  }

  return {
    time: Date.now() - start,
    molecules,
    labels: Object.keys(labels),
    statistics,
  };
}

/**
 * Transform stream that receives text chunks of a SDF file and enqueues one
 * string per entry (the text preceding each '$$$$' line, without its trailing
 * end of line). Entries of 40 characters or less are dropped.
 * Both '\n' and '\r\n' terminated delimiter lines are recognized.
 */
class MolfileStream extends TransformStream {
  #buffer = '';

  constructor() {
    super({
      transform: (chunk, controller) => {
        this.#buffer += chunk;
        let begin = 0;
        let index = 0;
        while ((index = this.#buffer.indexOf('$$$$', index)) !== -1) {
          // we need to check if the delimiter '\n' is in the current buffer
          // if it is not we need to wait for the next chunk
          const endOfDelimiter = this.#buffer.indexOf('\n', index);
          if (endOfDelimiter === -1) {
            index = begin;
            break;
          }
          const eolLength = this.#buffer[endOfDelimiter - 1] === '\r' ? 2 : 1;
          // need to remove the last eol because we will split on eol+'>' in getMolecule
          if (index - eolLength - begin > 40) {
            controller.enqueue(this.#buffer.slice(begin, index - eolLength));
          }
          // endOfDelimiter is the position of '\n' itself, so the next entry
          // starts on the following character whatever the eol length is
          index = endOfDelimiter + 1;
          begin = index;
        }
        // keep the unfinished entry for the next chunk
        this.#buffer = this.#buffer.slice(begin);
      },
      flush: (controller) => {
        if (this.#buffer && this.#buffer.length > 40) {
          controller.enqueue(this.#buffer);
        }
      },
    });
  }
}

/**
 * Parse a SDF file as an iterator
 * @param {ReadableStream} readStream - SDF file to parse
 * @param {object} [options={}] - iterator options
 * @param {Function} [options.filter] - Callback allowing to filter the molecules
 * @param {string} [options.eol='\n'] - End of line character
 * @param {boolean} [options.dynamicTyping] - Dynamically type the data
 * @yields {object} - Molecule object
 */
async function* iterator(readStream, options = {}) {
  const { eol = '\n', dynamicTyping = true } = options;
  const moleculeStream = readStream.pipeThrough(new MolfileStream());
  for await (const entry of moleculeStream) {
    const molecule = getMolecule(entry, {
      eol,
      dynamicTyping,
    });
    // entries without a usable molfile are skipped, like in parse
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      yield molecule;
    }
  }
}

/**
 * Convert a SDF part to an object
 * @param {string} sdfPart - text containing the molfile
 * @param {object} options - options
 * @param {string} options.eol - end of line character
 * @param {boolean} options.dynamicTyping - Dynamically type the data (create numbers and booleans)
 * @returns {object|undefined} The molecule, or undefined if the molfile part
 * is 5 characters or shorter
 */
function getMolecule(sdfPart, options) {
  const { eol, dynamicTyping: dynamicTyping$1 } = options;
  const parts = sdfPart.split(`${eol}>`);
  if (parts.length === 0 || parts[0].length <= 5) return;
  const molecule = {};
  molecule.molfile = parts[0] + eol;
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    // the label is the text between the first '<' and the first '>' of the header line
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].slice(from + 1, to);
    // the last line of the block is skipped; the other lines are joined with eol
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }
    if (dynamicTyping$1) {
      molecule[label] = dynamicTyping.parseString(molecule[label]);
    }
  }
  return molecule;
}

exports.MolfileStream = MolfileStream;
exports.iterator = iterator;
exports.parse = parse;