// sdf-parser
// Version:
// 305 lines (286 loc) • 9.39 kB
// JavaScript
;
var ensureString = require('ensure-string');
var dynamicTyping = require('dynamic-typing');
/**
 * Compute the [start, end) index pairs of the records found in a text.
 *
 * A record ends where `substring` starts. The following record starts right
 * after the first `eol` found behind that delimiter. When the text does not
 * finish with a delimiter, the remaining characters form a last record.
 * @param {string} string - Text to scan
 * @param {string} substring - Delimiter that closes a record
 * @param {string} eol - End of line sequence expected after the delimiter
 * @returns {Array<Array<number>>} List of `[start, end]` pairs usable with `slice`
 */
function getEntriesBoundaries(string, substring, eol) {
  const boundaries = [];
  let start = 0;
  for (;;) {
    const delimiterIndex = string.indexOf(substring, start);
    if (delimiterIndex === -1) {
      // No more delimiter: whatever is left is the last record.
      boundaries.push([start, string.length]);
      break;
    }
    boundaries.push([start, delimiterIndex]);
    const eolIndex = string.indexOf(eol, delimiterIndex + substring.length);
    // A delimiter that is not followed by an end of line closes the text.
    if (eolIndex === -1) break;
    start = eolIndex + eol.length;
  }
  return boundaries;
}
/**
 * Parse one SDF record: the molfile followed by its `> <label>` data fields.
 *
 * Besides building the molecule, this function maintains the shared `labels`
 * registry: a label seen for the first time gets an entry
 * `{ counter: 0, isNumeric, keep }` (plus `modifier` / `forEach` when defined
 * in the options), and `isNumeric` is switched off as soon as one kept value
 * does not look like a plain number.
 * @param {string} sdfPart - Text of a single record
 * @param {object} labels - Registry of label information, updated in place
 * @param {string[]} currentLabels - Array receiving every label found in this record
 * @param {object} options
 * @param {string} options.eol - End of line sequence used in `sdfPart`
 * @param {boolean} options.dynamicTyping - Initial value of `isNumeric` for new labels
 * @param {string[]} [options.include] - When defined, only these labels are kept
 * @param {string[]} [options.exclude] - Labels that must not be kept
 * @param {object} options.modifiers - Callbacks, by label, applied on the field value
 * @param {object} options.forEach - Callbacks, by label, copied in the label information
 * @returns {object|undefined} The molecule, or `undefined` when the molfile part
 *   is 5 characters long or shorter
 */
function getMolecule$1(sdfPart, labels, currentLabels, options) {
  const { eol } = options;
  const parts = sdfPart.split(`${eol}>`);
  if (parts[0].length <= 5) return;
  const molecule = { molfile: parts[0] + eol };
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    const header = lines[0];
    // The label is the text between the first '<' and the first '>'.
    const label = header.slice(header.indexOf('<') + 1, header.indexOf('>'));
    currentLabels.push(label);
    if (!labels[label]) {
      labels[label] = {
        counter: 0,
        isNumeric: options.dynamicTyping,
        keep: false,
      };
      const isExcluded = options.exclude && options.exclude.includes(label);
      const isIncluded = !options.include || options.include.includes(label);
      if (!isExcluded && isIncluded) {
        labels[label].keep = true;
        if (options.modifiers[label]) {
          labels[label].modifier = options.modifiers[label];
        }
        if (options.forEach[label]) {
          labels[label].forEach = options.forEach[label];
        }
      }
    }
    const info = labels[label];
    if (!info.keep) continue;
    // The last line of the field block is never part of the value.
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }
    if (info.modifier) {
      const modifiedValue = info.modifier(molecule[label]);
      if (modifiedValue === undefined || modifiedValue === null) {
        delete molecule[label];
      } else {
        molecule[label] = modifiedValue;
      }
    }
    if (info.isNumeric) {
      const value = molecule[label];
      // A modifier may return something else than a string, so the pattern
      // test is only applied on strings. Strings starting with a zero followed
      // by a digit (like "007") are kept as text.
      const staysNumeric =
        typeof value === 'string'
          ? Number.isFinite(+value) && !/^0[0-9]/.test(value)
          : typeof value === 'number' && Number.isFinite(value);
      if (!staysNumeric) info.isNumeric = false;
    }
  }
  return molecule;
}
/**
 * Parse a SDF file
 * @param {string|ArrayBuffer|Uint8Array} sdf - SDF file to parse
 * @param {object} [options={}]
 * @param {string[]} [options.include] - List of fields to include
 * @param {string[]} [options.exclude] - List of fields to exclude
 * @param {Function} [options.filter] - Callback allowing to filter the molecules
 * @param {boolean} [options.dynamicTyping=true] - Dynamically type the data
 * @param {object} [options.modifiers] - Object containing callbacks to apply on some specific fields
 * @param {object} [options.forEach] - Object containing callbacks, by field, stored in the statistics of that field
 * @param {boolean} [options.mixedEOL=false] - Set to true if you know there is a mixture between \r\n and \n. Only used when `options.eol` is not specified
 * @param {string} [options.eol] - Specify the end of line character. Default will be the one found in the first 1000 characters of the file
 * @returns {object} - Object containing the molecules, the labels, the statistics and the elapsed time in milliseconds
 * @throws {TypeError} If `sdf` can not be converted to a string
 */
function parse(sdf, options = {}) {
  // Work on a copy so that the defaults never leak in the caller's object.
  options = { ...options };
  if (options.modifiers === undefined) options.modifiers = {};
  if (options.forEach === undefined) options.forEach = {};
  if (options.dynamicTyping === undefined) options.dynamicTyping = true;
  sdf = ensureString.ensureString(sdf);
  if (typeof sdf !== 'string') {
    throw new TypeError('Parameter "sdf" must be a string');
  }
  if (options.eol === undefined) {
    options.eol = '\n';
    if (options.mixedEOL) {
      sdf = sdf.replaceAll('\r\n', '\n');
      sdf = sdf.replaceAll('\r', '\n');
    } else {
      // We look for the delimiter in the beginning of the file only, in order
      // to be fast and not use a regular expression. The search must be done
      // on the string itself: a Set of its characters can never contain the
      // two-character sequence '\r\n'.
      const header = sdf.slice(0, 1000);
      if (header.includes('\r\n')) {
        options.eol = '\r\n';
      } else if (header.includes('\r')) {
        options.eol = '\r';
      }
    }
  }
  const entriesBoundaries = getEntriesBoundaries(
    sdf,
    `${options.eol}$$$$`,
    options.eol,
  );
  const molecules = [];
  const labels = {};
  const start = Date.now();
  for (const [from, to] of entriesBoundaries) {
    const sdfPart = sdf.slice(from, to);
    if (sdfPart.length < 40) continue;
    const currentLabels = [];
    const molecule = getMolecule$1(sdfPart, labels, currentLabels, options);
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      molecules.push(molecule);
      // only now we can increase the counter
      for (const currentLabel of currentLabels) {
        labels[currentLabel].counter++;
      }
    }
  }
  // all numeric fields should be converted to numbers
  for (const label in labels) {
    const currentLabel = labels[label];
    if (!currentLabel.isNumeric) continue;
    currentLabel.minValue = Infinity;
    currentLabel.maxValue = -Infinity;
    for (const molecule of molecules) {
      if (!molecule[label]) continue;
      const value = Number.parseFloat(molecule[label]);
      molecule[label] = value;
      if (value > currentLabel.maxValue) {
        currentLabel.maxValue = value;
      }
      if (value < currentLabel.minValue) {
        currentLabel.minValue = value;
      }
    }
  }
  // we check that a label is in all the records and build the statistics
  const statistics = [];
  for (const key in labels) {
    const statistic = labels[key];
    statistic.always = statistic.counter === molecules.length;
    statistic.label = key;
    statistics.push(statistic);
  }
  return {
    time: Date.now() - start,
    molecules,
    labels: Object.keys(labels),
    statistics,
  };
}
/**
 * Transform stream cutting a stream of SDF text into individual records.
 *
 * Incoming chunks are appended to an internal string buffer. Each time a
 * `$$$$` delimiter followed by a '\n' is available, the text preceding the
 * delimiter (without its trailing end of line) is emitted, provided it is
 * longer than 40 characters. Both '\n' and '\r\n' line endings are handled.
 */
class MolfileStream extends TransformStream {
  #buffer = '';
  constructor() {
    super({
      transform: (chunk, controller) => {
        this.#buffer += chunk;
        let begin = 0;
        let index = 0;
        while ((index = this.#buffer.indexOf('$$$$', index)) !== -1) {
          // we need to check if the delimiter '\n' is in the current buffer
          // if it is not we need to wait for the next chunk
          const endOfDelimiter = this.#buffer.indexOf('\n', index);
          if (endOfDelimiter === -1) break;
          const eolLength = this.#buffer[endOfDelimiter - 1] === '\r' ? 2 : 1;
          // need to remove the last eol because we will split on eol+'>' in getMolecule
          if (index - eolLength - begin > 40) {
            controller.enqueue(this.#buffer.slice(begin, index - eolLength));
          }
          // '\n' is the last character of both '\n' and '\r\n', so the next
          // record always starts on the character that follows it.
          index = endOfDelimiter + 1;
          begin = index;
        }
        this.#buffer = this.#buffer.slice(begin);
      },
      flush: (controller) => {
        let rest = this.#buffer;
        // A final delimiter that is not followed by '\n' was left in the
        // buffer: it must not be part of the last record.
        const index = rest.indexOf('$$$$');
        if (index !== -1) {
          let end = index;
          if (rest[end - 1] === '\n') end--;
          if (rest[end - 1] === '\r') end--;
          rest = rest.slice(0, end);
        }
        if (rest.length > 40) {
          controller.enqueue(rest);
        }
      },
    });
  }
}
/**
 * Parse a SDF file as an iterator
 *
 * The stream is piped through a `MolfileStream` and every record it emits is
 * converted with `getMolecule`. Records that can not be converted (no
 * molecule returned) are skipped, like in `parse`.
 * @param {ReadableStream} readStream - SDF file to parse
 * @param {object} [options={}] - iterator options
 * @param {Function} [options.filter] - Callback allowing to filter the molecules
 * @param {string} [options.eol='\n'] - End of line character
 * @param {boolean} [options.dynamicTyping=true] - Dynamically type the data
 * @yields {object} - Molecule object
 */
async function* iterator(readStream, options = {}) {
  const { eol = '\n', dynamicTyping = true } = options;
  const moleculeStream = readStream.pipeThrough(new MolfileStream());
  for await (const entry of moleculeStream) {
    const molecule = getMolecule(entry, {
      eol,
      dynamicTyping,
    });
    // getMolecule returns undefined for records without a usable molfile:
    // they must neither reach the filter nor be yielded.
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      yield molecule;
    }
  }
}
/**
 * Convert a SDF part to an object
 *
 * The text preceding the first `eol + '>'` becomes the `molfile` property.
 * Every following block is a data field whose label is the text between '<'
 * and '>' on its first line and whose value is made of the next lines (the
 * last line of the block is never part of the value).
 * @param {string} sdfPart - text containing the molfile
 * @param {object} options - options
 * @param {string} options.eol - end of line character
 * @param {boolean} options.dynamicTyping - Dynamically type the data (create numbers and booleans)
 * @returns {object|undefined} The molecule, or `undefined` when the molfile
 *   part is 5 characters long or shorter
 */
function getMolecule(sdfPart, options) {
  const { eol, dynamicTyping: mustType } = options;
  const parts = sdfPart.split(`${eol}>`);
  if (parts[0].length <= 5) return;
  const molecule = { molfile: parts[0] + eol };
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    const header = lines[0];
    const label = header.slice(header.indexOf('<') + 1, header.indexOf('>'));
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }
    // A field without any value line leaves no string to convert: the parser
    // is only called on actual text.
    if (mustType && typeof molecule[label] === 'string') {
      molecule[label] = dynamicTyping.parseString(molecule[label]);
    }
  }
  return molecule;
}
// Public API of the module, exposed through the CommonJS `exports` object.
exports.MolfileStream = MolfileStream;
exports.iterator = iterator;
exports.parse = parse;