// Package: sdf-parser
// Listing metadata: 217 lines (200 loc), 6.73 kB, text/typescript
import { ensureString } from 'ensure-string';
import { getEntriesBoundaries } from './getEntriesBoundaries.ts';
import type { LabelInfo } from './util/getMolecule.ts';
import { getMolecule } from './util/getMolecule.ts';
/**
* A parsed SDF molecule entry.
*
* `molfile` always holds the raw molfile string. Every other key is an SDF
* field label taken from a `> <field>` section of the entry.
*/
export interface Molecule {
/** The raw V2000/V3000 molfile block. */
molfile: string;
// Field values keyed by their SDF label. Typed as `any` because a value can be
// the raw string, a number (`parse` converts the values of labels flagged as
// numeric), or whatever type a user-supplied modifier produced.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
[label: string]: any;
}
/**
* Options for the {@link parse} function. Every property is optional.
*/
export interface ParseOptions {
/**
* Modifier functions keyed by field label, applied to field values after
* parsing. The function receives the raw string value and may return a
* transformed value. Returning `undefined` or `null` removes the field from
* the molecule.
* @default {}
*/
modifiers?: Record<string, (value: string) => unknown>;
/**
* Callback functions keyed by field label. `parse` forwards this map
* unchanged to the per-entry molecule reader; the callbacks are not part of
* the returned statistics.
* @default {}
*/
forEach?: Record<string, (value: unknown) => void>;
/**
* When `true`, numeric string values are automatically converted to numbers.
* @default true
*/
dynamicTyping?: boolean;
/**
* End-of-line sequence used to split entries and lines.
* When not set it is chosen automatically: `'\r\n'` if that sequence occurs
* within the first 1000 characters of the input and `mixedEOL` is not
* enabled, otherwise `'\n'`.
* @default '\n'
*/
eol?: string;
/**
* When `true`, normalises all `\r\n` sequences to `\n` before parsing.
* Useful for SDF files that mix Windows-style and Unix-style line endings.
* End-of-line auto-detection is skipped in this mode: when `eol` is not
* given, `'\n'` is used.
* @default false
*/
mixedEOL?: boolean;
/**
* Only include fields whose names appear in this list.
* When combined with `exclude`, the field must satisfy both constraints.
*/
include?: string[];
/**
* Exclude fields whose names appear in this list.
* When combined with `include`, the field must satisfy both constraints.
*/
exclude?: string[];
/**
* A predicate function to filter molecules. Only molecules for which this
* function returns a truthy value are included in the result; rejected
* molecules do not contribute to the per-label counters.
*/
filter?: (molecule: Molecule) => boolean;
}
/**
* Statistics for a single SDF field label, as returned in
* {@link ParseResult.statistics}.
*/
export interface LabelStatistic {
/** Field label name. */
label: string;
/**
* Number of molecules kept in the result (i.e. that passed the optional
* filter) that contain this field.
*/
counter: number;
/** Whether all parsed values are numeric. */
isNumeric: boolean;
/** Whether this field is included in the output (not excluded). */
keep: boolean;
/** Minimum numeric value, only set when `isNumeric` is `true`. */
minValue?: number;
/** Maximum numeric value, only set when `isNumeric` is `true`. */
maxValue?: number;
/**
* Whether every molecule in the result contains this field, i.e. `counter`
* equals the number of returned molecules.
*/
always: boolean;
}
/**
* Return value of the {@link parse} function.
*/
export interface ParseResult {
/** Elapsed parsing time in milliseconds, measured with `Date.now()`. */
time: number;
/** Parsed molecule entries that passed the optional filter. */
molecules: Molecule[];
/**
* Names of all field labels found in the file. The list is the key list of
* the internal label table (`Object.keys` order); it is not explicitly
* sorted.
*/
labels: string[];
/** Per-label statistics, one entry per label. */
statistics: LabelStatistic[];
}
/**
 * Synchronously parse an SDF file into an array of molecule objects.
 *
 * The input is converted to a string, the end-of-line sequence is selected,
 * the text is split into entries on the `$$$$` delimiter, and every entry is
 * handed to `getMolecule`. Once all entries are read, the values of labels
 * still flagged as numeric are converted to numbers and per-label statistics
 * are assembled.
 *
 * @param sdf - The SDF content as a string, `ArrayBuffer`, or `ArrayBufferView`.
 * @param options - Parsing options. The caller's object is never mutated.
 * @returns A {@link ParseResult} containing molecules and statistics.
 * @throws TypeError when the input cannot be turned into a string.
 * @example
 * ```ts
 * import { readFileSync } from 'node:fs';
 * import { parse } from 'sdf-parser';
 *
 * const sdf = readFileSync('compounds.sdf', 'utf8');
 * const { molecules, statistics } = parse(sdf);
 * ```
 */
export function parse(sdf: unknown, options: ParseOptions = {}): ParseResult {
  // Taken first so that the reported `time` covers the whole call, including
  // string conversion, EOL normalisation and entry-boundary detection.
  const start = Date.now();

  const modifiers = options.modifiers ?? {};
  const forEachMap = options.forEach ?? {};
  const dynamicTyping = options.dynamicTyping ?? true;
  const mixedEOL = Boolean(options.mixedEOL);

  // ensureString converts ArrayBuffer/ArrayBufferView to string
  const sdfString = ensureString(sdf as Parameters<typeof ensureString>[0]);
  if (typeof sdfString !== 'string') {
    throw new TypeError('Parameter "sdf" must be a string');
  }

  let eol = options.eol;
  if (eol === undefined) {
    // Auto-detection only inspects the head of the file to stay cheap.
    eol =
      !mixedEOL && sdfString.slice(0, 1000).includes('\r\n') ? '\r\n' : '\n';
  } else if (mixedEOL && eol === '\r\n') {
    // After normalisation no "\r\n" is left in the text, so splitting on it
    // could never match; fall back to the normalised terminator.
    eol = '\n';
  }

  const workingSdf = mixedEOL
    ? sdfString.replaceAll('\r\n', '\n')
    : sdfString;

  const entriesBoundaries = getEntriesBoundaries(workingSdf, `${eol}$$$$`, eol);
  const molecules: Molecule[] = [];
  // Null-prototype dictionary: the keys are field names read from the file, so
  // names such as "__proto__" or "constructor" must behave like ordinary keys
  // instead of touching Object.prototype.
  const labels = Object.create(null) as Record<string, LabelInfo>;

  for (const boundary of entriesBoundaries) {
    const sdfPart = workingSdf.slice(...boundary);
    // Fragments under 40 characters are ignored (presumably too short to be a
    // real entry, e.g. trailing whitespace after the last delimiter).
    if (sdfPart.length < 40) continue;

    const currentLabels: string[] = [];
    const molecule = getMolecule(sdfPart, labels, currentLabels, {
      eol,
      dynamicTyping,
      modifiers,
      forEach: forEachMap,
      include: options.include,
      exclude: options.exclude,
    });
    if (!molecule) continue;
    if (options.filter && !options.filter(molecule)) continue;

    molecules.push(molecule);
    // De-duplicate so that a label repeated inside one entry is counted once:
    // `counter` is a number of molecules, and `always` relies on that.
    for (const label of new Set(currentLabels)) {
      labels[label].counter++;
    }
  }

  // Convert all numeric fields and compute min/max
  for (const [label, info] of Object.entries(labels)) {
    if (!info.isNumeric) continue;

    let min = Infinity;
    let max = -Infinity;
    for (const molecule of molecules) {
      const raw: unknown = molecule[label];
      // Skip missing / empty values, but keep a numeric zero (which is falsy).
      if (!raw && raw !== 0) continue;
      const value = Number.parseFloat(String(raw));
      molecule[label] = value;
      if (value > max) max = value;
      if (value < min) min = value;
    }

    // Only publish a range when at least one comparable value was seen, so the
    // Infinity sentinels never leak into the statistics.
    if (min <= max) {
      info.minValue = min;
      info.maxValue = max;
    }
  }

  const statistics: LabelStatistic[] = [];
  for (const [label, info] of Object.entries(labels)) {
    info.always = info.counter === molecules.length;
    statistics.push({
      label,
      counter: info.counter,
      isNumeric: info.isNumeric,
      keep: info.keep,
      minValue: info.minValue,
      maxValue: info.maxValue,
      always: info.always,
    });
  }

  return {
    time: Date.now() - start,
    molecules,
    labels: Object.keys(labels),
    statistics,
  };
}