UNPKG

sdf-parser

Version:
217 lines (200 loc) 6.73 kB
import { ensureString } from 'ensure-string';

import { getEntriesBoundaries } from './getEntriesBoundaries.ts';
import type { LabelInfo } from './util/getMolecule.ts';
import { getMolecule } from './util/getMolecule.ts';

/**
 * A parsed SDF molecule entry. The `molfile` field contains the raw molfile
 * string. Additional fields are populated from the SDF `> <field>` sections.
 */
export interface Molecule {
  /** The raw V2000/V3000 molfile block. */
  molfile: string;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  [label: string]: any;
}

/**
 * Options for the {@link parse} function.
 */
export interface ParseOptions {
  /**
   * Modifier functions applied to field values after parsing. The function
   * receives the raw string value and may return a transformed value. Returning
   * `undefined` or `null` removes the field from the molecule.
   */
  modifiers?: Record<string, (value: string) => unknown>;
  /**
   * Per-field callbacks, keyed by field name. The map is forwarded unchanged
   * to the molecule parser.
   * NOTE(review): `parse` does not copy these callbacks into the
   * {@link LabelStatistic} objects it returns.
   */
  forEach?: Record<string, (value: unknown) => void>;
  /**
   * When `true`, numeric string values are automatically converted to numbers.
   * @default true
   */
  dynamicTyping?: boolean;
  /**
   * End-of-line character. Auto-detected from the file content when not set:
   * `'\r\n'` if that sequence occurs in the first 1000 characters (and
   * `mixedEOL` is not set), otherwise `'\n'`. When `mixedEOL` is set, an
   * explicit `'\r\n'` is treated as `'\n'` because the input no longer
   * contains `'\r\n'` after normalisation.
   * @default '\n'
   */
  eol?: string;
  /**
   * When `true`, normalises all `\r\n` sequences to `\n` before parsing.
   * Useful for SDF files with Windows-style line endings.
   * @default false
   */
  mixedEOL?: boolean;
  /**
   * Only include fields whose names appear in this list.
   * When combined with `exclude`, the field must satisfy both constraints.
   */
  include?: string[];
  /**
   * Exclude fields whose names appear in this list.
   * When combined with `include`, the field must satisfy both constraints.
   */
  exclude?: string[];
  /**
   * A predicate function to filter molecules. Only molecules for which this
   * function returns `true` are included in the result.
   */
  filter?: (molecule: Molecule) => boolean;
}

/**
 * Statistics for a single SDF field label, as returned in
 * {@link ParseResult.statistics}.
 */
export interface LabelStatistic {
  /** Field label name. */
  label: string;
  /**
   * Number of molecules that contain this field. Only molecules that pass
   * the `filter` option are counted.
   */
  counter: number;
  /** Whether all parsed values are numeric. */
  isNumeric: boolean;
  /** Whether this field is included in the output (not excluded). */
  keep: boolean;
  /**
   * Minimum numeric value, only set when `isNumeric` is `true`. Stays at
   * `Infinity` when no molecule in the result carries a value for the field.
   */
  minValue?: number;
  /**
   * Maximum numeric value, only set when `isNumeric` is `true`. Stays at
   * `-Infinity` when no molecule in the result carries a value for the field.
   */
  maxValue?: number;
  /** Whether every molecule in the result contains this field. */
  always: boolean;
}

/**
 * Return value of the {@link parse} function.
 */
export interface ParseResult {
  /**
   * Wall-clock time taken to parse, in milliseconds, measured from entry
   * into {@link parse} until the result object is built.
   */
  time: number;
  /** Parsed molecule entries. */
  molecules: Molecule[];
  /**
   * All field label names found in the file, in the key order of the
   * internal label table (`Object.keys`); the list is not explicitly sorted.
   */
  labels: string[];
  /** Per-label statistics. */
  statistics: LabelStatistic[];
}

/**
 * Synchronously parse an SDF file into an array of molecule objects.
 * @param sdf - The SDF content as a string, `ArrayBuffer`, or `ArrayBufferView`.
 * @param options - Parsing options. The object passed in is not modified.
 * @returns A {@link ParseResult} containing molecules and statistics.
 * @throws {TypeError} When `sdf` cannot be converted to a string.
 * @example
 * ```ts
 * import { readFileSync } from 'node:fs';
 * import { parse } from 'sdf-parser';
 *
 * const sdf = readFileSync('compounds.sdf', 'utf8');
 * const { molecules, statistics } = parse(sdf);
 * ```
 */
export function parse(sdf: unknown, options: ParseOptions = {}): ParseResult {
  // Start the clock first so that `time` also covers string conversion,
  // EOL normalisation and entry-boundary detection.
  const start = Date.now();

  const {
    modifiers = {},
    forEach: forEachMap = {},
    dynamicTyping = true,
    mixedEOL,
    include,
    exclude,
  } = options;

  // ensureString converts ArrayBuffer/ArrayBufferView to string
  const sdfString = ensureString(sdf as Parameters<typeof ensureString>[0]);
  if (typeof sdfString !== 'string') {
    throw new TypeError('Parameter "sdf" must be a string');
  }

  let eol = options.eol;
  if (eol === undefined) {
    // Sniff only the head of the file; with mixedEOL the text is normalised
    // to '\n' below, so '\r\n' is never the right separator in that case.
    eol =
      !mixedEOL && sdfString.slice(0, 1000).includes('\r\n') ? '\r\n' : '\n';
  }

  let workingSdf = sdfString;
  if (mixedEOL) {
    workingSdf = workingSdf.replaceAll('\r\n', '\n');
    // An explicit '\r\n' would no longer match anything in the normalised
    // text, so no entry separator would ever be found. Fall back to '\n'.
    if (eol === '\r\n') eol = '\n';
  }

  const entriesBoundaries = getEntriesBoundaries(workingSdf, `${eol}$$$$`, eol);

  const molecules: Molecule[] = [];
  const labels: Record<string, LabelInfo> = {};

  for (const boundary of entriesBoundaries) {
    const sdfPart = workingSdf.slice(...boundary);
    // Entries shorter than 40 characters are skipped.
    if (sdfPart.length < 40) continue;

    const currentLabels: string[] = [];
    const molecule = getMolecule(sdfPart, labels, currentLabels, {
      eol,
      dynamicTyping,
      modifiers,
      forEach: forEachMap,
      include,
      exclude,
    });
    if (!molecule) continue;

    if (!options.filter || options.filter(molecule)) {
      molecules.push(molecule);
      // Only molecules that end up in the result contribute to the counters.
      for (const label of currentLabels) {
        labels[label].counter++;
      }
    }
  }

  // Convert all numeric fields to numbers and compute min/max per label.
  for (const [label, info] of Object.entries(labels)) {
    if (!info.isNumeric) continue;

    let min = Infinity;
    let max = -Infinity;
    for (const molecule of molecules) {
      const raw = molecule[label];
      // Skip missing/empty values, but keep a numeric zero: a plain
      // truthiness test would drop 0 from the min/max statistics.
      if (!raw && raw !== 0) continue;

      const value = Number.parseFloat(raw);
      molecule[label] = value;
      if (value > max) max = value;
      if (value < min) min = value;
    }
    info.minValue = min;
    info.maxValue = max;
  }

  const statistics: LabelStatistic[] = [];
  for (const [label, info] of Object.entries(labels)) {
    // A label is "always" present when every kept molecule carried it.
    info.always = info.counter === molecules.length;
    statistics.push({
      label,
      counter: info.counter,
      isNumeric: info.isNumeric,
      keep: info.keep,
      minValue: info.minValue,
      maxValue: info.maxValue,
      always: info.always ?? false,
    });
  }

  return {
    time: Date.now() - start,
    molecules,
    labels: Object.keys(labels),
    statistics,
  };
}