UNPKG

molstar

Version:

A comprehensive macromolecular library.

268 lines (267 loc) 11.3 kB
/**
 * Copyright (c) 2017-2022 mol* contributors, licensed under MIT, See LICENSE file for more info.
 *
 * @author Zepei Xu <xuzepei19950617@gmail.com>
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 * @author Eric E <etongfu@outlook.com>
 */

// NOTES
// When want to created undefined string column, must use
// undefStr = UndefinedColumn(molecule.num_atoms, ColumnType.str)
// but not
// const undefPooledStr = UndefinedColumn(molecule.num_atoms, ColumnType.pooledStr);
// because latter actuall return a column of zeros
import { Column } from '../../../mol-data/db';
import { TokenBuilder, Tokenizer } from '../common/text/tokenizer';
import { TokenColumnProvider as TokenColumn } from '../common/text/column/token';
import { ReaderResult as Result } from '../result';
import { Task, chunkedSubtask } from '../../../mol-task';

const { skipWhitespace, eatValue, markLine, getTokenString, skipStrictWhitespace } = Tokenizer;

/** Returns a fresh @<TRIPOS>MOLECULE record with all counts zeroed and all strings empty. */
function createEmptyMolecule() {
    return {
        mol_name: '',
        num_atoms: 0,
        num_bonds: 0,
        num_subst: 0,
        num_feat: 0,
        num_sets: 0,
        mol_type: '',
        charge_type: '',
        status_bits: '',
        mol_comment: ''
    };
}

/** Parser state shared by the per-section handlers below. */
function State(tokenizer, runtimeCtx) {
    return {
        tokenizer,
        molecule: createEmptyMolecule(),
        runtimeCtx
    };
}

const reWhitespace = /\s+/g;

/**
 * Reads the @<TRIPOS>MOLECULE section into `state.molecule`.
 *
 * The trailing fields (mol_type, charge_type, status_bits, mol_comment) are
 * optional in the MOL2 format; parsing stops early as soon as the next line
 * turns out to be another `@<TRIPOS>` section header.
 */
function handleMolecule(state) {
    const { tokenizer, molecule } = state;

    // Seek forward to the section header.
    while (getTokenString(tokenizer) !== '@<TRIPOS>MOLECULE' && tokenizer.position < tokenizer.data.length) {
        markLine(tokenizer);
    }

    // Line 1: molecule name.
    markLine(tokenizer);
    molecule.mol_name = getTokenString(tokenizer);

    // Line 2: counts — num_atoms [num_bonds [num_subst [num_feat [num_sets]]]].
    markLine(tokenizer);
    const values = getTokenString(tokenizer).trim().split(reWhitespace);
    molecule.num_atoms = parseInt(values[0], 10);
    molecule.num_bonds = parseInt(values[1], 10);
    molecule.num_subst = parseInt(values[2], 10);
    molecule.num_feat = parseInt(values[3], 10);
    molecule.num_sets = parseInt(values[4], 10);

    // Optional lines 3-6; each may instead be the next section header.
    markLine(tokenizer);
    const mol_type = getTokenString(tokenizer);
    if (mol_type.startsWith('@<TRIPOS>')) return;
    molecule.mol_type = mol_type;

    markLine(tokenizer);
    const charge_type = getTokenString(tokenizer);
    if (charge_type.startsWith('@<TRIPOS>')) return;
    molecule.charge_type = charge_type;

    markLine(tokenizer);
    const status_bits = getTokenString(tokenizer);
    if (status_bits.startsWith('@<TRIPOS>')) return;
    molecule.status_bits = status_bits;

    markLine(tokenizer);
    const mol_comment = getTokenString(tokenizer);
    if (mol_comment.startsWith('@<TRIPOS>')) return;
    molecule.mol_comment = mol_comment;
}

/**
 * Just read the columns and get the max count of columns for the **atoms** and **bonds**
 * @param linesToRead The total lines
 * @param tokenIndexColums
 * @param tokenizer
 * @description
 * !!!This function has side effects!!!
 * 1. Called inside of the `chunkedSubtask`
 * 2. It will change the parameters of the `tokenIndexColums` , `tokenizer` and `linesToRead`
 * @returns The max count of columns
 */
function _readColumnsAndGetMaxCount(linesToRead, tokenIndexColums, tokenizer) {
    let maxColumnCount = 0;
    for (let i = 0; i < linesToRead; i++) {
        let tokenIndex = 0;
        skipWhitespace(tokenizer);
        while (true) {
            skipStrictWhitespace(tokenizer);
            tokenizer.tokenStart = tokenizer.position;
            eatValue(tokenizer);
            // An empty token means end of line has been reached.
            if (tokenizer.tokenStart === tokenizer.tokenEnd) break;
            const col = tokenIndexColums[tokenIndex++];
            // Tokens beyond the known columns are counted but not stored.
            if (!col) continue;
            TokenBuilder.addUnchecked(col, tokenizer.tokenStart, tokenizer.tokenEnd);
        }
        if (tokenIndex > maxColumnCount) maxColumnCount = tokenIndex;
        // Pad the columns this line did not provide so all columns stay row-aligned.
        for (let cI = tokenIndex; cI < tokenIndexColums.length; cI++) {
            TokenBuilder.addUnchecked(tokenIndexColums[cI], 0, 0);
        }
    }
    return maxColumnCount + 1;
}

/**
 * Reads the @<TRIPOS>ATOM section.
 *
 * Columns past x/y/z are optional in MOL2; when they were never present in the
 * data (as determined by `maxColumnCount`) the corresponding result column is
 * an Undefined column instead of a token-backed one.
 *
 * @returns the atoms table (count + one column per MOL2 atom field)
 */
async function handleAtoms(state) {
    const { tokenizer, molecule } = state;

    // skip empty lines and '@<TRIPOS>ATOM'
    while (getTokenString(tokenizer) !== '@<TRIPOS>ATOM' && tokenizer.position < tokenizer.data.length) {
        markLine(tokenizer);
    }

    const initialTokenizerPosition = tokenizer.position;
    const initialTokenizerLineNumber = tokenizer.lineNumber;

    // columns
    const atom_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
    const atom_nameTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
    const xTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
    const yTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
    const zTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
    const atom_typeTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
    const subst_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
    const subst_nameTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
    const chargeTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
    const status_bitTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);

    // Fallbacks for optional columns that are absent from the file.
    const undefFloat = Column.Undefined(molecule.num_atoms, Column.Schema.float);
    const undefInt = Column.Undefined(molecule.num_atoms, Column.Schema.int);
    const undefStr = Column.Undefined(molecule.num_atoms, Column.Schema.str);

    tokenizer.position = initialTokenizerPosition;
    tokenizer.lineNumber = initialTokenizerLineNumber;

    let maxColumnCount = 0;
    const tokenIndexToColumn = [
        atom_idTokens,
        atom_nameTokens,
        xTokens,
        yTokens,
        zTokens,
        atom_typeTokens,
        subst_idTokens,
        subst_nameTokens,
        chargeTokens,
        status_bitTokens
    ];

    const { length } = tokenizer;
    let linesAlreadyRead = 0;
    await chunkedSubtask(state.runtimeCtx, 100000, void 0, chunkSize => {
        const linesToRead = Math.min(molecule.num_atoms - linesAlreadyRead, chunkSize);
        maxColumnCount = _readColumnsAndGetMaxCount(linesToRead, tokenIndexToColumn, tokenizer);
        linesAlreadyRead += linesToRead;
        return linesToRead;
    }, ctx => ctx.update({ message: 'Parsing...', current: tokenizer.position, max: length }));

    const ret = {
        count: molecule.num_atoms,
        atom_id: TokenColumn(atom_idTokens)(Column.Schema.int),
        atom_name: TokenColumn(atom_nameTokens)(Column.Schema.str),
        x: TokenColumn(xTokens)(Column.Schema.float),
        y: TokenColumn(yTokens)(Column.Schema.float),
        z: TokenColumn(zTokens)(Column.Schema.float),
        atom_type: maxColumnCount > 5 ? TokenColumn(atom_typeTokens)(Column.Schema.str) : undefStr,
        subst_id: maxColumnCount > 6 ? TokenColumn(subst_idTokens)(Column.Schema.int) : undefInt,
        subst_name: maxColumnCount > 7 ? TokenColumn(subst_nameTokens)(Column.Schema.str) : undefStr,
        charge: maxColumnCount > 8 ? TokenColumn(chargeTokens)(Column.Schema.float) : undefFloat,
        status_bit: maxColumnCount > 9 ? TokenColumn(status_bitTokens)(Column.Schema.str) : undefStr,
    };
    return ret;
}

/**
 * Reads the @<TRIPOS>BOND section.
 *
 * @returns the bonds table; `status_bits` falls back to an Undefined column
 * when the optional fifth field is not present in the data.
 */
async function handleBonds(state) {
    const { tokenizer, molecule } = state;

    while (getTokenString(tokenizer) !== '@<TRIPOS>BOND' && tokenizer.position < tokenizer.data.length) {
        markLine(tokenizer);
    }

    const initialTokenizerPosition = tokenizer.position;
    const initialTokenizerLineNumber = tokenizer.lineNumber;

    // columns
    const bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
    const origin_bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
    const target_bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
    const bondTypeTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
    const status_bitTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);

    tokenizer.position = initialTokenizerPosition;
    tokenizer.lineNumber = initialTokenizerLineNumber;

    let maxColumnCount = 0;
    const tokenIndexToColumn = [
        bond_idTokens,
        origin_bond_idTokens,
        target_bond_idTokens,
        bondTypeTokens,
        status_bitTokens
    ];

    const { length } = tokenizer;
    let linesAlreadyRead = 0;
    await chunkedSubtask(state.runtimeCtx, 100000, void 0, chunkSize => {
        const linesToRead = Math.min(molecule.num_bonds - linesAlreadyRead, chunkSize);
        maxColumnCount = _readColumnsAndGetMaxCount(linesToRead, tokenIndexToColumn, tokenizer);
        linesAlreadyRead += linesToRead;
        return linesToRead;
    }, ctx => ctx.update({ message: 'Parsing...', current: tokenizer.position, max: length }));

    const ret = {
        count: molecule.num_bonds,
        bond_id: TokenColumn(bond_idTokens)(Column.Schema.int),
        origin_atom_id: TokenColumn(origin_bond_idTokens)(Column.Schema.int),
        target_atom_id: TokenColumn(target_bond_idTokens)(Column.Schema.int),
        bond_type: TokenColumn(bondTypeTokens)(Column.Schema.str),
        status_bits: maxColumnCount > 4
            ? TokenColumn(status_bitTokens)(Column.Schema.str)
            : Column.Undefined(molecule.num_bonds, Column.Schema.str),
    };
    return ret;
}

/**
 * Reads the optional @<TRIPOS>CRYSIN section (crystal cell parameters).
 *
 * @returns the crysin record, or `undefined` when the section is absent
 * (next `@<TRIPOS>MOLECULE` header reached, or end of input).
 */
function handleCrysin(state) {
    const { tokenizer } = state;

    // Scan forward; stop at CRYSIN, give up at the next MOLECULE or EOF.
    while (tokenizer.position < tokenizer.data.length) {
        const l = getTokenString(tokenizer);
        if (l === '@<TRIPOS>MOLECULE') {
            return;
        } else if (l === '@<TRIPOS>CRYSIN') {
            break;
        } else {
            markLine(tokenizer);
        }
    }
    if (tokenizer.position >= tokenizer.data.length) return;

    markLine(tokenizer);
    const values = getTokenString(tokenizer).trim().split(reWhitespace);
    return {
        a: parseFloat(values[0]),
        b: parseFloat(values[1]),
        c: parseFloat(values[2]),
        alpha: parseFloat(values[3]),
        beta: parseFloat(values[4]),
        gamma: parseFloat(values[5]),
        spaceGroup: parseInt(values[6], 10),
        setting: parseInt(values[7], 10),
    };
}

/**
 * Parses every structure in a (possibly multi-molecule) MOL2 string.
 *
 * @param ctx runtime context used for progress updates
 * @param data the raw MOL2 text
 * @param name name attached to the resulting file object
 * @returns a successful Result wrapping `{ name, structures }`
 */
async function parseInternal(ctx, data, name) {
    const tokenizer = Tokenizer(data);
    ctx.update({ message: 'Parsing...', current: 0, max: data.length });
    const structures = [];
    while (tokenizer.position < data.length) {
        const state = State(tokenizer, ctx);
        handleMolecule(state);
        const atoms = await handleAtoms(state);
        const bonds = await handleBonds(state);
        const crysin = handleCrysin(state);
        structures.push({ molecule: state.molecule, atoms, bonds, crysin });

        // Advance to the next molecule (if any) before looping.
        skipWhitespace(tokenizer);
        while (getTokenString(tokenizer) !== '@<TRIPOS>MOLECULE' && tokenizer.position < tokenizer.data.length) {
            markLine(tokenizer);
        }
    }
    const result = { name, structures };
    return Result.success(result);
}

/** Creates a Task that parses MOL2 `data` into a mol* Mol2File result. */
export function parseMol2(data, name) {
    return Task.create('Parse MOL2', async (ctx) => {
        return await parseInternal(ctx, data, name);
    });
}