molstar

A comprehensive macromolecular library.

"use strict";
/**
 * Copyright (c) 2017-2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
 *
 * @author David Sehnal <david.sehnal@gmail.com>
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseCifText = void 0;
var tslib_1 = require("tslib");
/**
 * mmCIF parser.
 *
 * Trying to be as close to the specification http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
 *
 * Differences I'm aware of:
 * - Except keywords (data_, loop_, save_) everything is case sensitive.
 * - The tokens . and ? are treated the same as the values '.' and '?'.
 * - Ignores \ in the multiline values:
 *     ;abc\
 *     efg
 *     ;
 *   should have the value 'abcefg' but will have the value 'abc\\nefg' instead.
 *   Post processing of this is left to the consumer of the data.
 * - Similarly, things like punctuation (\', ..) are left to be processed by the user if needed.
 *
 */
var Data = (0, tslib_1.__importStar)(require("../data-model"));
var tokenizer_1 = require("../../common/text/tokenizer");
var result_1 = require("../../result");
var mol_task_1 = require("../../../../mol-task");
/**
 * Eat everything until a whitespace/newline occurs.
 */
function eatValue(state) {
    while (state.position < state.length) {
        switch (state.data.charCodeAt(state.position)) {
            case 9: // \t
            case 10: // \n
            case 13: // \r
            case 32: // ' '
                state.tokenEnd = state.position;
                return;
            default:
                ++state.position;
                break;
        }
    }
    state.tokenEnd = state.position;
}
/**
 * Eats an escaped value. Handles the "degenerate" cases as well.
 *
 * "Degenerate" cases:
 * - 'xx'x' => xx'x
 * - 'xxxNEWLINE => 'xxx
 *
 */
function eatEscaped(state, esc) {
    var next, c;
    ++state.position;
    while (state.position < state.length) {
        c = state.data.charCodeAt(state.position);
        if (c === esc) {
            next = state.data.charCodeAt(state.position + 1);
            switch (next) {
                case 9: // \t
                case 10: // \n
                case 13: // \r
                case 32: // ' '
                    // get rid of the quotes.
                    state.tokenStart++;
                    state.tokenEnd = state.position;
                    state.isEscaped = true;
                    ++state.position;
                    return;
                default:
                    if (next === void 0) { // = "end of stream"
                        // get rid of the quotes.
                        state.tokenStart++;
                        state.tokenEnd = state.position;
                        state.isEscaped = true;
                        ++state.position;
                        return;
                    }
                    ++state.position;
                    break;
            }
        }
        else {
            // handle 'xxxNEWLINE => 'xxx
            if (c === 10 || c === 13) {
                state.tokenEnd = state.position;
                return;
            }
            ++state.position;
        }
    }
    state.tokenEnd = state.position;
}
/**
 * Eats an escaped value "triple quote" (''') value.
 */
function eatTripleQuote(state) {
    // skip the '''
    state.position += 3;
    while (state.position < state.length) {
        if (state.data.charCodeAt(state.position) === 39 /* ' */ && isTripleQuoteAtPosition(state)) {
            // get rid of the quotes.
            state.tokenStart += 3;
            state.tokenEnd = state.position;
            state.isEscaped = true;
            state.position += 3;
            return;
        }
        ++state.position;
    }
    state.tokenEnd = state.position;
}
/**
 * Eats a multiline token of the form NL;....NL;
 */
function eatMultiline(state) {
    var prev = 59, pos = state.position + 1, c;
    while (pos < state.length) {
        c = state.data.charCodeAt(pos);
        if (c === 59 && (prev === 10 || prev === 13)) { // ;, \n \r
            state.position = pos + 1;
            // get rid of the ;
            state.tokenStart++;
            // remove trailing newlines
            pos--;
            c = state.data.charCodeAt(pos);
            while (c === 10 || c === 13) {
                pos--;
                c = state.data.charCodeAt(pos);
            }
            state.tokenEnd = pos + 1;
            state.isEscaped = true;
            return;
        }
        else {
            // handle line numbers
            if (c === 13) { // \r
                state.lineNumber++;
            }
            else if (c === 10 && prev !== 13) { // \r\n
                state.lineNumber++;
            }
            prev = c;
            ++pos;
        }
    }
    state.position = pos;
    return prev;
}
function eatImportGet(state) {
    // _import.get [{'save':orient_matrix 'file':templ_attr.cif}]
    // skipWhitespace(state)
    while (state.position < state.length) {
        switch (state.data.charCodeAt(state.position)) {
            case 93: // ]
                ++state.position;
                state.tokenEnd = state.position;
                state.isImportGet = false;
                return;
            default:
                ++state.position;
                break;
        }
    }
}
/**
 * Skips until \n or \r occurs -- therefore the newlines get handled by the "skipWhitespace" function.
 */
function skipCommentLine(state) {
    while (state.position < state.length) {
        var c = state.data.charCodeAt(state.position);
        if (c === 10 || c === 13) {
            return;
        }
        ++state.position;
    }
}
/**
 * Skips all the whitespace - space, tab, newline, CR
 * Handles incrementing line count.
 */
function skipWhitespace(state) {
    var prev = 10;
    while (state.position < state.length) {
        var c = state.data.charCodeAt(state.position);
        switch (c) {
            case 9: // '\t'
            case 32: // ' '
                prev = c;
                ++state.position;
                break;
            case 10: // \n
                // handle \r\n
                if (prev !== 13) {
                    ++state.lineNumber;
                }
                prev = c;
                ++state.position;
                break;
            case 13: // \r
                prev = c;
                ++state.position;
                ++state.lineNumber;
                break;
            default:
                return prev;
        }
    }
    return prev;
}
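/*
 * Illustrative sketch (not part of the library): how the low-level "eat" helpers above
 * advance a tokenizer state over a single value. The object literals below only fill in
 * the fields these helpers actually read or write; the full state created by
 * createTokenizer() further down has more fields. Never called, documentation only.
 */
function _exampleEatHelpers() {
    // Unquoted value: eatValue stops at the first whitespace character.
    var plain = { data: '12.5 67', position: 0, length: 7, tokenStart: 0, tokenEnd: 0, isEscaped: false };
    eatValue(plain);
    // plain.tokenEnd === 4 -> token text '12.5'
    // Single-quoted value: eatEscaped drops the quotes by moving tokenStart/tokenEnd inward.
    var quoted = { data: "'C 1' ", position: 0, length: 6, tokenStart: 0, tokenEnd: 0, isEscaped: false };
    eatEscaped(quoted, 39 /* ' */);
    // quoted.data.substring(quoted.tokenStart, quoted.tokenEnd) === 'C 1', quoted.isEscaped === true
    // Text field delimited by ';' at line starts: eatMultiline drops the delimiters
    // and the trailing newline.
    var multi = { data: ';abc\ndef\n;', position: 0, length: 10, tokenStart: 0, tokenEnd: 0, isEscaped: false, lineNumber: 1 };
    eatMultiline(multi);
    // multi.data.substring(multi.tokenStart, multi.tokenEnd) === 'abc\ndef'
    return [plain, quoted, multi];
}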
/**
 * Returns true if there are two consecutive ' in +1 and +2 positions.
 */
function isTripleQuoteAtPosition(state) {
    if (state.length - state.position < 2)
        return false;
    if (state.data.charCodeAt(state.position + 1) !== 39)
        return false; // '
    if (state.data.charCodeAt(state.position + 2) !== 39)
        return false; // '
    return true;
}
function isData(state) {
    // here we already assume the 5th char is _ and that the length >= 5
    // d/D
    var c = state.data.charCodeAt(state.tokenStart);
    if (c !== 68 && c !== 100)
        return false;
    // a/A
    c = state.data.charCodeAt(state.tokenStart + 1);
    if (c !== 65 && c !== 97)
        return false;
    // t/T
    c = state.data.charCodeAt(state.tokenStart + 2);
    if (c !== 84 && c !== 116)
        return false;
    // a/A
    c = state.data.charCodeAt(state.tokenStart + 3);
    if (c !== 65 && c !== 97)
        return false;
    return true;
}
function isSave(state) {
    // here we already assume the 5th char is _ and that the length >= 5
    // s/S
    var c = state.data.charCodeAt(state.tokenStart);
    if (c !== 83 && c !== 115)
        return false;
    // a/A
    c = state.data.charCodeAt(state.tokenStart + 1);
    if (c !== 65 && c !== 97)
        return false;
    // v/V
    c = state.data.charCodeAt(state.tokenStart + 2);
    if (c !== 86 && c !== 118)
        return false;
    // e/E
    c = state.data.charCodeAt(state.tokenStart + 3);
    if (c !== 69 && c !== 101)
        return false;
    return true;
}
function isLoop(state) {
    // here we already assume the 5th char is _ and that the length >= 5
    if (state.tokenEnd - state.tokenStart !== 5)
        return false;
    // l/L
    var c = state.data.charCodeAt(state.tokenStart);
    if (c !== 76 && c !== 108)
        return false;
    // o/O
    c = state.data.charCodeAt(state.tokenStart + 1);
    if (c !== 79 && c !== 111)
        return false;
    // o/O
    c = state.data.charCodeAt(state.tokenStart + 2);
    if (c !== 79 && c !== 111)
        return false;
    // p/P
    c = state.data.charCodeAt(state.tokenStart + 3);
    if (c !== 80 && c !== 112)
        return false;
    return true;
}
function isImportGet(state) {
    // _import.get [{'save':orient_matrix 'file':templ_attr.cif}]
    if (state.tokenEnd - state.tokenStart !== 11)
        return false;
    if (state.data.charCodeAt(state.tokenStart + 1) !== 105)
        return false; // i
    if (state.data.charCodeAt(state.tokenStart + 2) !== 109)
        return false; // m
    if (state.data.charCodeAt(state.tokenStart + 3) !== 112)
        return false; // p
    if (state.data.charCodeAt(state.tokenStart + 4) !== 111)
        return false; // o
    if (state.data.charCodeAt(state.tokenStart + 5) !== 114)
        return false; // r
    if (state.data.charCodeAt(state.tokenStart + 6) !== 116)
        return false; // t
    if (state.data.charCodeAt(state.tokenStart + 7) !== 46)
        return false; // .
    if (state.data.charCodeAt(state.tokenStart + 8) !== 103)
        return false; // g
    if (state.data.charCodeAt(state.tokenStart + 9) !== 101)
        return false; // e
    if (state.data.charCodeAt(state.tokenStart + 10) !== 116)
        return false; // t
    return true;
}
/**
 * Checks if the current token shares the namespace with string at <start,end).
 */
function isNamespace(state, start, end) {
    var i;
    var nsLen = end - start;
    var offset = state.tokenStart - start;
    var tokenLen = state.tokenEnd - state.tokenStart;
    if (tokenLen < nsLen)
        return false;
    for (i = start; i < end; ++i) {
        if (state.data.charCodeAt(i) !== state.data.charCodeAt(i + offset))
            return false;
    }
    if (nsLen === tokenLen)
        return true;
    if (state.data.charCodeAt(i + offset) === 46) { // .
        return true;
    }
    return false;
}
/**
 * Returns the index of '.' in the current token. If no '.' is present, returns currentTokenEnd.
 */
function getNamespaceEnd(state) {
    var i;
    for (i = state.tokenStart; i < state.tokenEnd; ++i) {
        if (state.data.charCodeAt(i) === 46)
            return i;
    }
    return i;
}
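/*
 * Illustrative sketch (not part of the library): how the keyword tests above work on raw
 * char codes. moveNextInternal() only calls isData/isSave/isLoop when the token is at least
 * 5 characters long and its 5th character is '_', so the helpers can skip that check. Each
 * comparison accepts both cases (e.g. 68/100 for 'D'/'d'), which is why data_xyz and
 * DATA_xyz are both recognized. Never called, documentation only.
 */
function _exampleKeywordChecks() {
    var data = 'DATA_1abc loop_';
    // token spanning 'DATA_1abc'
    var dataToken = { data: data, tokenStart: 0, tokenEnd: 9 };
    // isData(dataToken) === true
    // token spanning 'loop_' (isLoop additionally requires the token length to be exactly 5)
    var loopToken = { data: data, tokenStart: 10, tokenEnd: 15 };
    // isLoop(loopToken) === true
    return [isData(dataToken), isLoop(loopToken)];
}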
/**
 * Get the namespace string.
 * endIndex is obtained by the getNamespaceEnd() function.
 */
function getNamespace(state, endIndex) {
    return state.data.substring(state.tokenStart, endIndex);
}
/**
 * Returns true if the current token contains no '.', otherwise returns false.
 */
function isFlatNamespace(state) {
    var i;
    for (i = state.tokenStart; i < state.tokenEnd; ++i) {
        if (state.data.charCodeAt(i) === 46)
            return false;
    }
    return true;
}
/**
 * String representation of the current token.
 */
function getTokenString(state) {
    return state.data.substring(state.tokenStart, state.tokenEnd);
}
/**
 * Move to the next token.
 */
function moveNextInternal(state) {
    var prev = skipWhitespace(state);
    if (state.position >= state.length) {
        state.tokenType = 6 /* End */;
        return;
    }
    state.tokenStart = state.position;
    state.tokenEnd = state.position;
    state.isEscaped = false;
    var c = state.data.charCodeAt(state.position);
    switch (c) {
        case 35: // #, comment
            skipCommentLine(state);
            state.tokenType = 5 /* Comment */;
            break;
        case 39: // ', escaped value
            if (isTripleQuoteAtPosition(state)) {
                eatTripleQuote(state);
                state.tokenType = 3 /* Value */;
                break;
            }
        case 34: // ", escaped value
            eatEscaped(state, c);
            state.tokenType = 3 /* Value */;
            break;
        case 59: // ;, possible multiline value
            // multiline value must start at the beginning of the line.
            if (prev === 10 || prev === 13) { // \n or \r
                eatMultiline(state);
            }
            else {
                eatValue(state);
            }
            state.tokenType = 3 /* Value */;
            break;
        default:
            if (state.isImportGet) {
                eatImportGet(state);
            }
            else {
                eatValue(state);
            }
            // escaped is always Value
            if (state.isEscaped) {
                state.tokenType = 3 /* Value */;
                // _ means column name, including _import.get
            }
            else if (state.data.charCodeAt(state.tokenStart) === 95) { // _
                if (state.inSaveFrame && isImportGet(state)) {
                    state.isImportGet = true;
                }
                state.tokenType = 4 /* ColumnName */;
                // 5th char needs to be _ for data_, save_ or loop_
            }
            else if (state.tokenEnd - state.tokenStart >= 5 && state.data.charCodeAt(state.tokenStart + 4) === 95) {
                if (isData(state))
                    state.tokenType = 0 /* Data */;
                else if (isSave(state))
                    state.tokenType = 1 /* Save */;
                else if (isLoop(state))
                    state.tokenType = 2 /* Loop */;
                else
                    state.tokenType = 3 /* Value */;
                // all other tests failed, we are at Value token.
            }
            else {
                state.tokenType = 3 /* Value */;
            }
            break;
    }
}
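/*
 * Illustrative sketch (not part of the library): how a data name such as
 * "_atom_site.Cartn_x" is split into its category namespace and field name.
 * handleSingle()/handleLoop() below use exactly these helpers plus
 * substring(name.length + 1) to derive the field name. Never called, documentation only.
 */
function _exampleNamespaceSplit() {
    var data = '_atom_site.Cartn_x 12.5';
    var state = { data: data, tokenStart: 0, tokenEnd: 18 };
    var nsEnd = getNamespaceEnd(state); // index of the '.' -> 10
    var name = getNamespace(state, nsEnd); // '_atom_site'
    var field = getTokenString(state).substring(name.length + 1); // 'Cartn_x'
    return [name, field];
}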
/**
 * Moves to the next non-comment token.
 */
function moveNext(state) {
    moveNextInternal(state);
    while (state.tokenType === 5 /* Comment */)
        moveNextInternal(state);
}
function createTokenizer(data, runtimeCtx) {
    return {
        data: data,
        length: data.length,
        position: 0,
        tokenStart: 0,
        tokenEnd: 0,
        tokenType: 6 /* End */,
        lineNumber: 1,
        isEscaped: false,
        isImportGet: false,
        inSaveFrame: false,
        runtimeCtx: runtimeCtx
    };
}
function FrameContext() {
    return { categoryNames: [], categoryData: Object.create(null) };
}
function CifCategories(categoryNames, categoryData) {
    var categories = Object.create(null);
    for (var _i = 0, categoryNames_1 = categoryNames; _i < categoryNames_1.length; _i++) {
        var name_1 = categoryNames_1[_i];
        var d = categoryData[name_1];
        categories[name_1] = Data.CifCategory(d.name, d.rowCount, d.fieldNames, d.fields);
    }
    return categories;
}
function CifBlock(ctx, header, saveFrames) {
    return Data.CifBlock(ctx.categoryNames, CifCategories(ctx.categoryNames, ctx.categoryData), header, saveFrames);
}
function CifSaveFrame(ctx, header) {
    return Data.CifBlock(ctx.categoryNames, CifCategories(ctx.categoryNames, ctx.categoryData), header);
}
function addFields(ctx, name, rowCount, fieldNames, fields) {
    var _a;
    if (name in ctx.categoryData) {
        var cat = ctx.categoryData[name];
        (_a = cat.fieldNames).push.apply(_a, fieldNames);
        Object.assign(cat.fields, fields);
    }
    else {
        ctx.categoryData[name] = { name: name, rowCount: rowCount, fieldNames: fieldNames, fields: fields };
        ctx.categoryNames.push(name);
    }
}
/**
 * Reads a category containing a single row.
 */
function handleSingle(tokenizer, ctx) {
    var nsStart = tokenizer.tokenStart, nsEnd = getNamespaceEnd(tokenizer);
    var name = getNamespace(tokenizer, nsEnd);
    var fields = Object.create(null);
    var fieldNames = [];
    var readingNames = true;
    while (readingNames) {
        if (tokenizer.tokenType !== 4 /* ColumnName */ || !isNamespace(tokenizer, nsStart, nsEnd)) {
            readingNames = false;
            break;
        }
        var fieldName = getTokenString(tokenizer).substring(name.length + 1);
        moveNext(tokenizer);
        if (tokenizer.tokenType !== 3 /* Value */) {
            return { hasError: true, errorLine: tokenizer.lineNumber, errorMessage: 'Expected value.' };
        }
        fields[fieldName] = Data.CifField.ofTokens({ data: tokenizer.data, indices: [tokenizer.tokenStart, tokenizer.tokenEnd], count: 1 });
        fieldNames[fieldNames.length] = fieldName;
        moveNext(tokenizer);
    }
    addFields(ctx, name.substr(1), 1, fieldNames, fields);
    return { hasError: false, errorLine: 0, errorMessage: '' };
}
function readLoopChunk(chunkSize, state) {
    var tokenizer = state.tokenizer, tokens = state.tokens, fieldCount = state.fieldCount;
    var tokenCount = state.tokenCount;
    var counter = 0;
    while (tokenizer.tokenType === 3 /* Value */ && counter < chunkSize) {
        tokenizer_1.TokenBuilder.add(tokens[(tokenCount++) % fieldCount], tokenizer.tokenStart, tokenizer.tokenEnd);
        moveNext(tokenizer);
        counter++;
    }
    state.tokenCount = tokenCount;
    return counter;
}
function updateLoopChunk(ctx, state) {
    return ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length });
}
// const readLoopChunks = ChunkedSubtask(1000000,
//     (size, state: LoopReadState) => readLoopChunk(state, size),
//     (ctx, state) => ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length }));
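/*
 * Illustrative sketch (not part of the library): driving the tokenizer directly with
 * createTokenizer() and moveNext() to list the tokens of a small CIF fragment. The
 * runtimeCtx argument is only needed by handleLoop()'s chunked parsing, so it is left
 * undefined here. Token type codes follow the constants inlined above
 * (0 Data, 1 Save, 2 Loop, 3 Value, 4 ColumnName, 5 Comment, 6 End).
 * Never called, documentation only.
 */
function _exampleTokenStream() {
    var tokenizer = createTokenizer('data_demo\n_cell.length_a 10.5\n', void 0);
    var tokens = [];
    moveNext(tokenizer);
    while (tokenizer.tokenType !== 6 /* End */) {
        tokens.push([tokenizer.tokenType, getTokenString(tokenizer)]);
        moveNext(tokenizer);
    }
    // tokens: [[0, 'data_demo'], [4, '_cell.length_a'], [3, '10.5']]
    return tokens;
}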
/**
 * Reads a loop.
 */
function handleLoop(tokenizer, ctx) {
    return (0, tslib_1.__awaiter)(this, void 0, void 0, function () {
        var loopLine, name, isFlat, fieldNames, rowCountEstimate, tokens, fieldCount, i, state, rowCount, i, fields, fields, i;
        return (0, tslib_1.__generator)(this, function (_a) {
            switch (_a.label) {
                case 0:
                    loopLine = tokenizer.lineNumber;
                    moveNext(tokenizer);
                    name = getNamespace(tokenizer, getNamespaceEnd(tokenizer));
                    isFlat = isFlatNamespace(tokenizer);
                    fieldNames = [];
                    while (tokenizer.tokenType === 4 /* ColumnName */) {
                        fieldNames[fieldNames.length] = isFlat ? getTokenString(tokenizer) : getTokenString(tokenizer).substring(name.length + 1);
                        moveNext(tokenizer);
                    }
                    rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32;
                    tokens = [];
                    fieldCount = fieldNames.length;
                    for (i = 0; i < fieldCount; i++)
                        tokens[i] = tokenizer_1.TokenBuilder.create(tokenizer.data, rowCountEstimate);
                    state = { fieldCount: fieldCount, tokenCount: 0, tokenizer: tokenizer, tokens: tokens };
                    return [4 /*yield*/, (0, mol_task_1.chunkedSubtask)(tokenizer.runtimeCtx, 1000000, state, readLoopChunk, updateLoopChunk)];
                case 1:
                    _a.sent();
                    if (state.tokenCount % fieldCount !== 0) {
                        return [2 /*return*/, { hasError: true, errorLine: tokenizer.lineNumber, errorMessage: "The number of values for loop starting at line " + loopLine + " is not a multiple of the number of columns." }];
                    }
                    rowCount = (state.tokenCount / fieldCount) | 0;
                    if (isFlat) {
                        for (i = 0; i < fieldCount; i++) {
                            fields = { '': Data.CifField.ofTokens(tokens[i]) };
                            addFields(ctx, fieldNames[i].substr(1), rowCount, [''], fields);
                        }
                    }
                    else {
                        fields = Object.create(null);
                        for (i = 0; i < fieldCount; i++) {
                            fields[fieldNames[i]] = Data.CifField.ofTokens(tokens[i]);
                        }
                        addFields(ctx, name.substr(1), rowCount, fieldNames, fields);
                    }
                    return [2 /*return*/, { hasError: false, errorLine: 0, errorMessage: '' }];
            }
        });
    });
}
/**
 * Creates an error result.
 */
function error(line, message) {
    return result_1.ReaderResult.error(message, line);
}
/**
 * Creates a data result.
 */
function result(data) {
    return result_1.ReaderResult.success(data);
}
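/*
 * Illustrative sketch (not part of the library): a simplified model of how
 * readLoopChunk()/handleLoop() above distribute the values of a loop_ into per-column
 * token lists. The real code appends (start, end) index pairs into TokenBuilder
 * instances and chunks the work through mol-task's chunkedSubtask; here plain arrays
 * stand in for TokenBuilder purely to show the round-robin assignment and the
 * row-count arithmetic. Never called, documentation only.
 */
function _exampleLoopColumnAssignment() {
    // loop_
    // _atom_site.id
    // _atom_site.type_symbol
    // 1 N
    // 2 C
    var fieldNames = ['id', 'type_symbol'];
    var values = ['1', 'N', '2', 'C'];
    var fieldCount = fieldNames.length;
    var columns = fieldNames.map(function () { return []; });
    for (var tokenCount = 0; tokenCount < values.length; tokenCount++) {
        // same index arithmetic as readLoopChunk: value i goes to column i % fieldCount
        columns[tokenCount % fieldCount].push(values[tokenCount]);
    }
    var rowCount = (values.length / fieldCount) | 0;
    // columns: [['1', '2'], ['N', 'C']], rowCount: 2
    return { fieldNames: fieldNames, columns: columns, rowCount: rowCount };
}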
/**
 * Parses an mmCIF file.
 *
 * @returns CifParserResult wrapper of the result.
 */
function parseInternal(data, runtimeCtx) {
    return (0, tslib_1.__awaiter)(this, void 0, void 0, function () {
        var dataBlocks, tokenizer, blockHeader, blockCtx, saveFrames, saveCtx, saveFrame, saveHeader, token, cat, cat;
        return (0, tslib_1.__generator)(this, function (_a) {
            switch (_a.label) {
                case 0:
                    dataBlocks = [];
                    tokenizer = createTokenizer(data, runtimeCtx);
                    blockHeader = '';
                    blockCtx = FrameContext();
                    saveFrames = [];
                    saveCtx = FrameContext();
                    saveFrame = Data.CifSaveFrame(saveCtx.categoryNames, CifCategories(saveCtx.categoryNames, saveCtx.categoryData), '');
                    saveHeader = '';
                    runtimeCtx.update({ message: 'Parsing...', current: 0, max: data.length });
                    moveNext(tokenizer);
                    _a.label = 1;
                case 1:
                    if (!(tokenizer.tokenType !== 6 /* End */)) return [3 /*break*/, 7];
                    token = tokenizer.tokenType;
                    if (!(token === 0 /* Data */)) return [3 /*break*/, 2];
                    if (tokenizer.inSaveFrame) {
                        return [2 /*return*/, error(tokenizer.lineNumber, 'Unexpected data block inside a save frame.')];
                    }
                    if (blockCtx.categoryNames.length > 0) {
                        dataBlocks.push(CifBlock(blockCtx, blockHeader, saveFrames));
                    }
                    blockHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
                    blockCtx = FrameContext();
                    saveFrames = [];
                    moveNext(tokenizer);
                    return [3 /*break*/, 6];
                case 2:
                    if (!(token === 1 /* Save */)) return [3 /*break*/, 3];
                    if (tokenizer.tokenEnd - tokenizer.tokenStart === 5) {
                        // end of save frame
                        if (saveCtx.categoryNames.length > 0) {
                            saveFrames[saveFrames.length] = CifSaveFrame(saveCtx, saveHeader);
                        }
                        tokenizer.inSaveFrame = false;
                    }
                    else {
                        // start of save frame
                        if (tokenizer.inSaveFrame) {
                            return [2 /*return*/, error(tokenizer.lineNumber, 'Save frames cannot be nested.')];
                        }
                        tokenizer.inSaveFrame = true;
                        saveHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
                        saveCtx = FrameContext();
                        // saveFrame = CifSaveFrame(saveCtx, saveHeader);
                    }
                    moveNext(tokenizer);
                    return [3 /*break*/, 6];
                case 3:
                    if (!(token === 2 /* Loop */)) return [3 /*break*/, 5];
                    return [4 /*yield*/, handleLoop(tokenizer, tokenizer.inSaveFrame ? saveCtx : blockCtx)];
                case 4:
                    cat = _a.sent();
                    if (cat.hasError) {
                        return [2 /*return*/, error(cat.errorLine, cat.errorMessage)];
                    }
                    return [3 /*break*/, 6];
                case 5:
                    if (token === 4 /* ColumnName */) {
                        cat = handleSingle(tokenizer, tokenizer.inSaveFrame ? saveCtx : blockCtx);
                        if (cat.hasError) {
                            return [2 /*return*/, error(cat.errorLine, cat.errorMessage)];
                        }
                        // Out of options
                    }
                    else {
                        console.log(tokenizer.tokenType, tokenizer_1.Tokenizer.getTokenString(tokenizer));
                        return [2 /*return*/, error(tokenizer.lineNumber, 'Unexpected token. Expected data_, loop_, or data name.')];
                    }
                    _a.label = 6;
                case 6: return [3 /*break*/, 1];
                case 7:
                    // Check if the latest save frame was closed.
                    if (tokenizer.inSaveFrame) {
                        return [2 /*return*/, error(tokenizer.lineNumber, "Unfinished save frame (" + saveFrame.header + ").")];
                    }
                    if (blockCtx.categoryNames.length > 0 || saveFrames.length > 0) {
                        dataBlocks.push(CifBlock(blockCtx, blockHeader, saveFrames));
                    }
                    return [2 /*return*/, result(Data.CifFile(dataBlocks))];
            }
        });
    });
}
function parseCifText(data) {
    var _this = this;
    return mol_task_1.Task.create('Parse CIF', function (ctx) { return (0, tslib_1.__awaiter)(_this, void 0, void 0, function () {
        return (0, tslib_1.__generator)(this, function (_a) {
            switch (_a.label) {
                case 0: return [4 /*yield*/, parseInternal(data, ctx)];
                case 1: return [2 /*return*/, _a.sent()];
            }
        });
    }); });
}
exports.parseCifText = parseCifText;
//# sourceMappingURL=parser.js.map
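/*
 * Illustrative sketch (not part of the library): running the exported parser on a small
 * CIF string. parseCifText returns a mol-task Task, so the example assumes the Task.run()
 * method and the ReaderResult/CIF data-model accessors (isError, result, blocks,
 * categories, getField, float) as used elsewhere in molstar; verify these against your
 * molstar version. Never called, documentation only.
 */
function _exampleParseCifText() {
    var cif = 'data_demo\n' +
        '_cell.length_a 10.5\n' +
        'loop_\n' +
        '_atom_site.id\n' +
        '_atom_site.Cartn_x\n' +
        '1 0.25\n' +
        '2 1.75\n';
    // Task.run() executes parseInternal with a runtime context and resolves to a ReaderResult.
    return parseCifText(cif).run().then(function (parsed) {
        if (parsed.isError) throw new Error(parsed.message);
        var block = parsed.result.blocks[0]; // header 'demo'
        var atomSite = block.categories['atom_site']; // leading '_' is stripped by addFields
        // atomSite.rowCount === 2; atomSite.getField('Cartn_x').float(1) === 1.75
        return atomSite;
    });
}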