molstar
Version:
A comprehensive macromolecular library.
732 lines • 27.4 kB
JavaScript
"use strict";
/**
* Copyright (c) 2017-2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
*
* @author David Sehnal <david.sehnal@gmail.com>
* @author Alexander Rose <alexander.rose@weirdbyte.de>
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseCifText = void 0;
var tslib_1 = require("tslib");
/**
* mmCIF parser.
*
* Trying to be as close to the specification http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
*
* Differences I'm aware of:
* - Except keywords (data_, loop_, save_) everything is case sensitive.
* - The tokens . and ? are treated the same as the values '.' and '?'.
* - Ignores \ in the multiline values:
* ;abc\
* efg
* ;
* should have the value 'abcefg' but will have the value 'abc\\nefg' instead.
* Post processing of this is left to the consumer of the data.
* - Similarly, things like punctuation (\', ..) are left to be processed by the user if needed.
*
*/
var Data = (0, tslib_1.__importStar)(require("../data-model"));
var tokenizer_1 = require("../../common/text/tokenizer");
var result_1 = require("../../result");
var mol_task_1 = require("../../../../mol-task");
/**
 * Consumes an unquoted value: advances `state.position` to the first
 * tab, newline, carriage return or space (or to the end of the input)
 * and records that position as `state.tokenEnd`.
 *
 * @param state Tokenizer state; reads `data`, `length`, `position`
 *              and writes `position`, `tokenEnd`.
 */
function eatValue(state) {
    var text = state.data;
    var limit = state.length;
    var cursor = state.position;
    for (; cursor < limit; ++cursor) {
        var code = text.charCodeAt(cursor);
        // \t, \n, \r or ' ' terminates the value
        if (code === 9 || code === 10 || code === 13 || code === 32) {
            break;
        }
    }
    state.position = cursor;
    state.tokenEnd = cursor;
}
/**
 * Eats a quoted ("escaped") value. Handles the "degenerate" cases as well.
 *
 * A quote character only closes the value when it is followed by
 * whitespace (\t, \n, \r, ' ') or by the end of the input.
 *
 * "Degenerate" cases:
 * - 'xx'x' => xx'x
 * - 'xxxNEWLINE => 'xxx
 *
 * On a proper close the quotes are excluded from the token
 * (`tokenStart` is advanced past the opening quote, `tokenEnd` points at
 * the closing quote), `isEscaped` is set and `position` is moved past
 * the closing quote. In the unterminated cases the opening quote stays
 * part of the token and `isEscaped` is left untouched.
 *
 * @param state Tokenizer state positioned on the opening quote.
 * @param esc   Char code of the quote character (39 for ', 34 for ").
 */
function eatEscaped(state, esc) {
    var c, next, closes;
    // skip the opening quote
    ++state.position;
    while (state.position < state.length) {
        c = state.data.charCodeAt(state.position);
        if (c === esc) {
            if (state.position + 1 >= state.length) {
                // The quote is the last character of the input. This needs
                // an explicit bounds check: charCodeAt() yields NaN (not
                // undefined) past the end of the string.
                closes = true;
            }
            else {
                next = state.data.charCodeAt(state.position + 1);
                closes = next === 9 || next === 10 || next === 13 || next === 32;
            }
            if (closes) {
                // get rid of the quotes.
                state.tokenStart++;
                state.tokenEnd = state.position;
                state.isEscaped = true;
                ++state.position;
                return;
            }
            // a quote inside the value, e.g. 'xx'x'
            ++state.position;
        }
        else if (c === 10 || c === 13) {
            // handle 'xxxNEWLINE => 'xxx
            state.tokenEnd = state.position;
            return;
        }
        else {
            ++state.position;
        }
    }
    state.tokenEnd = state.position;
}
/**
 * Eats a value delimited by triple quotes (''' ... ''').
 *
 * When a closing ''' is found, the token excludes both delimiters,
 * `isEscaped` is set and `position` is moved past the closing delimiter.
 * Otherwise the token runs to wherever the scan stopped and still
 * includes the opening delimiter.
 *
 * @param state Tokenizer state positioned on the opening '''.
 */
function eatTripleQuote(state) {
    var QUOTE = 39; // '
    // skip the opening '''
    state.position += 3;
    for (; state.position < state.length; ++state.position) {
        if (state.data.charCodeAt(state.position) !== QUOTE) {
            continue;
        }
        if (!isTripleQuoteAtPosition(state)) {
            continue;
        }
        // closing delimiter found: drop the quotes from the token
        state.tokenStart += 3;
        state.tokenEnd = state.position;
        state.isEscaped = true;
        state.position += 3;
        return;
    }
    state.tokenEnd = state.position;
}
/**
 * Eats a multiline token of the form NL;....NL;
 *
 * The value ends at the first ';' that directly follows a \n or \r.
 * On success the token excludes the opening ';' and any newlines
 * immediately before the closing ';', `isEscaped` is set, `position`
 * is placed just after the closing ';' and nothing is returned.
 * `lineNumber` is incremented for every line break passed over
 * (\r\n counts once).
 *
 * If no closing ';' exists, `position` is moved to where the scan
 * stopped, the token bounds are left untouched, and the char code of
 * the last character examined is returned.
 *
 * @param state Tokenizer state positioned on the opening ';'.
 */
function eatMultiline(state) {
    var text = state.data;
    var last = 59; // the opening ';'
    var cursor = state.position + 1;
    for (; cursor < state.length; ++cursor) {
        var code = text.charCodeAt(cursor);
        var afterLineBreak = last === 10 || last === 13;
        if (code === 59 && afterLineBreak) {
            state.position = cursor + 1;
            // get rid of the opening ;
            state.tokenStart++;
            // walk back over the newlines that precede the closing ;
            var end = cursor;
            var before = text.charCodeAt(end - 1);
            while (before === 10 || before === 13) {
                end--;
                before = text.charCodeAt(end - 1);
            }
            state.tokenEnd = end;
            state.isEscaped = true;
            return;
        }
        // count the line break once: on \r, or on a \n not preceded by \r
        if (code === 13 || (code === 10 && last !== 13)) {
            state.lineNumber++;
        }
        last = code;
    }
    state.position = cursor;
    return last;
}
/**
 * Eats the value of an `_import.get` item as a single token, e.g.
 *
 *   _import.get [{'save':orient_matrix 'file':templ_attr.cif}]
 *
 * Everything up to and including the first ']' becomes the token and
 * the `isImportGet` flag is cleared. If the input ends before a ']' is
 * seen, the token is closed at the end of the input and the flag is
 * cleared as well, so the token bounds are always set.
 *
 * @param state Tokenizer state positioned at the start of the value.
 */
function eatImportGet(state) {
    while (state.position < state.length) {
        if (state.data.charCodeAt(state.position) === 93) { // ]
            ++state.position;
            state.tokenEnd = state.position;
            state.isImportGet = false;
            return;
        }
        ++state.position;
    }
    // unterminated value: close the token at the end of the input
    state.tokenEnd = state.position;
    state.isImportGet = false;
}
/**
 * Advances `state.position` to the next \n or \r (or to the end of the
 * input) without consuming it, so the line break itself is handled by
 * skipWhitespace.
 *
 * @param state Tokenizer state; reads `data`, `length`, `position`
 *              and writes `position`.
 */
function skipCommentLine(state) {
    var text = state.data;
    var limit = state.length;
    var cursor = state.position;
    while (cursor < limit) {
        var code = text.charCodeAt(cursor);
        if (code === 10 || code === 13) {
            break;
        }
        ++cursor;
    }
    state.position = cursor;
}
/**
 * Skips all whitespace (space, tab, \n, \r) starting at `state.position`
 * and keeps `state.lineNumber` up to date: a \r always starts a new
 * line, a \n does so unless it directly follows a \r.
 *
 * @param state Tokenizer state.
 * @returns Char code of the last whitespace character skipped, or 10
 *          (\n) when nothing was skipped.
 */
function skipWhitespace(state) {
    var last = 10;
    for (; state.position < state.length; ++state.position) {
        var code = state.data.charCodeAt(state.position);
        if (code === 13) { // \r
            ++state.lineNumber;
        }
        else if (code === 10) { // \n
            // the \n of a \r\n pair was already counted with the \r
            if (last !== 13) {
                ++state.lineNumber;
            }
        }
        else if (code !== 9 && code !== 32) {
            // first non-whitespace character: stop without consuming it
            break;
        }
        last = code;
    }
    return last;
}
/**
 * Tells whether the two characters following `state.position` are both
 * single quotes ('). The character at `state.position` itself is not
 * inspected.
 *
 * @param state Tokenizer state; reads `data`, `length`, `position`.
 * @returns true when positions +1 and +2 both hold a '.
 */
function isTripleQuoteAtPosition(state) {
    var at = state.position;
    if (state.length - at < 2) {
        return false;
    }
    var text = state.data;
    return text.charCodeAt(at + 1) === 39 && text.charCodeAt(at + 2) === 39;
}
/**
 * Checks, case-insensitively, whether the current token starts with the
 * letters "data".
 *
 * The caller has already verified that the token is at least 5
 * characters long and that its 5th character is '_'.
 *
 * @param state Tokenizer state; reads `data` and `tokenStart`.
 */
function isData(state) {
    var text = state.data;
    var at = state.tokenStart;
    // OR-ing an ASCII letter with 32 folds it to lower case, so each
    // comparison accepts exactly the upper- and lower-case letter.
    return (text.charCodeAt(at) | 32) === 100 // d/D
        && (text.charCodeAt(at + 1) | 32) === 97 // a/A
        && (text.charCodeAt(at + 2) | 32) === 116 // t/T
        && (text.charCodeAt(at + 3) | 32) === 97; // a/A
}
/**
 * Checks, case-insensitively, whether the current token starts with the
 * letters "save".
 *
 * The caller has already verified that the token is at least 5
 * characters long and that its 5th character is '_'.
 *
 * @param state Tokenizer state; reads `data` and `tokenStart`.
 */
function isSave(state) {
    var keyword = 'save';
    for (var k = 0; k < keyword.length; ++k) {
        // OR-ing an ASCII letter with 32 folds it to lower case
        var folded = state.data.charCodeAt(state.tokenStart + k) | 32;
        if (folded !== keyword.charCodeAt(k)) {
            return false;
        }
    }
    return true;
}
/**
 * Checks whether the current token is exactly 5 characters long and
 * starts, case-insensitively, with the letters "loop".
 *
 * The caller has already verified that the 5th character is '_'.
 *
 * @param state Tokenizer state; reads `data`, `tokenStart`, `tokenEnd`.
 */
function isLoop(state) {
    var from = state.tokenStart;
    if (state.tokenEnd - from !== 5) {
        return false;
    }
    var text = state.data;
    // OR-ing an ASCII letter with 32 folds it to lower case
    var l = text.charCodeAt(from) | 32;
    var o1 = text.charCodeAt(from + 1) | 32;
    var o2 = text.charCodeAt(from + 2) | 32;
    var p = text.charCodeAt(from + 3) | 32;
    return l === 108 && o1 === 111 && o2 === 111 && p === 112;
}
/**
 * Checks whether the current token is the `_import.get` data name, as in
 *
 *   _import.get [{'save':orient_matrix 'file':templ_attr.cif}]
 *
 * The token must be exactly 11 characters long and characters 2..11
 * must spell "import.get" (case-sensitive). The first character is not
 * inspected here.
 *
 * @param state Tokenizer state; reads `data`, `tokenStart`, `tokenEnd`.
 */
function isImportGet(state) {
    var expected = 'import.get';
    if (state.tokenEnd - state.tokenStart !== 11) {
        return false;
    }
    // skip the leading character of the token
    var base = state.tokenStart + 1;
    for (var k = 0; k < expected.length; ++k) {
        if (state.data.charCodeAt(base + k) !== expected.charCodeAt(k)) {
            return false;
        }
    }
    return true;
}
/**
 * Checks if the current token shares the namespace with the string at
 * <start,end) of the input.
 *
 * The token matches when it begins with that string and either ends
 * there or continues with a '.'.
 *
 * @param state Tokenizer state; reads `data`, `tokenStart`, `tokenEnd`.
 * @param start Index of the first namespace character.
 * @param end   Index one past the last namespace character.
 */
function isNamespace(state, start, end) {
    var text = state.data;
    var tokenFrom = state.tokenStart;
    var nsLen = end - start;
    var tokenLen = state.tokenEnd - tokenFrom;
    if (tokenLen < nsLen) {
        return false;
    }
    var k = 0;
    for (; k < nsLen; ++k) {
        if (text.charCodeAt(start + k) !== text.charCodeAt(tokenFrom + k)) {
            return false;
        }
    }
    // same length: identical; otherwise the namespace must be followed by '.'
    return nsLen === tokenLen || text.charCodeAt(tokenFrom + k) === 46;
}
/**
 * Returns the index of the first '.' in the current token. If the token
 * contains no '.', the index where the scan stopped (the token end) is
 * returned.
 *
 * @param state Tokenizer state; reads `data`, `tokenStart`, `tokenEnd`.
 */
function getNamespaceEnd(state) {
    var cursor = state.tokenStart;
    while (cursor < state.tokenEnd && state.data.charCodeAt(cursor) !== 46) {
        ++cursor;
    }
    return cursor;
}
/**
 * Returns the namespace part of the current token as a string: the
 * input text from `tokenStart` up to (not including) `endIndex`.
 *
 * @param state    Tokenizer state; reads `data` and `tokenStart`.
 * @param endIndex End of the namespace, as computed by getNamespaceEnd().
 */
function getNamespace(state, endIndex) {
    var from = state.tokenStart;
    var text = state.data;
    return text.substring(from, endIndex);
}
/**
 * Returns true if the current token contains no '.', otherwise returns
 * false.
 *
 * @param state Tokenizer state; reads `data`, `tokenStart`, `tokenEnd`.
 */
function isFlatNamespace(state) {
    var text = state.data;
    var stop = state.tokenEnd;
    var cursor = state.tokenStart;
    while (cursor < stop) {
        if (text.charCodeAt(cursor) === 46) { // .
            return false;
        }
        ++cursor;
    }
    return true;
}
/**
 * Returns the text of the current token, i.e. the input between
 * `tokenStart` (inclusive) and `tokenEnd` (exclusive).
 *
 * @param state Tokenizer state; reads `data`, `tokenStart`, `tokenEnd`.
 */
function getTokenString(state) {
    var from = state.tokenStart;
    var to = state.tokenEnd;
    return state.data.substring(from, to);
}
/**
 * Move to the next token (comments included).
 *
 * Skips leading whitespace, then dispatches on the first character of
 * the token and records the result in `state.tokenType`:
 *   6 End, 5 Comment, 3 Value, 4 ColumnName, 0 Data, 1 Save, 2 Loop.
 *
 * @param state Tokenizer state.
 */
function moveNextInternal(state) {
    var lastSkipped = skipWhitespace(state);
    if (state.position >= state.length) {
        state.tokenType = 6 /* End */;
        return;
    }
    state.tokenStart = state.position;
    state.tokenEnd = state.position;
    state.isEscaped = false;
    var first = state.data.charCodeAt(state.position);
    // '#': comment running to the end of the line
    if (first === 35) {
        skipCommentLine(state);
        state.tokenType = 5 /* Comment */;
        return;
    }
    // ' or ": quoted value; only ' can open a triple-quoted value
    if (first === 39 || first === 34) {
        if (first === 39 && isTripleQuoteAtPosition(state)) {
            eatTripleQuote(state);
        }
        else {
            eatEscaped(state, first);
        }
        state.tokenType = 3 /* Value */;
        return;
    }
    // ';': a multiline value, but only when it starts a line
    if (first === 59) {
        if (lastSkipped === 10 || lastSkipped === 13) { // \n or \r
            eatMultiline(state);
        }
        else {
            eatValue(state);
        }
        state.tokenType = 3 /* Value */;
        return;
    }
    // anything else is a bare token
    if (state.isImportGet) {
        eatImportGet(state);
    }
    else {
        eatValue(state);
    }
    // escaped is always Value
    if (state.isEscaped) {
        state.tokenType = 3 /* Value */;
        return;
    }
    // _ means column name, including _import.get
    if (state.data.charCodeAt(state.tokenStart) === 95) {
        if (state.inSaveFrame && isImportGet(state)) {
            // the following token is read with eatImportGet
            state.isImportGet = true;
        }
        state.tokenType = 4 /* ColumnName */;
        return;
    }
    // 5th char needs to be _ for data_, save_ or loop_
    var keywordShaped = state.tokenEnd - state.tokenStart >= 5
        && state.data.charCodeAt(state.tokenStart + 4) === 95;
    if (keywordShaped && isData(state)) {
        state.tokenType = 0 /* Data */;
    }
    else if (keywordShaped && isSave(state)) {
        state.tokenType = 1 /* Save */;
    }
    else if (keywordShaped && isLoop(state)) {
        state.tokenType = 2 /* Loop */;
    }
    else {
        // all other tests failed, we are at Value token.
        state.tokenType = 3 /* Value */;
    }
}
/**
 * Moves to the next non-comment token: advances at least once and keeps
 * advancing while the current token is a comment (token type 5).
 *
 * @param state Tokenizer state.
 */
function moveNext(state) {
    do {
        moveNextInternal(state);
    } while (state.tokenType === 5 /* Comment */);
}
/**
 * Creates the initial tokenizer state for the given input.
 *
 * @param data       The text to tokenize.
 * @param runtimeCtx Context stored on the state as `runtimeCtx`.
 * @returns A state positioned at the start of `data`, on line 1, with
 *          an empty token of type End (6) and all flags cleared.
 */
function createTokenizer(data, runtimeCtx) {
    var END = 6;
    var tokenizer = {
        // input
        data: data,
        length: data.length,
        // cursor and current token
        position: 0,
        tokenStart: 0,
        tokenEnd: 0,
        tokenType: END,
        lineNumber: 1,
        // flags
        isEscaped: false,
        isImportGet: false,
        inSaveFrame: false,
        runtimeCtx: runtimeCtx
    };
    return tokenizer;
}
/**
 * Creates an empty frame context: a list of category names plus a
 * prototype-less dictionary holding the data collected per category.
 */
function FrameContext() {
    var names = [];
    var dataByName = Object.create(null);
    return { categoryNames: names, categoryData: dataByName };
}
/**
 * Builds a prototype-less dictionary that maps each category name to
 * the result of `Data.CifCategory` for that category's collected data.
 *
 * @param categoryNames Names of the categories, in order.
 * @param categoryData  Dictionary of `{ name, rowCount, fieldNames, fields }`
 *                      records keyed by category name.
 */
function CifCategories(categoryNames, categoryData) {
    var byName = Object.create(null);
    var index = 0;
    while (index < categoryNames.length) {
        var key = categoryNames[index];
        var entry = categoryData[key];
        byName[key] = Data.CifCategory(entry.name, entry.rowCount, entry.fieldNames, entry.fields);
        index++;
    }
    return byName;
}
/**
 * Creates a data block from a frame context via `Data.CifBlock`.
 *
 * @param ctx        Frame context holding the block's categories.
 * @param header     Block header text.
 * @param saveFrames Save frames belonging to the block.
 */
function CifBlock(ctx, header, saveFrames) {
    var names = ctx.categoryNames;
    var categories = CifCategories(names, ctx.categoryData);
    return Data.CifBlock(names, categories, header, saveFrames);
}
/**
 * Creates a save frame from a frame context. The frame is built with
 * `Data.CifBlock`, without passing any nested save frames.
 *
 * @param ctx    Frame context holding the save frame's categories.
 * @param header Save frame header text.
 */
function CifSaveFrame(ctx, header) {
    var names = ctx.categoryNames;
    var categories = CifCategories(names, ctx.categoryData);
    return Data.CifBlock(names, categories, header);
}
/**
 * Registers fields for a category in a frame context.
 *
 * A category seen for the first time is stored with the given row
 * count, field names and fields, and its name is appended to
 * `ctx.categoryNames`. For a category that already exists, the new
 * field names are appended and the new fields are merged into the
 * existing ones; the stored row count is left as it was.
 *
 * @param ctx        Frame context to update.
 * @param name       Category name.
 * @param rowCount   Number of rows (used only for a new category).
 * @param fieldNames Names of the fields being added.
 * @param fields     Dictionary of the fields being added.
 */
function addFields(ctx, name, rowCount, fieldNames, fields) {
    var store = ctx.categoryData;
    if (!(name in store)) {
        store[name] = { name: name, rowCount: rowCount, fieldNames: fieldNames, fields: fields };
        ctx.categoryNames.push(name);
        return;
    }
    var existing = store[name];
    Array.prototype.push.apply(existing.fieldNames, fieldNames);
    Object.assign(existing.fields, fields);
}
/**
 * Reads a category containing a single row.
 *
 * Starting at the current ColumnName token, consumes consecutive
 * name/value pairs for as long as the names share the namespace of the
 * first one, then registers the collected fields in `ctx` with a row
 * count of 1 (the leading character of the namespace is dropped from
 * the category name).
 *
 * @param tokenizer Tokenizer state positioned on a ColumnName token.
 * @param ctx       Frame context that receives the category.
 * @returns `{ hasError, errorLine, errorMessage }`; an error is reported
 *          when a name is not followed by a Value token.
 */
function handleSingle(tokenizer, ctx) {
    var nsStart = tokenizer.tokenStart;
    var nsEnd = getNamespaceEnd(tokenizer);
    var name = getNamespace(tokenizer, nsEnd);
    var collected = Object.create(null);
    var order = [];
    while (tokenizer.tokenType === 4 /* ColumnName */ && isNamespace(tokenizer, nsStart, nsEnd)) {
        // the part of the data name after "<namespace>."
        var fieldName = getTokenString(tokenizer).substring(name.length + 1);
        moveNext(tokenizer);
        if (tokenizer.tokenType !== 3 /* Value */) {
            return {
                hasError: true,
                errorLine: tokenizer.lineNumber,
                errorMessage: 'Expected value.'
            };
        }
        collected[fieldName] = Data.CifField.ofTokens({
            data: tokenizer.data,
            indices: [tokenizer.tokenStart, tokenizer.tokenEnd],
            count: 1
        });
        order.push(fieldName);
        moveNext(tokenizer);
    }
    addFields(ctx, name.slice(1), 1, order, collected);
    return {
        hasError: false,
        errorLine: 0,
        errorMessage: ''
    };
}
/**
 * Reads up to `chunkSize` consecutive Value tokens of a loop and adds
 * each one to the token builder of its column; values are assigned to
 * columns round-robin in the order they appear.
 *
 * @param chunkSize Maximum number of values to read in this call.
 * @param state     Loop state `{ tokenizer, tokens, fieldCount, tokenCount }`;
 *                  `tokenCount` is updated with the running total.
 * @returns The number of values read by this call.
 */
function readLoopChunk(chunkSize, state) {
    var tokenizer = state.tokenizer;
    var builders = state.tokens;
    var columns = state.fieldCount;
    var first = state.tokenCount;
    var index = first;
    while (tokenizer.tokenType === 3 /* Value */ && index - first < chunkSize) {
        tokenizer_1.TokenBuilder.add(builders[index % columns], tokenizer.tokenStart, tokenizer.tokenEnd);
        index++;
        moveNext(tokenizer);
    }
    state.tokenCount = index;
    return index - first;
}
/**
 * Reports loop-parsing progress through `ctx.update`: the tokenizer's
 * current position out of the total input length.
 *
 * @param ctx   Object exposing an `update` method.
 * @param state Loop state; reads `state.tokenizer`.
 * @returns Whatever `ctx.update` returns.
 */
function updateLoopChunk(ctx, state) {
    var tokenizer = state.tokenizer;
    var progress = {
        message: 'Parsing...',
        current: tokenizer.position,
        max: tokenizer.data.length
    };
    return ctx.update(progress);
}
// const readLoopChunks = ChunkedSubtask(1000000,
// (size, state: LoopReadState) => readLoopChunk(state, size),
// (ctx, state) => ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length }));
/**
 * Reads a loop.
 *
 * Expects the tokenizer to be positioned on the loop_ token. Reads the
 * column names that follow, then the values (through chunkedSubtask with
 * readLoopChunk / updateLoopChunk), and registers the result in `ctx`:
 * - if the first column name contains no '.', every column becomes its
 *   own category with a single field named '';
 * - otherwise all columns become fields of one category named after the
 *   namespace of the first column.
 * In both cases the leading character of the name is dropped.
 *
 * @param tokenizer Tokenizer state positioned on a Loop token.
 * @param ctx       Frame context that receives the categories.
 * @returns A promise of `{ hasError, errorLine, errorMessage }`. An error
 *          is reported when the loop declares no columns or when the
 *          number of values is not a multiple of the number of columns.
 */
function handleLoop(tokenizer, ctx) {
    return (0, tslib_1.__awaiter)(this, void 0, void 0, function () {
        var loopLine, name, isFlat, fieldNames, rowCountEstimate, tokens, fieldCount, state, rowCount, fields, i;
        return (0, tslib_1.__generator)(this, function (_a) {
            switch (_a.label) {
                case 0:
                    loopLine = tokenizer.lineNumber;
                    moveNext(tokenizer);
                    name = getNamespace(tokenizer, getNamespaceEnd(tokenizer));
                    isFlat = isFlatNamespace(tokenizer);
                    fieldNames = [];
                    while (tokenizer.tokenType === 4 /* ColumnName */) {
                        fieldNames[fieldNames.length] = isFlat
                            ? getTokenString(tokenizer)
                            : getTokenString(tokenizer).substring(name.length + 1);
                        moveNext(tokenizer);
                    }
                    fieldCount = fieldNames.length;
                    if (fieldCount === 0) {
                        // Without columns there is no builder to put values
                        // into (the column index would be `n % 0`, i.e. NaN),
                        // so report the malformed loop instead of reading on.
                        return [2 /*return*/, {
                                hasError: true,
                                errorLine: tokenizer.lineNumber,
                                errorMessage: "The loop starting at line " + loopLine + " does not declare any columns."
                            }];
                    }
                    // larger initial capacity for the (typically huge) _atom_site loop
                    rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32;
                    tokens = [];
                    for (i = 0; i < fieldCount; i++)
                        tokens[i] = tokenizer_1.TokenBuilder.create(tokenizer.data, rowCountEstimate);
                    state = {
                        fieldCount: fieldCount,
                        tokenCount: 0,
                        tokenizer: tokenizer,
                        tokens: tokens
                    };
                    return [4 /*yield*/, (0, mol_task_1.chunkedSubtask)(tokenizer.runtimeCtx, 1000000, state, readLoopChunk, updateLoopChunk)];
                case 1:
                    _a.sent();
                    if (state.tokenCount % fieldCount !== 0) {
                        return [2 /*return*/, {
                                hasError: true,
                                errorLine: tokenizer.lineNumber,
                                errorMessage: "The number of values for loop starting at line " + loopLine + " is not a multiple of the number of columns."
                            }];
                    }
                    rowCount = (state.tokenCount / fieldCount) | 0;
                    if (isFlat) {
                        // each column is a category of its own with one unnamed field
                        for (i = 0; i < fieldCount; i++) {
                            fields = { '': Data.CifField.ofTokens(tokens[i]) };
                            addFields(ctx, fieldNames[i].substr(1), rowCount, [''], fields);
                        }
                    }
                    else {
                        fields = Object.create(null);
                        for (i = 0; i < fieldCount; i++) {
                            fields[fieldNames[i]] = Data.CifField.ofTokens(tokens[i]);
                        }
                        addFields(ctx, name.substr(1), rowCount, fieldNames, fields);
                    }
                    return [2 /*return*/, {
                            hasError: false,
                            errorLine: 0,
                            errorMessage: ''
                        }];
            }
        });
    });
}
/**
 * Creates an error result via `ReaderResult.error`.
 *
 * @param line    Line number the error refers to.
 * @param message Error message.
 */
function error(line, message) {
    var ReaderResult = result_1.ReaderResult;
    return ReaderResult.error(message, line);
}
/**
 * Creates a data (success) result via `ReaderResult.success`.
 *
 * @param data The parsed data to wrap.
 */
function result(data) {
    var ReaderResult = result_1.ReaderResult;
    return ReaderResult.success(data);
}
/**
 * Parses an mmCIF file.
 *
 * Walks the token stream and dispatches on the token type:
 * - data_  starts a new data block (the previous block is emitted when it
 *          holds any categories or save frames);
 * - save_  toggles a save frame (a bare "save_" closes it, "save_<name>"
 *          opens one); while a frame is open, categories go into it;
 * - loop_  is read by handleLoop;
 * - a data name is read by handleSingle.
 * Any other token is an error.
 *
 * @param data       The CIF text.
 * @param runtimeCtx Runtime context used for progress updates.
 * @returns A promise of a ReaderResult: a success wrapping
 *          `Data.CifFile(dataBlocks)`, or an error with a line number.
 */
function parseInternal(data, runtimeCtx) {
    return (0, tslib_1.__awaiter)(this, void 0, void 0, function () {
        var dataBlocks, tokenizer, blockHeader, blockCtx, saveFrames, saveCtx, saveHeader, token, cat;
        return (0, tslib_1.__generator)(this, function (_a) {
            switch (_a.label) {
                case 0:
                    dataBlocks = [];
                    tokenizer = createTokenizer(data, runtimeCtx);
                    blockHeader = '';
                    blockCtx = FrameContext();
                    saveFrames = [];
                    saveCtx = FrameContext();
                    saveHeader = '';
                    runtimeCtx.update({ message: 'Parsing...', current: 0, max: data.length });
                    moveNext(tokenizer);
                    _a.label = 1;
                case 1:
                    if (!(tokenizer.tokenType !== 6 /* End */)) return [3 /*break*/, 7];
                    token = tokenizer.tokenType;
                    if (!(token === 0 /* Data */)) return [3 /*break*/, 2];
                    if (tokenizer.inSaveFrame) {
                        return [2 /*return*/, error(tokenizer.lineNumber, 'Unexpected data block inside a save frame.')];
                    }
                    // Emit the block collected so far. A block that only holds
                    // save frames counts as non-empty, matching the check made
                    // at the end of the input.
                    if (blockCtx.categoryNames.length > 0 || saveFrames.length > 0) {
                        dataBlocks.push(CifBlock(blockCtx, blockHeader, saveFrames));
                    }
                    // header is the text after "data_"
                    blockHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
                    blockCtx = FrameContext();
                    saveFrames = [];
                    moveNext(tokenizer);
                    return [3 /*break*/, 6];
                case 2:
                    if (!(token === 1 /* Save */)) return [3 /*break*/, 3];
                    if (tokenizer.tokenEnd - tokenizer.tokenStart === 5) { // end of save frame
                        if (saveCtx.categoryNames.length > 0) {
                            saveFrames[saveFrames.length] = CifSaveFrame(saveCtx, saveHeader);
                        }
                        tokenizer.inSaveFrame = false;
                    }
                    else { // start of save frame
                        if (tokenizer.inSaveFrame) {
                            return [2 /*return*/, error(tokenizer.lineNumber, 'Save frames cannot be nested.')];
                        }
                        tokenizer.inSaveFrame = true;
                        // header is the text after "save_"
                        saveHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
                        saveCtx = FrameContext();
                    }
                    moveNext(tokenizer);
                    return [3 /*break*/, 6];
                case 3:
                    if (!(token === 2 /* Loop */)) return [3 /*break*/, 5];
                    return [4 /*yield*/, handleLoop(tokenizer, tokenizer.inSaveFrame ? saveCtx : blockCtx)];
                case 4:
                    cat = _a.sent();
                    if (cat.hasError) {
                        return [2 /*return*/, error(cat.errorLine, cat.errorMessage)];
                    }
                    return [3 /*break*/, 6];
                case 5:
                    if (token === 4 /* ColumnName */) {
                        cat = handleSingle(tokenizer, tokenizer.inSaveFrame ? saveCtx : blockCtx);
                        if (cat.hasError) {
                            return [2 /*return*/, error(cat.errorLine, cat.errorMessage)];
                        }
                        // Out of options
                    }
                    else {
                        console.log(tokenizer.tokenType, tokenizer_1.Tokenizer.getTokenString(tokenizer));
                        return [2 /*return*/, error(tokenizer.lineNumber, 'Unexpected token. Expected data_, loop_, or data name.')];
                    }
                    _a.label = 6;
                case 6: return [3 /*break*/, 1];
                case 7:
                    // Check if the latest save frame was closed.
                    if (tokenizer.inSaveFrame) {
                        // saveHeader holds the name of the frame that is still open
                        return [2 /*return*/, error(tokenizer.lineNumber, "Unfinished save frame (" + saveHeader + ").")];
                    }
                    if (blockCtx.categoryNames.length > 0 || saveFrames.length > 0) {
                        dataBlocks.push(CifBlock(blockCtx, blockHeader, saveFrames));
                    }
                    return [2 /*return*/, result(Data.CifFile(dataBlocks))];
            }
        });
    });
}
/**
 * Creates a 'Parse CIF' task that parses the given CIF text with
 * parseInternal when run.
 *
 * @param data The CIF text.
 * @returns The task created by `Task.create`; it resolves to the value
 *          produced by parseInternal.
 */
function parseCifText(data) {
    var self = this;
    function run(ctx) {
        return (0, tslib_1.__awaiter)(self, void 0, void 0, function () {
            var parsed;
            return (0, tslib_1.__generator)(this, function (step) {
                switch (step.label) {
                    case 0: return [4 /*yield*/, parseInternal(data, ctx)];
                    case 1:
                        parsed = step.sent();
                        return [2 /*return*/, parsed];
                }
            });
        });
    }
    return mol_task_1.Task.create('Parse CIF', run);
}
exports.parseCifText = parseCifText;
//# sourceMappingURL=parser.js.map