"use strict";
/**
* Copyright (c) 2017-2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
*
* @author David Sehnal <david.sehnal@gmail.com>
* @author Alexander Rose <alexander.rose@weirdbyte.de>
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseCifText = parseCifText;
const tslib_1 = require("tslib");
/**
* mmCIF parser.
*
* Tries to stay as close as possible to the specification: http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
*
* Differences I'm aware of:
* - Apart from the keywords (data_, loop_, save_), everything is case sensitive.
* - The tokens . and ? are treated the same as the values '.' and '?'.
* - Ignores \ in the multiline values:
* ;abc\
* efg
* ;
* should have the value 'abcefg' but will instead have the value 'abc\\nefg' (the backslash and line break are kept).
* Post processing of this is left to the consumer of the data.
* - Similarly, things like punctuation (\', ..) are left to be processed by the user if needed.
*
*/
const Data = tslib_1.__importStar(require("../data-model"));
const tokenizer_1 = require("../../common/text/tokenizer");
const result_1 = require("../../result");
const mol_task_1 = require("../../../../mol-task");
/**
* Eat everything until a whitespace/newline occurs.
*/
function eatValue(state) {
while (state.position < state.length) {
switch (state.data.charCodeAt(state.position)) {
case 9: // \t
case 10: // \n
case 13: // \r
case 32: // ' '
state.tokenEnd = state.position;
return;
default:
++state.position;
break;
}
}
state.tokenEnd = state.position;
}
/**
* Eats an escaped value. Handles the "degenerate" cases as well.
*
* "Degenerate" cases:
* - 'xx'x' => xx'x
* - 'xxxNEWLINE => 'xxx
*
*/
function eatEscaped(state, esc) {
let next, c;
++state.position;
while (state.position < state.length) {
c = state.data.charCodeAt(state.position);
if (c === esc) {
next = state.data.charCodeAt(state.position + 1);
switch (next) {
case 9: // \t
case 10: // \n
case 13: // \r
case 32: // ' '
// get rid of the quotes.
state.tokenStart++;
state.tokenEnd = state.position;
state.isEscaped = true;
++state.position;
return;
default:
if (next === void 0) { // = "end of stream"
// get rid of the quotes.
state.tokenStart++;
state.tokenEnd = state.position;
state.isEscaped = true;
++state.position;
return;
}
++state.position;
break;
}
}
else {
// handle 'xxxNEWLINE => 'xxx
if (c === 10 || c === 13) {
state.tokenEnd = state.position;
return;
}
++state.position;
}
}
state.tokenEnd = state.position;
}
/**
* Eats an escaped value "triple quote" (''') value.
*/
function eatTripleQuote(state) {
// skip the '''
state.position += 3;
while (state.position < state.length) {
if (state.data.charCodeAt(state.position) === 39 /* ' */ && isTripleQuoteAtPosition(state)) {
// get rid of the quotes.
state.tokenStart += 3;
state.tokenEnd = state.position;
state.isEscaped = true;
state.position += 3;
return;
}
++state.position;
}
state.tokenEnd = state.position;
}
/**
* Eats a multiline token of the form NL;....NL;
*/
function eatMultiline(state) {
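// prev tracks the previously read character and starts as the opening ';' (59) at state.position;
// a closing ';' is only recognized when it immediately follows a newline.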
let prev = 59, pos = state.position + 1, c;
while (pos < state.length) {
c = state.data.charCodeAt(pos);
if (c === 59 && (prev === 10 || prev === 13)) { // ;, \n \r
state.position = pos + 1;
// get rid of the ;
state.tokenStart++;
// remove trailing newlines
pos--;
c = state.data.charCodeAt(pos);
while (c === 10 || c === 13) {
pos--;
c = state.data.charCodeAt(pos);
}
state.tokenEnd = pos + 1;
state.isEscaped = true;
return;
}
else {
// handle line numbers
if (c === 13) { // \r
state.lineNumber++;
}
else if (c === 10 && prev !== 13) { // \r\n
state.lineNumber++;
}
prev = c;
++pos;
}
}
state.position = pos;
return prev;
}
function eatImportGet(state) {
// _import.get [{'save':orient_matrix 'file':templ_attr.cif}]
// skipWhitespace(state)
while (state.position < state.length) {
switch (state.data.charCodeAt(state.position)) {
case 93: // ]
++state.position;
state.tokenEnd = state.position;
state.isImportGet = false;
return;
default:
++state.position;
break;
}
}
}
/**
* Skips until \n or \r occurs -- therefore the newlines get handled by the "skipWhitespace" function.
*/
function skipCommentLine(state) {
while (state.position < state.length) {
const c = state.data.charCodeAt(state.position);
if (c === 10 || c === 13) {
return;
}
++state.position;
}
}
/**
* Skips all the whitespace - space, tab, newline, CR
* Handles incrementing line count.
*/
function skipWhitespace(state) {
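// prev starts as '\n' (10) so that a ';' at the very start of the input is treated as the beginning of a multiline value.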
let prev = 10;
while (state.position < state.length) {
const c = state.data.charCodeAt(state.position);
switch (c) {
case 9: // '\t'
case 32: // ' '
prev = c;
++state.position;
break;
case 10: // \n
// handle \r\n
if (prev !== 13) {
++state.lineNumber;
}
prev = c;
++state.position;
break;
case 13: // \r
prev = c;
++state.position;
++state.lineNumber;
break;
default:
return prev;
}
}
return prev;
}
/**
* Returns true if there are two consecutive ' characters at positions +1 and +2.
*/
function isTripleQuoteAtPosition(state) {
if (state.length - state.position < 2)
return false;
if (state.data.charCodeAt(state.position + 1) !== 39)
return false; // '
if (state.data.charCodeAt(state.position + 2) !== 39)
return false; // '
return true;
}
function isData(state) {
// here we already assume the 5th char is _ and that the length >= 5
// d/D
let c = state.data.charCodeAt(state.tokenStart);
if (c !== 68 && c !== 100)
return false;
// a/A
c = state.data.charCodeAt(state.tokenStart + 1);
if (c !== 65 && c !== 97)
return false;
// t/T
c = state.data.charCodeAt(state.tokenStart + 2);
if (c !== 84 && c !== 116)
return false;
// a/A
c = state.data.charCodeAt(state.tokenStart + 3);
if (c !== 65 && c !== 97)
return false;
return true;
}
function isSave(state) {
// here we already assume the 5th char is _ and that the length >= 5
// s/S
let c = state.data.charCodeAt(state.tokenStart);
if (c !== 83 && c !== 115)
return false;
// a/A
c = state.data.charCodeAt(state.tokenStart + 1);
if (c !== 65 && c !== 97)
return false;
// v/V
c = state.data.charCodeAt(state.tokenStart + 2);
if (c !== 86 && c !== 118)
return false;
// e/E
c = state.data.charCodeAt(state.tokenStart + 3);
if (c !== 69 && c !== 101)
return false;
return true;
}
function isLoop(state) {
// here we already assume the 5th char is _ and that the length >= 5
if (state.tokenEnd - state.tokenStart !== 5)
return false;
// l/L
let c = state.data.charCodeAt(state.tokenStart);
if (c !== 76 && c !== 108)
return false;
// o/O
c = state.data.charCodeAt(state.tokenStart + 1);
if (c !== 79 && c !== 111)
return false;
// o/O
c = state.data.charCodeAt(state.tokenStart + 2);
if (c !== 79 && c !== 111)
return false;
// p/P
c = state.data.charCodeAt(state.tokenStart + 3);
if (c !== 80 && c !== 112)
return false;
return true;
}
function isImportGet(state) {
// _import.get [{'save':orient_matrix 'file':templ_attr.cif}]
if (state.tokenEnd - state.tokenStart !== 11)
return false;
if (state.data.charCodeAt(state.tokenStart + 1) !== 105)
return false; // i
if (state.data.charCodeAt(state.tokenStart + 2) !== 109)
return false; // m
if (state.data.charCodeAt(state.tokenStart + 3) !== 112)
return false; // p
if (state.data.charCodeAt(state.tokenStart + 4) !== 111)
return false; // o
if (state.data.charCodeAt(state.tokenStart + 5) !== 114)
return false; // r
if (state.data.charCodeAt(state.tokenStart + 6) !== 116)
return false; // t
if (state.data.charCodeAt(state.tokenStart + 7) !== 46)
return false; // .
if (state.data.charCodeAt(state.tokenStart + 8) !== 103)
return false; // g
if (state.data.charCodeAt(state.tokenStart + 9) !== 101)
return false; // e
if (state.data.charCodeAt(state.tokenStart + 10) !== 116)
return false; // t
return true;
}
/**
* Checks if the current token shares the namespace with string at <start,end).
*/
function isNamespace(state, start, end) {
let i;
const nsLen = end - start;
const offset = state.tokenStart - start;
const tokenLen = state.tokenEnd - state.tokenStart;
if (tokenLen < nsLen)
return false;
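// compare the reference namespace <start,end) character by character with the prefix of the current token
// (offset maps positions in the reference string onto the token).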
for (i = start; i < end; ++i) {
if (state.data.charCodeAt(i) !== state.data.charCodeAt(i + offset))
return false;
}
if (nsLen === tokenLen)
return true;
if (state.data.charCodeAt(i + offset) === 46) { // .
return true;
}
return false;
}
/**
* Returns the index of '.' in the current token. If no '.' is present, returns state.tokenEnd.
*/
function getNamespaceEnd(state) {
let i;
for (i = state.tokenStart; i < state.tokenEnd; ++i) {
if (state.data.charCodeAt(i) === 46)
return i;
}
return i;
}
/**
* Get the namespace string. endIndex is obtained by the getNamespaceEnd() function.
*/
function getNamespace(state, endIndex) {
return state.data.substring(state.tokenStart, endIndex);
}
/**
* Returns true if the current token contains no '.', otherwise returns false.
*/
function isFlatNamespace(state) {
let i;
for (i = state.tokenStart; i < state.tokenEnd; ++i) {
if (state.data.charCodeAt(i) === 46)
return false;
}
return true;
}
/**
* String representation of the current token.
*/
function getTokenString(state) {
return state.data.substring(state.tokenStart, state.tokenEnd);
}
/**
* Move to the next token.
*/
function moveNextInternal(state) {
const prev = skipWhitespace(state);
if (state.position >= state.length) {
state.tokenType = 6 /* CifTokenType.End */;
return;
}
state.tokenStart = state.position;
state.tokenEnd = state.position;
state.isEscaped = false;
const c = state.data.charCodeAt(state.position);
switch (c) {
case 35: // #, comment
skipCommentLine(state);
state.tokenType = 5 /* CifTokenType.Comment */;
break;
case 39: // ', escaped value
if (isTripleQuoteAtPosition(state)) {
eatTripleQuote(state);
state.tokenType = 3 /* CifTokenType.Value */;
break;
}
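// not a triple quote: intentionally fall through and handle the single quote like a double-quoted value below.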
case 34: // ", escaped value
eatEscaped(state, c);
state.tokenType = 3 /* CifTokenType.Value */;
break;
case 59: // ;, possible multiline value
// multiline value must start at the beginning of the line.
if (prev === 10 || prev === 13) { // \n or \r
eatMultiline(state);
}
else {
eatValue(state);
}
state.tokenType = 3 /* CifTokenType.Value */;
break;
default:
if (state.isImportGet) {
eatImportGet(state);
}
else {
eatValue(state);
}
// escaped is always Value
if (state.isEscaped) {
state.tokenType = 3 /* CifTokenType.Value */;
// _ means column name, including _import.get
}
else if (state.data.charCodeAt(state.tokenStart) === 95) { // _
if (state.inSaveFrame && isImportGet(state)) {
state.isImportGet = true;
}
state.tokenType = 4 /* CifTokenType.ColumnName */;
// 5th char needs to be _ for data_, save_ or loop_
}
else if (state.tokenEnd - state.tokenStart >= 5 && state.data.charCodeAt(state.tokenStart + 4) === 95) {
if (isData(state))
state.tokenType = 0 /* CifTokenType.Data */;
else if (isSave(state))
state.tokenType = 1 /* CifTokenType.Save */;
else if (isLoop(state))
state.tokenType = 2 /* CifTokenType.Loop */;
else
state.tokenType = 3 /* CifTokenType.Value */;
// all other tests failed, we are at a Value token.
}
else {
state.tokenType = 3 /* CifTokenType.Value */;
}
break;
}
}
/**
* Moves to the next non-comment token.
*/
function moveNext(state) {
moveNextInternal(state);
while (state.tokenType === 5 /* CifTokenType.Comment */)
moveNextInternal(state);
}
function createTokenizer(data, runtimeCtx) {
return {
data,
length: data.length,
position: 0,
tokenStart: 0,
tokenEnd: 0,
tokenType: 6 /* CifTokenType.End */,
lineNumber: 1,
isEscaped: false,
isImportGet: false,
inSaveFrame: false,
runtimeCtx
};
}
function FrameContext() {
return { categoryNames: [], categoryData: Object.create(null) };
}
function CifCategories(categoryNames, categoryData) {
const categories = Object.create(null);
for (const name of categoryNames) {
const d = categoryData[name];
categories[name] = Data.CifCategory(d.name, d.rowCount, d.fieldNames, d.fields);
}
return categories;
}
function CifBlock(ctx, header, saveFrames) {
return Data.CifBlock(ctx.categoryNames, CifCategories(ctx.categoryNames, ctx.categoryData), header, saveFrames);
}
function CifSaveFrame(ctx, header) {
return Data.CifBlock(ctx.categoryNames, CifCategories(ctx.categoryNames, ctx.categoryData), header);
}
function addFields(ctx, name, rowCount, fieldNames, fields) {
if (name in ctx.categoryData) {
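// the category was already defined earlier (e.g. partly as key-value pairs and partly as a loop): merge the new fields into it.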
const cat = ctx.categoryData[name];
cat.fieldNames.push(...fieldNames);
Object.assign(cat.fields, fields);
}
else {
ctx.categoryData[name] = { name, rowCount, fieldNames, fields };
ctx.categoryNames.push(name);
}
}
/**
* Reads a category containing a single row.
*/
function handleSingle(tokenizer, ctx) {
const nsStart = tokenizer.tokenStart, nsEnd = getNamespaceEnd(tokenizer);
const name = getNamespace(tokenizer, nsEnd);
const fields = Object.create(null);
const fieldNames = [];
let readingNames = true;
while (readingNames) {
if (tokenizer.tokenType !== 4 /* CifTokenType.ColumnName */ || !isNamespace(tokenizer, nsStart, nsEnd)) {
readingNames = false;
break;
}
const fieldName = getTokenString(tokenizer).substring(name.length + 1);
moveNext(tokenizer);
if (tokenizer.tokenType !== 3 /* CifTokenType.Value */) {
return {
hasError: true,
errorLine: tokenizer.lineNumber,
errorMessage: 'Expected value.'
};
}
fields[fieldName] = Data.CifField.ofTokens({ data: tokenizer.data, indices: [tokenizer.tokenStart, tokenizer.tokenEnd], count: 1 });
fieldNames[fieldNames.length] = fieldName;
moveNext(tokenizer);
}
addFields(ctx, name.substr(1), 1, fieldNames, fields);
return {
hasError: false,
errorLine: 0,
errorMessage: ''
};
}
function readLoopChunk(chunkSize, state) {
const { tokenizer, tokens, fieldCount } = state;
let tokenCount = state.tokenCount;
let counter = 0;
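// loop values appear row-major in the file; distribute them round-robin into the per-column token builders.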
while (tokenizer.tokenType === 3 /* CifTokenType.Value */ && counter < chunkSize) {
tokenizer_1.TokenBuilder.add(tokens[(tokenCount++) % fieldCount], tokenizer.tokenStart, tokenizer.tokenEnd);
moveNext(tokenizer);
counter++;
}
state.tokenCount = tokenCount;
return counter;
}
function updateLoopChunk(ctx, state) {
return ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length });
}
// const readLoopChunks = ChunkedSubtask(1000000,
// (size, state: LoopReadState) => readLoopChunk(state, size),
// (ctx, state) => ctx.update({ message: 'Parsing...', current: state.tokenizer.position, max: state.tokenizer.data.length }));
/**
* Reads a loop.
*/
async function handleLoop(tokenizer, ctx) {
const loopLine = tokenizer.lineNumber;
moveNext(tokenizer);
const name = getNamespace(tokenizer, getNamespaceEnd(tokenizer));
const isFlat = isFlatNamespace(tokenizer);
const fieldNames = [];
while (tokenizer.tokenType === 4 /* CifTokenType.ColumnName */) {
fieldNames[fieldNames.length] = isFlat
? getTokenString(tokenizer)
: getTokenString(tokenizer).substring(name.length + 1);
moveNext(tokenizer);
}
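// heuristic row-count estimate: _atom_site is typically by far the largest category; other categories start small and the builders grow as needed.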
const rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32;
const tokens = [];
const fieldCount = fieldNames.length;
for (let i = 0; i < fieldCount; i++)
tokens[i] = tokenizer_1.TokenBuilder.create(tokenizer.data, rowCountEstimate);
const state = {
fieldCount,
tokenCount: 0,
tokenizer,
tokens
};
await (0, mol_task_1.chunkedSubtask)(tokenizer.runtimeCtx, 1000000, state, readLoopChunk, updateLoopChunk);
if (state.tokenCount % fieldCount !== 0) {
return {
hasError: true,
errorLine: tokenizer.lineNumber,
errorMessage: `The number of values for loop starting at line ${loopLine} is not a multiple of the number of columns.`
};
}
const rowCount = (state.tokenCount / fieldCount) | 0;
if (isFlat) {
for (let i = 0; i < fieldCount; i++) {
const fields = { '': Data.CifField.ofTokens(tokens[i]) };
addFields(ctx, fieldNames[i].substr(1), rowCount, [''], fields);
}
}
else {
const fields = Object.create(null);
for (let i = 0; i < fieldCount; i++) {
fields[fieldNames[i]] = Data.CifField.ofTokens(tokens[i]);
}
addFields(ctx, name.substr(1), rowCount, fieldNames, fields);
}
return {
hasError: false,
errorLine: 0,
errorMessage: ''
};
}
/**
* Creates an error result.
*/
function error(line, message) {
return result_1.ReaderResult.error(message, line);
}
/**
* Creates a data result.
*/
function result(data) {
return result_1.ReaderResult.success(data);
}
/**
* Parses an mmCIF file.
*
* @returns CifParserResult wrapper of the result.
*/
async function parseInternal(data, runtimeCtx) {
const dataBlocks = [];
const tokenizer = createTokenizer(data, runtimeCtx);
let blockHeader = '';
let blockCtx = FrameContext();
// the next three initial values are never used in valid files
let saveFrames = [];
let saveCtx = FrameContext();
const saveFrame = Data.CifSaveFrame(saveCtx.categoryNames, CifCategories(saveCtx.categoryNames, saveCtx.categoryData), '');
let saveHeader = '';
runtimeCtx.update({ message: 'Parsing...', current: 0, max: data.length });
moveNext(tokenizer);
while (tokenizer.tokenType !== 6 /* CifTokenType.End */) {
const token = tokenizer.tokenType;
// Data block
if (token === 0 /* CifTokenType.Data */) {
if (tokenizer.inSaveFrame) {
return error(tokenizer.lineNumber, 'Unexpected data block inside a save frame.');
}
if (blockCtx.categoryNames.length > 0) {
dataBlocks.push(CifBlock(blockCtx, blockHeader, saveFrames));
}
blockHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
blockCtx = FrameContext();
saveFrames = [];
moveNext(tokenizer);
// Save frame
}
else if (token === 1 /* CifTokenType.Save */) {
if (tokenizer.tokenEnd - tokenizer.tokenStart === 5) { // end of save frame
if (saveCtx.categoryNames.length > 0) {
saveFrames[saveFrames.length] = CifSaveFrame(saveCtx, saveHeader);
}
tokenizer.inSaveFrame = false;
}
else { // start of save frame
if (tokenizer.inSaveFrame) {
return error(tokenizer.lineNumber, 'Save frames cannot be nested.');
}
tokenizer.inSaveFrame = true;
saveHeader = data.substring(tokenizer.tokenStart + 5, tokenizer.tokenEnd);
saveCtx = FrameContext();
// saveFrame = CifSaveFrame(saveCtx, saveHeader);
}
moveNext(tokenizer);
// Loop
}
else if (token === 2 /* CifTokenType.Loop */) {
const cat = await handleLoop(tokenizer, tokenizer.inSaveFrame ? saveCtx : blockCtx);
if (cat.hasError) {
return error(cat.errorLine, cat.errorMessage);
}
// Single row
}
else if (token === 4 /* CifTokenType.ColumnName */) {
const cat = handleSingle(tokenizer, tokenizer.inSaveFrame ? saveCtx : blockCtx);
if (cat.hasError) {
return error(cat.errorLine, cat.errorMessage);
}
// Out of options
}
else {
console.log(tokenizer.tokenType, tokenizer_1.Tokenizer.getTokenString(tokenizer));
return error(tokenizer.lineNumber, 'Unexpected token. Expected data_, loop_, or data name.');
}
}
// Check if the latest save frame was closed.
if (tokenizer.inSaveFrame) {
return error(tokenizer.lineNumber, `Unfinished save frame (${saveFrame.header}).`);
}
if (blockCtx.categoryNames.length > 0 || saveFrames.length > 0) {
dataBlocks.push(CifBlock(blockCtx, blockHeader, saveFrames));
}
return result(Data.CifFile(dataBlocks));
}
function parseCifText(data) {
return mol_task_1.Task.create('Parse CIF', async (ctx) => {
return await parseInternal(data, ctx);
});
}