UNPKG

crystcif-parse

Version:

A parser for crystallographic CIF files

359 lines (318 loc) 11.2 kB
'use strict';

var tokens = require('./tokens.js');

/**
 * Represents a single value (string or numerical) in a CIF file.
 * @class
 * @param {string} type     Type of the value (int, float, string, mstring,
 *                          N/A or ?)
 * @param {*}      [value]  Parsed value itself (should be appropriate to
 *                          type, unnecessary for N/A and ?)
 * @param {int}    [prec]   Precision number (only for numerals)
 */
var CifValue = function(type, value, prec) {
    /** @member {string} */
    this.type = type;
    /** @member {int} */
    this.prec = prec;

    // Only one of `num` / `text` is ever set, depending on the type;
    // for any other type (e.g. 'N/A', '?') neither is.
    switch (type) {
        case 'int':
        case 'float':
            /** @member {number} */
            this.num = value;
            break;
        case 'string':
        case 'mstring':
            /** @member {string} */
            this.text = value;
            break;
        default:
            break;
    }
};

CifValue.prototype = {
    /**
     * Universal accessor: returns the numeric value if one was stored,
     * otherwise the text (which is undefined for N/A and ? values).
     * @return {number|string|undefined}
     */
    get_value: function() {
        return (this.num !== undefined) ? this.num : this.text;
    }
};

/**
 * Builds (does not throw) an Error whose message includes a line number.
 * @param  {string} msg   Error message
 * @param  {int}    line  Line number
 * @return {Error}        Error object ready to be thrown by the caller
 */
function errormsg(msg, line) {
    return new Error("ERROR @ line " + line + ": " + msg);
}

/**
 * Split a text CIF file into elementary tokens for further processing.
 *
 * Each token is an object {val, type, line} where `type` is one of
 * 'quotestring', 'semicolontext', 'tag', 'data_headers', 'loop_kw' or
 * 'unknown' (anything delimited by whitespace that is none of the others),
 * and `line` is the 1-based line on which the token starts.
 *
 * @param  {string} cif  CIF file in text string format
 * @return {Array}       Array of parsed tokens
 * @throws {Error}       If a token pattern matches without consuming input
 */
function tokenize(cif) {

    // Regular expressions are all supplied by the tokens module
    var eol_re = tokens.tokenRegex('end_of_line', false, false);
    var whitespace_re = tokens.tokenRegex('whitespace', false, false);

    // Recognised token kinds, tried in this order at the start of the
    // remaining text. NOTE: the label for data headers is 'data_headers'
    // (plural), which differs from the regex name; parseDataBlocks relies
    // on that exact label.
    var known = [
        ['quotestring', tokens.tokenRegex('quotestring', true)],
        ['semicolontext', tokens.tokenRegex('semicolontext', true)],
        ['tag', tokens.tokenRegex('tag', true)],
        ['data_headers', tokens.tokenRegex('data_header', true)],
        ['loop_kw', tokens.tokenRegex('loop_kw', true)]
    ];

    var tokenized = [];
    var line_index = 1;
    var cifsl = cif;

    while (cifsl.length > 0) {
        var slice_i = 0; // Number of characters consumed in this pass

        // First, try to see if it's any of the various non-whitespace types
        var m = null;
        var m_label = null;
        for (var k = 0; k < known.length; ++k) {
            m = cifsl.match(known[k][1]);
            if (m) {
                m_label = known[k][0];
                break;
            }
        }

        if (m) {
            tokenized.push({
                'val': m[0],
                'type': m_label,
                'line': line_index
            });
            slice_i = m[0].length;
        } else {
            // Now check for whitespace. The same regex object is reused on
            // every pass, so reset its search position first.
            whitespace_re.lastIndex = 0;
            var w = whitespace_re.exec(cifsl);
            if (!w) {
                // No whitespace left: the rest is one last plain value
                tokenized.push({
                    'val': cifsl,
                    'type': 'unknown',
                    'line': line_index
                });
                slice_i = cifsl.length;
            } else if (w.index == 0) {
                // Leading whitespace: just trim it
                slice_i = w[0].length;
            } else {
                // A regular value/string running up to the next whitespace
                tokenized.push({
                    'val': cifsl.slice(0, w.index),
                    'type': 'unknown',
                    'line': line_index
                });
                slice_i = w.index + w[0].length;
            }
        }

        // A zero-length match would leave the input untouched and spin
        // forever; fail loudly instead.
        if (slice_i === 0) {
            throw errormsg('Tokenizer could not advance', line_index);
        }

        // Slice out the consumed part and count the line breaks inside it
        var parsed = cifsl.slice(0, slice_i);
        cifsl = cifsl.slice(slice_i);
        var newlines = parsed.match(eol_re);
        if (newlines) {
            line_index += newlines.length;
        }
    }

    return tokenized;
}
module.exports.tokenize = tokenize;

/**
 * Parse a single token as a value.
 *
 * Quoted and semicolon-delimited tokens become strings with their first and
 * last character stripped. 'unknown' tokens become N/A ('.'), ? ('?'), a
 * number (int or float, optionally with a precision) or, failing all that,
 * an unquoted string.
 *
 * @param  {Object}   tok  Token to parse (must not be a reserved keyword
 *                         like a data_ or loop_ token)
 * @return {CifValue}      Parsed value, or null if the token type is not a
 *                         value type
 */
function parseValue(tok) {

    // If it's a string, easy one: drop the delimiters
    if (tok.type == 'quotestring') {
        return new CifValue('string', tok.val.slice(1, tok.val.length - 1));
    }
    if (tok.type == 'semicolontext') {
        return new CifValue('mstring', tok.val.slice(1, tok.val.length - 1));
    }
    if (tok.type != 'unknown') {
        // Something's wrong
        return null;
    }

    // We now know it's unknown, so...
    var strval = tok.val;
    var trimmed = strval.trim();

    // First, check for special types
    if (trimmed == '.') {
        return new CifValue('N/A');
    }
    if (trimmed == '?') {
        return new CifValue('?');
    }

    // It can be a numeric value
    var m = tokens.tokenRegex('numeric', true, true).exec(trimmed);
    if (m) {
        // Does it have a precision?
        var prec = null;
        var strnum = m[3]; // Will be undefined if there's a precision
        if (strnum === undefined) {
            prec = parseInt(m[2], 10);
            strnum = m[1];
        }

        // Integer or float?
        var type;
        var num;
        if (strnum.match(tokens.tokenRegex('float', true, true))) {
            num = parseFloat(strnum);
            type = 'float';
        } else {
            num = parseInt(strnum, 10);
            type = 'int';
        }

        return new CifValue(type, num, prec);
    }

    // Or it's just an unquoted string (kept as found, not trimmed)
    return new CifValue('string', strval);
}
module.exports.parseValue = parseValue;

/**
 * Finds and splits the data blocks from a tokenized CIF file.
 *
 * Tokens appearing before the first data header are not part of any block
 * and are dropped.
 *
 * @param  {Array} ciftokens  Array of tokens contained in the file
 * @return {Array}            Array of data blocks in the form
 *                            [name, [tokens]]
 * @throws {Error}            If a data header does not contain exactly one
 *                            match of the tag pattern
 */
function parseDataBlocks(ciftokens) {

    // Identify all data blocks
    var tagre = tokens.tokenRegex('tag');
    var data_headers = []; // Entries are [index of header token, block name]
    var i;

    for (i = 0; i < ciftokens.length; ++i) {
        var tok = ciftokens[i];
        if (tok.type == 'data_headers') {
            // match() yields null when nothing matches, so test for that
            // before looking at the length
            var name = tok.val.match(tagre);
            if (!name || name.length != 1) {
                throw errormsg('Invalid data header ' + tok.val, tok.line);
            }
            // Drop the first character of the match to get the block name
            data_headers.push([i, name[0].slice(1)]);
        }
    }

    // Now gather the blocks: each one runs from just after its header up
    // to the next header (or the end of the token list)
    var data_blocks = [];
    for (i = 0; i < data_headers.length; ++i) {
        var dh = data_headers[i];
        var end = ((i < data_headers.length - 1) ? data_headers[i + 1][0] :
            ciftokens.length);
        data_blocks.push([dh[1], ciftokens.slice(dh[0] + 1, end)]);
    }

    return data_blocks;
}
module.exports.parseDataBlocks = parseDataBlocks;

/**
 * Parses a series of tokens defining a data block into data items.
 *
 * There are two possible structures:
 *   1) alternating series of tag - value, giving items of type 'single'
 *      whose value is a CifValue;
 *   2) loop with a series of tags, then the corresponding series of values,
 *      giving one item of type 'loop' per tag whose value is an array of
 *      CifValue (one per row).
 * Tokens that are neither a tag nor a loop keyword at item level are
 * skipped. The input array is not modified.
 *
 * @param  {Array} blocktokens  Array of tokens defining the block
 * @return {Array}              Array of parsed data items, each of the form
 *                              {tag, type, value}
 * @throws {Error}              If a tag has no valid value, or the number
 *                              of loop values is not a multiple of the
 *                              number of loop tags
 */
function parseDataItems(blocktokens) {

    var data_items = [];

    // Acceptable value token types
    var vtypes = ['quotestring', 'semicolontext', 'unknown'];

    // Walk the tokens with a cursor rather than consuming a copy
    var pos = 0;
    while (pos < blocktokens.length) {
        var btok = blocktokens[pos++];
        if (btok === undefined) {
            break;
        }

        switch (btok.type) {
            case 'tag':
                var valtok = blocktokens[pos++];
                if (valtok == null || !vtypes.includes(valtok.type)) {
                    throw errormsg('Invalid or missing value for tag ' +
                        btok.val, btok.line);
                }
                data_items.push({
                    'tag': btok.val,
                    'type': 'single',
                    'value': parseValue(valtok)
                });
                break;
            case 'loop_kw':
                // Start by parsing the header
                var header = [];
                var loop_end = btok.line; // Keep track of line
                var ltok = blocktokens[pos];
                while (ltok !== undefined && ltok.type == 'tag') {
                    header.push(ltok.val);
                    loop_end = ltok.line;
                    ltok = blocktokens[++pos];
                }
                // Then every value that follows belongs to the loop body
                var body = [];
                while (ltok !== undefined && vtypes.includes(ltok.type)) {
                    body.push(parseValue(ltok));
                    loop_end = ltok.line;
                    ltok = blocktokens[++pos];
                }
                // The cursor now rests on the first token past the loop,
                // which the outer loop will pick up next.

                // Check if the loop is correct (a loop without tags is
                // never valid)
                var tagn = header.length;
                if (tagn === 0 || body.length % tagn != 0) {
                    throw errormsg(
                        'Invalid loop - values must be a multiple of tags',
                        loop_end);
                }

                // Values are stored row by row; regroup them by column
                var loopn = body.length / tagn;
                for (var i = 0; i < tagn; ++i) {
                    var di = {
                        'tag': header[i],
                        'type': 'loop',
                        'value': []
                    };
                    for (var j = 0; j < loopn; ++j) {
                        di.value.push(body[j * tagn + i]);
                    }
                    data_items.push(di);
                }
                break;
            default:
                break;
        }
    }

    return data_items;
}
module.exports.parseDataItems = parseDataItems;

/**
 * Parses a cif file returning the data blocks and items (not interpreted).
 *
 * The result maps each data block name to an object that maps each tag to
 * its data item as produced by parseDataItems. If a block name or a tag is
 * repeated, the later occurrence replaces the earlier one.
 *
 * @param  {string} ciftext  CIF file as a string
 * @return {Object}          Parsed CIF file as data structure
 */
module.exports.parseCif = function parseCif(ciftext) {

    // First, extract the tokens
    var tk = tokenize(ciftext);
    // Then the blocks
    var db = parseDataBlocks(tk);
    // Now on to the items for each block
    var cifdict = {};
    for (var i = 0; i < db.length; ++i) {
        var block = db[i];
        cifdict[block[0]] = {};
        // SAVE frames are not supported for now, so we only look
        // for data items
        var items = parseDataItems(block[1]);
        for (var j = 0; j < items.length; ++j) {
            cifdict[block[0]][items[j].tag] = items[j];
        }
    }

    return cifdict;
};