UNPKG

cdap-avsc

Version:

This project is a clone of the mtth/avsc repo, with modifications required by CDAP.

702 lines (652 loc) 18.3 kB
/* jshint node: true */ 'use strict'; /** * IDL to schema parsing logic. * */ var files = require('./files'), utils = require('./utils'), path = require('path'), util = require('util'); var f = util.format; /** * Assemble an IDL file into a decoded schema. * */ function assemble(fpath, opts, cb) { if (!cb && typeof opts == 'function') { cb = opts; opts = undefined; } opts = opts || {}; if (!opts.importHook) { opts.importHook = files.createImportHook(); } // Types found in imports. We store them separately to be able to insert them // in the correct order in the final attributes. var importedTypes = []; var attrs, imports; opts.importHook(fpath, 'idl', function (err, str) { if (err) { cb(err); return; } if (!str) { // Skipped import (likely already imported). cb(null, {}); return; } try { var protocol = parse(str, opts); } catch (err) { err.path = fpath; // To help debug which file caused the error. cb(err); return; } attrs = protocol.attrs; imports = protocol.imports; fetchImports(); }); function fetchImports() { var info = imports.shift(); if (!info) { // We are done with this file. We prepend all imported types to this // file's and we can return the final result. We also perform a JSON // serialization rountrip to remove non-numerical attributes from unions // and transform Javadocs into strings. if (importedTypes.length) { attrs.types = attrs.types ? importedTypes.concat(attrs.types) : importedTypes; } cb(null, JSON.parse(JSON.stringify(attrs))); } else { var importPath = path.join(path.dirname(fpath), info.name); if (info.kind === 'idl') { assemble(importPath, opts, mergeImportedAttrs); } else { // We are importing a protocol or schema file. opts.importHook(importPath, info.kind, function (err, str) { if (err) { cb(err); return; } switch (info.kind) { case 'protocol': case 'schema': try { var obj = JSON.parse(str); } catch (err) { err.path = importPath; cb(err); return; } var attrs = info.kind === 'schema' ? 
{types: [obj]} : obj; mergeImportedAttrs(null, attrs); break; default: cb(new Error(f('invalid import kind: %s', info.kind))); } }); } } } function mergeImportedAttrs(err, importedAttrs) { if (err) { cb(err); return; } // Merge first the types (where we don't need to check for duplicates // since `parse` will take care of it), then the messages (where we need // to, as duplicates will overwrite each other). (importedAttrs.types || []).forEach(function (typeAttrs) { // Ensure the imported protocol's namespace is inherited correctly (it // might be different from the current one). if (typeAttrs.namespace === undefined) { var namespace = importedAttrs.namespace; if (!namespace) { var match = /^(.*)\.[^.]+$/.exec(importedAttrs.protocol); if (match) { namespace = match[1]; } } typeAttrs.namespace = namespace || ''; } importedTypes.push(typeAttrs); }); try { Object.keys(importedAttrs.messages || {}).forEach(function (name) { if (!attrs.messages) { attrs.messages = {}; } if (attrs.messages[name]) { throw new Error(f('duplicate message: %s', name)); } attrs.messages[name] = importedAttrs.messages[name]; }); } catch (err) { cb(err); return; } fetchImports(); // Continue importing any remaining imports. } } /** * Parse an IDL into attributes. * * Not to be confused with `avro.parse` which parses attributes into types. * */ function parse(str, opts) { var parser = new Parser(str, opts); return {attrs: parser._readProtocol(), imports: parser._imports}; } // Helpers. /** * Simple class to split an input string into tokens. * * There are different types of tokens, characterized by their `id`: * * + `number` numbers. * + `name` references. * + `string` double-quoted. * + `operator`, anything else, always single character. * + `json`, special, must be asked for (the tokenizer doesn't have enough * context to predict these). * * This tokenizer also handles Javadoc extraction, via the `addJavadoc` method. 
* */ function Tokenizer(str) { this._str = str; this._pos = 0; this._queue = new BoundedQueue(3); // Bounded queue of last emitted tokens. this._token = undefined; // Current token. this._doc = undefined; // Javadoc. } Tokenizer.prototype.get = function (opts) { if (opts && opts.id && opts.id !== this._token.id) { throw this.error(f('expected %s but got %s', opts.id, this._token.val)); } else if (opts && opts.val && opts.val !== this._token.val) { throw this.error(f('expected %s but got %s', opts.val, this._token.val)); } else { return this._token; } }; Tokenizer.prototype.next = function (opts) { this._skip(); this._queue.push(this._pos); var pos = this._pos; var str = this._str; var c = str.charAt(pos); var id; if (!c) { if (opts && opts.id === '(eof)') { return {id: '(eof)'}; } else { throw this.error('unexpected end of input'); } } if (opts && opts.id === 'json') { id = 'json'; this._pos = this._endOfJson(); } else if (c === '"') { id = 'string'; this._pos = this._endOfString(); } else if (/[0-9]/.test(c)) { id = 'number'; this._pos = this._endOf(/[0-9]/); } else if (/[`A-Za-z_.]/.test(c)) { id = 'name'; this._pos = this._endOf(/[`A-Za-z0-9_.]/); } else { id = 'operator'; this._pos = pos + 1; } this._token = {id: id, val: str.slice(pos, this._pos)}; if (id === 'json') { // Let's be nice and give a more helpful error message when this occurs // (JSON parsing errors wouldn't let us find the location otherwise). try { this._token.val = JSON.parse(this._token.val); } catch (err) { throw this.error('invalid JSON'); } } else if (id === 'name') { // Unescape names (our parser doesn't need them). 
this._token.val = this._token.val.replace(/`/g, ''); } return this.get(opts); }; Tokenizer.prototype.prev = function (opts) { var pos = this._queue.pop(); if (pos === undefined) { throw new Error('cannot backtrack more'); } this._pos = pos; return this.get(opts); }; Tokenizer.prototype.error = function (msg) { var pos = this._queue.peek() || 1; // Use after whitespace position. var str = this._str; var lineNum = 1; var lineStart = 0; var i; for (i = 0; i < pos; i++) { if (str.charAt(i) === '\n') { lineNum++; lineStart = i; } } var err = new Error(msg); err.lineNum = lineNum; err.colNum = pos - lineStart; return err; }; Tokenizer.prototype.addJavadoc = function (attrs) { if (this._doc === undefined || attrs.doc !== undefined) { return; } attrs.doc = this._doc; this._doc = undefined; }; /** Skip whitespace and comments. */ Tokenizer.prototype._skip = function () { var str = this._str; var pos, c; // `pos` used for javadocs. while ((c = str.charAt(this._pos)) && /\s/.test(c)) { this._pos++; } if (c === '/') { switch (str.charAt(this._pos + 1)) { case '/': this._pos += 2; while ((c = str.charAt(this._pos)) && c !== '\n') { this._pos++; } return this._skip(); case '*': this._pos += 2; if (str.charAt(this._pos) === '*') { pos = this._pos + 1; } while ((c = str.charAt(this._pos++))) { if (c === '*' && str.charAt(this._pos) === '/') { this._pos++; if (pos !== undefined) { this._doc = new Javadoc(str.slice(pos, this._pos - 2)); } return this._skip(); } } throw this.error('unterminated comment'); } } }; /** Generic end of method. */ Tokenizer.prototype._endOf = function (pat) { var pos = this._pos; var str = this._str; while (pat.test(str.charAt(pos))) { pos++; } return pos; }; /** * Find end of a string. * * The specification doesn't explicitly say so, but IDLs likely only allow * double quotes for strings (C- and Java-style). * */ Tokenizer.prototype._endOfString = function () { var pos = this._pos + 1; // Skip first double quote. 
var str = this._str; var c; while ((c = str.charAt(pos))) { if (c === '"') { return pos + 1; } if (c === '\\') { pos += 2; } else { pos++; } } throw this.error('unterminated string'); }; /** * Returns end of JSON object, throwing an error if the end is reached first. * */ Tokenizer.prototype._endOfJson = function () { var pos = utils.jsonEnd(this._str, this._pos); if (pos < 0) { throw new Error('invalid JSON at ' + this._pos); } return pos; }; /** * Parser from tokens to attributes. * */ function Parser(str, opts) { this._oneWayVoid = !!(opts && opts.oneWayVoid); this._reassignJavadoc = !!(opts && opts.reassignJavadoc); this._imports = []; this._tk = new Tokenizer(str); this._tk.next(); // Prime tokenizer. } Parser.prototype._readProtocol = function () { var tk = this._tk; var attrs = {}; var types = []; var messages = {}; var hasMessage = false; while (tk.get().val === 'import') { this._readImport(); } while (tk.get().val === '@') { this._readAnnotation(attrs); } tk.addJavadoc(attrs); tk.get({val: 'protocol'}); attrs.protocol = tk.next({id: 'name'}).val; tk.next({val: '{'}); tk.next(); while (tk.get().val !== '}') { if (tk.get().val === 'import') { this._readImport(); } else { var typeAttrs = this._readType(); if (typeAttrs.name) { // This was a named type declaration. Not very clean to rely on this, // but since the IDL spec doesn't consistently delimit type // declaration (e.g. fixed end with `;` but other bracketed types // don't) we aren't able to tell whether this is the start of a // message otherwise. 
types.push(typeAttrs); } else { hasMessage = true; var oneWay = false; if (typeAttrs === 'void' || typeAttrs.type === 'void') { if (this._oneWayVoid) { oneWay = true; } if (typeAttrs === 'void') { typeAttrs = 'null'; } else { typeAttrs.type = 'null'; } } var message = this._readMessage(typeAttrs); if (oneWay) { message.attrs['one-way'] = true; } if (messages[message.name]) { // We have to do this check here otherwise the duplicate will be // overwritten (and `parse` won't be able to catch it). throw new Error(f('duplicate message: %s', message.name)); } messages[message.name] = message.attrs; } } } tk.next({id: '(eof)'}); if (types.length) { attrs.types = types; } if (hasMessage) { attrs.messages = messages; } return attrs; }; Parser.prototype._readImport = function () { var tk = this._tk; tk.get({val: 'import'}); var kind = tk.next({id: 'name'}).val; var fname = JSON.parse(tk.next({id: 'string'}).val); this._imports.push({kind: kind, name: fname}); tk.next({val: ';'}); tk.next(); }; Parser.prototype._readAnnotation = function (attrs) { var tk = this._tk; tk.get({val: '@'}); // Annotations are allowed to have names which aren't valid Avro names, // we must advance until we hit the first left parenthesis. var parts = []; while (tk.next().val !== '(') { parts.push(tk.get().val); } attrs[parts.join('')] = tk.next({id: 'json'}).val; tk.next({val: ')'}); tk.next(); }; Parser.prototype._readMessage = function (responseAttrs) { var tk = this._tk; var messageAttrs; if (this._reassignJavadoc) { messageAttrs = {}; messageAttrs.response = reassignJavadoc(responseAttrs, messageAttrs); } else { messageAttrs = {response: responseAttrs}; } while (tk.get().val === '@') { this._readAnnotation(messageAttrs); } var name = tk.get({id: 'name'}).val; messageAttrs.request = []; tk.next({val: '('}); if (tk.next().val !== ')') { tk.prev(); do { tk.next(); // Skip `(` or `,`. 
messageAttrs.request.push(this._readField()); } while (tk.get().val !== ')'); } if (tk.next().val === 'throws') { // It doesn't seem like the IDL allows multiple error types, even though // the spec always prescribes a union (or they don't indicate which // syntax to use). To be safe, we'll only allow one custom error type. tk.next(); messageAttrs.errors = [this._readType()]; } else if (tk.get().val === 'oneway') { tk.next(); messageAttrs['one-way'] = true; } tk.get({val: ';'}); tk.next(); return {name: name, attrs: messageAttrs}; }; Parser.prototype._readField = function () { var tk = this._tk; var attrs = {type: this._readType()}; if (this._reassignJavadoc) { attrs.type = reassignJavadoc(attrs.type, attrs); } while (tk.get().val === '@') { this._readAnnotation(attrs); } tk.addJavadoc(attrs); attrs.name = tk.get({id: 'name'}).val; if (tk.next().val === '=') { attrs['default'] = tk.next({id: 'json'}).val; tk.next(); } return attrs; }; Parser.prototype._readType = function () { var tk = this._tk; var attrs = {}; while (tk.get().val === '@') { this._readAnnotation(attrs); } tk.addJavadoc(attrs); switch (tk.get().val) { case 'record': case 'error': return this._readRecord(attrs); case 'fixed': return this._readFixed(attrs); case 'enum': return this._readEnum(attrs); case 'map': return this._readMap(attrs); case 'array': return this._readArray(attrs); case 'union': return this._readUnion(attrs); default: var type = tk.get().val; tk.next(); if (Object.keys(attrs).length) { attrs.type = type; return attrs; } else { return type; } } }; Parser.prototype._readFixed = function (attrs) { var tk = this._tk; attrs.type = tk.get({val: 'fixed'}).val; attrs.name = tk.next({id: 'name'}).val; tk.next({val: '('}); attrs.size = parseInt(tk.next({id: 'number'}).val); tk.next({val: ')'}); if (tk.next().val === ';') { tk.next(); } return attrs; }; Parser.prototype._readMap = function (attrs) { var tk = this._tk; attrs.type = tk.get({val: 'map'}).val; tk.next({val: '<'}); tk.next(); 
attrs.values = this._readType(); tk.get({val: '>'}); tk.next(); return attrs; }; Parser.prototype._readArray = function (attrs) { var tk = this._tk; attrs.type = tk.get({val: 'array'}).val; tk.next({val: '<'}); tk.next(); attrs.items = this._readType(); tk.get({val: '>'}); tk.next(); return attrs; }; Parser.prototype._readEnum = function (attrs) { var tk = this._tk; attrs.type = tk.get({val: 'enum'}).val; attrs.name = tk.next({id: 'name'}).val; tk.next({val: '{'}); attrs.symbols = []; do { attrs.symbols.push(tk.next().val); } while (tk.next().val !== '}'); tk.next(); return attrs; }; Parser.prototype._readUnion = function (attrs) { var tk = this._tk; var arr = []; tk.get({val: 'union'}); tk.next({val: '{'}); do { tk.next(); arr.push(this._readType()); } while (tk.get().val !== '}'); tk.next(); Object.keys(attrs).forEach(function (name) { // We can do this since `JSON.stringify` will ignore non-numeric keys on // array objects. This lets us be consistent with field and message // attribute transfer (e.g. for `doc` and `order`). arr[name] = attrs[name]; }); return arr; }; Parser.prototype._readRecord = function (attrs) { var tk = this._tk; attrs.type = tk.get({id: 'name'}).val; attrs.name = tk.next({id: 'name'}).val; attrs.fields = []; tk.next({val: '{'}); while (tk.next().val !== '}') { attrs.fields.push(this._readField()); tk.get({val: ';'}); } tk.next(); return attrs; }; /** * Simple bounded queue. * * Not the fastest, but will definitely do. * */ function BoundedQueue(length) { this._length = length | 0; this._data = []; } BoundedQueue.prototype.push = function (val) { this._data.push(val); if (this._data.length > this._length) { this._data.shift(); } }; BoundedQueue.prototype.peek = function () { return this._data[this._data.length - 1]; }; BoundedQueue.prototype.pop = function () { return this._data.pop(); }; /** * Javadoc wrapper class. 
* * This is used to be able to distinguish between normal `doc` annotations and * Javadoc comments, to correctly support the `reassignJavadoc` option. * * The parsing done is very simple and simply removes the line prefixes and * leading / trailing empty lines. It's better to be conservative with * formatting rather than risk losing information. * */ function Javadoc(str) { str = str.replace(/^[ \t]+|[ \t]+$/g, ''); // Trim whitespace. var lines = str.split('\n').map(function (line, i) { return i ? line.replace(/^\s*\*\s?/, '') : line; }); while (!lines[0]) { lines.shift(); } while (!lines[lines.length - 1]) { lines.pop(); } this._str = lines.join('\n'); } Javadoc.prototype.toJSON = function () { return this._str; }; /** * Transfer a key from an object to another and return the new source. * * If the source becomes an object with a single type attribute set, its `type` * attribute is returned instead. * */ function reassignJavadoc(from, to) { if (!(from.doc instanceof Javadoc)) { // Nothing to transfer. return from; } to.doc = from.doc; delete from.doc; return Object.keys(from).length === 1 ? from.type : from; } module.exports = { BoundedQueue: BoundedQueue, Tokenizer: Tokenizer, assemble: assemble, parse: parse };