UNPKG

paragate

Version:

fast parsers based on Chevrotain

github.com/loveencounterflow/paragate

loveencounterflow/paragate

640 lines (605 loc) • 17.8 kB

JavaScript

(function() {
  'use strict';
  var $parse, CND, GRAMMAR, Htmlish_grammar, MAIN, PGTH, alert, assign, badge, dd, debug, echo, freeze, grammar, help, info, isa, jr, lets, log, merge_texts, new_datom, new_grammar, parse, rpr, strip_quotes, type_of, types, urge, validate, warn, whisper;

  //###########################################################################################################
  // Logging helpers; every logger is tagged with this module's badge.
  CND = require('cnd');
  badge = 'PARAGATE/GRAMMARS/HTMLISH';
  rpr = CND.rpr;
  log = CND.get_logger('plain', badge);
  info = CND.get_logger('info', badge);
  whisper = CND.get_logger('whisper', badge);
  alert = CND.get_logger('alert', badge);
  debug = CND.get_logger('debug', badge);
  warn = CND.get_logger('warn', badge);
  help = CND.get_logger('help', badge);
  urge = CND.get_logger('urge', badge);
  echo = CND.echo.bind(CND);

  //...........................................................................................................
  ({assign, jr} = CND);

  // CHVTN = require 'chevrotain'
  ({new_datom, lets, freeze} = (new (require('datom')).Datom({
    dirty: false
  })).export());
  types = require('./types');
  ({isa, type_of, validate} = types);
  GRAMMAR = require('./grammar');
  ({
    HTMLISH: PGTH
  } = require('intertext'));

  //===========================================================================================================
  // LEXER
  //-----------------------------------------------------------------------------------------------------------
  /* Token definitions, grouped by lexer mode. Each entry maps a token name to a `match` pattern plus optional
  flags (`push_mode`, `pop_mode`, `switch_mode`, `skip`, `line_breaks`). This object is handed to
  `GRAMMAR.new_grammar()` as part of `MAIN`; how the flags are interpreted is decided there (presumably as
  Chevrotain multi-mode lexer settings — confirm against `./grammar`). */
  this.lexer_modes = {
    //.........................................................................................................
    // Everything that is not inside a tag.
    outside_mode: {
      o_escaped: { match: /\\./u },
      // An HTML comment, `<!-- ... -->`; may span several lines, hence `line_breaks`.
      o_comment: { match: /<!--[\s\S]*?-->/, line_breaks: true },
      o_cdata: { match: /<!\[CDATA\[[\s\S]*?]]>/ },
      o_doctype: { match: /<!DOCTYPE\s+[^>]*>/ },
      o_xmldecl: { match: /<\?xml\s+[\s\S]*?\?>/ },
      o_pi: { match: /<\?[\s\S]*?\?>/ },
      i_slash_open: { match: /<\//, push_mode: "inside_mode" },
      i_open: { match: /</, push_mode: "inside_mode" },
      o_text: { match: /[^<\\]+/ }
    },
    //.........................................................................................................
    // Between `<` (or `</`) and the end of the tag.
    inside_mode: {
      i_close: { match: />/, pop_mode: true },
      i_special_close: { match: /\?>/, pop_mode: true },
      i_slash_close: { match: /\/>/, pop_mode: true },
      // A slash that is *not* followed by `>` opens the text part of `<name/text/`.
      stm_slash1: { match: /\/(?!>)/, push_mode: 'slashtext_mode' },
      i_slash: { match: /\// },
      v_equals: { match: /\s*=\s*/, push_mode: 'value_mode' },
      i_name: { match: /[^\s!?=\{\[\(<\/>\)\]\}'"]+/ },
      i_whitespace: { match: /[ \t\r\n]/, skip: true }
    },
    //.........................................................................................................
    // Text between the two slashes of `<name/text/`.
    slashtext_mode: {
      stm_slash2: { match: /\//, switch_mode: "outside_mode" },
      stm_text: { match: /[^\/]+/ }
    },
    //.........................................................................................................
    // The value following `=` in an attribute: double-quoted, single-quoted, or bare.
    value_mode: {
      v_value: { match: /"[^"]*"|'[^']*'|[^>\s\/]+/, pop_mode: true }
    }
  };

  //===========================================================================================================
  // PARSER RULES
  //-----------------------------------------------------------------------------------------------------------
  /* Registers the grammar rules on `this` by calling `this.RULE()`.

  `t` is an object whose keys are token names and whose values are token patterns.

  Rules defined: `document`, `osntag`, `ctag`, `attributes`, `attribute`. Returns whatever the last
  `this.RULE()` call returns. */
  this.summarize = function(t) {
    //---------------------------------------------------------------------------------------------------------
    // A document is any sequence of outside-mode tokens, tags, and slash-text pieces.
    this.RULE('document', () => {
      return this.MANY(() => {
        return this.OR([
          { ALT: () => { return this.CONSUME(t.o_escaped); } },
          { ALT: () => { return this.CONSUME(t.o_doctype); } },
          { ALT: () => { return this.CONSUME(t.o_xmldecl); } },
          { ALT: () => { return this.CONSUME(t.o_pi); } },
          { ALT: () => { return this.CONSUME(t.o_cdata); } },
          { ALT: () => { return this.CONSUME(t.o_comment); } },
          { ALT: () => { return this.CONSUME(t.o_text); } },
          { ALT: () => { return this.CONSUME(t.stm_text); } },
          { ALT: () => { return this.SUBRULE(this.osntag); } },
          { ALT: () => { return this.SUBRULE(this.ctag); } },
          { ALT: () => { return this.CONSUME(t.stm_slash2); } }
        ]);
      });
    });
    //---------------------------------------------------------------------------------------------------------
    /* `<a b=c>`, `<a b=c/>`, or `<a b=c/` */
    this.RULE('osntag', () => {
      this.CONSUME(t.i_open);
      this.CONSUME(t.i_name);
      this.OPTION(() => {
        return this.SUBRULE(this.attributes);
      });
      return this.OR([
        { ALT: () => { return this.CONSUME(t.i_close); } },
        { ALT: () => { return this.CONSUME(t.i_slash_close); } },
        { ALT: () => { return this.CONSUME(t.stm_slash1); } }
      ]);
    });
    //---------------------------------------------------------------------------------------------------------
    /* `</a>` */
    this.RULE('ctag', () => {
      this.CONSUME(t.i_slash_open);
      this.CONSUME(t.i_name);
      return this.CONSUME(t.i_close);
    });
    //---------------------------------------------------------------------------------------------------------
    // One or more attributes.
    this.RULE('attributes', () => {
      return this.AT_LEAST_ONE(() => {
        return this.SUBRULE(this.attribute);
      });
    });
    //---------------------------------------------------------------------------------------------------------
    // A name, optionally followed by `=` and a value.
    return this.RULE('attribute', () => {
      this.CONSUME(t.i_name);
      return this.OPTION(() => {
        this.CONSUME(t.v_equals);
        return this.CONSUME(t.v_value);
      });
    });
  };

  //===========================================================================================================
  // HELPERS
  //-----------------------------------------------------------------------------------------------------------
  /* Removes, in place, every enumerable property of `d` whose value is `undefined`, `null`, or the empty
  string, then returns `d`. */
  dd = function(d) {
    var k, ref;
    /* TAINT implement as optional functionality of `DATOM.new_datom()` */
    for (k in d) {
      if ((ref = d[k]) === (void 0) || ref === null || ref === '') {
        delete d[k];
      }
    }
    return d;
  };

  //-----------------------------------------------------------------------------------------------------------
  /* Returns `x` without its surrounding pair of double or single quotes. Values for which `isa.text()` is
  false, and texts that are not quoted, are returned unchanged. */
  strip_quotes = function(x) {
    if (!isa.text(x)) {
      return x;
    }
    // `[\s\S]` rather than `.`: the `v_value` token may contain line breaks inside the quotes, and `.` would
    // fail to match those, leaving the quotes in place.
    if (/^"[\s\S]*"$/.test(x)) {
      return x.slice(1, x.length - 1);
    }
    if (/^'[\s\S]*'$/.test(x)) {
      return x.slice(1, x.length - 1);
    }
    return x;
  };

  //===========================================================================================================
  // LINEARIZATION
  //-----------------------------------------------------------------------------------------------------------
  /* Generator that walks `tree` and yields a flat sequence of datom-like objects.

  * `source`: the text that was parsed; used to slice out the parts of CDATA sections and attached to the
    `<document` datom.
  * `tree`: a node with a `$key` of `'^token'`, `'^document'`, or `'^node'`; a nullish `tree` yields nothing.
  * `level`: recursion depth, incremented on each recursive call.

  Throws an `Error` when `tree.$key` is none of the three known keys, or when a compact tag name results in
  a duplicate ID or prefix. */
  this.linearize = function*(source, tree, level = 0) {
    var $key, $vnr, _, atrs, attribute, attributes, c, clasz, d, e, i, j, k, l, len, len1, len2, name, ref, ref1, ref2, ref3, ref4, ref5, ref6, start, start1, stop, stop2, subtree, text, text1, text2, text3, token_name, type, ukid, ukids, v, x;
    if (tree == null) {
      return null;
    }
    //.........................................................................................................
    ({
      name: token_name,
      $key,
      start,
      stop,
      text,
      $vnr
    } = tree);
    //.........................................................................................................
    // Leaf tokens map directly onto one datom (three in the case of CDATA).
    if ($key === '^token') {
      switch (token_name) {
        case 'o_escaped':
          yield dd({ $key: '^text', start, stop, text, $vnr, $: '^Ω1^' });
          break;
        case 'o_text':
        case 'stm_text':
          yield dd({ $key: '^text', start, stop, text, $vnr, $: '^Ω2^' });
          break;
        case 'stm_slash2':
          yield dd({ $key: '>tag', type: 'nctag', start, stop, text, $vnr, $: '^Ω3^' });
          break;
        case 'o_comment':
          yield dd({ $key: '^comment', start, stop, text, $vnr, $: '^Ω4^' });
          break;
        case 'o_pi':
          yield dd({ $key: '^pi', start, stop, text, $vnr, $: '^Ω5^' });
          break;
        case 'o_doctype':
          yield dd({ $key: '^doctype', start, stop, text, $vnr, $: '^Ω6^' });
          break;
        case 'o_cdata':
          // `<![CDATA[` is 9 characters long, `]]>` is 3.
          start1 = start + 9;
          stop2 = stop - 3;
          text1 = source.slice(start, start1);
          text2 = source.slice(start1, stop2);
          text3 = source.slice(stop2, stop);
          yield dd({ $key: '<cdata', start, stop: start1, text: text1, $vnr, $: '^Ω7^' });
          if (text2 !== '') {
            yield dd({ $key: '^text', start: start1, stop: stop2, text: text2, $vnr, $: '^Ω8^' });
          }
          yield dd({ $key: '>cdata', start: stop2, stop, text: text3, $vnr, $: '^Ω9^' });
          break;
        default:
          yield dd({ $key: '^unknown', $value: tree, $vnr, $: '^Ω10^' });
      }
      return null;
    }
    if ($key !== '^document' && $key !== '^node') {
      throw new Error(`^445^ unknown $key ${rpr($key)}`);
    }
    //.........................................................................................................
    ({ukids} = tree);
    /* NOTE we assume that unique kids exist and that values are stored in source order */
    // Take `$vnr` from the first unique kid only.
    for (_ in ukids) {
      ukid = ukids[_];
      $vnr = ukid.$vnr;
      break;
    }
    //.........................................................................................................
    // The document node brackets its children with `<document` / `>document` unless `settings.bare` is set.
    if ($key === '^document') {
      if (!this.settings.bare) {
        yield dd({ $key: '<document', start: 0, stop: 0, source, errors: tree.errors, $vnr: [-2e308], $: '^Ω11^' });
      }
      ref = tree.kids;
      for (i = 0, len = ref.length; i < len; i++) {
        subtree = ref[i];
        yield* this.linearize(source, subtree, level + 1);
      }
      x = text.length;
      if (!this.settings.bare) {
        yield dd({ $key: '>document', start: x, stop: x, $vnr: [2e308], $: '^Ω12^' });
      }
      return null;
    }
    //.........................................................................................................
    if ((name = (ref1 = tree.ukids) != null ? (ref2 = ref1.i_name) != null ? ref2.text : void 0 : void 0) == null) {
      /* may happen when parsing errors occur */
      return null;
    }
    switch (token_name) {
      //.......................................................................................................
      case 'osntag':
        $key = '<tag';
        if (tree.ukids.i_close != null) {
          type = 'otag';
        } else if (tree.ukids.i_slash_close != null) {
          type = 'stag';
          $key = '^tag';
        } else if (tree.ukids.stm_slash1 != null) {
          type = 'ntag';
        }
        if ((attributes = tree.ukids.attributes) != null) {
          atrs = {};
          ref3 = attributes.kids;
          for (j = 0, len1 = ref3.length; j < len1; j++) {
            attribute = ref3[j];
            k = attribute.ukids.i_name.text;
            // An attribute without a value is recorded as `true`.
            v = strip_quotes((ref4 = (ref5 = attribute.ukids.v_value) != null ? ref5.text : void 0) != null ? ref4 : true);
            atrs[k] = v;
          }
          d = { $key, name, type, text, start, stop, atrs, $vnr, $: '^Ω13^' };
        } else {
          d = { $key, name, type, text, start, stop, $vnr, $: '^Ω14^' };
        }
        //.....................................................................................................
        // parse compact tag name:
        if ((d.name != null) && d.name !== '') {
          e = this._parse_compact_tagname(d.name);
          if (e.id != null) {
            if (d.id != null) {
              throw new Error(`^paragate/htmlish/linearize@1^ duplicate IDs in ${rpr(d)}`);
            }
            d.id = e.id;
          }
          if (e.prefix != null) {
            if (d.prefix != null) {
              // Location marker `@2^` keeps this throw site distinguishable from the one above.
              throw new Error(`^paragate/htmlish/linearize@2^ duplicate prefixes in ${rpr(d)}`);
            }
            d.prefix = e.prefix;
          }
          if (e.class != null) {
            // Merge classes through a `Set` so that each class name appears only once.
            clasz = d.class != null ? new Set(d.class.split(/\s+/)) : new Set();
            ref6 = e.class;
            for (l = 0, len2 = ref6.length; l < len2; l++) {
              c = ref6[l];
              clasz.add(c);
            }
            d.class = [...clasz];
          }
          if (e.name != null) {
            d.name = e.name;
          }
        }
        //.....................................................................................................
        yield dd(d);
        break;
      //.......................................................................................................
      case 'ctag':
        yield dd({ $key: '>tag', name, type: 'ctag', text, start, stop, $vnr, $: '^Ω15^' });
        break;
      //.......................................................................................................
      default:
        yield dd({ $key: '^unknown', $value: tree, $vnr, $: '^Ω16^' });
    }
    return null;
  };

  //-----------------------------------------------------------------------------------------------------------
  /* Delegates to `PGTH.parse_compact_tagname()` (from `intertext`) with `true` as second argument.
  `linearize()` reads `id`, `prefix`, `class`, and `name` from the result. */
  this._parse_compact_tagname = function(text) {
    return PGTH.parse_compact_tagname(text, true);
  };

  //===========================================================================================================
  // STREAMING AND PARSING
  //-----------------------------------------------------------------------------------------------------------
  /* Returns a `steampipes` transform that treats each incoming item as one line of source. For every line it
  sends a `^newline` datom, then every datom of `grammar.parse(line)` with the first element of its `$vnr`
  set to the running line number. When `grammar` is nullish, a grammar with `{ bare: true }` is used. */
  $parse = function(grammar = null) {
    var SP, line_nr;
    SP = require('steampipes');
    if (grammar == null) {
      // `new_grammar()` is a plain factory that returns the grammar object, so no `new` is needed.
      grammar = new_grammar({
        bare: true
      });
    }
    line_nr = 0;
    return SP.$(function(line, send) {
      var d, i, len, ref;
      line_nr++;
      send(new_datom('^newline', { $vnr: [line_nr, 0], $: '^Ω17^' }));
      ref = grammar.parse(line);
      for (i = 0, len = ref.length; i < len; i++) {
        d = ref[i];
        // `lets()` is used so that the line number is set on a copy, not on the (frozen) parse result.
        send(lets(d, function(d) {
          return d.$vnr[0] = line_nr;
        }));
      }
      return null;
    });
  };

  //-----------------------------------------------------------------------------------------------------------
  /* Returns a new `^text` datom covering `d1` and `d2`: `start`, `$vnr`, and `$` come from `d1`, `stop` comes
  from `d2`, and `text` is the concatenation of both texts. */
  merge_texts = function(d1, d2) {
    var R;
    // { '$key': '^text', start: 0, stop: 7, text: 'before ', '$vnr': [ 1, 1 ], '$': '^Ω18^' }
    R = {
      $key: '^text',
      start: d1.start,
      stop: d2.stop,
      text: d1.text + d2.text,
      $vnr: d1.$vnr,
      $: d1.$
    };
    return R;
  };

  //-----------------------------------------------------------------------------------------------------------
  /* Replacement for a grammar's `parse()` method: calls `this._parse()` with the same arguments, merges each
  run of consecutive `^text` datoms into a single `^text` datom, and returns the frozen result list. */
  parse = function(...P) {
    var R, d, i, idx, len, prv_d, tokens;
    tokens = this._parse(...P);
    R = [];
    // `prv_d` holds a pending `^text` datom that may still be extended by the next token.
    prv_d = null;
    for (idx = i = 0, len = tokens.length; i < len; idx = ++i) {
      d = tokens[idx];
      if (d.$key === '^text') {
        if ((prv_d != null) && (prv_d.$key === '^text')) {
          prv_d = merge_texts(prv_d, d);
          continue;
        }
        prv_d = d;
        continue;
      }
      // A non-text datom ends the current run of texts.
      if (prv_d) {
        R.push(prv_d);
      }
      prv_d = null;
      R.push(d);
    }
    if (prv_d != null) {
      R.push(prv_d);
    }
    return freeze(R);
  };

  //###########################################################################################################
  /* TAINT this seems backwards (but works?) */
  MAIN = this;

  /* Creates a grammar named `'Htmlish'` from this module's definitions and the given `settings`, keeps the
  grammar's own `parse()` as `_parse()`, and installs the text-merging `parse()` defined above. */
  new_grammar = function(settings) {
    var R;
    R = GRAMMAR.new_grammar('Htmlish', MAIN, settings);
    R._parse = R.parse;
    R.parse = parse;
    return R;
  };

  grammar = new_grammar();

  Htmlish_grammar = grammar.constructor;

  module.exports = {Htmlish_grammar, grammar, new_grammar, $parse};

}).call(this);

//# sourceMappingURL=htmlish.grammar.js.map