UNPKG

incr-regex-package

Version:

An incremental regular expression parser in JavaScript; useful for input validation, RegExp

454 lines (404 loc) 16.4 kB
/** * Copyright (c) 2016, Nurul Choudhury * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * */ "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.RxParser = undefined; var _createClass = function () { function defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } return function (Constructor, protoProps, staticProps) { if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor; }; }(); var _utils = require("./utils"); var _rxprint = require("./rxprint"); var _rxtree = require("./rxtree"); function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } } if (!_utils.array_append) { throw new Error("array_append is undefined"); } function isMulti(op) { return op && op.type == 'U' && op.op == 'MULTIRANGE'; } // Matching function // ================= // function __matchX(regexp) { // return function(ch) { return [ch !== DONE && (ch === undefined || ch.match(regexp)), undefined]; }; // } // function anychX(ch) { return [ch !== DONE, undefined]; } // function __matchcX(c) { // return function(ch) { return [(ch !== DONE) && (ch === undefined || ch == c), c]; }; // } //function anych(ch) { return [ch !== DONE, undefined]; } var MATCH_FALSE = [false, undefined]; var MATCH_TRUE = [true, undefined]; function __matchc(c) { return function (ch) { return ch !== _rxtree.DONE && (ch === undefined || ch === c) ? [true, c] : MATCH_FALSE; }; } function isNotAlnum(ch) { return !/\w|\d/.test(ch || '.'); } //function isNotAlpha(ch) { return !/\w/.test(ch||'.'); } function endofstr(prev, ch) { var l = !prev || isNotAlnum(prev); var r = isNotAlnum(ch); return l && !r || !l && r ? MATCH_TRUE : MATCH_FALSE; } //Match begining of string -- This is not suppoorted function begining() { return false; } //========================== /* Convert a RegExp parse tree to a finite state machine (FSM) so given a subtree 't' and a connector (next state) set the 'nextNode' to the connector cases of t: t - simple match => t.nextNode = connector (shortened to t -> connector ) - A . B => A -> B -> connector - A* => A -> t -> connector - A | B => A -> connector, B -> connector - A ? => t -> connector, A -> connector */ //matchable,dot,or,zero_or_one,zero_or_more var _metaMap = { "*": _rxtree.ZERO_OR_MORE, "+": _rxtree.ONE_OR_MORE, "?": _rxtree.ZERO_OR_ONE }; //const _stdRegexp = { "\\d": /\d/, "\\D": /\D/, "\\s": /\s/, "\\S": /\S/, "\\w": /\w/, "\\W": /\W/ }; var chmap = { 't': "\t", 'n': "\n", 'r': "\r" }; //function logit(msg,val) { console.log("logit: "+msg); return val;} function convert(str) { if (str == '<SKIP>') return _rxtree.SKIP; if (str == '(' || str == '(?:') return _rxtree.LP; if (str == ')') return _rxtree.RP; if (str == '.') return (0, _rxtree.ANYCH)(); if (str == '[\\b]') return _rxtree.BS; if (str == '^' || str == '$') return { type: 'N', val: str, multi: _rxtree.BOUNDARY, op: 'BOUNDARY', match: begining }; if (str == '\\b' || str == '\\B') return { type: 'N', val: str, multi: _rxtree.BOUNDARY, op: 'BOUNDARY', match: endofstr }; if (/^\[([^\]]|\\.)*\]$/.test(str)) return (0, _rxtree.makeCharSet)(str); //{type: 'N', val: str, multi: MANY, op: 'CHARSET', match: __match(new RegExp(str))}; if (str == '|') return _rxtree.OR; if (/^[?+*]\??$/.test(str)) return _metaMap[str.substring(0, 1)]; if (/^\{[^}]*\}$/.test(str)) return { type: 'U', val: str, op: 'MULTIRANGE', fn: (0, _utils.parseMulti)(str) }; if (/^\\[bdDsSwW]$/.test(str)) return (0, _rxtree.stdRxMeta)(str); //{type: 'N', val: str, multi: MANY, op: 'SPECIAL-CHARSET',match: __match(_stdRegexp[str])}; if (/^\\[trn]$/.test(str)) return { type: 'N', val: chmap[str.substring(1, 2)], multi: _rxtree.TERM, op: 'NON-PRINTING', match: (0, _rxtree.__match)("\\" + str.substring(1)) }; if (/^\\[.?+*{}()$^\\:|\][]$/.test(str)) return { type: 'N', val: str.substring(1, 2), multi: _rxtree.TERM, op: 'SINGLE', match: __matchc(str.substring(1)) }; return { type: 'N', val: str, multi: _rxtree.TERM, op: 'SINGLE', match: __matchc(str) }; } /* export function clearNodeMarkers(aNode) { if( aNode === undefined ) return undefined; if( aNode === DONE ) { DONE.marker = 0; return DONE; } //if( aNode.type === 'U' ) return {type: 'U', val: aNode.val, op: aNode.op, match: aNode.match}; if( aNode.type === 'N' && aNode.oper === undefined ) { aNode.marker = 0; } else { clearNodeMarkers(aNode.right); clearNodeMarkers(aNode.left); } aNode.marker = 0; } */ // ============= // Parser helpers // // Helper function for Precedence // odd values are left associative and even value aare right associative // e.g. '.' opererator // a . b . c => a . (b . c) RIGHT ASSOCIATIVE // a . b . c => (a . b) . c LEFT ASSOCIATIVE // // for the efficient evaluation of regular expressions // right associative is more efficient to evaluate // // Note the unary operators ( ? * + ) must be left associative // // a+* => (a+)* // // a is higher precedence true // a == b function gtPrec(a, b) { if (a < b) return false; if (a > b) return true; return (0, _utils.odd)(a); } var mapper = [{ match: ["(", "|", ")"], put: ["(", ")"] }, { match: ["<SKIP>", "|", "<SKIP>"], put: ["<SKIP>"] }, { match: ["<SKIP>", "<SKIP>"], put: ["<SKIP>"] }, { match: ["(", "<SKIP>", ")"], put: ["<SKIP>"] }, { match: ["<SKIP>", "*"], put: ["<SKIP>"] }, { match: ["<SKIP>", "+"], put: ["<SKIP>"] }, { match: ["(", "|"], put: ["(", "<SKIP>", "|"] }, { match: ["|", ")"], put: ["|", "<SKIP>", ")"] }, { match: ["|", "|"], put: ["|", "<SKIP>", "|"] }, { match: ["(", ")"], put: ["<SKIP>"] }]; // Match an item in the mapper table against the tokenList at position ix // Note = is the mathematical concept of equality and not an assignment // // let m = mapper[i].match ; // for some i // let tokenList = prefixArray + m + rest ; // + means array concatination // let ix = prefixArray.length; // then // matchMapper(tokenList,ix) = m function matchMapper(tokenList, ix) { for (var i = 0; i < mapper.length; i++) { if ((0, _utils.array_match)(tokenList, mapper[i].match, ix)) return mapper[i]; } return undefined; } // Note = is the mathematical concept of equality and not an assignment // if no 'i' exists where matchMapper(tokenList,i) has a value // then updateTokens(list) = list // else // for smallest ix // let map = matchMapper(pre + m + post,ix) // then // updateTokens(pre + m + post) = pre + map + updateTokens(post) function updateTokens(tokenList) { var res = []; for (var i = 0, l = tokenList.length; i < l; i++) { var mapV = matchMapper(tokenList, i); if (mapV) { (0, _utils.array_append)(res, mapV.put); i += mapV.match.length - 1; } else res.push(tokenList[i]); } return res; } /* Simple operator precedence grammar has problems with some accepable regular expression examples: /|abc.../ - expression cannot start with a binary operator, change to /<SKIP>|abc.../ /...(|)...)- expression cannot start with a binary operator, change to /...<SKIP>.../ /...(|abc...)- expression cannot start with a binary operator, change to /...(<SKIP>|abc.../ /...(xyz||abc...)- expression cannot start with a binary operator, change to /...(xyz|<SKIP>|abc.../ /...abc|)...)- expression cannot start with a binary operator, change to /...abc|<SKIP>).../ /...()...)- expression cannot start with a binary operator, change to /...<SKIP>.../ /...(<SKIP>)...)- reduce the complexityof skip instruction to allow further optimization, change to /...<SKIP>.../ /...<SKIP>*...)- optimize zero or more skips to a single skip, change to /...<SKIP>.../ /...<SKIP>+...)- optimize repeated skip, change to /...<SKIP>.../ /...<SKIP><SKIP>...)- optimized repeated skip, change to /...<SKIP>.../ */ function fixTokens(tokenList) { if (tokenList) { if (tokenList[tokenList.length - 1] == "|") tokenList = tokenList.concat("<SKIP>"); if (tokenList[0] == "|") tokenList = ["<SKIP>"].concat(tokenList); } var newList = updateTokens(tokenList); while (!(0, _utils.array_eq)(tokenList, newList)) { tokenList = newList; newList = updateTokens(tokenList); } return tokenList; } function isRegExp(s) { return s instanceof RegExp; } // Actual parser /* This is a parser for regular expressions, it uses a simple operator precidence parser It uses a regular expression as the tokenizer (TOKINIZATION_RX) */ var RxParser = exports.RxParser = function () { function RxParser() { _classCallCheck(this, RxParser); this.operand = []; this.operator = []; this.basePrec = 0; this.wasOp = true; this.lastop = undefined; } _createClass(RxParser, [{ key: "toString", value: function toString() { return "{ operand: " + this.operand.map(_rxprint.printExpr) + " operator: " + this.operator.map(function (e) { return e.toString(); }) + " prec: " + this.BasePrec + " wasOp: " + this.wasOp + "}"; } }, { key: "opState", value: function opState(from, to, op) { var tp = function tp(x) { return x ? "OPERATOR" : "OPERAND"; }; this.lastop = op; if (this.wasOp != from) { throw new Error("RegExp parsing expected: " + tp(from) + " but was: " + tp(this.wasOp)); } this.wasOp = to; } }, { key: "addToken", value: function addToken(a) { if (!a) return this.finishUp(); var c = convert(a); //console.log(c); if ((c.type == 'N' || c.type == 'L') && !this.wasOp) { this.pushOp(_rxtree.DOT, this.basePrec + 4); this.opState(false, true); } switch (c.type) { case 'L': this.opState(true, true, _rxtree.LP);this.basePrec += 10;break; case 'R': this.opState(false, false, _rxtree.RP);this.basePrec -= 10; if (this.basePrec < 0) throw Error("Syntax error " + this.basePrec);break; case '': case 'B': this.pushOp(c, this.basePrec + 2); this.opState(false, true, c); break; case 'U': this.pushOp(c, this.basePrec + 7);this.opState(false, false, c);break; case 'N': this.operand.push(c);this.opState(true, false, c);break; default: throw Error("Syntax error - in regexp"); } return this; } }, { key: "pushOp", value: function pushOp(op, prec) { var t = this.topV() || { prec: -100 }; //console.log("top",prec, op, t); while (t && gtPrec(t.prec, prec)) { var a, b; b = this.popOper(); if (!t.op.type || t.op.type === 'B') { a = this.popOper(); //console.log("pushOp",{ op: t.op, left: a, right: b }); this.operand.push((0, _rxtree.RX_OP)(t.op, a, b)); } else { if (isMulti(t.op)) { this.operand.push(this.applyMulti(t.op, b)); } else if (t.op === _rxtree.ONE_OR_MORE /*t.op.val == "+" */) { this.oneOrMore(b); //this.operand.push({oper: DOT, left: b, right:{ oper: ZERO_OR_MORE, left: b}}); } else { this.unaryOp(t.op, b); } //this.operand.push({ oper: t.op, left: b}); } //console.log("pushOp",{ oper: t.op, left: a, right: b }); } this.operator.pop(); t = this.topV(); } if (prec >= 0) this.operator.push({ op: op, prec: prec }); } }, { key: "finishUp", value: function finishUp() { if (this.wasOp === undefined) return this; if (!this.wasOp) { this.pushOp(_rxtree.DOT, 0); this.operand.push(_rxtree.DONE); this.pushOp(undefined, -1); } else this.pushOp(undefined, -1); this.wasOp = undefined; return this; } }, { key: "val", value: function val() { return this.operand.length === 0 ? undefined : this.operand[this.operand.length - 1]; } }, { key: "topV", value: function topV() { return this.operator.length === 0 ? undefined : this.operator[this.operator.length - 1]; } }, { key: "popOper", value: function popOper() { return this.operand.pop(); } }, { key: "applyMulti", value: function applyMulti(op, b) { var min = op.fn.min; var max = op.fn.max; var i; if ((0, _rxtree.boundary)(b)) throw new SyntaxError("repetition of boundary element: '" + b.val + "'' has no meaning"); var applyIt = function applyIt(p, b, max) { if (max === 0) return p; for (i = 0; i < max; i++) { b = (0, _rxtree.copyNode)(b); p = p ? (0, _rxtree.RX_CONS)(p, b) : b; } return p || b; }; // 0 or more if (min === 0) { if (max === undefined) return (0, _rxtree.RX_ZERO_OR_MORE)(b); //{ oper: ZERO_OR_MORE, left: b}; else return applyIt(undefined, (0, _rxtree.RX_ZERO_OR_ONE)(b) /*{ oper: ZERO_OR_ONE, left: b}*/, max); } else if (max === undefined) { // 1 or more return applyIt(applyIt(undefined, b, min), (0, _rxtree.RX_ZERO_OR_MORE)(b) /*{ oper: ZERO_OR_MORE, left: b}*/, 1); } // min and max are present return applyIt(applyIt(b, b, min - 1), (0, _rxtree.RX_ZERO_OR_ONE)((0, _rxtree.copyNode)(b)) /*{ oper: ZERO_OR_ONE, left: copyNode(b)}*/, max - min); } }, { key: "oneOrMore", value: function oneOrMore(expr) { if ((0, _rxtree.boundary)(expr)) throw new SyntaxError("repetition of boundary element: " + expr.val + " has no meaning"); //performs the following operation: this.operand.push({oper: DOT, left: expr, right:{ oper: ZERO_OR_MORE, left: expr}}); this.operand.push((0, _rxtree.RX_ONE_OR_MORE)(expr) /*RX_CONS(expr, RX_ZERO_OR_MORE(copyNode(expr)))*/); } }, { key: "unaryOp", value: function unaryOp(op, expr) { if ((0, _rxtree.boundary)(expr)) throw new SyntaxError("modifier (" + op.val + ") of boundary element: " + expr.val + " has no meaning"); this.operand.push((0, _rxtree.RX_UNARY)(op, expr)); } }], [{ key: "parse", value: function parse(str) { //console.log("str", str instanceof RegExp, isRegExp(str), RegExp); if (typeof str != 'string') { str = str.toString().replace(/\\\//g, "/").replace(/^\/|\/$/g, ""); //console.log("str-conv",str); } //console.log("str",str); var list = fixTokens(str.match(_utils.TOKINIZATION_RX)); // tokenize the regular expression list = list || []; //let scripter = (p,tok) var s = list.reduce(function (parser, op) { return parser.addToken(op); }, new RxParser()); //perform the parsing if (s.val()) s.pushOp(_rxtree.DOT, 0); s.operand.push(_rxtree.DONE); s.pushOp(undefined, -1); return (0, _rxtree.makeFSM)(s.val()); } }]); return RxParser; }(); // Generate a string that will match the regex. // /* Work in progress export generateStr(aNode, prefix, chooser) { if( aNode === undefined ) return prefix; if( aNode === DONE ) return prefix; if( aNode.type === 'N' && aNode.oper === undefined ) return genSingle(aNode,prefix,chooser); if( dot(aNode) ) return generateStr(aNode.right,generateStr(aNode.left,prefix, chooser), chooser); if(zero_or_more(aNode) || zero_or_one(aNode)) { let ix = chooser.count(0,zero_or_one(aNode)?10:1, prefix); for(let i=0; i<ix; i++) { prefix = generateStr(aNode.left,prefix, chooser); } return prefix; } if( or(aNode) ) { // collect all the or nodes // pick one at random // use that to generate the string let n = aNode; let list = []; while( or(n)) { list.push(n.left); n = n.right; } list.push(n); n = selectRandom(list); } throw new Error("Copy of an invalid node " + aNode); } */