UNPKG

incr-regex-package

Version:

An incremental regular expression parser in JavaScript; useful for input validation, RegExp

github.com/nurulc/incr-regex-package

nurulc/incr-regex-package

441 lines (371 loc) • 15.3 kB

JavaScript

/**
 * Copyright (c) 2016, Nurul Choudhury
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 */
"use strict";

import { parseMulti, odd, array_eq, array_match, array_append, TOKINIZATION_RX } from "./utils";
import { printExpr } from "./rxprint";
import {
  TERM, BOUNDARY, ANYCH, boundary, SKIP, BS, LP, RP, OR,
  ZERO_OR_ONE, ZERO_OR_MORE, ONE_OR_MORE, DOT, DONE,
  RX_OP, RX_UNARY, RX_CONS, RX_ZERO_OR_ONE, RX_ZERO_OR_MORE, RX_ONE_OR_MORE,
  copyNode, stdRxMeta, makeCharSet, makeFSM, __match
} from "./rxtree";

// Fail fast if the helper module did not provide array_append.
if (!array_append) {
  throw new Error("array_append is undefined");
}

/**
 * Tests whether an operator node is a `{min,max}` repetition operator.
 *
 * @param {object} op - operator node (may be undefined)
 * @returns {boolean|undefined} truthy when `op` has type 'U' and op 'MULTIRANGE'
 */
function isMulti(op) {
  return op && op.type === 'U' && op.op === 'MULTIRANGE';
}

// Matching functions
// ==================
// A match function returns a two element array: [matched, matchedChar].

const MATCH_FALSE = [false, undefined];
const MATCH_TRUE = [true, undefined];

/**
 * Builds a match function for one literal character.
 *
 * @param {string} c - the character to match
 * @returns {function} function(ch) returning `[true, c]` when `ch` is not DONE
 *          and is either undefined or strictly equal to `c`; otherwise MATCH_FALSE
 */
function __matchc(c) {
  return function (ch) {
    return (ch !== DONE && (ch === undefined || ch === c)) ? [true, c] : MATCH_FALSE;
  };
}

/**
 * True when `ch` is not a word character. A falsy `ch` is treated as '.',
 * i.e. as a non-word character. (`\d` is a subset of `\w`, so `\w` alone is tested.)
 *
 * @param {string} ch
 * @returns {boolean}
 */
function isNotAlnum(ch) {
  return !/\w/.test(ch || '.');
}

/**
 * Word-boundary test used as the match function of the `\b` / `\B` tokens.
 * Succeeds when exactly one of `prev` and `ch` is a non-word character
 * (a missing `prev` counts as non-word).
 *
 * NOTE(review): `\b` and `\B` share this function; confirm whether the negation
 * for `\B` is applied elsewhere.
 *
 * @param {string} prev - the previous character (may be undefined)
 * @param {string} ch - the current character
 * @returns {Array} MATCH_TRUE or MATCH_FALSE
 */
function endofstr(prev, ch) {
  const l = !prev || isNotAlnum(prev);
  const r = isNotAlnum(ch);
  return l !== r ? MATCH_TRUE : MATCH_FALSE;
}

/**
 * Match function attached to the `^` and `$` tokens.
 * Matching the beginning of a string is not supported, so this always fails.
 *
 * @returns {boolean} always false
 */
function begining() {
  return false;
}

//==========================
/*
   Convert a RegExp parse tree to a finite state machine (FSM),
   so given a subtree 't' and a connector (next state)
   set the 'nextNode' to the connector.

   cases of t:
      t - simple match   =>  t.nextNode = connector (shortened to t -> connector )
        - A . B          =>  A -> B -> connector
        - A*             =>  A -> t -> connector
        - A | B          =>  A -> connector, B -> connector
        - A ?            =>  t -> connector, A -> connector
*/

// Maps the first character of a postfix operator token to its operator node.
const _metaMap = { "*": ZERO_OR_MORE, "+": ONE_OR_MORE, "?": ZERO_OR_ONE };

// Maps the letter of a `\t`, `\n`, `\r` escape to the character it denotes.
const chmap = { 't': "\t", 'n': "\n", 'r': "\r" };

/**
 * Converts one token string produced by the tokenizer into a parse node.
 * The tests are ordered; the first one that applies wins, and anything that
 * falls through is treated as a literal ('SINGLE') node.
 *
 * @param {string} str - a single token
 * @returns {object} operand node (type 'N'), unary operator node (type 'U'),
 *          or one of the shared nodes imported from ./rxtree
 */
function convert(str) {
  if (str === '<SKIP>') return SKIP;
  if (str === '(' || str === '(?:') return LP;
  if (str === ')') return RP;
  if (str === '.') return ANYCH();
  if (str === '[\\b]') return BS;
  if (str === '^' || str === '$') {
    return { type: 'N', val: str, multi: BOUNDARY, op: 'BOUNDARY', match: begining };
  }
  if (str === '\\b' || str === '\\B') {
    return { type: 'N', val: str, multi: BOUNDARY, op: 'BOUNDARY', match: endofstr };
  }
  // character set: [ ... ]
  if ((/^\[([^\]]|\\.)*\]$/).test(str)) return makeCharSet(str);
  if (str === '|') return OR;
  // ?, +, * optionally followed by a lazy '?' marker; only the first char selects the operator
  if ((/^[?+*]\??$/).test(str)) return _metaMap[str.substring(0, 1)];
  // {min,max} repetition
  if ((/^\{[^}]*\}$/).test(str)) {
    return { type: 'U', val: str, op: 'MULTIRANGE', fn: parseMulti(str) };
  }
  // \d \D \s \S \w \W  (\b was already handled above)
  if ((/^\\[bdDsSwW]$/).test(str)) return stdRxMeta(str);
  // \t \r \n
  if ((/^\\[trn]$/).test(str)) {
    return {
      type: 'N',
      val: chmap[str.substring(1, 2)],
      multi: TERM,
      op: 'NON-PRINTING',
      match: __match("\\" + str.substring(1))
    };
  }
  // an escaped meta character stands for the character itself
  if ((/^\\[.?+*{}()$^\\:|\][]$/).test(str)) {
    return {
      type: 'N',
      val: str.substring(1, 2),
      multi: TERM,
      op: 'SINGLE',
      match: __matchc(str.substring(1))
    };
  }
  return { type: 'N', val: str, multi: TERM, op: 'SINGLE', match: __matchc(str) };
}

/* Disabled:
export function clearNodeMarkers(aNode) {
  if( aNode === undefined ) return undefined;
  if( aNode === DONE ) { DONE.marker = 0; return DONE; }
  //if( aNode.type === 'U' ) return {type: 'U', val: aNode.val, op: aNode.op, match: aNode.match};
  if( aNode.type === 'N' && aNode.oper === undefined ) {
    aNode.marker = 0;
  }
  else {
    clearNodeMarkers(aNode.right);
    clearNodeMarkers(aNode.left);
  }
  aNode.marker = 0;
}
*/

// =============
// Parser helpers
//
// Helper function for precedence.
// Odd values are left associative and even values are right associative,
// e.g. for the '.' operator
//     a . b . c  => a . (b . c)    RIGHT ASSOCIATIVE
//     a . b . c  => (a . b) . c    LEFT ASSOCIATIVE
//
// For the efficient evaluation of regular expressions,
// right associative is more efficient to evaluate.
//
// Note the unary operators ( ? * + ) must be left associative:
//
//     a+* => (a+)*
//

/**
 * Decides whether an operator of precedence `a` already on the stack must be
 * reduced before an operator of precedence `b` is pushed.
 *
 * @param {number} a - precedence of the operator on top of the stack
 * @param {number} b - precedence of the incoming operator
 * @returns {boolean} true when a > b, false when a < b, and `odd(a)` on a tie
 */
function gtPrec(a, b) {
  if (a < b) return false;
  if (a > b) return true;
  return odd(a);
}

// Token rewrite rules: when the token sequence `match` is found it is replaced by `put`.
// Rules are tried in order; the first one that matches at a position wins.
const mapper = [
  { match: ["(", "|", ")"],           put: ["(", ")"] },
  { match: ["<SKIP>", "|", "<SKIP>"], put: ["<SKIP>"] },
  { match: ["<SKIP>", "<SKIP>"],      put: ["<SKIP>"] },
  { match: ["(", "<SKIP>", ")"],      put: ["<SKIP>"] },
  { match: ["<SKIP>", "*"],           put: ["<SKIP>"] },
  { match: ["<SKIP>", "+"],           put: ["<SKIP>"] },
  { match: ["(", "|"],                put: ["(", "<SKIP>", "|"] },
  { match: ["|", ")"],                put: ["|", "<SKIP>", ")"] },
  { match: ["|", "|"],                put: ["|", "<SKIP>", "|"] },
  { match: ["(", ")"],                put: ["<SKIP>"] }
];

// Match an item in the mapper table against the tokenList at position ix.
// Note: '=' below is the mathematical concept of equality and not an assignment.
//
//   let m = mapper[i].match ;               // for some i
//   let tokenList = prefixArray + m + rest ; // + means array concatenation
//   let ix = prefixArray.length;
// then
//   matchMapper(tokenList,ix) = mapper[i]

/**
 * Finds the first rewrite rule whose `match` sequence occurs in `tokenList`
 * at index `ix` (as decided by `array_match`).
 *
 * @param {string[]} tokenList
 * @param {number} ix - position in tokenList to test
 * @returns {object|undefined} the matching mapper entry, or undefined
 */
function matchMapper(tokenList, ix) {
  return mapper.find((rule) => array_match(tokenList, rule.match, ix));
}

// Note: '=' below is the mathematical concept of equality and not an assignment.
// if no 'i' exists where matchMapper(tokenList,i) has a value
// then updateTokens(list) = list
// else
//   for smallest ix
//   let map = matchMapper(pre + m + post,ix)
// then
//   updateTokens(pre + m + post) = pre + map.put + updateTokens(post)

/**
 * Performs one left-to-right rewrite pass over the token list using `mapper`.
 *
 * @param {string[]} tokenList
 * @returns {string[]} a new list with every matched sequence replaced
 */
function updateTokens(tokenList) {
  const res = [];
  for (let i = 0, l = tokenList.length; i < l; i++) {
    const mapV = matchMapper(tokenList, i);
    if (mapV) {
      array_append(res, mapV.put);
      i += mapV.match.length - 1; // skip the tokens consumed by the rule
    } else {
      res.push(tokenList[i]);
    }
  }
  return res;
}

/*
  A simple operator precedence grammar has problems with some acceptable
  regular expressions, so the token list is normalised first. Examples:

   /|abc.../          - cannot start with a binary operator, change to /<SKIP>|abc.../
   /...(|)...)        - empty alternatives in a group, change to /...<SKIP>.../
   /...(|abc...)      - group cannot start with a binary operator, change to /...(<SKIP>|abc.../
   /...(xyz||abc...)  - two binary operators in a row, change to /...(xyz|<SKIP>|abc.../
   /...abc|)...)      - group cannot end with a binary operator, change to /...abc|<SKIP>).../
   /...()...)         - empty group, change to /...<SKIP>.../
   /...(<SKIP>)...)   - reduce the complexity of the skip instruction to allow
                        further optimization, change to /...<SKIP>.../
   /...<SKIP>*...)    - optimize zero or more skips to a single skip, change to /...<SKIP>.../
   /...<SKIP>+...)    - optimize repeated skip, change to /...<SKIP>.../
   /...<SKIP><SKIP>...) - optimize repeated skip, change to /...<SKIP>.../
*/

/**
 * Normalises a token list so the operator precedence parser can handle it:
 * pads a leading/trailing '|' with '<SKIP>' and then applies `updateTokens`
 * repeatedly until the list stops changing.
 *
 * @param {string[]|null|undefined} tokenList - tokens, or the null that
 *        String.prototype.match returns when nothing matched
 * @returns {string[]|null|undefined} the normalised list; a falsy input is
 *          returned unchanged (previously it caused a TypeError)
 */
function fixTokens(tokenList) {
  if (!tokenList) return tokenList;

  if (tokenList[tokenList.length - 1] === "|") tokenList = tokenList.concat("<SKIP>");
  if (tokenList[0] === "|") tokenList = ["<SKIP>"].concat(tokenList);

  let newList = updateTokens(tokenList);
  while (!array_eq(tokenList, newList)) {
    tokenList = newList;
    newList = updateTokens(tokenList);
  }
  return tokenList;
}

/**
 * @param {*} s
 * @returns {boolean} true when `s` is a RegExp instance
 */
function isRegExp(s) {
  return s instanceof RegExp;
}

// Actual parser
/*
   This is a parser for regular expressions; it uses a simple operator precedence parser.
   It uses a regular expression as the tokenizer (TOKINIZATION_RX).
*/

/**
 * Operator precedence parser for regular expressions.
 *
 * State:
 *  - operand  : stack of parsed sub-expressions
 *  - operator : stack of `{op, prec}` entries waiting to be reduced
 *  - basePrec : precedence offset, raised by 10 for every open parenthesis
 *  - wasOp    : true when the previous token left the parser expecting an
 *               operand; set to undefined by finishUp()
 *  - lastop   : the last node passed to opState()
 */
export class RxParser {
  constructor() {
    this.operand = [];
    this.operator = [];
    this.basePrec = 0;
    this.wasOp = true;
    this.lastop = undefined;
  }

  /**
   * Debug representation of the parser state.
   * @returns {string}
   */
  toString() {
    return "{ operand: " + this.operand.map(printExpr) +
           " operator: " + this.operator.map(e => e.toString()) +
           " prec: " + this.basePrec + // was `this.BasePrec`, which is always undefined
           " wasOp: " + this.wasOp + "}";
  }

  /**
   * Parses a regular expression and returns the result of `makeFSM` applied
   * to the parse tree.
   *
   * @param {string|RegExp} str - regular expression source or a RegExp object.
   *        For a RegExp its `source` is used, so flags are not mistaken for
   *        part of the pattern. Any other non-string value goes through
   *        `toString()` with the surrounding slashes stripped.
   * @returns {*} whatever `makeFSM` returns for the parsed tree
   * @throws {Error} on a syntax error in the expression
   */
  static parse(str) {
    if (typeof str !== 'string') {
      str = isRegExp(str)
        ? str.source.replace(/\\\//g, "/")
        : str.toString().replace(/\\\//g, "/").replace(/^\/|\/$/g, "");
    }
    // tokenize the regular expression; match() yields null when there are no tokens
    const list = fixTokens(str.match(TOKINIZATION_RX)) || [];

    // perform the parsing
    const s = list.reduce(function (parser, op) { return parser.addToken(op); }, new RxParser());

    // append the DONE marker: (expr . DONE), or just DONE for an empty expression
    if (s.val()) s.pushOp(DOT, 0);
    s.operand.push(DONE);
    s.pushOp(undefined, -1); // precedence -1 reduces everything left on the operator stack
    return makeFSM(s.val());
  }

  /**
   * Records `op` as the last operator and moves the operand/operator
   * expectation from `from` to `to`.
   *
   * @param {boolean} from - required current value of `wasOp`
   * @param {boolean} to - new value of `wasOp`
   * @param {object} [op] - node stored in `lastop`
   * @throws {Error} when the current `wasOp` differs from `from`
   */
  opState(from, to, op) {
    const tp = (x) => x ? "OPERATOR" : "OPERAND";
    this.lastop = op;
    if (this.wasOp != from) {
      throw new Error("RegExp parsing expected: " + (tp(from)) + " but was: " + tp(this.wasOp));
    }
    this.wasOp = to;
  }

  /**
   * Feeds one token to the parser. A falsy token finishes the parse.
   *
   * @param {string} a - token
   * @returns {RxParser} this
   * @throws {Error} on unbalanced ')' or an unknown node type
   */
  addToken(a) {
    if (!a) return this.finishUp();
    const c = convert(a);

    // two operands in a row (or an operand followed by '('): insert an implicit concatenation
    if ((c.type == 'N' || c.type == 'L') && !this.wasOp) {
      this.pushOp(DOT, this.basePrec + 4);
      this.opState(false, true);
    }

    switch (c.type) {
      case 'L':
        this.opState(true, true, LP);
        this.basePrec += 10;
        break;
      case 'R':
        this.opState(false, false, RP);
        this.basePrec -= 10;
        if (this.basePrec < 0) throw Error("Syntax error " + this.basePrec);
        break;
      case '':
      case 'B':
        this.pushOp(c, this.basePrec + 2);
        this.opState(false, true, c);
        break;
      case 'U':
        this.pushOp(c, this.basePrec + 7);
        this.opState(false, false, c);
        break;
      case 'N':
        this.operand.push(c);
        this.opState(true, false, c);
        break;
      default:
        throw Error("Syntax error - in regexp");
    }
    return this;
  }

  /**
   * Reduces every stacked operator that binds tighter than `prec` (see gtPrec)
   * and then pushes `op`, unless `prec` is negative.
   *
   * @param {object|undefined} op - operator node to push
   * @param {number} prec - its precedence; a negative value only reduces
   */
  pushOp(op, prec) {
    let t = this.topV() || { prec: -100 };
    while (t && gtPrec(t.prec, prec)) {
      const b = this.popOper();
      if (!t.op.type || t.op.type === 'B') {
        // binary operator: needs a second (left) operand
        const a = this.popOper();
        this.operand.push(RX_OP(t.op, a, b));
      } else if (isMulti(t.op)) {
        this.operand.push(this.applyMulti(t.op, b));
      } else if (t.op === ONE_OR_MORE) {
        this.oneOrMore(b);
      } else {
        this.unaryOp(t.op, b);
      }
      this.operator.pop();
      t = this.topV();
    }
    if (prec >= 0) this.operator.push({ op: op, prec: prec });
  }

  /**
   * Completes the parse: reduces the remaining operators and, when the last
   * token was an operand, appends the DONE marker first. Calling it again
   * after completion is a no-op.
   *
   * @returns {RxParser} this
   */
  finishUp() {
    if (this.wasOp === undefined) return this;
    if (!this.wasOp) {
      this.pushOp(DOT, 0);
      this.operand.push(DONE);
      this.pushOp(undefined, -1);
    } else {
      this.pushOp(undefined, -1);
    }
    this.wasOp = undefined;
    return this;
  }

  /** @returns {object|undefined} top of the operand stack, undefined when empty */
  val() {
    return this.operand.length === 0 ? undefined : this.operand[this.operand.length - 1];
  }

  /** @returns {object|undefined} top of the operator stack, undefined when empty */
  topV() {
    return this.operator.length === 0 ? undefined : this.operator[this.operator.length - 1];
  }

  /** @returns {object|undefined} pops and returns the top of the operand stack */
  popOper() {
    return this.operand.pop();
  }

  /**
   * Expands a `{min,max}` repetition of `b` into concatenations of copies of
   * `b`, optional copies and/or a zero-or-more node.
   *
   * @param {object} op - MULTIRANGE operator; `op.fn.min` / `op.fn.max` hold the bounds
   * @param {object} b - the expression being repeated
   * @returns {object} the expanded expression
   * @throws {SyntaxError} when `b` is a boundary element
   */
  applyMulti(op, b) {
    const min = op.fn.min;
    const max = op.fn.max;
    if (boundary(b)) {
      throw new SyntaxError("repetition of boundary element: '" + b.val + "' has no meaning");
    }

    // Appends `count` fresh copies of `node` to `prefix` (prefix may be undefined).
    const applyIt = (prefix, node, count) => {
      if (count === 0) return prefix;
      for (let i = 0; i < count; i++) {
        node = copyNode(node);
        prefix = prefix ? RX_CONS(prefix, node) : node;
      }
      return prefix || node;
    };

    if (min === 0) {
      // {0,}  => b*
      if (max === undefined) return RX_ZERO_OR_MORE(b);
      // {0,max} => (b?) repeated max times
      // NOTE(review): with max === 0 this yields undefined — confirm whether parseMulti can produce that.
      return applyIt(undefined, RX_ZERO_OR_ONE(b), max);
    } else if (max === undefined) {
      // {min,} => b repeated min times followed by b*
      return applyIt(applyIt(undefined, b, min), RX_ZERO_OR_MORE(b), 1);
    }
    // {min,max} => b repeated min times followed by (max-min) optional copies
    return applyIt(applyIt(b, b, min - 1), RX_ZERO_OR_ONE(copyNode(b)), max - min);
  }

  /**
   * Pushes the one-or-more form of `expr` onto the operand stack.
   *
   * @param {object} expr
   * @throws {SyntaxError} when `expr` is a boundary element
   */
  oneOrMore(expr) {
    if (boundary(expr)) {
      throw new SyntaxError("repetition of boundary element: " + expr.val + " has no meaning");
    }
    this.operand.push(RX_ONE_OR_MORE(expr));
  }

  /**
   * Pushes the unary operator `op` applied to `expr` onto the operand stack.
   *
   * @param {object} op - unary operator node
   * @param {object} expr
   * @throws {SyntaxError} when `expr` is a boundary element
   */
  unaryOp(op, expr) {
    if (boundary(expr)) {
      throw new SyntaxError("modifier (" + op.val + ") of boundary element: " + expr.val + " has no meaning");
    }
    this.operand.push(RX_UNARY(op, expr));
  }
}

// Generate a string that will match the regex.
//
/* Work in progress

export generateStr(aNode, prefix, chooser) {
  if( aNode === undefined ) return prefix;
  if( aNode === DONE ) return prefix;
  if( aNode.type === 'N' && aNode.oper === undefined ) return genSingle(aNode,prefix,chooser);
  if( dot(aNode) ) return generateStr(aNode.right,generateStr(aNode.left,prefix, chooser), chooser);
  if(zero_or_more(aNode) || zero_or_one(aNode)) {
    let ix = chooser.count(0,zero_or_one(aNode)?10:1, prefix);
    for(let i=0; i<ix; i++) {
      prefix = generateStr(aNode.left,prefix, chooser);
    }
    return prefix;
  }
  if( or(aNode) ) {
    // collect all the or nodes
    // pick one at random
    // use that to generate the string
    let n = aNode;
    let list = [];
    while( or(n)) {
      list.push(n.left);
      n = n.right;
    }
    list.push(n);
    n = selectRandom(list);
  }
  throw new Error("Copy of an invalid node " + aNode);
}
*/