UNPKG

jetiny-regulex

Version:

JavaScript Regular Expression Parser and Visualizer.

github.com/jetiny/regulex

1,549 lines (1,436 loc) • 430 kB

JavaScript

(function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : typeof define === 'function' && define.amd ? define(['exports'], factory) : (factory((global.regulex = {}))); }(this, (function (exports) { 'use strict'; // if (typeof define !== 'function') var define = require('amdefine')(module); // define(function () { //@CHANGE /*Kit*/ var isBrowser = typeof window === 'object' && window.toString() === "[object Window]"; /** Build sorted Set from array. This function will corrupt the original array Proper usage:a=Set(a); @param {ArrayLike} a @return {Set} return new ArrayLike Set */ function Set(a, _sorted) { if (a._Set) return a; if (!_sorted) a = sortUnique(a); //@returns Boolean. Detect if x is in set. //`cmp` is custom compare functions return -1,0,1. // function cmp(x,item):Ordering(LT=-1|EQ=0|GT=1); a.contains = function (x, cmp) { return !!~bsearch(a, x, cmp); }; a.indexOf = function (x, cmp) { return bsearch(a, x, cmp); }; a.toArray = function () { return copyArray(a); }; /** Union with another Set @param {Set|Array} b If b is an array,it will be corrupted by sortUnqiue @return {Set} return new Set */ a.union = function (b) { b = Set(b); var n = a.length + b.length, c = new a.constructor(n); for (var i = 0, j = 0, k = 0; k < n; k++) { //merge if (a[i] === b[j]) { c[k] = a[i++];j++;n--; } else if (a[i] < b[j]) c[k] = a[i++];else c[k] = b[j++]; } c.length = n; return Set(c.length === n ? c : copyArray(c, n), true); }; a.inspect = a.toArray; a._Set = true; return a; } var LT = -1, EQ = 0, GT = 1; function _cmp(a, b) { return a < b ? LT : a === b ? EQ : GT; } function bsearch(a, x, cmp) { var lo = 0, n = a.length, hi = n - 1, pivot, c; if (n < 1) return -1; cmp = cmp || _cmp; //custom compare functions if (n === 1) return cmp(x, a[lo]) === EQ ? lo : -1; if (cmp(x, a[lo]) === LT || cmp(x, a[hi]) === GT) return -1; do { pivot = lo + (hi - lo + 1 >> 1); c = cmp(x, a[pivot]); if (c === EQ) return pivot; if (c === LT) hi = pivot - 1;else lo = pivot + 1; } while (lo <= hi); return -1; } /** Return sorted Set. This function will corrupt the original array Proper usage: a=sortUnique(a); @param {ArrayLike} a @return {ArrayLike} new unique sorted array */ function sortUnique(a) { var n = a.length; if (n <= 1) return a; //do a shell sort var k = 1, hi = n / 3 | 0, i, j, tmp; while (k < hi) k = k * 3 + 1; while (k > 0) { for (i = k; i < n; i++) { for (j = i; j >= k && a[j] < a[j - k]; j -= k) { tmp = a[j];a[j] = a[j - k];a[j - k] = tmp; } } k = k / 3 | 0; } var last = a[0], x; for (i = 1, j = 1; i < n; i++) { x = a[i]; if (x === last) continue; last = a[j++] = a[i]; } a.length = j; return a.length === j ? a : copyArray(a, j); //Typed Array length property only has a getter } function copyArray(a, size) { size = typeof size === 'undefined' ? a.length : size; var ret = new a.constructor(size), i = size; while (i--) ret[i] = a[i]; return ret; } /** Unique by toString. This function will corrupt the original array but preserve the original order. */ function hashUnique(a) { var table = {}, i = 0, j = 0, l = a.length, x; for (; i < l; i++) { x = a[i]; if (table.hasOwnProperty(x)) continue; table[x] = 1; a[j++] = x; } a.length = j; return a; } /** Object id unique. This function will corrupt the original array. Correct usage: a=idUnique(a); @param {[Object]} NonPrimitive Array */ function idUnique(a) { var i, j, l = a.length, p, guid = (Math.random() * 1E10).toString(32) + (+new Date()).toString(32); for (i = j = 0; i < l; i++) { p = a[i]; if (p == null) continue; if (p.hasOwnProperty(guid)) continue; Object.defineProperty(p, guid, { value: 1, enumerable: false }); a[j++] = p; } i = j; while (i--) { //clean guid a[i][guid] = undefined; } a.length = j; return a; } /** Classify charsets to non-overlapping sorted disjoint ranges. @param {[Range]} @return {ranges:DisjointRanges,map:OriginalRangesToDisjoinRangesMap} Example: classify(['az','09','a','bb']) => { ranges:['a','b','cz','09'], map:{'az':['a','b','cz'],'09':['09'],'a':['a'],'b':['b']} } */ function classify(ranges) { ranges = ranges.map(function (c) { return !c[1] ? c + c : c; }); var i, j, k, l, r, n; ranges = sortUnique(ranges);n = ranges.length; var singleMap = Object.create(null), headMap = Object.create(null), tailMap = Object.create(null), head, tail; for (i = 0; i < n; i++) { r = ranges[i];tail = r[1];headMap[r[0]] = true;tailMap[tail] = true; for (j = i; j < n; j++) { head = ranges[j][0]; if (head >= tail) { if (head === tail) singleMap[tail] = true; break; } } } var chars = sortUnique(ranges.join('').split('')), results = Object.keys(singleMap), c = chars[0], tmpMap = Object.create(null), map = Object.create(null); for (i = 0; i < n; i++) tmpMap[ranges[i]] = []; if (singleMap[c]) { for (i = 0; i < n; i++) { r = ranges[i]; if (r[0] === c) tmpMap[r].push(c);else if (r[0] > c) break; } } for (i = 0, l = chars.length - 1; i < l; i++) { head = chars[i];tail = chars[i + 1]; if (tailMap[head]) head = succ(head); if (headMap[tail]) tail = pred(tail); if (head <= tail) { c = head === tail ? head : head + tail; for (j = 0; j < n; j++) { r = ranges[j]; if (r[0] > tail) break; if (r[0] <= head && tail <= r[1]) tmpMap[r].push(c), results.push(c); } } head = chars[i];tail = chars[i + 1]; //keep insert order,push single char later if (singleMap[tail]) { for (j = 0; j < n; j++) { r = ranges[j]; if (r[0] > tail) break; if (r[0] <= tail && tail <= r[1]) tmpMap[r].push(tail); } } } results = sortUnique(results); for (k in tmpMap) map[k[0] === k[1] ? k[0] : k] = tmpMap[k]; return { ranges: results, map: map }; } /** Convert exclude ranges to include ranges Example: ^b-y, ['by'] to ["\0a","z\uffff"] @param {[Range]} @return Sorted disjoint ranges */ function negate(ranges /*:[Range rg]*/) { var MIN_CHAR = "\u0000", // work around UglifyJS's bug // it will convert unicode escape to raw char // that will cause error in IE // because IE recognize "\uFFFF" in source code as "\uFFFD" MAX_CHAR = String.fromCharCode(0xFFFF); ranges = classify(ranges).ranges; var negated = []; if (!ranges.length) return negated; if (ranges[0][0] !== MIN_CHAR) ranges.unshift(MAX_CHAR); var hi = ranges.length - 1; if ((ranges[hi][1] || ranges[hi][0]) !== MAX_CHAR) ranges.push(MIN_CHAR); ranges.reduce(function (acc, r) { var start = succ(acc[1] || acc[0]), end = pred(r[0]); if (start < end) negated.push(start + end); if (start === end) negated.push(start); return r; }); return negated; } /** Parse simple regex style charset string like '^a-bcdf' to disjoint ranges. Character classes like "\w\s" are not supported! @param {String} charset Valid regex charset [^a-z0-9_] input as "^a-z0-9_". @return {[Range]} return sorted disjoint ranges */ function parseCharset(charset /*:String*/) { charset = charset.split(''); var chars = [], ranges = [], exclude = charset[0] === '^' && charset.length > 1 && charset.shift(); charset.forEach(function (c) { if (chars[0] == '-' && chars.length > 1) { //chars=['-','a'],c=='z' if (chars[1] > c) // z-a is invalid throw new Error('Charset range out of order:' + chars[1] + '-' + c + '!'); ranges.push(chars[1] + c); chars.splice(0, 2); } else chars.unshift(c); }); ranges = ranges.concat(chars); //convert exclude to include return exclude ? negate(ranges) : classify(ranges).ranges; } /** Coalesce closed ranges. ['ac','d','ez'] will be coalesced to ['az'] @param {[Range]} ranges Sorted disjoint ranges return by `classify`. @return {[Range]} Compressed ranges */ function coalesce(ranges) { if (!ranges.length) return []; var results = [ranges[0]]; ranges.reduce(function (a, b) { var prev = results.length - 1; if (a[a.length - 1] === pred(b[0])) { return results[prev] = results[prev][0] + b[b.length - 1]; } results.push(b); return b; }); return results.reduce(function (results, range) { if (range.length === 2 && range[0] === pred(range[1])) { results.push(range[0]); results.push(range[1]); } else { results.push(range); } return results; }, []); } function chr(n) { return String.fromCharCode(n); } function ord(c) { return c.charCodeAt(0); } function pred(c) { return String.fromCharCode(c.charCodeAt(0) - 1); } function succ(c) { return String.fromCharCode(c.charCodeAt(0) + 1); } var printEscapeMap = { "\n": "\\n", "\t": "\\t", "\f": "\\f", "\r": "\\r", " ": " ", "\\": "\\\\", "\0": "\\0" }; // Convert string to printable,replace all control chars and unicode to hex escape function toPrint(s, isRaw) { var ctrl = /[\x00-\x1F\x7F-\x9F]/, unicode = /[\u009F-\uFFFF]/; s = s.split('').map(function (c) { if (!isRaw && printEscapeMap.hasOwnProperty(c)) return printEscapeMap[c];else if (unicode.test(c)) return '\\u' + ('00' + ord(c).toString(16).toUpperCase()).slice(-4);else if (ctrl.test(c)) return '\\x' + ("0" + ord(c).toString(16).toUpperCase()).slice(-2); return c; }).join(''); return s; } //flatten two-dimensional array to one-dimension function flatten2(a) { return [].concat.apply([], a); } function repeats(s, n) { return new Array(n + 1).join(s); } function log() { // var a=slice.call(arguments); // if (isBrowser) { // Function.prototype.apply.apply(console.log,[console,a]); // } else {//Assume it is Node.js // var s='util'; // var util=require(s); // skip require.js // a.forEach(function (x) { // console.log(util.inspect(x,{ // showHidden:false,customInspect:true, // depth:64,colors:true // })); // }); // } } function locals(f) { var src = f.toString(); var re = /^\s+function\s+([a-zA-Z]\w+)\s*\(/mg; var fns = [], match; while (match = re.exec(src)) fns.push(match[1]); var methods = [], f; while (f = fns.pop()) methods.push(f + ':' + f); return '{\n' + methods.join(',\n') + '\n}'; } var Kit = { // return { sortUnique: sortUnique, idUnique: idUnique, hashUnique: hashUnique, Set: Set, repeats: repeats, negate: negate, coalesce: coalesce, classify: classify, parseCharset: parseCharset, chr: chr, ord: ord, pred: pred, succ: succ, toPrint: toPrint, flatten2: flatten2, log: log, isBrowser: isBrowser, locals: locals }; //@CHANGE // if (typeof define !== 'function') var define = require('amdefine')(module); // define(['./Kit'],function (K) { /** A Naive NFA Implementation Start state is always named 'start' @param {NFAConfig|CompactNFAConfig} a type NFAConfig = {compact:false,accepts:StateSet,trans:[Transition]} type State = String type StateSet = [State] type Tranisition = {from:StateSet,to:StateSet,charset:Charset,action:Action,assert:Assert} type Charset = String|[Range] Charset is similar to regex charset,supports negation and range but metacharacters Examples: includes: 'abc0-9','[^]' excludes: '^c-z0-9','^a^' //excluded 'a' and '^' two chars any char: '\0-\uffff' Or set charset to processed disjoint ranges:['ac','d','eh'] Set `charset` to empty string to enable empty move(ε-moves). Action: Function(stack:Array,c:String,i:Int,state:String,inputs:String):Array stack: storage stack c: current char i: current index state: current state inputs: whole input string Optional return new stack Only eMove transition allow `assert` Actions and Asserts of eMove transition always execute before non-eMove transitions on current path. Assert: Function(stack:Array,c:String,i:Int,state:String,inputs:String):Boolean Return True if assertion just success,if fail return false If success and need skip num chars, return the Int count to increase `i`,this feature is designed for backref. Stack modifications in action only allow shift,unshift and return new stack. NFAConfig example used to recognize numbers:{ compact:false,accepts:'start'. trans:[{from:'start',to:'start',charset:'0-9'}] } CompactNFAConfig example,see `structure` function. An automaton used to recognize triples:{ compact:true,accepts:'start', trans:[ ['start>start','0369'],['start>q1','147'],['start>q2','258'], ['q1>q1','0369'],['q1>q2','147'],['q1>start','258'], ['q2>q2','0369'],['q2>q1','258'],['q2>start','147'], ] }; @return { input:Function } */ function NFA(a) { a = a.compact ? structure(a) : a; var accepts = {}, i, trans = a.trans, // FMap={toState:Function} router = {/* fromState : { eMove:[{to:State,action:Function,assert:Function,eMove:Bool}], eMoveStates:[State],// ε-move dest states charMove:{ // expanded to include eMove Range:[{to:State,action:Function,assert:Function,eMove:Bool}], Char:[{to:State,action:Function,assert:Function,eMove:Bool}] }, ranges:Set([Range]), // all trans keep original order in transitions list trans:[Transition] } */}; for (i = 0, n = a.accepts.length; i < n; i++) accepts[a.accepts[i]] = true; //add accept states var t; for (i = 0, n = trans.length; i < n; i++) { //collect charsets t = trans[i]; if (t.charset) t.ranges = typeof t.charset === 'string' ? Kit.parseCharset(t.charset) : t.charset;else t.eMove = true; t.from.forEach(function (from) { var path = router[from] = router[from] || { eMoveStates: [], eMove: [], charMove: {}, trans: [], ranges: [] }; if (t.eMove) path.eMoveStates = path.eMoveStates.concat(t.to);else path.ranges = path.ranges.concat(t.ranges); path.trans.push(t); }); } var fromStates = Object.keys(router); fromStates.forEach(function (from) { var path = router[from], trans = path.trans, charMove = path.charMove, eMove = path.eMove, ranges = path.ranges; var cls = Kit.classify(ranges), rangeMap = cls.map; trans.forEach(function (t) { if (t.eMove) { t.to.forEach(function (toState) { eMove.push({ to: toState, action: t.action, assert: t.assert, eMove: true }); }); } else { Kit.flatten2(t.ranges.map(function (r) { return rangeMap[r]; })).forEach(function (r) { (charMove[r] = charMove[r] || []).push(t); }); } }); ranges = Kit.Set(cls.ranges.filter(function (rg) { return !!rg[1]; })); //exclude single char path.ranges = ranges; // expand charMove to includes ε-move Object.keys(charMove).forEach(function (r) { var transChar = charMove[r]; var transAll = []; trans.forEach(function (t) { t.to.forEach(function (toState) { if (t.eMove || ~transChar.indexOf(t)) transAll.push({ to: toState, action: t.action, assert: t.assert, eMove: t.eMove }); }); }); charMove[r] = transAll; }); delete path.trans; delete path.eMoveStates; }); return { accepts: accepts, router: router, input: input, assertDFA: assertDFA, accept: accept }; } function accept(state) { return this.accepts.hasOwnProperty(state); } function assertDFA() { var router = this.router; var fromStates = Object.keys(router), path; for (var i = 0, l = fromStates.length; i < l; i++) { path = router[fromStates[i]]; if (path.eMove.length > 1) { throw new Error("DFA Assertion Fail!\nFrom state `" + fromStates[i] + "` can goto to multi ε-move states!"); } var charMove = path.charMove; var ranges = Object.keys(charMove); for (var k = 0, n = ranges.length; k < n; k++) { var t = charMove[ranges[k]]; if (t.length !== 1) { Kit.log(charMove); throw new Error("DFA Assertion Fail!\nFrom state `" + fromStates[i] + "` via charset `" + ranges[k] + "` can goto to multi states!"); } } if (ranges.length && path.eMove.length) { throw new Error("DFA Assertion Fail!\nFrom state `" + fromStates[i] + "` can goto extra ε-move state!"); } } return true; } /** return { stack:Array, acceptable:Boolean, lastIndex:Int, lastState:String } */ function input(s, startIndex, _debug) { startIndex = startIndex || 0; var _this = this; return _input(s, startIndex, 'start', [], startIndex - 1); function _input(s, startIndex, fromState, stack, lastIndex) { recur: do { var c, range, advanceIndex, lastResult; var path = _this.router[fromState]; if (!path) break; var eMove = path.eMove, charMove = path.charMove, trans; if (startIndex < s.length) { c = s[startIndex]; if (charMove.hasOwnProperty(c)) { trans = charMove[c]; } else if (range = findRange(path.ranges, c)) { trans = charMove[range]; } else { trans = eMove; } } else { trans = eMove; } var sp = stack.length, t, skip, ret, oldLastIndex = lastIndex; for (var j = 0, n = trans.length; j < n; j++) { t = trans[j]; advanceIndex = t.eMove ? 0 : 1; lastIndex = oldLastIndex; stack.splice(0, stack.length - sp); sp = stack.length; // backup stack length if (t.assert) { if ((skip = t.assert(stack, c, startIndex, fromState, s)) === false) continue; // For backref skip num chars if (typeof skip === 'number') { startIndex += skip;lastIndex += skip; } } if (t.action) stack = t.action(stack, c, startIndex, fromState, s) || stack; lastIndex = t.eMove ? lastIndex : startIndex; _debug && Kit.log(c + ":" + fromState + ">" + t.to); if (j === n - 1) { startIndex += advanceIndex; fromState = t.to; continue recur; // Human flesh tail call optimize? } else { ret = _input(s, startIndex + advanceIndex, t.to, stack, lastIndex); } if (ret.acceptable) return ret; lastResult = ret; } if (lastResult) return lastResult; break; } while (true); return { stack: stack, lastIndex: lastIndex, lastState: fromState, acceptable: _this.accept(fromState) }; } } function findRange(ranges, c /*:Char*/) { var i = ranges.indexOf(c, cmpRange); if (!~i) return false; return ranges[i]; } function cmpRange(c, rg) { var head = rg[0], tail = rg[1]; if (c > tail) return 1; if (c < head) return -1; return 0; } /** Convert CompactNFAConfig to NFAConfig @param {CompactNFAConfig} a type CompactNFAConfig={compact:true,accepts:CompactStateSet,trans:[CompactTransition]} type CompactStateSet = StateSet.join(",") type CompactTransition = [CompactStateMap,Charset,Action,Assert] type CompactStateMap = FromStateSet.join(",")+">"+ToStateSet.join(",") */ function structure(a) { a.accepts = a.accepts.split(','); var ts = a.trans, i = ts.length, t, s, from, to; while (i--) { t = ts[i]; s = t[0].split('>'); from = s[0].split(','); to = s[1].split(','); ts[i] = { from: from, to: to, charset: t[1], action: t[2], assert: t[3] }; } a.compact = false; return a; } //@CHANGE var NFA_1 = NFA; // if (typeof define !== 'function') var define = require('amdefine')(module); // define(['./NFA','./Kit'],function (NFA,K) { //@CHANGE /** Parse Regex to AST parse:Function(re:String) parse.Constants parse.exportConstants:Function */ var Constants = { //Node Type Constants EXACT_NODE: "exact", CHARSET_NODE: "charset", CHOICE_NODE: "choice", GROUP_NODE: "group", ASSERT_NODE: "assert", DOT_NODE: "dot", BACKREF_NODE: "backref", EMPTY_NODE: "empty", //Assertion Type Constants AssertLookahead: "AssertLookahead", AssertNegativeLookahead: "AssertNegativeLookahead", AssertNonWordBoundary: "AssertNonWordBoundary", AssertWordBoundary: "AssertWordBoundary", AssertEnd: "AssertEnd", AssertBegin: "AssertBegin" }; /** AST: Node = { // Base Node interface type:NodeType, // Node type string raw:String, // Raw regex string repeat:{ min:Int,max:Int, // Repeat times. [min,max] means "{min,max}". // Set max=Infinity forms a "{min,}" range // Set max=undefined forms a "{min}" range nonGreedy:Boolean // If this repeat is non-greedy,viz. had a "?" quantifier }, indices:[Int,Int] // Raw string in original regex index range [start,end) // You can use regexStr.slice(start,end) to retrieve node.raw string } NodeType = exact|dot|charset|choice|empty|group|assert|backref ExactNode = { // Literal match chars string type:"exact", chars:"c", raw:"c{1,2}" // When repeat or escape,raw will diff from chars } DotNode = {type:"dot"} //viz. "." , dot match any char but newline "\n\r" // Because of IgnoreCase flag, // The client code need to compute disjoint ranges itself. CharsetNode = { type:"charset", exclude:Boolean, // True only if it is "[^abc]" form classes:[Char], // Named character classes. e.g. [\d]. // All names: d(Digit),D(Non-digit),w,W,s,S chars:String, // Literal chars. e.g. [abc] repr as 'abc' ranges:[Range] // Range: a-z repr as 'az' } ChoiceNode = { type:"choice", branches:[[Node]] // Choice more branches,e.g. /a|b|c/ } EmptyNode = { // This node will match any input,include empty string type:"empty" //new RegExp("") will give an empty node. /a|/ will give branches with an empty node } GroupNode = { type:"group", nonCapture:false, // true means:"(?:abc)",default is false num:Int, // If capture is true.It is group's int index(>=1). endParenIndex:Int, // /(a)+/ will generate only one node,so indices is [0,4],endParenIndex is 3 sub:[Node] // Sub pattern nodes } AssertNode = { type:"assert", assertionType:String, //See Assertion Type Constants sub:[Node] //Optional,\b \B ^ $ Assertion this property is empty } Only AssertLookahead,AssertNegativeLookahead has `sub` property "(?=(abc))" repr as { type:"assert", assertionType:AssertLookahead, sub:[{ type:"group", sub:[{type:"exact",raw:"abc"}] }] } BackrefNode = { type:"backref", num:Int // Back references index.Correspond to group.num } */ function exportConstants() { var code = Object.keys(Constants).map(function (k) { return k + "=" + JSON.stringify(Constants[k]); }).join(";"); var Global = function () { return typeof window === 'object' && window; }(); Global.eval(code); } exportConstants(); function AST(a) { this.raw = a.raw; this.tree = a.tree; this.groupCount = a.groupCount; } /** @param {Function} f Visitor function accept node as one argument. @param {String} nodeType Give the node type you want to visit,or omitted to visit all */ AST.prototype.traverse = function (f, nodeType) { travel(this.tree, f); function travel(stack, f) { stack.forEach(function (node) { if (!nodeType || node.type === nodeType) f(node); if (node.sub) travel(node.sub, f);else if (node.branches) node.branches.forEach(function (b) { travel(b, f); }); }); } }; var G_DEBUG; /** @param {String} re input regex as string @param {Object} [options] @option {Boolean} options.debug If enable debug log @option {Boolean} options.strict If enable strict mode @return {Object} { raw:String, // original re groupCount:Int, //Total group count tree:Array // AST Tree Stack } */ function parse(re, _debug) { G_DEBUG = _debug; var parser = getNFAParser(); var ret, stack, lastState; ret = parser.input(re, 0, _debug); stack = ret.stack; stack = actions.endChoice(stack); // e.g. /a|b/ lastState = ret.lastState; var valid = ret.acceptable && ret.lastIndex === re.length - 1; //just syntax valid regex if (!valid) { var error; switch (lastState) { case 'charsetRangeEndWithNullChar': error = { type: 'CharsetRangeEndWithNullChar', message: "Charset range end with NUL char does not make sense!\n" + "Because [a-\\0] is not a valid range.\n" + "And [\\0-\\0] should be rewritten into [\\0]." }; break; case 'repeatErrorFinal': error = { type: 'NothingRepeat', message: "Nothing to repeat!" }; break; case 'digitFollowNullError': error = { type: 'DigitFollowNullError', message: "The '\\0' represents the <NUL> char and cannot be followed by a decimal digit!" }; break; case 'charsetRangeEndClass': error = { type: 'CharsetRangeEndClass', message: 'Charset range ends with class such as "\\w\\W\\d\\D\\s\\S" is invalid!' }; break; case 'charsetOctEscape': error = { type: 'DecimalEscape', message: 'Decimal escape appears in charset is invalid.Because it can\'t be explained as backreference.And octal escape is deprecated!' }; break; default: if (lastState.indexOf('charset') === 0) { error = { type: 'UnclosedCharset', message: 'Unterminated character class!' }; } else if (re[ret.lastIndex] === ')') { error = { type: 'UnmatchedParen', message: 'Unmatched end parenthesis!' }; } else { error = { type: 'UnexpectedChar', message: 'Unexpected char!' }; ret.lastIndex++; } } if (error) { error.lastIndex = ret.lastIndex; error.astStack = ret.stack; error.lastState = lastState; throw new RegexSyntaxError(error); } } if (stack._parentGroup) { throw new RegexSyntaxError({ type: "UnterminatedGroup", message: "Unterminated group!", lastIndex: stack._parentGroup.indices[0], lastState: lastState, astStack: stack }); } if (valid) { var groupCount = stack.groupCounter ? stack.groupCounter.i : 0; delete stack.groupCounter; _fixNodes(stack, re, re.length); stack = _filterEmptyExact(stack); var ast = new AST({ raw: re, groupCount: groupCount, tree: stack }); // Check charset ranges out of order error.(Because of charsetRangeEndEscape) ast.traverse(_checkCharsetRange, CHARSET_NODE); // Check any repeats after assertion. e.g. /a(?=b)+/ doesn't make sense. ast.traverse(_checkRepeat, ASSERT_NODE); _coalesceExactNode(stack); G_DEBUG = false; return ast; } } parse.Constants = Constants; parse.exportConstants = exportConstants; parse.RegexSyntaxError = RegexSyntaxError; parse.getNFAParser = getNFAParser; var _NFAParser; function getNFAParser() { if (!_NFAParser) { _NFAParser = NFA_1(config, G_DEBUG); } return _NFAParser; } function _set(obj, prop, value) { Object.defineProperty(obj, prop, { value: value, enumerable: G_DEBUG, writable: true, configurable: true }); } function _filterEmptyExact(stack) { return stack.filter(function (node) { if (node.type == EXACT_NODE && node.concatTemp) { delete node.concatTemp; return !!node.chars; } else if (node.sub) { node.sub = _filterEmptyExact(node.sub); } else if (node.branches) { node.branches = node.branches.map(_filterEmptyExact); } return true; }); } function _coalesceExactNode(stack) { var prev = stack[0]; down(prev); for (var i = 1, j = 1, l = stack.length, node; i < l; i++) { node = stack[i]; if (node.type === EXACT_NODE) { if (prev.type === EXACT_NODE && !prev.repeat && !node.repeat) { prev.indices[1] = node.indices[1]; prev.raw += node.raw; prev.chars += node.chars; continue; } } else { down(node); } stack[j++] = node; prev = node; } if (prev) stack.length = j; function down(node) { if (node.sub) { _coalesceExactNode(node.sub); } else if (node.branches) { node.branches.map(_coalesceExactNode); } } } function _fixNodes(stack, re, endIndex) { if (!stack.length) { stack.push({ type: EMPTY_NODE, indices: [endIndex, endIndex] }); return; } stack.reduce(function (endIndex, node) { node.indices.push(endIndex); node.raw = re.slice(node.indices[0], endIndex); if (node.type === GROUP_NODE || node.type === ASSERT_NODE && node.sub) { _fixNodes(node.sub, re, node.endParenIndex); } else if (node.type === CHOICE_NODE) { node.branches.reduce(function (endIndex, branch) { _fixNodes(branch, re, endIndex); var head = branch[0]; // Reversed,so branch[0] is head.Dammit mystic code return (head ? head.indices[0] : endIndex) - 1; // skip '|' }, endIndex); node.branches.reverse(); } else if (node.type === EXACT_NODE) { if (!node.concatTemp) { node.chars = node.chars || node.raw; } } return node.indices[0]; }, endIndex); stack.reverse(); } function _checkRepeat(node) { if (node.repeat) { var astype = node.assertionType; var msg = 'Nothing to repeat! Repeat after assertion doesn\'t make sense!'; if (astype === 'AssertLookahead' || astype === 'AssertNegativeLookahead') { var assertifier = astype === 'AssertLookahead' ? '?=' : '?!'; var pattern = '(' + assertifier + 'b)'; msg += '\n/a' + pattern + '+/、/a' + pattern + '{1,n}/ are the same as /a' + pattern + '/。\n' + '/a' + pattern + '*/、/a' + pattern + '{0,n}/、/a' + pattern + '?/ are the same as /a/。'; } throw new RegexSyntaxError({ type: 'NothingRepeat', lastIndex: node.indices[1] - 1, message: msg }); } } //check charset ranges out of order error.(Because of charsetRangeEndEscape) // [z-\u54] had to defer check function _checkCharsetRange(node) { node.ranges = Kit.sortUnique(node.ranges.map(function (range) { if (range[0] > range[1]) { throw new RegexSyntaxError({ type: "OutOfOrder", lastIndex: range.lastIndex, message: "Range [" + range.join('-') + "] out of order in character class!" }); } return range.join(''); })); } function RegexSyntaxError(e) { this.name = "RegexSyntaxError"; this.type = e.type; this.lastIndex = e.lastIndex; this.lastState = e.lastState; this.astStack = e.astStack; this.message = e.message; Object.defineProperty(this, 'stack', { value: new Error(e.message).stack, enumerable: false }); } RegexSyntaxError.prototype.toString = function () { return this.name + ' ' + this.type + ':' + this.message; }; var escapeCharMap = { n: "\n", r: "\r", t: "\t", v: "\v", f: "\f" }; // All indices' end will be fixed later by stack[i].indices.push(stack[i+1].indices[0]) // All raw string filled later by node.raw=s.slice(node.indices[0],node.indices[1]) // All nodes are unshift to stack, so they're reverse order. var actions = function _() { function exact(stack, c, i) { //any literal string. // ExactNode.chars will be filled later (than raw) // Escape actions and repeat actions will fill node.chars // node.chars = node.chars || node.raw var last = stack[0]; if (!last || last.type != EXACT_NODE || last.repeat || last.chars && !last.concatTemp) { stack.unshift({ type: EXACT_NODE, indices: [i] }); } if (last && last.concatTemp) { last.chars += c; } } function dot(stack, c, i) { // /./ stack.unshift({ type: DOT_NODE, indices: [i] }); } function nullChar(stack, c, i) { stack.unshift({ type: EXACT_NODE, chars: "\0", indices: [i - 1] }); } function assertBegin(stack, c, i) { // /^/ stack.unshift({ type: ASSERT_NODE, indices: [i], assertionType: AssertBegin }); } function assertEnd(stack, c, i, state, s) { stack.unshift({ type: ASSERT_NODE, indices: [i], assertionType: AssertEnd }); } function assertWordBoundary(stack, c, i) { //\b \B assertion stack.unshift({ type: ASSERT_NODE, indices: [i - 1], assertionType: c == 'b' ? AssertWordBoundary : AssertNonWordBoundary }); } function repeatnStart(stack, c, i) { // /a{/ //Treat repeatn as normal exact node,do transfer in repeatnEnd action. //Because /a{+/ is valid. var last = stack[0]; if (last.type === EXACT_NODE) { return; } else { // '[a-z]{' is valid stack.unshift({ type: EXACT_NODE, indices: [i] }); } } function repeatnComma(stack, c, i) { // /a{n,}/ var last = stack[0]; _set(last, '_commaIndex', i); } function repeatnEnd(stack, c, i, state, s) { // /a{n,m}/ var last = stack[0], charEndIndex = s.lastIndexOf('{', i); var min = parseInt(s.slice(charEndIndex + 1, last._commaIndex || i), 10); var max; if (!last._commaIndex) { // /a{n}/ max = min; } else { if (last._commaIndex + 1 == i) { // /a{n,}/ max = Infinity; } else { max = parseInt(s.slice(last._commaIndex + 1, i), 10); } if (max < min) { throw new RegexSyntaxError({ type: "OutOfOrder", lastState: state, lastIndex: i, astStack: stack, message: "Numbers out of order in {} quantifier!" }); } delete last._commaIndex; } if (last.indices[0] >= charEndIndex) { stack.shift(); } _repeat(stack, min, max, charEndIndex, s); } function repeat0(stack, c, i, state, s) { _repeat(stack, 0, Infinity, i, s); } // e.g. /a*/ function repeat01(stack, c, i, state, s) { _repeat(stack, 0, 1, i, s); } // e.g. /a?/ function repeat1(stack, c, i, state, s) { _repeat(stack, 1, Infinity, i, s); } // e.g. /a+/ function _repeat(stack, min, max, charEndIndex, s) { var last = stack[0], repeat = { min: min, max: max, nonGreedy: false }, charIndex = charEndIndex - 1; if (last.chars && last.chars.length === 1) charIndex = last.indices[0]; if (last.type === EXACT_NODE) { // exact node only repeat last char var a = { type: EXACT_NODE, repeat: repeat, chars: last.chars ? last.chars : s[charIndex], indices: [charIndex] }; if (last.indices[0] === charIndex) stack.shift(); // e.g. /a{n}/ should be only single node stack.unshift(a); } else { last.repeat = repeat; } _set(repeat, 'beginIndex', charEndIndex - stack[0].indices[0]); } function repeatNonGreedy(stack) { stack[0].repeat.nonGreedy = true; } function escapeStart(stack, c, i) { stack.unshift({ concatTemp: true, type: EXACT_NODE, chars: "", indices: [i] }); } function normalEscape(stack, c, i) { if (escapeCharMap.hasOwnProperty(c)) c = escapeCharMap[c]; stack.unshift({ type: EXACT_NODE, chars: c, indices: [i - 1] }); } function charClassEscape(stack, c, i) { stack.unshift({ type: CHARSET_NODE, indices: [i - 1], chars: '', ranges: [], classes: [c], exclude: false }); } function hexEscape(stack, c, i, state, s) { c = String.fromCharCode(parseInt(s[i - 1] + c, 16)); stack.shift(); // remove temp "xN", /\x5/ should match "x5",so there is an exact node with chars "x5" in stack stack.unshift({ type: EXACT_NODE, chars: c, indices: [i - 3] // \xAA length }); } function unicodeEscape(stack, c, i, state, s) { c = String.fromCharCode(parseInt(s.slice(i - 3, i + 1), 16)); stack.shift(); // same as hexEscape, other cases could be emliminate at the end by _filterEmptyExact stack.unshift({ type: EXACT_NODE, chars: c, indices: [i - 5] // \u5409 length }); } function groupStart(stack, c, i) { var counter = stack.groupCounter = stack.groupCounter || { i: 0 }; counter.i++; var group = { type: GROUP_NODE, num: counter.i, sub: [], indices: [i], _parentStack: stack // Used to restore current stack when group end,viz. encounters ")" }; stack = group.sub; _set(stack, '_parentGroup', group); stack.groupCounter = counter; //keep groupCounter persist and ref modifiable return stack; } function groupNonCapture(stack) { // /(?:)/ var group = stack._parentGroup; group.nonCapture = true; group.num = undefined; stack.groupCounter.i--; } function groupToAssertion(stack, c, i) { // Convert /(?!)/,/(?=)/ to AssertNode var group = stack._parentGroup; group.type = ASSERT_NODE; group.assertionType = c == '=' ? AssertLookahead : AssertNegativeLookahead; // Caveat!!! Assertion group no need to capture group.num = undefined; stack.groupCounter.i--; } function groupEnd(stack, c, i, state, s) { stack = endChoice(stack); // restore group's stack from choice var group = stack._parentGroup; if (!group) { throw new RegexSyntaxError({ type: 'UnexpectedChar', lastIndex: i, lastState: state, astStack: stack, message: "Unexpected end parenthesis!" }); } delete stack._parentGroup; // Be generous,I don't care sparse object performance. delete stack.groupCounter; // clean stack = group._parentStack; // restore stack delete group._parentStack; stack.unshift(group); group.endParenIndex = i; return stack; } function choice(stack, c, i) { // encounters "|" //replace current stack with choices new branch stack var newStack = [], choice; if (stack._parentChoice) { choice = stack._parentChoice; choice.branches.unshift(newStack); _set(newStack, '_parentChoice', choice); _set(newStack, '_parentGroup', choice); newStack.groupCounter = stack.groupCounter; // keep track delete stack._parentChoice; delete stack.groupCounter; // This stack is in choice.branches,so clean it } else { // "/(a|)/" ,create new ChoiceNode var first = stack[stack.length - 1]; // Because of stack is reverse order choice = { type: CHOICE_NODE, indices: [first ? first.indices[0] : i - 1], branches: [] }; _set(choice, '_parentStack', stack); choice.branches.unshift(stack.slice()); // contents before "|" stack.length = 0; /* e.g. "/(a|b)/" is { type:'group',sub:[ {type:'choice',branches:[ [{type:'exact',chars:'a'}], [{type:'exact',chars:'b'}] ]}]}*/ stack.unshift(choice); // must not clean groupCounter newStack.groupCounter = stack.groupCounter; _set(newStack, '_parentChoice', choice); _set(newStack, '_parentGroup', choice); choice.branches.unshift(newStack); } return newStack; } //if current stack is a choice's branch,return the original parent stack function endChoice(stack) { if (stack._parentChoice) { var choice = stack._parentChoice; delete stack._parentChoice; delete stack._parentGroup; delete stack.groupCounter; var parentStack = choice._parentStack; delete choice._parentStack; return parentStack; } return stack; } function charsetStart(stack, c, i) { stack.unshift({ type: CHARSET_NODE, indices: [i], classes: [], ranges: [], chars: '' }); } function charsetExclude(stack) { stack[0].exclude = true; } function charsetContent(stack, c, i) { stack[0].chars += c; } function charsetNormalEscape(stack, c, i) { if (escapeCharMap.hasOwnProperty(c)) c = escapeCharMap[c]; stack[0].chars += c; } function charsetNullChar(stack, c, i) { stack[0].chars += "\0"; } function charsetClassEscape(stack, c) { stack[0].classes.push(c); } function charsetHexEscape(stack, c, i, state, s) { var last = stack[0]; c = String.fromCharCode(parseInt(last.chars.slice(-1) + c, 16)); last.chars = last.chars.slice(0, -2); // also remove "xA" last.chars += c; } function charsetUnicodeEscape(stack, c, i, state, s) { var last = stack[0]; c = String.fromCharCode(parseInt(last.chars.slice(-3) + c, 16)); last.chars = last.chars.slice(0, -4); //remove "uABC" last.chars += c; } function charsetRangeEnd(stack, c, i, state, s) { var charset = stack[0]; var range = charset.chars.slice(-2); range = [range[0], c]; range.lastIndex = i; charset.ranges.push(range); charset.chars = charset.chars.slice(0, -2); } function charsetRangeEndNormalEscape(stack, c) { if (escapeCharMap.hasOwnProperty(c)) c = escapeCharMap[c]; charsetRangeEnd.apply(this, arguments); } // [\x30-\x78] first repr as {ranges:['\x30','x']} // [\u0000-\u4567] first repr as {ranges:['\0','u']} // If escape sequences are valid then replace range end with corrent char // stack[0].chars did not contain 'u' or 'x' function charsetRangeEndUnicodeEscape(stack, c, i) { var charset = stack[0]; var code = charset.chars.slice(-3) + c; charset.chars = charset.chars.slice(0, -3); // So just remove previous three,no 'u' var range = charset.ranges.pop(); c = String.fromCharCode(parseInt(code, 16)); range = [range[0], c]; range.lastIndex = i; charset.ranges.push(range); } function charsetRangeEndHexEscape(stack, c, i) { var charset = stack[0]; var code = charset.chars.slice(-1) + c; charset.chars = charset.chars.slice(0, -1); // last.chars does'nt contain 'x' var range = charset.ranges.pop(); c = String.fromCharCode(parseInt(code, 16)); range = [range[0], c]; range.lastIndex = i; charset.ranges.push(range); } /* Caveat!!! See:https://developer.mozilla.org/en/docs/Web/JavaScript/Reference/Global_Objects/RegExp \0 Matches a NUL character. Do not follow this with another digit. ECMA-262 Standard: 15.10.2.11 DecimalEscape NOTE If \ is followed by a decimal number n whose first digit is not 0, then the escape sequence is considered to be a backreference. It is an error if n is greater than the total number of left capturing parentheses in the entire regular expression. \0 represents the <NUL> character and cannot be followed by a decimal digit. But in both Chrome and Firefox, /\077/ matches "\077",e.g. String.fromCharCode(parseInt("77",8)) /(g)\1/ matches "gg",it's OK. But /(g)\14/ matches "g\14","\14" is String.fromCharCode(parseInt("14",8)) And /(g)\1456/ matches "g\145"+"6",/(g)\19/ matches "g\1"+"9". Who knows WTF? Considering that ECMAScript StrictMode did not support OctEscape, I'm not going to implement OctEscape. I will make it conform the Standard.(Also keep code simple) */ function backref(stack, c, i, state) { var last = stack[0], n = parseInt(c, 10), isFirstNum = state === 'escape', counter = stack.groupCounter, cn = counter && counter.i || 0; if (!isFirstNum) { //previous node must be backref node n = parseInt(last.num + "" + n, 10); } else { last = { type: BACKREF_NODE, indices: [i - 1] }; stack.unshift(last); } if (n > cn) { throw new RegexSyntaxError({ type: 'InvalidBackReference', lastIndex: i, astStack: stack, lastState: state, message: 'Back reference number(' + n + ') greater than current groups count(' + cn + ').' }); } else if (_isRecursive(n, stack)) { throw new RegexSyntaxError({ type: 'InvalidBackReference', lastIndex: i, astStack: stack, lastState: state, message: 'Recursive back reference in group (' + n + ') itself.' }); } last.num = n; function _isRecursive(n, stack) { if (!stack._parentGroup) return false; if (stack._parentGroup.num == n) return n; return _isRecursive(n, stack._parentGroup._parentStack); } } //console.log(K.locals(_)); return { escapeStart: escapeStart, exact: exact, dot: dot, nullChar: nullChar, assertBegin: assertBegin, assertEnd: assertEnd, assertWordBoundary: assertWordBoundary, repeatnStart: repeatnStart, repeatnComma: repeatnComma, repeatNonGreedy: repeatNonGreedy, repeatnEnd: repeatnEnd, repeat1: repeat1, repeat01: repeat01, repeat0: repeat0, charClassEscape: charClassEscape, normalEscape: normalEscape, unicodeEscape: unicodeEscape, hexEscape: hexEscape, charClassEscape: charClassEscape, groupStart: groupStart, groupNonCapture: groupNonCapture, backref: backref, groupToAssertion: groupToAssertion, groupEnd: groupEnd, choice: choice, endChoice: endChoice, charsetStart: charsetStart, charsetExclude: charsetExclude, charsetContent: charsetContent, charsetNullChar: charsetNullChar, charsetClassEscape: charsetClassEscape, charsetHexEscape: charsetHexEscape, charsetUnicodeEscape: charsetUnicodeEscape, charsetRangeEnd: charsetRangeEnd, charsetNormalEscape: charsetNormalEscape, charsetRangeEndNormalEscape: charsetRangeEndNormalEscape, charsetRangeEndUnicodeEscape: charsetRangeEndUnicodeEscape, charsetRangeEndHexEscape: charsetRangeEndHexEscape }; }(); var digit = '0-9'; var hexDigit = '0-9a-fA-F'; //EX,It is an exclusive charset var exactEXCharset = '^+*?^$.|(){[\\'; var charClassEscape = 'dDwWsS'; var unicodeEscape = 'u'; var hexEscape = 'x'; //var octDigit='0-7'; //var octEscape='0-7'; Never TODO. JavaScript doesn't support string OctEscape in strict mode. // In charset,\b\B means "\b","\B",not word boundary // NULL Escape followed digit should throw error var normalEscapeInCharsetEX = '^' + charClassEscape + unicodeEscape + hexEscape + '0-9'; // 'rntvf\\' escape ,others return raw // Also need exclude \b\B assertion and backref var normalEscapeEX = normalEscapeInCharsetEX + 'bB1-9'; //var controlEscape;//Never TODO.Same reason as OctEscape. var repeatnStates = 'repeatnStart,repeatn_1,repeatn_2,repeatnErrorStart,repeatnError_1,repeatnError_2'; var hexEscapeStates = 'hexEscape1,hexEscape2'; var unicodeEscapeStates = 'unicodeEscape1,unicodeEscape2,unicodeEscape3,unicodeEscape4'; var allHexEscapeStates = hexEscapeStates + ',' + unicodeEscapeStates; var charsetIncompleteEscapeStates = 'charsetUnicodeEscape1,charsetUnicodeEscape2