regulex
Version:
JavaScript Regular Expression Parser and Visualizer.
986 lines (886 loc) • 37 kB
JavaScript
if (typeof define !== 'function') var define = require('amdefine')(module);
define(['./NFA','./Kit'],function (NFA,K) {
/**
Parse Regex to AST
parse:Function(re:String)
parse.Constants
parse.exportConstants:Function
*/
var Constants={
//Node Type Constants
EXACT_NODE:"exact",
CHARSET_NODE:"charset",
CHOICE_NODE:"choice",
GROUP_NODE:"group",
ASSERT_NODE:"assert",
DOT_NODE:"dot",
BACKREF_NODE:"backref",
EMPTY_NODE:"empty",
//Assertion Type Constants
AssertLookahead:"AssertLookahead",
AssertNegativeLookahead:"AssertNegativeLookahead",
AssertNonWordBoundary:"AssertNonWordBoundary",
AssertWordBoundary:"AssertWordBoundary",
AssertEnd:"AssertEnd",
AssertBegin:"AssertBegin"
};
/**
AST:
Node = { // Base Node interface
type:NodeType, // Node type string
raw:String, // Raw regex string
repeat:{
min:Int,max:Int, // Repeat times. [min,max] means "{min,max}".
// Set max=Infinity forms a "{min,}" range
// Set max=undefined forms a "{min}" range
nonGreedy:Boolean // If this repeat is non-greedy,viz. had a "?" quantifier
},
indices:[Int,Int] // Raw string in original regex index range [start,end)
// You can use regexStr.slice(start,end) to retrieve node.raw string
}
NodeType = exact|dot|charset|choice|empty|group|assert|backref
ExactNode = { // Literal match chars string
type:"exact",
chars:"c",
raw:"c{1,2}" // When repeat or escape,raw will diff from chars
}
DotNode = {type:"dot"} //viz. "." , dot match any char but newline "\n\r"
// Because of IgnoreCase flag,
// The client code need to compute disjoint ranges itself.
CharsetNode = {
type:"charset",
exclude:Boolean, // True only if it is "[^abc]" form
classes:[Char], // Named character classes. e.g. [\d].
// All names: d(Digit),D(Non-digit),w,W,s,S
chars:String, // Literal chars. e.g. [abc] repr as 'abc'
ranges:[Range] // Range: a-z repr as 'az'
}
ChoiceNode = {
type:"choice",
branches:[[Node]] // Choice more branches,e.g. /a|b|c/
}
EmptyNode = { // This node will match any input,include empty string
type:"empty" //new RegExp("") will give an empty node. /a|/ will give branches with an empty node
}
GroupNode = {
type:"group",
nonCapture:false, // true means:"(?:abc)",default is false
num:Int, // If capture is true.It is group's int index(>=1).
endParenIndex:Int, // /(a)+/ will generate only one node,so indices is [0,4],endParenIndex is 3
sub:[Node] // Sub pattern nodes
}
AssertNode = {
type:"assert",
assertionType:String, //See Assertion Type Constants
sub:[Node] //Optional,\b \B ^ $ Assertion this property is empty
}
Only AssertLookahead,AssertNegativeLookahead has `sub` property
"(?=(abc))" repr as {
type:"assert", assertionType:AssertLookahead,
sub:[{
type:"group",
sub:[{type:"exact",raw:"abc"}]
}]
}
BackrefNode = {
type:"backref",
num:Int // Back references index.Correspond to group.num
}
*/
function exportConstants() {
var code=Object.keys(Constants).map(function (k) {
return k+"="+JSON.stringify(Constants[k]);
}).join(";");
var Global=(function () {
return this;
})();
Global.eval(code);
}
exportConstants();
function AST(a) {
this.raw=a.raw;
this.tree=a.tree;
this.groupCount=a.groupCount;
}
/**
@param {Function} f Visitor function accept node as one argument.
@param {String} nodeType Give the node type you want to visit,or omitted to visit all
*/
AST.prototype.traverse=function (f,nodeType) {
travel(this.tree,f);
function travel(stack,f) {
stack.forEach(function (node) {
if (!nodeType || node.type===nodeType) f(node);
if (node.sub) travel(node.sub,f);
else if (node.branches) node.branches.forEach(function (b) {travel(b,f)});
});
}
};
var G_DEBUG;
/**
@param {String} re input regex as string
@param {Object} [options]
@option {Boolean} options.debug If enable debug log
@option {Boolean} options.strict If enable strict mode
@return {Object}
{
raw:String, // original re
groupCount:Int, //Total group count
tree:Array // AST Tree Stack
}
*/
function parse(re,_debug) {
G_DEBUG=_debug;
var parser=getNFAParser();
var ret,stack,lastState;
ret=parser.input(re,0,_debug);
stack=ret.stack;
stack=actions.endChoice(stack); // e.g. /a|b/
lastState=ret.lastState;
var valid=ret.acceptable && ret.lastIndex===re.length-1;//just syntax valid regex
if (!valid) {
var error;
switch (lastState) {
case 'charsetRangeEndWithNullChar':
error={
type:'CharsetRangeEndWithNullChar',
message:"Charset range end with NUL char does not make sense!\n"+
"Because [a-\\0] is not a valid range.\n"+
"And [\\0-\\0] should be rewritten into [\\0].",
};
break;
case 'repeatErrorFinal':
error={
type:'NothingRepeat',
message:"Nothing to repeat!"
};
break;
case 'digitFollowNullError':
error={
type:'DigitFollowNullError',
message:"The '\\0' represents the <NUL> char and cannot be followed by a decimal digit!"
};
break;
case 'charsetRangeEndClass':
error={
type:'CharsetRangeEndClass',
message:'Charset range ends with class such as "\\w\\W\\d\\D\\s\\S" is invalid!'
};
break;
case 'charsetOctEscape':
error={
type:'DecimalEscape',
message:'Decimal escape appears in charset is invalid.Because it can\'t be explained as backreference.And octal escape is deprecated!'
};
break;
default:
if (lastState.indexOf('charset')===0) {
error={
type:'UnclosedCharset',
message:'Unterminated character class!'
};
} else if (re[ret.lastIndex]===')') {
error={
type:'UnmatchedParen',
message:'Unmatched end parenthesis!'
};
} else {
error={
type:'UnexpectedChar',
message:'Unexpected char!'
}
ret.lastIndex++;
}
}
if (error) {
error.lastIndex=ret.lastIndex;
error.astStack=ret.stack;
error.lastState=lastState;
throw new RegexSyntaxError(error);
}
}
if (stack._parentGroup) {
throw new RegexSyntaxError({
type:"UnterminatedGroup",
message:"Unterminated group!",
lastIndex:stack._parentGroup.indices[0],
lastState:lastState,
astStack:stack
});
}
if (valid) {
var groupCount=stack.groupCounter?stack.groupCounter.i:0;
delete stack.groupCounter;
_fixNodes(stack,re,re.length);
stack = _filterEmptyExact(stack);
var ast=new AST({
raw:re,
groupCount:groupCount,
tree:stack
});
// Check charset ranges out of order error.(Because of charsetRangeEndEscape)
ast.traverse(_checkCharsetRange,CHARSET_NODE);
// Check any repeats after assertion. e.g. /a(?=b)+/ doesn't make sense.
ast.traverse(_checkRepeat,ASSERT_NODE);
_coalesceExactNode(stack);
G_DEBUG=false;
return ast;
}
}
parse.Constants=Constants;
parse.exportConstants=exportConstants;
parse.RegexSyntaxError=RegexSyntaxError;
parse.getNFAParser=getNFAParser;
var _NFAParser;
function getNFAParser() {
if (!_NFAParser) {
_NFAParser=NFA(config,G_DEBUG);
}
return _NFAParser;
}
function _set(obj,prop,value) {
Object.defineProperty(obj,prop,{
value:value,enumerable:G_DEBUG,writable:true,configurable:true
});
}
function _filterEmptyExact(stack) {
return stack.filter(function (node) {
if (node.type == EXACT_NODE && node.concatTemp) {
delete node.concatTemp;
return !!node.chars;
} else if (node.sub) {
node.sub = _filterEmptyExact(node.sub);
} else if (node.branches) {
node.branches = node.branches.map(_filterEmptyExact);
}
return true;
});
}
function _coalesceExactNode(stack) {
var prev=stack[0];
down(prev);
for (var i=1,j=1,l=stack.length,node;i<l;i++) {
node=stack[i];
if (node.type===EXACT_NODE) {
if (prev.type===EXACT_NODE && !prev.repeat && !node.repeat) {
prev.indices[1]=node.indices[1];
prev.raw+=node.raw;
prev.chars+=node.chars;
continue;
}
} else {
down(node);
}
stack[j++]=node;
prev=node;
}
if (prev) stack.length=j;
function down(node) {
if (node.sub) {
_coalesceExactNode(node.sub);
} else if (node.branches) {
node.branches.map(_coalesceExactNode);
}
}
}
function _fixNodes(stack,re,endIndex) {
if (!stack.length) {
stack.push({type:EMPTY_NODE,indices:[endIndex,endIndex]});
return;
}
stack.reduce(function (endIndex,node) {
node.indices.push(endIndex);
node.raw=re.slice(node.indices[0],endIndex);
if (node.type===GROUP_NODE || (node.type===ASSERT_NODE && node.sub)) {
_fixNodes(node.sub,re,node.endParenIndex);
} else if (node.type===CHOICE_NODE) {
node.branches.reduce(function (endIndex,branch) {
_fixNodes(branch,re,endIndex);
var head=branch[0]; // Reversed,so branch[0] is head.Dammit mystic code
return (head?head.indices[0]:endIndex)-1; // skip '|'
},endIndex);
node.branches.reverse();
} else if (node.type===EXACT_NODE) {
if (!node.concatTemp) {
node.chars = node.chars || node.raw;
}
}
return node.indices[0];
},endIndex);
stack.reverse();
}
function _checkRepeat(node) {
if (node.repeat) {
var astype = node.assertionType;
var msg = 'Nothing to repeat! Repeat after assertion doesn\'t make sense!';
if (astype === 'AssertLookahead' || astype === 'AssertNegativeLookahead' ) {
var assertifier = astype === 'AssertLookahead' ? '?=' : '?!';
var pattern = '('+assertifier+'b)';
msg += '\n/a'+pattern+'+/、/a'+pattern+'{1,n}/ are the same as /a'+pattern+'/。\n' +
'/a'+pattern+'*/、/a'+pattern+'{0,n}/、/a'+pattern+'?/ are the same as /a/。';
}
throw new RegexSyntaxError({
type:'NothingRepeat',
lastIndex:node.indices[1]-1,
message: msg
})
}
}
//check charset ranges out of order error.(Because of charsetRangeEndEscape)
// [z-\u54] had to defer check
function _checkCharsetRange(node) {
node.ranges=K.sortUnique(node.ranges.map(function (range) {
if (range[0]>range[1]) {
throw new RegexSyntaxError({
type:"OutOfOrder",
lastIndex:range.lastIndex,
message:"Range ["+range.join('-')+"] out of order in character class!"
});
}
return range.join('');
}));
}
function RegexSyntaxError(e) {
this.name="RegexSyntaxError";
this.type=e.type;
this.lastIndex=e.lastIndex;
this.lastState=e.lastState;
this.astStack=e.astStack;
this.message=e.message;
Object.defineProperty(this,'stack',{
value:new Error(e.message).stack,enumerable:false
});
}
RegexSyntaxError.prototype.toString=function () {
return this.name+' '+this.type+':'+this.message;
};
var escapeCharMap={n:"\n",r:"\r",t:"\t",v:"\v",f:"\f"};
// All indices' end will be fixed later by stack[i].indices.push(stack[i+1].indices[0])
// All raw string filled later by node.raw=s.slice(node.indices[0],node.indices[1])
// All nodes are unshift to stack, so they're reverse order.
var actions=(function _() {
function exact(stack,c,i) { //any literal string.
// ExactNode.chars will be filled later (than raw)
// Escape actions and repeat actions will fill node.chars
// node.chars = node.chars || node.raw
var last=stack[0];
if (!last || last.type!=EXACT_NODE || last.repeat || (last.chars && !last.concatTemp)) {
stack.unshift({type:EXACT_NODE, indices:[i]});
}
if (last && last.concatTemp) {
last.chars += c;
}
}
function dot(stack,c,i) { // /./
stack.unshift({type:DOT_NODE,indices:[i]});
}
function nullChar(stack,c,i) {
stack.unshift({
type:EXACT_NODE, chars:"\0",
indices:[i-1]
});
}
function assertBegin(stack,c,i) { // /^/
stack.unshift({
type:ASSERT_NODE,
indices:[i],
assertionType:AssertBegin
});
}
function assertEnd(stack,c,i,state,s) {
stack.unshift({
type:ASSERT_NODE,
indices:[i],
assertionType:AssertEnd
});
}
function assertWordBoundary(stack,c,i) {//\b \B assertion
stack.unshift({
type:ASSERT_NODE,
indices:[i-1],
assertionType: c=='b'?AssertWordBoundary:AssertNonWordBoundary
});
}
function repeatnStart(stack,c,i) { // /a{/
//Treat repeatn as normal exact node,do transfer in repeatnEnd action.
//Because /a{+/ is valid.
var last=stack[0];
if (last.type===EXACT_NODE) {
return;
} else { // '[a-z]{' is valid
stack.unshift({type:EXACT_NODE,indices:[i]});
}
}
function repeatnComma(stack,c,i) { // /a{n,}/
var last=stack[0];
_set(last,'_commaIndex',i);
}
function repeatnEnd(stack,c,i,state,s) { // /a{n,m}/
var last=stack[0],charEndIndex=s.lastIndexOf('{',i);
var min=parseInt(s.slice(charEndIndex+1,last._commaIndex || i),10);
var max;
if (!last._commaIndex) { // /a{n}/
max=min;
} else {
if (last._commaIndex+1==i) { // /a{n,}/
max=Infinity;
} else {
max=parseInt(s.slice(last._commaIndex+1,i),10);
}
if (max < min) {
throw new RegexSyntaxError({
type:"OutOfOrder",lastState:state,
lastIndex:i,astStack:stack,
message:"Numbers out of order in {} quantifier!"
});
}
delete last._commaIndex;
}
if (last.indices[0]>=charEndIndex) {
stack.shift();
}
_repeat(stack,min,max,charEndIndex,s);
}
function repeat0(stack,c,i,state,s) { _repeat(stack,0,Infinity,i,s) } // e.g. /a*/
function repeat01(stack,c,i,state,s) { _repeat(stack,0,1,i,s) } // e.g. /a?/
function repeat1(stack,c,i,state,s) { _repeat(stack,1,Infinity,i,s) } // e.g. /a+/
function _repeat(stack,min,max,charEndIndex,s) {
var last=stack[0],repeat={min:min,max:max,nonGreedy:false},
charIndex=charEndIndex-1;
if (last.chars && last.chars.length===1) charIndex=last.indices[0];
if (last.type===EXACT_NODE) { // exact node only repeat last char
var a={
type:EXACT_NODE,
repeat:repeat,chars:last.chars?last.chars:s[charIndex],
indices:[charIndex]
};
if (last.indices[0]===charIndex) stack.shift(); // e.g. /a{n}/ should be only single node
stack.unshift(a);
} else {
last.repeat=repeat;
}
_set(repeat,'beginIndex',charEndIndex-stack[0].indices[0]);
}
function repeatNonGreedy(stack) { stack[0].repeat.nonGreedy=true}
function escapeStart(stack,c,i) {
stack.unshift({
concatTemp:true,
type:EXACT_NODE,chars:"",indices:[i]
});
}
function normalEscape(stack,c,i) {
if (escapeCharMap.hasOwnProperty(c)) c=escapeCharMap[c];
stack.unshift({
type:EXACT_NODE,chars:c,indices:[i-1]
});
}
function charClassEscape(stack,c,i) {
stack.unshift({
type:CHARSET_NODE,indices:[i-1],chars:'',ranges:[],
classes:[c],exclude:false
});
}
function hexEscape(stack,c,i,state,s) {
c=String.fromCharCode(parseInt(s[i-1]+c,16));
stack.shift(); // remove temp "xN", /\x5/ should match "x5",so there is an exact node with chars "x5" in stack
stack.unshift({
type:EXACT_NODE, chars:c,
indices:[i-3] // \xAA length
});
}
function unicodeEscape(stack,c,i,state,s) {
c=String.fromCharCode(parseInt(s.slice(i-3,i+1),16));
stack.shift(); // same as hexEscape, other cases could be emliminate at the end by _filterEmptyExact
stack.unshift({
type:EXACT_NODE, chars:c,
indices:[i-5] // \u5409 length
});
}
function groupStart(stack,c,i) {
var counter=(stack.groupCounter=(stack.groupCounter || {i:0}));
counter.i++;
var group={
type:GROUP_NODE,
num: counter.i,
sub:[], indices:[i],
_parentStack:stack // Used to restore current stack when group end,viz. encounters ")"
};
stack=group.sub;
_set(stack,'_parentGroup',group);
stack.groupCounter=counter; //keep groupCounter persist and ref modifiable
return stack;
}
function groupNonCapture(stack) { // /(?:)/
var group=stack._parentGroup
group.nonCapture=true;
group.num=undefined;
stack.groupCounter.i--;
}
function groupToAssertion(stack,c,i) { // Convert /(?!)/,/(?=)/ to AssertNode
var group=stack._parentGroup;
group.type=ASSERT_NODE;
group.assertionType= c=='=' ? AssertLookahead : AssertNegativeLookahead ;
// Caveat!!! Assertion group no need to capture
group.num=undefined;
stack.groupCounter.i--;
}
function groupEnd(stack,c,i,state,s) {
stack=endChoice(stack); // restore group's stack from choice
var group=stack._parentGroup;
if (!group) {
throw new RegexSyntaxError({
type:'UnexpectedChar',
lastIndex:i,
lastState:state,
astStack:stack,
message:"Unexpected end parenthesis!"
});
}
delete stack._parentGroup; // Be generous,I don't care sparse object performance.
delete stack.groupCounter; // clean
stack=group._parentStack; // restore stack
delete group._parentStack;
stack.unshift(group);
group.endParenIndex=i;
return stack;
}
function choice(stack,c,i) { // encounters "|"
//replace current stack with choices new branch stack
var newStack=[],choice;
if (stack._parentChoice) {
choice=stack._parentChoice;
choice.branches.unshift(newStack);
_set(newStack,'_parentChoice',choice);
_set(newStack,'_parentGroup',choice);
newStack.groupCounter=stack.groupCounter; // keep track
delete stack._parentChoice;
delete stack.groupCounter; // This stack is in choice.branches,so clean it
} else { // "/(a|)/" ,create new ChoiceNode
var first=stack[stack.length-1]; // Because of stack is reverse order
choice={
type:CHOICE_NODE,indices:[(first?first.indices[0]:i-1)],
branches:[]
};
_set(choice,'_parentStack',stack);
choice.branches.unshift(stack.slice()); // contents before "|"
stack.length=0;
/* e.g. "/(a|b)/" is {
type:'group',sub:[
{type:'choice',branches:[
[{type:'exact',chars:'a'}],
[{type:'exact',chars:'b'}]
]}]}*/
stack.unshift(choice); // must not clean groupCounter
newStack.groupCounter=stack.groupCounter;
_set(newStack,'_parentChoice',choice);
_set(newStack,'_parentGroup',choice);
choice.branches.unshift(newStack);
}
return newStack;
}
//if current stack is a choice's branch,return the original parent stack
function endChoice(stack) {
if (stack._parentChoice) {
var choice=stack._parentChoice;
delete stack._parentChoice;
delete stack._parentGroup;
delete stack.groupCounter;
var parentStack=choice._parentStack;
delete choice._parentStack;
return parentStack;
}
return stack;
}
function charsetStart(stack,c,i) {
stack.unshift({
type:CHARSET_NODE,indices:[i],
classes:[],ranges:[],chars:''
});
}
function charsetExclude(stack) {stack[0].exclude=true}
function charsetContent(stack,c,i) {stack[0].chars+=c}
function charsetNormalEscape(stack,c,i) {
if (escapeCharMap.hasOwnProperty(c)) c=escapeCharMap[c];
stack[0].chars+=c;
}
function charsetNullChar(stack,c,i) {
stack[0].chars+="\0";
}
function charsetClassEscape(stack,c) {
stack[0].classes.push(c);
}
function charsetHexEscape(stack,c,i,state,s) {
var last=stack[0];
c=String.fromCharCode(parseInt(last.chars.slice(-1)+c,16));
last.chars=last.chars.slice(0,-2); // also remove "xA"
last.chars+=c;
}
function charsetUnicodeEscape(stack,c,i,state,s) {
var last=stack[0];
c=String.fromCharCode(parseInt(last.chars.slice(-3)+c,16));
last.chars=last.chars.slice(0,-4); //remove "uABC"
last.chars+=c;
}
function charsetRangeEnd(stack,c,i,state,s) {
var charset=stack[0];
var range=charset.chars.slice(-2);
range=[range[0],c];
range.lastIndex=i;
charset.ranges.push(range);
charset.chars=charset.chars.slice(0,-2);
}
function charsetRangeEndNormalEscape(stack,c) {
if (escapeCharMap.hasOwnProperty(c)) c=escapeCharMap[c];
charsetRangeEnd.apply(this,arguments);
}
// [\x30-\x78] first repr as {ranges:['\x30','x']}
// [\u0000-\u4567] first repr as {ranges:['\0','u']}
// If escape sequences are valid then replace range end with corrent char
// stack[0].chars did not contain 'u' or 'x'
function charsetRangeEndUnicodeEscape(stack,c,i) {
var charset=stack[0];
var code=charset.chars.slice(-3)+c;
charset.chars=charset.chars.slice(0,-3); // So just remove previous three,no 'u'
var range=charset.ranges.pop();
c=String.fromCharCode(parseInt(code,16));
range=[range[0],c];
range.lastIndex=i;
charset.ranges.push(range);
}
function charsetRangeEndHexEscape(stack,c,i) {
var charset=stack[0];
var code=charset.chars.slice(-1)+c;
charset.chars=charset.chars.slice(0,-1); // last.chars does'nt contain 'x'
var range=charset.ranges.pop();
c=String.fromCharCode(parseInt(code,16));
range=[range[0],c];
range.lastIndex=i;
charset.ranges.push(range);
}
/* Caveat!!!
See:https://developer.mozilla.org/en/docs/Web/JavaScript/Reference/Global_Objects/RegExp
\0 Matches a NUL character. Do not follow this with another digit.
ECMA-262 Standard: 15.10.2.11 DecimalEscape
NOTE
If \ is followed by a decimal number n whose first digit is not 0, then the escape sequence is considered to be
a backreference. It is an error if n is greater than the total number of left capturing parentheses in the entire regular
expression. \0 represents the <NUL> character and cannot be followed by a decimal digit.
But in both Chrome and Firefox, /\077/ matches "\077",e.g. String.fromCharCode(parseInt("77",8))
/(g)\1/ matches "gg",it's OK.
But /(g)\14/ matches "g\14","\14" is String.fromCharCode(parseInt("14",8))
And /(g)\1456/ matches "g\145"+"6",/(g)\19/ matches "g\1"+"9". Who knows WTF?
Considering that ECMAScript StrictMode did not support OctEscape,
I'm not going to implement OctEscape.
I will make it conform the Standard.(Also keep code simple)
*/
function backref(stack,c,i,state) {
var last=stack[0],n=parseInt(c,10),
isFirstNum=state==='escape',
counter=stack.groupCounter,
cn=(counter && counter.i) || 0;
if (!isFirstNum) { //previous node must be backref node
n=parseInt(last.num+""+n,10);
} else {
last={type:BACKREF_NODE,indices:[i-1]};
stack.unshift(last);
}
if (n>cn) {
throw new RegexSyntaxError({
type:'InvalidBackReference',lastIndex:i,astStack:stack,lastState:state,
message:'Back reference number('+n+') greater than current groups count('+cn+').'
});
} else if (_isRecursive(n,stack)) {
throw new RegexSyntaxError({
type:'InvalidBackReference',lastIndex:i,astStack:stack,lastState:state,
message:'Recursive back reference in group ('+n+') itself.'
});
}
last.num=n;
function _isRecursive(n,stack) {
if (!stack._parentGroup) return false;
if (stack._parentGroup.num==n) return n;
return _isRecursive(n,stack._parentGroup._parentStack);
}
}
//console.log(K.locals(_));
return {
escapeStart:escapeStart,
exact:exact,dot:dot,nullChar:nullChar,assertBegin:assertBegin,
assertEnd:assertEnd,assertWordBoundary:assertWordBoundary,
repeatnStart:repeatnStart,repeatnComma:repeatnComma,repeatNonGreedy:repeatNonGreedy,
repeatnEnd:repeatnEnd,repeat1:repeat1,repeat01:repeat01,repeat0:repeat0,
charClassEscape:charClassEscape,normalEscape:normalEscape,
unicodeEscape:unicodeEscape,hexEscape:hexEscape,charClassEscape:charClassEscape,
groupStart:groupStart,groupNonCapture:groupNonCapture,backref:backref,
groupToAssertion:groupToAssertion,groupEnd:groupEnd,
choice:choice,endChoice:endChoice,
charsetStart:charsetStart,charsetExclude:charsetExclude,
charsetContent:charsetContent,charsetNullChar:charsetNullChar,
charsetClassEscape:charsetClassEscape,
charsetHexEscape:charsetHexEscape,
charsetUnicodeEscape:charsetUnicodeEscape,
charsetRangeEnd:charsetRangeEnd,charsetNormalEscape:charsetNormalEscape,
charsetRangeEndNormalEscape:charsetRangeEndNormalEscape,
charsetRangeEndUnicodeEscape:charsetRangeEndUnicodeEscape,
charsetRangeEndHexEscape:charsetRangeEndHexEscape
};
})();
var digit='0-9';
var hexDigit='0-9a-fA-F';
//EX,It is an exclusive charset
var exactEXCharset='^+*?^$.|(){[\\';
var charClassEscape='dDwWsS';
var unicodeEscape='u';
var hexEscape='x';
//var octDigit='0-7';
//var octEscape='0-7'; Never TODO. JavaScript doesn't support string OctEscape in strict mode.
// In charset,\b\B means "\b","\B",not word boundary
// NULL Escape followed digit should throw error
var normalEscapeInCharsetEX='^'+charClassEscape+unicodeEscape+hexEscape+'0-9';
// 'rntvf\\' escape ,others return raw
// Also need exclude \b\B assertion and backref
var normalEscapeEX=normalEscapeInCharsetEX+'bB1-9';
//var controlEscape;//Never TODO.Same reason as OctEscape.
var repeatnStates='repeatnStart,repeatn_1,repeatn_2,repeatnErrorStart,repeatnError_1,repeatnError_2';
var hexEscapeStates='hexEscape1,hexEscape2';
var unicodeEscapeStates='unicodeEscape1,unicodeEscape2,unicodeEscape3,unicodeEscape4';
var allHexEscapeStates=hexEscapeStates+','+unicodeEscapeStates;
var charsetIncompleteEscapeStates='charsetUnicodeEscape1,charsetUnicodeEscape2,charsetUnicodeEscape3,charsetUnicodeEscape4,charsetHexEscape1,charsetHexEscape2';
// [a-\u1z] means [a-u1z], [a-\u-z] means [-za-u]
// [a-\u0-9] means [a-u0-9]. WTF!
var charsetRangeEndIncompleteEscapeFirstStates='charsetRangeEndUnicodeEscape1,charsetRangeEndHexEscape1';
var charsetRangeEndIncompleteEscapeRemainStates='charsetRangeEndUnicodeEscape2,charsetRangeEndUnicodeEscape3,charsetRangeEndUnicodeEscape4,charsetRangeEndHexEscape2';
var charsetRangeEndIncompleteEscapeStates=charsetRangeEndIncompleteEscapeFirstStates+','+charsetRangeEndIncompleteEscapeRemainStates;
var config={
compact:true,
accepts:'start,begin,end,repeat0,repeat1,exact,repeatn,repeat01,repeatNonGreedy,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates),
trans:[
['start,begin,end,exact,repeatNonGreedy,repeat0,repeat1,repeat01,groupStart,groupQualifiedStart,choice,repeatn>exact',exactEXCharset,actions.exact],
// e.g. /\u54/ means /u54/
[allHexEscapeStates+'>exact',exactEXCharset+hexDigit,actions.exact],
// e.g. /\0abc/ is exact "\0abc",but /\012/ is an error
['nullChar>exact',exactEXCharset+digit,actions.exact],
//[(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>exact',exactEXCharset+'']
[(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+',start,begin,end,exact,repeatNonGreedy,repeat0,repeat1,repeat01,groupStart,groupQualifiedStart,choice,repeatn>exact','.',actions.dot],
['start,groupStart,groupQualifiedStart,end,begin,exact,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,choice,'+repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates+'>begin','^',actions.assertBegin],
[(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+',exact>repeatnStart','{',actions.repeatnStart],
['start,begin,end,groupQualifiedStart,groupStart,repeat0,repeat1,repeatn,repeat01,repeatNonGreedy,choice>repeatnErrorStart','{',actions.exact],//No repeat,treat as exact char e.g. /{/,/^{/,/a|{/
['repeatnStart>repeatn_1',digit,actions.exact], // Now maybe /a{1/
['repeatn_1>repeatn_1',digit,actions.exact], // Could be /a{11/
['repeatn_1>repeatn_2',',',actions.repeatnComma], // Now maybe /a{1,/
['repeatn_2>repeatn_2',digit,actions.exact], // Now maybe /a{1,3/
['repeatn_1,repeatn_2>repeatn','}',actions.repeatnEnd], //Totally end /a{1,3}/
//Repeat treat as exact chars
['repeatnStart,repeatnErrorStart>exact','}',actions.exact], // e.g. /{}/,/a{}/
//Add exclusion 0-9 and "}", e.g. /a{a/,/a{,/ are valid exact match
['repeatnStart,repeatnErrorStart>exact',exactEXCharset+'0-9}',actions.exact],
// "/{}/" is valid exact match but /{1,2}/ is error repeat.
// So must track it with states repeatnError_1,repeatnError_2
['repeatnErrorStart>repeatnError_1',digit,actions.exact],
['repeatnError_1>repeatnError_1',digit,actions.exact],
['repeatnError_1>repeatnError_2',',',actions.exact],
['repeatnError_2>repeatnError_2',digit,actions.exact],
// repeatErrorFinal is an unacceptable state. Nothing to repeat error should be throwed
['repeatnError_2,repeatnError_1>repeatErrorFinal','}'],
// "/a{2a/" and "/{2a/" are valid exact match
['repeatn_1,repeatnError_1>exact',exactEXCharset+digit+',}',actions.exact],
// "/a{2,a/" and "/{3,a" are valid
['repeatn_2,repeatnError_2>exact',exactEXCharset+digit+'}',actions.exact],
['exact,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>repeat0','*',actions.repeat0],
['exact,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>repeat1','+',actions.repeat1],
['exact,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>repeat01','?',actions.repeat01],
['choice>repeatErrorFinal','*+?'],
['repeat0,repeat1,repeat01,repeatn>repeatNonGreedy','?',actions.repeatNonGreedy],
['repeat0,repeat1,repeat01,repeatn>repeatErrorFinal','+*'],
// Escape
['start,begin,end,groupStart,groupQualifiedStart,exact,repeatNonGreedy,repeat0,repeat1,repeat01,repeatn,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>escape','\\',actions.escapeStart],
['escape>nullChar','0',actions.nullChar],
['nullChar>digitFollowNullError','0-9'], // "/\0123/" is invalid in standard
['escape>exact',normalEscapeEX,actions.normalEscape],
['escape>exact','bB',actions.assertWordBoundary],
['escape>exact',charClassEscape,actions.charClassEscape],
['escape>unicodeEscape1',unicodeEscape,actions.exact],
['unicodeEscape1>unicodeEscape2',hexDigit,actions.exact],
['unicodeEscape2>unicodeEscape3',hexDigit,actions.exact],
['unicodeEscape3>unicodeEscape4',hexDigit,actions.exact],
['unicodeEscape4>exact',hexDigit,actions.unicodeEscape],
['escape>hexEscape1',hexEscape,actions.exact],
['hexEscape1>hexEscape2',hexDigit,actions.exact],
['hexEscape2>exact',hexDigit,actions.hexEscape],
['escape>digitBackref','1-9',actions.backref],
['digitBackref>digitBackref',digit,actions.backref],
['digitBackref>exact',exactEXCharset+digit,actions.exact],
// Group start
['exact,begin,end,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,start,groupStart,groupQualifiedStart,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>groupStart','(',actions.groupStart],
['groupStart>groupQualify','?'],
['groupQualify>groupQualifiedStart',':',actions.groupNonCapture],//group non-capturing
['groupQualify>groupQualifiedStart','=',actions.groupToAssertion],//group positive lookahead
['groupQualify>groupQualifiedStart','!',actions.groupToAssertion],//group negative lookahead
[(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+',groupStart,groupQualifiedStart,begin,end,exact,repeat1,repeat0,repeat01,repeatn,repeatNonGreedy,choice>exact',')',actions.groupEnd],//group end
//choice
['start,begin,end,groupStart,groupQualifiedStart,exact,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>choice','|', actions.choice],
['start,groupStart,groupQualifiedStart,begin,exact,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>end','$',actions.assertEnd],
// Charset [HA-HO]
['exact,begin,end,repeat0,repeat1,repeat01,repeatn,repeatNonGreedy,groupQualifiedStart,groupStart,start,choice,'+(repeatnStates+',nullChar,digitBackref,'+unicodeEscapeStates+','+hexEscapeStates)+'>charsetStart','[',actions.charsetStart],
['charsetStart>charsetExclude','^',actions.charsetExclude],
['charsetStart>charsetContent','^\\]^',actions.charsetContent],
['charsetExclude>charsetContent','^\\]',actions.charsetContent], // "[^^]" is valid
['charsetContent,charsetClass>charsetContent','^\\]-',actions.charsetContent],
['charsetClass>charsetContent','-',actions.charsetContent],
// Charset Escape
[charsetIncompleteEscapeStates+
',charsetStart,charsetContent,charsetNullChar,charsetClass,charsetExclude,charsetRangeEnd>charsetEscape','\\'],
['charsetEscape>charsetContent',normalEscapeInCharsetEX,actions.charsetNormalEscape],
['charsetEscape>charsetNullChar','0',actions.charsetNullChar],
//Didn't allow oct escape
['charsetEscape>charsetOctEscape','1-9'],
['charsetRangeEndEscape>charsetOctEscape','1-9'],
//Treat /[\012]/ as an error
['charsetNullChar>digitFollowNullError',digit],
// Only null char not followed by digit is valid
['charsetNullChar>charsetContent','^0-9\\]-',actions.charsetContent],
// charsetClass state should diff from charsetContent
// Because /[\s-a]/ means /[-a\s]/
['charsetEscape>charsetClass',charClassEscape,actions.charsetClassEscape],
['charsetEscape>charsetUnicodeEscape1',unicodeEscape,actions.charsetContent],
['charsetUnicodeEscape1>charsetUnicodeEscape2',hexDigit,actions.charsetContent],
['charsetUnicodeEscape2>charsetUnicodeEscape3',hexDigit,actions.charsetContent],
['charsetUnicodeEscape3>charsetUnicodeEscape4',hexDigit,actions.charsetContent],
['charsetUnicodeEscape4>charsetContent',hexDigit,actions.charsetUnicodeEscape],
['charsetEscape>charsetHexEscape1',hexEscape,actions.charsetContent],
['charsetHexEscape1>charsetHexEscape2',hexDigit,actions.charsetContent],
['charsetHexEscape2>charsetContent',hexDigit,actions.charsetHexEscape],
// [a\u54-9] should be treat as [4-9au5]
[charsetIncompleteEscapeStates+'>charsetContent','^\\]'+hexDigit+'-',actions.charsetContent],
[charsetIncompleteEscapeStates+',charsetNullChar,charsetContent>charsetRangeStart','-',actions.charsetContent],
['charsetRangeStart>charsetRangeEnd','^\\]',actions.charsetRangeEnd],
['charsetRangeEnd>charsetContent','^\\]',actions.charsetContent],
// Some troubles here, [0-\x39] means [0-9]
['charsetRangeStart>charsetRangeEndEscape','\\'],
['charsetRangeEndEscape>charsetRangeEnd',normalEscapeEX,actions.charsetRangeEndNormalEscape],
// No need to care [a-\0],it is not a valid range so will throw OutOfOrder error.
// But what about [\0-\0]? Insane!
['charsetRangeEndEscape>charsetRangeEndWithNullChar','0'],
['charsetRangeEndEscape>charsetRangeEndUnicodeEscape1',unicodeEscape,actions.charsetRangeEnd],
['charsetRangeEndUnicodeEscape1>charsetRangeEndUnicodeEscape2',hexDigit,actions.charsetContent],
['charsetRangeEndUnicodeEscape2>charsetRangeEndUnicodeEscape3',hexDigit,actions.charsetContent],
['charsetRangeEndUnicodeEscape3>charsetRangeEndUnicodeEscape4',hexDigit,actions.charsetContent],
['charsetRangeEndUnicodeEscape4>charsetRangeEnd',hexDigit,actions.charsetRangeEndUnicodeEscape],
['charsetRangeEndEscape>charsetRangeEndHexEscape1',hexEscape,actions.charsetRangeEnd],
['charsetRangeEndHexEscape1>charsetRangeEndHexEscape2',hexDigit,actions.charsetContent],
['charsetRangeEndHexEscape2>charsetRangeEnd',hexDigit,actions.charsetRangeEndHexEscape],
// [0-\w] means [-0\w]? Should throw error!
['charsetRangeEndEscape>charsetRangeEndClass',charClassEscape],
// [a-\uz] means [za-u],[a-\u-z] means [-za-u]
[charsetRangeEndIncompleteEscapeFirstStates+'>charsetContent','^\\]'+hexDigit,actions.charsetContent],
// [a-\u0-9] means [0-9a-u]
[charsetRangeEndIncompleteEscapeRemainStates+'>charsetRangeStart','-',actions.charsetContent],
[charsetIncompleteEscapeStates+','
+charsetRangeEndIncompleteEscapeStates
+',charsetNullChar,charsetRangeStart,charsetContent'
+',charsetClass,charsetExclude,charsetRangeEnd>exact',
']']
]
};
return parse;
});