// phpjs
// Version:
// 640 lines (637 loc) • 21 kB
// JavaScript
function token_get_all (source) {
    // Split given source into PHP tokens
    // +   original by: Marco Marchiò
    // +   improved by: Brett Zamir (http://brett-zamir.me)
    // -    depends on: token_name
    // %        note 1: Token numbers depend on the PHP version
    // %        note 2: token_name is only necessary for a non-standard php.js-specific use of this function;
    // %        note 2: if you define an object on this.php_js.phpParser (where "this" is the scope of the
    // %        note 2: token_get_all function (either a namespaced php.js object or the window object)),
    // %        note 2: this function will call that object's methods if they have the same names as the tokens,
    // %        note 2: passing them the string, line number, and token number (in that order)
    // %        note 3: NOTE(review): the body below never references token_name or this.php_js.phpParser,
    // %        note 3: so the callback behaviour described in note 2 is not implemented in this version.
    // %        note 4: Anything before the first PHP open tag is buffered as inline HTML (T_INLINE_HTML).
    // *     example 1: token_get_all('/'+'* comment *'+'/');
    // *     returns 1: [[310, '/* comment */', 1]]
    //
    // source: the string to tokenize.
    // Returns an array whose entries are either a plain string (tokens that have no
    // numeric code, e.g. ";" or "[") or a triple [tokenCode, text, lineNumber].

    // Token to number conversion
    var tokens = {
        T_REQUIRE_ONCE: 261,
        T_REQUIRE: 260,
        T_EVAL: 259,
        T_INCLUDE_ONCE: 258,
        T_INCLUDE: 257,
        T_LOGICAL_OR: 262,
        T_LOGICAL_XOR: 263,
        T_LOGICAL_AND: 264,
        T_PRINT: 265,
        T_SR_EQUAL: 276,
        T_SL_EQUAL: 275,
        T_XOR_EQUAL: 274,
        T_OR_EQUAL: 273,
        T_AND_EQUAL: 272,
        T_MOD_EQUAL: 271,
        T_CONCAT_EQUAL: 270,
        T_DIV_EQUAL: 269,
        T_MUL_EQUAL: 268,
        T_MINUS_EQUAL: 267,
        T_PLUS_EQUAL: 266,
        T_BOOLEAN_OR: 277,
        T_BOOLEAN_AND: 278,
        T_IS_NOT_IDENTICAL: 282,
        T_IS_IDENTICAL: 281,
        T_IS_NOT_EQUAL: 280,
        T_IS_EQUAL: 279,
        T_IS_GREATER_OR_EQUAL: 284,
        T_IS_SMALLER_OR_EQUAL: 283,
        T_SR: 286,
        T_SL: 285,
        T_INSTANCEOF: 287,
        T_UNSET_CAST: 296,
        T_BOOL_CAST: 295,
        T_OBJECT_CAST: 294,
        T_ARRAY_CAST: 293,
        T_STRING_CAST: 292,
        T_DOUBLE_CAST: 291,
        T_INT_CAST: 290,
        T_DEC: 289,
        T_INC: 288,
        T_CLONE: 298,
        T_NEW: 297,
        T_EXIT: 299,
        T_IF: 300,
        T_ELSEIF: 301,
        T_ELSE: 302,
        T_ENDIF: 303,
        T_LNUMBER: 304,
        T_DNUMBER: 305,
        T_STRING: 306,
        T_STRING_VARNAME: 307,
        T_VARIABLE: 308,
        T_NUM_STRING: 309,
        T_INLINE_HTML: 310,
        T_CHARACTER: 311,
        T_BAD_CHARACTER: 312,
        T_ENCAPSED_AND_WHITESPACE: 313,
        T_CONSTANT_ENCAPSED_STRING: 314,
        T_ECHO: 315,
        T_DO: 316,
        T_WHILE: 317,
        T_ENDWHILE: 318,
        T_FOR: 319,
        T_ENDFOR: 320,
        T_FOREACH: 321,
        T_ENDFOREACH: 322,
        T_DECLARE: 323,
        T_ENDDECLARE: 324,
        T_AS: 325,
        T_SWITCH: 326,
        T_ENDSWITCH: 327,
        T_CASE: 328,
        T_DEFAULT: 329,
        T_BREAK: 330,
        T_CONTINUE: 331,
        T_GOTO: 332,
        T_FUNCTION: 333,
        T_CONST: 334,
        T_RETURN: 335,
        T_TRY: 336,
        T_CATCH: 337,
        T_THROW: 338,
        T_USE: 339,
        T_GLOBAL: 340,
        T_PUBLIC: 346,
        T_PROTECTED: 345,
        T_PRIVATE: 344,
        T_FINAL: 343,
        T_ABSTRACT: 342,
        T_STATIC: 341,
        T_VAR: 347,
        T_UNSET: 348,
        T_ISSET: 349,
        T_EMPTY: 350,
        T_HALT_COMPILER: 351,
        T_CLASS: 352,
        T_INTERFACE: 353,
        T_EXTENDS: 354,
        T_IMPLEMENTS: 355,
        T_OBJECT_OPERATOR: 356,
        T_DOUBLE_ARROW: 357,
        T_LIST: 358,
        T_ARRAY: 359,
        T_CLASS_C: 360,
        T_METHOD_C: 361,
        T_FUNC_C: 362,
        T_LINE: 363,
        T_FILE: 364,
        T_COMMENT: 365,
        T_DOC_COMMENT: 366,
        T_OPEN_TAG: 367,
        T_OPEN_TAG_WITH_ECHO: 368,
        T_CLOSE_TAG: 369,
        T_WHITESPACE: 370,
        T_START_HEREDOC: 371,
        T_END_HEREDOC: 372,
        T_DOLLAR_OPEN_CURLY_BRACES: 373,
        T_CURLY_OPEN: 374,
        T_PAAMAYIM_NEKUDOTAYIM: 375,
        T_NAMESPACE: 376,
        T_NS_C: 377,
        T_DIR: 378,
        T_NS_SEPARATOR: 379
    };

    // Keywords tokens. Ordinary keywords are keyed in lower case, magic constants in
    // upper case; the lookup in the keyword branch tries both spellings.
    var keywordsToken = {
        "abstract": tokens.T_ABSTRACT,
        "array": tokens.T_ARRAY,
        "as": tokens.T_AS,
        "break": tokens.T_BREAK,
        "case": tokens.T_CASE,
        "catch": tokens.T_CATCH,
        "class": tokens.T_CLASS,
        "__CLASS__": tokens.T_CLASS_C,
        "clone": tokens.T_CLONE,
        "const": tokens.T_CONST,
        "continue": tokens.T_CONTINUE,
        "declare": tokens.T_DECLARE,
        "default": tokens.T_DEFAULT,
        "__DIR__": tokens.T_DIR,
        "die": tokens.T_EXIT,
        "do": tokens.T_DO,
        "echo": tokens.T_ECHO,
        "else": tokens.T_ELSE,
        "elseif": tokens.T_ELSEIF,
        "empty": tokens.T_EMPTY,
        "enddeclare": tokens.T_ENDDECLARE,
        "endfor": tokens.T_ENDFOR,
        "endforeach": tokens.T_ENDFOREACH,
        "endif": tokens.T_ENDIF,
        "endswitch": tokens.T_ENDSWITCH,
        "endwhile": tokens.T_ENDWHILE,
        "eval": tokens.T_EVAL,
        "exit": tokens.T_EXIT,
        "extends": tokens.T_EXTENDS,
        "__FILE__": tokens.T_FILE,
        "final": tokens.T_FINAL,
        "for": tokens.T_FOR,
        "foreach": tokens.T_FOREACH,
        "function": tokens.T_FUNCTION,
        "__FUNCTION__": tokens.T_FUNC_C,
        "global": tokens.T_GLOBAL,
        "goto": tokens.T_GOTO,
        "__halt_compiler": tokens.T_HALT_COMPILER,
        "if": tokens.T_IF,
        "implements": tokens.T_IMPLEMENTS,
        "include": tokens.T_INCLUDE,
        "include_once": tokens.T_INCLUDE_ONCE,
        "instanceof": tokens.T_INSTANCEOF,
        "interface": tokens.T_INTERFACE,
        "isset": tokens.T_ISSET,
        "__LINE__": tokens.T_LINE,
        "list": tokens.T_LIST,
        "and": tokens.T_LOGICAL_AND,
        "or": tokens.T_LOGICAL_OR,
        "xor": tokens.T_LOGICAL_XOR,
        "__METHOD__": tokens.T_METHOD_C,
        "namespace": tokens.T_NAMESPACE,
        "__NAMESPACE__": tokens.T_NS_C,
        "new": tokens.T_NEW,
        "print": tokens.T_PRINT,
        "private": tokens.T_PRIVATE,
        "public": tokens.T_PUBLIC,
        "protected": tokens.T_PROTECTED,
        "require": tokens.T_REQUIRE,
        "require_once": tokens.T_REQUIRE_ONCE,
        "return": tokens.T_RETURN,
        "static": tokens.T_STATIC,
        "switch": tokens.T_SWITCH,
        "throw": tokens.T_THROW,
        "try": tokens.T_TRY,
        "unset": tokens.T_UNSET,
        "use": tokens.T_USE,
        "var": tokens.T_VAR,
        "while": tokens.T_WHILE
    };

    // Type casting tokens, keyed by the lower-cased type name found between the parentheses
    var typeCasting = {
        "array": tokens.T_ARRAY_CAST,
        "bool": tokens.T_BOOL_CAST,
        "boolean": tokens.T_BOOL_CAST,
        "real": tokens.T_DOUBLE_CAST,
        "double": tokens.T_DOUBLE_CAST,
        "float": tokens.T_DOUBLE_CAST,
        "int": tokens.T_INT_CAST,
        "integer": tokens.T_INT_CAST,
        "object": tokens.T_OBJECT_CAST,
        "string": tokens.T_STRING_CAST,
        "unset": tokens.T_UNSET_CAST,
        "binary": tokens.T_STRING_CAST
    };

    // Symbols tokens. Symbols matched by the tokenizer that are NOT listed here
    // (";", "(", "{", quotes, ...) are emitted as plain strings without a code.
    var symbols = {
        "&=": tokens.T_AND_EQUAL,
        "&&": tokens.T_BOOLEAN_AND,
        "||": tokens.T_BOOLEAN_OR,
        "?>": tokens.T_CLOSE_TAG,
        "%>": tokens.T_CLOSE_TAG,
        ".=": tokens.T_CONCAT_EQUAL,
        "--": tokens.T_DEC,
        "/=": tokens.T_DIV_EQUAL,
        "=>": tokens.T_DOUBLE_ARROW,
        "::": tokens.T_PAAMAYIM_NEKUDOTAYIM,
        "++": tokens.T_INC,
        "==": tokens.T_IS_EQUAL,
        ">=": tokens.T_IS_GREATER_OR_EQUAL,
        "===": tokens.T_IS_IDENTICAL,
        "!=": tokens.T_IS_NOT_EQUAL,
        "<>": tokens.T_IS_NOT_EQUAL,
        "!==": tokens.T_IS_NOT_IDENTICAL,
        "<=": tokens.T_IS_SMALLER_OR_EQUAL,
        "-=": tokens.T_MINUS_EQUAL,
        "%=": tokens.T_MOD_EQUAL,
        "*=": tokens.T_MUL_EQUAL,
        "\\": tokens.T_NS_SEPARATOR,
        "->": tokens.T_OBJECT_OPERATOR,
        "|=": tokens.T_OR_EQUAL,
        "+=": tokens.T_PLUS_EQUAL,
        "<<": tokens.T_SL,
        "<<=": tokens.T_SL_EQUAL,
        ">>": tokens.T_SR,
        ">>=": tokens.T_SR_EQUAL,
        "^=": tokens.T_XOR_EQUAL
    };

    // Buffer tokens: the code used when a buffer of the given type is flushed
    var bufferTokens = {
        "html": tokens.T_INLINE_HTML,
        "inlineComment": tokens.T_COMMENT,
        "comment": tokens.T_COMMENT,
        "docComment": tokens.T_DOC_COMMENT,
        "singleQuote": tokens.T_CONSTANT_ENCAPSED_STRING,
        "doubleQuotes": tokens.T_CONSTANT_ENCAPSED_STRING,
        "nowdoc": tokens.T_ENCAPSED_AND_WHITESPACE,
        "heredoc": tokens.T_ENCAPSED_AND_WHITESPACE
    };

    // Buffer type. Start an html buffer immediately.
    var bufferType = "html";
    // Buffer content
    var buffer = "";
    // Current regexp match and current token text
    var match;
    var token;
    // Last emitted token (its code, or its text when it has no code); null after buffering
    var lastToken;
    // Results array
    var ret = [];
    // Word that started the heredoc or nowdoc buffer
    var heredocWord;
    // Line number
    var line = 1;
    // Line at which the buffer begins
    var lineBuffer = 1;
    // Flag that indicates if the current double quoted string has been split
    var split;
    // This variable will store the previous buffer type of the tokenizer before parsing a
    // complex variable syntax
    var complexVarPrevBuffer;
    // Number of open brackets inside a complex variable syntax
    var openBrackets;
    // Scratch variables used by the integer and inline-comment handling
    var isHex;
    var spToken;
    var lf;

    // Regexp that matches starting whitespaces
    var nextWS = /^\s/;
    // Regexp that matches starting line feeds
    var nextLF = /^\r?\n/;
    // Regexp to remove characters and get the type in type casting tokens
    var castType = /^\(\s*|\s*\)$/g;
    // Regexp used to find additional whitespaces matched by the first group of the main regexp
    var additionalSpaces = /(\r?\n)(\s+)$/;
    // Regexp used to find line feed characters in a string
    var newLines = /\n/g;
    // Regexp used to strip useless characters from heredoc start declaration
    var heredocStripChars = /^<<<\s*"?|["']?\r?\n/g;
    // Regexp to check if the characters that follow a word are valid as heredoc end declaration
    var heredocEndFollowing = /^;?\r?\n/;
    // Tokenizer regexp. Capture groups, in order: 1 whitespace, 2 open tag, 3 keyword,
    // 4 type cast, 5 float, 6 integer, 7 variable, 8 comment sign, 9 heredoc/nowdoc start,
    // 10 symbol, 11 word, 12 any other single character.
    var tokenizer = /(\s+)|(<(?:\?(?:php\r?\s?|=)?|%=?))|\b(__halt_compiler|__CLASS__|__DIR__|__FILE__|__FUNCTION__|__LINE__|__METHOD__|__NAMESPACE__|abstract|and|array|as|break|case|catch|class|clone|const|continue|declare|default|die|do|echo|elseif|else|empty|enddeclare|endforeach|endfor|endif|endswitch|endwhile|eval|exit|final|foreach|for|function|extends|global|goto|if|implements|include_once|include|instanceof|interface|isset|list|namespace|new|or|xor|print|private|protected|public|require_once|require|return|static|switch|throw|try|unset|use|var|while)\b|(\(\s*(?:array|bool(?:ean)?|real|double|float|int(?:eger)?|object|string|unset|binary)\s*\))|((?:\d+(?:\.\d*)?|\d*\.\d+)e[\+\-]?\d+|\d*\.\d+|\d+\.\d*)|(\d+(?:x[0-9a-fA-F]+)?)|(\$[a-zA-Z_][a-zA-Z_0-9]*)|(\/\/|\/\*\*?|\*\/|#)|(<<<\s*['"]?[a-zA-Z]\w*['"]?\r?\n)|(&[=&]?|\.=?|\/=?|-[=\->]?|::?|\^=?|%[=>]?|\?>?|\+[=\+]?|\*=?|\|[=\|]?|!={0,2}|=(?:>|={1,2})?|>>?=?|<(?:>|<?=?)?|[\\;\(\)\{\}\[\],~@`\$"'])|(\w+)|(.)/ig;

    // Function to emit tokens. While a buffer is open (and preventBuffer is falsy) the text
    // is appended to the buffer instead of being emitted. Otherwise it is pushed to the
    // results as [code, text, line] or, when no code is given, as the bare text.
    var emitToken = function (text, code, preventBuffer, l) {
        if (!preventBuffer && bufferType) {
            buffer += text;
            lastToken = null;
        } else {
            lastToken = code ? code : text;
            ret.push(code ? [code, text, l || line] : text);
        }
    };

    // Function to emit and close the current buffer
    var emitBuffer = function () {
        if (buffer) {
            emitToken(buffer, bufferTokens[bufferType], true, lineBuffer);
        }
        buffer = "";
        bufferType = null;
    };

    // Function to check if the token at the current index is escaped, i.e. preceded by an
    // odd number of consecutive backslashes
    var isEscaped = function () {
        var escaped = false,
            i = match.index - 1;
        for (; i >= 0; i--) {
            if (source.charAt(i) !== "\\") {
                break;
            }
            escaped = !escaped;
        }
        return escaped;
    };

    // This function is used to split a double quoted string or a heredoc buffer after a variable
    // has been found inside it
    var splitString = function () {
        // Don't emit empty buffers
        if (!buffer) {
            return;
        }
        // If the buffer is a double quoted string and it has not yet been split, emit the double
        // quotes as a token without an associated code
        if (bufferType === "doubleQuotes" && !split) {
            split = true;
            emitToken('"', null, true);
            buffer = buffer.substr(1);
        }
        if (buffer) {
            emitToken(buffer, tokens.T_ENCAPSED_AND_WHITESPACE, true, lineBuffer);
        }
        buffer = "";
        lineBuffer = line;
    };

    // Returns the number of line feed characters in the given string
    var getNewLines = function (str) {
        var count = 0;
        str.replace(newLines, function () {
            count++;
        });
        return count;
    };

    while ((match = tokenizer.exec(source))) {
        if (match[1]) {
            // Whitespace
            token = match[1];
            // Since PHP closing tag token matches also the following line feed
            // character, if the last token was a PHP closing tag and the current
            // one starts with a line feed, this character must be removed and
            // added to the previous token
            if (lastToken === tokens.T_CLOSE_TAG) {
                token = token.replace(nextLF, function (a) {
                    ret[ret.length - 1][1] += a;
                    line++;
                    lineBuffer++;
                    return "";
                });
                if (!token) {
                    continue;
                }
            }
            emitToken(token, tokens.T_WHITESPACE);
            if (token.indexOf("\n") > -1) {
                // Increment line number if the token contains one or more line feed characters
                line += getNewLines(token);
                // Close the inline comment buffer if it's open
                if (bufferType === "inlineComment") {
                    // Since the regexp matches multiple whitespaces but the comment token includes
                    // only the first line feed, the other whitespaces must be emitted as a separate
                    // token
                    spToken = false;
                    buffer = buffer.replace(additionalSpaces, function (a, p, n) {
                        spToken = n;
                        return p;
                    });
                    emitBuffer();
                    if (spToken) {
                        lf = getNewLines(spToken);
                        emitToken(spToken, tokens.T_WHITESPACE, true, line - lf);
                    }
                }
            }
        } else if (match[2]) {
            // PHP Open tags
            token = match[2];
            // If there's an active html buffer emit it as a token
            if (bufferType === "html") {
                emitBuffer();
            }
            emitToken(
                token,
                token === "<?=" || token === "<%=" ? tokens.T_OPEN_TAG_WITH_ECHO : tokens.T_OPEN_TAG
            );
            // The open tag can swallow one trailing whitespace character, possibly a line feed
            if (token.indexOf("\n") > -1) {
                line++;
            }
        } else if (match[3]) {
            // Keywords
            token = match[3];
            if ((bufferType === "heredoc" || bufferType === "doubleQuotes") &&
                    (lastToken === "[" || lastToken === tokens.T_OBJECT_OPERATOR)) {
                // "$obj->class" or "$arr[list]" inside a string or heredoc: lastToken can only
                // hold these values here when the interpolation syntax has just been emitted, so
                // the keyword is a property name / literal index and must be emitted on its own
                // as T_STRING, exactly like the plain-word branch does
                emitToken(token, tokens.T_STRING, true);
            } else {
                // If it's preceded by -> then it's an object property and it must be tokenized as
                // T_STRING. The table stores keywords in lower case and magic constants in upper
                // case, while the regexp is case-insensitive, so try every spelling.
                emitToken(
                    token,
                    lastToken === tokens.T_OBJECT_OPERATOR ?
                            tokens.T_STRING :
                            keywordsToken[token] ||
                            keywordsToken[token.toLowerCase()] ||
                            keywordsToken[token.toUpperCase()]
                );
            }
        } else if (match[4]) {
            // Type-casting
            token = match[4].replace(castType, "").toLowerCase();
            emitToken(match[4], typeCasting[token]);
        } else if (match[5]) {
            // Floating point numbers
            emitToken(match[5], tokens.T_DNUMBER);
        } else if (match[6]) {
            // Integer numbers
            // Numeric array index inside a heredoc or a double quoted string
            if (lastToken === "[" && (bufferType === "heredoc" || bufferType === "doubleQuotes")) {
                emitToken(match[6], tokens.T_NUM_STRING, true);
            } else {
                token = match[6];
                isHex = token.charAt(1).toLowerCase() === "x";
                // If it's greater than 2147483647 it's considered as a floating point number.
                // Parse once with the right radix: converting a huge hex value to a number and
                // re-parsing its string form ("1.2e+24") would wrongly yield a small integer.
                emitToken(
                    token,
                    parseInt(token, isHex ? 16 : 10) < 2147483648 ?
                            tokens.T_LNUMBER :
                            tokens.T_DNUMBER
                );
            }
        } else if (match[7]) {
            // Variable
            // If there's an active buffer emit the token only if it's inside a double quoted string
            // or a heredoc and it's not escaped
            if ((bufferType === "heredoc" || bufferType === "doubleQuotes") && !isEscaped()) {
                splitString();
                emitToken(match[7], tokens.T_VARIABLE, true);
            } else {
                emitToken(match[7], tokens.T_VARIABLE);
            }
        } else if (match[8]) {
            // Comment signs
            token = match[8];
            // Change the buffer only if there's no active buffer
            if (!bufferType) {
                if (token === "//" || token === "#") {
                    bufferType = "inlineComment";
                } else if (token === "/**") {
                    // "/**" opens a doc comment only when followed by whitespace
                    bufferType = nextWS.test(source.substr(match.index + token.length)) ?
                            "docComment" :
                            "comment";
                } else if (token === "/*") {
                    bufferType = "comment";
                }
                lineBuffer = line;
            }
            emitToken(token);
            // Close the multi line comment buffer if there's one open
            if (token === "*/" && (bufferType === "comment" || bufferType === "docComment")) {
                emitBuffer();
            }
        } else if (match[9]) {
            // Heredoc and nowdoc start declaration
            token = match[9];
            emitToken(token, tokens.T_START_HEREDOC);
            line++;
            if (!bufferType) {
                heredocWord = token.replace(heredocStripChars, "");
                // If the first character is a quote then it's a nowdoc otherwise it's an heredoc
                if (heredocWord.charAt(0) === "'") {
                    // Strip the leading quote
                    heredocWord = heredocWord.substr(1);
                    bufferType = "nowdoc";
                } else {
                    bufferType = "heredoc";
                }
                lineBuffer = line;
            }
        } else if (match[10]) {
            // Symbols
            token = match[10];
            if (token in symbols) {
                // Syntax $obj->prop inside strings and heredoc
                if (token === "->" && lastToken === tokens.T_VARIABLE && (bufferType === "heredoc" ||
                        bufferType === "doubleQuotes")) {
                    emitToken(token, symbols[token], true);
                    continue;
                }
                emitToken(token, symbols[token]);
                // If the token is a PHP close tag and there isn't an active buffer start an html buffer
                if (!bufferType && symbols[token] === tokens.T_CLOSE_TAG) {
                    bufferType = "html";
                    lineBuffer = line;
                }
            } else {
                // Start string buffers if there isn't an active buffer
                if (!bufferType && (token === "'" || token === '"')) {
                    if (token === "'") {
                        bufferType = "singleQuote";
                    } else {
                        split = false;
                        bufferType = "doubleQuotes";
                    }
                    lineBuffer = line;
                    // Add the token to the buffer and continue to skip next checks
                    emitToken(token);
                    continue;
                } else if (token === '"' && bufferType === "doubleQuotes" && !isEscaped()) {
                    // If the string has been split emit the current buffer and the double quotes
                    // as separate tokens
                    if (split) {
                        splitString();
                        bufferType = null;
                        emitToken('"');
                    } else {
                        emitToken('"');
                        emitBuffer();
                    }
                    continue;
                } else if (bufferType === "heredoc" || bufferType === "doubleQuotes") {
                    // Array index delimiters inside heredoc or double quotes
                    if ((token === "[" && lastToken === tokens.T_VARIABLE) ||
                            (token === "]" && (lastToken === tokens.T_NUM_STRING ||
                            lastToken === tokens.T_STRING))) {
                        emitToken(token, null, true);
                        continue;
                    } else if (((token === "$" && source.charAt(match.index + 1) === "{") ||
                            (token === "{" && source.charAt(match.index + 1) === "$")) &&
                            !isEscaped()) {
                        // Complex variable syntax ${varname} or {$varname}. Store the current
                        // buffer type and evaluate next tokens as there's no active buffer.
                        // The current buffer will be reset when the declaration is closed
                        splitString();
                        complexVarPrevBuffer = bufferType;
                        bufferType = null;
                        if (token === "$") {
                            emitToken(token + "{", tokens.T_DOLLAR_OPEN_CURLY_BRACES);
                            openBrackets = 0;
                        } else {
                            emitToken(token, tokens.T_CURLY_OPEN);
                            openBrackets = 1;
                        }
                        continue;
                    }
                } else if (complexVarPrevBuffer && !openBrackets && token === "{") {
                    // Skip the token if it's the bracket that follows the dollar in the
                    // ${varname} syntax because it's included in the previous token
                    openBrackets++;
                    continue;
                }
                emitToken(token);
                // Increment or decrement the number of open brackets inside a complex
                // variable syntax
                if (complexVarPrevBuffer && (token === "{" || token === "}")) {
                    if (token === "{") {
                        openBrackets++;
                    } else if (!--openBrackets) {
                        // If every bracket has been closed reset the previous buffer
                        bufferType = complexVarPrevBuffer;
                        complexVarPrevBuffer = null;
                    }
                } else if (token === "'" && bufferType === "singleQuote" && !isEscaped()) {
                    // Stop the single quoted string buffer if the character is a quote,
                    // there's an open single quoted string buffer and the character is
                    // not escaped
                    emitBuffer();
                }
            }
        } else if (match[11]) {
            // Word
            token = match[11];
            // If there's an open nowdoc or heredoc buffer, the string is the same as the one
            // that has started the buffer, it's preceded by a line feed and followed by the
            // right characters then emit the buffer and the word
            if ((bufferType === "nowdoc" || bufferType === "heredoc") && token === heredocWord &&
                    source.charAt(match.index - 1) === "\n" &&
                    heredocEndFollowing.test(source.substr(match.index + token.length))) {
                emitBuffer();
                emitToken(token, tokens.T_END_HEREDOC);
                continue;
            } else if (bufferType === "heredoc" || bufferType === "doubleQuotes") {
                if (lastToken === "[") {
                    // Literal array index inside a heredoc or a double quoted string
                    emitToken(token, tokens.T_STRING, true);
                    continue;
                } else if (lastToken === tokens.T_OBJECT_OPERATOR) {
                    // Syntax $obj->prop inside strings and heredoc
                    emitToken(token, tokens.T_STRING, true);
                    continue;
                }
            } else if (complexVarPrevBuffer && lastToken === tokens.T_DOLLAR_OPEN_CURLY_BRACES) {
                // Complex variable syntax ${varname}
                emitToken(token, tokens.T_STRING_VARNAME);
                continue;
            }
            emitToken(token, tokens.T_STRING);
        } else {
            // Other characters. Every other group was empty, so group 12 holds the character.
            // It must be read from the match: "token" still holds the previous iteration's
            // text here (or undefined on the very first match).
            token = match[12];
            // If below ASCII 32 it's a bad character
            if (token.charCodeAt(0) < 32) {
                emitToken(token, tokens.T_BAD_CHARACTER);
            } else {
                // If there isn't an open buffer there should be a syntax error, but we don't care
                // so it will be emitted as a simple string
                emitToken(token, tokens.T_STRING);
            }
        }
    }

    // If there's an open buffer emit it
    if (bufferType && (bufferType !== "doubleQuotes" || !split)) {
        emitBuffer();
    } else {
        splitString();
    }
    return ret;
}