// phpeggy
// Version: (not captured)
// PHP target for Peggy parser generator
// 848 lines (728 loc) • 25.9 kB
// JavaScript
"use strict";
const asts = require("peggy/lib/compiler/asts");
const Stack = require("peggy/lib/compiler/stack");
const op = require("../opcodes");
const internalUtils = require("../utils");
// Load static parser parts
const header = require("./generate-php/header");
const utilityFunctions = require("./generate-php/utility-functions");
const syntaxErrorClass = require("./generate-php/syntax-error-class");
const dataStorageClasses = require("./generate-php/data-storage-classes");
const commonMethods = require("./generate-php/common-methods");
/* Generates parser PHP code. */
module.exports = function(ast, options) {
// This pass reads the four tables below from the AST; refuse to run when
// any of them is missing.
const requiredTables = ["literals", "classes", "expectations", "functions"];
if (requiredTables.some(table => !ast[table])) {
  throw new Error(
    "generatePHP: generate bytecode was not called."
  );
}

const phpNamespace = options.phpeggy.parserNamespace;
const phpParserClass = options.phpeggy.parserClassName;

// Both prefixes stay empty unless a parser namespace is configured.
const phpGlobalNamespacePrefix = phpNamespace ? "\\" : "";
// For use within double quoted strings inside generated code: every run of
// backslashes in the namespace becomes a double backslash, and a double
// backslash is appended.
const phpGlobalNamePrefixOrNamespaceEscaped = phpNamespace
  ? `${phpNamespace.replace(/\\+/g, "\\\\")}\\\\`
  : "";
/*
 * Prefixes every non-empty line of `code` with `numberOfSpaces` spaces.
 * Empty lines are left alone so the output has no trailing whitespace.
 */
function indent(numberOfSpaces, code) {
  const padding = " ".repeat(numberOfSpaces);
  return code.replace(/^.+$/gm, line => padding + line);
}
/* Maps a grammar rule name to the name of its generated PHP parse method. */
function name(ruleName) {
  return `peg_parse_${ruleName}`;
}
/*
 * Builds the PHP property declarations for the literal, character class and
 * expectation tables of the AST. Returns the literal declarations first,
 * then the class declarations, then the expectation declarations.
 */
function generateTablesDeclaration() {
  // A [from, to] pair is rendered as "from-to", anything else as one escaped part.
  function renderClassPart(part) {
    if (Array.isArray(part)) {
      return part.map(internalUtils.escapePhpRegexp).join("-");
    }
    return internalUtils.escapePhpRegexp(part);
  }

  // Renders a character class as a quoted, start-anchored PHP regexp literal.
  function buildRegexp(cls) {
    const negation = cls.inverted ? "^" : "";
    const body = cls.value.map(part => renderClassPart(part)).join("");
    const flags = (cls.ignoreCase ? "i" : "") + (cls.unicode ? "u" : "");
    // Should use r modifier in future for fine tuning, only as of php 8.4.0
    return internalUtils.quotePhp(`/^[${negation}${body}]/${flags}`);
  }

  const literalDeclarations = ast.literals.map(
    (literal, index) => `private string $peg_l${index} = ${internalUtils.quotePhp(literal)};`
  );
  const classDeclarations = ast.classes.map(
    (cls, index) => `private string $peg_c${index} = ${buildRegexp(cls)};`
  );
  // Expectations are only declared here; their values are assigned separately.
  const expectationDeclarations = ast.expectations.map(
    (_expectation, index) => `private pegExpectation $peg_e${index};`
  );

  return [
    ...literalDeclarations,
    ...classDeclarations,
    ...expectationDeclarations,
  ];
}
/*
 * Builds the PHP statements that assign a pegExpectation object to each
 * expectation property, one statement per entry of ast.expectations.
 */
function generateTablesDefinition() {
  // Wraps already-rendered PHP argument expressions in a constructor call.
  function newPegExpectation(phpArguments) {
    return "new pegExpectation(" + phpArguments.join(", ") + ")";
  }

  // A [from, to] pair is rendered as "from-to", anything else as one escaped part.
  function renderClassPart(part) {
    if (Array.isArray(part)) {
      return part.map(internalUtils.escapePhp).join("-");
    }
    return internalUtils.escapePhp(part);
  }

  // Renders the PHP constructor call for one expectation, by expectation type.
  function buildExpectation(expectation) {
    const { type } = expectation;

    if (type === "rule") {
      return newPegExpectation([
        '"other"',
        internalUtils.quotePhp(expectation.value),
      ]);
    }

    if (type === "literal") {
      return newPegExpectation([
        '"literal"',
        internalUtils.quotePhp(internalUtils.quotePhp(expectation.value)),
        internalUtils.quotePhp(expectation.value),
        internalUtils.quotePhp(expectation.ignoreCase.toString()),
      ]);
    }

    if (type === "class") {
      const escapedClass = `[${
        expectation.value.map(part => renderClassPart(part)).join("")
      }]`;
      return newPegExpectation([
        '"class"',
        internalUtils.quotePhp(escapedClass),
        `"${escapedClass}"`,
        internalUtils.quotePhp(expectation.ignoreCase.toString()),
        internalUtils.quotePhp(expectation.unicode.toString()),
      ]);
    }

    if (type === "any") {
      return 'new pegExpectation("any", "any character")';
    }

    throw new Error("Unknown expectation type (" + JSON.stringify(expectation) + ")");
  }

  return ast.expectations.map(
    (expectation, index) => `$this->peg_e${index} = ${buildExpectation(expectation)};`
  );
}
/*
 * Renders every entry of ast.functions as a private PHP method named
 * peg_f<index>. Each method is returned as a single newline-joined string
 * that ends with an empty line.
 */
function generateFunctions() {
  return ast.functions.map((action, index) => {
    const parameterLines = action.params.map(param => ` mixed \$${param},`);
    const bodyLine = ` ${internalUtils.extractPhpCode(action.body).trim()}`;
    const methodLines = [
      `private function peg_f${index}(`,
      ...parameterLines,
      "): mixed {",
      bodyLine,
      "}",
      "",
    ];
    return methodLines.join("\n");
  });
}
/*
 * Returns the PHP lines placed at the top of a rule method when caching is
 * on: they compute the cache key from the current position, the number of
 * rules and `ruleIndexCode`, and return early on a cache hit.
 */
function generateCacheHeader(ruleIndexCode) {
  const keyAssignment
    = `$key = $this->peg_currPos * ${ast.rules.length} + ${ruleIndexCode};`;

  return [
    keyAssignment,
    "$cached = $this->peg_cache[$key] ?? false;",
    "",
    "if ($cached) {",
    " $this->peg_currPos = $cached->nextPos;",
    " return $cached->result;",
    "}",
    "",
  ];
}
/*
 * Returns the PHP lines placed at the end of a rule method when caching is
 * on: an empty line followed by the statement that stores `resultCode`
 * together with the current position under the cache key.
 */
function generateCacheFooter(resultCode) {
  const storeStatement
    = `$this->peg_cache[$key] = new pegCacheItem($this->peg_currPos, ${resultCode});`;
  return ["", storeStatement];
}
function generateRuleFunction(rule) {
const parts = [];
// PHP expression for |literals[i]| of the abstract machine
function l(i) {
  return `$this->peg_l${i}`;
}
// PHP expression for |classes[i]| of the abstract machine
function c(i) {
  return `$this->peg_c${i}`;
}
// PHP expression for |expectations[i]| of the abstract machine
function e(i) {
  return `$this->peg_e${i}`;
}
// PHP callee expression for |actions[i]| of the abstract machine
function f(i) {
  return `$this->peg_f${i}`;
}
/*
 * Renders a PHP call to $this->input_substr() for the given start and
 * length expressions.
 */
function inputSubstr(start, len) {
  /*
   * A direct array access would do for `len === 1`, but only if `start`
   * were guaranteed to lie within the bounds of the array. That guarantee
   * does not exist at the moment, so the method call is always used.
   */
  return `$this->input_substr(${start}, ${len})`;
}
// Compile-time stack helper from Peggy for this rule. The code below uses it
// to obtain the PHP temporaries that hold intermediate values: push() returns
// an assignment line, pop()/top()/index() return expressions.
// NOTE(review): "$s" is presumably the name prefix of those temporaries and
// "" the declaration keyword (none in PHP) — confirm against
// peggy/lib/compiler/stack.
const stack = new Stack(rule.name, "$s", "", rule.bytecode);
function compile(bc) {
let ip = 0;
const end = bc.length;
const parts = [];
// eslint-disable-next-line no-useless-assignment
let value = undefined;
/*
 * Compiles a conditional instruction located at `ip` into a PHP
 * if / else statement appended to `parts`.
 *
 * The instruction consists of the opcode, `argCount` operands, the length
 * of the "then" bytecode and the length of the "else" bytecode, followed by
 * both branches. `cond` is the PHP condition; `thenFn`, when given,
 * compiles the "then" branch instead of compile(). On return `ip` points
 * past both branches.
 */
function compileCondition(cond, argCount, thenFn) {
  const headerLength = argCount + 3;
  const thenLength = bc[ip + headerLength - 2];
  const elseLength = bc[ip + headerLength - 1];
  const hasElse = elseLength > 0;
  const indentBranchLine = line => indent(4, line);
  let thenLines = undefined;
  let elseLines = undefined;

  const compileThenBranch = () => {
    ip += headerLength;
    thenLines = (thenFn || compile)(bc.slice(ip, ip + thenLength));
    ip += thenLength;
  };
  const compileElseBranch = () => {
    elseLines = compile(bc.slice(ip, ip + elseLength));
    ip += elseLength;
  };

  // The stack helper runs the branch compilers; a missing else is passed as null.
  stack.checkedIf(ip, compileThenBranch, hasElse ? compileElseBranch : null);

  parts.push(`if (${cond}) {`);
  parts.push(...thenLines.map(indentBranchLine));
  if (hasElse) {
    parts.push("} else {");
    parts.push(...elseLines.map(indentBranchLine));
  }
  parts.push("}");
}
/*
MATCH_* opcodes typically do something like
if (<test>($this->input_substr($this->peg_currPos, length))) {
sN = $this->input_substr($this->peg_currPos, length);
...
} else {
sN = $this->peg_FAILED;
...
}
compileInputChunkCondition will convert that to
sN = $this->input_substr($this->peg_currPos, length);
if (<test>(sN)) {
...
} else {
sN = $this->peg_FAILED;
...
}
and avoid extracting the sub string twice.
*/
function compileInputChunkCondition(
condFn, argCount, inputChunkLength
) {
const baseLength = argCount + 3;
let inputChunk = inputSubstr("$this->peg_currPos", inputChunkLength);
let thenFn = null;
if (bc[ip + baseLength] === op.ACCEPT_N
&& bc[ip + baseLength + 1] === inputChunkLength
) {
// Push the assignment to the next available variable.
parts.push(stack.push(inputChunk));
inputChunk = stack.pop();
thenFn = bc => {
// The bc[0] is an ACCEPT_N, and bc[1] is the N. We've already done
// the assignment (before the if), so we just need to bump the
// stack, and increment $this->peg_currPos appropriately.
stack.sp++;
const code = compile(bc.slice(2));
code.unshift(
inputChunkLength > 1
? "$this->peg_currPos += " + inputChunkLength + ";"
: "$this->peg_currPos++;"
);
return code;
};
}
compileCondition(condFn(inputChunk, thenFn !== null), argCount, thenFn);
}
/*
 * Compiles a loop instruction located at `ip` into a PHP while statement
 * appended to `parts`. The instruction is the opcode followed by the length
 * of the body bytecode and the body itself; `cond` is the PHP loop
 * condition. On return `ip` points past the body.
 */
function compileLoop(cond) {
  const headerLength = 2;
  const bodyLength = bc[ip + headerLength - 1];
  let bodyLines = undefined;

  const compileBody = () => {
    ip += headerLength;
    bodyLines = compile(bc.slice(ip, ip + bodyLength));
    ip += bodyLength;
  };
  stack.checkedLoop(ip, compileBody);

  parts.push(`while (${cond}) {`);
  parts.push(...bodyLines.map(line => indent(4, line)));
  parts.push("}");
}
// Number of Unicode code points in `str` (not UTF-16 code units)
function countCodePoints(str) {
  const codePoints = Array.from(str);
  return codePoints.length;
}
/*
* Extracted into a function just to silence JSHint complaining about
* creating functions in a loop.
*
* Forwards the bytecode operand `p` to stack.index() and returns its result;
* the callers join those results into generated PHP code.
*/
function stackIndex(p) {
return stack.index(p);
}
/*
 * Renders the PHP call expression for the call instruction located at `ip`.
 * `baseLength` is the number of slots before the parameter list; the slot
 * just before the list holds the parameter count and bc[ip + 1] the index
 * of the action function.
 */
function compileCall(baseLength) {
  const paramsLength = bc[ip + baseLength - 1];
  const firstParam = ip + baseLength;
  const callee = f(bc[ip + 1]);
  const callArguments = bc
    .slice(firstParam, firstParam + paramsLength)
    .map(p => stackIndex(p))
    .join(", ");
  return `${callee}(${callArguments})`;
}
/*
 * Main translation loop: reads one instruction at `ip`, appends the PHP code
 * for it to `parts` and advances `ip` past the instruction.
 *
 * Plain instructions advance `ip` themselves (opcode plus operand count).
 * Conditional and loop instructions delegate to compileCondition(),
 * compileInputChunkCondition() and compileLoop(), which advance `ip` past
 * their nested bytecode.
 *
 * Throws an Error for an opcode without a case below.
 */
while (ip < end) {
  switch (bc[ip]) {
    case op.PUSH_EMPTY_STRING: // PUSH_EMPTY_STRING
      parts.push(stack.push("\"\""));
      ip++;
      break;

    // Undefined and null are both rendered as PHP null.
    case op.PUSH_UNDEFINED: // PUSH_UNDEFINED
      parts.push(stack.push("null"));
      ip++;
      break;

    case op.PUSH_NULL: // PUSH_NULL
      parts.push(stack.push("null"));
      ip++;
      break;

    case op.PUSH_FAILED: // PUSH_FAILED
      parts.push(stack.push("$this->peg_FAILED"));
      ip++;
      break;

    case op.PUSH_EMPTY_ARRAY: // PUSH_EMPTY_ARRAY
      parts.push(stack.push("[]"));
      ip++;
      break;

    case op.PUSH_CURR_POS: // PUSH_CURR_POS
      parts.push(stack.push("$this->peg_currPos"));
      ip++;
      break;

    case op.POP: // POP
      stack.pop();
      ip++;
      break;

    case op.POP_CURR_POS: // POP_CURR_POS
      parts.push("$this->peg_currPos = " + stack.pop() + ";");
      ip++;
      break;

    case op.POP_N: // POP_N n
      stack.pop(bc[ip + 1]);
      ip += 2;
      break;

    case op.NIP: // NIP
      // Drop the element below the top, keeping the top value.
      value = stack.pop();
      stack.pop();
      parts.push(stack.push(value));
      ip++;
      break;

    case op.APPEND: // APPEND
      value = stack.pop();
      parts.push(stack.top() + "[] = " + value + ";");
      ip++;
      break;

    case op.WRAP: // WRAP n
      parts.push(
        stack.push("[" + stack.pop(bc[ip + 1]).join(", ") + "]")
      );
      ip += 2;
      break;

    case op.TEXT: { // TEXT
      // The popped value is the start position; the text runs from there
      // up to the current position.
      const stackTop = stack.pop();
      parts.push(stack.push(
        inputSubstr(
          stackTop,
          "$this->peg_currPos - " + stackTop
        )
      ));
      ip++;
      break;
    }

    case op.PLUCK: { // PLUCK n, k, p1, ..., pK
      const baseLength = 3;
      const paramsLength = bc[ip + baseLength - 1];
      const n = baseLength + paramsLength;
      value = bc.slice(ip + baseLength, ip + n);
      // A single plucked element is used as is, several become a PHP array.
      value = (paramsLength === 1)
        ? stack.index(value[0])
        : `[ ${
          value.map(p => stackIndex(p)).join(", ")
        } ]`;
      stack.pop(bc[ip + 1]);
      parts.push(stack.push(value));
      ip += n;
      break;
    }

    case op.IF: // IF t, f
      compileCondition(stack.top(), 0);
      break;

    case op.IF_ERROR: // IF_ERROR t, f
      compileCondition(stack.top() + " === $this->peg_FAILED", 0);
      break;

    case op.IF_NOT_ERROR: // IF_NOT_ERROR t, f
      compileCondition(stack.top() + " !== $this->peg_FAILED", 0);
      break;

    case op.WHILE_NOT_ERROR: // WHILE_NOT_ERROR b
      compileLoop(stack.top() + " !== $this->peg_FAILED");
      break;

    case op.MATCH_ANY: // MATCH_ANY a, f, ...
      compileCondition("$this->input_length > $this->peg_currPos", 0);
      break;

    case op.MATCH_STRING: { // MATCH_STRING s, a, f, ...
      const litNum = bc[ip + 1];
      compileInputChunkCondition(
        inputChunk => `${inputChunk} === ${l(litNum)}`,
        1,
        countCodePoints(ast.literals[litNum])
      );
      break;
    }

    case op.MATCH_STRING_IC: { // MATCH_STRING_IC s, a, f, ...
      const litNum = bc[ip + 1];
      compileInputChunkCondition(
        inputChunk => `\\mb_strtolower(${inputChunk}, "UTF-8") === ${l(litNum)}`,
        1,
        countCodePoints(ast.literals[litNum])
      );
      break;
    }

    case op.MATCH_CHAR_CLASS: { // MATCH_CHAR_CLASS c, a, f, ...
      const regNum = bc[ip + 1];
      compileInputChunkCondition(
        inputChunk => `\\preg_match(${c(regNum)}, ${inputChunk})`,
        1,
        1
      );
      break;
    }

    case op.ACCEPT_N: // ACCEPT_N n
      parts.push(stack.push(
        inputSubstr("$this->peg_currPos", bc[ip + 1])
      ));
      parts.push(
        bc[ip + 1] > 1
          ? "$this->peg_currPos += " + bc[ip + 1] + ";"
          : "$this->peg_currPos++;"
      );
      ip += 2;
      break;

    case op.ACCEPT_STRING: { // ACCEPT_STRING s
      // The position advances by code points, not UTF-16 code units.
      const length = countCodePoints(ast.literals[bc[ip + 1]]);
      parts.push(stack.push(l(bc[ip + 1])));
      parts.push(
        length > 1
          ? `$this->peg_currPos += ${length};`
          : "$this->peg_currPos++;"
      );
      ip += 2;
      break;
    }

    case op.FAIL: // FAIL e
      parts.push(stack.push("$this->peg_FAILED"));
      parts.push("if ($this->peg_silentFails === 0) {");
      parts.push(" $this->peg_fail(" + e(bc[ip + 1]) + ");");
      parts.push("}");
      ip += 2;
      break;

    case op.IF_LT: // IF_LT min, t, f
      compileCondition("\\count(" + stack.top() + ") < " + bc[ip + 1], 1);
      break;

    case op.IF_GE: // IF_GE max, t, f
      compileCondition("\\count(" + stack.top() + ") >= " + bc[ip + 1], 1);
      break;

    // The dynamic variants read the bound from a stack slot. A non-numeric
    // bound makes IF_LT_DYNAMIC false and IF_GE_DYNAMIC true.
    case op.IF_LT_DYNAMIC: // IF_LT_DYNAMIC min, t, f
      value = stack.index(bc[ip + 1]);
      compileCondition("\\is_numeric(" + value + ") ? \\count(" + stack.top() + ") < " + value + " : false", 1);
      break;

    case op.IF_GE_DYNAMIC: // IF_GE_DYNAMIC max, t, f
      value = stack.index(bc[ip + 1]);
      compileCondition("\\is_numeric(" + value + ") ? \\count(" + stack.top() + ") >= " + value + " : true", 1);
      break;

    case op.LOAD_SAVED_POS: // LOAD_SAVED_POS p
      parts.push("$this->peg_reportedPos = " + stack.index(bc[ip + 1]) + ";");
      ip += 2;
      break;

    case op.UPDATE_SAVED_POS: // UPDATE_SAVED_POS
      parts.push("$this->peg_reportedPos = $this->peg_currPos;");
      ip++;
      break;

    case op.CALL: // CALL f, n, pc, p1, p2, ..., pN
      // Build the call expression before popping, since it reads stack slots.
      value = compileCall(4);
      stack.pop(bc[ip + 2]);
      parts.push(stack.push(value));
      ip += 4 + bc[ip + 3];
      break;

    case op.RULE: // RULE r
      parts.push(stack.push("$this->" + name(ast.rules[bc[ip + 1]].name) + "()"));
      ip += 2;
      break;

    case op.SILENT_FAILS_ON: // SILENT_FAILS_ON
      parts.push("$this->peg_silentFails++;");
      ip++;
      break;

    case op.SILENT_FAILS_OFF: // SILENT_FAILS_OFF
      parts.push("$this->peg_silentFails--;");
      ip++;
      break;

    default:
      // The options bag of Error only honours `cause`, so the rule name and
      // bytecode are attached there to keep them available for diagnosis.
      throw new Error(
        "Invalid opcode: " + bc[ip] + ".",
        { cause: { rule: rule.name, bytecode: bc } }
      );
  }
}
return parts;
}
const code = compile(rule.bytecode);
parts.push(
"private function " + name(rule.name) + "(): mixed",
"{"
);
if (options.cache) {
parts.push(...generateCacheHeader(
asts.indexOfRule(ast, rule.name)
).map(line => indent(4, line)));
}
parts.push(...code.map(line => indent(4, line)));
if (options.cache) {
parts.push(...generateCacheFooter(stack.result())
.map(line => indent(4, line)));
}
parts.push(
"",
" return " + stack.result() + ";",
"}",
""
);
return parts;
}
//
// Start collection of code for parser output
//
// `parts` collects the pieces of the generated PHP file in output order;
// they are joined with newlines at the very end.
const parts = [];
// Static file header loaded from ./generate-php/header.
parts.push(...header);
// Optional header text from the plugin options; anything but a string is ignored.
if (typeof options.phpeggy.header === "string") {
parts.push(
options.phpeggy.header,
""
);
}
parts.push(
"declare(strict_types=1);",
""
);
// A namespace statement is emitted only when a parser namespace is configured.
if (phpNamespace) {
parts.push(
"namespace " + phpNamespace + ";",
""
);
}
// Global initializer: PHP code emitted ahead of the generated classes.
if (ast.topLevelInitializer) {
  const topLevelInitializers = Array.isArray(ast.topLevelInitializer)
    ? ast.topLevelInitializer
    : [ast.topLevelInitializer];

  // Walk a reversed copy so library code is put before the code using it.
  topLevelInitializers.slice().reverse().forEach(block => {
    const phpCode = internalUtils.extractPhpCode(block.code.trim());
    // Blocks without any PHP code contribute nothing to the output.
    if (phpCode === "") {
      return;
    }
    parts.push(phpCode, "");
  });
}
// Static runtime support code. Each loader is called with the namespace
// prefix strings and its result is spread into `parts`.
parts.push(...utilityFunctions(
phpGlobalNamePrefixOrNamespaceEscaped
));
parts.push(...syntaxErrorClass(
phpGlobalNamePrefixOrNamespaceEscaped,
phpGlobalNamespacePrefix
));
parts.push(...dataStorageClasses(
phpGlobalNamePrefixOrNamespaceEscaped
));
// Open the parser class; everything pushed from here on is class content
// until the closing line is pushed at the end of generation.
parts.push(
"class " + phpParserClass,
"{"
);
// Parser state properties. The public cache property is declared only when
// options.cache is enabled.
parts.push(...[
...options.cache
? ["/** @var pegCacheItem[] */",
"public array $peg_cache = [];",
""]
: [],
"private int $peg_currPos = 0;",
"private int $peg_reportedPos = 0;",
"private int $peg_cachedPos = 0;",
"private pegCachedPosDetails $peg_cachedPosDetails;",
"private int $peg_maxFailPos = 0;",
"/** @var pegExpectation[] $peg_maxFailExpected */",
"private array $peg_maxFailExpected = [];",
"private int $peg_silentFails = 0;", // 0 = report failures, > 0 = silence failures
"/** @var string[] $input */",
"private array $input = [];",
"/** @var array<string, mixed> $options */",
"private array $options = [];",
"private int $input_length = 0;",
"private " + phpGlobalNamespacePrefix + "stdClass $peg_FAILED;",
'private string $peg_source = "";',
"",
].map(line => indent(4, line)));
// Property declarations for the literal, character class and expectation tables.
parts.push(...[
...generateTablesDeclaration(),
"",
].map(line => indent(4, line)));
// Constructor start
// The generated constructor creates the FAILED sentinel object and the
// cached position details object, then assigns the expectation table.
parts.push(...[
"public function __construct()",
"{",
" $this->peg_FAILED = new " + phpGlobalNamespacePrefix + "stdClass();",
" $this->peg_cachedPosDetails = new pegCachedPosDetails();",
].map(line => indent(4, line)));
// Expectation assignments are indented one level deeper than the method itself.
parts.push(...generateTablesDefinition().map(line => indent(8, line)));
parts.push(...[
"}",
"",
].map(line => indent(4, line)));
// Constructor end
// Grammar-provided methods: per-parse initializer code is placed inside the
// parser class, in declaration order.
if (ast.initializer) {
  const grammarInitializers = Array.isArray(ast.initializer)
    ? ast.initializer
    : [ast.initializer];

  grammarInitializers.forEach(block => {
    const phpCode = internalUtils.extractPhpCode(block.code.trim());
    // Blocks without any PHP code contribute nothing to the output.
    if (phpCode === "") {
      return;
    }
    parts.push(...[phpCode, ""].map(line => indent(4, line)));
  });
}
// START public function parse
// Docblock and signature of the generated parse() method.
parts.push(...[
"/**",
" * @param string|string[] $input",
" * @param mixed[] $args",
" * @throws " + phpGlobalNamespacePrefix + "Exception",
" * @throws SyntaxError",
" */",
"public function parse(",
" $input,",
" array ...$args",
"): mixed {",
].map(line => indent(4, line)));
// Reset the parser state, take the options from the first extra argument and
// store the input as an array: an array is used as is, a string is split
// with the /./us pattern.
parts.push(...[
"$this->peg_cleanup_state();",
"$this->options = $args[0] ?? [];",
"if (\\is_array($input)) {",
" $this->input = $input;",
"} else {",
' \\preg_match_all("/./us", $input, $match);',
" $this->input = $match[0];",
"}",
"$this->input_length = \\count($this->input);",
'$this->peg_source = $this->options["grammarSource"] ?? "";',
"",
].map(line => indent(8, line)));
// Switch the mb regex encoding to UTF-8 for the parse and remember the
// previous value; it is restored further down in the generated method.
parts.push(...[
"$old_regex_encoding = (string) \\mb_regex_encoding();",
'\\mb_regex_encoding("UTF-8");',
"",
].map(line => indent(8, line)));
// Call the optional initialize() hook when the generated class has such a
// method. The function name is fully qualified, like every other PHP
// built-in call emitted by this generator, so a parser living in a
// namespace cannot pick up a same-named function from that namespace.
parts.push(...[
  "if (\\method_exists($this, 'initialize')) {",
  " $this->initialize();",
  "}",
  "",
].map(line => indent(8, line)));
// PHP map from each allowed start rule name to the callable of its parse
// method; the first allowed rule is the default entry point.
const startRuleFunctions = options.allowedStartRules
  .map(ruleName => `"${ruleName}" => [$this, "${name(ruleName)}"]`)
  .join(", ");
const startRuleTableLine = `$peg_startRuleFunctions = [${startRuleFunctions}];`;
const defaultStartRuleLine
  = `$peg_startRuleFunction = [$this, "${name(options.allowedStartRules[0])}"];`;
parts.push(
  indent(8, startRuleTableLine),
  indent(8, defaultStartRuleLine)
);
// Honour the "startRule" option: an unknown rule name raises an Exception,
// a known one replaces the default start rule callable. Then run the parse.
parts.push(...[
'if (isset($this->options["startRule"])) {',
' if (!isset($peg_startRuleFunctions[$this->options["startRule"]])) {',
" throw new " + phpGlobalNamespacePrefix + 'Exception("Can\'t start parsing from rule \\"" . $this->options["startRule"] . "\\".");',
" }",
"",
' $peg_startRuleFunction = $peg_startRuleFunctions[$this->options["startRule"]];',
"}",
"",
"/* @var mixed $peg_result */",
"$peg_result = \\call_user_func($peg_startRuleFunction);",
"",
].map(line => indent(8, line)));
// With caching enabled the generated method empties the cache after the parse.
if (options.cache) {
parts.push(...[
"$this->peg_cache = [];",
"",
].map(line => indent(8, line)));
}
// Restore the regex encoding that was active before parse() changed it.
parts.push(...[
"\\mb_regex_encoding($old_regex_encoding);",
"",
].map(line => indent(8, line)));
// Result handling: a successful parse that consumed the whole input returns
// the result. A successful parse with input left over records an
// "end of input" failure. Every other path throws the built exception.
parts.push(...[
"if ($peg_result !== $this->peg_FAILED && $this->peg_currPos === $this->input_length) {",
" $this->peg_cleanup_state();", // Free up memory
" return $peg_result;",
"}",
"",
"if ($peg_result !== $this->peg_FAILED && $this->peg_currPos < $this->input_length) {",
' $this->peg_fail(new pegExpectation("end", "end of input"));',
"}",
"",
"$exception = $this->peg_buildException(null, $this->peg_maxFailExpected, $this->peg_maxFailPos);",
"$this->peg_cleanup_state();", // Free up memory
"throw $exception;",
].map(line => indent(8, line)));
// Closing brace of parse(); pushed as a literal line, not through indent().
parts.push(
" }",
""
);
// END public function parse
// Remaining class content: the shared runtime methods, then the action
// functions, then one parse method per grammar rule.
parts.push(...commonMethods(options.cache).map(line => indent(4, line)));
parts.push(...generateFunctions().map(line => indent(4, line)));
ast.rules.forEach(rule => {
parts.push(...generateRuleFunction(rule).map(line => indent(4, line)));
});
// Remove empty line
parts.pop();
// Close the parser class.
parts.push(
"};",
""
);
// The generated PHP source is stored on the AST; nothing is returned.
ast.code = parts.join("\n");
};
/*
* The MIT License (MIT)
*
* Copyright (c) 2014-2025 The PHPeggy AUTHORS
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/