abnf
Version:
Augmented Backus-Naur Form (ABNF) parsing. See RFC 5234.
621 lines (558 loc) • 16.1 kB
JavaScript
import assert from "node:assert";
// Abstract Syntax Tree for ABNF.
// Note: It is NOT the goal of this AST to preserve enough information
// to round-trip (at this time)
// Punctuation used when emitting a grammar in the "peggy" format.
// Each key is looked up by the formatter of one AST node type:
//   sep         - joiner between the elements of a Concatenation
//   alt         - separator between the choices of an Alternation
//   before      - opens an explicit repetition count (see Repeat)
//   after       - closes an explicit repetition count
//   count       - separates min from max inside a repetition count
//   rangeBefore - opens a character range (see Range)
//   rangeAfter  - closes a character range
//   rangeSep    - separates the first and last character of a range
const PEGGY_DELIMS = {
sep: " ",
alt: "/",
before: "|",
after: "|",
count: "..",
rangeBefore: "[",
rangeAfter: "]",
rangeSep: "-",
};
// Punctuation used when emitting a grammar in the "pest" format.
// Same keys, with the same meaning, as PEGGY_DELIMS.
const PEST_DELIMS = {
sep: " ~ ",
alt: "|",
before: "{",
after: "}",
count: ", ",
rangeBefore: "",
rangeAfter: "",
rangeSep: "..",
};
// Code point boundaries used when splitting ranges for UTF-16 output.
// Last code point below the surrogate block.
const BEFORE_SURROGATES = 0xd7ff;
// First high (leading) surrogate code unit.
const FIRST_HIGH = 0xd800;
// First low (trailing) surrogate code unit.
const FIRST_LOW = 0xdc00;
// Last low (trailing) surrogate code unit.
const LAST_LOW = 0xdfff;
// First code point above the surrogate block.
const AFTER_SURROGATES = 0xe000;
// Last code point of the Basic Multilingual Plane.
const LAST_BMP = 0xffff;
// First code point that needs a surrogate pair in UTF-16.
const FIRST_ASTRAL = 0x10000;
// Largest code point listed in UTF16_RANGES.
const LAST_ASTRAL = 0x10ffff;
// Number of low surrogates paired with each high surrogate; used as the
// divisor/modulus in surrogates().
const SURROGATE_PAGE_SIZE = 0x400;
// Inclusive [start, end] ranges handed to partitionRange() by Range.create().
// The surrogate block itself is deliberately left out, so any part of an
// input range that falls inside it is dropped.
const UTF16_RANGES = [
[0, BEFORE_SURROGATES],
// [FIRST_HIGH, LAST_LOW], // Ignore this range
[AFTER_SURROGATES, LAST_BMP],
[FIRST_ASTRAL, LAST_ASTRAL],
];
/**
* @typedef {object} FormatOptions
* @prop {string} [format='peggy']
* @prop {string|string[]} [startRule] Defaults to first
* @prop {boolean} [stubs=false]
* @prop {boolean} [unused=false]
*/
/**
* @typedef {Required<FormatOptions>} RequiredFormatOptions
* @prop {string} [FormatOptions.format='peggy']
*/
/**
 * Turn an ABNF rule name into an identifier usable in the output grammar by
 * replacing every hyphen with an underscore.
 *
 * @param {string} s Rule name, possibly containing "-".
 * @returns {string} The name with each "-" replaced by "_".
 */
function slug(s) {
  const pieces = s.split("-");
  return pieces.join("_");
}
/**
 * Quote a string as a double-quoted literal for the target grammar.
 *
 * Backslash, double quote, C0 control characters (0x00-0x1f), DEL and the
 * Latin-1 upper half (0x7f-0xff) are always escaped, using a short escape
 * where one exists and `\xHH` otherwise. Characters above 0xff are escaped
 * only when a known format is given:
 * - "peggy": `\uHHHH` for BMP characters, `\u{H...}` for astral ones.
 * - "pest": `\u{H...}` for everything.
 * With any other (or no) format, characters above 0xff are left as-is.
 *
 * @param {string} s Raw string contents.
 * @param {{format?: string}} [opts] Formatting options.
 * @returns {string} The quoted, escaped literal.
 */
function str(s, opts) {
  const SHORT_ESCAPES = {
    "\r": "r",
    "\n": "n",
    '"': '"',
    "\t": "t",
    "\v": "x0B",
    "\\": "\\",
  };
  // The class covers the whole C0 control range, through 0x1f.
  const body = s.replace(/[\\"\x00-\x1f\x7f-\xff]/g, c => {
    const esc = SHORT_ESCAPES[c]
      || `x${c.charCodeAt(0).toString(16).padStart(2, "0")}`;
    return `\\${esc}`;
  });
  let quoted = `"${body}"`;

  // Both replacements need the "g" flag so that every character is escaped,
  // and the "u" flag so that a surrogate pair is seen as one astral
  // character rather than two separate code units.
  if (opts?.format === "peggy") {
    quoted = quoted.replace(/[\u{100}-\u{10ffff}]/gu, c => {
      const cp = c.codePointAt(0);
      const hex = cp.toString(16);
      return (cp > 0xffff) ? `\\u{${hex}}` : `\\u${hex.padStart(4, "0")}`;
    });
  } else if (opts?.format === "pest") {
    quoted = quoted.replace(
      /[\u{100}-\u{10ffff}]/gu,
      c => `\\u{${c.codePointAt(0).toString(16)}}`
    );
  }
  return quoted;
}
/**
 * Format a node, or a (possibly nested) array of nodes, as grammar text.
 *
 * A single node is formatted with its own `toFormat`, receiving `parent`.
 * An array is formatted element by element and joined with `joiner`; the
 * elements of an array are formatted without a parent.
 *
 * @param {object} opts Formatting options passed through to each node.
 * @param {object|Array} a Node or array of nodes.
 * @param {string} joiner Text placed between array elements.
 * @param {string[]} needed Accumulator of referenced rule names.
 * @param {object} [parent] Node that contains `a`.
 * @returns {string}
 */
function fromArray(opts, a, joiner, needed, parent) {
  if (!Array.isArray(a)) {
    return a.toFormat(opts, needed, parent);
  }
  const pieces = [];
  for (const item of a) {
    pieces.push(fromArray(opts, item, joiner, needed));
  }
  return pieces.join(joiner);
}
/**
 * Report an unsupported output format. Never returns normally.
 *
 * @param {RequiredFormatOptions} opts Options whose `format` is not known.
 * @returns {never}
 * @throws {Error} Always, naming the offending format.
 */
function badFormat(opts) {
  const { format } = opts;
  const message = `Unknown format: "${format}"`;
  throw new Error(message);
}
/**
 * Pick the punctuation table for the requested output format.
 *
 * @param {RequiredFormatOptions} opts
 * @returns {object} PEGGY_DELIMS or PEST_DELIMS.
 * @throws {Error} If `opts.format` is neither "peggy" nor "pest".
 */
function delims(opts) {
  const { format } = opts;
  if (format === "peggy") {
    return PEGGY_DELIMS;
  }
  if (format === "pest") {
    return PEST_DELIMS;
  }
  return badFormat(opts);
}
/**
 * Split a code point into its high and low surrogate code units.
 *
 * @param {number} codePoint Code point; callers in this file pass values of
 *   FIRST_ASTRAL or above.
 * @returns {[number, number]} `[high, low]` surrogate values.
 */
function surrogates(codePoint) {
  // Distance into the astral planes; each high surrogate covers one "page"
  // of SURROGATE_PAGE_SIZE consecutive code points.
  const offset = codePoint - FIRST_ASTRAL;
  const high = Math.floor(offset / SURROGATE_PAGE_SIZE) + FIRST_HIGH;
  const low = (offset % SURROGATE_PAGE_SIZE) + FIRST_LOW;
  return [high, low];
}
/**
 * Clip the inclusive range [first, last] against each of the given ranges,
 * returning one chunk per range that it overlaps. Portions of the input
 * that fall in none of the ranges are dropped, so leaving a range out of
 * `parts` is how a caller ignores it.
 *
 * @param {number} first Inclusive start of the range to split.
 * @param {number} last Inclusive end of the range to split.
 * @param {[start: number, end: number][]} parts Ranges to check. The start
 *   and end points are both inclusive.
 * @returns {[start: number, end: number][]} Overlapping chunks, in the same
 *   order as `parts`.
 */
function partitionRange(first, last, parts) {
  return [...parts].flatMap(([start, end]) => {
    const overlaps = (first <= end) && (last >= start);
    if (!overlaps) {
      return [];
    }
    const lo = Math.max(first, start);
    const hi = Math.min(last, end);
    return [[lo, hi]];
  });
}
// Only exported for testing
/**
 * Common base of every AST node: records the node type, its source
 * location, and whether it is "simple" (the flag Repeat consults to decide
 * whether the element must be parenthesized).
 */
export class Base {
  /**
   * @param {string} type Node type name.
   * @param {object} loc Source location; must be a non-null object.
   * @param {boolean} [simple=true] Whether the node is simple.
   * @throws {TypeError} If `type` is not a string or `loc` is not an object.
   */
  constructor(type, loc, simple = true) {
    const typeIsString = (typeof type === "string");
    if (!typeIsString) {
      throw new TypeError(`Invalid type: ${type}`);
    }
    const locIsObject = (typeof loc === "object") && (loc !== null);
    if (!locIsObject) {
      throw new TypeError(`Invalid location: ${loc} for ${type}`);
    }
    this.type = type;
    this.loc = loc;
    this.simple = simple;
  }

  /* c8 ignore start */
  /**
   * Legacy entry point; always throws on the base class.
   *
   * @abstract
   * @deprecated Use `toFormat({format: 'peggy'})`
   */
  toPeggy(_opts) {
    const message = `Deprecated conversion to peggy grammar [${this.type}]`;
    throw new Error(message);
  }

  /**
   * Format this node as grammar text. Subclasses must override; the base
   * implementation always throws.
   *
   * @param {RequiredFormatOptions} _opts
   */
  toFormat(_opts) {
    const message = `Unimplmented formatter [${this.type}]`;
    throw new Error(message);
  }
  /* c8 ignore stop */
}
/**
 * A prose description from the ABNF source. It cannot be converted to a
 * real grammar expression, so formatting yields an expression whose action
 * reports an error.
 */
export class Prose extends Base {
  /**
   * @param {string} str Text of the prose description.
   * @param {object} loc Source location.
   */
  constructor(str, loc) {
    super("prose", loc, false);
    this.str = str;
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @returns {string} An any-character match with an error action.
   */
  toFormat(opts) {
    const { format } = opts;
    const description = this.str;
    return `. { error(\`Can't convert prose description to ${format} grammar: "${description}"\`) }`;
  }
}
/**
 * A string literal that matches without regard to case.
 */
export class CaseInsensitiveString extends Base {
  /**
   * @param {string} str The literal text.
   * @param {object} loc Source location.
   */
  constructor(str, loc) {
    super("caseInsensitveString", loc);
    this.str = str;
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @returns {string} Quoted literal, marked case-insensitive when needed.
   * @throws {Error} If the literal has letters and the format is unknown.
   */
  toFormat(opts) {
    const quoted = str(this.str, opts);
    const hasLetters = /[a-z]/i.test(this.str);
    if (!hasLetters) {
      // Not worth the "i" modifier if there's no [a-zA-Z] characters.
      return quoted;
    }
    if (opts.format === "peggy") {
      return `${quoted}i`;
    }
    if (opts.format === "pest") {
      return `^${quoted}`;
    }
    return badFormat(opts);
  }
}
/**
 * A string literal that must match exactly.
 */
export class CaseSensitiveString extends Base {
  /**
   * @param {string} str The literal text.
   * @param {*} base Stored unchanged on the node as `base`; not used by
   *   this class's formatter.
   * @param {object} loc Source location.
   */
  constructor(str, base, loc) {
    super("caseSensitveString", loc);
    Object.assign(this, { str, base });
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @returns {string} The quoted, escaped literal.
   */
  toFormat(opts) {
    const text = this.str;
    return str(text, opts);
  }
}
/**
 * A sequence of elements that must match one after another.
 */
export class Concatenation extends Base {
  /**
   * @param {Array} elements The elements, in order.
   * @param {object} loc Source location.
   */
  constructor(elements, loc) {
    super("concatenation", loc, false);
    this.elements = elements;
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @param {string[]} needed Accumulator of referenced rule names.
   * @returns {string} The elements joined by the format's separator.
   */
  toFormat(opts, needed) {
    const joiner = delims(opts).sep;
    return fromArray(opts, this.elements, joiner, needed);
  }
}
/**
 * A choice between alternatives. Alternations passed in as alternatives
 * are flattened into this one.
 */
export class Alternation extends Base {
  /**
   * @param {Array} alts The alternatives; any that are themselves
   *   Alternations contribute their own alternatives instead.
   * @param {object} loc Source location.
   */
  constructor(alts, loc) {
    super("alternation", loc, false);
    this.alts = alts.flatMap(
      candidate => ((candidate instanceof Alternation) ? candidate.alts : candidate)
    );
  }

  /**
   * Append one more alternative.
   *
   * @param {object} alt
   */
  add(alt) {
    this.alts.push(alt);
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @param {string[]} needed Accumulator of referenced rule names.
   * @param {object} [parent] Containing node, if known.
   * @returns {string} The alternatives joined by the format's choice mark.
   */
  toFormat(opts, needed, parent) {
    const { alt } = delims(opts);
    // Top level alts go on new lines
    const isTopLevel = parent?.type === "rule";
    const joiner = isTopLevel ? `\n ${alt} ` : ` ${alt} `;
    return fromArray(opts, this.alts, joiner, needed);
  }
}
/**
 * An element together with the repeat specification applied to it.
 */
export class Repetition extends Base {
  /**
   * @param {object} rep Repeat specification; its `toFormat` does the work.
   * @param {object} el The element being repeated.
   * @param {object} loc Source location.
   */
  constructor(rep, el, loc) {
    super("repetition", loc);
    this.rep = rep;
    this.el = el;
  }

  /**
   * Delegate to the repeat specification, handing it the element.
   *
   * @param {RequiredFormatOptions} opts
   * @param {string[]} needed Accumulator of referenced rule names.
   * @returns {string}
   */
  toFormat(opts, needed) {
    const { rep, el } = this;
    return rep.toFormat(opts, needed, el);
  }
}
/**
 * A min/max repeat count, formatted as a suffix on the repeated element.
 */
export class Repeat extends Base {
  /**
   * @param {number} min Minimum number of repetitions.
   * @param {number|null} max Maximum number of repetitions; `null` is
   *   treated as unbounded when choosing `*` or `+`.
   * @param {object} loc Source location.
   */
  constructor(min, max, loc) {
    super("repeat", loc);
    this.min = min;
    this.max = max;
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @param {string[]} needed Accumulator of referenced rule names.
   * @param {object} el The element being repeated.
   * @returns {string} The element (parenthesized unless simple) followed by
   *   the repeat suffix.
   */
  toFormat(opts, needed, el) {
    const { before, after, count } = delims(opts);
    const { min, max } = this;

    // Prefer the one-character operators where they apply.
    let suffix = null;
    if ((min === 0) && (max === 1)) {
      suffix = "?";
    } else if ((min === 0) && (max === null)) {
      suffix = "*";
    } else if ((min === 1) && (max === null)) {
      suffix = "+";
    }

    if (suffix === null) {
      // Min is always an integer. Max may be undefined or an int > min.
      if (min !== max) {
        // It's idiomatic Peggy to not use 0 as the min of a range.
        suffix = `${before}${min || ""}${count}${max || ""}${after}`;
      } else if (min > 1) {
        suffix = `${before}${min}${after}`;
      } else {
        // 1*1 is a legal no-op
        suffix = "";
      }
    }

    const inner = fromArray(opts, el, " ", needed);
    const wrapped = el.simple ? inner : `(${inner})`;
    return wrapped + suffix;
  }
}
/**
 * A "#" list repeat: a comma-separated list of the element, with optional
 * bounds on the number of items. Only the peggy format is supported.
 */
export class HashRepeat extends Base {
  /**
   * @param {number} min Minimum item count; falsy means no lower bound.
   * @param {number} max Maximum item count; falsy means no upper bound.
   * @param {object} loc Source location.
   */
  constructor(min, max, loc) {
    super("hash_repeat", loc);
    this.min = min;
    this.max = max;
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @param {string[]} needed Accumulator of referenced rule names.
   * @param {object} el The list element; must be simple.
   * @returns {string} Peggy expression for the list.
   * @throws {Error} If the format is not "peggy".
   */
  toFormat(opts, needed, el) {
    if (opts.format !== "peggy") {
      // Need help from a pest-atarian.
      badFormat(opts);
    }
    // Shape of the output, e.g. for `range-set = 1#range-spec`:
    // range_set
    // = @ranges:(rs:(range_spec?)|1.., OWS "," OWS| {return rs.filter(r => r)}) &{
    // return ranges.length > 0
    // }
    // Min is always an integer. Max may be undefined or an int > min.
    const item = fromArray(opts, el, " ", needed);
    assert(el.simple, "Non-simple elements should be impossible here");
    const list = `some:(${item}?)|.., [ \\t]* "," [ \\t]*| {return some.filter(s => s)}`;

    const { min, max } = this;
    const checks = [];
    if (min) {
      checks.push(`(count.length >= ${min})`);
    }
    if (max) {
      checks.push(`(count.length <= ${max})`);
    }
    if (checks.length === 0) {
      return list;
    }
    // Bounds are enforced by a semantic predicate on the collected items.
    return `@count:(${list}) &{ return ${checks.join(" && ")} }`;
  }
}
/**
 * An inclusive range of character codes, `first` through `last`.
 *
 * Use `Range.create` rather than the constructor: it collapses one-element
 * ranges into a string and, for UTF-16 output, rewrites ranges that reach
 * past the surrogate block into alternations of surrogate-pair sequences.
 */
export class Range extends Base {
  /**
   * @param {*} base Stored unchanged on the node as `base`.
   * @param {number} first Inclusive first code.
   * @param {number} last Inclusive last code.
   * @param {object} loc Source location.
   */
  constructor(base, first, last, loc) {
    super("range", loc);
    this.base = base;
    this.first = first;
    this.last = last;
  }

  /**
   * Build the node(s) that match the inclusive range [first, last].
   *
   * @param {*} base Passed through to the created nodes.
   * @param {number} first Inclusive first code point.
   * @param {number} last Inclusive last code point.
   * @param {object} loc Source location.
   * @param {boolean} [utf16=true] Split the range around the surrogate block
   *   and express astral parts as surrogate pairs.
   * @returns {Base} A CaseSensitiveString (single code point), a Range, a
   *   Concatenation (surrogate pair), or an Alternation of those.
   * @throws {Error} If `first > last`, or if the range contains nothing but
   *   surrogates.
   */
  static create(base, first, last, loc, utf16 = true) {
    if (first > last) {
      throw new Error(`Range out of order ${first.toString(16)}-${last.toString(16)}`);
    }
    if (first === last) {
      return new CaseSensitiveString(String.fromCodePoint(first), base, loc);
    }
    if (utf16 && (last > BEFORE_SURROGATES)) {
      const alts = [];
      for (const [start, end] of partitionRange(first, last, UTF16_RANGES)) {
        if (start < FIRST_ASTRAL) {
          // No surrogates needed
          alts.push(Range.create(base, start, end, loc, false));
        } else {
          // Pure astral range.
          // This code follows the logic in regenerate:
          // https://github.com/mathiasbynens/regenerate/blob/11567339f40fd262435934d544885bc047cb4220/regenerate.js#L996
          // I didn't use regenerate directly because:
          // a) I only needed a small part of it
          // b) Regenerate will only generate a string, which I then would have
          // to parse to get the info I needed out.
          // I believe this use is within the spirit of the MIT license.
          //
          // NOTE: regenerate stores half-open ranges (end + 1); every range
          // built here is inclusive at both ends.
          const [startH, startL] = surrogates(start);
          const [endH, endL] = surrogates(end);
          let complete = false;
          if (
            (startH === endH) || ((startL === FIRST_LOW) && (endL === LAST_LOW))
          ) {
            // One rectangle covers everything: highs x lows.
            alts.push(new Concatenation([
              Range.create(base, startH, endH, loc, false),
              Range.create(base, startL, endL, loc, false),
            ], loc));
            complete = true;
          } else {
            // First part of range, where startL might be greater than FIRST_LOW
            // May one day be combined with below if startL === FIRST_LOW
            alts.push(new Concatenation([
              new CaseSensitiveString(String.fromCodePoint(startH), base, loc),
              Range.create(base, startL, LAST_LOW, loc, false),
            ], loc));
          }
          if (!complete && (startH + 1 < endH)) {
            if (endL === LAST_LOW) {
              // The last page is full, so the middle pages and the last page
              // combine: highs startH+1..endH (inclusive), all lows.
              alts.push(new Concatenation([
                Range.create(base, startH + 1, endH, loc, false),
                Range.create(base, FIRST_LOW, endL, loc, false),
              ], loc));
              complete = true;
            } else {
              // Full middle pages only: highs startH+1..endH-1, all lows.
              alts.push(new Concatenation([
                Range.create(base, startH + 1, endH - 1, loc, false),
                Range.create(base, FIRST_LOW, LAST_LOW, loc, false),
              ], loc));
            }
          }
          if (!complete) {
            // Partial last page: endH with lows FIRST_LOW..endL.
            alts.push(new Concatenation([
              new CaseSensitiveString(String.fromCodePoint(endH), base, loc),
              Range.create(base, FIRST_LOW, endL, loc, false),
            ], loc));
          }
        }
      }
      switch (alts.length) {
        case 0:
          throw new Error(`Range consists of all surrogates ${first.toString(16)}-${last.toString(16)}`);
        case 1:
          return alts[0];
        default:
          return new Alternation(alts, loc);
      }
    }
    return new Range(base, first, last, loc);
  }

  /**
   * Escape one character code for use inside a range.
   *
   * @param {RequiredFormatOptions} opts
   * @param {number} num Character code.
   * @returns {string} `\xHH` or `\uHHHH` for peggy; a quoted `'\u{H...}'`
   *   character for pest.
   * @throws {Error} For peggy when `num` is above the BMP, or when the
   *   format is unknown.
   */
  static escape(opts, num) {
    if (opts.format === "peggy") {
      if (num <= 0xff) {
        return "\\x" + num.toString(16).padStart(2, "0");
      } else if (num <= LAST_BMP) {
        return "\\u" + num.toString(16).padStart(4, "0");
      } else {
        throw new Error(`0x${num.toString(16)} does not fit in UTF-16`);
      }
    }
    if (opts.format === "pest") {
      return `'\\u{${num.toString(16).padStart(2, "0")}}'`;
    }
    return badFormat(opts);
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @returns {string} The range in the target grammar's syntax.
   */
  toFormat(opts) {
    const { rangeBefore, rangeAfter, rangeSep } = delims(opts);
    const first = Range.escape(opts, this.first);
    const last = Range.escape(opts, this.last);
    if ((opts.format === "peggy") && (this.first + 1 === this.last)) {
      // Two adjacent characters: list both instead of writing a range.
      return `${rangeBefore}${first}${last}${rangeAfter}`;
    }
    return `${rangeBefore}${first}${rangeSep}${last}${rangeAfter}`;
  }
}
/**
 * A reference to another rule by name.
 */
export class RuleRef extends Base {
  /**
   * @param {string} name Name of the referenced rule, as written.
   * @param {object} loc Source location.
   */
  constructor(name, loc) {
    super("ruleref", loc);
    this.name = name;
  }

  /**
   * Record the referenced rule as needed and emit its identifier.
   *
   * @param {RequiredFormatOptions} opts
   * @param {string[]} needed Accumulator of referenced rule names.
   * @returns {string} The rule name with hyphens replaced.
   */
  toFormat(opts, needed) {
    const { name } = this;
    // Do not upcase here, so that unused rules will stay in the original case.
    needed.push(name);
    return slug(name);
  }
}
/**
 * A named rule and its definition.
 */
export class Rule extends Base {
  /**
   * @param {string} name Rule name, as written.
   * @param {object} def The rule's definition.
   * @param {object} loc Source location.
   */
  constructor(name, def, loc) {
    super("rule", loc);
    this.name = name;
    this.def = def;
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @param {string[]} needed Accumulator of referenced rule names.
   * @returns {string} The complete rule, followed by a blank line.
   * @throws {Error} If the format is unknown.
   */
  toFormat(opts, needed) {
    const { format } = opts;
    // Reject unknown formats before formatting the definition.
    if ((format !== "peggy") && (format !== "pest")) {
      return badFormat(opts);
    }
    const ident = slug(this.name);
    const body = fromArray(opts, this.def, " ", needed, this);
    if (format === "peggy") {
      return `${ident}\n = ${body}\n\n`;
    }
    return `${ident} = {\n ${body}\n}\n\n`;
  }

  /**
   * Add another alternative to this rule, converting the definition into
   * an Alternation first if it is not one already.
   *
   * @param {object} def The alternative to add.
   * @param {object} loc Location of the addition; its `end` becomes the end
   *   of the definition's location.
   * @returns {Rule} This rule.
   */
  addAlternate(def, loc) {
    const current = this.def;
    const alternation = (current instanceof Alternation)
      ? current
      : new Alternation([current], current.loc);
    this.def = alternation;
    alternation.add(def);
    alternation.loc.end = loc.end;
    return this;
  }
}
/**
 * A parenthesized group.
 */
export class Group extends Base {
  /**
   * @param {object} alt The grouped contents.
   * @param {object} loc Source location.
   */
  constructor(alt, loc) {
    super("group", loc);
    this.alt = alt;
  }

  /**
   * @param {RequiredFormatOptions} opts
   * @param {string[]} needed Accumulator of referenced rule names.
   * @returns {string} The contents wrapped in parentheses.
   */
  toFormat(opts, needed) {
    const inner = fromArray(opts, this.alt, " ", needed);
    return "(" + inner + ")";
  }
}
/**
 * The full set of rules parsed from an ABNF source, keyed by upper-cased
 * rule name, plus every rule reference that was created through `addRef`.
 */
export class Rules extends Base {
  constructor() {
    super("rules", {}); // Location will be replaced later
    this.defs = {};
    this.refs = [];
    this.first = null;
  }

  /**
   * Emit a grammar containing the start rule(s) and every rule reachable
   * from them, in the order they are first needed.
   *
   * @param {FormatOptions} [opts] Defaults: format "peggy", startRule the
   *   first rule added, stubs false, unused false.
   * @returns {string} Grammar text; "" if there is no start rule.
   * @throws {Error} If the format is unknown, or a needed rule is undefined
   *   and `opts.stubs` is false.
   */
  toFormat(opts = {}) {
    // Work on a copy so the caller's options object is left alone.
    opts = /** @type {RequiredFormatOptions} */ ({
      format: "peggy",
      startRule: this.first,
      stubs: false,
      unused: false,
      ...opts,
    });
    if (!["peggy", "pest"].includes(opts.format)) {
      badFormat(opts);
    }
    if (opts.startRule === null) {
      return "";
    }
    if (typeof opts.startRule === "string") {
      opts.startRule = [opts.startRule];
    }

    // Work list of rule names; formatting a rule appends the names it
    // references.
    const needed = [...opts.startRule];
    const done = new Set();
    let res = "";
    while (needed.length > 0) {
      const ro = needed.shift();
      const r = ro.toUpperCase();
      if (!done.has(r)) {
        done.add(r);
        const rule = this.defs[r];
        if (!rule) {
          if (opts.stubs) {
            // References are emitted as slug(name), so the stub must be
            // defined under the same identifier.
            res += `${slug(ro)} = . { throw new Error("Unknown rule '${ro}'") }\n`;
            continue;
          } else {
            throw new Error(`Unknown rule: "${r}"`);
          }
        }
        res += rule.toFormat(opts, needed);
      }
    }
    if (opts.unused) {
      for (const r of Object.values(this.defs)) {
        const ru = r.name.toUpperCase();
        if (!done.has(ru)) {
          done.add(ru);
          res += "// Unused rule\n";
          res += r.toFormat(opts, needed);
        }
      }
    }
    return res;
  }

  /**
   * Convert to a peggy grammar.
   *
   * @param {FormatOptions} [opts] Any `format` given is overridden.
   * @returns {string} Grammar text.
   * @deprecated Use `toFormat({format: 'peggy'})`
   */
  toPeggy(opts = {}) {
    // Copy rather than assign, so the caller's object is not modified.
    return this.toFormat({ ...opts, format: "peggy" });
  }

  /**
   * Define a new rule. The first rule added becomes the default start rule.
   *
   * @param {string} name Rule name; compared case-insensitively.
   * @param {object} def The rule's definition.
   * @param {object} loc Source location; `loc.start.line` is used in the
   *   duplicate-definition error.
   * @returns {Rule} The new rule.
   * @throws {Error} If a rule with this name already exists.
   */
  addRule(name, def, loc) {
    const n = name.toUpperCase();
    if (!this.first) {
      this.first = n;
    }
    if (Object.prototype.hasOwnProperty.call(this.defs, n)) {
      throw new Error(`Duplicate rule definition (line ${loc.start.line}): ${name}`);
    }
    const ret = new Rule(name, def, loc);
    this.defs[n] = ret;
    return ret;
  }

  /**
   * Add an alternative to an existing rule.
   *
   * @param {string} name Rule name; compared case-insensitively.
   * @param {object} def The alternative to add.
   * @param {object} loc Source location of the addition.
   * @returns {Rule} The extended rule.
   * @throws {Error} If no rule with this name exists.
   */
  addAlternate(name, def, loc) {
    const rule = this.defs[name.toUpperCase()];
    if (!rule) {
      throw new Error(`Trying to add to a non-existant rule (line ${loc.start.line}): ${name}`);
    }
    return rule.addAlternate(def, loc);
  }

  /**
   * Create a rule reference and remember it for `findRefs`.
   *
   * @param {string} name Referenced rule name.
   * @param {object} loc Source location.
   * @returns {RuleRef}
   */
  addRef(name, loc) {
    const r = new RuleRef(name, loc);
    this.refs.push(r);
    return r;
  }

  /**
   * Find every recorded reference to a rule, ignoring case.
   *
   * @param {string} name Rule name.
   * @returns {RuleRef[]}
   */
  findRefs(name) {
    const nameU = name.toUpperCase();
    return this.refs.filter(ref => ref.name.toUpperCase() === nameU);
  }
}