// @thi.ng/parse
// Version:
// Purely functional parser combinators & AST generation for generic inputs
// 379 lines (378 loc) • 11.5 kB
// JavaScript
import { DEFAULT, defmulti } from "@thi.ng/defmulti/defmulti";
import { illegalArgs } from "@thi.ng/errors/illegal-arguments";
import { unsupported } from "@thi.ng/errors/unsupported";
import { alt, altD } from "./combinators/alt.js";
import { dynamic } from "./combinators/dynamic.js";
import { lookahead } from "./combinators/lookahead.js";
import { maybe } from "./combinators/maybe.js";
import { not } from "./combinators/not.js";
import { oneOrMore, repeat, zeroOrMore } from "./combinators/repeat.js";
import { seq, seqD } from "./combinators/seq.js";
import { xform } from "./combinators/xform.js";
import { defContext } from "./context.js";
import { ALPHA, ALPHA_NUM } from "./presets/alpha.js";
import { BINARY_UINT, BIT } from "./presets/bits.js";
import { DIGIT } from "./presets/digits.js";
import { ESC, UNICODE } from "./presets/escape.js";
import { HEX_DIGIT, HEX_UINT } from "./presets/hex.js";
import { FLOAT, INT, UINT } from "./presets/numbers.js";
import { STRING } from "./presets/string.js";
import { DNL, NL, SPACE, WS, WS0, WS1 } from "./presets/whitespace.js";
import { always, alwaysD } from "./prims/always.js";
import {
inputEnd,
inputStart,
lineEnd,
lineStart,
wordBoundary
} from "./prims/anchor.js";
import { lit, litD } from "./prims/lit.js";
import { noneOf } from "./prims/none-of.js";
import { oneOf } from "./prims/one-of.js";
import { range, rangeD } from "./prims/range.js";
import { string, stringD } from "./prims/string.js";
import { collect, xfCollect } from "./xform/collect.js";
import { xfCount } from "./xform/count.js";
import { discard, xfDiscard } from "./xform/discard.js";
import { hoist, hoistResult, xfHoist, xfHoistResult } from "./xform/hoist.js";
import { join, xfJoin } from "./xform/join.js";
import { xfJson } from "./xform/json.js";
import { nest } from "./xform/nest.js";
import { xfFloat, xfInt } from "./xform/number.js";
import { print, xfPrint } from "./xform/print.js";
import { xfReplace } from "./xform/replace.js";
import { xfTrim } from "./xform/trim.js";
import { withID } from "./xform/with-id.js";
// Single-quote delimiter of a char literal (used by LIT below).
const APOS = litD("'");
// Dash separating the two bounds of a char range (used by CHAR_RANGE below).
const DASH = litD("-");
// Optional repetition suffix of a term or alt group. Either one of the chars
// `?`, `*`, `+` (node ID "repeat") or a braced form made of `{`, UINT, an
// optional `,`, an optional UINT and `}` (node ID "repeatN", wrapped in
// `collect`). `__compileRepeat` reads the latter's result as `[n, sep, m]`.
const REPEAT = maybe(
alt([
oneOf("?*+", "repeat"),
collect(
seq(
[litD("{"), UINT, maybe(lit(",")), maybe(UINT), litD("}")],
"repeatN"
)
)
])
);
// Optional `!` suffix; `__compileDiscard` checks the node's result for "!".
// NOTE(review): the extra `maybe` args are presumably the fallback result
// (`undefined`) and the node ID ("discard") — confirm against `maybe` docs.
const DISCARD = maybe(lit("!"), void 0, "discard");
// A single char inside a literal or range: the UNICODE preset, the ESC preset
// or `always()` (listed in that order in the `alt`).
const CHAR_OR_ESC = alt([UNICODE, ESC, always()]);
// Char range of the form `<char>-<char>` (node ID "charRange"). The
// `charRange` implementation of `__compile` reads its two children as bounds.
const CHAR_RANGE = seq([CHAR_OR_ESC, DASH, CHAR_OR_ESC], "charRange");
// Char selection in square brackets (node ID "charSel"). Children, in the
// order relied upon by the `charSel` implementation of `__compile`:
//   0: optional `^` (ID "invert" when present)
//   1: "choice" node holding one or more char ranges, UNICODE matches or
//      single chars other than `]` (ID "char")
const CHAR_SEL = seq(
[
litD("["),
maybe(lit("^", "invert")),
oneOrMore(alt([CHAR_RANGE, UNICODE, noneOf("]", "char")]), "choice"),
litD("]")
],
"charSel"
);
// The `.` wildcard (node ID "any"), compiled to `always()` / `alwaysD()`.
const ANY = lit(".", "any");
// Char literal between apostrophes (seq ID "char", wrapped in `hoistResult`).
const LIT = hoistResult(seq([APOS, CHAR_OR_ESC, APOS], "char"));
// Symbol name: one or more ALPHA_NUM matches or any of `.`, `-`, `_`, `$`,
// wrapped in `join` (node ID "sym").
const SYM = join(oneOrMore(alt([ALPHA_NUM, oneOf(".-_$")]), "sym"));
// Reference to another rule, written as `<sym>` (node ID "ref").
const RULE_REF = seq([litD("<"), SYM, litD(">")], "ref");
// The core of a term, without any repeat / discard / lookahead suffixes.
const TERM_BODY = alt([RULE_REF, ANY, LIT, STRING, CHAR_SEL]);
// Optional lookahead suffix: `(?`, a sign char (`-` or `+`), a term body with
// its own repeat / discard suffixes (ID "lhterm") and a closing `)`. The seq
// has ID "lhspec", which is what `__compileLookahead` tests for.
// NOTE(review): "lhnone" is presumably the node ID used when the suffix is
// absent — confirm against `maybe` docs.
const LOOK_AHEAD = maybe(
seq(
[
stringD("(?"),
oneOf("-+"),
seq([TERM_BODY, REPEAT, DISCARD], "lhterm"),
litD(")")
],
"lhspec"
),
void 0,
"lhnone"
);
// Full term (node ID "term"). The `term` implementation of `__compile`
// destructures the children as [body, repeat, discard, lookahead].
const TERM = seq([TERM_BODY, REPEAT, DISCARD, LOOK_AHEAD], "term");
// Parenthesized group of `|`-separated terms, followed by the same optional
// suffixes as a term (node ID "alt"). The `alt` implementation of `__compile`
// destructures the children as
// [first term, node holding further terms, repeat, discard, lookahead].
const ALT = seq(
[
litD("("),
WS0,
TERM,
zeroOrMore(seq([WS0, litD("|"), WS0, TERM])),
WS0,
litD(")"),
REPEAT,
DISCARD,
LOOK_AHEAD
],
"alt"
);
// Optional rule transform: `=>` followed by a symbol, a rule reference or a
// string. The seq is wrapped in `hoist`; the `rule` implementation of
// `__compile` then dispatches on the IDs "sym", "ref" and "string".
const RULE_XF = hoist(
seq([stringD("=>"), WS1, alt([SYM, RULE_REF, STRING]), WS1], "xform")
);
// Single rule definition: `<sym> : <body> [=> <xform>] ;` (node ID "rule").
// The `rule` implementation of `__compile` destructures the children as
// [id, body, xform].
const RULE = seq(
[
WS0,
SYM,
WS0,
litD(":"),
oneOrMore(alt([TERM, ALT, WS1]), "body"),
maybe(RULE_XF),
litD(";"),
WS0
],
"rule"
);
// Comment introduced by `#`, built with `seqD` and a lookahead on DNL.
// NOTE(review): presumably consumes the rest of the line without producing
// an AST node — confirm against the `seqD` / `lookahead` / DNL docs.
const COMMENT = seqD([WS0, litD("#"), lookahead(always(), DNL)]);
// Top-level grammar parser: zero or more rules or comments (node ID "rules").
const GRAMMAR = zeroOrMore(alt([RULE, COMMENT]), "rules");
/**
 * Returns the first child of the given AST node.
 *
 * @param scope - node with a `children` array
 */
const __first = (scope) => {
  const { children } = scope;
  return children[0];
};
/**
 * Returns the child at index `n` of the given AST node.
 *
 * @param scope - node with a `children` array
 * @param n - child index
 */
const __nth = (scope, n) => {
  const { children } = scope;
  return children[n];
};
/**
 * Recursively checks if the given AST node is, or contains, a rule reference
 * (node ID "ref") whose name is not part of `builtins`.
 *
 * @param term - AST node to inspect
 * @param builtins - set of rule names which are already defined
 * @returns true as soon as one such reference is found, else false
 */
const __hasDynRuleRefs = (term, builtins) => {
  const isUnknownRef = term.id === "ref" && !builtins.has(__first(term).result);
  if (isUnknownRef) return true;
  if (!term.children) return false;
  for (const child of term.children) {
    if (__hasDynRuleRefs(child, builtins)) return true;
  }
  return false;
};
/**
 * Multi-dispatch compiler turning nodes of the parsed grammar AST into parser
 * functions. Dispatches on the node's `id`; the relations object declares
 * `unicode` in terms of `char`, so such nodes are presumably handled by the
 * `char` implementation (see `defmulti` docs).
 *
 * All implementations receive `($, lang, opts, flags)`:
 * - `$`: AST node to compile
 * - `lang`: language object being populated (`rules`, `env` are read here)
 * - `opts`: grammar options (only `debug` is read here)
 * - `flags`: compile flags; `flags.discard` requests a parser variant which
 *   discards its result
 *
 * Node IDs without implementation are reported via `unsupported()`.
 */
const __compile = defmulti(
  (scope) => scope.id,
  {
    unicode: "char"
  },
  {
    [DEFAULT]: ($) => unsupported(`unknown op: ${$.id}`),
    // Compiles all rules of the grammar and registers them in `lang.rules`.
    root: ($, lang, opts, flags) => {
      // A grammar without any rule leaves no children behind (the `alt`
      // implementation guards against the same situation), so treat
      // missing children as an empty rule list instead of throwing.
      const rules = $.children?.[0]?.children ?? [];
      const builtins = new Set(Object.keys(lang.rules));
      const staticRules = /* @__PURE__ */ new Set();
      const dynamicRules = /* @__PURE__ */ new Set();
      for (const r of rules) {
        if (__hasDynRuleRefs(r, builtins)) {
          // Pre-register a placeholder so that references to this rule
          // can be resolved before the rule itself has been compiled
          lang.rules[__first(r).result] = dynamic();
          dynamicRules.add(r);
        } else {
          staticRules.add(r);
        }
      }
      // Rules only referring to builtins are compiled first
      for (const r of [...staticRules, ...dynamicRules]) {
        const id = __first(r).result;
        const parser = __compile(r, lang, opts, flags);
        if (dynamicRules.has(r)) {
          lang.rules[id].set(parser);
        } else {
          lang.rules[id] = parser;
        }
      }
      return lang;
    },
    // Compiles a single rule: its body terms plus the optional transform.
    rule: ($, lang, opts, flags) => {
      const [id, body, xf] = $.children;
      opts.debug && console.log(`rule: ${id.result}`, xf);
      const acc = [];
      for (const b of body.children) {
        const c = __compile(b, lang, opts, flags);
        c && acc.push(c);
      }
      let parser = acc.length > 1 ? seq(acc, id.result) : withID(id.result, acc[0]);
      if (xf.id === "sym") {
        // Named transform, looked up in the language environment
        const $xf = lang.env[xf.result];
        if (!$xf) illegalArgs(`missing xform: ${xf.result}`);
        parser = xform(parser, $xf);
      } else if (xf.id === "ref") {
        // Another rule used as transform (applied via `nest`)
        const $id = __first(xf).result;
        // `id` is the AST node of the rule name, so the name itself
        // (`id.result`) has to be compared, not the node object
        if ($id === id.result) illegalArgs(`self-referential: ${$id}`);
        const $xf = lang.rules[$id];
        if (!$xf) illegalArgs(`missing xform rule: ${$id}`);
        parser = nest(parser, $xf);
      } else if (xf.id === "string") {
        // String transform: handed to `xfReplace`
        parser = xform(parser, xfReplace(xf.result));
      }
      return parser;
    },
    // Resolves a rule reference to the already registered parser.
    ref: ($, lang, opts, flags) => {
      const id = __first($).result;
      opts.debug && console.log(`ref: ${id}`, flags);
      const ref = lang.rules[id];
      if (!ref) return illegalArgs(`invalid rule ref: ${id}`);
      return flags.discard ? discard(ref) : ref;
    },
    // Compiles a term incl. its repeat / discard / lookahead suffixes.
    term: ($, lang, opts, flags) => {
      const [term, repeat2, discard2, lookahead2] = $.children;
      opts.debug && console.log(`term: ${term.id}`, flags);
      return __compileRDL(
        (discard3) => __compile(term, lang, opts, { ...flags, discard: discard3 }),
        repeat2,
        discard2,
        lookahead2,
        lang,
        opts
      );
    },
    // Compiles the term inside a lookahead spec (repeat / discard only).
    lhterm: ($, lang, opts, flags) => {
      const [term, repeat2, discard2] = $.children;
      opts.debug && console.log(`lhterm: ${term.id}`);
      return __compileRD(
        (discard3) => __compile(term, lang, opts, { ...flags, discard: discard3 }),
        repeat2,
        discard2,
        opts
      );
    },
    // Compiles a group of alternatives incl. its suffixes.
    alt: ($, lang, opts, flags) => {
      opts.debug && console.log(`alt: ${$.id}`, flags);
      const [term0, { children: terms }, repeat2, disc, lookahead2] = $.children;
      const acc = [__compile(term0, lang, opts, flags)];
      // `terms` is falsy if the group only consists of a single term
      if (terms) {
        for (const c of terms) {
          acc.push(__compile(__first(c), lang, opts, flags));
        }
      }
      return __compileRDL(
        (optimize) => {
          if (optimize || flags.discard) {
            return acc.length > 1 ? altD(acc) : discard(acc[0]);
          }
          return acc.length > 1 ? alt(acc) : acc[0];
        },
        repeat2,
        disc,
        lookahead2,
        lang,
        opts
      );
    },
    // `.` wildcard
    any: (_, __, opts, flags) => {
      opts.debug && console.log(`any`, flags);
      return flags.discard ? alwaysD() : always("any");
    },
    // Single char literal
    char: ($, _, opts, flags) => {
      const x = $.result;
      opts.debug && console.log(`lit: '${x}'`, flags);
      return (flags.discard ? litD : lit)(x);
    },
    // String literal
    string: ($, _, opts, flags) => {
      const x = $.result;
      opts.debug && console.log(`string: "${x}"`, flags);
      return (flags.discard ? stringD : string)(x);
    },
    // Char range, using the results of the two child nodes as bounds
    charRange: ($, _, opts, flags) => {
      const [a, b] = $.children;
      opts.debug && console.log(`range: ${a.result} - ${b.result}`, flags);
      return (flags.discard ? rangeD : range)(a.result, b.result);
    },
    // Char selection (`[...]`), optionally inverted.
    charSel: ($, lang, opts, flags) => {
      opts.debug && console.log("charSel", flags);
      let parser;
      const children = __nth($, 1).children;
      if (children.length === 1) {
        parser = __compile(children[0], lang, opts, flags);
      } else {
        const onlyChars = children.every((x) => x.id === "char");
        if (onlyChars) {
          // Only single chars: use one `oneOf` for all of them. This
          // path does not go through `__compile`, so the discard flag
          // has to be honored explicitly (as done for `ref`).
          const chars = oneOf(children.map((x) => x.result).join(""));
          parser = flags.discard ? discard(chars) : chars;
        } else {
          parser = alt(
            children.map((c) => __compile(c, lang, opts, flags))
          );
        }
      }
      const invert = __first($).result;
      opts.debug && console.log(`invert: ${invert}`);
      return invert ? not(parser, flags.discard ? alwaysD() : always()) : parser;
    }
  }
);
/**
 * Wraps `parser` according to the given repeat spec node.
 *
 * - ID "repeat": result `?`, `*` or `+` selects `maybe`, `zeroOrMore` or
 *   `oneOrMore`
 * - ID "repeatN": result is read as `[n, sep, m]` and passed to `repeat`
 * - anything else: parser is returned unchanged
 *
 * @param parser - parser to wrap
 * @param rspec - repeat spec AST node
 * @param opts - grammar options (`debug` enables logging)
 */
const __compileRepeat = (parser, rspec, opts) => {
  if (opts.debug) console.log(`repeat: ${rspec.id}`);
  if (rspec.id === "repeatN") {
    const [min, comma, max] = rspec.result;
    // Without explicit (non-zero) upper bound: unbounded if a comma was
    // given, else exactly `min` repetitions
    const fallback = comma ? Infinity : min;
    return repeat(parser, min, max || fallback);
  }
  if (rspec.id !== "repeat") return parser;
  const op = rspec.result;
  if (op === "?") return maybe(parser);
  if (op === "*") return zeroOrMore(parser);
  if (op === "+") return oneOrMore(parser);
  return parser;
};
/**
 * Wraps `parser` in `discard()` if the given discard spec node has the
 * result `!`, else returns the parser unchanged.
 *
 * @param parser - parser to wrap
 * @param dspec - discard spec AST node
 * @param opts - grammar options (`debug` enables logging)
 */
const __compileDiscard = (parser, dspec, opts) => {
  if (opts.debug) console.log(`discard:`, dspec.result);
  if (dspec.result !== "!") return parser;
  return discard(parser);
};
/**
 * Wraps `parser` in `lookahead()` if `spec` is a lookahead spec (node ID
 * "lhspec"), else returns the parser unchanged.
 *
 * The spec's 2nd child is compiled (with empty flags) into the lookahead
 * parser; the 3rd argument given to `lookahead()` states whether the result
 * of the spec's 1st child is `+`.
 *
 * @param parser - main parser
 * @param spec - lookahead spec AST node
 * @param lang - language object, passed on to `__compile`
 * @param opts - grammar options (`debug` enables logging)
 */
const __compileLookahead = (parser, spec, lang, opts) => {
  if (opts.debug) console.log(`lookahead:`, spec.id);
  if (spec.id !== "lhspec") return parser;
  const ahead = __compile(__nth(spec, 1), lang, opts, {});
  const isPlus = __first(spec).result === "+";
  return lookahead(parser, ahead, isPlus);
};
/**
 * Applies the repeat & discard specs to a parser.
 *
 * `parser` is a factory function receiving a boolean. If a discard spec is
 * present without a repeat spec, the factory is called with `true` and its
 * result returned as is. Otherwise the factory is called with `false` and
 * the result is passed through `__compileRepeat` and `__compileDiscard`.
 *
 * @param parser - parser factory `(discard: boolean) => parser`
 * @param rspec - repeat spec AST node
 * @param dspec - discard spec AST node
 * @param opts - grammar options
 */
const __compileRD = (parser, rspec, dspec, opts) => {
  const isDiscardOnly = dspec.result != null && rspec.result == null;
  if (isDiscardOnly) return parser(true);
  const repeated = __compileRepeat(parser(false), rspec, opts);
  return __compileDiscard(repeated, dspec, opts);
};
/**
 * Applies the repeat, discard and lookahead specs to a parser: first
 * `__compileRD`, then `__compileLookahead` on its result.
 *
 * @param parser - parser factory, see `__compileRD`
 * @param rspec - repeat spec AST node
 * @param dspec - discard spec AST node
 * @param lhspec - lookahead spec AST node
 * @param lang - language object
 * @param opts - grammar options
 */
const __compileRDL = (parser, rspec, dspec, lhspec, lang, opts) => {
  const inner = __compileRD(parser, rspec, dspec, opts);
  return __compileLookahead(inner, lhspec, lang, opts);
};
/**
 * Parses the given grammar source with {@link GRAMMAR} and compiles the
 * resulting AST into a language object via `__compile`.
 *
 * @param rules - grammar source, handed to `defContext`
 * @param env - optional user defined rule transforms, merged over the
 * built-in ones (`binary`, `collect`, `count`, `discard`, `float`, `hex`,
 * `hoist`, `hoistR`, `int`, `join`, `json`, `print`, `trim`)
 * @param opts - optional options, merged over the defaults
 * `{ debug: false, optimize: true }`. If `debug` is enabled, the grammar
 * parser is wrapped in `print`.
 * @returns result of `__compile` for the root node, or `undefined` if the
 * grammar parser reports failure
 */
const defGrammar = (rules, env, opts) => {
  const $opts = { debug: false, optimize: true, ...opts };
  const $env = {
    binary: xfInt(2),
    collect: xfCollect,
    count: xfCount,
    discard: xfDiscard,
    float: xfFloat,
    hex: xfInt(16),
    hoist: xfHoist,
    hoistR: xfHoistResult,
    int: xfInt(10),
    join: xfJoin,
    json: xfJson,
    print: xfPrint(),
    trim: xfTrim,
    ...env
  };
  const ctx = defContext(rules);
  const parseGrammar = $opts.debug ? print(GRAMMAR) : GRAMMAR;
  if (!parseGrammar(ctx)) return;
  // Preset parsers which can be referenced by name from within grammar rules
  const builtinRules = {
    ALPHA_NUM,
    ALPHA,
    BIT,
    BINARY_UINT,
    DIGIT,
    DNL,
    END: inputEnd,
    ESC,
    FLOAT,
    HEX_DIGIT,
    HEX_UINT,
    INT,
    LEND: lineEnd,
    LSTART: lineStart,
    NL,
    SPACE,
    START: inputStart,
    STRING,
    UNICODE,
    UINT,
    WB: wordBoundary,
    WS,
    WS0,
    WS1
  };
  const lang = { env: $env, grammar: ctx, rules: builtinRules };
  return __compile(ctx.root, lang, $opts, {});
};
export {
GRAMMAR,
defGrammar
};