shelving
Version:
Toolkit for using data in JavaScript.
255 lines (254 loc) • 11.3 kB
JavaScript
import { getLink } from "../util/link.js";
import { HTTP_SCHEMES } from "../util/uri.js";
import { MARKUP_RULES } from "./rule/index.js";
/**
 * Parser that turns a markup string into rendered nodes.
 * - The syntax is not defined here: it comes entirely from the `rules` the parser is constructed with.
 * - The instance also carries the link settings (`rel`, `url`, `root`, `schemes`) that rules can read while rendering.
 */
export class MarkupParser {
  /**
   * The list of parsing rules this parser applies.
   */
  rules;
  /**
   * Calculated list of priorities to iterate over (extracted from the rules), e.g. [10, 0, -10]
   */
  priorities;
  /**
   * Set the `rel=""` property used for any links (e.g. `rel="nofollow ugc"`).
   * @example "nofollow ugc"
   */
  rel;
  /**
   * Current page URL — used as the base for resolving relative refs (`./foo`, `#x`, bare segments) in link hrefs.
   * - Stored as given and passed straight to the imported `getLink()` helper; any fallback to `root` is left to that helper.
   */
  url;
  /**
   * Site root URL — used as the base for resolving site-absolute path hrefs (`/foo`), honoring its subfolder.
   * - Stored as given and passed straight to the imported `getLink()` helper; any fallback to `url` is left to that helper.
   */
  root;
  /**
   * Valid URI schemes/protocols for URLs and URIs.
   * @example ["http:", "https:"]
   * @default HTTP_SCHEMES
   */
  schemes;
  /**
   * Default context used by `parse()` when the caller does not pass one. Defaults to `"block"`
   */
  context;
  /**
   * @param options Optional settings; every property is optional.
   * - `rules` defaults to `MARKUP_RULES`, `schemes` defaults to `HTTP_SCHEMES`, `context` defaults to `"block"`.
   * - `rel`, `url` and `root` are stored as given (possibly `undefined`).
   */
  constructor({ rules = MARKUP_RULES, rel, url, root, schemes = HTTP_SCHEMES, context = "block" } = {}) {
    this.rules = rules;
    // The priority tiers are derived once, up front, so parsing does not recompute them per call.
    this.priorities = _getPriorities(rules);
    this.rel = rel;
    this.url = url;
    this.root = root;
    this.schemes = schemes;
    this.context = context;
  }
  /**
   * Parse a text string as Markdownish markup syntax and render it as nodes.
   * - Syntax is not defined by this code, but by the rules supplied to it.
   *
   * @param input The string content possibly containing markup syntax, e.g. "This is a *bold* string."
   * @param context The context to render in (defaults to this parser's configured `context`, which is `"block"` unless overridden in the constructor).
   *
   * @returns `null` when no nodes were produced, the single node when exactly one was produced, otherwise the array of nodes.
   */
  parse(input, context = this.context) {
    const nodes = _parseNodes(input, this, context);
    if (nodes.length === 0) return null;
    return nodes.length === 1 ? nodes[0] : nodes;
  }
  /** Yield the rules active in `context` that sit in the given priority tier. */
  *getRules(context, priority) {
    for (const r of this.rules) {
      if (r.priority === priority && r.contexts.includes(context)) yield r;
    }
  }
  /**
   * Resolve an href against this parser's `url` and `root` and check its scheme.
   *
   * @param href The raw href to resolve.
   * @returns The link object produced by the imported `getLink()` helper, when it is truthy and its `protocol` is listed in `this.schemes`.
   * @returns `undefined` if the helper produced nothing, or the link does not match the allowed `schemes`.
   */
  getLink(href) {
    // Inside a method body the bare name `getLink` is the imported helper, not this method.
    const link = getLink(href, this.url, this.root);
    if (link && this.schemes.includes(link.protocol)) return link;
    return undefined;
  }
}
/**
 * Extract the unique rule priorities, ordered highest first — these are the tiers to resolve in turn.
 *
 * @param rules Iterable of rule objects; only each rule's `priority` is read.
 * @returns A new array of the distinct priorities in descending numeric order, e.g. `[10, 0, -10]`.
 */
function _getPriorities(rules) {
  const unique = new Set();
  for (const { priority } of rules) unique.add(priority);
  // An explicit numeric comparator is required: the default `sort()` compares values as strings,
  // which would order `[1, 10, 2]` as `1, 10, 2` and so resolve tiers in the wrong order.
  return [...unique].sort((a, b) => b - a);
}
/**
 * Turn `input` into its list of rendered nodes using a tiered / masking engine.
 *
 * Tiers are taken from `parser.priorities` in the order given. Every region a tier claims is
 * blanked ("masked") in a working copy of the string before the next tier runs, so later tiers
 * cannot match into or across it but can still match around it. A later claim that fully
 * contains earlier claims replaces them — that wrapper's rule is responsible for re-parsing its
 * own content (rules recurse by calling back into the parser; this function never does).
 *
 * @param input The original, unmasked text.
 * @param parser The parser; `priorities` is read here and the parser is handed on to `_scanTier` and to each `rule.render`.
 * @param context The context passed through to `_scanTier`.
 * @returns An array mixing raw text slices of `input` with whatever each winning rule's `render` returned, in left-to-right order.
 */
function _parseNodes(input, parser, context) {
  // `working` is the progressively blanked copy; `regions` holds the surviving claims so far.
  let working = input;
  let regions = [];
  for (const tier of parser.priorities) {
    // Freeze what the earlier tiers settled on, so this tier's scan sees a stable list.
    const settled = [...regions];
    for (const winner of _scanTier(working, input, parser, context, tier, settled)) {
      // Drop every existing claim that lies wholly within the new one — the wrapper supersedes them.
      regions = regions.filter((held) => !(held && held.start >= winner.start && held.end <= winner.end));
      regions.push(winner);
      working = _mask(working, winner.start, winner.end);
    }
  }
  // Emit in document order: plain text for the gaps, a rendered node for each claim. The capture
  // groups travel with the claim, so no regexp is executed again at this stage.
  regions.sort((left, right) => left.start - right.start);
  const output = [];
  let cursor = 0;
  for (const region of regions) {
    if (cursor < region.start) output.push(input.slice(cursor, region.start));
    // The claim's start offset (as a string) is passed as the first render argument.
    output.push(region.rule.render(String(region.start), region.groups, parser));
    cursor = region.end;
  }
  if (cursor < input.length) output.push(input.slice(cursor));
  return output;
}
/**
 * Generate the winning claims of a single priority tier, in left-to-right order.
 *
 * Each rule in the tier keeps exactly one pending match. The pending match with the smallest
 * `start` wins each round (the earlier rule wins a tie). After a win, only rules that have no
 * pending match, or whose pending match starts before the winner's end, are searched again —
 * from the winner's end. Pending matches that lie further ahead are reused as they are.
 *
 * @param masked The working text with already-claimed regions blanked.
 * @param input The original text.
 * @param parser The parser; its `getRules(context, priority)` supplies this tier's rules.
 * @param context The context whose rules apply.
 * @param priority The tier being resolved.
 * @param higher Claims from tiers resolved earlier, forwarded to `_findFrom`.
 * @yields Claim objects as returned by `_findFrom`.
 */
function* _scanTier(masked, input, parser, context, priority, higher) {
  // Collect the tier's rules once so they can be addressed by index alongside their pending match.
  const tierRules = [...parser.getRules(context, priority)];
  const pending = tierRules.map((rule) => _findFrom(rule, masked, input, 0, higher));
  while (true) {
    // Pick the pending match that starts earliest; strict `<` keeps the earlier rule on a tie.
    let winner;
    for (const candidate of pending) {
      if (!candidate) continue;
      if (!winner || candidate.start < winner.start) winner = candidate;
    }
    if (!winner) return;
    yield winner;
    // Refresh only what the winner invalidated (including the winner's own slot).
    const resumeAt = winner.end;
    for (const [slot, rule] of tierRules.entries()) {
      if (!rule) continue;
      const held = pending[slot];
      const stillAhead = held && held.start >= resumeAt;
      if (!stillAhead) pending[slot] = _findFrom(rule, masked, input, resumeAt, higher);
    }
  }
}
/**
 * Locate the first acceptable claim for `rule` in `masked`, searching at or after `from`.
 *
 * A candidate match is accepted when:
 * - it touches no higher-tier claim at all, or
 * - every higher-tier claim it touches lies strictly inside it, and the rule still matches
 *   exactly that whole region when run on the original (unmasked) slice — a genuine wrapper.
 *
 * A candidate that begins at or inside a higher-tier claim is skipped past that claim. Any other
 * candidate (one that merely runs into or across a claim) is treated as spurious: the search is
 * retried on the text that ends where the first crossed claim begins, and if that fails it
 * resumes after that claim.
 *
 * @param rule The rule; only `rule.regexp` is used, and the rule is echoed back in the result.
 * @param masked The working text with claimed regions blanked.
 * @param input The original text, same offsets as `masked`.
 * @param from Offset to start searching at.
 * @param higher Claims (`{ start, end }`) from tiers resolved earlier.
 * @returns `{ rule, start, end, groups }` for the claim, or `undefined` when nothing acceptable remains.
 */
function _findFrom(rule, masked, input, from, higher) {
  let cursor = from;
  while (true) {
    // Never start searching inside a claimed region: hop to the end of whichever claim covers the cursor, repeatedly.
    let covering = higher.find((claim) => claim.start <= cursor && cursor < claim.end);
    while (covering) {
      cursor = covering.end;
      covering = higher.find((claim) => claim.start <= cursor && cursor < claim.end);
    }
    if (cursor >= masked.length) return undefined;
    const hit = rule.regexp.exec(masked.slice(cursor));
    if (!hit) return undefined;
    const start = cursor + hit.index;
    const end = start + hit[0].length;
    // One pass over the higher-tier claims, no intermediate arrays:
    // - `skipTo` — furthest end of any touched claim that begins at/before the match start.
    // - `enclosesAll` — stays true only while every other touched claim ends before the match end.
    // - `firstStart`/`firstEnd` — the touched claim with the smallest start, used to bound the retry.
    let touched = false;
    let skipTo = -1;
    let enclosesAll = true;
    let firstStart = masked.length;
    let firstEnd = -1;
    for (const claim of higher) {
      if (!(claim.start < end && claim.end > start)) continue;
      touched = true;
      if (claim.start <= start) {
        if (claim.end > skipTo) skipTo = claim.end;
      } else if (claim.end >= end) {
        enclosesAll = false;
      }
      if (claim.start < firstStart) {
        firstStart = claim.start;
        firstEnd = claim.end;
      }
    }
    // Free space — take the match as it is.
    if (!touched) return { rule, start, end, groups: hit.groups };
    // The match begins at or inside a claimed region, so it cannot start here; move on past that region.
    if (skipTo >= 0) {
      cursor = skipTo;
      continue;
    }
    // Possible wrapper: confirm against the real text, requiring a match of the entire region,
    // and take the capture groups from that confirmation rather than from the masked text.
    if (enclosesAll) {
      const confirmed = rule.regexp.exec(input.slice(start, end));
      if (confirmed && confirmed.index === 0 && confirmed[0].length === end - start) {
        return { rule, start, end, groups: confirmed.groups };
      }
    }
    // Spurious: look again, but only in the text before the first claim that was crossed.
    const clipped = rule.regexp.exec(masked.slice(cursor, firstStart));
    if (clipped) {
      const clippedStart = cursor + clipped.index;
      return { rule, start: clippedStart, end: clippedStart + clipped[0].length, groups: clipped.groups };
    }
    // Nothing before that claim either — continue the search after it.
    cursor = firstEnd;
  }
}
// Character that stands in for every claimed character. It is neither whitespace nor a word
// character, so lower tiers read a masked region as opaque *content*. A space was tried and
// rejected: inline emphasis refuses whitespace at its edges, so a code span at the start or end
// of emphasis made `**`code`**` fail to form. Newlines are left alone to preserve block structure.
const _MASK_CHAR = "\u0000";
/**
 * Return a copy of `text` in which each character in `[start, end)` is replaced by `_MASK_CHAR`, newlines excepted.
 *
 * @param text The string to blank part of.
 * @param start First offset to blank (inclusive).
 * @param end Offset to stop at (exclusive).
 * @returns The text before `start`, the blanked run, then the text from `end` onward.
 */
function _mask(text, start, end) {
  const pieces = [text.slice(0, start)];
  for (let at = start; at < end; at += 1) {
    pieces.push(text[at] === "\n" ? "\n" : _MASK_CHAR);
  }
  pieces.push(text.slice(end));
  return pieces.join("");
}
/** MarkupParser sentinel with the default markup rules */
export const MARKUP_PARSER = new MarkupParser();