UNPKG

shelving

Version:

Toolkit for using data in JavaScript.

github.com/dhoulb/shelving

dhoulb/shelving

255 lines (254 loc) • 11.3 kB

JavaScript

import { getLink } from "../util/link.js";
import { HTTP_SCHEMES } from "../util/uri.js";
import { MARKUP_RULES } from "./rule/index.js";

export class MarkupParser {
    /**
     * The list of parsing rules this parser applies.
     */
    rules;
    /**
     * Calculated list of priorities to iterate over (extracted from the rules), e.g. [10, 0, -10]
     */
    priorities;
    /**
     * Set the `rel=""` property used for any links (e.g. `rel="nofollow ugc"`).
     * @example "nofollow ugc"
     */
    rel;
    /**
     * Current page URL — used as the base for resolving relative refs (`./foo`, `#x`, bare segments) in link hrefs.
     * @default Falls back to `root` if not set.
     */
    url;
    /**
     * Site root URL — used as the base for resolving site-absolute path hrefs (`/foo`), honoring its subfolder.
     * @default Falls back to `url` if not set.
     */
    root;
    /**
     * Valid URI schemes/protocols for URLs and URIs.
     * @example ["http:", "https:"]
     * @default ["http:", "https:"]
     */
    schemes;
    /**
     * Default context used by `parse()` when the caller does not pass one. Defaults to `"block"`
     */
    context;

    /**
     * @param options Optional settings; every property is optional.
     * - `rules` defaults to `MARKUP_RULES`, `schemes` to `HTTP_SCHEMES`, and `context` to `"block"`.
     * - `rel`, `url` and `root` are stored as given (possibly `undefined`).
     */
    constructor({ rules = MARKUP_RULES, rel, url, root, schemes = HTTP_SCHEMES, context = "block" } = {}) {
        this.rules = rules;
        this.priorities = _getPriorities(rules);
        this.rel = rel;
        this.url = url;
        this.root = root;
        this.schemes = schemes;
        this.context = context;
    }

    /**
     * Parse a text string as Markdownish markup syntax and render it as elements.
     * - Syntax is not defined by this code, but by the rules supplied to it.
     *
     * @param input The string content possibly containing markup syntax, e.g. "This is a *bold* string."
     * @param context The context to render in (defaults to this parser's `context`, which itself defaults to `"block"`).
     *
     * @returns A React node — an element, a string, `null`, or an array of zero or more of those.
     */
    parse(input, context = this.context) {
        const nodes = _parseNodes(input, this, context);
        // Collapse the list: nothing → `null`, a single node → that node, otherwise the array.
        return !nodes.length ? null : nodes.length === 1 ? nodes[0] : nodes;
    }

    /** Yield the rules active in `context` that sit in the given priority tier. */
    *getRules(context, priority) {
        for (const r of this.rules) if (r.priority === priority && r.contexts.includes(context)) yield r;
    }

    /**
     * Get a HREF link resolved against this parser's `url` and `root`.
     *
     * @returns The link object returned by the `getLink()` helper, if there is one and its `protocol` is in `schemes`.
     * @returns `undefined` if no link was produced or it does not match the allowed `schemes`.
     */
    getLink(href) {
        const link = getLink(href, this.url, this.root);
        if (link && this.schemes.includes(link.protocol)) return link;
        return undefined;
    }
}

/**
 * Extract the unique rule priorities, ordered highest first — these are the tiers to resolve in turn.
 * - Sorted with a numeric comparator: the default `sort()` compares values as strings, which would
 *   order e.g. `[5, 10, 0]` as `[5, 10, 0]` after reversing rather than `[10, 5, 0]`.
 */
function _getPriorities(rules) {
    const unique = new Set();
    for (const { priority } of rules) unique.add(priority);
    return Array.from(unique).sort((a, b) => b - a);
}

/**
 * Parse a string into its rendered nodes using a tiered / masking engine.
 *
 * Rules are grouped into priority tiers and resolved highest tier first. Once a tier claims a
 * region it is "masked" (blanked in a working copy of the string) so lower-priority rules cannot
 * match into — or across — it, but can still match around it. That single mechanism is the fix
 * for code spans that straddle link delimiters: a code span is masked before links resolve, so a
 * link either wraps a whole masked code span (and re-parses it) or cannot form at all.
 *
 * Rules own the recursion into their own children — they call `parser.parse` again, optionally
 * with a different context — so the engine never reaches inside a rule's content itself.
 *
 * @param input The original (unmasked) text.
 * @param parser The parser supplying `priorities` and `getRules()`, and passed through to `rule.render()`.
 * @param context The context whose rules are applied.
 * @returns Array of raw text slices and rendered nodes, in source order.
 */
function _parseNodes(input, parser, context) {
    // Resolve tier by tier. `claimed` collects the winning regions; `masked` hides each resolved
    // region from every lower tier.
    const claimed = [];
    let masked = input;
    for (const priority of parser.priorities) {
        const higher = Array.from(claimed); // Snapshot — claims from already-resolved (higher) tiers.
        // The scanner receives the value `masked` had when this tier started; the reassignment
        // below only affects what later (lower) tiers see.
        for (const claim of _scanTier(masked, input, parser, context, priority, higher)) {
            // A claim that wraps earlier (higher-tier) claims absorbs them: the wrapper re-parses
            // that text itself (e.g. a link re-parsing its own title).
            for (let i = claimed.length - 1; i >= 0; i--) {
                const c = claimed[i];
                if (c && c.start >= claim.start && c.end <= claim.end) claimed.splice(i, 1);
            }
            claimed.push(claim);
            masked = _mask(masked, claim.start, claim.end);
        }
    }

    // Walk left to right: raw text fills the gaps, rendered elements fill the claims. Each claim
    // already carries the capture groups recovered against the original text, so no rule's regexp
    // needs to run a second time here.
    claimed.sort((a, b) => a.start - b.start);
    const nodes = [];
    let pos = 0;
    for (const { rule, start, end, groups } of claimed) {
        if (start > pos) nodes.push(input.slice(pos, start));
        // The claim's start offset (as a string) is the first argument handed to `render()`.
        nodes.push(rule.render(start.toString(), groups, parser));
        pos = end;
    }
    if (pos < input.length) nodes.push(input.slice(pos));
    return nodes;
}

/**
 * Yield one tier's winning claims, left to right, non-overlapping.
 *
 * "Leftmost wins" needs every rule's next match up front — but instead of throwing the losers
 * away we cache one match per rule, and only recompute a rule's match when the chosen claim
 * actually invalidated it. A rule whose match still lies ahead is reused untouched.
 *
 * @param masked Working copy of the text with higher-tier claims blanked.
 * @param input The original (unmasked) text.
 * @param parser The parser whose `getRules()` supplies this tier's rules.
 * @param context The context whose rules are applied.
 * @param priority The priority tier to scan.
 * @param higher Claims already made by higher tiers.
 */
function* _scanTier(masked, input, parser, context, priority, higher) {
    // Materialise this tier's rules.
    const rules = Array.from(parser.getRules(context, priority));

    // Prime one cached match per rule.
    const cache = [];
    for (const rule of rules) cache.push(_findFrom(rule, masked, input, 0, higher));

    for (;;) {
        // The leftmost cached match wins; on a tie the earlier rule in the list wins
        // (strict `<` never replaces an equal-start candidate).
        let best;
        for (const claim of cache) if (claim && (!best || claim.start < best.start)) best = claim;
        if (!best) return;
        yield best;

        // Keep every cached match still ahead of `best`; recompute only the rules whose match
        // overlapped (or was) `best`.
        for (let i = 0; i < rules.length; i++) {
            const rule = rules[i];
            const claim = cache[i];
            if (rule && (!claim || claim.start < best.end)) cache[i] = _findFrom(rule, masked, input, best.end, higher);
        }
    }
}

/**
 * Find the first valid claim for `rule` in `masked` at or after `from`, or `undefined`.
 * - A claim is valid if it sits in free space, or if it genuinely wraps the higher-tier claims it spans.
 * - Confirmed by re-running the rule on the original (unmasked) slice and checking it still matches the whole region.
 * - A match that merely straddles a masked region (e.g. a paragraph's trailing whitespace swallowing a fenced block) is spurious and is retried bounded by the claim it crossed.
 *
 * @param rule The rule to match; its `regexp` is executed against slices of the text.
 * @param masked Working copy of the text with higher-tier claims blanked.
 * @param input The original (unmasked) text.
 * @param from Offset to start searching from.
 * @param higher Claims already made by higher tiers.
 * @returns `{ rule, start, end, groups }` with offsets relative to the whole text, or `undefined`.
 */
function _findFrom(rule, masked, input, from, higher) {
    let lo = from;
    for (;;) {
        // Advance `lo` past any higher-tier claim that covers it.
        for (let moved = true; moved; ) {
            moved = false;
            for (const h of higher)
                if (h.start <= lo && lo < h.end) {
                    lo = h.end;
                    moved = true;
                }
        }
        if (lo >= masked.length) return undefined;

        const match = rule.regexp.exec(masked.slice(lo));
        if (!match) return undefined;
        // `match.index` is relative to the slice, so shift it back to whole-text offsets.
        const start = lo + match.index;
        const end = start + match[0].length;

        // Inspect the higher-tier claims this match touches in a single allocation-free pass:
        // - `leadingEnd` — a claim starts at/before the match, so the match cannot begin here.
        // - `interior` — every spanned claim sits strictly inside (a possible genuine wrapper).
        // - `wallStart`/`wallEnd` — the first claim the match crosses, used to bound a retry.
        let overlaps = false;
        let leadingEnd = -1;
        let interior = true;
        let wallStart = masked.length;
        let wallEnd = -1;
        for (const h of higher) {
            if (h.start < end && h.end > start) {
                overlaps = true;
                if (h.start <= start) {
                    if (h.end > leadingEnd) leadingEnd = h.end;
                } else if (h.end >= end) {
                    interior = false;
                }
                if (h.start < wallStart) {
                    wallStart = h.start;
                    wallEnd = h.end;
                }
            }
        }

        if (!overlaps) return { rule, start, end, groups: match.groups };

        // A higher claim starts at or before this match — it cannot begin here, skip past it.
        if (leadingEnd >= 0) {
            lo = leadingEnd;
            continue;
        }

        // A genuine wrapper holds every spanned claim strictly inside it (delimiters of its own on
        // both sides) and still matches the whole region when re-run on the original (unmasked)
        // slice. A claim that merely shares a boundary — a paragraph whose trailing whitespace
        // swallows the block below it — is spurious, not a wrapper.
        if (interior) {
            const original = rule.regexp.exec(input.slice(start, end));
            if (original && !original.index && original[0].length === end - start) return { rule, start, end, groups: original.groups };
        }

        // Spurious span — retry bounded by the first claim it crossed.
        const bounded = rule.regexp.exec(masked.slice(lo, wallStart));
        if (bounded) {
            const s = lo + bounded.index;
            return { rule, start: s, end: s + bounded[0].length, groups: bounded.groups };
        }
        // Nothing fits before the wall: resume the search just after it.
        lo = wallEnd;
    }
}

// Placeholder a claimed region is blanked to: non-whitespace and non-word, so lower tiers see a
// masked region as opaque *content* rather than whitespace. Blanking to a space made a leading or
// trailing code span look like whitespace to inline emphasis (which rejects whitespace at its
// start/end), so `**`code`**` failed to form. Newlines are kept so block structure survives.
const _MASK_CHAR = "\u0000";

/**
 * Blank the `[start, end)` region of `text` — every character becomes `_MASK_CHAR`, except newlines.
 *
 * @returns A new string; text outside the region is copied unchanged.
 */
function _mask(text, start, end) {
    let blanked = "";
    for (let i = start; i < end; i++) blanked += text[i] === "\n" ? "\n" : _MASK_CHAR;
    return `${text.slice(0, start)}${blanked}${text.slice(end)}`;
}

/** MarkupParser sentinel with the default markup rules */
export const MARKUP_PARSER = new MarkupParser();