ecmarkup
Version:
Custom element definitions and core utilities for markup that specifies ECMAScript and related technologies.
691 lines (690 loc) • 29.7 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.isProsePart = isProsePart;
exports.parse = parse;
exports.walk = walk;
const header_parser_1 = require("./header-parser");
// Matches the first structural token inside a run of text. Exactly one named group is set per match:
//   olist / clist   - list open / close, written either literally or as the HTML entity
//   orec / crec     - record open / close braces
//   oparen / cparen - parentheses
//   and, is, comma  - separators
//   x_of            - a word followed by " of " (candidate syntax-directed operation call)
//   with_args       - " with argument " / " with arguments "
// The entity spellings `&laquo;` / `&raquo;` must be matched explicitly because text is matched in its raw,
// undecoded form; listing the literal character twice (`«|«`) left the entity form unrecognized.
const tokMatcher = /(?<olist>&laquo;|«)|(?<clist>&raquo;|»)|(?<orec>\{)|(?<crec>\})|(?<oparen>\()|(?<cparen>\))|(?<and>(?:, )?and )|(?<is> is )|(?<comma>,)|(?<x_of>\b\w+ of )|(?<with_args> with arguments? )/u;
// A period counts as a token only when followed by a space...
const periodSpaceMatcher = /(?<period>\.(?= ))/u;
// ...or, for the final text fragment, when followed by a space or the end of the text.
const periodSpaceOrEOFMatcher = /(?<period>\.(?= |$))/u;
// Names of the fragment kinds that are treated as plain prose rather than as structural tokens.
const PROSE_PART_NAMES = new Set([
  'text',
  'comment',
  'tag',
  'opaqueTag',
  'star',
  'underscore',
  'double-brackets',
  'tick',
  'tilde',
  'pipe',
]);
/**
 * Reports whether `tok` is a prose fragment.
 *
 * @param tok - a token or fragment; may be null or undefined
 * @returns {boolean} true when `tok` is non-nullish and its `name` is one of the prose kinds
 */
function isProsePart(tok) {
  return tok != null && PROSE_PART_NAMES.has(tok.name);
}
/**
 * Error thrown by the parser when the input is malformed.
 * `parse` catches it and converts it into a `{ name: 'failure', message, offset }` result.
 */
class ParseFailure extends Error {
  /**
   * @param {string} message - description of the problem
   * @param {number} offset - source offset at which the problem was detected
   */
  constructor(message, offset) {
    super(message);
    // Give the error its own name so it is distinguishable from a plain Error in logs and stack traces.
    this.name = 'ParseFailure';
    this.offset = offset;
  }
}
// Human-readable descriptions for token names that may terminate a sequence.
const CLOSE_DESCRIPTIONS = new Map([
  ['clist', 'list close'],
  ['crec', 'record close'],
  ['cparen', 'close parenthesis'],
  ['eof', 'end of line'],
  ['with_args', '"with argument(s)"'],
  ['comma', 'comma'],
  ['period', 'period'],
  ['and', '"and"'],
  ['is', '"is"'],
]);
/**
 * Renders a list of acceptable closing-token names as an English phrase joined with "or",
 * for use in error messages. Names without a known description are passed through unchanged.
 *
 * @param {string[]} close - token names
 * @returns whatever `formatEnglishList` produces for the described names
 */
function formatClose(close) {
  const described = close.map(kind => (CLOSE_DESCRIPTIONS.has(kind) ? CLOSE_DESCRIPTIONS.get(kind) : kind));
  return (0, header_parser_1.formatEnglishList)(described, 'or');
}
/**
 * Appends `token` to `items` as prose.
 *
 * A structural token (anything that is not a prose part) is first converted into a text fragment
 * built from its `source` and `offset`; this happens when the parser decides, after the fact, that
 * something it tokenized should have been ordinary text. A text fragment that starts exactly where
 * the preceding text fragment ends is merged into it instead of being appended.
 *
 * @param {Array} items - the sequence being built; modified in place
 * @param token - a prose fragment or a structural token
 */
function addProse(items, token) {
  if (!isProsePart(token)) {
    // re-enter with a text fragment so it gets the chance to merge with its predecessor
    addProse(items, {
      name: 'text',
      contents: token.source,
      location: {
        start: { offset: token.offset },
        end: { offset: token.offset + token.source.length },
      },
    });
    return;
  }
  const lastIndex = items.length - 1;
  const last = items[lastIndex];
  const mergeable = token.name === 'text' &&
    last != null &&
    last.name === 'text' &&
    // adjacency might not hold when e.g. tags were skipped between the two fragments
    last.location.end.offset === token.location.start.offset;
  if (!mergeable) {
    items.push(token);
    return;
  }
  items[lastIndex] = {
    name: 'text',
    contents: last.contents + token.contents,
    location: {
      start: { offset: last.location.start.offset },
      end: { offset: token.location.end.offset },
    },
  };
}
/**
 * Reports whether `x` is a text fragment with no non-whitespace characters
 * (an empty text fragment counts as whitespace).
 */
function isWhitespace(x) {
  if (x.name !== 'text') {
    return false;
  }
  return !/\S/.test(x.contents);
}
/**
 * Reports whether a sequence holds nothing but whitespace text fragments.
 * A sequence with no items is empty.
 */
function isEmpty(s) {
  const hasContent = s.items.some(item => !isProsePart(item) || !isWhitespace(item));
  return !hasContent;
}
/**
 * Reports whether any item of `s` contains a line break.
 * Precondition: `s` passes `isEmpty`, so every item is a text fragment with a `contents` string.
 */
function emptyThingHasNewline(s) {
  const containsLineBreak = ({ contents }) => contents.includes('\n');
  return s.items.some(containsLineBreak);
}
// Tag prefixes (lower case, without the terminating `>` or space) and the classification each yields.
const TAG_KINDS = [
  ['<del', 'open-del'],
  ['</del', 'close-del'],
  ['<figure', 'open-figure'],
  ['</figure', 'close-figure'],
];
/**
 * Classifies a `tag` fragment as an opening or closing `del` / `figure` tag.
 * The comparison is case-insensitive, and the tag name must be followed directly by `>` or a space.
 *
 * @param tok - any fragment
 * @returns {'open-del' | 'close-del' | 'open-figure' | 'close-figure' | null}
 *   null for non-tag fragments and for every other tag
 */
function getTagName(tok) {
  if (tok.name !== 'tag') {
    return null;
  }
  const lowered = tok.contents.toLowerCase();
  for (const [prefix, kind] of TAG_KINDS) {
    if (lowered.startsWith(`${prefix}>`) || lowered.startsWith(`${prefix} `)) {
      return kind;
    }
  }
  return null;
}
/**
 * Recursive-descent parser over a sequence of fragments (`src`).
 *
 * `advance` lazily turns the fragments into a queue (`this.next`) of prose fragments and structural
 * tokens of the form `{ name, offset, source }`; `parseSeq` consumes that queue and builds the tree.
 * Malformed input is reported by throwing `ParseFailure`.
 */
class ExprParser {
  /**
   * @param {Array} src - the fragments to parse
   * @param opNames - collection queried with `.has(name)` to decide whether `<name> of ` starts an SDO call
   */
  constructor(src, opNames) {
    this.srcIndex = 0;
    this.textTokOffset = null; // offset into current text node; only non-null if srcIndex points to a text node (but not conversely)
    this.next = [];
    this.src = src;
    this.opNames = opNames;
  }
  // returns the next queued token without consuming it, refilling the queue if it is empty
  peek() {
    if (this.next.length === 0) {
      this.advance();
    }
    return this.next[0];
  }
  // appends to `this.next` any pending prose followed by exactly one structural token (possibly `eof`)
  // this method is complicated because the underlying data is a sequence of ecmarkdown fragments, not a string
  advance() {
    var _a;
    const currentProse = [];
    const commitProse = () => {
      while (currentProse.length > 0) {
        const frag = currentProse.shift();
        this.next.push(frag);
      }
    };
    while (this.srcIndex < this.src.length) {
      let tok;
      if (this.textTokOffset == null) {
        tok = this.src[this.srcIndex];
      }
      else {
        // resume in the middle of a text node: synthesize a fragment for the unconsumed remainder
        const originalTok = this.src[this.srcIndex];
        const newContents = originalTok.contents.slice(this.textTokOffset);
        const newStart = originalTok.location.start.offset + this.textTokOffset;
        tok = {
          name: 'text',
          contents: newContents,
          location: {
            start: { offset: newStart },
            end: { offset: newStart + newContents.length },
          },
        };
      }
      let match = tok.name === 'text' ? tok.contents.match(tokMatcher) : null;
      if (tok.name === 'text' && match == null) {
        // in `foo.[[bar]]`, the `.` ends this text token, but should not be recognized as a period
        // but if `foo.` is the last token, it should be recognized as a period.
        const periodMatcher = this.srcIndex < this.src.length - 1 ? periodSpaceMatcher : periodSpaceOrEOFMatcher;
        match = tok.contents.match(periodMatcher);
      }
      // the `tok.name !== 'text'` part in the test below is redundant but makes TS happier
      if (tok.name !== 'text' || match == null) {
        // no structural token here: the fragment is prose, except that empty text, tags and comments are dropped
        const empty = (tok.name === 'text' && tok.contents.length === 0) ||
          tok.name === 'tag' ||
          tok.name === 'opaqueTag' ||
          tok.name === 'comment';
        if (!empty) {
          currentProse.push(tok);
        }
        ++this.srcIndex;
        this.textTokOffset = null;
        // skip anything in `<del>`
        const tagName = getTagName(tok);
        if (tagName === 'open-del') {
          while (this.srcIndex < this.src.length &&
            getTagName(this.src[this.srcIndex]) !== 'close-del') {
            ++this.srcIndex;
          }
        }
        else if (tagName === 'open-figure') {
          // a figure's contents are skipped too, but the figure itself is reported as a token
          while (this.srcIndex < this.src.length &&
            getTagName(this.src[this.srcIndex]) !== 'close-figure') {
            ++this.srcIndex;
          }
          commitProse();
          this.next.push({
            name: 'figure',
            offset: tok.location.start.offset,
            source: '',
          });
          return;
        }
        continue;
      }
      const { groups } = match;
      const before = tok.contents.slice(0, match.index);
      if (before.length > 0) {
        currentProse.push({
          name: 'text',
          contents: before,
          location: {
            start: tok.location.start,
            end: { offset: tok.location.start.offset + before.length },
          },
        });
      }
      // the name of the single capture group that participated in the match
      const matchKind = Object.keys(groups).find(x => groups[x] != null);
      commitProse();
      this.textTokOffset = ((_a = this.textTokOffset) !== null && _a !== void 0 ? _a : 0) + match.index + match[0].length;
      this.next.push({
        name: matchKind,
        offset: tok.location.start.offset + match.index,
        source: groups[matchKind],
      });
      return;
    }
    commitProse();
    this.next.push({
      name: 'eof',
      offset: this.src.length === 0 ? 0 : this.src[this.src.length - 1].location.end.offset,
      source: '',
    });
  }
  // consumes leading whitespace-only text fragments
  // returns true if this ate a newline
  eatWhitespace() {
    let next;
    let hadNewline = false;
    while (isProsePart((next = this.peek()))) {
      if (next.name === 'text' && !/\S/.test(next.contents)) {
        hadNewline || (hadNewline = next.contents.includes('\n'));
        this.next.shift();
      }
      else {
        break;
      }
    }
    return hadNewline;
  }
  // parses items until a token named in `close` is reached; that token is left unconsumed
  // guarantees the next token is an element of close
  // throws ParseFailure on malformed input
  parseSeq(close) {
    const items = [];
    while (true) {
      const next = this.peek();
      switch (next.name) {
        case 'and':
        case 'is':
        case 'period':
        case 'with_args':
        case 'comma': {
          if (!close.includes(next.name)) {
            // not a terminator in this context, so it is just prose
            addProse(items, next);
            this.next.shift();
            break;
          }
          if (items.length === 0) {
            throw new ParseFailure(`unexpected ${next.name} (expected some content for element/argument)`, next.offset);
          }
          return { name: 'seq', items };
        }
        case 'eof': {
          if (!close.includes('eof')) {
            throw new ParseFailure(`unexpected eof (expected ${formatClose(close)})`, next.offset);
          }
          return { name: 'seq', items };
        }
        case 'olist': {
          const startTok = this.next.shift();
          const elements = [];
          if (this.peek().name !== 'clist') {
            while (true) {
              elements.push(this.parseSeq(['clist', 'comma']));
              if (this.peek().name === 'clist') {
                break;
              }
              this.next.shift();
            }
          }
          if (elements.length > 0 && isEmpty(elements[elements.length - 1])) {
            if (elements.length === 1 || emptyThingHasNewline(elements[elements.length - 1])) {
              // allow trailing commas when followed by whitespace
              elements.pop();
            }
            else {
              throw new ParseFailure(`unexpected list close (expected some content for element)`, this.peek().offset);
            }
          }
          const endTok = this.next.shift(); // eat the clist
          items.push({
            name: 'list',
            elements,
            location: {
              start: { offset: startTok.offset },
              end: { offset: endTok.offset + endTok.source.length },
            },
          });
          break;
        }
        case 'clist': {
          if (!close.includes('clist')) {
            throw new ParseFailure('unexpected list close without corresponding list open', next.offset);
          }
          return { name: 'seq', items };
        }
        case 'oparen': {
          // scan backwards looking for stuff like `_foo_.bar`
          // stop at the first space character or structured item
          const callee = [];
          for (let i = items.length - 1; i >= 0; --i) {
            const ppart = items[i];
            if (!isProsePart(ppart)) {
              break;
            }
            if (ppart.name === 'text') {
              const { contents } = ppart;
              const spaceIndex = contents.lastIndexOf(' ');
              if (spaceIndex !== -1) {
                if (spaceIndex < contents.length - 1) {
                  const calleePart = contents.slice(spaceIndex + 1);
                  if (!/\p{Letter}/u.test(calleePart)) {
                    // e.g. -(x + 1)
                    break;
                  }
                  // split the fragment: everything up to and including the space stays prose
                  items[i] = {
                    name: 'text',
                    contents: contents.slice(0, spaceIndex + 1),
                    location: {
                      start: { offset: ppart.location.start.offset },
                      end: { offset: ppart.location.start.offset + spaceIndex + 1 },
                    },
                  };
                  const startOffset = ppart.location.start.offset + spaceIndex + 1;
                  // calleePart is nonempty because it matches \p{Letter}
                  callee.unshift({
                    name: 'text',
                    contents: calleePart,
                    location: {
                      start: { offset: startOffset },
                      end: { offset: startOffset + calleePart.length },
                    },
                  });
                }
                break;
              }
            }
            callee.unshift(ppart);
            items.pop();
          }
          if (callee.length > 0) {
            if (callee[0].name === 'text') {
              // check for -F(), which is negation of F() not an AO named -F
              const initialNonLetter = callee[0].contents.match(/^\P{Letter}+/u);
              if (initialNonLetter != null) {
                const head = callee[0];
                const prefix = initialNonLetter[0];
                const prefixStart = head.location.start.offset;
                // Build a fresh fragment rather than mutating `head`: its `location.start` object may be
                // shared with the source fragment, and shifting that would corrupt the offsets of every
                // later token cut from the same text node.
                // NOTE(review): the remainder is empty when the whole fragment is non-letters (e.g. a
                // text fragment `-` directly followed by an underscore fragment); kept as-is.
                callee[0] = {
                  name: 'text',
                  contents: head.contents.substring(prefix.length),
                  location: {
                    start: { offset: prefixStart + prefix.length },
                    end: { offset: head.location.end.offset },
                  },
                };
                // the stripped prefix (not the start of the name) goes back into the prose
                addProse(items, {
                  name: 'text',
                  contents: prefix,
                  location: {
                    start: { offset: prefixStart },
                    end: { offset: prefixStart + prefix.length },
                  },
                });
              }
            }
            this.next.shift();
            const args = [];
            if (this.peek().name !== 'cparen') {
              while (true) {
                args.push(this.parseSeq(['cparen', 'comma']));
                if (this.peek().name === 'cparen') {
                  break;
                }
                this.next.shift();
              }
            }
            if (args.length > 0 && isEmpty(args[args.length - 1])) {
              if (args.length === 1 || emptyThingHasNewline(args[args.length - 1])) {
                // allow trailing commas when followed by a newline
                args.pop();
              }
              else {
                throw new ParseFailure(`unexpected close parenthesis (expected some content for argument)`, this.peek().offset);
              }
            }
            const cParen = this.next.shift();
            items.push({
              name: 'call',
              callee,
              arguments: args,
              location: {
                start: { offset: callee[0].location.start.offset },
                end: { offset: cParen.offset + cParen.source.length },
              },
            });
          }
          else {
            // no callee in front of the parenthesis: a plain parenthesized group
            const oParen = this.next.shift();
            const parenContents = this.parseSeq(['cparen']).items;
            const cParen = this.next.shift();
            items.push({
              name: 'paren',
              items: parenContents,
              location: {
                start: { offset: oParen.offset },
                end: { offset: cParen.offset + cParen.source.length },
              },
            });
          }
          break;
        }
        case 'cparen': {
          if (!close.includes('cparen')) {
            throw new ParseFailure('unexpected close parenthesis without corresponding open parenthesis', next.offset);
          }
          return { name: 'seq', items };
        }
        case 'orec': {
          const oRecTok = this.next.shift();
          // 'record' when fields have values, 'record-spec' when they do not; fixed by the first field
          let type = null;
          const members = [];
          while (true) {
            const hadNewline = this.eatWhitespace();
            const nextTok = this.peek();
            if (nextTok.name === 'crec') {
              if (!hadNewline) {
                // ideally this would be a lint failure, or better yet a formatting thing, but whatever
                throw new ParseFailure(members.length > 0
                  ? 'trailing commas are only allowed when followed by a newline'
                  : 'records cannot be empty', nextTok.offset);
              }
              break;
            }
            if (!isProsePart(nextTok)) {
              throw new ParseFailure('expected to find record field name', nextTok.offset);
            }
            if (nextTok.name !== 'double-brackets') {
              const skipWs = nextTok.name === 'text' ? nextTok.contents.match(/^\s*/)[0].length : 0;
              throw new ParseFailure('expected to find record field name', nextTok.location.start.offset + skipWs);
            }
            const { contents: name } = nextTok;
            if (members.find(x => x.name === name)) {
              throw new ParseFailure(`duplicate record field name ${name}`, nextTok.location.start.offset + 2);
            }
            this.next.shift();
            const afterName = this.peek();
            const colonMatch = afterName.name === 'text' ? afterName.contents.match(/^\s*:/) : null;
            if (colonMatch != null) {
              // strip the leading `:` from the queued text so the value parses without it
              const afterNameAsText = afterName;
              const withoutColon = afterNameAsText.contents.slice(colonMatch[0].length);
              const offset = afterNameAsText.location.start.offset + colonMatch[0].length;
              this.next[0] = {
                name: 'text',
                contents: withoutColon,
                location: {
                  start: { offset },
                  end: { offset: offset + withoutColon.length },
                },
              };
              if (type == null) {
                type = 'record';
              }
              else if (type === 'record-spec') {
                throw new ParseFailure('record field has value but preceding field does not', offset - 1);
              }
              const value = this.parseSeq(['crec', 'comma']);
              if (value.items.length === 0) {
                throw new ParseFailure('expected record field to have value', offset);
              }
              members.push({ name, value });
            }
            else {
              if (type == null) {
                type = 'record-spec';
              }
              else if (type === 'record') {
                throw new ParseFailure('expected record field to have value', nextTok.location.end.offset);
              }
              members.push({ name });
              this.eatWhitespace();
              if (!['crec', 'comma'].includes(this.peek().name)) {
                throw new ParseFailure(`expected ${formatClose(['crec', 'comma'])}`, nextTok.location.end.offset);
              }
            }
            if (this.peek().name === 'crec') {
              break;
            }
            this.next.shift(); // eat the comma
          }
          const cRecTok = this.next.shift();
          // @ts-expect-error typing this properly is annoying
          items.push({
            name: type,
            members,
            location: {
              start: { offset: oRecTok.offset },
              end: { offset: cRecTok.offset + cRecTok.source.length },
            },
          });
          break;
        }
        case 'crec': {
          if (!close.includes('crec')) {
            throw new ParseFailure('unexpected end of record without corresponding start of record', next.offset);
          }
          return { name: 'seq', items };
        }
        case 'x_of': {
          this.next.shift();
          const callee = next.source.split(' ')[0];
          if (!this.opNames.has(callee)) {
            // not a known operation name, so `<word> of ` is ordinary prose
            addProse(items, next);
            break;
          }
          const parseNode = this.parseSeq([
            'eof',
            'period',
            'comma',
            'cparen',
            'clist',
            'crec',
            'with_args',
          ]);
          const args = [];
          if (this.peek().name === 'with_args') {
            this.next.shift();
            while (true) {
              args.push(this.parseSeq([
                'eof',
                'period',
                'and',
                'is',
                'comma',
                'cparen',
                'clist',
                'crec',
                'with_args',
              ]));
              if (!['and', 'comma'].includes(this.peek().name)) {
                break;
              }
              this.next.shift();
            }
          }
          const lastThing = args.length > 0 ? args[args.length - 1] : parseNode;
          if (lastThing.items.length === 0) {
            // e.g. `Foo of ` directly followed by eof or a closing delimiter: there is no last item
            // to take the end offset from, so report a parse failure instead of crashing below
            const after = this.peek();
            throw new ParseFailure(`unexpected ${after.name} (expected some content for element/argument)`, after.offset);
          }
          items.push({
            name: 'sdo-call',
            callee: [
              {
                name: 'text',
                contents: callee,
                location: {
                  start: { offset: next.offset },
                  end: { offset: next.offset + callee.length },
                },
              },
            ],
            parseNode,
            arguments: args,
            location: {
              start: { offset: next.offset },
              end: { offset: lastThing.items[lastThing.items.length - 1].location.end.offset },
            },
          });
          break;
        }
        case 'figure': {
          const tok = this.next.shift();
          items.push({
            name: 'figure',
            location: {
              start: { offset: tok.offset },
              end: { offset: tok.offset + tok.source.length },
            },
          });
          break;
        }
        default: {
          if (isProsePart(next)) {
            addProse(items, next);
            this.next.shift();
            break;
          }
          else {
            // @ts-expect-error
            throw new Error(`unreachable: unknown token type ${next.name}`);
          }
        }
      }
    }
  }
}
/**
 * Parses a sequence of fragments into an expression tree.
 *
 * Note: the result does not necessarily represent the entire input;
 * in particular it may omit some whitespace, tags, and comments.
 *
 * @param src - the fragments to parse
 * @param opNames - collection of operation names, queried with `.has(name)`
 * @returns a `seq` node on success, or `{ name: 'failure', message, offset }` when the input is malformed
 * @throws any error other than a ParseFailure is propagated unchanged
 */
function parse(src, opNames) {
  const parser = new ExprParser(src, opNames);
  try {
    return parser.parseSeq(['eof']);
  }
  catch (err) {
    if (!(err instanceof ParseFailure)) {
      throw err;
    }
    const { message, offset } = err;
    return { name: 'failure', message, offset };
  }
}
/**
 * Visits `current` and all of its descendants in pre-order, calling `f(node, path)` for each.
 *
 * `path` is a stack of `{ parent, index }` entries that is pushed to and popped from in place while
 * walking, so `f` must copy it if it wants to keep it. Entries are added for list elements, record
 * member values, call / SDO-call arguments and seq / paren items. Callee parts and the parse-node
 * operand of an SDO call are visited with their parent's path unchanged.
 *
 * NB: paths are currently missing the index for prose sequences
 * nothing needs this as yet so I haven't bothered finding a good way to represent it
 *
 * @param {(node: object, path: Array) => void} f - visitor
 * @param current - node to start from
 * @param {Array} [path] - path of `current`; defaults to an empty path
 * @throws {Error} when a node has an unrecognized `name`
 */
function walk(f, current, path = []) {
  f(current, path);
  switch (current.name) {
    case 'list': {
      for (let i = 0; i < current.elements.length; ++i) {
        path.push({ parent: current, index: i });
        walk(f, current.elements[i], path);
        path.pop();
      }
      break;
    }
    case 'record': {
      for (let i = 0; i < current.members.length; ++i) {
        path.push({ parent: current, index: i });
        walk(f, current.members[i].value, path);
        path.pop();
      }
      break;
    }
    case 'sdo-call': {
      for (const part of current.callee) {
        walk(f, part, path);
      }
      // the operand the operation is applied to is a child too; without this, anything nested in it
      // (calls, lists, records, ...) would never be visited
      walk(f, current.parseNode, path);
      for (let i = 0; i < current.arguments.length; ++i) {
        path.push({ parent: current, index: i });
        walk(f, current.arguments[i], path);
        path.pop();
      }
      break;
    }
    case 'call': {
      for (const part of current.callee) {
        walk(f, part, path);
      }
      for (let i = 0; i < current.arguments.length; ++i) {
        path.push({ parent: current, index: i });
        walk(f, current.arguments[i], path);
        path.pop();
      }
      break;
    }
    case 'paren':
    case 'seq': {
      for (let i = 0; i < current.items.length; ++i) {
        path.push({ parent: current, index: i });
        walk(f, current.items[i], path);
        path.pop();
      }
      break;
    }
    // leaves: nothing to descend into
    case 'underscore':
    case 'double-brackets':
    case 'comment':
    case 'figure':
    case 'opaqueTag':
    case 'pipe':
    case 'record-spec':
    case 'star':
    case 'tag':
    case 'text':
    case 'tick':
    case 'tilde': {
      break;
    }
    default: {
      // @ts-expect-error
      throw new Error(`unreachable: unknown expression node type ${current.name}`);
    }
  }
}