// @xml-tools/parser
// Version:
// XML Parser Implemented in JavaScript
// 207 lines (173 loc) • 4.73 kB
// JavaScript
const { createToken: createTokenOrg, Lexer } = require("chevrotain");
// A little mini DSL for easier lexer definition.
// Registry of reusable regular-expression sources, keyed by fragment name.
const fragments = {};
// Short alias so fragments read compactly inside template literals (`f.Name`).
const f = fragments;

/**
 * Registers a named pattern fragment for later interpolation.
 *
 * @param {string} name - Key under which the fragment is stored in `fragments`.
 * @param {string|RegExp} def - A string is stored verbatim; for anything else
 *   the value's `.source` property is stored.
 */
function FRAGMENT(name, def) {
  if (typeof def === "string") {
    fragments[name] = def;
    return;
  }
  fragments[name] = def.source;
}
/**
 * Template-literal tag that assembles a RegExp from literal pattern text and
 * interpolated fragments.
 *
 * Each interpolated fragment is wrapped in a non-capturing group so that
 * quantifiers and assertions written right after the interpolation apply to
 * the whole fragment rather than to its last atom.
 *
 * @param {string[]} strings - Literal (cooked) segments of the template.
 * @param {...(string|RegExp)} args - Interpolated fragments. A RegExp
 *   contributes its `.source` (its flags are not carried over); any other
 *   value is stringified.
 * @returns {RegExp} The combined pattern, created without flags.
 * @throws {TypeError} If an interpolated fragment is `null` or `undefined`,
 *   e.g. because of a misspelled fragment name.
 */
function makePattern(strings, ...args) {
  let combined = "";
  strings.forEach((literal, i) => {
    combined += literal;
    if (i >= args.length) {
      return;
    }
    const fragment = args[i];
    if (fragment == null) {
      // Fail loudly instead of silently matching the literal text "undefined".
      throw new TypeError(
        `makePattern: interpolated fragment #${i} is ${fragment}`
      );
    }
    // Stringifying a RegExp would include its `/.../` delimiters in the
    // pattern, so take the bare source instead.
    const fragmentSource =
      fragment instanceof RegExp ? fragment.source : fragment;
    combined += `(?:${fragmentSource})`;
  });
  return new RegExp(combined);
}
// Every token type created through `createToken`, in creation order.
const tokensArray = [];
// The same token types, indexed by their `name` option.
const tokensDictionary = {};

/**
 * Thin wrapper around Chevrotain's `createToken` that also records the new
 * token type in `tokensArray` and `tokensDictionary`.
 *
 * @param {object} options - Passed through unchanged to Chevrotain's
 *   `createToken`; `options.name` is used as the dictionary key.
 * @returns {object} The token type returned by Chevrotain.
 */
function createToken(options) {
  const tokenType = createTokenOrg(options);
  tokensDictionary[options.name] = tokenType;
  tokensArray.push(tokenType);
  return tokenType;
}
// First character of an XML name.
// All ranges must sit inside the character class: written as a bare
// alternative, `\u2070-\u218F` only matches the literal three-character
// sequence U+2070, "-", U+218F instead of any character in that range.
// NOTE(review): the XML 1.0 `NameStartChar` production lists further ranges
// (e.g. #xC0-#xD6, #x370-#x37D) that are not included here — confirm whether
// that omission is intentional.
FRAGMENT(
  "NameStartChar",
  "[:a-zA-Z_\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD]"
);
// Any character allowed after the first one: a name-start character, or one
// of "-", ".", a digit, U+00B7, or the two combining/extender ranges.
// (The stray empty alternative of the previous definition is gone.)
FRAGMENT(
  "NameChar",
  makePattern`${f.NameStartChar}|[-.\\d\\u00B7\\u0300-\\u036F\\u203F-\\u2040]`
);
// A full name: one start character followed by any number of name characters.
FRAGMENT("Name", makePattern`${f.NameStartChar}(${f.NameChar})*`);
// XML comment: `<!--`, then the shortest run of characters up to `-->`.
const Comment = createToken({
  name: "Comment",
  // `[\s\S]` matches every character. The previous `(.|\r?\n)` could not
  // match a lone "\r", U+2028 or U+2029 (which `.` excludes), so a comment
  // containing one of those failed to tokenize.
  pattern: /<!--[\s\S]*?-->/,
  // A Comment may span multiple lines.
  line_breaks: true,
});
// CDATA section: `<![CDATA[`, then the shortest run of characters up to `]]>`.
const CData = createToken({
  name: "CData",
  // `[\s\S]` matches every character. The previous `(.|\r?\n)` could not
  // match a lone "\r", U+2028 or U+2029 (which `.` excludes), so a CDATA
  // section containing one of those failed to tokenize.
  pattern: /<!\[CDATA\[[\s\S]*?]]>/,
  // A CDATA section may span multiple lines.
  line_breaks: true,
});
const DocType = createToken({
name: "DocType",
pattern: /john</from>
// - The second '<' will be skipped because in the mode "INSIDE" '<' is not recognized.
// - This means the AST will include only a single element instead of two
// A '<' encountered while the lexer is still in `INSIDE` mode.
// It is declared under the `OPEN` category.
const INVALID_OPEN_INSIDE = createToken({
  name: "INVALID_OPEN_INSIDE",
  pattern: /</,
  categories: [OPEN],
});

// Character data: one or more characters other than '<' and '&'.
const TEXT = createToken({
  name: "TEXT",
  pattern: /[^<&]+/,
});

// The three tag terminators; each one pops the current lexer mode.
const CLOSE = createToken({
  name: "CLOSE",
  pattern: />/,
  pop_mode: true,
});

const SPECIAL_CLOSE = createToken({
  name: "SPECIAL_CLOSE",
  pattern: /\?>/,
  pop_mode: true,
});

const SLASH_CLOSE = createToken({
  name: "SLASH_CLOSE",
  pattern: /\/>/,
  pop_mode: true,
});

// A lone '/'.
const SLASH = createToken({
  name: "SLASH",
  pattern: /\//,
});

// Quoted value in single or double quotes; it may not contain '<' or its own
// quote character.
const STRING = createToken({
  name: "STRING",
  pattern: /"[^<"]*"|'[^<']*'/,
});

const EQUALS = createToken({
  name: "EQUALS",
  pattern: /=/,
});

// A name, built from the shared `Name` fragment.
const Name = createToken({
  name: "Name",
  pattern: makePattern`${f.Name}`,
});

// A single whitespace character; skipped, so it never reaches the token output.
const S = createToken({
  name: "S",
  pattern: /[ \t\r\n]/,
  group: Lexer.SKIPPED,
});
// Token types for `OUTSIDE` mode, in the order they are handed to the lexer.
const outsideModeTokens = [
  Comment,
  CData,
  DocType,
  IgnoredDTD,
  EntityRef,
  CharRef,
  SEA_WS,
  XMLDeclOpen,
  SLASH_OPEN,
  PROCESSING_INSTRUCTION,
  OPEN,
  TEXT,
];

// Token types for `INSIDE` mode, in the order they are handed to the lexer.
const insideModeTokens = [
  // Tokens from `OUTSIDE` to improve error recovery behavior
  Comment,
  INVALID_SLASH_OPEN,
  INVALID_OPEN_INSIDE,
  // "Real" `INSIDE` tokens
  CLOSE,
  SPECIAL_CLOSE,
  SLASH_CLOSE,
  SLASH,
  EQUALS,
  STRING,
  Name,
  S,
];

// Multi-mode lexer definition; lexing starts in `OUTSIDE` mode.
const xmlLexerDefinition = {
  defaultMode: "OUTSIDE",
  modes: {
    OUTSIDE: outsideModeTokens,
    INSIDE: insideModeTokens,
  },
};
// Configuration passed to the Chevrotain `Lexer` constructor.
const xmlLexerOptions = {
  // Less position tracking would give a small performance gain (<10%), but
  // full tracking is kept for better error position reporting and to expose
  // "fuller" ITokens from the Lexer.
  positionTracking: "full",
  ensureOptimizations: false,
  // TODO: inspect definitions for XML line terminators
  lineTerminatorCharacters: ["\n"],
  lineTerminatorsPattern: /\n|\r\n/g,
};

// The lexer instance built from the two-mode definition.
const xmlLexer = new Lexer(xmlLexerDefinition, xmlLexerOptions);
// Public surface of this module: the lexer instance and the token types
// indexed by name.
const publicApi = { xmlLexer, tokensDictionary };
module.exports = publicApi;