xml-parser-xo
Version:
Parse an XML string into a proprietary syntax tree
351 lines (294 loc) • 8.73 kB
text/typescript
/**
* Options accepted by the parser.
*/
export type XmlParserOptions = {
/**
* Called with each parsed node; the node is excluded from the result only when this returns exactly `false`.
* It is not called for the XML declaration or for the document-level (root) element, which are never excluded.
* For elements it is called before their attributes and children have been parsed.
* Defaults to a function that always returns true.
*/
filter?: (node: XmlParserNode) => boolean|any;
/**
* True to throw an error when parsing XML document with invalid content like mismatched closing tags.
* Any value other than `true` is treated as false.
*/
strictMode?: boolean;
};
/**
* Discriminator values stored in the `type` property of a node.
*/
export type XmlParserNodeType = 'Comment'|'Text'|'ProcessingInstruction'|'Element'|'DocumentType'|'CDATA';
/**
* Result of an individual node parser: the parsed node together with a flag
* telling the caller whether the node must be left out of the output tree.
*/
export type XmlParserNodeWrapper<T extends XmlParserNode> = {
// True when the `filter` option returned exactly `false` for this node.
excluded: boolean;
// The node that was parsed, whether or not it is excluded.
node: T;
}
/**
* Minimal shape shared by every node kind; this is the type the `filter` option receives.
*/
export type XmlParserNode = {
type: XmlParserNodeType;
}
/**
* A single parsed attribute: the trimmed name and the value with surrounding quotes removed.
*/
export type XmlParserAttribute = {
name: string;
value: string;
}
/**
* Node kinds that can appear in the `children` array of an element.
*/
export type XmlParserElementChildNode = XmlParserTextNode|XmlParserElementNode|XmlParserCDATANode|XmlParserCommentNode|XmlParserProcessingInstructionNode;
/**
* Node kinds that can appear in the document-level `children` array of the parse result.
*/
export type XmlParserDocumentChildNode = XmlParserDocumentTypeNode|XmlParserProcessingInstructionNode|XmlParserElementChildNode;
/**
* A processing instruction such as `<?xml ... ?>` or `<?xml-stylesheet ... ?>`.
*/
export type XmlParserProcessingInstructionNode = {
type: 'ProcessingInstruction';
// Target name following `<?`.
name: string;
// Attribute values keyed by attribute name; a repeated name keeps the last value.
attributes: Record<string, string>;
}
/**
* An element with its attributes and child nodes.
*/
export type XmlParserElementNode = {
type: 'Element';
// Tag name as written in the opening tag.
name: string;
// Attribute values keyed by attribute name; a repeated name keeps the last value.
attributes: Record<string, string>;
// `null` for a self-closing tag (`<a/>`); otherwise an array, possibly empty, of the children that were not excluded.
children: XmlParserElementChildNode[]|null;
}
/**
* A run of character data inside an element.
*/
export type XmlParserTextNode = {
type: 'Text';
// Every character up to (not including) the next `<`, stored exactly as matched.
content: string;
}
/**
* A CDATA section.
*/
export type XmlParserCDATANode = {
type: 'CDATA';
// The whole section as it appears in the input, including the `<![CDATA[` and `]]>` delimiters.
content: string;
}
/**
* A comment.
*/
export type XmlParserCommentNode = {
type: 'Comment';
// The whole comment as it appears in the input, including the `<!--` and `-->` delimiters.
content: string;
}
/**
* A document type declaration.
*/
export type XmlParserDocumentTypeNode = {
type: 'DocumentType';
// The whole `<!DOCTYPE ...>` declaration as it appears in the input.
content: string;
}
/**
* Value returned by the parser for a whole document.
*/
export type XmlParserResult = {
// The leading `<?xml ...?>` (or `<?xml-stylesheet ...?>`) instruction, or `null` when the document does not start with one.
declaration?: XmlParserProcessingInstructionNode|null;
// The single document-level element.
root: XmlParserElementNode;
// Document-level nodes in source order (the root element included, the declaration not), minus the nodes the filter excluded.
children: XmlParserDocumentChildNode[];
};
/**
 * Error thrown when the input cannot be parsed as an XML document.
 *
 * `message` holds the general failure text, while `cause` holds a short
 * description of what exactly went wrong (for example "Root Element not found").
 */
export class ParsingError extends Error {
    public constructor(message: string, public readonly cause: string) {
        super(message);
    }
}
// Module-level state shared by all parsing functions in this file.
// It is reassigned at the start of every parseXml() call.
let parsingState: {
// The part of the input that has not been consumed yet; match() and the other parsers shorten it from the front.
xml: string;
// The caller's options with `filter` and `strictMode` always filled in.
options: Required<XmlParserOptions>;
};
/**
 * Parse the next node inside an element.
 *
 * The node parsers are tried in a fixed order (element, text, comment, CDATA,
 * processing instruction) and the first one that yields a result wins.
 * Returns `undefined` when none of them matches the remaining input.
 */
function nextChild() {
    return (
        element(false) ??
        text() ??
        comment() ??
        cdata() ??
        processingInstruction(false)
    );
}
/**
 * Parse the next document-level node.
 *
 * Leading whitespace is discarded first; then the node parsers are tried in a
 * fixed order (element, comment, doctype, processing instruction).
 * Returns `undefined` when none of them matches the remaining input.
 */
function nextRootChild() {
    // Whitespace between document-level nodes is not kept.
    match(/\s*/);
    return (
        element(true) ??
        comment() ??
        doctype() ??
        processingInstruction(false)
    );
}
/**
 * Parse the whole input held in `parsingState` as one XML document.
 *
 * An optional leading declaration is read first, followed by every
 * document-level node until no parser matches any more.
 *
 * @returns The declaration (or `null`), the root element and the document-level
 * nodes that were not excluded by the filter.
 * @throws Error When a second document-level element is found.
 * @throws ParsingError When there is no document-level element, or when input
 * is left over after the last node that could be parsed.
 */
function parseDocument(): XmlParserResult {
    const declaration = processingInstruction(true);
    const children = [];
    let rootElement;

    for (let entry = nextRootChild(); entry; entry = nextRootChild()) {
        const {node, excluded} = entry;

        if (node.type === 'Element') {
            // Only one element is allowed at document level.
            if (rootElement) {
                throw new Error('Found multiple root nodes');
            }
            rootElement = node;
        }

        if (!excluded) {
            children.push(node);
        }
    }

    if (!rootElement) {
        throw new ParsingError('Failed to parse XML', 'Root Element not found');
    }

    // Anything still unconsumed here could not be recognised by any node parser.
    if (parsingState.xml.length !== 0) {
        throw new ParsingError('Failed to parse XML', 'Not Well-Formed XML');
    }

    return {
        declaration: declaration?.node ?? null,
        root: rootElement,
        children
    };
}
/**
 * Parse a processing instruction (`<?name attr="value" ... ?>`) from the start
 * of the remaining input.
 *
 * @param matchDeclaration True to accept only the targets `xml` and
 * `xml-stylesheet`; the resulting node is then never excluded and the filter is
 * not called. False to accept any target name.
 * @returns The instruction and its exclusion flag, or `undefined` when the
 * input does not start with a matching `<?` or one of the attributes cannot be
 * parsed (in that case the characters consumed so far are not restored).
 */
function processingInstruction(matchDeclaration: boolean): XmlParserNodeWrapper<XmlParserProcessingInstructionNode>|undefined {
    const openingPattern = matchDeclaration ? /^<\?(xml(-stylesheet)?)\s*/ : /^<\?([\w-:.]+)\s*/;
    const opening = match(openingPattern);
    if (!opening) {
        return undefined;
    }

    const node: XmlParserProcessingInstructionNode = {
        name: opening[1],
        type: 'ProcessingInstruction',
        attributes: {}
    };

    // Collect attributes until the terminator or the end of the input is reached.
    while (!eos() && !is('?>')) {
        const parsed = attribute();
        if (!parsed) {
            return undefined;
        }
        node.attributes[parsed.name] = parsed.value;
    }

    // Consume the terminator, if there is one.
    match(/\?>/);

    const excluded = !matchDeclaration && parsingState.options.filter(node) === false;
    return {
        excluded,
        node
    };
}
/**
 * Parse one element (opening tag, attributes, children and closing tag)
 * from the start of the remaining input.
 *
 * The `filter` option is consulted as soon as the tag name is known, that is
 * before `attributes` and `children` have been filled in.
 *
 * @param matchRoot True when parsing a document-level element; such an element
 * is never excluded and the filter is not called for it.
 * @returns The element and its exclusion flag, or `undefined` when the input
 * does not start with an opening tag or one of the attributes cannot be parsed
 * (in that case the characters consumed so far are not restored).
 * @throws ParsingError In strict mode, when the closing tag is missing or its
 * name differs from the name of the opening tag.
 */
function element(matchRoot: boolean): XmlParserNodeWrapper<XmlParserElementNode>|undefined {
    const m = match(/^<([^?!</>\s]+)\s*/);
    if (!m) return;

    // name
    const node: XmlParserElementNode = {
        type: 'Element',
        name: m[1],
        attributes: {},
        children: []
    };

    const excluded = matchRoot ? false : parsingState.options.filter(node) === false;

    // attributes
    while (!(eos() || is('>') || is('?>') || is('/>'))) {
        const attr = attribute();
        if (attr) {
            node.attributes[attr.name] = attr.value;
        } else {
            return;
        }
    }

    // self closing tag
    if (match(/^\s*\/>/)) {
        node.children = null;
        return {
            excluded,
            node
        };
    }

    match(/\??>/);

    // children
    let child = nextChild();
    while (child) {
        if (!child.excluded) {
            // `children` was initialised to an array above and is only set to
            // null in the self-closing branch, which has already returned.
            node.children!.push(child.node);
        }
        child = nextChild();
    }

    // closing
    // The tag name uses the same character class as the opening-tag pattern, so
    // every name that can be opened can also be closed. Whitespace before `>`
    // is allowed in both modes, as XML permits it in an end tag.
    const closing = match(/^<\/([^?!</>\s]+)\s*>/);
    if (parsingState.options.strictMode && (!closing || closing[1] !== node.name)) {
        throw new ParsingError('Failed to parse XML', `Closing tag not matching "</${node.name}>"`);
    }

    return {
        excluded,
        node
    };
}
/**
 * Parse a document type declaration from the start of the remaining input.
 *
 * Four forms are recognised and tried in this order: `SYSTEM`, `PUBLIC`,
 * one with an internal subset in square brackets, and the bare
 * `<!DOCTYPE name>` form.
 *
 * @returns The declaration and its exclusion flag, or `undefined` when the
 * input does not start with one of the recognised forms.
 */
function doctype(): XmlParserNodeWrapper<XmlParserDocumentTypeNode>|undefined {
    const patterns = [
        /^<!DOCTYPE\s+\S+\s+SYSTEM[^>]*>/,
        /^<!DOCTYPE\s+\S+\s+PUBLIC[^>]*>/,
        /^<!DOCTYPE\s+\S+\s*\[[^\]]*]>/,
        /^<!DOCTYPE\s+\S+\s*>/
    ];

    for (const pattern of patterns) {
        const found = match(pattern);
        if (found) {
            const node: XmlParserDocumentTypeNode = {
                type: 'DocumentType',
                content: found[0]
            };
            const excluded = parsingState.options.filter(node) === false;
            return {
                excluded,
                node
            };
        }
    }

    return undefined;
}
/**
 * Parse a CDATA section from the start of the remaining input.
 *
 * The node content keeps the `<![CDATA[` and `]]>` delimiters.
 *
 * @returns The section and its exclusion flag, or `undefined` when the input
 * does not start with `<![CDATA[` or no `]]>` terminator follows; nothing is
 * consumed in that case.
 */
function cdata(): XmlParserNodeWrapper<XmlParserCDATANode>|undefined {
    const source = parsingState.xml;
    if (!source.startsWith('<![CDATA[')) {
        return undefined;
    }

    const terminatorIndex = source.indexOf(']]>');
    if (terminatorIndex < 0) {
        return undefined;
    }

    // The section ends right after the three-character terminator.
    const sectionEnd = terminatorIndex + 3;
    const node: XmlParserCDATANode = {
        type: 'CDATA',
        content: source.substring(0, sectionEnd)
    };
    parsingState.xml = source.slice(sectionEnd);

    const excluded = parsingState.options.filter(node) === false;
    return {
        excluded,
        node
    };
}
/**
 * Parse a comment from the start of the remaining input.
 *
 * The node content keeps the `<!--` and `-->` delimiters.
 *
 * @returns The comment and its exclusion flag, or `undefined` when the input
 * does not start with a complete comment.
 */
function comment(): XmlParserNodeWrapper<XmlParserCommentNode>|undefined {
    const found = match(/^<!--[\s\S]*?-->/);
    if (!found) {
        return undefined;
    }

    const node: XmlParserCommentNode = {
        type: 'Comment',
        content: found[0]
    };
    const excluded = parsingState.options.filter(node) === false;
    return {
        excluded,
        node
    };
}
/**
 * Parse a run of text from the start of the remaining input.
 *
 * The run consists of every character up to, but not including, the next `<`.
 *
 * @returns The text node and its exclusion flag, or `undefined` when the input
 * is empty or starts with `<`.
 */
function text(): XmlParserNodeWrapper<XmlParserTextNode>|undefined {
    const found = match(/^([^<]+)/);
    if (!found) {
        return undefined;
    }

    const node: XmlParserTextNode = {
        type: 'Text',
        content: found[1]
    };
    const excluded = parsingState.options.filter(node) === false;
    return {
        excluded,
        node
    };
}
/**
 * Parse one `name=value` attribute from the remaining input.
 *
 * The value may be double-quoted, single-quoted or unquoted. The name is
 * trimmed; the value is trimmed and has its surrounding quotes removed.
 *
 * @returns The attribute, or `undefined` when no `name=value` pair is found.
 */
function attribute(): XmlParserAttribute|undefined {
    const found = match(/([^=]+)\s*=\s*("[^"]*"|'[^']*'|[^>\s]+)\s*/);
    if (!found) {
        return undefined;
    }

    const [, rawName, rawValue] = found;
    return {
        name: rawName.trim(),
        value: stripQuotes(rawValue.trim())
    };
}
/**
 * Remove one leading and one trailing quote character from `val`.
 *
 * Both `'` and `"` count as quotes. The two ends are handled independently,
 * so a quote present at only one end is removed as well.
 *
 * @param val The raw attribute value.
 * @returns `val` without its leading and/or trailing quote.
 */
function stripQuotes(val: string): string {
    const isQuote = (ch: string): boolean => ch === '"' || ch === "'";

    let start = 0;
    let end = val.length;

    if (end > start && isQuote(val.charAt(start))) {
        start += 1;
    }
    // `end > start` keeps a lone quote character from being counted twice.
    if (end > start && isQuote(val.charAt(end - 1))) {
        end -= 1;
    }

    return val.slice(start, end);
}
/**
 * Apply `re` to the remaining input and, on success, consume the matched text.
 *
 * As many characters as the match is long are removed from the front of the
 * input, so patterns are expected to match at the very beginning.
 *
 * @param re The pattern to apply.
 * @returns The match, or `undefined` (with the input left untouched) when the
 * pattern does not match.
 */
function match(re: RegExp): RegExpMatchArray|undefined {
    const found = parsingState.xml.match(re);
    if (!found) {
        return undefined;
    }

    parsingState.xml = parsingState.xml.slice(found[0].length);
    return found;
}
/**
 * Tell whether the whole input has been consumed.
 *
 * @returns True when no unparsed input remains.
 */
function eos(): boolean {
    return parsingState.xml.length === 0;
}
/**
 * Tell whether the remaining input begins with `prefix`, without consuming it.
 *
 * @param prefix The text to look for at the start of the remaining input.
 * @returns True when the remaining input starts with `prefix`.
 */
function is(prefix: string): boolean {
    return parsingState.xml.startsWith(prefix);
}
/**
 * Parse an XML string into a syntax tree.
 *
 * The input is trimmed before parsing. The parser keeps its working state in a
 * module-level variable that is replaced on every call.
 *
 * @param xml The XML document to parse.
 * @param options Optional node filter and strict-mode switch; without a filter
 * every node is kept, and strict mode is on only when `strictMode` is exactly `true`.
 * @returns The declaration, the root element and the document-level nodes.
 * @throws ParsingError When the document has no root element, is not
 * well-formed, or (in strict mode) contains a mismatched closing tag.
 * @throws Error When more than one document-level element is found.
 */
function parseXml(xml: string, options: XmlParserOptions = {}): XmlParserResult {
    const source = xml.trim();
    const keepEverything = () => true;
    const filter = options.filter || keepEverything;

    parsingState = {
        xml: source,
        options: {
            ...options,
            filter,
            strictMode: options.strictMode === true
        }
    };

    return parseDocument();
}
// CommonJS interop: when `module` is defined and `exports` is an object,
// also assign parseXml to `module.exports`.
if (typeof module !== 'undefined' && typeof exports === 'object') {
module.exports = parseXml;
}
// ES module default export of the same function.
export default parseXml;