typesxml
Version:
Open source XML library written in TypeScript
188 lines • 6.8 kB
JavaScript
/*******************************************************************************
* Copyright (c) 2023-2026 Maxprograms.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse License 1.0
* which accompanies this distribution, and is available at
* https://www.eclipse.org/org/documents/epl-v10.html
*
* Contributors:
* Maxprograms - initial API and implementation
*******************************************************************************/
import { CData } from "./CData.js";
import { DOMBuilder } from "./DOMBuilder.js";
import { ProcessingInstruction } from "./ProcessingInstruction.js";
import { SAXParser } from "./SAXParser.js";
import { TextNode } from "./TextNode.js";
import { XMLComment } from "./XMLComment.js";
import { XMLDeclaration } from "./XMLDeclaration.js";
import { XMLDocumentType } from "./XMLDocumentType.js";
import { XMLElement } from "./XMLElement.js";
/**
* Generates the canonical XML representation defined by the W3C XML Test Suite.
*
* Canonicalization rules:
* - Attribute order is lexical (Unicode code point order).
* - Character data is escaped using the Datachar productions (&amp;, &lt;, &gt;, &quot;, &#9;, &#10;, &#13;).
* - CDATA sections are treated as their character content.
* - Comments and document type declarations are omitted.
* - Processing instructions are preserved in document order with their data escaped as Datachar.
*/
export class XMLCanonicalizer {
    /** Document to render; stays undefined until a parse method or setDocument() assigns it. */
    document;

    /**
     * Parses a file with a SAXParser that feeds a DOMBuilder and keeps the built document.
     *
     * @param path - location handed to SAXParser.parseFile
     * @param encoding - encoding argument handed to SAXParser.parseFile unchanged
     */
    parseFile(path, encoding) {
        const builder = new DOMBuilder();
        const parser = new SAXParser();
        parser.setContentHandler(builder);
        parser.parseFile(path, encoding);
        this.document = builder.getDocument();
    }

    /**
     * Parses an XML string with a SAXParser that feeds a DOMBuilder and keeps the built document.
     *
     * @param xml - text handed to SAXParser.parseString
     * @param options - options argument handed to SAXParser.parseString unchanged
     */
    parseString(xml, options) {
        const builder = new DOMBuilder();
        const parser = new SAXParser();
        parser.setContentHandler(builder);
        parser.parseString(xml, options);
        this.document = builder.getDocument();
    }

    /**
     * Parses a stream with a SAXParser that feeds a DOMBuilder and keeps the built document.
     * The document is only assigned after the parser's promise has settled successfully.
     *
     * @param stream - stream handed to SAXParser.parseStream
     * @param options - options argument handed to SAXParser.parseStream unchanged
     * @returns a promise that resolves once the document has been stored
     */
    async parseStream(stream, options) {
        const builder = new DOMBuilder();
        const parser = new SAXParser();
        parser.setContentHandler(builder);
        await parser.parseStream(stream, options);
        this.document = builder.getDocument();
    }

    /**
     * Replaces the document to be canonicalized.
     *
     * @param document - document object used by subsequent toString() calls
     */
    setDocument(document) {
        this.document = document;
    }

    /**
     * @returns the currently stored document, or undefined when none has been set
     */
    getDocument() {
        return this.document;
    }

    /**
     * Renders the stored document in canonical form.
     *
     * @returns the canonical XML text
     * @throws {Error} when no document has been parsed or set
     */
    toString() {
        if (!this.document) {
            throw new Error("Canonicalizer has no document. Parse an XML source first.");
        }
        return this.renderDocument(this.document);
    }

    /**
     * Concatenates the canonical rendering of every node yielded by the document's content iterator.
     *
     * @param document - object exposing contentIterator()
     * @returns the canonical text of the whole document
     */
    renderDocument(document) {
        const parts = [];
        for (const node of document.contentIterator()) {
            parts.push(this.renderTopLevelNode(node));
        }
        return parts.join("");
    }

    /**
     * Renders a node that sits directly under the document.
     *
     * XML declarations, comments and document type declarations produce no output, and
     * neither does whitespace-only text. Unrecognized node types also render as "".
     *
     * @param node - document-level node
     * @returns the canonical text for the node, possibly empty
     */
    renderTopLevelNode(node) {
        if (node instanceof XMLDeclaration || node instanceof XMLComment || node instanceof XMLDocumentType) {
            return ""; // omitted from canonical form
        }
        if (node instanceof ProcessingInstruction) {
            return this.renderProcessingInstruction(node);
        }
        if (node instanceof XMLElement) {
            return this.renderElement(node);
        }
        if (node instanceof TextNode || node instanceof CData) {
            const value = this.getNodeValue(node);
            return this.isWhitespaceOnly(value) ? "" : this.escapeData(value);
        }
        return "";
    }

    /**
     * Renders an element, its attributes and its content recursively.
     *
     * Attributes are emitted in Unicode code point order of their names. An explicit
     * start and end tag is always written, even when the element has no content.
     *
     * @param element - element to render
     * @returns the canonical text of the element
     */
    renderElement(element) {
        const name = element.getName();
        const parts = ["<" + name];
        // Sort a copy so the element's own attribute list is left untouched.
        // localeCompare() is collation-based and must not be used here: canonical
        // output has to be identical regardless of the runtime locale.
        const attributes = [...element.getAttributes()].sort(
            (a, b) => this.compareCodePoints(a.getName(), b.getName())
        );
        for (const attribute of attributes) {
            parts.push(" " + attribute.getName() + "=\"" + this.escapeData(attribute.getValue()) + "\"");
        }
        parts.push(">");
        for (const child of element.getContent()) {
            if (child instanceof XMLElement) {
                parts.push(this.renderElement(child));
            }
            else if (child instanceof TextNode || child instanceof CData) {
                parts.push(this.escapeData(this.getNodeValue(child)));
            }
            else if (child instanceof ProcessingInstruction) {
                parts.push(this.renderProcessingInstruction(child));
            }
            // comments and other node types are ignored in canonical form
        }
        parts.push("</" + name + ">");
        return parts.join("");
    }

    /**
     * Renders a processing instruction as `<?target data?>`.
     * A single space always follows the target, even when the data is empty.
     *
     * @param pi - processing instruction exposing getTarget() and getData()
     * @returns the canonical text of the processing instruction
     */
    renderProcessingInstruction(pi) {
        const data = this.escapeProcessingInstructionData(pi.getData());
        return `<?${pi.getTarget()} ${data}?>`;
    }

    /**
     * Returns the character content of a text or CDATA node.
     *
     * @param node - node exposing getValue()
     * @returns the node's value
     */
    getNodeValue(node) {
        return node.getValue();
    }

    /**
     * Escapes character data and attribute values for canonical output.
     *
     * CRLF pairs are first collapsed to LF. Then `&`, `<`, `>` and `"` become their
     * named entities, and tab, LF and any remaining CR become numeric character references.
     *
     * @param data - raw text
     * @returns the escaped text
     */
    escapeData(data) {
        const normalized = data.replaceAll("\r\n", "\n");
        let result = "";
        for (const char of normalized) {
            switch (char) {
                case "&":
                    result += "&amp;";
                    break;
                case "<":
                    result += "&lt;";
                    break;
                case ">":
                    result += "&gt;";
                    break;
                case "\"":
                    result += "&quot;";
                    break;
                case "\t":
                    result += "&#9;";
                    break;
                case "\n":
                    result += "&#10;";
                    break;
                case "\r":
                    result += "&#13;";
                    break;
                default:
                    result += char;
            }
        }
        return result;
    }

    /**
     * Escapes the data part of a processing instruction.
     *
     * CRLF pairs are collapsed to LF; only `&` and a lone CR are replaced, every other
     * character (including `<`, `>`, quotes, tab and LF) is copied through unchanged.
     *
     * @param data - raw processing instruction data
     * @returns the escaped data
     */
    escapeProcessingInstructionData(data) {
        const normalized = data.replaceAll("\r\n", "\n");
        let result = "";
        for (const char of normalized) {
            if (char === "&") {
                result += "&amp;";
            }
            else if (char === "\r") {
                result += "&#13;";
            }
            else {
                result += char;
            }
        }
        return result;
    }

    /**
     * Tells whether a string consists solely of space, tab, LF and CR characters.
     *
     * @param value - text to inspect
     * @returns true for the empty string or whitespace-only text, false otherwise
     */
    isWhitespaceOnly(value) {
        for (let i = 0; i < value.length; i++) {
            const char = value.charAt(i);
            if (char !== " " && char !== "\t" && char !== "\n" && char !== "\r") {
                return false;
            }
        }
        return true;
    }

    /**
     * Orders two strings by Unicode code point.
     *
     * Strings are compared code point by code point rather than by UTF-16 code unit,
     * so characters outside the Basic Multilingual Plane sort after U+FFFF as expected.
     *
     * @param left - first string
     * @param right - second string
     * @returns a negative number, zero or a positive number, as required by Array.prototype.sort
     */
    compareCodePoints(left, right) {
        const leftPoints = Array.from(left);
        const rightPoints = Array.from(right);
        const shared = Math.min(leftPoints.length, rightPoints.length);
        for (let i = 0; i < shared; i++) {
            const difference = leftPoints[i].codePointAt(0) - rightPoints[i].codePointAt(0);
            if (difference !== 0) {
                return difference;
            }
        }
        // One string is a prefix of the other: the shorter one sorts first.
        return leftPoints.length - rightPoints.length;
    }
}
//# sourceMappingURL=XMLCanonicalizer.js.map