jgexml
Version:
The Just-Good-Enough XML Toolkit
352 lines (330 loc) • 9.45 kB
JavaScript
/*
The Just-Good-Enough XML Parser
*/
'use strict';
// Parser states.
// The numbering is significant: only odd-numbered states are eligible to be
// reported (to the push callback, or as a pull-parser return), because the
// parser tests `(state & 1) == 1` before emitting. Even-numbered states are
// internal transitions that are never reported.
const sInitial = 0; // start of input, before the first '<' or '<?'
const sDeclaration = 1; // inside the leading '<?' ... '?>' declaration
const sPreElement = 2; // after a declaration/doctype, waiting for the next '<'
const sElement = 3; // reading an element name (or dispatching on '/', '?', '!--', ...)
const sAttribute = 5; // reading an attribute name
const sAttrNML = 6; // "No Man's Land": between '=' and the opening quote of a value
const sValue = 7; // inside a quoted attribute value
const sEndElement = 9; // reading the name of a closing (or self-closing) element
const sContent = 11; // text content between tags
const sAttributeSpacer = 12; // placeholder used on '/' in attribute state so no dummy attribute is emitted
const sComment = 13; // inside '!--' ... '-->'
const sProcessingInstruction = 15; // inside '?' ... '?>'
const sCData = 17; // inside '![CDATA[' ... ']]>'
const sDocType = 19; // inside '!DOCTYPE' up to '>' or '['
const sDTD = 21; // inside a doctype internal subset, '[' ... ']>'
const sError = 23; // an error was detected
const sEndDocument = 25; // input exhausted; always the last state reported
require('./common');
/**
 * Returns the printable name of a reportable parser state.
 *
 * @param {number} state - one of the s* state constants
 * @returns {string|undefined} the upper-case name, or undefined for states
 *   that have no name (the internal, never-reported ones and unknown values)
 */
function stateName(state) {
  const labels = [
    [sInitial, 'INITIAL'],
    [sDeclaration, 'DECLARATION'],
    [sElement, 'ELEMENT'],
    [sAttribute, 'ATTRIBUTE'],
    [sValue, 'VALUE'],
    [sEndElement, 'END_ELEMENT'],
    [sContent, 'CONTENT'],
    [sComment, 'COMMENT'],
    [sProcessingInstruction, 'PROCESSING_INSTRUCTION'],
    [sCData, 'CDATA'],
    [sDocType, 'DOCTYPE'],
    [sDTD, 'DTD'],
    [sError, 'ERROR'],
    [sEndDocument, 'END_DOCUMENT'],
  ];
  for (const [code, label] of labels) {
    // Loose comparison on purpose: numeric strings such as '3' still match.
    if (state == code) {
      return label;
    }
  }
  return undefined;
}
/**
 * Puts a parser context back into its start-of-document condition.
 * Every field the parser relies on is (re)assigned in place; any other
 * properties already on the object are left alone.
 *
 * @param {object} context - the parser context to initialise
 */
function reset(context) {
  Object.assign(context, {
    state: sInitial,
    newState: sInitial,
    token: '',
    boundary: ['<?', '<'], // strings that end the current token
    bIndex: -1, // index into boundary of the last match, -1 for none
    lastElement: '',
    keepToken: false,
    position: 0, // resume offset for the pull parser
    depth: 0, // element nesting depth
    wellFormed: false,
    validControlChars: ['\t', '\r', '\n'],
    error: false,
  });
}
// Replacement text for the five predefined XML entities.
const predefinedEntities = { amp: '&', quot: '"', apos: "'", gt: '>', lt: '<' };

// Matches a predefined entity, a decimal character reference or a hexadecimal
// character reference. Exactly one of the three capture groups is set per match.
const entityPattern = /&(?:(amp|quot|apos|gt|lt)|#([0-9]+)|#x([0-9a-fA-F]+));/g;

/**
 * Returns context.token with predefined entities and numeric character
 * references decoded in a single pass, so the output of one replacement is
 * never decoded a second time ('&amp;lt;' yields '&lt;', not '<').
 * Puts the context into the error state when a reference denotes a control
 * character that is not in context.validControlChars, or a value above U+10FFFF.
 */
function decodeEntities(context) {
  return context.token.replace(entityPattern, function (match, name, dec, hex) {
    if (name) {
      return predefinedEntities[name];
    }
    const code = hex ? parseInt(hex, 16) : parseInt(dec, 10);
    if (code > 0x10FFFF) {
      // Not a Unicode code point: flag the error and leave the text as written.
      context.newState = context.state = sError;
      return match;
    }
    const ch = String.fromCodePoint(code);
    if ((code < 32) && (context.validControlChars.indexOf(ch) < 0)) {
      context.newState = context.state = sError;
    }
    return ch;
  });
}

/**
 * The Just-Good-Enough XML tokenizer. Works either as a push or a pull parser.
 *
 * Push parser: pass a callback and omit the context parameter. The callback is
 * invoked as callback(state, token) for every reported state, and finally with
 * sEndDocument. If the input is not well-formed and no error has been reported
 * yet, one callback(sError, token) is issued just before sEndDocument.
 *
 * Pull parser: pass null for the callback and initially provide an empty
 * object as the context. Each call returns a context object describing the
 * next reported state (context.state, context.token); pass that returned
 * object back in to continue. The last context returned has state sEndDocument.
 *
 * @param {string} s - the complete XML text
 * @param {?function(number, string)} callback - event receiver, or null to pull
 * @param {object} [context] - context returned by a previous pull-mode call
 * @returns {boolean|object} push mode: whether the document was well-formed;
 *   pull mode: the parser context
 */
function jgeParse(s,callback,context) {
  if (context && context.newState) {
    // Resuming: apply the transition that was pending when the previous call returned.
    if (!context.keepToken) context.token = '';
    context.state = context.newState;
  }
  else {
    context = {};
    reset(context);
  }

  let c;
  for (let i=context.position;i<s.length;i++) {
    c = s.charAt(i);
    if ((c.charCodeAt(0) < 32) && (context.validControlChars.indexOf(c) < 0)) {
      context.newState = context.state = sError;
    }

    if (context.state !== sContent) {
      if (context.validControlChars.indexOf(c) >= 0) { //other unicode spaces are not treated as whitespace
        c = ' ';
      }
    }

    // Does one of the current boundary strings start at this position?
    context.bIndex = -1;
    for (let b=0;b<context.boundary.length;b++) {
      if (s.startsWith(context.boundary[b],i)) {
        context.bIndex = b;
        if (context.boundary[context.bIndex].length>1) {
          // step over the rest of a multi-character boundary
          i = i + context.boundary[context.bIndex].length-1;
        }
        break;
      }
    }

    if (context.bIndex >= 0) {

      if ((context.state !== sValue) && (context.state !== sComment)) { // && (context.state != sContent)
        context.token = context.token.trim();
      }

      context.keepToken = false;
      // Only odd-numbered states are reported, and only with a non-blank token
      // (attribute values are reported even when empty).
      if (((context.state & 1) === 1) && ((context.token.trim() !== '') || context.state === sValue)) {
        // TODO test element names for validity (using regex?)
        if (context.state !== sCData) {
          context.token = decodeEntities(context);
        }

        if (context.state === sElement) context.depth++;
        else if (context.state === sEndElement) {
          context.depth--;
          if (context.depth<0) {
            context.newState = context.state = sError;
          }
        }
        if (context.state === sError) {
          context.error = true;
        }
        if (callback) {
          callback(context.state,context.token);
        }
        if (context.state === sError) {
          // with no boundaries left, the rest of the input is swallowed into one token
          context.boundary = [];
        }
      }

      // State transitions: choose the next state and the boundaries that end it.
      if (context.state === sInitial) {
        if (context.boundary[context.bIndex] === '<?') {
          context.newState = sDeclaration;
          context.boundary = ['?>'];
        }
        else {
          context.newState = sElement;
          context.boundary = ['>',' ','/','!--','?','!DOCTYPE','![CDATA['];
          context.boundary = context.boundary.concat(context.validControlChars);
        }
      }
      else if (context.state === sDeclaration) {
        context.newState = sPreElement;
        context.boundary = ['<'];
        if (context.token.indexOf('1.1')>0) {
          // a declaration mentioning 1.1 widens the set of accepted control characters
          context.validControlChars.push('\u2028','\u0085','\u0015');
        }
      }
      else if (context.state === sPreElement) {
        context.newState = sElement;
        context.boundary = ['>',' ','/','!--','?','!DOCTYPE','![CDATA['];
        context.boundary = context.boundary.concat(context.validControlChars);
      }
      else if (context.state === sElement) {
        context.lastElement = context.token;
        if (c === '>') {
          context.newState = sContent;
          context.boundary = ['<!DOCTYPE','<'];
        }
        else if (c === ' ') {
          context.newState = sAttribute;
          context.boundary = ['/','=','>'];
        }
        else if (c === '/') {
          context.newState = sEndElement;
          context.boundary = ['>'];
          context.keepToken = true;
        }
        else if (c === '?') {
          context.newState = sProcessingInstruction;
          context.boundary = ['?>'];
        }
        else if (context.boundary[context.bIndex] === '!--') {
          context.newState = sComment;
          context.boundary = ['-->'];
        }
        else if (context.boundary[context.bIndex] === '![CDATA[') {
          context.newState = sCData;
          context.boundary = [']]>'];
        }
        else if (context.boundary[context.bIndex] === '!DOCTYPE') {
          context.newState = sDocType;
          context.boundary = ['>','['];
        }
      }
      else if (context.state === sAttribute) {
        if (c === '=' ) {
          context.newState = sAttrNML;
          context.boundary = ['\'','"'];
        }
        else if (c === '>') {
          context.newState = sContent;
          context.boundary = ['<!DOCTYPE','<'];
        }
        else if (c === '/') {
          // self-closing element: report an end-element carrying the element's own name
          context.newState = sEndElement;
          context.keepToken = true;
          context.state = sAttributeSpacer; // to stop dummy attributes being emitted to pullparser
          context.token = context.lastElement;
        }
      }
      else if (context.state === sAttrNML) {
        context.newState = sValue;
        context.boundary = [c]; // the value ends at the same quote character that opened it
      }
      else if (context.state === sValue) {
        context.newState = sAttribute;
        context.boundary = ['=','/','>'];
      }
      else if (context.state === sEndElement) {
        // at depth 0 the parser stays in sEndElement, which is what marks a well-formed end
        if (context.depth !== 0) context.newState = sContent;
        context.boundary = ['<']; // don't allow DOCTYPE's after the first sEndElement
      }
      else if (context.state === sContent) {
        if (context.boundary[context.bIndex] === '<!DOCTYPE') {
          context.newState = sDocType;
          context.boundary = ['>','['];
        }
        else {
          context.newState = sElement;
          context.boundary = ['>',' ','/','!--','?','![CDATA['];
          context.boundary = context.boundary.concat(context.validControlChars);
        }
      }
      else if (context.state === sComment) {
        context.newState = sContent;
        context.boundary = ['<!DOCTYPE','<'];
      }
      else if (context.state === sProcessingInstruction) {
        context.newState = sContent;
        context.boundary = ['<!DOCTYPE','<'];
      }
      else if (context.state === sCData) {
        context.newState = sContent;
        context.boundary = ['<!DOCTYPE','<'];
      }
      else if (context.state === sDocType) {
        if (context.boundary[context.bIndex] === '[') {
          context.newState = sDTD;
          context.boundary = [']>'];
        }
        else {
          context.newState = sPreElement;
          context.boundary = ['<'];
        }
      }
      else if (context.state === sDTD) {
        context.newState = sPreElement;
        context.boundary = ['<'];
      }

      if (!callback) {
        // Pull mode: hand the reportable state back to the caller. The state is
        // re-tested here because the transition code above may have changed it.
        if (((context.state & 1) === 1) && ((context.token.trim() !== '') || context.state === sValue)) {
          context.position = i+1;
          return context;
        }
      }
      context.state = context.newState;

      if (!context.keepToken) context.token = '';
    }
    else {
      context.token += c;
    }

  }
  // Well-formed means: input ended right after the end-element that closed the root.
  if ((context.state === sEndElement) && (context.depth === 0) && (context.token.trim() === '')) {
    context.wellFormed = true;
  }
  if ((!context.wellFormed) && (!context.error)) {
    if (callback) {
      // generate a final error, only for pushparsers though
      callback(sError,context.token);
    }
  }
  context.state = sEndDocument;
  if (callback) {
    callback(context.state,context.token);
    return context.wellFormed;
  }
  else {
    return context;
  }
}
module.exports = {
parse : jgeParse,
getStateName : stateName,
sInitial : sInitial,
sDeclaration : sDeclaration,
sElement : sElement,
sAttribute : sAttribute,
sValue : sValue,
sEndElement : sEndElement,
sContent : sContent,
sComment : sComment,
sProcessingInstruction: sProcessingInstruction,
sCData : sCData,
sDocType : sDocType,
sDTD : sDTD,
sEndDocument : sEndDocument
};