UNPKG

@desertnet/html-parser

Version:

HTML parser and non-strict validator

493 lines (420 loc) 45.8 kB
'use strict';

Object.defineProperty(exports, "__esModule", {
  value: true
});

// Babel helper: installs prototype and static members on a constructor as
// non-enumerable (unless flagged), configurable properties.
var _createClass = function () {
  function defineProperties(target, props) {
    for (var i = 0; i < props.length; i++) {
      var descriptor = props[i];
      descriptor.enumerable = descriptor.enumerable || false;
      descriptor.configurable = true;
      if ("value" in descriptor) descriptor.writable = true;
      Object.defineProperty(target, descriptor.key, descriptor);
    }
  }

  return function (Constructor, protoProps, staticProps) {
    if (protoProps) defineProperties(Constructor.prototype, protoProps);
    if (staticProps) defineProperties(Constructor, staticProps);
    return Constructor;
  };
}();

var _Instr = require('./Instr');

var _Instr2 = _interopRequireDefault(_Instr);

var _Op = require('./Op');

var _Op2 = _interopRequireDefault(_Op);

var _scanner = require('@desertnet/scanner');

var _scanner2 = _interopRequireDefault(_scanner);

var _HTMLNode = require('./HTMLNode');

// Babel helper: wraps a CommonJS export so `.default` is always available.
function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { default: obj };
}

// Babel helper: rejects calling the constructor without `new`.
function _classCallCheck(instance, Constructor) {
  if (!(instance instanceof Constructor)) {
    throw new TypeError("Cannot call a class as a function");
  }
}

/**
 * Turns the token stream of an HTML string into arrays of Op instructions
 * (PUSH_NODE / ADD_TOKEN / POP_NODE), switching scanner dialects as the
 * tokens move in and out of tags, attributes, entities, comments and
 * rawtext sections.
 */
var Compiler = function () {
  function Compiler() {
    _classCallCheck(this, Compiler);

    // Only treat "&" as the start of an entity when a ";" terminated
    // run of entity characters follows it.
    var entityStart = /&(?=[a-z0-9#]+;)/i;
    var attributeStart = /[^>=\s\/]+/i;

    /**
     * The tokenizing scanner for the input string. HTML contains
     * many contexts where the scanner needs to accept different
     * sets of tokens. The @desertnet/scanner module calls these
     * sets of token definitions "dialects". Below, we're initializing
     * our scanner with all these different dialects.
     * @private
     * @type {Scanner}
     */
    this._scanner = new _scanner2.default({
      // Starting dialect, for content "outside of a tag".
      // NOTE(review): the tag-name character classes exclude \xff (U+00FF).
      // The HTML tokenizer ends a tag name on form feed (U+000C), so this
      // may have been meant as \f -- confirm before changing.
      "content": [{ "text": /[^<>&]+/ }, { "commentStart": /<!--/ }, { "entityStart": entityStart }, { "tagStart": /<[a-z][^\t\n\ \/\>\0\xff]*/i }, { "closeTagStart": /<\/[a-z][^\t\n\ \/\>\0\xff]*/i }, { "error": /[<>&]/ }],

      // Dialect for the inside of comment tags.
      "comment": [{ "commentEnd": /-->/ }, { "dash": /-/ }, { "text": /[^-]+/ }],

      // Dialect for the inside of HTML entities.
      "entity": [{ "entityEnd": /;/ }, { "hex": /#x[a-f0-9]+/i }, { "dec": /#\d+/ }, { "named": /[a-z][a-z0-9]*/i }, { "error": /[^]/ }],

      // Dialect for the inside of tags.
      "tag": [{ "tagEnd": />/ }, { "whitespace": /\s+/ }, { "selfClose": /\// }, { "error": /['"<=]/ }, { "attributeStart": attributeStart }],

      // Initial dialect for attributes.
      "attribute": [{ "whitespace": /\s+/ }, { "attributeValueQuotedStart": /=['"]/ }, { "attributeValueStart": /=/ }, { "tagEnd": />/ }, { "selfClose": /\// }, { "error": /['"<]/ }, { "attributeStart": attributeStart }],

      // Dialect for unquoted attribute values.
      "attributeValue": [{ "whitespace": /\s+/ }, { "entityStart": entityStart }, { "tagEnd": />/ }, { "error": /['"<=`&]/ }, { "text": /[^'"<>=`&\s]+/ }],

      // Dialect for quoted attribute values.
      "attributeValueQuoted": [{ "dquo": /"/ }, { "squo": /'/ }, { "entityStart": entityStart }, { "error": /&/ }, { "text": /[^"'&]+/ }],

      // Dialect for closing tags.
      "closeTag": [{ "tagEnd": />/ }, { "whitespace": /\s+/ }, { "error": /[^\s>]+/ }],

      // Dialect for inside of script, style, and xmp tags
      "rawtext": [{ "closeTag": /<\/[a-z]+\s*>/i }, { "text": /[^<]+/ }, { "lt": /</ }]
    });

    /**
     * Token type ("dquo" or "squo") that closes the quoted attribute
     * value currently being scanned; null until one has been opened.
     * @private
     * @type {?string}
     */
    this._expectedAttributeValueEndTokenType = null;

    /**
     * Lower-cased tag name whose closing tag ends the current rawtext
     * section; null until rawtext mode has been entered.
     * @private
     * @type {?string}
     */
    this._expectedRawtextClosingTagName = null;
  }

  _createClass(Compiler, [{
    /**
     * Hand the scanner a new source string and start in the "content"
     * dialect.
     * @param {string} html
     */
    key: 'setInput',
    value: function setInput(html) {
      this._scanner.setSource(html);
      this._scanner.pushDialect("content");
    }
  }, {
    /**
     * @private
     * @param {string} dialect
     */
    key: 'pushDialect',
    value: function pushDialect(dialect) {
      this._scanner.pushDialect(dialect);
    }
  }, {
    /**
     * @private
     */
    key: 'popDialect',
    value: function popDialect() {
      this._scanner.popDialect();
    }
  }, {
    /**
     * @private
     * @return {string}
     * @throws {Error} If the scanner reports a null dialect.
     */
    key: 'currentDialect',
    value: function currentDialect() {
      var dialect = this._scanner.currentDialect();
      if (dialect === null) {
        throw new Error("Scanner dialect unexpectedly null.");
      }
      return dialect;
    }
  }, {
    /**
     * @private
     * @param {string} tokenType
     */
    key: 'setExpectedAttributeValueEndTokenType',
    value: function setExpectedAttributeValueEndTokenType(tokenType) {
      this._expectedAttributeValueEndTokenType = tokenType;
    }
  }, {
    /**
     * @private
     * @return {?string}
     */
    key: 'expectedAttributeValueEndTokenType',
    value: function expectedAttributeValueEndTokenType() {
      return this._expectedAttributeValueEndTokenType;
    }
  }, {
    /**
     * Stores the name lower-cased so it can be compared against the
     * lower-cased closing tag found in the rawtext dialect.
     * @private
     * @param {string} name
     */
    key: 'setExpectedRawtextClosingTagName',
    value: function setExpectedRawtextClosingTagName(name) {
      this._expectedRawtextClosingTagName = name.toLowerCase();
    }
  }, {
    /**
     * @private
     * @return {?string}
     */
    key: 'expectedRawtextClosingTagName',
    value: function expectedRawtextClosingTagName() {
      return this._expectedRawtextClosingTagName;
    }
  }, {
    /**
     * Switch the scanner to the "rawtext" dialect and remember which
     * closing tag ends it.
     * @param {string} tagName
     */
    key: 'setRawtextModeForTag',
    value: function setRawtextModeForTag(tagName) {
      this.pushDialect("rawtext");
      this.setExpectedRawtextClosingTagName(tagName);
    }
  }, {
    /**
     * Scan the next token and generate the ops for it.
     * @return {?Array.<Op>} null once the scanner has no more tokens.
     */
    key: 'generateNextCodeFragment',
    value: function generateNextCodeFragment() {
      var token = this._scanner.nextToken();
      if (token === null) {
        return null;
      }
      return this.generateCodeForTokenInDialect(token, this.currentDialect());
    }
  }, {
    /**
     * Take a token and the dialect in which it was found, and tell the
     * parser what to do next.
     * @private
     * @param {Foundation.Scanner.Token} token
     * @param {string} dialect
     * @return {Array.<Op>}
     * @throws {Error} If the dialect has no code generator.
     */
    key: 'generateCodeForTokenInDialect',
    value: function generateCodeForTokenInDialect(token, dialect) {
      switch (dialect) {
        case "content":
          return this.generateCodeForContentToken(token);
        case "comment":
          return this.generateCodeForCommentToken(token);
        case "entity":
          return this.generateCodeForEntityToken(token);
        case "tag":
          return this.generateCodeForTagToken(token);
        case "attribute":
          return this.generateCodeForAttributeToken(token);
        case "attributeValue":
          return this.generateCodeForAttributeValueToken(token);
        case "attributeValueQuoted":
          return this.generateCodeForAttributeValueQuotedToken(token);
        case "closeTag":
          return this.generateCodeForCloseTagToken(token);
        case "rawtext":
          return this.generateCodeForRawtextToken(token);
        default:
          throw new Error("Called generateCodeForTokenInDialect on unsupported dialect: " + dialect);
      }
    }
  }, {
    /**
     * @private
     * @param {Foundation.Scanner.Token} token
     * @return {Array.<Op>}
     */
    key: 'generateCodeForContentToken',
    value: function generateCodeForContentToken(token) {
      switch (token.type) {
        case "text":
        case "error":
          // Stray "<", ">" and "&" characters are kept as text nodes.
          return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.TextNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
        case "commentStart":
          this.pushDialect("comment");
          return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.CommentNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "entityStart":
          this.pushDialect("entity");
          return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.EntityNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "tagStart":
          this.pushDialect("tag");
          return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.TagNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "closeTagStart":
          this.pushDialect("closeTag");
          return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.CloseTagNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        default:
          throw unknownTokenAssertion(token);
      }
    }
  }, {
    /**
     * @private
     * @param {Foundation.Scanner.Token} token
     * @return {Array.<Op>}
     */
    key: 'generateCodeForCommentToken',
    value: function generateCodeForCommentToken(token) {
      switch (token.type) {
        case "text":
        case "dash":
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "commentEnd":
          this.popDialect();
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
        default:
          throw unknownTokenAssertion(token);
      }
    }
  }, {
    /**
     * @private
     * @param {Foundation.Scanner.Token} token
     * @return {Array.<Op>}
     */
    key: 'generateCodeForEntityToken',
    value: function generateCodeForEntityToken(token) {
      switch (token.type) {
        case "hex":
        case "dec":
        case "named":
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "error":
        case "entityEnd":
          // Both a terminating ";" and an unexpected character close the
          // entity node and leave the entity dialect.
          this.popDialect();
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
        default:
          throw unknownTokenAssertion(token);
      }
    }
  }, {
    /**
     * @private
     * @param {Foundation.Scanner.Token} token
     * @return {Array.<Op>}
     */
    key: 'generateCodeForTagToken',
    value: function generateCodeForTagToken(token) {
      switch (token.type) {
        case "tagEnd":
          this.popDialect();
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
        case "whitespace":
        case "selfClose":
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "attributeStart":
        case "error":
          // An error token also opens an attribute node, so the bad
          // characters are recorded inside an AttrNode.
          this.pushDialect("attribute");
          return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.AttrNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        default:
          throw unknownTokenAssertion(token);
      }
    }
  }, {
    /**
     * @private
     * @param {Foundation.Scanner.Token} token
     * @return {Array.<Op>}
     */
    key: 'generateCodeForAttributeToken',
    value: function generateCodeForAttributeToken(token) {
      switch (token.type) {
        case "attributeValueQuotedStart":
          // The token is `="` or `='`; remember which quote must close it.
          var isDquo = !!token.value.match(/"$/);
          this.setExpectedAttributeValueEndTokenType(isDquo ? "dquo" : "squo");
        // continue into next case...
        case "attributeValueStart":
          // Replace the attribute dialect with "attributeValue" or
          // "attributeValueQuoted", derived from the token type.
          this.popDialect();
          this.pushDialect(token.type.replace(/Start$/, ""));
        // continue into next case...
        case "whitespace":
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "tagEnd":
          this.popDialect(); // pop out of attribute dialect
          this.popDialect(); // pop out of tag dialect
          return [new _Op2.default(_Instr2.default.POP_NODE), new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
        case "selfClose":
          this.popDialect();
          return [new _Op2.default(_Instr2.default.POP_NODE), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "attributeStart":
        case "error":
          // Close the current attribute node and immediately open the next.
          return [new _Op2.default(_Instr2.default.POP_NODE), new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.AttrNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        default:
          throw unknownTokenAssertion(token);
      }
    }
  }, {
    /**
     * @private
     * @param {Foundation.Scanner.Token} token
     * @return {Array.<Op>}
     */
    key: 'generateCodeForAttributeValueToken',
    value: function generateCodeForAttributeValueToken(token) {
      switch (token.type) {
        case "whitespace":
          // Whitespace ends an unquoted value.
          this.popDialect();
          return [new _Op2.default(_Instr2.default.POP_NODE), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "entityStart":
          this.pushDialect("entity");
          return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.EntityNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "tagEnd":
          this.popDialect(); // pop out of attributeValue dialect
          this.popDialect(); // pop out of tag dialect
          return [new _Op2.default(_Instr2.default.POP_NODE), new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
        case "text":
        case "error":
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        default:
          throw unknownTokenAssertion(token);
      }
    }
  }, {
    /**
     * @private
     * @param {Foundation.Scanner.Token} token
     * @return {Array.<Op>}
     */
    key: 'generateCodeForAttributeValueQuotedToken',
    value: function generateCodeForAttributeValueQuotedToken(token) {
      switch (token.type) {
        case "dquo":
        case "squo":
          if (token.type === this.expectedAttributeValueEndTokenType()) {
            this.popDialect();
            return [new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
          } else {
            // The other kind of quote is ordinary text inside this value;
            // the token is retyped in place before being added.
            token.type = "text";
            return [new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
          }
        case "entityStart":
          this.pushDialect("entity");
          return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.EntityNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "error":
        case "text":
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        default:
          throw unknownTokenAssertion(token);
      }
    }
  }, {
    /**
     * @private
     * @param {Foundation.Scanner.Token} token
     * @return {Array.<Op>}
     */
    key: 'generateCodeForCloseTagToken',
    value: function generateCodeForCloseTagToken(token) {
      switch (token.type) {
        case "whitespace":
        case "error":
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token)];
        case "tagEnd":
          this.popDialect();
          return [new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
        default:
          throw unknownTokenAssertion(token);
      }
    }
  }, {
    /**
     * @private
     * @param {Foundation.Scanner.Token} token
     * @return {Array.<Op>}
     */
    key: 'generateCodeForRawtextToken',
    value: function generateCodeForRawtextToken(token) {
      switch (token.type) {
        case "closeTag":
          // Strip "</", ">" and whitespace to get the bare tag name.
          var closeTagName = token.value.toLowerCase().replace(/\W/g, "");
          if (closeTagName === this.expectedRawtextClosingTagName()) {
            this.popDialect();
            return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.CloseTagNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
          }
        // A closing tag for any other element is plain text here, so
        // intentionally continue into the next case...
        case "text":
        case "lt":
          return [new _Op2.default(_Instr2.default.PUSH_NODE, new _HTMLNode.TextNode()), new _Op2.default(_Instr2.default.ADD_TOKEN, token), new _Op2.default(_Instr2.default.POP_NODE)];
        default:
          throw unknownTokenAssertion(token);
      }
    }
  }]);

  return Compiler;
}();

exports.default = Compiler;

/**
 * Build the error thrown when a dialect yields a token type that its
 * code generator has no case for.
 * @private
 * @param {Foundation.Scanner.Token} token
 * @return {Error}
 */
function unknownTokenAssertion(token) {
  return new Error("failed assertion: unknown token type: " + token.type);
}
//# sourceMappingURL=data:application/json;base64,