
html-tokenizer

Small, fast, event-driven, fault-tolerant HTML tokenizer. Works in Node or browsers.
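For orientation before the source itself, here is a minimal usage sketch. The require path assumes the package's main entry re-exports the Tokenizer class defined below; the token shapes come straight from the code in this file.

// Minimal usage sketch. Assumption: the package's main entry re-exports
// the Tokenizer class compiled in this file (exports.Tokenizer).
const { Tokenizer } = require('html-tokenizer');

for (const token of Tokenizer.tokenize('<p class="intro">Hi &amp; bye<!--x--></p>')) {
  switch (token.type) {
    case 'opening-tag':      // { type, name }
    case 'closing-tag':      // { type, name }
      console.log(token.type, token.name);
      break;
    case 'attribute':        // { type, name, value } -- value is entity-decoded
      console.log('attribute', token.name, '=', token.value);
      break;
    case 'opening-tag-end':  // { type, name, token } -- token is e.g. '>' or '/>'
      console.log('end of', token.name, 'via', token.token);
      break;
    case 'text':             // adjacent text tokens are merged and entity-decoded
    case 'comment':
      console.log(token.type, JSON.stringify(token.text));
      break;
    // 'start' and 'done' tokens bracket every stream
  }
}

Because tokenize() is a generator, tokens are produced lazily: nothing is scanned until iteration begins.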

tokenizer.js (170 lines, 6.59 kB)
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.Tokenizer = void 0; /* eslint-disable no-cond-assign */ const default_entities_1 = __importDefault(require("./default-entities")); const chunks = __importStar(require("./chunks")); const read_attribute_1 = __importDefault(require("./read-attribute")); const deentify_1 = __importDefault(require("./deentify")); /** * A low-level tokenizer utility used by the HTML parser. */ class Tokenizer { constructor(opts) { this.entityMap = Object.assign(Object.assign({}, default_entities_1.default), opts.entities); Object.freeze(this); } /** * Static method to tokenize HTML without instantiating a Tokenizer instance. * @param html HTML string to tokenize. * @param opts Optional tokenizer configuration options. */ static tokenize(html, opts = {}) { const tokenizer = new Tokenizer(opts); return tokenizer.tokenize(html); } /** * Static factory to create a tokenizer. * @param opts Tokenizer options. */ static from(opts) { return new Tokenizer(opts); } /** * Tokenize an HTML string. Returns an iterator, thus allowing * tokens to be consumed via for/of or other iteration mechanisms. * @param html HTML string to tokenize. 
*/ *tokenize(html) { let currentText; for (const tkn of this._tokenize(html)) { if (tkn.type === 'text') { const text = tkn.text; if (currentText === undefined) { currentText = text; } else { currentText += text; } } else { if (currentText) { const deentText = deentify_1.default(currentText, this.entityMap); yield { type: 'text', text: deentText }; currentText = undefined; } yield tkn; } } } *_tokenize(html) { yield { type: 'start' }; let pos = 0; let state = 'inText'; let currentTag = ''; let next; while (pos < html.length) { if (state === 'inText') { const isBracket = html.charAt(pos) === '<'; // cheap pre-emptive check if (isBracket && (next = chunks.getOpeningTag(html, pos))) { pos += next.length; currentTag = next.match[2]; yield { type: 'opening-tag', name: currentTag }; state = 'inTag'; } else if (isBracket && (next = chunks.getClosingTag(html, pos))) { pos += next.length; yield { type: 'closing-tag', name: next.match[2] }; } else if (isBracket && (next = chunks.getCommentOpen(html, pos))) { pos += next.length; state = 'inComment'; } else if (next = chunks.getText(html, pos)) { pos += next.length; yield { type: 'text', text: next.match[1] }; } else { const text = html.substring(pos, pos + 1); pos += 1; yield { type: 'text', text }; } } else if (state === 'inComment') { if (next = chunks.getComment(html, pos)) { pos += next.length; yield { type: 'comment', text: next.match[2] }; state = 'inText'; } else { yield { type: 'comment', text: html.substring(pos) }; break; } } else if (state === 'inScript') { if (next = chunks.getScript(html, pos)) { pos += next.length; yield { type: 'text', text: next.match[2] }; yield { type: 'closing-tag', name: 'script' }; state = 'inText'; } else { yield { type: 'text', text: html.substring(pos) }; break; } } else if (state === 'inTag') { if (next = chunks.getAttributeName(html, pos)) { pos += next.length; const name = next.match[2]; const hasVal = next.match[4]; if (hasVal) { const read = read_attribute_1.default(html, pos); pos += read.length; yield { type: 'attribute', name, value: deentify_1.default(read.value, this.entityMap) }; } else { yield { type: 'attribute', name, value: '' }; } } else if (next = chunks.getTagEnd(html, pos)) { pos += next.length; const token = next.match[2]; yield { type: 'opening-tag-end', name: currentTag, token }; state = currentTag === 'script' ? 'inScript' : 'inText'; } else { state = 'inText'; } } else { break; } } yield { type: 'done' }; } } exports.Tokenizer = Tokenizer; //# sourceMappingURL=tokenizer.js.map
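Since the constructor shallow-merges opts.entities over the built-in default entity map, the tokenizer can be taught additional entities. A hedged sketch: this file only shows that the map is merged and passed to deentify, so the key format (entity name without the surrounding '&' and ';') is an assumption, not something this file confirms.

// Sketch: extending entity decoding via opts.entities.
// Assumption: keys are bare entity names ('copy' for '&copy;').
const tokenizer = Tokenizer.from({ entities: { copy: '\u00A9' } });

for (const token of tokenizer.tokenize('<span>&copy; 2020</span>')) {
  if (token.type === 'text') {
    console.log(token.text); // expected: '© 2020' if the assumption holds
  }
}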