html-tokenizer
Small, fast, event-driven, fault-tolerant HTML tokenizer. Works in Node or browsers.
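
A minimal usage sketch (the require path is illustrative; it assumes the package's main entry re-exports the Tokenizer class compiled below):

const { Tokenizer } = require('html-tokenizer');

for (const token of Tokenizer.tokenize('<p id="x">hi</p>')) {
    // Yields, in order: { type: 'start' }, { type: 'opening-tag', name: 'p' },
    // { type: 'attribute', name: 'id', value: 'x' }, { type: 'opening-tag-end', name: 'p', token: '>' },
    // { type: 'text', text: 'hi' }, { type: 'closing-tag', name: 'p' }, { type: 'done' }
    console.log(token);
}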
"use strict";
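// TypeScript-emitted CommonJS interop helpers (module bindings and default-import shims).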
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } });
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
__setModuleDefault(result, mod);
return result;
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.Tokenizer = void 0;
/* eslint-disable no-cond-assign */
const default_entities_1 = __importDefault(require("./default-entities"));
const chunks = __importStar(require("./chunks"));
const read_attribute_1 = __importDefault(require("./read-attribute"));
const deentify_1 = __importDefault(require("./deentify"));
/**
* A low-level tokenizer utility used by the HTML parser.
*/
class Tokenizer {
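    /**
     * @param opts Tokenizer options; `opts.entities` (if given) is merged
     * over the built-in entity map used to decode text and attribute values.
     */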
constructor(opts) {
this.entityMap = Object.assign(Object.assign({}, default_entities_1.default), opts.entities);
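        // Shallow-freeze the instance so its properties can't be reassigned after construction.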
Object.freeze(this);
}
/**
     * Static method: tokenize HTML without manually constructing a Tokenizer instance.
* @param html HTML string to tokenize.
* @param opts Optional tokenizer configuration options.
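     * @example
     * for (const token of Tokenizer.tokenize('<b>hi</b>')) console.log(token.type);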
*/
static tokenize(html, opts = {}) {
const tokenizer = new Tokenizer(opts);
return tokenizer.tokenize(html);
}
/**
* Static factory to create a tokenizer.
* @param opts Tokenizer options.
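     * @example
     * // Assumed entity-key format: bare names, e.g. 'copy' for &copy;, matching the default map.
     * const tokenizer = Tokenizer.from({ entities: { copy: '\u00A9' } });
     * for (const token of tokenizer.tokenize('&copy; 2024')) console.log(token);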
*/
static from(opts) {
return new Tokenizer(opts);
}
/**
     * Tokenize an HTML string. Returns an iterable iterator, so tokens
     * can be consumed via for...of or other iteration mechanisms.
* @param html HTML string to tokenize.
*/
*tokenize(html) {
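        // Accumulate consecutive raw text tokens so entity decoding runs once per text run.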
let currentText;
for (const tkn of this._tokenize(html)) {
if (tkn.type === 'text') {
const text = tkn.text;
if (currentText === undefined) {
currentText = text;
}
else {
currentText += text;
}
}
else {
if (currentText) {
const deentText = deentify_1.default(currentText, this.entityMap);
yield { type: 'text', text: deentText };
currentText = undefined;
}
yield tkn;
}
}
}
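    /**
     * Internal scanner. Walks the input with a small state machine
     * ('inText', 'inTag', 'inComment', 'inScript'), yielding raw,
     * entity-undecoded tokens bracketed by 'start' and 'done'.
     */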
*_tokenize(html) {
yield { type: 'start' };
let pos = 0;
let state = 'inText';
let currentTag = '';
let next;
while (pos < html.length) {
if (state === 'inText') {
const isBracket = html.charAt(pos) === '<'; // cheap pre-emptive check
if (isBracket && (next = chunks.getOpeningTag(html, pos))) {
pos += next.length;
currentTag = next.match[2];
yield { type: 'opening-tag', name: currentTag };
state = 'inTag';
}
else if (isBracket && (next = chunks.getClosingTag(html, pos))) {
pos += next.length;
yield { type: 'closing-tag', name: next.match[2] };
}
else if (isBracket && (next = chunks.getCommentOpen(html, pos))) {
pos += next.length;
state = 'inComment';
}
else if (next = chunks.getText(html, pos)) {
pos += next.length;
yield { type: 'text', text: next.match[1] };
}
else {
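                // Fault tolerance: nothing matched, so emit a single character as text and keep going.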
const text = html.substring(pos, pos + 1);
pos += 1;
yield { type: 'text', text };
}
}
else if (state === 'inComment') {
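                // Comment body: emit it whole; if no terminator is found, flush the remainder and stop.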
if (next = chunks.getComment(html, pos)) {
pos += next.length;
yield { type: 'comment', text: next.match[2] };
state = 'inText';
}
else {
yield { type: 'comment', text: html.substring(pos) };
break;
}
}
else if (state === 'inScript') {
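                // Script body is opaque: emit everything before the closing tag as text, then a synthetic closing-tag token.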
if (next = chunks.getScript(html, pos)) {
pos += next.length;
yield { type: 'text', text: next.match[2] };
yield { type: 'closing-tag', name: 'script' };
state = 'inText';
}
else {
yield { type: 'text', text: html.substring(pos) };
break;
}
}
else if (state === 'inTag') {
if (next = chunks.getAttributeName(html, pos)) {
pos += next.length;
const name = next.match[2];
const hasVal = next.match[4];
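                    // A truthy fourth capture group means an attribute value follows the name.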
if (hasVal) {
const read = read_attribute_1.default(html, pos);
pos += read.length;
yield { type: 'attribute', name, value: deentify_1.default(read.value, this.entityMap) };
}
else {
yield { type: 'attribute', name, value: '' };
}
}
else if (next = chunks.getTagEnd(html, pos)) {
pos += next.length;
const token = next.match[2];
yield { type: 'opening-tag-end', name: currentTag, token };
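                    // A just-opened <script> switches to raw script mode; everything else returns to text mode.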
state = currentTag === 'script' ? 'inScript' : 'inText';
}
else {
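                    // Malformed tag contents: fall back to text mode without consuming input.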
state = 'inText';
}
}
else {
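                // Unknown state: stop scanning (defensive; unreachable with the states above).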
break;
}
}
yield { type: 'done' };
}
}
exports.Tokenizer = Tokenizer;
//# sourceMappingURL=tokenizer.js.map