/*
 * @bonniernews/atlas-html-stream
 * Version: (not shown)
 * A super fast HTML-parser stream that outputs tag, text and closing nodes.
 * 256 lines (243 loc) • 6.95 kB
 * JavaScript
 */
"use strict";
const { Transform } = require("stream");
const { StringDecoder } = require("string_decoder");
const { TEXT, NODE, NAME, KEY, VALUE, SCRIPT, STYLE, COMMENT } = require("./states");
/**
 * Incremental matcher for a fixed character sequence.
 *
 * Character codes are fed one at a time through `found()`, which reports
 * `true` on the call that completes the sequence. The matcher keeps its
 * progress in `pos`, so it can be fed across arbitrary chunk boundaries.
 *
 * A Knuth-Morris-Pratt failure table is used so that a mismatch falls back
 * to the longest still-valid partial match instead of discarding everything.
 * Without it, input such as `--->` would never match `-->` and `<</script>`
 * would never match `</script>`.
 */
class SeqMatcher {
  /**
   * @param {string} str - the sequence to look for.
   */
  constructor(str) {
    this.str = str;
    this.max = str.length - 1;
    this.pos = 0;

    // fail[i] = length of the longest proper prefix of str[0..i] that is
    // also a suffix of it (the classic KMP prefix function).
    const fail = new Array(str.length).fill(0);
    for (let i = 1, k = 0; i < str.length; i++) {
      const code = str.charCodeAt(i);
      while (k > 0 && code !== str.charCodeAt(k)) k = fail[k - 1];
      if (code === str.charCodeAt(k)) k++;
      fail[i] = k;
    }
    this.fail = fail;
  }

  /**
   * Feed the next character code.
   *
   * @param {number} code - UTF-16 code unit of the next input character.
   * @returns {boolean} `true` when this character completes the sequence
   *   (the matcher then restarts from the beginning), otherwise `false`.
   */
  found(code) {
    const str = this.str;
    let pos = this.pos;

    // On mismatch, fall back to the longest prefix that is still matched.
    while (pos > 0 && code !== str.charCodeAt(pos)) pos = this.fail[pos - 1];

    if (code !== str.charCodeAt(pos)) {
      this.pos = 0;
      return false;
    }
    if (pos === this.max) {
      this.pos = 0;
      return true;
    }
    this.pos = pos + 1;
    return false;
  }

  /**
   * Forget any partial match.
   *
   * @returns {boolean} always `true`.
   */
  reset() {
    this.pos = 0;
    return true;
  }
}
module.exports = class HtmlParser extends Transform {
constructor({ preserveWS } = {}){
super({ readableObjectMode: true });
this.preserveWS = preserveWS;
this.endScript = new SeqMatcher("</script>");
this.endStyle = new SeqMatcher("</style>");
this.beginComment = new SeqMatcher("!--");
this.endComment = new SeqMatcher("-->");
this.curPos = 0;
this.minPos = 0;
this.state = TEXT;
this.cache = "";
this.name = "";
this.key = "";
this.text = [];
this.data = {};
this.isClose = false;
this.isSelfClose = false;
this.hasEqual = false;
this.valStartChar = null;
}
reset() {
this.endScript.reset();
this.endStyle.reset();
this.beginComment.reset();
this.endComment.reset();
this.curPos = 0;
this.minPos = 0;
this.state = TEXT;
this.cache = "";
this.name = "";
this.key = "";
this.text = [];
this.data = {};
this.isClose = false;
this.isSelfClose = false;
this.hasEqual = false;
this.valStartChar = null;
}
_transform(chunk, encoding, done){
const cache = this.cache += chunk;
const cacheLen = cache.length;
let i = this.curPos, v = this.minPos, s = this.state, c;
while (i < cacheLen) {
c = cache.charCodeAt(i);
switch (s) {
case TEXT: {
if (!this.preserveWS && (c === 32 || c >= 9 && c <= 13)) { // ws
if (v < i) this.text.push(cache.substring(v, i));
v = i + 1;
} else if (c === 60) { // <
this.flushText(v, i);
s = NODE;
v = i + 1;
}
break;
}
case NODE: {
if (c === 62) { // >
if (this.key) this.flushKey();
s = this.flushNode();
v = i + 1;
} else if (c === 47 && !this.hasEqual) { // /
this.isClose = !(this.isSelfClose = !!this.name);
} else if (c !== 32 && (c < 9 || c > 13)) { // !ws
if (!this.name) { // name start
this.beginComment.found(c);
v = i;
s = NAME;
} else if (!this.key) { // key start
v = i;
s = KEY;
} else if (c === 61) { // =
this.hasEqual = true;
} else if (!this.hasEqual) { // next key
this.flushKey();
v = i;
s = KEY;
} else if (c === 34 || c === 39) { // ', "
v = i + 1;
this.valStartChar = c;
s = VALUE;
} else { // un-quoted val
v = i;
s = VALUE;
}
}
break;
}
case NAME: {
if (this.beginComment.found(c)) { // start comment
this.name = cache.substring(v, i + 1);
s = this.flushNode();
v = i + 1;
} else if (c === 32 || c >= 9 && c <= 13) { // ws
this.name = cache.substring(v, i);
s = NODE;
v = i + 1;
} else if (c === 47) { // /
this.isSelfClose = true;
this.name = cache.substring(v, i);
s = NODE;
v = i + 1;
} else if (c === 62) { // >
this.name = cache.substring(v, i);
s = this.flushNode();
v = i + 1;
}
break;
}
case KEY: {
if (c === 32 || c >= 9 && c <= 13) { // ws
this.key = cache.substring(v, i);
s = NODE;
v = i + 1;
} else if (c === 61) { // =
this.hasEqual = true;
this.key = cache.substring(v, i);
s = NODE;
v = i + 1;
} else if (c === 47) { // /
this.isSelfClose = true;
this.key = cache.substring(v, i);
s = NODE;
v = i + 1;
} else if (c === 62) { // >
this.flushKey(v, i);
s = this.flushNode();
v = i + 1;
}
break;
}
case VALUE: {
if (this.valStartChar !== null) {
if (c === this.valStartChar) { // found end quote
this.flushVal(v, i);
s = NODE;
v = i + 1;
}
} else if (c === 32 || c >= 9 && c <= 13) { // ws
this.flushVal(v, i);
s = NODE;
v = i + 1;
} else if (c === 62) { // >
this.flushVal(v, i);
s = this.flushNode();
v = i + 1;
}
break;
}
default: {
if (s === COMMENT && this.endComment.found(c)) {
s = this.flushSpecialNode(v, i - 2, "!--");
v = i + 1;
} else if (s === SCRIPT && this.endScript.found(c)) {
s = this.flushSpecialNode(v, i - 8, "script");
v = i + 1;
} else if (s === STYLE && this.endStyle.found(c)) {
s = this.flushSpecialNode(v, i - 7, "style");
v = i + 1;
}
}
}
i++;
}
this.cache = cache.substring(v);
this.curPos = i - v;
this.minPos = 0;
this.state = s;
done(null);
}
_flush(done){
this.flushText(this.minPos, this.curPos);
this.reset();
done(null);
}
flushKey(v, i) {
this.key = this.data[this.key || this.cache.substring(v, i)] = "";
}
flushVal(v, i) {
this.data[this.key] = this.cache.substring(v, i);
this.key = "";
this.valStartChar = this.hasEqual = null;
}
flushNode() {
const name = this.name;
if (!this.isClose) this.push({ name, data: this.data });
if (this.isSelfClose || this.isClose) this.push({ name });
let s;
switch (name) {
case "script":
s = SCRIPT;
break;
case "style":
s = STYLE;
break;
case "!--":
s = COMMENT;
break;
default:
s = TEXT;
}
this.data = {};
this.name = "";
this.isClose = false;
this.isSelfClose = false;
return s;
}
flushSpecialNode(v, i, name) {
const text = this.cache.substring(v, i);
if (text) this.push({ text });
this.push({ name });
return TEXT;
}
flushText(v, i) {
if (v < i) {
this.text.push(this.cache.substring(v, i));
this.push({ text: this.text.join(" ") });
this.text.length = 0;
} else if (this.text.length) {
this.push({ text: this.text.join(" ") });
this.text.length = 0;
}
}
};