// node-html-parser
// Version:
// A very fast HTML parser, generating a simplified DOM, with basic element query support.
// 1,127 lines (1,126 loc) • 42.3 kB
// JavaScript
;
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.parse = exports.base_parse = void 0;
const css_select_1 = require("css-select");
const he_1 = __importDefault(require("he"));
const back_1 = __importDefault(require("../back"));
const matcher_1 = __importDefault(require("../matcher"));
const void_tag_1 = __importDefault(require("../void-tag"));
const comment_1 = __importDefault(require("./comment"));
const node_1 = __importDefault(require("./node"));
const text_1 = __importDefault(require("./text"));
const type_1 = __importDefault(require("./type"));
/**
 * Decode the HTML entities in a string.
 *
 * The decoded value is serialised to JSON and parsed back before being
 * returned; the original author describes this round trip as a way to
 * "clone" the string.
 * @param {string} val text that may contain HTML entities
 * @return {string} decoded text
 */
function decode(val) {
    const decoded = he_1.default.decode(val);
    const serialized = JSON.stringify(decoded);
    return JSON.parse(serialized);
}
// Tag names treated as block-level when building structured text.
// See https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements
const Htags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup'];
const Dtags = ['details', 'dialog', 'dd', 'div', 'dt'];
const Ftags = ['fieldset', 'figcaption', 'figure', 'footer', 'form'];
const tableTags = ['table', 'td', 'tr'];
const htmlTags = ['address', 'article', 'aside', 'blockquote', 'br', 'hr', 'li', 'main', 'nav', 'ol', 'p', 'pre', 'section', 'ul'];
// Lookup set holding every block-level tag name in both lower and upper case.
const kBlockElements = new Set();
/**
 * Register every tag name of the given lists in `kBlockElements`,
 * once as written and once upper-cased.
 * @param {...string[]} args lists of tag names
 */
function addToKBlockElement(...args) {
    for (const tagList of args) {
        for (let i = 0; i < tagList.length; i++) {
            const tag = tagList[i];
            kBlockElements.add(tag);
            kBlockElements.add(tag.toUpperCase());
        }
    }
}
addToKBlockElement(Htags, Dtags, Ftags, tableTags, htmlTags);
/**
 * Minimal token list backed by a Set. After most mutations the
 * `afterUpdate` callback is invoked with the list itself.
 */
class DOMTokenList {
    /**
     * Reject tokens that contain whitespace.
     * @param {string} token candidate token
     * @throws {Error} when the token contains a whitespace character
     */
    _validate(token) {
        const hasSpace = /\s/.test(token);
        if (!hasSpace) {
            return;
        }
        throw new Error(`DOMException in DOMTokenList.add: The token '${token}' contains HTML space characters, which are not valid in tokens.`);
    }
    /**
     * @param {Iterable<string>} [valuesInit] initial tokens
     * @param {function} [afterUpdate] called with this list after a change
     */
    constructor(valuesInit = [], afterUpdate = () => null) {
        this._afterUpdate = afterUpdate;
        this._set = new Set(valuesInit);
    }
    /** Add a token and notify. */
    add(token) {
        this._validate(token);
        this._set.add(token);
        this._afterUpdate(this);
    }
    /** Drop `oldToken`, add `newToken` (validated) and notify. */
    replace(oldToken, newToken) {
        this._validate(newToken);
        const tokens = this._set;
        tokens.delete(oldToken);
        tokens.add(newToken);
        this._afterUpdate(this);
    }
    /** Remove a token; notifies only when the token was present. */
    remove(token) {
        if (this._set.delete(token)) {
            this._afterUpdate(this);
        }
    }
    /** Remove the token when present, add it otherwise, then notify. */
    toggle(token) {
        this._validate(token);
        // Set#delete reports whether the token existed, so a failed delete means "add".
        if (!this._set.delete(token)) {
            this._set.add(token);
        }
        this._afterUpdate(this);
    }
    /** @return {boolean} whether the token is in the list */
    contains(token) {
        return this._set.has(token);
    }
    /** Number of tokens. */
    get length() {
        return this._set.size;
    }
    /** @return {Iterator<string>} iterator over the tokens in insertion order */
    values() {
        return this._set.values();
    }
    /** Tokens as an array, in insertion order. */
    get value() {
        return [...this._set.values()];
    }
    /** Tokens joined by a single space. */
    toString() {
        return [...this._set.values()].join(' ');
    }
}
/**
 * Serialise a raw attribute map into the string kept in `rawAttrs`.
 * Attributes whose quoted value is `null` or `""` are written as bare names.
 * @param {Object} attrs raw attribute map (name -> raw value or null)
 * @param {function} quote turns a raw value into its quoted representation
 * @return {string} attributes separated by single spaces
 */
function stringifyRawAttributes(attrs, quote) {
    return Object.keys(attrs)
        .map((name) => {
            const val = quote(attrs[name]);
            if (val === 'null' || val === '""')
                return name;
            return `${name}=${val}`;
        })
        .join(' ');
}
/**
 * HTMLElement, which contains a set of children.
 *
 * Note: this is a minimalist implementation of the DOM element interface.
 * Besides child handling it offers attribute access, CSS-selector queries,
 * sibling/child accessors and (re)serialisation.
 * @class HTMLElement
 * @extends {Node}
 */
class HTMLElement extends node_1.default {
    /**
     * Quote an attribute value for serialisation.
     * Double quotes inside the value are written as `&quot;` so that they
     * cannot terminate the attribute; every other character (including
     * backslashes, tabs and line breaks) is kept as is.
     * @param attr attribute value
     * @returns {string} quoted value, or the string `null` for null/undefined
     */
    quoteAttribute(attr) {
        if (attr == null) {
            return 'null';
        }
        return `"${attr.replace(/"/g, '&quot;')}"`;
    }
    /**
     * Creates an instance of HTMLElement.
     * @param tagName raw tag name (null for the root container)
     * @param keyAttrs id and class attribute
     * @param [rawAttrs] attributes in string
     * @param [parentNode] parent node, passed to the Node constructor
     * @param [range] source range, passed to the Node constructor
     * @param [voidTag] void-tag helper used for serialisation
     * @param [_parseOptions] options reused when this element parses HTML strings
     *
     * @memberof HTMLElement
     */
    constructor(tagName, keyAttrs, rawAttrs = '', parentNode = null, range, voidTag = new void_tag_1.default(), _parseOptions = {}) {
        super(parentNode, range);
        this.rawAttrs = rawAttrs;
        this.voidTag = voidTag;
        /**
         * Node Type declaration.
         */
        this.nodeType = type_1.default.ELEMENT_NODE;
        this.rawTagName = tagName;
        this.rawAttrs = rawAttrs || '';
        this.id = keyAttrs.id || '';
        this.childNodes = [];
        this._parseOptions = _parseOptions;
        // Leading/trailing whitespace in the class attribute would otherwise
        // produce empty-string tokens, so those are filtered out.
        const classTokens = keyAttrs.class ? keyAttrs.class.split(/\s+/).filter(Boolean) : [];
        // Every classList change is written back to the `class` attribute.
        this.classList = new DOMTokenList(classTokens, (classList) => this.setAttribute('class', classList.toString()));
        // When no raw attribute string was supplied, synthesise one from id/class.
        if (keyAttrs.id) {
            if (!rawAttrs) {
                this.rawAttrs = `id="${keyAttrs.id}"`;
            }
        }
        if (keyAttrs.class) {
            if (!rawAttrs) {
                const cls = `class="${this.classList.toString()}"`;
                if (this.rawAttrs) {
                    this.rawAttrs += ` ${cls}`;
                }
                else {
                    this.rawAttrs = cls;
                }
            }
        }
    }
    /**
     * Remove Child element from childNodes array
     * @param {HTMLElement} node node to remove
     * @return {HTMLElement} this element
     */
    removeChild(node) {
        this.childNodes = this.childNodes.filter((child) => {
            return child !== node;
        });
        return this;
    }
    /**
     * Exchanges given child with new child
     * @param {HTMLElement} oldNode node to exchange
     * @param {HTMLElement} newNode new node
     * @return {HTMLElement} this element
     */
    exchangeChild(oldNode, newNode) {
        const children = this.childNodes;
        this.childNodes = children.map((child) => {
            if (child === oldNode) {
                return newNode;
            }
            return child;
        });
        return this;
    }
    /** Upper-cased tag name (or the raw value when it is empty/null). */
    get tagName() {
        return this.rawTagName ? this.rawTagName.toUpperCase() : this.rawTagName;
    }
    /** Setting the tag name stores it lower-cased as the raw tag name. */
    set tagName(newname) {
        this.rawTagName = newname.toLowerCase();
    }
    /** Lower-cased tag name. */
    get localName() {
        return this.rawTagName.toLowerCase();
    }
    /** Whether the void-tag helper classifies this element as void. */
    get isVoidElement() {
        return this.voidTag.isVoidElement(this.localName);
    }
    /**
     * Get escaped (as-is) text value of current node and its children.
     * @return {string} text content
     */
    get rawText() {
        // A <br> contributes a line break.
        // https://github.com/taoqf/node-html-parser/issues/249
        if (/^br$/i.test(this.rawTagName)) {
            return '\n';
        }
        return this.childNodes.reduce((pre, cur) => {
            return (pre += cur.rawText);
        }, '');
    }
    /** Entity-decoded text of this element and its children. */
    get textContent() {
        return decode(this.rawText);
    }
    /** Replace all children with a single text node holding `val`. */
    set textContent(val) {
        const content = [new text_1.default(val, this)];
        this.childNodes = content;
    }
    /**
     * Get unescaped text value of current node and its children.
     * @return {string} text content
     */
    get text() {
        return decode(this.rawText);
    }
    /**
     * Get structured Text (with '\n' etc.)
     * Block-level elements start and end a line; whitespace-only text nodes
     * collapse into a single space before the next text.
     * @return {string} structured text
     */
    get structuredText() {
        let currentBlock = [];
        const blocks = [currentBlock];
        function dfs(node) {
            if (node.nodeType === type_1.default.ELEMENT_NODE) {
                if (kBlockElements.has(node.rawTagName)) {
                    // Close the running line before and after a block element.
                    if (currentBlock.length > 0) {
                        blocks.push((currentBlock = []));
                    }
                    node.childNodes.forEach(dfs);
                    if (currentBlock.length > 0) {
                        blocks.push((currentBlock = []));
                    }
                }
                else {
                    node.childNodes.forEach(dfs);
                }
            }
            else if (node.nodeType === type_1.default.TEXT_NODE) {
                if (node.isWhitespace) {
                    // Whitespace node, postponed output
                    currentBlock.prependWhitespace = true;
                }
                else {
                    let text = node.trimmedText;
                    if (currentBlock.prependWhitespace) {
                        text = ` ${text}`;
                        currentBlock.prependWhitespace = false;
                    }
                    currentBlock.push(text);
                }
            }
        }
        dfs(this);
        return blocks
            .map((block) => {
                return block.join('').replace(/\s{2,}/g, ' '); // Normalize each line's whitespace
            })
            .join('\n')
            .replace(/\s+$/, ''); // trimRight;
    }
    /** Serialise this element; an element without a tag name yields only its inner HTML. */
    toString() {
        const tag = this.rawTagName;
        if (tag) {
            const attrs = this.rawAttrs ? ` ${this.rawAttrs}` : '';
            return this.voidTag.formatNode(tag, attrs, this.innerHTML);
        }
        return this.innerHTML;
    }
    /** Concatenated serialisation of all children. */
    get innerHTML() {
        return this.childNodes
            .map((child) => {
                return child.toString();
            })
            .join('');
    }
    /**
     * Replace the children by the result of parsing `content`; when parsing
     * yields no nodes the content is stored as one text node.
     */
    set innerHTML(content) {
        const r = parse(content, this._parseOptions);
        const nodes = r.childNodes.length ? r.childNodes : [new text_1.default(content, this)];
        resetParent(nodes, this);
        resetParent(this.childNodes, null);
        this.childNodes = nodes;
    }
    /**
     * Replace the children of this element.
     * @param content a node, an array of nodes, or an HTML string to parse
     * @param [options] parse options merged over this element's own options (strings only)
     * @return {HTMLElement} this element
     */
    set_content(content, options = {}) {
        if (content instanceof node_1.default) {
            content = [content];
        }
        else if (typeof content == 'string') {
            options = Object.assign(Object.assign({}, this._parseOptions), options);
            const r = parse(content, options);
            content = r.childNodes.length ? r.childNodes : [new text_1.default(r.innerHTML, this)];
        }
        resetParent(this.childNodes, null);
        resetParent(content, this);
        this.childNodes = content;
        return this;
    }
    /**
     * Replace this element in its parent by the given nodes / HTML strings.
     * Does nothing when the element has no parent or is no longer part of
     * its parent's children.
     * @param nodes nodes or HTML strings
     * @return {HTMLElement} this element
     */
    replaceWith(...nodes) {
        const parent = this.parentNode;
        if (!parent) {
            return this;
        }
        const idx = parent.childNodes.findIndex((child) => {
            return child === this;
        });
        if (idx === -1) {
            // Splicing around index -1 would drop/duplicate unrelated children.
            return this;
        }
        const content = nodes
            .map((node) => {
                if (node instanceof node_1.default) {
                    return [node];
                }
                else if (typeof node == 'string') {
                    const r = parse(node, this._parseOptions);
                    return r.childNodes.length ? r.childNodes : [new text_1.default(node, this)];
                }
                return [];
            })
            .flat();
        resetParent([this], null);
        parent.childNodes = [...parent.childNodes.slice(0, idx), ...resetParent(content, parent), ...parent.childNodes.slice(idx + 1)];
        return this;
    }
    /** Same as toString(). */
    get outerHTML() {
        return this.toString();
    }
    /**
     * Trim element from right (in block) after seeing pattern in a TextNode.
     * @param {RegExp} pattern pattern to find
     * @return {HTMLElement} reference to current node
     */
    trimRight(pattern) {
        for (let i = 0; i < this.childNodes.length; i++) {
            const childNode = this.childNodes[i];
            if (childNode.nodeType === type_1.default.ELEMENT_NODE) {
                childNode.trimRight(pattern);
            }
            else {
                const index = childNode.rawText.search(pattern);
                if (index > -1) {
                    childNode.rawText = childNode.rawText.slice(0, index);
                    // trim all following nodes.
                    this.childNodes.length = i + 1;
                }
            }
        }
        return this;
    }
    /**
     * Get DOM structure
     * One line per element (`tag#id.class`) or non-whitespace text node
     * (`#text`), indented by depth.
     * @return {string} structure
     */
    get structure() {
        const res = [];
        let indention = 0;
        function write(str) {
            res.push(' '.repeat(indention) + str);
        }
        function dfs(node) {
            const idStr = node.id ? `#${node.id}` : '';
            const classStr = node.classList.length ? `.${node.classList.value.join('.')}` : '';
            write(`${node.rawTagName}${idStr}${classStr}`);
            indention++;
            node.childNodes.forEach((childNode) => {
                if (childNode.nodeType === type_1.default.ELEMENT_NODE) {
                    dfs(childNode);
                }
                else if (childNode.nodeType === type_1.default.TEXT_NODE) {
                    if (!childNode.isWhitespace) {
                        write('#text');
                    }
                }
            });
            indention--;
        }
        dfs(this);
        return res.join('\n');
    }
    /**
     * Remove whitespaces in this sub tree.
     * Whitespace-only text nodes are dropped, other text nodes are trimmed and
     * the attribute string is rebuilt with single spaces between attributes.
     * @return {HTMLElement} pointer to this
     */
    removeWhitespace() {
        let o = 0;
        this.childNodes.forEach((node) => {
            if (node.nodeType === type_1.default.TEXT_NODE) {
                if (node.isWhitespace) {
                    return;
                }
                node.rawText = node.trimmedRawText;
            }
            else if (node.nodeType === type_1.default.ELEMENT_NODE) {
                node.removeWhitespace();
            }
            // Compact the kept nodes to the front of the array.
            this.childNodes[o++] = node;
        });
        this.childNodes.length = o;
        // remove whitespace between attributes; value-less attributes stay
        // bare names instead of being written as `name=null`.
        this.rawAttrs = stringifyRawAttributes(this.rawAttributes, (val) => this.quoteAttribute(val));
        delete this._rawAttrs;
        return this;
    }
    /**
     * Query CSS selector to find matching nodes.
     * @param {string} selector Simplified CSS selector
     * @return {HTMLElement[]} matching elements
     */
    querySelectorAll(selector) {
        return (0, css_select_1.selectAll)(selector, this, {
            xmlMode: true,
            adapter: matcher_1.default,
        });
    }
    /**
     * Query CSS Selector to find matching node.
     * @param {string} selector Simplified CSS selector
     * @return {(HTMLElement|null)} matching node
     */
    querySelector(selector) {
        return (0, css_select_1.selectOne)(selector, this, {
            xmlMode: true,
            adapter: matcher_1.default,
        });
    }
    /**
     * find elements by their tagName
     * @param {string} tagName the tagName of the elements to select (`*` selects every element)
     * @return {HTMLElement[]} matching descendants in document order
     */
    getElementsByTagName(tagName) {
        const upperCasedTagName = tagName.toUpperCase();
        const re = [];
        const stack = [];
        let currentNodeReference = this;
        let index = 0;
        // index turns to undefined once the stack is empty and the first condition occurs
        // which happens once all relevant children are searched through
        while (index !== undefined) {
            let child;
            // make it work with sparse arrays
            do {
                child = currentNodeReference.childNodes[index++];
            } while (index < currentNodeReference.childNodes.length && child === undefined);
            // if the child does not exist we move on with the last provided index (which belongs to the parentNode)
            if (child === undefined) {
                currentNodeReference = currentNodeReference.parentNode;
                index = stack.pop();
                continue;
            }
            if (child.nodeType === type_1.default.ELEMENT_NODE) {
                // https://developer.mozilla.org/en-US/docs/Web/API/Element/getElementsByTagName#syntax
                if (tagName === '*' || child.tagName === upperCasedTagName)
                    re.push(child);
                // if children are existing push the current status to the stack and keep searching for elements in the level below
                if (child.childNodes.length > 0) {
                    stack.push(index);
                    currentNodeReference = child;
                    index = 0;
                }
            }
        }
        return re;
    }
    /**
     * find element by its id
     * @param {string} id the id of the element to select
     * @returns {HTMLElement | null} the element with the given id or null if not found
     */
    getElementById(id) {
        const stack = [];
        let currentNodeReference = this;
        let index = 0;
        // index turns to undefined once the stack is empty and the first condition occurs
        // which happens once all relevant children are searched through
        while (index !== undefined) {
            let child;
            // make it work with sparse arrays
            do {
                child = currentNodeReference.childNodes[index++];
            } while (index < currentNodeReference.childNodes.length && child === undefined);
            // if the child does not exist we move on with the last provided index (which belongs to the parentNode)
            if (child === undefined) {
                currentNodeReference = currentNodeReference.parentNode;
                index = stack.pop();
                continue;
            }
            if (child.nodeType === type_1.default.ELEMENT_NODE) {
                if (child.id === id) {
                    return child;
                }
                // if children are existing push the current status to the stack and keep searching for elements in the level below
                if (child.childNodes.length > 0) {
                    stack.push(index);
                    currentNodeReference = child;
                    index = 0;
                }
            }
        }
        return null;
    }
    /**
     * traverses the Element and its parents (heading toward the document root) until it finds a node that matches the provided selector string. Will return itself or the matching ancestor. If no such element exists, it returns null.
     * @param selector a DOMString containing a selector list
     * @returns {HTMLElement | null} the matching element or null if not found
     */
    closest(selector) {
        // Maps every ancestor to the child through which `this` is reached, so
        // the selector engine only ever sees the ancestor chain.
        const mapChild = new Map();
        let el = this;
        let old = null;
        function findOne(test, elems) {
            let elem = null;
            for (let i = 0, l = elems.length; i < l && !elem; i++) {
                const el = elems[i];
                if (test(el)) {
                    elem = el;
                }
                else {
                    const child = mapChild.get(el);
                    if (child) {
                        elem = findOne(test, [child]);
                    }
                }
            }
            return elem;
        }
        while (el) {
            mapChild.set(el, old);
            old = el;
            el = el.parentNode;
        }
        el = this;
        while (el) {
            const e = (0, css_select_1.selectOne)(selector, el, {
                xmlMode: true,
                adapter: Object.assign(Object.assign({}, matcher_1.default), { getChildren(node) {
                        const child = mapChild.get(node);
                        return child && [child];
                    },
                    getSiblings(node) {
                        return [node];
                    },
                    findOne,
                    findAll() {
                        return [];
                    } }),
            });
            if (e) {
                return e;
            }
            el = el.parentNode;
        }
        return null;
    }
    /**
     * Append a child node to childNodes
     * @param {Node} node node to append
     * @return {Node} node appended
     */
    appendChild(node) {
        this.append(node);
        return node;
    }
    /**
     * Get attributes
     * Keys are lower-cased; the result is cached on first access.
     * @access private
     * @return {Object} parsed and unescaped attributes
     */
    get attrs() {
        if (this._attrs) {
            return this._attrs;
        }
        this._attrs = {};
        const attrs = this.rawAttributes;
        for (const key in attrs) {
            const val = attrs[key] || '';
            this._attrs[key.toLowerCase()] = decode(val);
        }
        return this._attrs;
    }
    /**
     * Get a fresh map of decoded attributes; keys keep their original case.
     * @return {Object} parsed and unescaped attributes
     */
    get attributes() {
        const ret_attrs = {};
        const attrs = this.rawAttributes;
        for (const key in attrs) {
            const val = attrs[key] || '';
            ret_attrs[key] = decode(val);
        }
        return ret_attrs;
    }
    /**
     * Get escaped (as-is) attributes
     * Parsed from `rawAttrs` on first access and cached afterwards;
     * attributes without a value map to null.
     * @return {Object} parsed attributes
     */
    get rawAttributes() {
        if (this._rawAttrs) {
            return this._rawAttrs;
        }
        const attrs = {};
        if (this.rawAttrs) {
            const re = /([a-zA-Z()[\]#@$.?:][a-zA-Z0-9-._:()[\]#]*)(?:\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+))?/g;
            let match;
            while ((match = re.exec(this.rawAttrs))) {
                const key = match[1];
                let val = match[2] || null;
                if (val && (val[0] === `'` || val[0] === `"`))
                    val = val.slice(1, val.length - 1);
                // The first occurrence of an attribute wins.
                attrs[key] = attrs[key] || val;
            }
        }
        this._rawAttrs = attrs;
        return attrs;
    }
    /**
     * Remove an attribute. The name is matched case-insensitively, like in
     * getAttribute / hasAttribute / setAttribute.
     * @param {string} key The attribute name
     * @return {HTMLElement} this element
     */
    removeAttribute(key) {
        const attrs = this.rawAttributes;
        const k2 = key.toLowerCase();
        for (const name of Object.keys(attrs)) {
            if (name.toLowerCase() === k2) {
                delete attrs[name];
            }
        }
        // Update this.attrs (its keys are always lower case)
        if (this._attrs) {
            delete this._attrs[k2];
        }
        // Update rawString
        this.rawAttrs = stringifyRawAttributes(attrs, (val) => this.quoteAttribute(val));
        // Update this.id
        if (k2 === 'id') {
            this.id = '';
        }
        return this;
    }
    /**
     * @param {string} key The attribute name (case-insensitive)
     * @return {boolean} whether the attribute exists
     */
    hasAttribute(key) {
        return key.toLowerCase() in this.attrs;
    }
    /**
     * Get an attribute
     * @param {string} key The attribute name (case-insensitive)
     * @return {string | undefined} value of the attribute; or undefined if not exist
     */
    getAttribute(key) {
        return this.attrs[key.toLowerCase()];
    }
    /**
     * Set an attribute value to the HTMLElement
     * @param {string} key The attribute name; an existing attribute is matched case-insensitively
     * @param {string} value The value to set; it is converted with String()
     * @return {HTMLElement} this element
     * @throws {Error} when called with fewer than two arguments
     */
    setAttribute(key, value) {
        if (arguments.length < 2) {
            throw new Error("Failed to execute 'setAttribute' on 'Element'");
        }
        const k2 = key.toLowerCase();
        const attrs = this.rawAttributes;
        // Reuse the spelling of an already present attribute.
        for (const k in attrs) {
            if (k.toLowerCase() === k2) {
                key = k;
                break;
            }
        }
        attrs[key] = String(value);
        // update this.attrs
        if (this._attrs) {
            this._attrs[k2] = decode(attrs[key]);
        }
        // Update rawString
        this.rawAttrs = stringifyRawAttributes(attrs, (val) => this.quoteAttribute(val));
        // Update this.id (regardless of how the attribute name is cased)
        if (k2 === 'id') {
            this.id = attrs[key];
        }
        return this;
    }
    /**
     * Replace all the attributes of the HTMLElement by the provided attributes
     * @param {Attributes} attributes the new attribute set
     * @return {HTMLElement} this element
     */
    setAttributes(attributes) {
        // Invalidate current this.attributes
        if (this._attrs) {
            delete this._attrs;
        }
        // Invalidate current this.rawAttributes
        if (this._rawAttrs) {
            delete this._rawAttrs;
        }
        const names = Object.keys(attributes);
        // Update rawString
        this.rawAttrs = names
            .map((name) => {
                const val = attributes[name];
                if (val === 'null' || val === '""')
                    return name;
                return `${name}=${this.quoteAttribute(String(val))}`;
            })
            .join(' ');
        // Keep this.id in sync with the new attribute set.
        const idName = names.find((name) => name.toLowerCase() === 'id');
        this.id = idName === undefined || attributes[idName] == null ? '' : String(attributes[idName]);
        return this;
    }
    /**
     * Parse `html` and insert the resulting nodes relative to this element.
     * @param {string} where 'beforebegin', 'afterbegin', 'beforeend' or 'afterend'
     * @param {string} html markup to parse
     * @return {HTMLElement} this element
     * @throws {Error} on a missing argument or an unknown position
     */
    insertAdjacentHTML(where, html) {
        if (arguments.length < 2) {
            throw new Error('2 arguments required');
        }
        const p = parse(html, this._parseOptions);
        if (where === 'afterend') {
            this.after(...p.childNodes);
        }
        else if (where === 'afterbegin') {
            this.prepend(...p.childNodes);
        }
        else if (where === 'beforeend') {
            this.append(...p.childNodes);
        }
        else if (where === 'beforebegin') {
            this.before(...p.childNodes);
        }
        else {
            throw new Error(`The value provided ('${where}') is not one of 'beforebegin', 'afterbegin', 'beforeend', or 'afterend'`);
        }
        return this;
    }
    /** Prepend nodes or strings to this node's children. */
    prepend(...insertable) {
        const nodes = resolveInsertable(insertable);
        resetParent(nodes, this);
        this.childNodes.unshift(...nodes);
    }
    /** Append nodes or strings to this node's children. */
    append(...insertable) {
        const nodes = resolveInsertable(insertable);
        resetParent(nodes, this);
        this.childNodes.push(...nodes);
    }
    /** Insert nodes or strings before this node. Does nothing without a parent. */
    before(...insertable) {
        if (!this.parentNode) {
            return;
        }
        const nodes = resolveInsertable(insertable);
        const siblings = this.parentNode.childNodes;
        resetParent(nodes, this.parentNode);
        siblings.splice(siblings.indexOf(this), 0, ...nodes);
    }
    /** Insert nodes or strings after this node. Does nothing without a parent. */
    after(...insertable) {
        if (!this.parentNode) {
            return;
        }
        const nodes = resolveInsertable(insertable);
        const siblings = this.parentNode.childNodes;
        resetParent(nodes, this.parentNode);
        siblings.splice(siblings.indexOf(this) + 1, 0, ...nodes);
    }
    /** The node following this one in its parent, or null. */
    get nextSibling() {
        if (!this.parentNode) {
            return null;
        }
        const children = this.parentNode.childNodes;
        const idx = children.indexOf(this);
        if (idx === -1) {
            return null;
        }
        return children[idx + 1] || null;
    }
    /** The next sibling that is an HTMLElement, or null. */
    get nextElementSibling() {
        if (!this.parentNode) {
            return null;
        }
        const children = this.parentNode.childNodes;
        const start = children.indexOf(this);
        if (start === -1) {
            return null;
        }
        for (let i = start + 1; i < children.length; i++) {
            if (children[i] instanceof HTMLElement) {
                return children[i];
            }
        }
        return null;
    }
    /** The node preceding this one in its parent, or null. */
    get previousSibling() {
        if (!this.parentNode) {
            return null;
        }
        const children = this.parentNode.childNodes;
        const idx = children.lastIndexOf(this);
        if (idx === -1) {
            return null;
        }
        return children[idx - 1] || null;
    }
    /** The closest preceding sibling that is an HTMLElement, or null. */
    get previousElementSibling() {
        if (!this.parentNode) {
            return null;
        }
        const children = this.parentNode.childNodes;
        for (let i = children.lastIndexOf(this) - 1; i >= 0; i--) {
            if (children[i] instanceof HTMLElement) {
                return children[i];
            }
        }
        return null;
    }
    /** Get all childNodes of type {@link HTMLElement}. */
    get children() {
        const children = [];
        for (const childNode of this.childNodes) {
            if (childNode instanceof HTMLElement) {
                children.push(childNode);
            }
        }
        return children;
    }
    /**
     * Get the first child node.
     * @return The first child or undefined if none exists.
     */
    get firstChild() {
        return this.childNodes[0];
    }
    /**
     * Get the first child node of type {@link HTMLElement}.
     * @return The first child element or undefined if none exists.
     */
    get firstElementChild() {
        return this.children[0];
    }
    /**
     * Get the last child node.
     * @return The last child or undefined if none exists.
     */
    get lastChild() {
        return (0, back_1.default)(this.childNodes);
    }
    /**
     * Get the last child node of type {@link HTMLElement}.
     * @return The last child element or undefined if none exists.
     */
    get lastElementChild() {
        const elements = this.children;
        return elements[elements.length - 1];
    }
    /** Number of children of type {@link HTMLElement}. */
    get childElementCount() {
        return this.children.length;
    }
    /** The class tokens joined by single spaces. */
    get classNames() {
        return this.classList.toString();
    }
    /** Clone this Node by serialising and re-parsing it. */
    clone() {
        return parse(this.toString(), this._parseOptions).firstChild;
    }
}
exports.default = HTMLElement;
// Characters allowed in (custom) element names besides ASCII letters, digits and "-.:_@":
// #xB7 | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x203F-#x2040] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
// https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name
//
// Matches either a comment or a tag. Capture groups for a tag:
//   1: leading slash (closing tag), 2: tag name, 3: attribute string, 4: trailing slash (self-closing).
// The range starting at U+037F is written as \u037F: the previous spelling `\x37F` is parsed as
// the character "7" followed by the range "F"-U+1FFF, which let characters such as "{", "|",
// "[" or "^" slip into tag names.
// NOTE: the regex carries the `g` flag, so its `lastIndex` is shared state between uses.
const kMarkupPattern = /<!--[\s\S]*?-->|<(\/?)([a-zA-Z][-.:0-9_a-zA-Z@\xB7\xC0-\xD6\xD8-\xF6\u00F8-\u03A1\u03A3-\u03D9\u03DB-\u03EF\u03F7-\u03FF\u0400-\u04FF\u0500-\u052F\u1D00-\u1D2B\u1D6B-\u1D77\u1D79-\u1D9A\u1E00-\u1E9B\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126\u212A-\u212B\u2132\u214E\u2160-\u2188\u2C60-\u2C7F\uA722-\uA787\uA78B-\uA78E\uA790-\uA7AD\uA7B0-\uA7B7\uA7F7-\uA7FF\uAB30-\uAB5A\uAB5C-\uAB5F\uAB64-\uAB65\uFB00-\uFB06\uFB13-\uFB17\uFF21-\uFF3A\uFF41-\uFF5A\u037F-\u1FFF\u200C-\u200D\u203F-\u2040\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]*)((?:\s+[^>]*?(?:(?:'[^']*')|(?:"[^"]*"))?)*)\s*(\/?)>/gu;
// Extracts the `id` and `class` attributes (group 1: name, group 2: raw, possibly quoted value).
const kAttributePattern = /(?:^|\s)(id|class)\s*=\s*((?:'[^']*')|(?:"[^"]*")|\S+)/gi;
// kElementsClosedByOpening[parent][child]: opening <child> implicitly closes an open <parent>.
// Both lower- and upper-case spellings are listed because tag names are compared as written.
const kElementsClosedByOpening = {
    li: { li: true, LI: true },
    LI: { li: true, LI: true },
    p: { p: true, div: true, P: true, DIV: true },
    P: { p: true, div: true, P: true, DIV: true },
    b: { div: true, DIV: true },
    B: { div: true, DIV: true },
    td: { td: true, th: true, TD: true, TH: true },
    TD: { td: true, th: true, TD: true, TH: true },
    th: { td: true, th: true, TD: true, TH: true },
    TH: { td: true, th: true, TD: true, TH: true },
    h1: { h1: true, H1: true },
    H1: { h1: true, H1: true },
    h2: { h2: true, H2: true },
    H2: { h2: true, H2: true },
    h3: { h3: true, H3: true },
    H3: { h3: true, H3: true },
    h4: { h4: true, H4: true },
    H4: { h4: true, H4: true },
    h5: { h5: true, H5: true },
    H5: { h5: true, H5: true },
    h6: { h6: true, H6: true },
    H6: { h6: true, H6: true },
};
// kElementsClosedByClosing[open][closing]: a closing </closing> tag implicitly closes an open <open>.
const kElementsClosedByClosing = {
    li: { ul: true, ol: true, UL: true, OL: true },
    LI: { ul: true, ol: true, UL: true, OL: true },
    a: { div: true, DIV: true },
    A: { div: true, DIV: true },
    b: { div: true, DIV: true },
    B: { div: true, DIV: true },
    i: { div: true, DIV: true },
    I: { div: true, DIV: true },
    p: { div: true, DIV: true },
    P: { div: true, DIV: true },
    td: { tr: true, table: true, TR: true, TABLE: true },
    TD: { tr: true, table: true, TR: true, TABLE: true },
    th: { tr: true, table: true, TR: true, TABLE: true },
    TH: { tr: true, table: true, TR: true, TABLE: true },
};
// Name of the synthetic element the parser wraps around its input.
const frameflag = 'documentfragmentcontainer';
/**
 * Find the closing markup of a block-text element.
 * @param {string} data text to search
 * @param {string} closeMarkup closing tag to look for, e.g. `</script>`
 * @param {number} fromIndex position to start searching at
 * @param {boolean} ignoreCase whether the tag name is matched case-insensitively
 * @return {number} index of the closing markup in `data`, or -1
 */
function indexOfClosingTag(data, closeMarkup, fromIndex, ignoreCase) {
    if (!ignoreCase) {
        return data.indexOf(closeMarkup, fromIndex);
    }
    // A case-insensitive regex search avoids lower-casing the whole document
    // for every element, and keeps the returned index valid for `data`
    // (lower-casing can change the length of a string).
    const pattern = new RegExp(closeMarkup.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'gi');
    pattern.lastIndex = fromIndex;
    const found = pattern.exec(data);
    return found ? found.index : -1;
}
/**
 * Parse a chunk of HTML source into a tree of nodes.
 *
 * The input is wrapped in a synthetic container element, tokenised with
 * `kMarkupPattern`, and text / comment / element nodes are attached to the
 * element that is currently open.
 * @param {string} data html
 * @param {Object} [options] parse options; the ones read here are `voidTag`,
 *   `blockTextElements`, `comment`, `lowerCaseTagName` and `fixNestedATags`
 * @return {HTMLElement[]} the elements still open at the end of the input;
 *   the first entry is the root element
 */
function base_parse(data, options = {}) {
    const voidTagOptions = options === null || options === undefined ? undefined : options.voidTag;
    const voidTag = new void_tag_1.default(voidTagOptions === null || voidTagOptions === undefined ? undefined : voidTagOptions.closingSlash, voidTagOptions === null || voidTagOptions === undefined ? undefined : voidTagOptions.tags);
    // Elements whose content is not parsed as markup. A truthy value means the
    // content is kept as a text node, a falsy one means it is dropped.
    const elements = options.blockTextElements || {
        script: true,
        noscript: true,
        style: true,
        pre: true,
    };
    const element_names = Object.keys(elements);
    const kBlockTextElements = element_names.map((it) => new RegExp(`^${it}$`, 'i'));
    const kIgnoreElements = element_names.filter((it) => Boolean(elements[it])).map((it) => new RegExp(`^${it}$`, 'i'));
    function element_should_be_ignore(tag) {
        return kIgnoreElements.some((it) => it.test(tag));
    }
    function is_block_text_element(tag) {
        return kBlockTextElements.some((it) => it.test(tag));
    }
    const openFrame = `<${frameflag}>`;
    const closeFrame = `</${frameflag}>`;
    // Positions are measured in the wrapped text; ranges are reported relative to the original input.
    const frameFlagOffset = openFrame.length;
    const createRange = (startPos, endPos) => [startPos - frameFlagOffset, endPos - frameFlagOffset];
    const root = new HTMLElement(null, {}, '', null, [0, data.length], voidTag, options);
    let currentParent = root;
    const stack = [root];
    let lastTextPos = -1;
    let noNestedTagIndex = undefined;
    let match;
    // https://github.com/taoqf/node-html-parser/issues/38
    data = `${openFrame}${data}${closeFrame}`;
    const { lowerCaseTagName, fixNestedATags } = options;
    // End of the original input inside the wrapped text, i.e. the position
    // where the closing frame tag starts.
    const dataEndPos = data.length - closeFrame.length;
    // The pattern is a shared global regex; make sure a previous, interrupted
    // run cannot make this one start in the middle of the text.
    kMarkupPattern.lastIndex = 0;
    while ((match = kMarkupPattern.exec(data))) {
        // Note: Object destructuring here consistently tests as higher performance than array destructuring
        // eslint-disable-next-line prefer-const
        let { 0: matchText, 1: leadingSlash, 2: tagName, 3: attributes, 4: closingSlash } = match;
        const matchLength = matchText.length;
        const tagEndPos = kMarkupPattern.lastIndex;
        const tagStartPos = tagEndPos - matchLength;
        // Add TextNode if there is content between the previous match and this one
        if (lastTextPos > -1 && lastTextPos < tagStartPos) {
            const text = data.substring(lastTextPos, tagStartPos);
            currentParent.appendChild(new text_1.default(text, currentParent, createRange(lastTextPos, tagStartPos)));
        }
        lastTextPos = kMarkupPattern.lastIndex;
        // https://github.com/taoqf/node-html-parser/issues/38
        // Skip frameflag node
        if (tagName === frameflag)
            continue;
        // Handle comments
        if (matchText[1] === '!') {
            if (options.comment) {
                // Only keep what is in between <!-- and -->
                const text = data.substring(tagStartPos + 4, tagEndPos - 3);
                currentParent.appendChild(new comment_1.default(text, currentParent, createRange(tagStartPos, tagEndPos)));
            }
            continue;
        }
        /* -- Handle tag matching -- */
        // Fix tag casing if necessary
        if (lowerCaseTagName)
            tagName = tagName.toLowerCase();
        // Handle opening tags (ie. <this> not </that>)
        if (!leadingSlash) {
            /* Populate attributes */
            const attrs = {};
            kAttributePattern.lastIndex = 0;
            let attMatch;
            while ((attMatch = kAttributePattern.exec(attributes))) {
                const { 1: key, 2: val } = attMatch;
                const isQuoted = val[0] === `'` || val[0] === `"`;
                attrs[key.toLowerCase()] = isQuoted ? val.slice(1, val.length - 1) : val;
            }
            const parentTagName = currentParent.rawTagName;
            // Opening this tag may implicitly close the currently open element (e.g. <li> inside <li>).
            if (!closingSlash && kElementsClosedByOpening[parentTagName]) {
                if (kElementsClosedByOpening[parentTagName][tagName]) {
                    stack.pop();
                    currentParent = (0, back_1.default)(stack);
                }
            }
            // Prevent nested A tags by terminating the last A and starting a new one : see issue #144
            if (fixNestedATags && (tagName === 'a' || tagName === 'A')) {
                if (noNestedTagIndex !== undefined) {
                    stack.splice(noNestedTagIndex);
                    currentParent = (0, back_1.default)(stack);
                }
                noNestedTagIndex = stack.length;
            }
            currentParent = currentParent.appendChild(
            // Initialize range (end position updated later for closed tags)
            new HTMLElement(tagName, attrs, attributes.slice(1), null, createRange(tagStartPos, tagEndPos), voidTag, options));
            stack.push(currentParent);
            if (is_block_text_element(tagName)) {
                // Find closing tag
                const closeMarkup = `</${tagName}>`;
                const closeIndex = indexOfClosingTag(data, closeMarkup, kMarkupPattern.lastIndex, Boolean(lowerCaseTagName));
                const textEndPos = closeIndex === -1 ? dataEndPos : closeIndex;
                if (element_should_be_ignore(tagName)) {
                    const text = data.substring(tagEndPos, textEndPos);
                    if (text.length > 0 && /\S/.test(text)) {
                        currentParent.appendChild(new text_1.default(text, currentParent, createRange(tagEndPos, textEndPos)));
                    }
                }
                if (closeIndex === -1) {
                    // No closing tag: the rest of the input belonged to this
                    // element, so move past the end to stop tokenising.
                    lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
                }
                else {
                    // Continue after the closing tag and fall through to the
                    // closing logic below as if that tag had just been matched.
                    lastTextPos = kMarkupPattern.lastIndex = closeIndex + closeMarkup.length;
                    leadingSlash = '/';
                }
            }
        }
        // Handle closing tags or self-closed elements (ie </tag> or <br>)
        if (leadingSlash || closingSlash || voidTag.isVoidElement(tagName)) {
            while (true) {
                if (noNestedTagIndex != null && (tagName === 'a' || tagName === 'A'))
                    noNestedTagIndex = undefined;
                if (currentParent.rawTagName === tagName) {
                    // Update range end for closed tag
                    currentParent.range[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
                    stack.pop();
                    currentParent = (0, back_1.default)(stack);
                    break;
                }
                else {
                    const parentTagName = currentParent.tagName;
                    // Trying to close current tag, and move on
                    if (kElementsClosedByClosing[parentTagName]) {
                        if (kElementsClosedByClosing[parentTagName][tagName]) {
                            stack.pop();
                            currentParent = (0, back_1.default)(stack);
                            continue;
                        }
                    }
                    // Use aggressive strategy to handle unmatching markups.
                    break;
                }
            }
        }
    }
    return stack;
}
exports.base_parse = base_parse;
/**
 * Parse a chunk of HTML source and return the root element.
 *
 * Elements that `base_parse` reports as still open at the end of the input
 * are repaired here, unless `options.parseNoneClosedTags` is `true`.
 * @param {string} data html
 * @param {Object} [options] parse options, forwarded to `base_parse`
 * @return {HTMLElement} root element
 */
function parse(data, options = {}) {
    const stack = base_parse(data, options);
    const root = stack[0];
    const keepUnclosed = options.parseNoneClosedTags === true;
    while (stack.length > 1) {
        // Handle each error element, innermost first.
        const last = stack.pop();
        const oneBefore = (0, back_1.default)(stack);
        const hasGrandParent = Boolean(last.parentNode && last.parentNode.parentNode);
        if (!hasGrandParent || keepUnclosed) {
            // Either the final element (nothing to repair) or repairing is disabled.
            continue;
        }
        const isPairError = last.parentNode === oneBefore && last.tagName === oneBefore.tagName;
        oneBefore.removeChild(last);
        if (isPairError) {
            // Pair error case <h3> <h3>: handled as <h3> </h3>, the children of
            // the second element move up next to the first one (see issue 152).
            for (const child of last.childNodes) {
                oneBefore.parentNode.appendChild(child);
            }
            stack.pop();
        }
        else {
            // Single error <div> <h3> </div>: the unclosed element is removed
            // and its children are kept in the enclosing element (see issue 152).
            for (const child of last.childNodes) {
                oneBefore.appendChild(child);
            }
        }
    }
    return root;
}
exports.parse = parse;
/**
 * Turn a list of {@link NodeInsertable} into a list of nodes: strings become
 * new text nodes, every other value has `remove()` called on it and is
 * passed through.
 * @param insertable strings and/or nodes
 * @return resulting nodes, in the same order
 */
function resolveInsertable(insertable) {
    const toNode = (item) => {
        if (typeof item !== 'string') {
            item.remove();
            return item;
        }
        return new text_1.default(item);
    };
    return insertable.map((item) => toNode(item));
}
/**
 * Assign `parent` as the `parentNode` of every given node.
 * @param nodes nodes to update
 * @param parent new parent (may be null)
 * @return a new array holding the same nodes
 */
function resetParent(nodes, parent) {
    const assignParent = (node) => {
        node.parentNode = parent;
        return node;
    };
    return nodes.map((node) => assignParent(node));
}