UNPKG

jssoup

Version:

JSSoup is a BeautifulSoup-style HTML parser library.

851 lines (678 loc) 26.3 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports["default"] = void 0; var _builder = _interopRequireDefault(require("./builder.js")); function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { "default": obj }; } function _typeof(obj) { "@babel/helpers - typeof"; if (typeof Symbol === "function" && typeof Symbol.iterator === "symbol") { _typeof = function _typeof(obj) { return typeof obj; }; } else { _typeof = function _typeof(obj) { return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; }; } return _typeof(obj); } function _inherits(subClass, superClass) { if (typeof superClass !== "function" && superClass !== null) { throw new TypeError("Super expression must either be null or a function"); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, writable: true, configurable: true } }); if (superClass) _setPrototypeOf(subClass, superClass); } function _setPrototypeOf(o, p) { _setPrototypeOf = Object.setPrototypeOf || function _setPrototypeOf(o, p) { o.__proto__ = p; return o; }; return _setPrototypeOf(o, p); } function _createSuper(Derived) { var hasNativeReflectConstruct = _isNativeReflectConstruct(); return function _createSuperInternal() { var Super = _getPrototypeOf(Derived), result; if (hasNativeReflectConstruct) { var NewTarget = _getPrototypeOf(this).constructor; result = Reflect.construct(Super, arguments, NewTarget); } else { result = Super.apply(this, arguments); } return _possibleConstructorReturn(this, result); }; } function _possibleConstructorReturn(self, call) { if (call && (_typeof(call) === "object" || typeof call === "function")) { return call; } else if (call !== void 0) { throw new TypeError("Derived constructors may only return object or undefined"); } return _assertThisInitialized(self); } function _assertThisInitialized(self) { if (self === void 0) { throw new 
ReferenceError("this hasn't been initialised - super() hasn't been called"); } return self; } function _isNativeReflectConstruct() { if (typeof Reflect === "undefined" || !Reflect.construct) return false; if (Reflect.construct.sham) return false; if (typeof Proxy === "function") return true; try { Boolean.prototype.valueOf.call(Reflect.construct(Boolean, [], function () {})); return true; } catch (e) { return false; } } function _getPrototypeOf(o) { _getPrototypeOf = Object.setPrototypeOf ? Object.getPrototypeOf : function _getPrototypeOf(o) { return o.__proto__ || Object.getPrototypeOf(o); }; return _getPrototypeOf(o); } function _classCallCheck(instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } } function _defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } function _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); return Constructor; } var htmlparser = require('htmlparser'); if (typeof module !== 'undefined' && typeof module.exports !== 'undefined') { try { htmlparser = Tautologistics.NodeHtmlParser; } catch (e) {} } else { htmlparser = Tautologistics.NodeHtmlParser; } var SoupElement = /*#__PURE__*/function () { function SoupElement() { var parent = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : null; var previousElement = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : null; var nextElement = arguments.length > 2 && arguments[2] !== undefined ? 
arguments[2] : null; _classCallCheck(this, SoupElement); this.parent = parent; this.previousElement = previousElement; this.nextElement = nextElement; } _createClass(SoupElement, [{ key: "nextSibling", get: function get() { if (!this.parent) return undefined; var index = this.parent.contents.indexOf(this); if (index == this.parent.contents.length - 1) return undefined; return this.parent.contents[index + 1]; } }, { key: "previousSibling", get: function get() { if (!this.parent) return undefined; var index = this.parent.contents.indexOf(this); if (index == 0) return undefined; return this.parent.contents[index - 1]; } }, { key: "nextSiblings", get: function get() { if (!this.parent) return undefined; var index = this.parent.contents.indexOf(this); if (index == this.parent.contents.length - 1) return undefined; return this.parent.contents.slice(index + 1); } }, { key: "previousSiblings", get: function get() { if (!this.parent) return undefined; var index = this.parent.contents.indexOf(this); if (index == 0) return undefined; return this.parent.contents.slice(0, index); } // remove item from dom tree }, { key: "extract", value: function extract() { var extractFirst = this; var extractLast = this; var descendants = this.descendants; if (descendants && descendants.length) { extractLast = descendants[descendants.length - 1]; } // these two maybe null var before = this.previousElement; var after = extractLast.nextElement; // modify extract subtree extractFirst.previousElement = null; extractLast.nextElement = null; if (before) { before.nextElement = after; } if (after) { after.previousElement = before; } //remove node from contents array if (this.parent) { var index = this.parent.contents.indexOf(this); if (index >= 0) { this.parent.contents.splice(index, 1); } } this.parent = null; } }, { key: "insert", value: function insert(index, newElement) { var _this = this; if (newElement == null) { throw "Cannot insert null element!"; } if (newElement === this) { throw "Cannot 
add one itself!"; } if (!(this instanceof SoupTag)) { throw "insert is not support in " + this.constructor.name; } if (index < 0) { throw "index cannot be negative!"; } if (newElement instanceof JSSoup) { newElement.contents.forEach(function (element) { _this.insert(index, element); ++index; }); return; } index = Math.min(index, this.contents.length); if (typeof newElement == 'string') { newElement = new SoupString(newElement); } if (newElement.parent) { if (newElement.parent === this) { var curIndex = this.contents.indexOf(newElement); if (index == curIndex) return; if (index > curIndex) { --index; } } newElement.extract(); } var count = this.contents.length; var descendantsOfNewElement = newElement.descendants; var lastElementOfNewElement = descendantsOfNewElement && descendantsOfNewElement.length > 0 ? descendantsOfNewElement[descendantsOfNewElement.length - 1] : newElement; // handle previous element of newElement if (index == 0) { newElement.previousElement = this; } else { var previousChild = this.contents[index - 1]; var previousDescendants = previousChild.descendants; newElement.previousElement = previousDescendants && previousDescendants.length > 0 ? 
previousDescendants[previousDescendants.length - 1] : previousChild; } if (newElement.previousElement) { newElement.previousElement.nextElement = newElement; } // handle next element of newElement if (index < count) { lastElementOfNewElement.nextElement = this.contents[index]; } else { var parent = this; var parentNextSibling = null; while (!parentNextSibling && parent) { parentNextSibling = parent.nextSibling; parent = parent.parent; } if (parentNextSibling) { lastElementOfNewElement.nextElement = parentNextSibling; } else { lastElementOfNewElement.nextElement = null; } } if (lastElementOfNewElement.nextElement) { lastElementOfNewElement.nextElement.previousElement = lastElementOfNewElement; } newElement.parent = this; this.contents.splice(index, 0, newElement); } }, { key: "replaceWith", value: function replaceWith(newElement) { if (this.parent == null) { throw "Cannot replace element without parent!"; } if (newElement === this) { return; } if (newElement === this.parent) { throw "Cannot replace element with its parent!"; } var parent = this.parent; var index = this.parent.contents.indexOf(this); this.extract(); try { parent.insert(index, newElement); } catch (err) { throw 'Cannot replace this element!'; } return this; } }]); return SoupElement; }(); var SoupComment = /*#__PURE__*/function (_SoupElement) { _inherits(SoupComment, _SoupElement); var _super = _createSuper(SoupComment); function SoupComment(text) { var _this2; var parent = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : null; var previousElement = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : null; var nextElement = arguments.length > 3 && arguments[3] !== undefined ? 
arguments[3] : null; _classCallCheck(this, SoupComment); _this2 = _super.call(this, parent, previousElement, nextElement); _this2._text = text; return _this2; } return SoupComment; }(SoupElement); var SoupString = /*#__PURE__*/function (_SoupElement2) { _inherits(SoupString, _SoupElement2); var _super2 = _createSuper(SoupString); function SoupString(text) { var _this3; var parent = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : null; var previousElement = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : null; var nextElement = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : null; _classCallCheck(this, SoupString); _this3 = _super2.call(this, parent, previousElement, nextElement); _this3._text = text; return _this3; } return SoupString; }(SoupElement); SoupString.prototype.toString = function () { return this._text; }; var SoupDoctypeString = /*#__PURE__*/function (_SoupString) { _inherits(SoupDoctypeString, _SoupString); var _super3 = _createSuper(SoupDoctypeString); function SoupDoctypeString(text) { var _this4; var parent = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : null; var previousElement = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : null; var nextElement = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : null; _classCallCheck(this, SoupDoctypeString); _this4 = _super3.call(this, text, parent, previousElement, nextElement); _this4._text = text; return _this4; } return SoupDoctypeString; }(SoupString); SoupDoctypeString.prototype.toString = function () { return "<" + this._text + ">"; }; var SoupTag = /*#__PURE__*/function (_SoupElement3) { _inherits(SoupTag, _SoupElement3); var _super4 = _createSuper(SoupTag); function SoupTag(name, builder) { var _this5; var attrs = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : null; var parent = arguments.length > 3 && arguments[3] !== undefined ? 
arguments[3] : null; var previousElement = arguments.length > 4 && arguments[4] !== undefined ? arguments[4] : null; var nextElement = arguments.length > 5 && arguments[5] !== undefined ? arguments[5] : null; _classCallCheck(this, SoupTag); _this5 = _super4.call(this, parent, previousElement, nextElement); _this5.name = name; _this5.contents = []; _this5.attrs = attrs || {}; _this5.hidden = false; _this5.builder = builder; return _this5; } _createClass(SoupTag, [{ key: "_append", value: function _append(child) { if (child) this.contents.push(child); } /* * Build a soup object tree */ }, { key: "_build", value: function _build(children) { if (!children || children.length < 1) return this; var last = this; for (var i = 0; i < children.length; ++i) { var ele = this._transform(children[i]); last.nextElement = ele; ele.previousElement = last; if (ele instanceof SoupTag) { last = ele._build(children[i].children); } else { last = ele; } this._append(ele); } return last; } /* * It's a soup object factory * It consturcts a soup object from dom. */ }, { key: "_transform", value: function _transform(dom) { if (!dom) return null; if (dom.type === 'text') { return new SoupString(dom.data, this); } else if (dom.type === 'comment') { return new SoupComment(dom.data, this); } else if (dom.type === 'directive') { //<!** if (dom.name === '!DOCTYPE') { return new SoupDoctypeString(dom.data, this); } } return new SoupTag(dom.name, this.builder, dom.attribs, this); } }, { key: "string", get: function get() { var cur = this; while (cur && cur.contents && cur.contents.length == 1) { cur = cur.contents[0]; } if (!cur || cur instanceof SoupTag) return undefined; return cur; } }, { key: "find", value: function find() { var name = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : undefined; var attrs = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : undefined; var string = arguments.length > 2 && arguments[2] !== undefined ? 
arguments[2] : undefined; var r = this.findAll(name, attrs, string); if (r.length > 0) return r[0]; return undefined; } /* * like find_all in BeautifulSoup */ }, { key: "findAll", value: function findAll() { var name = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : undefined; var attrs = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : undefined; var string = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : undefined; var results = []; var strainer = new SoupStrainer(name, attrs, string); var descendants = this.descendants; for (var i = 0; i < descendants.length; ++i) { if (descendants[i] instanceof SoupTag) { var tag = strainer.match(descendants[i]); if (tag) { results.push(tag); } } } return results; } }, { key: "findPreviousSibling", value: function findPreviousSibling() { var name = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : undefined; var attrs = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : undefined; var string = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : undefined; var results = this.findPreviousSiblings(name, attrs, string); if (results.length > 0) { return results[0]; } return undefined; } }, { key: "findPreviousSiblings", value: function findPreviousSiblings() { var name = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : undefined; var attrs = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : undefined; var string = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : undefined; var results = []; var cur = this.previousSibling; var strainer = new SoupStrainer(name, attrs, string); while (cur) { if (cur instanceof SoupTag) { var tag = strainer.match(cur); if (tag) { results.push(tag); } } cur = cur.previousSibling; } return results; } }, { key: "findNextSibling", value: function findNextSibling() { var name = arguments.length > 0 && arguments[0] !== undefined ? 
arguments[0] : undefined; var attrs = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : undefined; var string = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : undefined; var results = this.findNextSiblings(name, attrs, string); if (results.length > 0) { return results[0]; } return undefined; } }, { key: "findNextSiblings", value: function findNextSiblings() { var name = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : undefined; var attrs = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : undefined; var string = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : undefined; var results = []; var cur = this.nextSibling; var strainer = new SoupStrainer(name, attrs, string); while (cur) { if (cur instanceof SoupTag) { var tag = strainer.match(cur); if (tag) { results.push(tag); } } cur = cur.nextSibling; } return results; } }, { key: "getText", value: function getText() { var separator = arguments.length > 0 && arguments[0] !== undefined ? 
arguments[0] : ''; var text = []; var descendants = this.descendants; for (var i = 0; i < descendants.length; ++i) { if (descendants[i] instanceof SoupString) { text.push(descendants[i]._text); } } return text.join(separator); } }, { key: "text", get: function get() { return this.getText(); } }, { key: "descendants", get: function get() { var ret = []; var cur = this.nextElement; while (cur) { var parent = cur.parent; while (parent && parent != this) { parent = parent.parent; } if (!parent) break; ret.push(cur); cur = cur.nextElement; } return ret; } }, { key: "_convertAttrsToString", value: function _convertAttrsToString() { var text = ''; if (!this.attrs) return text; for (var key in this.attrs) { if (Array.isArray(this.attrs[key])) { text += key + '="' + this.attrs[key].join(' ') + '" '; } else { text += key + '="' + this.attrs[key] + '" '; } } text = text.trim(); return text; } }, { key: "_prettify", value: function _prettify(indent, breakline) { var level = arguments.length > 2 && arguments[2] !== undefined ? 
arguments[2] : 0; var text = ''; if (this.hidden && level == 0) { --level; } if (!this.hidden) { var attrs = this._convertAttrsToString(); if (attrs) { text += indent.repeat(level) + '<' + this.name + ' ' + attrs; } else { text += indent.repeat(level) + '<' + this.name; } } // is an element doesn't have any contents, it's a self closing element if (!this.hidden) { if (this._isEmptyElement() && this.builder.canBeEmptyElement(this.name)) { text += ' />' + breakline; return text; } else { text += '>' + breakline; } } for (var i = 0; i < this.contents.length; ++i) { if (this.contents[i] instanceof SoupString) { var curText = this.contents[i].toString(); curText = curText.trim(); if (curText.length != 0) { if (curText.substring(curText.length - 1) == "\n") { text += indent.repeat(level + 1) + curText; } else { text += indent.repeat(level + 1) + curText + breakline; } } } else { if (this.contents[i] instanceof SoupComment) { text += indent.repeat(level + 1) + "<!--" + this.contents[i]._text + "-->" + breakline; } else { text += this.contents[i]._prettify(indent, breakline, level + 1); } } } if (!this.hidden) { text += indent.repeat(level) + '</' + this.name + '>' + breakline; } return text; } }, { key: "prettify", value: function prettify() { var indent = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : ' '; var breakline = arguments.length > 1 && arguments[1] !== undefined ? 
arguments[1] : '\n'; return this._prettify(indent, breakline).trim(); } /* * Append item in contents */ }, { key: "append", value: function append(item) { var pre = this; var next = this.nextElement; var appendFirst = item; var appendLast = item; var itemDescendants = item.descendants; if (itemDescendants && itemDescendants.length > 0) { appendLast = itemDescendants[itemDescendants.length - 1]; } var descendants = this.descendants; if (descendants && descendants.length > 0) { pre = descendants[descendants.length - 1]; next = pre.nextElement; } //merge two SoupString if (item instanceof SoupString && pre instanceof SoupString) { pre._text += item._text; return; } appendFirst.previousElement = pre; appendLast.nextElement = next; if (pre) pre.nextElement = appendFirst; if (next) next.previousElement = appendLast; this.contents.push(item); item.parent = this; } }, { key: "_isEmptyElement", value: function _isEmptyElement() { return this.contents.length == 0; } }]); return SoupTag; }(SoupElement); SoupTag.prototype.toString = function () { return this.prettify('', ''); }; var ROOT_TAG_NAME = '[document]'; var JSSoup = /*#__PURE__*/function (_SoupTag) { _inherits(JSSoup, _SoupTag); var _super5 = _createSuper(JSSoup); function JSSoup(text) { var _this6; var ignoreWhitespace = arguments.length > 1 && arguments[1] !== undefined ? 
arguments[1] : true; _classCallCheck(this, JSSoup); _this6 = _super5.call(this, ROOT_TAG_NAME, new _builder["default"](), null); var handler = new htmlparser.DefaultHandler(function (error, dom) { if (error) { console.log(error); } else {} }, { verbose: false, ignoreWhitespace: ignoreWhitespace }); var parser = new htmlparser.Parser(handler); parser.parseComplete(text); if (Array.isArray(handler.dom)) { _this6._build(handler.dom); } else { _this6._build([handler.dom]); } _this6.hidden = true; return _this6; } return JSSoup; }(SoupTag); exports["default"] = JSSoup; var SoupStrainer = /*#__PURE__*/function () { function SoupStrainer(name, attrs, string) { _classCallCheck(this, SoupStrainer); if (typeof attrs == 'string') { attrs = { "class": [attrs] }; } else if (Array.isArray(attrs)) { attrs = { "class": attrs }; } else if (attrs && attrs["class"] && typeof attrs["class"] == 'string') { attrs["class"] = [attrs["class"]]; } if (attrs && attrs["class"]) { for (var i = 0; i < attrs["class"].length; ++i) { attrs["class"][i] = attrs["class"][i].trim(); } } this.name = name; this.attrs = attrs; this.string = string; } _createClass(SoupStrainer, [{ key: "match", value: function match(tag) { // match string if (this.name == undefined && this.attrs == undefined) { if (this.string) { if (this._matchName(tag.string, this.string)) return tag.string;else return null; } return tag; } // match tag name var match = this._matchName(tag.name, this.name); if (!match) return null; // match string match = this._matchName(tag.string, this.string); if (!match) return null; // match attributes if (_typeof(this.attrs) == 'object') { if (!this._isEmptyObject(this.attrs)) { var props = Object.getOwnPropertyNames(this.attrs); var found = false; for (var i = 0; i < props.length; ++i) { if (props[i] in tag.attrs && this._matchAttrs(props[i], tag.attrs[props[i]], this.attrs[props[i]])) { found = true; break; } } if (!found) return null; } } return tag; } }, { key: "_matchName", value: function 
_matchName(tagItem, name) { if (name == undefined || name == null) return true; // if name is an array, then tag match any item in this array is a match. if (Array.isArray(name)) { for (var i = 0; i < name.length; ++i) { var match = this._matchName(tagItem, name[i]); if (match) return true; } return false; } return tagItem == name; } }, { key: "_matchAttrs", value: function _matchAttrs(name, candidateAttrs, attrs) { if (typeof candidateAttrs == 'string') { if (name == 'class') { candidateAttrs = candidateAttrs.replace(/\s\s+/g, ' ').trim().split(' '); } else { candidateAttrs = [candidateAttrs]; } } if (typeof attrs == 'string') { attrs = [attrs]; } for (var i = 0; i < attrs.length; ++i) { if (candidateAttrs.indexOf(attrs[i]) < 0) return false; } return true; } }, { key: "_isEmptyObject", value: function _isEmptyObject(obj) { return Object.keys(obj).length == 0; } }]); return SoupStrainer; }();