UNPKG

htmlstr-parser

Version:

Simple HTML to JSON parser use Regexp and String.indexOf

268 lines (210 loc) 6.66 kB
(function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() : typeof define === 'function' && define.amd ? define(factory) : (global.htmlParser = factory()); }(this, (function () { 'use strict'; var STARTTAG_REX = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/; var ENDTAG_REX = /^<\/([-A-Za-z0-9_]+)[^>]*>/; var ATTR_REX = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g; function makeMap(str) { return str.split(",").reduce(function (map, cur) { map[cur] = true; return map; }, {}); } var EMPTY_MAKER = makeMap("area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr"); var FILLATTRS_MAKER = makeMap("checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected"); function isEmptyMaker(tag) { return !!EMPTY_MAKER[tag]; } function isFillattrsMaker(attr) { return !!FILLATTRS_MAKER[attr]; } var classCallCheck = function (instance, Constructor) { if (!(instance instanceof Constructor)) { throw new TypeError("Cannot call a class as a function"); } }; var createClass = function () { function defineProperties(target, props) { for (var i = 0; i < props.length; i++) { var descriptor = props[i]; descriptor.enumerable = descriptor.enumerable || false; descriptor.configurable = true; if ("value" in descriptor) descriptor.writable = true; Object.defineProperty(target, descriptor.key, descriptor); } } return function (Constructor, protoProps, staticProps) { if (protoProps) defineProperties(Constructor.prototype, protoProps); if (staticProps) defineProperties(Constructor, staticProps); return Constructor; }; }(); var inherits = function (subClass, superClass) { if (typeof superClass !== "function" && superClass !== null) { throw new TypeError("Super expression must either be null or a function, not " + typeof superClass); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, enumerable: false, writable: true, configurable: true } }); if (superClass) Object.setPrototypeOf ? Object.setPrototypeOf(subClass, superClass) : subClass.__proto__ = superClass; }; var possibleConstructorReturn = function (self, call) { if (!self) { throw new ReferenceError("this hasn't been initialised - super() hasn't been called"); } return call && (typeof call === "object" || typeof call === "function") ? call : self; }; var TagStart = function () { function TagStart(name, tag) { classCallCheck(this, TagStart); this.name = name; this.attributes = this.getAttributes(tag); } createClass(TagStart, [{ key: 'getAttributes', value: function getAttributes(str) { var attrsMap = {}; str.replace(ATTR_REX, function (match, name) { var args = Array.prototype.slice.call(arguments); var value = args[2] ? args[2] : args[3] ? args[3] : args[4] ? args[4] : isFillattrsMaker(name) ? name : ""; attrsMap[name] = value.replace(/(^|[^\\])"/g, '$1\\\"'); }); return attrsMap; } }]); return TagStart; }(); var TagEmpty = function (_TagStart) { inherits(TagEmpty, _TagStart); function TagEmpty(name, tag) { classCallCheck(this, TagEmpty); return possibleConstructorReturn(this, (TagEmpty.__proto__ || Object.getPrototypeOf(TagEmpty)).call(this, name, tag)); } return TagEmpty; }(TagStart); var TagEnd = function TagEnd(name) { classCallCheck(this, TagEnd); this.name = name; }; var Text = function Text(text) { classCallCheck(this, Text); this.text = text; }; var ElEMENT_TYPE = "Element"; var TEXT_TYPE = "Text"; function createElement(token) { var tagName = token.name; var attributes = token.attributes; if (token instanceof TagEmpty) { return { type: ElEMENT_TYPE, tagName: tagName, attributes: attributes }; } return { type: ElEMENT_TYPE, tagName: tagName, attributes: attributes, children: [] }; } function createText(token) { var content = token.text; return { type: TEXT_TYPE, content: content }; } function createNodeFactory(type, token) { switch (type) { case ElEMENT_TYPE: return createElement(token); case TEXT_TYPE: return createText(token); default: break; } } function parse(tokens) { var root = { tag: "root", children: [] }; var tagArray = [root]; tagArray.last = function () { return tagArray[tagArray.length - 1]; }; for (var i = 0; i < tokens.length; i++) { var token = tokens[i]; if (token instanceof TagStart) { var node = createNodeFactory(ElEMENT_TYPE, token); if (node.children) { tagArray.push(node); } else { tagArray.last().children.push(node); } continue; } if (token instanceof TagEnd) { var parent = tagArray[tagArray.length - 2]; var _node = tagArray.pop(); parent.children.push(_node); continue; } if (token instanceof Text) { tagArray.last().children.push(createNodeFactory(TEXT_TYPE, token)); continue; } } return root; } function tokenize(html) { var string = html; var tokens = []; var maxTime = Date.now() + 1000; while (string) { if (string.indexOf("<!--") === 0) { var lastIndex = string.indexOf("-->") + 3; string = string.substring(lastIndex); continue; } if (string.indexOf("</") === 0) { var match = string.match(ENDTAG_REX); if (!match) continue; string = string.substring(match[0].length); var name = match[1]; if (isEmptyMaker(name)) continue; tokens.push(new TagEnd(name)); continue; } if (string.indexOf("<") === 0) { var _match = string.match(STARTTAG_REX); if (!_match) continue; string = string.substring(_match[0].length); var _name = _match[1]; var attrs = _match[2]; var token = isEmptyMaker(_name) ? new TagEmpty(_name, attrs) : new TagStart(_name, attrs); tokens.push(token); continue; } var index = string.indexOf('<'); var text = index < 0 ? string : string.substring(0, index); string = index < 0 ? "" : string.substring(index); tokens.push(new Text(text)); if (Date.now() >= maxTime) break; } return tokens; } function htmlParser(html) { return parse(tokenize(html)); } return htmlParser; })));