UNPKG

@thednp/domparser

Version:

🍝 Super light HTML parser for isomorphic applications.

github.com/thednp/domparser

thednp/domparser

311 lines (310 loc) • 9.83 kB

JavaScript

//#region src/parts/util.ts const ATTR_REGEX = /([^\s=]+)(?:=(?:"([^"]*)"|'([^']*)'|([^\s"']+)))?/g; /** * Get attributes from a string token and return an object * @param token the string token * @returns the attributes object */ const getBaseAttributes = (token) => { const attrs = {}; const [tagName, ...parts] = token.split(/\s+/); if (parts.length < 1) return attrs; const attrStr = token.slice(tagName.length); let match; while (match = ATTR_REGEX.exec(attrStr)) { const [, name, d, s, u] = match; name !== "/" && (attrs[name] = d ?? s ?? u ?? ""); } return attrs; }; /** * Get attributes from a string token and return an object. * In addition to the base tool, this also filters configured * unsafe attributes. * @param tagStr the string token * @param config an optional set of options * @returns the attributes object */ const getAttributes = (tagStr, config) => { const { unsafeAttrs } = config || {}; const baseAttrs = getBaseAttributes(tagStr); const attrs = {}; for (const [key, value] of Object.entries(baseAttrs)) if (!unsafeAttrs || !unsafeAttrs?.has(toLowerCase(key))) attrs[key] = value; return attrs; }; /** * Converts a string to lowercase. * @param str The string to convert. * @returns The lowercase string. */ const toLowerCase = (str) => str.toLowerCase(); /** * Converts a string to uppercase. * @param str The string to convert. * @returns The uppercase string. */ const toUpperCase = (str) => str.toUpperCase(); /** * Checks if a string starts with a specified prefix. * @param str The string to check. * @param prefix The prefix to search for. * @param position The position to start looking from. * @returns `true` if the string starts with the prefix, `false` otherwise. */ const startsWith = (str, prefix, position) => str.startsWith(prefix, position); /** * Checks if a string ends with a specified suffix. * @param str The string to check. * @param suffix The suffix to search for. * @param position The position to start looking from. * @returns `true` if the string ends with the suffix, `false` otherwise. */ const endsWith = (str, suffix, position) => str.endsWith(suffix, position); /** * Creates a string from a character code. * @param char The character code. * @returns The string representation of the character code. */ const fromCharCode = (char) => String.fromCharCode(char); /** * Returns the character code at a specific index in a string. * @param str The string to check. * @param index The index of the character to get the code for. * @returns The character code at the specified index. */ const charCodeAt = (str, index) => str.charCodeAt(index); /** * Defines a property on an object. * @param obj The object to define the property on. * @param propName The name of the property. * @param desc The property descriptor. * @returns The object after defining the property. */ /** * Defines multiple properties on an object. * @param obj The object to define properties on. * @param props An object where keys are property names and values are property descriptors. * @returns The object after defining the properties. */ const defineProperties = (obj, props) => Object.defineProperties(obj, props); /** * Checks if a node is an object. * @param node The object to check. * @returns `true` if the node is an object, `false` otherwise. */ const isObj = (node) => node !== null && typeof node === "object"; /** * Checks if a node is a root object (`RootNode` or `RootLike`). * @param node The object to check. * @returns `true` if the node is an object, `false` otherwise. */ const isRoot = (node) => isObj(node) && isNode(node) && node.nodeName === "#document"; /** * Checks if a node is a tag node (`NodeLike` or `DOMNode`). * @param node The node to check. * @returns `true` if the node is a tag node, `false` otherwise. */ const isTag = (node) => isObj(node) && "tagName" in node; /** * Checks if a node is a root node (`RootLike` or `RootNode`), * a tag node (`NodeLike` or `DOMNode`), a comment node * (`CommentLike` or `CommentNode`) or text node (`TextLike` or `TextNode`). * @param node The node to check. * @returns `true` if the node is a tag node, `false` otherwise. */ const isNode = (node) => isObj(node) && "nodeName" in node; /** * Checks if a value is a primitive (number or string). * @param val The value to check. * @returns `true` if the value is a primitive, `false` otherwise. */ const isPrimitive = (val) => typeof val === "string" || typeof val === "number"; /** * Trim a string value. * @param str A string value * @returns The trimmed value of the same string. */ const trim = (str) => str.trim(); /** * Set of self-closing HTML tags used by the `Parser`. */ const selfClosingTags = new Set([ "?xml", "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr", "path", "circle", "ellipse", "line", "rect", "use", "stop", "polygon", "polyline" ]); const escape = (str) => { if (str === null || str === "") return ""; else str = str.toString(); const map = { "&": "&", "<": "<", ">": ">", "\"": """, "'": "'" }; return str.replace(/[&<>"']/g, (m) => { return map[m]; }); }; const DOM_ERROR = "DomParserError:"; const DEFAULT_CHUNK_SIZE = 64 * 1024; const DEFAULT_MAX_SCRIPT_SIZE = 128 * 1024; /** * Tokenizes an HTML string into an array of HTML tokens. * These tokens represent opening tags, closing tags, text content, and comments. * @param html The HTML string to tokenize. * @returns An array of `HTMLToken` objects. */ const tokenize = (html, options = {}) => { const { maxScriptSize = DEFAULT_MAX_SCRIPT_SIZE, chunkSize = DEFAULT_CHUNK_SIZE } = options; const specialTags = ["script", "style"]; const tokens = []; const len = html.length; const COM_START = ["!--", "![CDATA["]; const COM_END = ["--", "]]"]; let COM_TYPE = 0; let token = ""; let scriptContent = ""; let inTag = false; let inQuote = false; let quote = 0; let inPre = false; let inTemplate = false; let inComment = false; let inStyleScript = false; let currentChunkStart = 0; while (currentChunkStart < len) { const chunkEnd = Math.min(currentChunkStart + chunkSize, len); const chunk = html.slice(currentChunkStart, chunkEnd); for (let i = 0; i < chunk.length; i++) { const globalIndex = currentChunkStart + i; const char = charCodeAt(chunk, i); if (inStyleScript) { const endSpecialTag = specialTags.find((t) => startsWith(html, `/${t}`, globalIndex + 1)); if (char === 60 && endSpecialTag && !inTemplate && !inQuote) { // istanbul ignore else @preserve if (scriptContent.length < maxScriptSize) tokens.push({ tokenType: "text", value: trim(scriptContent), isSC: false }); tokens.push({ tokenType: "tag", value: "/" + endSpecialTag, isSC: false }); scriptContent = ""; inStyleScript = false; i += endSpecialTag.length + 2; } else { // istanbul ignore next @preserve - don't crash the test!! if (scriptContent.length >= maxScriptSize) continue; if (char === 96) inTemplate = !inTemplate; else if (!inTemplate && (char === 34 || char === 39)) { // istanbul ignore else @preserve if (!inQuote) { quote = char; inQuote = true; } else if (char === quote) inQuote = false; } scriptContent += fromCharCode(char); } continue; } if (inComment) { token += fromCharCode(char); if (endsWith(token, COM_END[COM_TYPE]) && charCodeAt(html, globalIndex + 1) === 62) { const tokenValue = COM_TYPE === 1 ? escape(token) : token; tokens.push({ tokenType: "comment", value: `<${trim(tokenValue)}>`, isSC: false }); inComment = false; token = ""; i += 1; } continue; } if (inTag && token.includes("=") && (char === 34 || char === 39)) { if (!inQuote) { quote = char; inQuote = true; } else if (char === quote) inQuote = false; token += fromCharCode(char); continue; } if (char === 60 && !inQuote && !inTemplate) { const value = trim(token); value && tokens.push({ tokenType: "text", value: inPre ? token : value, isSC: false }); token = ""; const commentStart = COM_START.find((cs) => startsWith(html, cs, globalIndex + 1)); if (commentStart) { COM_TYPE = COM_START.indexOf(commentStart); inComment = true; token += commentStart; i += commentStart.length; continue; } inTag = true; } else if (char === 62 && inTag && !inTemplate) { if (token === "/pre") inPre = false; else if (token === "pre" || startsWith(token, "pre")) inPre = true; const startSpecialTag = specialTags.find((t) => t === token || startsWith(token, t)); if (startSpecialTag && !endsWith(token, "/")) inStyleScript = true; const isDocType = startsWith(toLowerCase(token), "!doctype"); // istanbul ignore else @preserve if (token) { const isSC = endsWith(token, "/"); const [tagName] = token.split(/\s/); const value = inQuote ? tagName + (isSC ? "/" : "") : token; tokens.push({ tokenType: isDocType ? "doctype" : "tag", value: isSC ? trim(value.slice(0, -1)) : trim(value), isSC }); } token = ""; inTag = false; inQuote = false; } else token += fromCharCode(char); } currentChunkStart = chunkEnd; } const lastToken = trim(token); if (lastToken) tokens.push({ tokenType: "text", value: lastToken, isSC: false }); return tokens; }; //#endregion export { ATTR_REGEX, DOM_ERROR, charCodeAt, defineProperties, endsWith, escape, fromCharCode, getAttributes, getBaseAttributes, isNode, isObj, isPrimitive, isRoot, isTag, selfClosingTags, startsWith, toLowerCase, toUpperCase, tokenize, trim }; //# sourceMappingURL=util-9cK5wJoo.js.map