UNPKG

@thednp/domparser

Version:

🍝 Super light HTML parser for isomorphic applications.

1 lines 9.1 kB
{"version":3,"file":"dom-parser-ojVv8ZvA.cjs","names":["config?: Partial<DomParserOptions>","DOM_ERROR","htmlString?: string","stack: (RootNode | DOMNode)[]","tagStack: string[]","newNode: ChildNode"],"sources":["../src/parts/dom-parser.ts"],"sourcesContent":["// dom-parser.ts\nimport { createBasicNode, createDocument, createElement } from \"./prototype.ts\";\nimport type {\n ChildNode,\n DOMNode,\n DomParserOptions,\n DomParserResult,\n GetAttributesOptions,\n RootNode,\n} from \"./types.ts\";\n\nimport {\n DOM_ERROR,\n getAttributes,\n isObj,\n selfClosingTags,\n startsWith,\n tokenize,\n toUpperCase,\n} from \"./util.ts\";\n\n/**\n * **DomParser**\n *\n * Unlike the basic **Parser**, **DomParser** creates a new `Document` like instance with DOM-like\n * methods and properties and populates it with `Node` like objects resulted from the parsing\n * of a given HTML markup.\n *\n * @example\n * ```ts\n * const config = {\n * // On creating new node callback function\n * onNodeCallback?: myFunction(node: DOMNode) => DOMNode | YOURNode,\n * // Common dangerous tags that could lead to XSS attacks\n * filterTags: [\n * \"script\", \"style\", \"iframe\", \"object\", \"embed\", \"base\", \"form\",\n * \"input\", \"button\", \"textarea\", \"select\", \"option\"\n * ],\n * // Unsafe attributes that could lead to XSS attacks\n * filterAttrs: [\n * \"onerror\", \"onload\", \"onunload\", \"onclick\", \"ondblclick\", \"onmousedown\",\n * \"onmouseup\", \"onmouseover\", \"onmousemove\", \"onmouseout\", \"onkeydown\",\n * \"onkeypress\", \"onkeyup\", \"onchange\", \"onsubmit\", \"onreset\", \"onselect\",\n * \"onblur\", \"onfocus\", \"formaction\", \"href\", \"xlink:href\", \"action\"\n * ]\n * }\n * const { root: doc, components, tags } = DomParser.parseFromString(\"<!doctype html><html>This is starting html</html>\", config);\n * console.log(doc.documentElement.outerHTML);\n * // > \"<html>This is starting html</html>\"\n * ```\n *\n * @param startHTML Initial HTML content\n * @param config the `Parser` options to apply to the parsing of the startHTML markup.\n * @returns The `Document` like root node\n */\nexport const DomParser = (\n config?: Partial<DomParserOptions>,\n) => {\n if (config && !isObj(config)) {\n throw new Error(`${DOM_ERROR} 1st parameter is not an object.`);\n }\n\n // Common dangerous tags that could lead to XSS\n let unsafeTags = new Set<string>();\n let unsafeTagDepth = 0;\n\n // Unsafe attributes that could lead to XSS\n let unsafeAttrs = new Set<string>();\n\n // Apply config\n const { filterTags, filterAttrs, onNodeCallback } = config || {};\n if (filterTags?.length) unsafeTags = new Set(filterTags);\n if (filterAttrs?.length) unsafeAttrs = new Set(filterAttrs);\n const getAttrOptions = { unsafeAttrs } as GetAttributesOptions;\n // don't override the default function unless it's actualy set\n\n return {\n parseFromString(htmlString?: string) {\n if (htmlString && typeof htmlString !== \"string\") {\n throw new Error(`${DOM_ERROR} 1st parameter is not a string.`);\n }\n const root = createDocument();\n if (!htmlString) return { root, components: [], tags: [] };\n\n const stack: (RootNode | DOMNode)[] = [root];\n const tagStack: string[] = [];\n const components = new Set<string>();\n const tags = new Set<string>();\n const tokens = tokenize(htmlString);\n const tLen = tokens.length;\n let newNode: ChildNode;\n\n for (let i = 0; i < tLen; i += 1) {\n const { tokenType, value, isSC } = tokens[i];\n\n // Skip doctype, but store it as a root property\n if (tokenType === \"doctype\") {\n root.doctype = `<${value}>`;\n continue;\n }\n\n const currentParent = stack[stack.length - 1];\n const isClosing = startsWith(value, \"/\");\n const tagName = isClosing ? value.slice(1) : value.split(/[\\s/>]/)[0];\n const isSelfClosing = isSC || selfClosingTags.has(tagName);\n\n // Tag Matching Detection Logic\n if (tokenType === \"tag\" && !isSelfClosing) {\n // Start Tag (and not self-closing)\n if (!isClosing) {\n // Push tag name onto the tag stack\n tagStack.push(tagName);\n // Closing Tag\n } else {\n // Pop the last opened tag\n const expectedTag = tagStack.pop();\n if (expectedTag !== tagName) {\n if (expectedTag === undefined) {\n throw new Error(\n `${DOM_ERROR} Mismatched closing tag: </${tagName}>. No open tag found.`,\n );\n } else {\n throw new Error(\n `${DOM_ERROR} Mismatched closing tag: </${tagName}>. Expected closing tag for <${expectedTag}>.`,\n );\n }\n }\n }\n }\n\n // Skip unsafe tags AND their children\n if (unsafeTags.has(tagName)) {\n if (!isSelfClosing) {\n if (!isClosing) {\n unsafeTagDepth++;\n } else {\n unsafeTagDepth--;\n }\n }\n continue;\n }\n\n // Don't process anything while inside unsafe tags\n if (unsafeTagDepth > 0) continue;\n\n if ([\"text\", \"comment\"].includes(tokenType)) {\n newNode = createBasicNode(\n `#${tokenType as \"text\" | \"comment\"}`,\n value,\n ) as ChildNode;\n currentParent.append(newNode);\n continue;\n }\n\n // Register tag/component type\n (tagName[0] === toUpperCase(tagName[0]) || tagName.includes(\"-\")\n ? components\n : tags).add(tagName);\n\n if (!isClosing) {\n const attributes = getAttributes(value, getAttrOptions);\n newNode = createElement.call(\n root,\n tagName as DOMNode[\"tagName\"],\n attributes,\n );\n currentParent.append(newNode);\n stack.slice(1, -1).map((parent) =>\n (parent as DOMNode).registerChild(newNode as DOMNode)\n );\n\n if (onNodeCallback) onNodeCallback(newNode, currentParent, root);\n\n const charset = attributes?.charset;\n if (tagName === \"meta\" && charset) {\n root.charset = toUpperCase(charset);\n }\n\n !isSelfClosing && stack.push(newNode);\n } else if (!isSelfClosing && stack.length > 1) {\n stack.pop();\n }\n }\n\n // Check for unclosed tags at the end\n // an edge case where end tag is malformed `</incomplete`\n if (tagStack.length > 0) {\n const unclosedTag = tagStack.pop();\n throw new Error(`${DOM_ERROR} Unclosed tag: <${unclosedTag}>.`);\n }\n\n return {\n root,\n components: Array.from(components),\n tags: Array.from(tags),\n } satisfies DomParserResult;\n },\n };\n};\n"],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuDA,MAAa,YAAY,CACvBA,WACG;AACH,KAAI,WAAW,mBAAM,OAAO,CAC1B,OAAM,IAAI,OAAO,EAAEC,uBAAU;CAI/B,IAAI,6BAAa,IAAI;CACrB,IAAI,iBAAiB;CAGrB,IAAI,8BAAc,IAAI;CAGtB,MAAM,EAAE,YAAY,aAAa,gBAAgB,GAAG,UAAU,CAAE;AAChE,KAAI,YAAY,OAAQ,cAAa,IAAI,IAAI;AAC7C,KAAI,aAAa,OAAQ,eAAc,IAAI,IAAI;CAC/C,MAAM,iBAAiB,EAAE,YAAa;AAGtC,QAAO,EACL,gBAAgBC,YAAqB;AACnC,MAAI,qBAAqB,eAAe,SACtC,OAAM,IAAI,OAAO,EAAED,uBAAU;EAE/B,MAAM,OAAO,kCAAgB;AAC7B,OAAK,WAAY,QAAO;GAAE;GAAM,YAAY,CAAE;GAAE,MAAM,CAAE;EAAE;EAE1D,MAAME,QAAgC,CAAC,IAAK;EAC5C,MAAMC,WAAqB,CAAE;EAC7B,MAAM,6BAAa,IAAI;EACvB,MAAM,uBAAO,IAAI;EACjB,MAAM,SAAS,sBAAS,WAAW;EACnC,MAAM,OAAO,OAAO;EACpB,IAAIC;AAEJ,OAAK,IAAI,IAAI,GAAG,IAAI,MAAM,KAAK,GAAG;GAChC,MAAM,EAAE,WAAW,OAAO,MAAM,GAAG,OAAO;AAG1C,OAAI,cAAc,WAAW;AAC3B,SAAK,WAAW,GAAG,MAAM;AACzB;GACD;GAED,MAAM,gBAAgB,MAAM,MAAM,SAAS;GAC3C,MAAM,YAAY,wBAAW,OAAO,IAAI;GACxC,MAAM,UAAU,YAAY,MAAM,MAAM,EAAE,GAAG,MAAM,MAAM,SAAS,CAAC;GACnE,MAAM,gBAAgB,QAAQ,6BAAgB,IAAI,QAAQ;AAG1D,OAAI,cAAc,UAAU,cAE1B,MAAK,UAEH,UAAS,KAAK,QAAQ;QAEjB;IAEL,MAAM,cAAc,SAAS,KAAK;AAClC,QAAI,gBAAgB,QAClB,KAAI,uBACF,OAAM,IAAI,OACP,EAAEJ,uBAAU,6BAA6B,QAAQ;QAGpD,OAAM,IAAI,OACP,EAAEA,uBAAU,6BAA6B,QAAQ,+BAA+B,YAAY;GAIpG;AAIH,OAAI,WAAW,IAAI,QAAQ,EAAE;AAC3B,SAAK,cACH,MAAK,UACH;QAEA;AAGJ;GACD;AAGD,OAAI,iBAAiB,EAAG;AAExB,OAAI,CAAC,QAAQ,SAAU,EAAC,SAAS,UAAU,EAAE;AAC3C,cAAU,mCACP,GAAG,UAAgC,GACpC,MACD;AACD,kBAAc,OAAO,QAAQ;AAC7B;GACD;AAGD,IAAC,QAAQ,OAAO,yBAAY,QAAQ,GAAG,IAAI,QAAQ,SAAS,IAAI,GAC5D,aACA,MAAM,IAAI,QAAQ;AAEtB,QAAK,WAAW;IACd,MAAM,aAAa,2BAAc,OAAO,eAAe;AACvD,cAAU,gCAAc,KACtB,MACA,SACA,WACD;AACD,kBAAc,OAAO,QAAQ;AAC7B,UAAM,MAAM,GAAG,GAAG,CAAC,IAAI,CAAC,WACtB,AAAC,OAAmB,cAAc,QAAmB,CACtD;AAED,QAAI,eAAgB,gBAAe,SAAS,eAAe,KAAK;IAEhE,MAAM,UAAU,YAAY;AAC5B,QAAI,YAAY,UAAU,QACxB,MAAK,UAAU,yBAAY,QAAQ;AAGrC,KAAC,iBAAiB,MAAM,KAAK,QAAQ;GACtC,YAAW,iBAAiB,MAAM,SAAS,EAC1C,OAAM,KAAK;EAEd;AAID,MAAI,SAAS,SAAS,GAAG;GACvB,MAAM,cAAc,SAAS,KAAK;AAClC,SAAM,IAAI,OAAO,EAAEA,uBAAU,kBAAkB,YAAY;EAC5D;AAED,SAAO;GACL;GACA,YAAY,MAAM,KAAK,WAAW;GAClC,MAAM,MAAM,KAAK,KAAK;EACvB;CACF,EACF;AACF"}