@thednp/domparser
Version:
🍝 Super light HTML parser for isomorphic applications.
1 lines • 17.2 kB
Source Map (JSON)
{"version":3,"file":"util-BfmRalo8.mjs","names":["attrs: NodeLikeAttributes","match: RegExpExecArray | null","map: Record<string, string>","tokens: HTMLToken[]"],"sources":["../src/parts/util.ts"],"sourcesContent":["// util.ts\nimport type {\n ChildLike,\n ChildNode,\n DOMNode,\n GetAttributesOptions,\n HTMLToken,\n NodeLike,\n NodeLikeAttributes,\n RootLike,\n RootNode,\n TokenizerOptions,\n} from \"./types\";\n\n// general utils\n\nexport const ATTR_REGEX = /([^\\s=]+)(?:=(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"']+)))?/g;\n\n/**\n * Get attributes from a string token and return an object\n * @param token the string token\n * @returns the attributes object\n */\nexport const getBaseAttributes = (token: string) => {\n const attrs: NodeLikeAttributes = {};\n const [tagName, ...parts] = token.split(/\\s+/);\n if (parts.length < 1) return attrs;\n\n const attrStr = token.slice(tagName.length);\n let match: RegExpExecArray | null;\n\n while ((match = ATTR_REGEX.exec(attrStr))) {\n const [, name, d, s, u] = match;\n name !== \"/\" && (attrs[name] = d ?? s ?? u ?? \"\");\n }\n\n return attrs;\n};\n\n/**\n * Get attributes from a string token and return an object.\n * In addition to the base tool, this also filters configured\n * unsafe attributes.\n * @param tagStr the string token\n * @param config an optional set of options\n * @returns the attributes object\n */\nexport const getAttributes = (\n tagStr: string,\n config?: Partial<GetAttributesOptions>,\n): NodeLikeAttributes => {\n const { unsafeAttrs } = config || {};\n const baseAttrs = getBaseAttributes(tagStr);\n const attrs: NodeLikeAttributes = {};\n\n for (const [key, value] of Object.entries(baseAttrs)) {\n if (!unsafeAttrs || !unsafeAttrs?.has(toLowerCase(key))) {\n attrs[key] = value;\n }\n }\n\n return attrs;\n};\n\n/**\n * Converts a string to lowercase.\n * @param str The string to convert.\n * @returns The lowercase string.\n */\nexport const toLowerCase = (str: string): string => str.toLowerCase();\n\n/**\n * Converts a string to uppercase.\n * @param str The string to convert.\n * @returns The uppercase string.\n */\nexport const toUpperCase = (str: string): string => str.toUpperCase();\n\n/**\n * Checks if a string starts with a specified prefix.\n * @param str The string to check.\n * @param prefix The prefix to search for.\n * @param position The position to start looking from.\n * @returns `true` if the string starts with the prefix, `false` otherwise.\n */\nexport const startsWith = (\n str: string,\n prefix: string,\n position?: number,\n): boolean => str.startsWith(prefix, position);\n\n/**\n * Checks if a string ends with a specified suffix.\n * @param str The string to check.\n * @param suffix The suffix to search for.\n * @param position The position to start looking from.\n * @returns `true` if the string ends with the suffix, `false` otherwise.\n */\nexport const endsWith = (\n str: string,\n suffix: string,\n position?: number,\n): boolean => str.endsWith(suffix, position);\n\n/**\n * Creates a string from a character code.\n * @param char The character code.\n * @returns The string representation of the character code.\n */\nexport const fromCharCode = (char: number): string => String.fromCharCode(char);\n\n/**\n * Returns the character code at a specific index in a string.\n * @param str The string to check.\n * @param index The index of the character to get the code for.\n * @returns The character code at the specified index.\n */\nexport const charCodeAt = (str: string, index: number): number =>\n str.charCodeAt(index);\n\n/**\n * Defines a property on an object.\n * @param obj The object to define the property on.\n * @param propName The name of the property.\n * @param desc The property descriptor.\n * @returns The object after defining the property.\n */\n// export const defineProperty = <T extends Record<string, unknown>>(\n// obj: T,\n// propName: PropertyKey,\n// desc: PropertyDescriptor,\n// ): T => Object.defineProperty(obj, propName, desc);\n\n/**\n * Defines multiple properties on an object.\n * @param obj The object to define properties on.\n * @param props An object where keys are property names and values are property descriptors.\n * @returns The object after defining the properties.\n */\nexport const defineProperties = <T extends Record<string, unknown>>(\n obj: T,\n props: Record<PropertyKey, PropertyDescriptor>,\n): T => Object.defineProperties(obj, props);\n\n// Type guards\n\n/**\n * Checks if a node is an object.\n * @param node The object to check.\n * @returns `true` if the node is an object, `false` otherwise.\n */\nexport const isObj = (node: unknown) =>\n node !== null && typeof node === \"object\";\n\n/**\n * Checks if a node is a root object (`RootNode` or `RootLike`).\n * @param node The object to check.\n * @returns `true` if the node is an object, `false` otherwise.\n */\nexport const isRoot = (\n node: RootLike | RootNode | ChildLike | ChildNode,\n): node is RootLike | RootNode =>\n isObj(node) && isNode(node as unknown as ChildLike) &&\n node.nodeName === \"#document\";\n\n/**\n * Checks if a node is a tag node (`NodeLike` or `DOMNode`).\n * @param node The node to check.\n * @returns `true` if the node is a tag node, `false` otherwise.\n */\nexport const isTag = (\n node: ChildLike | ChildNode,\n): node is NodeLike | DOMNode => isObj(node) && \"tagName\" in node;\n\n/**\n * Checks if a node is a root node (`RootLike` or `RootNode`),\n * a tag node (`NodeLike` or `DOMNode`), a comment node\n * (`CommentLike` or `CommentNode`) or text node (`TextLike` or `TextNode`).\n * @param node The node to check.\n * @returns `true` if the node is a tag node, `false` otherwise.\n */\nexport const isNode = (\n node: ChildLike | ChildNode | NodeLikeAttributes | string | number,\n): node is ChildLike | NodeLike | DOMNode => isObj(node) && \"nodeName\" in node;\n\n/**\n * Checks if a value is a primitive (number or string).\n * @param val The value to check.\n * @returns `true` if the value is a primitive, `false` otherwise.\n */\nexport const isPrimitive = <T extends (number | string)>(\n val: unknown,\n): val is T => typeof val === \"string\" || typeof val === \"number\";\n\n/**\n * Trim a string value.\n * @param str A string value\n * @returns The trimmed value of the same string.\n */\nexport const trim = (str: string) => str.trim();\n\n/**\n * Set of self-closing HTML tags used by the `Parser`.\n */\nexport const selfClosingTags = new Set([\n \"?xml\",\n \"area\",\n \"base\",\n \"br\",\n \"col\",\n \"embed\",\n \"hr\",\n \"img\",\n \"input\",\n \"link\",\n \"meta\",\n \"param\",\n \"source\",\n \"track\",\n \"wbr\",\n \"path\",\n \"circle\",\n \"ellipse\",\n \"line\",\n \"rect\",\n \"use\",\n \"stop\",\n \"polygon\",\n \"polyline\",\n]);\n\nexport const escape = (str: string) => {\n if ((str === null) || (str === \"\")) {\n return \"\";\n } else {\n str = str.toString();\n }\n\n const map: Record<string, string> = {\n \"&\": \"&\",\n \"<\": \"<\",\n \">\": \">\",\n '\"': \""\",\n \"'\": \"'\",\n };\n\n return str.replace(/[&<>\"']/g, (m) => {\n return map[m];\n });\n};\n\nexport const DOM_ERROR = \"DomParserError:\";\nconst DEFAULT_CHUNK_SIZE = 64 * 1024; // 65536 = 64KB\nconst DEFAULT_MAX_SCRIPT_SIZE = 128 * 1024; // 131072 = 128KB\n\n/**\n * Tokenizes an HTML string into an array of HTML tokens.\n * These tokens represent opening tags, closing tags, text content, and comments.\n * @param html The HTML string to tokenize.\n * @returns An array of `HTMLToken` objects.\n */\nexport const tokenize = (\n html: string,\n options: TokenizerOptions = {},\n): HTMLToken[] => {\n const {\n maxScriptSize = DEFAULT_MAX_SCRIPT_SIZE,\n chunkSize = DEFAULT_CHUNK_SIZE,\n } = options;\n\n const specialTags = [\"script\", \"style\"] as const;\n const tokens: HTMLToken[] = [];\n const len = html.length;\n const COM_START = [\"!--\", \"![CDATA[\"];\n const COM_END = [\"--\", \"]]\"];\n let COM_TYPE = 0; // [0 = #comment, 1 = CDATA]\n\n let token = \"\";\n let scriptContent = \"\";\n let inTag = false;\n let inQuote = false;\n let quote = 0;\n let inPre = false;\n let inTemplate = false;\n let inComment = false;\n let inStyleScript = false;\n let currentChunkStart = 0;\n\n while (currentChunkStart < len) {\n const chunkEnd = Math.min(currentChunkStart + chunkSize, len);\n const chunk = html.slice(currentChunkStart, chunkEnd);\n\n for (let i = 0; i < chunk.length; i++) {\n const globalIndex = currentChunkStart + i;\n const char = charCodeAt(chunk, i);\n\n if (inStyleScript) {\n const endSpecialTag = specialTags.find((t) =>\n startsWith(html, `/${t}`, globalIndex + 1)\n );\n\n if (char === 60 && endSpecialTag && !inTemplate && !inQuote) { // <\n // istanbul ignore else @preserve\n if (scriptContent.length < maxScriptSize) {\n tokens.push({\n tokenType: \"text\",\n value: trim(scriptContent),\n isSC: false,\n });\n }\n tokens.push({\n tokenType: \"tag\",\n value: \"/\" + endSpecialTag,\n isSC: false,\n });\n scriptContent = \"\";\n inStyleScript = false;\n i += endSpecialTag.length + 2;\n } else {\n // istanbul ignore next @preserve - don't crash the test!!\n if (scriptContent.length >= maxScriptSize) {\n // Once we hit the limit, just skip content until closing tag\n continue;\n }\n if (char === 96) { // ` | 0x60\n inTemplate = !inTemplate;\n // \" or ' | 0x22 or 0x27\n } else if (!inTemplate && (char === 34 || char === 39)) {\n // istanbul ignore else @preserve\n if (!inQuote) {\n quote = char;\n inQuote = true;\n } else if (char === quote) {\n inQuote = false;\n }\n }\n scriptContent += fromCharCode(char);\n }\n continue;\n }\n\n if (inComment) {\n token += fromCharCode(char);\n if (\n endsWith(token, COM_END[COM_TYPE]) &&\n charCodeAt(html, globalIndex + 1) === 62\n ) { // >\n const tokenValue = COM_TYPE === 1 ? escape(token) : token;\n tokens.push({\n tokenType: \"comment\",\n value: `<${trim(tokenValue)}>`,\n isSC: false,\n });\n inComment = false;\n token = \"\";\n i += 1;\n }\n continue;\n }\n\n if (\n (inTag && token.includes(\"=\")) &&\n (char === 34 || char === 39)\n ) {\n if (!inQuote) {\n quote = char;\n inQuote = true;\n } else if (char === quote) {\n inQuote = false;\n }\n token += fromCharCode(char);\n continue;\n }\n\n if (\n char === 60 && !inQuote && !inTemplate\n ) { // 0x3c | \"<\"\n const value = trim(token);\n value &&\n tokens.push({\n tokenType: \"text\",\n value: inPre ? token : value,\n isSC: false,\n });\n token = \"\";\n\n const commentStart = COM_START.find((cs) =>\n startsWith(html, cs, globalIndex + 1)\n );\n if (commentStart) {\n COM_TYPE = COM_START.indexOf(commentStart);\n inComment = true;\n token += commentStart;\n i += commentStart.length;\n continue;\n }\n\n inTag = true;\n } else if (\n char === 62 && inTag && !inTemplate\n ) { // 0x3e | \">\"\n if (token === \"/pre\") {\n inPre = false;\n } else if (token === \"pre\" || startsWith(token, \"pre\")) {\n inPre = true;\n }\n const startSpecialTag = specialTags.find((t) =>\n t === token || startsWith(token, t)\n );\n if (startSpecialTag && !endsWith(token, \"/\")) {\n inStyleScript = true;\n }\n\n const isDocType = startsWith(toLowerCase(token), \"!doctype\");\n\n // istanbul ignore else @preserve\n if (token) {\n const isSC = endsWith(token, \"/\");\n const [tagName] = token.split(/\\s/);\n const value = inQuote ? tagName + (isSC ? \"/\" : \"\") : token;\n tokens.push({\n tokenType: isDocType ? \"doctype\" : \"tag\",\n value: isSC ? trim(value.slice(0, -1)) : trim(value),\n isSC,\n });\n }\n token = \"\";\n inTag = false;\n inQuote = false;\n } else {\n token += fromCharCode(char);\n }\n }\n currentChunkStart = chunkEnd;\n }\n\n const lastToken = trim(token);\n if (lastToken) {\n tokens.push({\n tokenType: \"text\",\n value: lastToken,\n isSC: false,\n });\n }\n\n return tokens;\n};\n"],"mappings":";AAgBA,MAAa,aAAa;;;;;;AAO1B,MAAa,qBAAqB,UAAkB;CAClD,MAAMA,QAA4B,EAAE;CACpC,MAAM,CAAC,SAAS,GAAG,SAAS,MAAM,MAAM,MAAM;AAC9C,KAAI,MAAM,SAAS,EAAG,QAAO;CAE7B,MAAM,UAAU,MAAM,MAAM,QAAQ,OAAO;CAC3C,IAAIC;AAEJ,QAAQ,QAAQ,WAAW,KAAK,QAAQ,EAAG;EACzC,MAAM,GAAG,MAAM,GAAG,GAAG,KAAK;AAC1B,WAAS,QAAQ,MAAM,QAAQ,KAAK,KAAK,KAAK;;AAGhD,QAAO;;;;;;;;;;AAWT,MAAa,iBACX,QACA,WACuB;CACvB,MAAM,EAAE,gBAAgB,UAAU,EAAE;CACpC,MAAM,YAAY,kBAAkB,OAAO;CAC3C,MAAMD,QAA4B,EAAE;AAEpC,MAAK,MAAM,CAAC,KAAK,UAAU,OAAO,QAAQ,UAAU,CAClD,KAAI,CAAC,eAAe,CAAC,aAAa,IAAI,YAAY,IAAI,CAAC,CACrD,OAAM,OAAO;AAIjB,QAAO;;;;;;;AAQT,MAAa,eAAe,QAAwB,IAAI,aAAa;;;;;;AAOrE,MAAa,eAAe,QAAwB,IAAI,aAAa;;;;;;;;AASrE,MAAa,cACX,KACA,QACA,aACY,IAAI,WAAW,QAAQ,SAAS;;;;;;;;AAS9C,MAAa,YACX,KACA,QACA,aACY,IAAI,SAAS,QAAQ,SAAS;;;;;;AAO5C,MAAa,gBAAgB,SAAyB,OAAO,aAAa,KAAK;;;;;;;AAQ/E,MAAa,cAAc,KAAa,UACtC,IAAI,WAAW,MAAM;;;;;;;;;;;;;;AAqBvB,MAAa,oBACX,KACA,UACM,OAAO,iBAAiB,KAAK,MAAM;;;;;;AAS3C,MAAa,SAAS,SACpB,SAAS,QAAQ,OAAO,SAAS;;;;;;AAOnC,MAAa,UACX,SAEA,MAAM,KAAK,IAAI,OAAO,KAA6B,IACnD,KAAK,aAAa;;;;;;AAOpB,MAAa,SACX,SAC+B,MAAM,KAAK,IAAI,aAAa;;;;;;;;AAS7D,MAAa,UACX,SAC2C,MAAM,KAAK,IAAI,cAAc;;;;;;AAO1E,MAAa,eACX,QACa,OAAO,QAAQ,YAAY,OAAO,QAAQ;;;;;;AAOzD,MAAa,QAAQ,QAAgB,IAAI,MAAM;;;;AAK/C,MAAa,kBAAkB,IAAI,IAAI;CACrC;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACA;CACD,CAAC;AAEF,MAAa,UAAU,QAAgB;AACrC,KAAK,QAAQ,QAAU,QAAQ,GAC7B,QAAO;KAEP,OAAM,IAAI,UAAU;CAGtB,MAAME,MAA8B;EAClC,KAAK;EACL,KAAK;EACL,KAAK;EACL,MAAK;EACL,KAAK;EACN;AAED,QAAO,IAAI,QAAQ,aAAa,MAAM;AACpC,SAAO,IAAI;GACX;;AAGJ,MAAa,YAAY;AACzB,MAAM,qBAAqB,KAAK;AAChC,MAAM,0BAA0B,MAAM;;;;;;;AAQtC,MAAa,YACX,MACA,UAA4B,EAAE,KACd;CAChB,MAAM,EACJ,gBAAgB,yBAChB,YAAY,uBACV;CAEJ,MAAM,cAAc,CAAC,UAAU,QAAQ;CACvC,MAAMC,SAAsB,EAAE;CAC9B,MAAM,MAAM,KAAK;CACjB,MAAM,YAAY,CAAC,OAAO,WAAW;CACrC,MAAM,UAAU,CAAC,MAAM,KAAK;CAC5B,IAAI,WAAW;CAEf,IAAI,QAAQ;CACZ,IAAI,gBAAgB;CACpB,IAAI,QAAQ;CACZ,IAAI,UAAU;CACd,IAAI,QAAQ;CACZ,IAAI,QAAQ;CACZ,IAAI,aAAa;CACjB,IAAI,YAAY;CAChB,IAAI,gBAAgB;CACpB,IAAI,oBAAoB;AAExB,QAAO,oBAAoB,KAAK;EAC9B,MAAM,WAAW,KAAK,IAAI,oBAAoB,WAAW,IAAI;EAC7D,MAAM,QAAQ,KAAK,MAAM,mBAAmB,SAAS;AAErD,OAAK,IAAI,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;GACrC,MAAM,cAAc,oBAAoB;GACxC,MAAM,OAAO,WAAW,OAAO,EAAE;AAEjC,OAAI,eAAe;IACjB,MAAM,gBAAgB,YAAY,MAAM,MACtC,WAAW,MAAM,IAAI,KAAK,cAAc,EAAE,CAC3C;AAED,QAAI,SAAS,MAAM,iBAAiB,CAAC,cAAc,CAAC,SAAS;;AAE3D,SAAI,cAAc,SAAS,cACzB,QAAO,KAAK;MACV,WAAW;MACX,OAAO,KAAK,cAAc;MAC1B,MAAM;MACP,CAAC;AAEJ,YAAO,KAAK;MACV,WAAW;MACX,OAAO,MAAM;MACb,MAAM;MACP,CAAC;AACF,qBAAgB;AAChB,qBAAgB;AAChB,UAAK,cAAc,SAAS;WACvB;;AAEL,SAAI,cAAc,UAAU,cAE1B;AAEF,SAAI,SAAS,GACX,cAAa,CAAC;cAEL,CAAC,eAAe,SAAS,MAAM,SAAS,KAEjD;;UAAI,CAAC,SAAS;AACZ,eAAQ;AACR,iBAAU;iBACD,SAAS,MAClB,WAAU;;AAGd,sBAAiB,aAAa,KAAK;;AAErC;;AAGF,OAAI,WAAW;AACb,aAAS,aAAa,KAAK;AAC3B,QACE,SAAS,OAAO,QAAQ,UAAU,IAClC,WAAW,MAAM,cAAc,EAAE,KAAK,IACtC;KACA,MAAM,aAAa,aAAa,IAAI,OAAO,MAAM,GAAG;AACpD,YAAO,KAAK;MACV,WAAW;MACX,OAAO,IAAI,KAAK,WAAW,CAAC;MAC5B,MAAM;MACP,CAAC;AACF,iBAAY;AACZ,aAAQ;AACR,UAAK;;AAEP;;AAGF,OACG,SAAS,MAAM,SAAS,IAAI,KAC5B,SAAS,MAAM,SAAS,KACzB;AACA,QAAI,CAAC,SAAS;AACZ,aAAQ;AACR,eAAU;eACD,SAAS,MAClB,WAAU;AAEZ,aAAS,aAAa,KAAK;AAC3B;;AAGF,OACE,SAAS,MAAM,CAAC,WAAW,CAAC,YAC5B;IACA,MAAM,QAAQ,KAAK,MAAM;AACzB,aACE,OAAO,KAAK;KACV,WAAW;KACX,OAAO,QAAQ,QAAQ;KACvB,MAAM;KACP,CAAC;AACJ,YAAQ;IAER,MAAM,eAAe,UAAU,MAAM,OACnC,WAAW,MAAM,IAAI,cAAc,EAAE,CACtC;AACD,QAAI,cAAc;AAChB,gBAAW,UAAU,QAAQ,aAAa;AAC1C,iBAAY;AACZ,cAAS;AACT,UAAK,aAAa;AAClB;;AAGF,YAAQ;cAER,SAAS,MAAM,SAAS,CAAC,YACzB;AACA,QAAI,UAAU,OACZ,SAAQ;aACC,UAAU,SAAS,WAAW,OAAO,MAAM,CACpD,SAAQ;AAKV,QAHwB,YAAY,MAAM,MACxC,MAAM,SAAS,WAAW,OAAO,EAAE,CACpC,IACsB,CAAC,SAAS,OAAO,IAAI,CAC1C,iBAAgB;IAGlB,MAAM,YAAY,WAAW,YAAY,MAAM,EAAE,WAAW;;AAG5D,QAAI,OAAO;KACT,MAAM,OAAO,SAAS,OAAO,IAAI;KACjC,MAAM,CAAC,WAAW,MAAM,MAAM,KAAK;KACnC,MAAM,QAAQ,UAAU,WAAW,OAAO,MAAM,MAAM;AACtD,YAAO,KAAK;MACV,WAAW,YAAY,YAAY;MACnC,OAAO,OAAO,KAAK,MAAM,MAAM,GAAG,GAAG,CAAC,GAAG,KAAK,MAAM;MACpD;MACD,CAAC;;AAEJ,YAAQ;AACR,YAAQ;AACR,cAAU;SAEV,UAAS,aAAa,KAAK;;AAG/B,sBAAoB;;CAGtB,MAAM,YAAY,KAAK,MAAM;AAC7B,KAAI,UACF,QAAO,KAAK;EACV,WAAW;EACX,OAAO;EACP,MAAM;EACP,CAAC;AAGJ,QAAO"}