UNPKG

@kingsword/node-html-markdown

Version:

Fast HTML to markdown cross-compiler, compatible with both node and the browser

389 lines 15.3 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.nodeHtmlParserConfig = exports.aTagTranslatorConfig = exports.defaultCodeBlockTranslators = exports.tableCellTranslatorConfig = exports.tableRowTranslatorConfig = exports.tableTranslatorConfig = exports.defaultTranslators = exports.defaultOptions = exports.contentlessElements = exports.defaultIgnoreElements = exports.defaultBlockElements = void 0; const utilities_1 = require("./utilities"); const translator_1 = require("./translator"); /* ****************************************************************************************************************** */ // region: Elements /* ****************************************************************************************************************** */ exports.defaultBlockElements = [ "ADDRESS", "ARTICLE", "ASIDE", "AUDIO", "BLOCKQUOTE", "BODY", "CANVAS", "CENTER", "DD", "DIR", "DIV", "DL", "DT", "FIELDSET", "FIGCAPTION", "FIGURE", "FOOTER", "FORM", "FRAMESET", "H1", "H2", "H3", "H4", "H5", "H6", "HEADER", "HGROUP", "HR", "HTML", "ISINDEX", "LI", "MAIN", "MENU", "NAV", "NOFRAMES", "NOSCRIPT", "OL", "OUTPUT", "P", "PRE", "SECTION", "TABLE", "TBODY", "TD", "TFOOT", "TH", "THEAD", "TR", "UL", ]; exports.defaultIgnoreElements = [ "AREA", "BASE", "COL", "COMMAND", "EMBED", "HEAD", "INPUT", "KEYGEN", "LINK", "META", "PARAM", "SCRIPT", "SOURCE", "STYLE", "TRACK", "WBR", ]; exports.contentlessElements = ["BR", "HR", "IMG"]; // endregion /* ****************************************************************************************************************** */ // region: Options /* ****************************************************************************************************************** */ // noinspection RegExpUnnecessaryNonCapturingGroup exports.defaultOptions = Object.freeze({ preferNativeParser: false, codeFence: "```", bulletMarker: "*", indent: " ", codeBlockStyle: "fenced", emDelimiter: "_", strongDelimiter: "**", strikeDelimiter: "~~", maxConsecutiveNewlines: 3, /** * Character: Affects: Example: * * \ Escaping \- * ` Code `` code ``, ```lang\n code block \n``` * * Bullet & Separators * item, *** * _ Bold, Italics, Separator _italic_, __bold__, ^___ * ~ Strikethrough, Code ~~strike~~, ~~~lang\n code block \n~~~ * [ Url [caption](url) * ] Url [caption](url) */ globalEscape: [/[\\`*_~\[\]]/gm, "\\$&"], /** * Note: The following compiled pattern was selected after perf testing various alternatives. * Please be mindful of performance if updating/changing it. * * Sequence: Affects: Example: * * +(space) Bullets + item * = Heading heading\n==== * #{1,6}(space) Heading ## Heading * > Blockquote > quote * - Bullet, Header, Separator - item, heading\n---, --- * \d+\.(space) Numbered list item 1. Item */ lineStartEscape: [/^(\s*?)((?:\+\s)|(?:[=>-])|(?:#{1,6}\s))|(?:(\d+)(\.\s))/gm, "$1$3\\$2$4"], useInlineLinks: true, }); // endregion /* ****************************************************************************************************************** */ // region: Translators /* ****************************************************************************************************************** */ exports.defaultTranslators = { /* Pre-formatted text */ pre: { noEscape: true, preserveWhitespace: true }, /* Line break */ br: { content: ` \n`, recurse: false }, /* Horizontal Rule*/ hr: { content: "---", recurse: false }, /* Headings */ "h1,h2,h3,h4,h5,h6": ({ node }) => ({ prefix: "#".repeat(+node.tagName.charAt(1)) + " ", }), /* Bold / Strong */ "strong,b": { spaceIfRepeatingChar: true, postprocess: ({ content, options: { strongDelimiter } }) => (0, utilities_1.isWhiteSpaceOnly)(content) ? translator_1.PostProcessResult.RemoveNode : (0, utilities_1.tagSurround)(content, strongDelimiter), }, /* Strikethrough */ "del,s,strike": { spaceIfRepeatingChar: true, postprocess: ({ content, options: { strikeDelimiter } }) => (0, utilities_1.isWhiteSpaceOnly)(content) ? translator_1.PostProcessResult.RemoveNode : (0, utilities_1.tagSurround)(content, strikeDelimiter), }, /* Italic / Emphasis */ "em,i": { spaceIfRepeatingChar: true, postprocess: ({ content, options: { emDelimiter } }) => (0, utilities_1.isWhiteSpaceOnly)(content) ? translator_1.PostProcessResult.RemoveNode : (0, utilities_1.tagSurround)(content, emDelimiter), }, /* Lists (ordered & unordered) */ "ol,ul": ({ listKind }) => ({ surroundingNewlines: listKind ? 1 : 2, }), /* List Item */ li: ({ options: { bulletMarker }, indentLevel, listKind, listItemNumber }) => { const indentationLevel = +(indentLevel || 0); return { prefix: " ".repeat(+(indentLevel || 0)) + (listKind === "OL" && listItemNumber !== undefined ? `${listItemNumber}. ` : `${bulletMarker} `), surroundingNewlines: 1, postprocess: ({ content }) => (0, utilities_1.isWhiteSpaceOnly)(content) ? translator_1.PostProcessResult.RemoveNode : content .trim() .replace(/([^\r\n])(?:\r?\n)+/g, `$1 \n${" ".repeat(indentationLevel)}`) .replace(/(\S+?)[^\S\r\n]+$/gm, "$1 "), }; }, /* Block Quote */ blockquote: { postprocess: ({ content }) => (0, utilities_1.trimNewLines)(content).replace(/^(>*)[^\S\r\n]?/gm, `>$1 `), }, /* Code (block / inline) */ code: ({ node, parent, options: { codeFence, codeBlockStyle }, visitor }) => { var _a, _b; const isCodeBlock = ["PRE", "WRAPPED-PRE"].includes(parent === null || parent === void 0 ? void 0 : parent.tagName) && parent.childNodes.length < 2; /* Handle code (non-block) */ if (!isCodeBlock) return { spaceIfRepeatingChar: true, noEscape: true, postprocess: ({ content }) => { var _a, _b; // Find longest occurring sequence of running backticks and add one more (so content is escaped) const delimiter = "`" + (((_b = (_a = content.match(/`+/g)) === null || _a === void 0 ? void 0 : _a.sort((a, b) => b.length - a.length)) === null || _b === void 0 ? void 0 : _b[0]) || ""); const padding = delimiter.length > 1 ? " " : ""; return (0, utilities_1.surround)((0, utilities_1.surround)(content, padding), delimiter); }, }; /* Handle code block */ if (codeBlockStyle === "fenced") { const language = ((_b = (_a = node.getAttribute("class")) === null || _a === void 0 ? void 0 : _a.match(/language-(\S+)/)) === null || _b === void 0 ? void 0 : _b[1]) || ""; return { noEscape: true, prefix: codeFence + language + "\n", postfix: "\n" + codeFence, childTranslators: visitor.instance.codeBlockTranslators, }; } else { return { noEscape: true, postprocess: ({ content }) => content.replace(/^/gm, " "), childTranslators: visitor.instance.codeBlockTranslators, }; } }, /* Table */ table: ({ visitor }) => ({ surroundingNewlines: 2, childTranslators: visitor.instance.tableTranslators, postprocess: ({ content, nodeMetadata, node }) => { // Split and trim leading + trailing pipes const rawRows = (0, utilities_1.splitSpecial)(content).map(({ text }) => text.replace(/^(?:\|\s+)?(.+)\s*\|\s*$/, "$1")); /* Get Row Data */ const rows = []; let colWidth = []; for (const row of rawRows) { if (!row) continue; /* Track columns */ const cols = row.split(" |").map((c, i) => { c = c.trim(); if (colWidth.length < i + 1 || (colWidth[i] && colWidth[i] < c.length)) colWidth[i] = c.length; return c; }); rows.push(cols); } if (rows.length < 1) return translator_1.PostProcessResult.RemoveNode; /* Compose Table */ const maxCols = colWidth.length; let res = ""; const caption = nodeMetadata.get(node).tableMeta.caption; if (caption) res += caption + "\n"; rows.forEach((cols, rowNumber) => { var _a; res += "| "; /* Add Columns */ for (let i = 0; i < maxCols; i++) { let c = (_a = cols[i]) !== null && _a !== void 0 ? _a : ""; let colWidth_ = colWidth[i]; c += " ".repeat(Math.max(0, colWidth_ ? colWidth_ - c.length : 0)); // Pad to max length res += c + " |" + (i < maxCols - 1 ? " " : ""); } res += "\n"; // Add separator row if (rowNumber === 0) res += "|" + colWidth.map((w) => " " + "-".repeat(w) + " |").join("") + "\n"; }); return res; }, }), /* Link */ a: ({ node, options, visitor }) => { const href = node.getAttribute("href"); if (!href) return {}; // Encodes symbols that can cause problems in markdown let encodedHref = ""; for (const chr of href) { switch (chr) { case "(": encodedHref += "%28"; break; case ")": encodedHref += "%29"; break; case "_": encodedHref += "%5F"; break; case "*": encodedHref += "%2A"; break; default: encodedHref += chr; } } const title = node.getAttribute("title"); // Inline link, when possible // See: https://github.com/crosstype/node-html-markdown/issues/17 if (node.textContent === href && options.useInlineLinks) return { content: `<${encodedHref}>` }; return { postprocess: ({ content }) => content.replace(/(?:\r?\n)+/g, " "), childTranslators: visitor.instance.aTagTranslators, prefix: "[", postfix: "]" + (!options.useLinkReferenceDefinitions ? `(${encodedHref}${title ? ` "${title}"` : ""})` : `[${visitor.addOrGetUrlDefinition(encodedHref)}]`), }; }, /* Image */ img: ({ node, options }) => { const src = node.getAttribute("src") || ""; if (!src || (!options.keepDataImages && /^data:/i.test(src))) return { ignore: true }; const alt = node.getAttribute("alt") || ""; const title = node.getAttribute("title") || ""; return { content: `![${alt}](${src}${title && ` "${title}"`})`, recurse: false, }; }, }; exports.tableTranslatorConfig = { /* Table Caption */ caption: ({ visitor }) => ({ surroundingNewlines: false, childTranslators: visitor.instance.tableCellTranslators, postprocess: ({ content, nodeMetadata, node }) => { const caption = content.replace(/(?:\r?\n)+/g, " ").trim(); if (caption) nodeMetadata.get(node).tableMeta.caption = "__" + caption + "__"; return translator_1.PostProcessResult.RemoveNode; }, }), /* Table row */ tr: ({ visitor }) => ({ surroundingNewlines: false, childTranslators: visitor.instance.tableRowTranslators, postfix: "\n", prefix: "| ", postprocess: ({ content }) => (!/ \|\s*$/.test(content) ? translator_1.PostProcessResult.RemoveNode : content), }), /* Table cell, (header cell) */ "th,td": ({ visitor }) => ({ surroundingNewlines: false, childTranslators: visitor.instance.tableCellTranslators, prefix: " ", postfix: " |", postprocess: ({ content }) => (0, utilities_1.trimNewLines)(content) .replace("|", "\\|") .replace(/(?:\r?\n)+/g, " ") .trim(), }), }; exports.tableRowTranslatorConfig = { "th,td": exports.tableTranslatorConfig["th,td"], }; exports.tableCellTranslatorConfig = { a: exports.defaultTranslators["a"], "strong,b": exports.defaultTranslators["strong,b"], "del,s,strike": exports.defaultTranslators["del,s,strike"], "em,i": exports.defaultTranslators["em,i"], img: exports.defaultTranslators["img"], }; exports.defaultCodeBlockTranslators = { br: { content: `\n`, recurse: false }, hr: { content: "---", recurse: false }, "h1,h2,h3,h4,h5,h6": { prefix: "[", postfix: "]" }, "ol,ul": exports.defaultTranslators["ol,ul"], li: exports.defaultTranslators["li"], tr: { surroundingNewlines: true }, img: { recurse: false }, }; exports.aTagTranslatorConfig = { br: { content: "\n", recurse: false }, hr: { content: "\n", recurse: false }, pre: exports.defaultTranslators["pre"], "strong,b": exports.defaultTranslators["strong,b"], "del,s,strike": exports.defaultTranslators["del,s,strike"], "em,i": exports.defaultTranslators["em,i"], img: exports.defaultTranslators["img"], }; // endregion /* ****************************************************************************************************************** */ // region: General /* ****************************************************************************************************************** */ /** * Note: Do not change - values are tuned for performance */ exports.nodeHtmlParserConfig = { lowerCaseTagName: false, comment: false, fixNestedATags: true, blockTextElements: { script: false, noscript: false, style: false, }, }; // endregion //# sourceMappingURL=config.js.map