/*
 * @kingsword/node-html-markdown
 * Version: (not stated)
 * Fast HTML to markdown cross-compiler, compatible with both node and the browser
 * 389 lines • 15.3 kB • JavaScript
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.nodeHtmlParserConfig = exports.aTagTranslatorConfig = exports.defaultCodeBlockTranslators = exports.tableCellTranslatorConfig = exports.tableRowTranslatorConfig = exports.tableTranslatorConfig = exports.defaultTranslators = exports.defaultOptions = exports.contentlessElements = exports.defaultIgnoreElements = exports.defaultBlockElements = void 0;
const utilities_1 = require("./utilities");
const translator_1 = require("./translator");
/* ****************************************************************************************************************** */
// region: Elements
/* ****************************************************************************************************************** */
exports.defaultBlockElements = [
"ADDRESS",
"ARTICLE",
"ASIDE",
"AUDIO",
"BLOCKQUOTE",
"BODY",
"CANVAS",
"CENTER",
"DD",
"DIR",
"DIV",
"DL",
"DT",
"FIELDSET",
"FIGCAPTION",
"FIGURE",
"FOOTER",
"FORM",
"FRAMESET",
"H1",
"H2",
"H3",
"H4",
"H5",
"H6",
"HEADER",
"HGROUP",
"HR",
"HTML",
"ISINDEX",
"LI",
"MAIN",
"MENU",
"NAV",
"NOFRAMES",
"NOSCRIPT",
"OL",
"OUTPUT",
"P",
"PRE",
"SECTION",
"TABLE",
"TBODY",
"TD",
"TFOOT",
"TH",
"THEAD",
"TR",
"UL",
];
exports.defaultIgnoreElements = [
"AREA",
"BASE",
"COL",
"COMMAND",
"EMBED",
"HEAD",
"INPUT",
"KEYGEN",
"LINK",
"META",
"PARAM",
"SCRIPT",
"SOURCE",
"STYLE",
"TRACK",
"WBR",
];
exports.contentlessElements = ["BR", "HR", "IMG"];
// endregion
/* ****************************************************************************************************************** */
// region: Options
/* ****************************************************************************************************************** */
// noinspection RegExpUnnecessaryNonCapturingGroup
/**
 * Default option values.
 *
 * The object is frozen with `Object.freeze`, which is shallow: the top-level properties cannot be
 * reassigned, but the `[pattern, replacement]` arrays and the RegExp objects inside them stay mutable.
 * Both patterns carry the `g` flag, so they are stateful (`lastIndex`) if used with `test` / `exec`.
 *
 * NOTE(review): `indent` is not referenced by any translator in this file — confirm where it is consumed.
 */
exports.defaultOptions = Object.freeze({
preferNativeParser: false,
codeFence: "```",
bulletMarker: "*",
indent: " ",
codeBlockStyle: "fenced",
emDelimiter: "_",
strongDelimiter: "**",
strikeDelimiter: "~~",
maxConsecutiveNewlines: 3,
/**
 * `[pattern, replacement]` pair: every matched character is prefixed with a backslash (`\\$&`).
 *
 * Character:   Affects:                    Example:
 *
 * \            Escaping                    \-
 * `            Code                        `` code ``, ```lang\n code block \n```
 * *            Bullet & Separators         * item, ***
 * _            Bold, Italics, Separator    _italic_, __bold__, ^___
 * ~            Strikethrough, Code         ~~strike~~, ~~~lang\n code block \n~~~
 * [            Url                         [caption](url)
 * ]            Url                         [caption](url)
 */
globalEscape: [/[\\`*_~\[\]]/gm, "\\$&"],
/**
 * Note: The following compiled pattern was selected after perf testing various alternatives.
 * Please be mindful of performance if updating/changing it.
 *
 * Sequence:        Affects:                     Example:
 *
 * +(space)         Bullets                      + item
 * =                Heading                      heading\n====
 * #{1,6}(space)    Heading                      ## Heading
 * >                Blockquote                   > quote
 * -                Bullet, Header, Separator    - item, heading\n---, ---
 * \d+\.(space)     Numbered list item           1. Item
 *
 * Reading the pattern: alternation binds loosest, so it is `^(\s*?)(marker)` OR `(\d+)(\.\s)`.
 * Only the first alternative is anchored to a line start; the numbered-list alternative can match
 * anywhere in a line. In the replacement `$1$3\\$2$4`, the groups belonging to the alternative that
 * did not match expand to empty strings, so the backslash lands before the marker (`$2`) or before
 * the `. ` that follows the digits (`$4`).
 */
lineStartEscape: [/^(\s*?)((?:\+\s)|(?:[=>-])|(?:#{1,6}\s))|(?:(\d+)(\.\s))/gm, "$1$3\\$2$4"],
useInlineLinks: true,
});
// endregion
/* ****************************************************************************************************************** */
// region: Translators
/* ****************************************************************************************************************** */
exports.defaultTranslators = {
/* Pre-formatted text */
pre: { noEscape: true, preserveWhitespace: true },
/* Line break */
br: { content: ` \n`, recurse: false },
/* Horizontal Rule*/
hr: { content: "---", recurse: false },
/* Headings */
"h1,h2,h3,h4,h5,h6": ({ node }) => ({
prefix: "#".repeat(+node.tagName.charAt(1)) + " ",
}),
/* Bold / Strong */
"strong,b": {
spaceIfRepeatingChar: true,
postprocess: ({ content, options: { strongDelimiter } }) => (0, utilities_1.isWhiteSpaceOnly)(content) ? translator_1.PostProcessResult.RemoveNode : (0, utilities_1.tagSurround)(content, strongDelimiter),
},
/* Strikethrough */
"del,s,strike": {
spaceIfRepeatingChar: true,
postprocess: ({ content, options: { strikeDelimiter } }) => (0, utilities_1.isWhiteSpaceOnly)(content) ? translator_1.PostProcessResult.RemoveNode : (0, utilities_1.tagSurround)(content, strikeDelimiter),
},
/* Italic / Emphasis */
"em,i": {
spaceIfRepeatingChar: true,
postprocess: ({ content, options: { emDelimiter } }) => (0, utilities_1.isWhiteSpaceOnly)(content) ? translator_1.PostProcessResult.RemoveNode : (0, utilities_1.tagSurround)(content, emDelimiter),
},
/* Lists (ordered & unordered) */
"ol,ul": ({ listKind }) => ({
surroundingNewlines: listKind ? 1 : 2,
}),
/* List Item */
li: ({ options: { bulletMarker }, indentLevel, listKind, listItemNumber }) => {
const indentationLevel = +(indentLevel || 0);
return {
prefix: " ".repeat(+(indentLevel || 0)) +
(listKind === "OL" && listItemNumber !== undefined ? `${listItemNumber}. ` : `${bulletMarker} `),
surroundingNewlines: 1,
postprocess: ({ content }) => (0, utilities_1.isWhiteSpaceOnly)(content)
? translator_1.PostProcessResult.RemoveNode
: content
.trim()
.replace(/([^\r\n])(?:\r?\n)+/g, `$1 \n${" ".repeat(indentationLevel)}`)
.replace(/(\S+?)[^\S\r\n]+$/gm, "$1 "),
};
},
/* Block Quote */
blockquote: {
postprocess: ({ content }) => (0, utilities_1.trimNewLines)(content).replace(/^(>*)[^\S\r\n]?/gm, `>$1 `),
},
/* Code (block / inline) */
code: ({ node, parent, options: { codeFence, codeBlockStyle }, visitor }) => {
var _a, _b;
const isCodeBlock = ["PRE", "WRAPPED-PRE"].includes(parent === null || parent === void 0 ? void 0 : parent.tagName) && parent.childNodes.length < 2;
/* Handle code (non-block) */
if (!isCodeBlock)
return {
spaceIfRepeatingChar: true,
noEscape: true,
postprocess: ({ content }) => {
var _a, _b;
// Find longest occurring sequence of running backticks and add one more (so content is escaped)
const delimiter = "`" + (((_b = (_a = content.match(/`+/g)) === null || _a === void 0 ? void 0 : _a.sort((a, b) => b.length - a.length)) === null || _b === void 0 ? void 0 : _b[0]) || "");
const padding = delimiter.length > 1 ? " " : "";
return (0, utilities_1.surround)((0, utilities_1.surround)(content, padding), delimiter);
},
};
/* Handle code block */
if (codeBlockStyle === "fenced") {
const language = ((_b = (_a = node.getAttribute("class")) === null || _a === void 0 ? void 0 : _a.match(/language-(\S+)/)) === null || _b === void 0 ? void 0 : _b[1]) || "";
return {
noEscape: true,
prefix: codeFence + language + "\n",
postfix: "\n" + codeFence,
childTranslators: visitor.instance.codeBlockTranslators,
};
}
else {
return {
noEscape: true,
postprocess: ({ content }) => content.replace(/^/gm, " "),
childTranslators: visitor.instance.codeBlockTranslators,
};
}
},
/* Table */
table: ({ visitor }) => ({
surroundingNewlines: 2,
childTranslators: visitor.instance.tableTranslators,
postprocess: ({ content, nodeMetadata, node }) => {
// Split and trim leading + trailing pipes
const rawRows = (0, utilities_1.splitSpecial)(content).map(({ text }) => text.replace(/^(?:\|\s+)?(.+)\s*\|\s*$/, "$1"));
/* Get Row Data */
const rows = [];
let colWidth = [];
for (const row of rawRows) {
if (!row)
continue;
/* Track columns */
const cols = row.split(" |").map((c, i) => {
c = c.trim();
if (colWidth.length < i + 1 || (colWidth[i] && colWidth[i] < c.length))
colWidth[i] = c.length;
return c;
});
rows.push(cols);
}
if (rows.length < 1)
return translator_1.PostProcessResult.RemoveNode;
/* Compose Table */
const maxCols = colWidth.length;
let res = "";
const caption = nodeMetadata.get(node).tableMeta.caption;
if (caption)
res += caption + "\n";
rows.forEach((cols, rowNumber) => {
var _a;
res += "| ";
/* Add Columns */
for (let i = 0; i < maxCols; i++) {
let c = (_a = cols[i]) !== null && _a !== void 0 ? _a : "";
let colWidth_ = colWidth[i];
c += " ".repeat(Math.max(0, colWidth_ ? colWidth_ - c.length : 0)); // Pad to max length
res += c + " |" + (i < maxCols - 1 ? " " : "");
}
res += "\n";
// Add separator row
if (rowNumber === 0)
res += "|" + colWidth.map((w) => " " + "-".repeat(w) + " |").join("") + "\n";
});
return res;
},
}),
/* Link */
a: ({ node, options, visitor }) => {
const href = node.getAttribute("href");
if (!href)
return {};
// Encodes symbols that can cause problems in markdown
let encodedHref = "";
for (const chr of href) {
switch (chr) {
case "(":
encodedHref += "%28";
break;
case ")":
encodedHref += "%29";
break;
case "_":
encodedHref += "%5F";
break;
case "*":
encodedHref += "%2A";
break;
default:
encodedHref += chr;
}
}
const title = node.getAttribute("title");
// Inline link, when possible
// See: https://github.com/crosstype/node-html-markdown/issues/17
if (node.textContent === href && options.useInlineLinks)
return { content: `<${encodedHref}>` };
return {
postprocess: ({ content }) => content.replace(/(?:\r?\n)+/g, " "),
childTranslators: visitor.instance.aTagTranslators,
prefix: "[",
postfix: "]" +
(!options.useLinkReferenceDefinitions
? `(${encodedHref}${title ? ` "${title}"` : ""})`
: `[${visitor.addOrGetUrlDefinition(encodedHref)}]`),
};
},
/* Image */
img: ({ node, options }) => {
const src = node.getAttribute("src") || "";
if (!src || (!options.keepDataImages && /^data:/i.test(src)))
return { ignore: true };
const alt = node.getAttribute("alt") || "";
const title = node.getAttribute("title") || "";
return {
content: ``,
recurse: false,
};
},
};
exports.tableTranslatorConfig = {
/* Table Caption */
caption: ({ visitor }) => ({
surroundingNewlines: false,
childTranslators: visitor.instance.tableCellTranslators,
postprocess: ({ content, nodeMetadata, node }) => {
const caption = content.replace(/(?:\r?\n)+/g, " ").trim();
if (caption)
nodeMetadata.get(node).tableMeta.caption = "__" + caption + "__";
return translator_1.PostProcessResult.RemoveNode;
},
}),
/* Table row */
tr: ({ visitor }) => ({
surroundingNewlines: false,
childTranslators: visitor.instance.tableRowTranslators,
postfix: "\n",
prefix: "| ",
postprocess: ({ content }) => (!/ \|\s*$/.test(content) ? translator_1.PostProcessResult.RemoveNode : content),
}),
/* Table cell, (header cell) */
"th,td": ({ visitor }) => ({
surroundingNewlines: false,
childTranslators: visitor.instance.tableCellTranslators,
prefix: " ",
postfix: " |",
postprocess: ({ content }) => (0, utilities_1.trimNewLines)(content)
.replace("|", "\\|")
.replace(/(?:\r?\n)+/g, " ")
.trim(),
}),
};
/**
 * Translators for the children of a table row: only the cell translator, shared by reference with
 * the `th,td` entry of `tableTranslatorConfig`.
 */
{
  const cellKey = "th,td";
  exports.tableRowTranslatorConfig = { [cellKey]: exports.tableTranslatorConfig[cellKey] };
}
exports.tableCellTranslatorConfig = {
a: exports.defaultTranslators["a"],
"strong,b": exports.defaultTranslators["strong,b"],
"del,s,strike": exports.defaultTranslators["del,s,strike"],
"em,i": exports.defaultTranslators["em,i"],
img: exports.defaultTranslators["img"],
};
exports.defaultCodeBlockTranslators = {
br: { content: `\n`, recurse: false },
hr: { content: "---", recurse: false },
"h1,h2,h3,h4,h5,h6": { prefix: "[", postfix: "]" },
"ol,ul": exports.defaultTranslators["ol,ul"],
li: exports.defaultTranslators["li"],
tr: { surroundingNewlines: true },
img: { recurse: false },
};
exports.aTagTranslatorConfig = {
br: { content: "\n", recurse: false },
hr: { content: "\n", recurse: false },
pre: exports.defaultTranslators["pre"],
"strong,b": exports.defaultTranslators["strong,b"],
"del,s,strike": exports.defaultTranslators["del,s,strike"],
"em,i": exports.defaultTranslators["em,i"],
img: exports.defaultTranslators["img"],
};
// endregion
/* ****************************************************************************************************************** */
// region: General
/* ****************************************************************************************************************** */
/**
 * Options object for the HTML parser (presumably the `node-html-parser` package, going by the
 * export name — confirm against the call site).
 *
 * Note: Do not change - values are tuned for performance
 */
exports.nodeHtmlParserConfig = {
// Tag names are not lower-cased; the element lists and tag checks in this file use upper-case names
lowerCaseTagName: false,
comment: false,
fixNestedATags: true,
// script / noscript / style are all switched off here
blockTextElements: {
script: false,
noscript: false,
style: false,
},
};
// endregion
//# sourceMappingURL=config.js.map