UNPKG

turndown

Version:

A library that converts HTML to Markdown

838 lines (714 loc) 20.1 kB
/**
 * Copies the own enumerable properties of each source object onto
 * `destination`. Later sources overwrite earlier ones.
 *
 * @param {Object} destination object that receives the properties
 * @param {...Object} sources any number of objects to copy from; `null` and
 *   `undefined` sources are skipped by the `for...in` loop
 * @return {Object} the same `destination` object
 */
function extend (destination) {
  for (var i = 1; i < arguments.length; i++) {
    var source = arguments[i];
    for (var key in source) {
      // Borrow hasOwnProperty so that sources created with
      // Object.create(null), or carrying their own `hasOwnProperty` key,
      // are still handled.
      if (Object.prototype.hasOwnProperty.call(source, key)) destination[key] = source[key];
    }
  }
  return destination
}

/**
 * Builds a string made of `character` repeated `count` times.
 *
 * @param {String} character
 * @param {Number} count
 * @return {String}
 */
function repeat (character, count) {
  return Array(count + 1).join(character)
}

/**
 * Tells whether the node's tag name is in the list of block-level elements
 * used by the converters (case-insensitive).
 *
 * @param {Node} node
 * @return {Boolean}
 */
function isBlock (node) {
  return [
    'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas',
    'center', 'dd', 'dir', 'div', 'dl', 'dt', 'fieldset', 'figcaption',
    'figure', 'footer', 'form', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5',
    'h6', 'header', 'hgroup', 'hr', 'html', 'isindex', 'li', 'main', 'menu',
    'nav', 'noframes', 'noscript', 'ol', 'output', 'p', 'pre', 'section',
    'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'ul'
  ].indexOf(node.nodeName.toLowerCase()) !== -1
}

/*
 * Default converters. Each converter has a `filter` (tag name, array of tag
 * names, or predicate `(node, options)`) and a `replacement(content, node,
 * options)` function returning the Markdown for a matching element.
 */
var converters = {};

converters.paragraph = {
  filter: 'p',

  replacement: function (content) {
    return '\n\n' + content + '\n\n'
  }
};

converters.lineBreak = {
  filter: 'br',

  replacement: function () {
    // A Markdown hard line break needs at least two trailing spaces.
    return '  \n'
  }
};

converters.heading = {
  filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],

  replacement: function (content, node, options) {
    // The heading level is the digit in the tag name (H1..H6).
    var hLevel = Number(node.nodeName.charAt(1));

    // Setext headings only exist for levels 1 and 2.
    if (options.headingStyle === 'setext' && hLevel < 3) {
      var underline = repeat((hLevel === 1 ? '=' : '-'), content.length);
      return (
        '\n\n' + content + '\n' + underline + '\n\n'
      )
    } else {
      return '\n\n' + repeat('#', hLevel) + ' ' + content + '\n\n'
    }
  }
};

converters.blockquote = {
  filter: 'blockquote',

  replacement: function (content) {
    content = content.replace(/^\n+|\n+$/g, '');
    // Prefix every line, including empty ones, with the quote marker.
    content = content.replace(/^/gm, '> ');
    return '\n\n' + content + '\n\n'
  }
};

converters.list = {
  filter: ['ul', 'ol'],

  replacement: function (content, node) {
    var parent = node.parentNode;
    // A list that is the last element of a list item is nested: use a
    // single newline so it stays attached to its parent item.
    if (parent.nodeName === 'LI' && parent.lastElementChild === node) {
      return '\n' + content
    } else {
      return '\n\n' + content + '\n\n'
    }
  }
};

converters.listItem = {
  filter: 'li',

  replacement: function (content, node, options) {
    content = content
      .replace(/^\n+/, '') // remove leading newlines
      .replace(/\n+$/, '\n') // replace trailing newlines with just a single one
      .replace(/\n/gm, '\n    '); // indent continuation lines by four spaces
    var prefix = options.bulletListMarker + ' ';
    var parent = node.parentNode;
    if (parent.nodeName === 'OL') {
      // Number the item from the list's `start` attribute when present,
      // otherwise from 1, using the item's position among the children.
      var start = parent.getAttribute('start');
      var index = Array.prototype.indexOf.call(parent.children, node);
      prefix = (start ? Number(start) + index : index + 1) + '. ';
    }
    return (
      prefix + content + (node.nextSibling && !/\n$/.test(content) ? '\n' : '')
    )
  }
};

converters.indentedCodeBlock = {
  filter: function (node, options) {
    return (
      options.codeBlockStyle === 'indented' &&
      node.nodeName === 'PRE' &&
      // Guard against an empty <pre>, whose firstChild is null.
      node.firstChild &&
      node.firstChild.nodeName === 'CODE'
    )
  },

  replacement: function (content, node, options) {
    // Indented code blocks require four leading spaces on every line.
    return (
      '\n\n    ' +
      node.firstChild.textContent.replace(/\n/g, '\n    ') +
      '\n\n'
    )
  }
};

converters.fencedCodeBlock = {
  filter: function (node, options) {
    return (
      options.codeBlockStyle === 'fenced' &&
      node.nodeName === 'PRE' &&
      // Guard against an empty <pre>, whose firstChild is null.
      node.firstChild &&
      node.firstChild.nodeName === 'CODE'
    )
  },

  replacement: function (content, node, options) {
    var className = node.firstChild.className || '';
    // Use the suffix of a `language-xxx` class as the fence info string.
    var language = (className.match(/language-(\S+)/) || [null, ''])[1];

    return (
      '\n\n' + options.fence + language + '\n' +
      node.firstChild.textContent +
      '\n' + options.fence + '\n\n'
    )
  }
};

converters.horizontalRule = {
  filter: 'hr',

  replacement: function (content, node, options) {
    return '\n\n' + options.hr + '\n\n'
  }
};

converters.inlineLink = {
  filter: function (node, options) {
    return (
      options.linkStyle === 'inlined' &&
      node.nodeName === 'A' &&
      node.getAttribute('href')
    )
  },

  replacement: function (content, node) {
    var href = node.getAttribute('href');
    var title = node.title ? ' "' + node.title + '"' : '';
    return '[' + content + '](' + href + title + ')'
  }
};

converters.referenceLink = {
  filter: function (node, options) {
    return (
      options.linkStyle === 'referenced' &&
      node.nodeName === 'A' &&
      node.getAttribute('href')
    )
  },

  replacement: function (content, node, options) {
    var href = node.getAttribute('href');
    var title = node.title ? ' "' + node.title + '"' : '';
    var replacement;
    var reference;

    switch (options.linkReferenceStyle) {
      case 'collapsed':
        replacement = '[' + content + '][]';
        reference = '[' + content + ']: ' + href + title;
        break
      case 'shortcut':
        replacement = '[' + content + ']';
        reference = '[' + content + ']: ' + href + title;
        break
      default:
        // "full" style: numbered references, 1-based.
        var id = this.references.length + 1;
        replacement = '[' + content + '][' + id + ']';
        reference = '[' + id + ']: ' + href + title;
    }

    this.references.push(reference);
    return replacement
  },

  // Reference definitions collected during a conversion. This array lives on
  // the converter object itself and is emptied by `append`.
  references: [],

  // Called from `postProcess`: returns the collected reference definitions
  // (or an empty string) and resets the list.
  append: function (options) {
    var references = '';
    if (this.references.length) {
      references = '\n\n' + this.references.join('\n') + '\n\n';
      this.references = []; // Reset references
    }
    return references
  }
};

converters.emphasis = {
  filter: ['em', 'i'],

  replacement: function (content, node, options) {
    return options.emDelimiter + content + options.emDelimiter
  }
};

converters.strong = {
  filter: ['strong', 'b'],

  replacement: function (content, node, options) {
    return options.strongDelimiter + content + options.strongDelimiter
  }
};

converters.code = {
  filter: function (node) {
    // A <code> that is the only child of a <pre> is a code block and is
    // handled by the code block converters, not as inline code.
    var hasSiblings = node.previousSibling || node.nextSibling;
    var isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings;

    return node.nodeName === 'CODE' && !isCodeBlock
  },

  replacement: function (content) {
    return '`' + content + '`'
  }
};

converters.image = {
  filter: 'img',

  replacement: function (content, node) {
    var alt = node.alt || '';
    var src = node.getAttribute('src') || '';
    var title = node.title || '';
    var titlePart = title ? ' "' + title + '"' : '';
    // Images without a source produce no output.
    return src ? '![' + alt + ']' + '(' + src + titlePart + ')' : ''
  }
};

/**
 * This file automatically generated from `pre-publish.js`.
 * Do not manually edit.
 */

var index = {
  "area": true,
  "base": true,
  "br": true,
  "col": true,
  "embed": true,
  "hr": true,
  "img": true,
  "input": true,
  "keygen": true,
  "link": true,
  "menuitem": true,
  "meta": true,
  "param": true,
  "source": true,
  "track": true,
  "wbr": true
};

/**
 * This file automatically generated from `build.js`.
 * Do not manually edit.
 */

var index$2 = [
  "address",
  "article",
  "aside",
  "audio",
  "blockquote",
  "canvas",
  "dd",
  "div",
  "dl",
  "fieldset",
  "figcaption",
  "figure",
  "footer",
  "form",
  "h1",
  "h2",
  "h3",
  "h4",
  "h5",
  "h6",
  "header",
  "hgroup",
  "hr",
  "main",
  "nav",
  "noscript",
  "ol",
  "output",
  "p",
  "pre",
  "section",
  "table",
  "tfoot",
  "ul",
  "video"
];

// Add upper-case keys so lookups can use `node.nodeName` directly.
var voidElements = index;
Object.keys(voidElements).forEach(function (name) {
  voidElements[name.toUpperCase()] = 1;
});

var blockElements = {};
index$2.forEach(function (name) {
  blockElements[name.toUpperCase()] = 1;
});

/**
 * isBlockElem(node) determines if the given node is a block element.
 *
 * @param {Node} node
 * @return {Boolean}
 */
function isBlockElem(node) {
  return !!(node && blockElements[node.nodeName]);
}

/**
 * isVoid(node) determines if the given node is a void element.
 *
 * @param {Node} node
 * @return {Boolean}
 */
function isVoid(node) {
  return !!(node && voidElements[node.nodeName]);
}

/**
 * whitespace(elem [, isBlock]) removes extraneous whitespace from
 * the given element. The function isBlock may optionally be passed in
 * to determine whether or not an element is a block element; if none
 * is provided, defaults to using the list of block elements provided
 * by the `block-elements` module.
 *
 * The element is modified in place: runs of whitespace in text nodes are
 * collapsed to a single space, text nodes that become empty are removed,
 * and nodes that are neither text nor element are removed. The contents of
 * PRE elements are left untouched.
 *
 * @param {Node} elem
 * @param {Function} blockTest
 */
function collapseWhitespace(elem, isBlock) {
  if (!elem.firstChild || elem.nodeName === 'PRE') return;

  if (typeof isBlock !== 'function') {
    isBlock = isBlockElem;
  }

  var prevText = null;
  var prevVoid = false;

  var prev = null;
  var node = next(prev, elem);

  // Walk the subtree in document order until the walk returns to `elem`.
  while (node !== elem) {
    if (node.nodeType === 3) {
      // Node.TEXT_NODE
      var text = node.data.replace(/[ \r\n\t]+/g, ' ');

      // Drop a leading space when the preceding text already ends with one
      // (or there is no preceding text) and no void element sits in between.
      if ((!prevText || / $/.test(prevText.data)) && !prevVoid && text[0] === ' ') {
        text = text.substr(1);
      }

      // `text` might be empty at this point.
      if (!text) {
        node = remove(node);
        continue;
      }

      node.data = text;
      prevText = node;
    } else if (node.nodeType === 1) {
      // Node.ELEMENT_NODE
      if (isBlock(node) || node.nodeName === 'BR') {
        // Text directly before a block or line break loses its trailing space.
        if (prevText) {
          prevText.data = prevText.data.replace(/ $/, '');
        }

        prevText = null;
        prevVoid = false;
      } else if (isVoid(node)) {
        // Avoid trimming space around non-block, non-BR void elements.
        prevText = null;
        prevVoid = true;
      }
    } else {
      node = remove(node);
      continue;
    }

    var nextNode = next(prev, node);
    prev = node;
    node = nextNode;
  }

  // Trim the trailing space of the last text node and drop it if now empty.
  if (prevText) {
    prevText.data = prevText.data.replace(/ $/, '');
    if (!prevText.data) {
      remove(prevText);
    }
  }
}

/**
 * remove(node) removes the given node from the DOM and returns the
 * next node in the sequence.
 *
 * @param {Node} node
 * @return {Node} node
 */
function remove(node) {
  var next = node.nextSibling || node.parentNode;

  node.parentNode.removeChild(node);

  return next;
}

/**
 * next(prev, current) returns the next node in the sequence, given the
 * current and previous nodes.
 *
 * When the walk has just come back up from a child of `current`, or when
 * `current` is a PRE element, its children are not entered.
 *
 * @param {Node} prev
 * @param {Node} current
 * @return {Node}
 */
function next(prev, current) {
  if (prev && prev.parentNode === current || current.nodeName === 'PRE') {
    return current.nextSibling || current.parentNode;
  }

  return current.firstChild || current.nextSibling || current.parentNode;
}

var whitespace = collapseWhitespace;

/*
 * Set up window for Node.js
 */

var root = (typeof window !== 'undefined' ? window : {});

/*
 * Parsing HTML strings
 */

/**
 * Feature-detects whether `root.DOMParser` can parse 'text/html'.
 *
 * @return {Boolean}
 */
function canParseHTMLNatively () {
  var Parser = root.DOMParser;
  var canParse = false;

  // Adapted from https://gist.github.com/1129031
  // Firefox/Opera/IE throw errors on unsupported types
  try {
    // WebKit returns null on unsupported types
    if (new Parser().parseFromString('', 'text/html')) {
      canParse = true;
    }
  } catch (e) {}

  return canParse
}

/**
 * Builds a fallback parser constructor exposing `parseFromString(string)`.
 * Without a global `document` it uses `jsdom`; otherwise it writes the
 * string into a new HTML document, or into an ActiveX `htmlfile` object
 * when `shouldUseActiveX()` reports that the standard route fails.
 *
 * @return {Function} parser constructor
 */
function createHTMLParser () {
  var Parser = function () {};

  // For Node.js environments
  if (typeof document === 'undefined') {
    var jsdom = require('jsdom');
    Parser.prototype.parseFromString = function (string) {
      return jsdom.jsdom(string, {
        features: {
          FetchExternalResources: [],
          ProcessExternalResources: false
        }
      })
    };
  } else {
    if (!shouldUseActiveX()) {
      Parser.prototype.parseFromString = function (string) {
        var doc = document.implementation.createHTMLDocument('');
        doc.open();
        doc.write(string);
        doc.close();
        return doc
      };
    } else {
      Parser.prototype.parseFromString = function (string) {
        var doc = new window.ActiveXObject('htmlfile');
        doc.designMode = 'on'; // disable on-page scripts
        doc.open();
        doc.write(string);
        doc.close();
        return doc
      };
    }
  }
  return Parser
}

/**
 * Reports whether the ActiveX fallback is needed: true only when opening a
 * document created by `createHTMLDocument` throws and `window.ActiveXObject`
 * exists.
 *
 * @return {Boolean}
 */
function shouldUseActiveX () {
  var useActiveX = false;

  try {
    document.implementation.createHTMLDocument('').open();
  } catch (e) {
    if (window.ActiveXObject) useActiveX = true;
  }

  return useActiveX
}

// Parser constructor. It is resolved on first use by `htmlParser()` rather
// than at module load, so that importing this module does not need `jsdom`
// (or `require`) unless an HTML string is actually converted.
var HTMLParser;

/**
 * Produces the root node for a conversion: the `body` of the parsed
 * document for string input, or a deep clone of the given node. Whitespace
 * is collapsed on the result before it is returned.
 *
 * @param {String|Node} input
 * @return {Node}
 */
function RootNode (input) {
  var root;
  if (typeof input === 'string') {
    root = htmlParser().parseFromString(input, 'text/html').body;
  } else {
    root = input.cloneNode(true);
  }
  whitespace(root, isBlock);
  return root
}

var _htmlParser;

/**
 * Returns the shared parser instance, creating it (and choosing the parser
 * implementation) on the first call.
 *
 * @return {Object} object exposing `parseFromString`
 */
function htmlParser () {
  if (!_htmlParser) {
    HTMLParser = HTMLParser || (canParseHTMLNatively() ? root.DOMParser : createHTMLParser());
    _htmlParser = new HTMLParser();
  }
  return _htmlParser
}

/**
 * Decorates a DOM node in place with the flags the converters rely on:
 * `isBlock`, `isVoid`, `isBlank` and `flankingWhitespace`.
 *
 * @param {Node} node
 * @return {Node} the same node
 */
function Node (node) {
  node.isBlock = isBlock(node);
  node.isVoid = isVoid$1(node);
  node.isBlank = isBlank(node);
  node.flankingWhitespace = flankingWhitespace(node);
  return node
}

/**
 * Tells whether the node's tag name is in the list of void elements
 * (case-insensitive).
 *
 * @param {Node} node
 * @return {Boolean}
 */
function isVoid$1 (node) {
  return [
    'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
  ].indexOf(node.nodeName.toLowerCase()) !== -1
}

/**
 * Tells whether a node has only whitespace as text content. Void elements
 * and A, TH and TD elements are never considered blank.
 *
 * @param {Node} node
 * @return {Boolean}
 */
function isBlank (node) {
  return (
    !isVoid$1(node) &&
    // Anchored on purpose: an unanchored /A|TH|TD/ would also match SPAN,
    // TABLE, NAV and every other tag name containing an "A".
    !/^(A|TH|TD)$/.test(node.nodeName) &&
    /^\s*$/i.test(node.textContent)
  )
}

/**
 * Computes the single spaces to emit before and after an inline node whose
 * text content starts or ends with whitespace, unless the adjacent sibling
 * already supplies that space. Block nodes always get empty strings.
 *
 * @param {Node} node
 * @return {{leading: String, trailing: String}}
 */
function flankingWhitespace (node) {
  var leading = '';
  var trailing = '';

  if (!isBlock(node)) {
    var hasLeading = /^[ \r\n\t]/.test(node.textContent);
    var hasTrailing = /[ \r\n\t]$/.test(node.textContent);

    if (hasLeading && !isFlankedByWhitespace('left', node)) {
      leading = ' ';
    }
    if (hasTrailing && !isFlankedByWhitespace('right', node)) {
      trailing = ' ';
    }
  }

  return { leading: leading, trailing: trailing }
}

/**
 * Tells whether the sibling on the given side borders the node with a space.
 *
 * @param {String} side 'left' checks the previous sibling; any other value
 *   checks the next sibling
 * @param {Node} node
 * @return {Boolean|undefined} undefined when there is no sibling, or when
 *   the sibling is neither a text node nor a non-block element
 */
function isFlankedByWhitespace (side, node) {
  var sibling;
  var regExp;
  var isFlanked;

  if (side === 'left') {
    sibling = node.previousSibling;
    regExp = / $/;
  } else {
    sibling = node.nextSibling;
    regExp = /^ /;
  }

  if (sibling) {
    if (sibling.nodeType === 3) {
      isFlanked = regExp.test(sibling.nodeValue);
    } else if (sibling.nodeType === 1 && !isBlock(sibling)) {
      isFlanked = regExp.test(sibling.textContent);
    }
  }
  return isFlanked
}

var reduce = Array.prototype.reduce;
var leadingNewLinesRegExp = /^\n*/;
var trailingNewLinesRegExp = /\n*$/;

/**
 * Creates an HTML-to-Markdown conversion service.
 *
 * @param {Object} [options] overrides merged (shallowly) over the defaults
 *   declared below
 */
function TurndownService (options) {
  var defaults = {
    converters: converters,
    headingStyle: 'setext',
    hr: '* * *',
    bulletListMarker: '*',
    codeBlockStyle: 'indented',
    fence: '```',
    emDelimiter: '_',
    strongDelimiter: '**',
    linkStyle: 'inlined',
    linkReferenceStyle: 'full',
    // Used for nodes flagged as blank.
    blankConverter: {
      replacement: function (content, node) {
        return node.isBlock ? '\n\n' : ''
      }
    },
    // Used when no other converter matches.
    defaultConverter: {
      replacement: function (content, node) {
        return node.isBlock ? '\n\n' + content + '\n\n' : content
      }
    },
    // Nodes kept as raw HTML: tables, and <pre> elements that do not wrap
    // a <code> element.
    keepConverter: {
      filter: function (node) {
        switch (node.nodeName) {
          case 'TABLE':
            return true
          case 'PRE':
            // An empty <pre> has no firstChild; it is kept like any other
            // <pre> that does not start with <code>.
            return !node.firstChild || node.firstChild.nodeName !== 'CODE'
          default:
            return false
        }
      },
      replacement: function (content, node) {
        return node.isBlock ? '\n\n' + node.outerHTML + '\n\n' : node.outerHTML
      }
    },
    // Nodes dropped from the output entirely.
    removeConverter: {
      filter: ['head'],
      replacement: function () {
        return ''
      }
    }
  };
  this.options = extend({}, defaults, options);
}

TurndownService.prototype = {
  /**
   * Converts an HTML string or DOM node to Markdown.
   *
   * @param {String|Node} input HTML string, or an element, document or
   *   document fragment node
   * @return {String} Markdown
   * @throws {TypeError} when `input` is none of the accepted types
   */
  turndown: function (input) {
    if (!canConvert(input)) {
      throw new TypeError(
        input + ' is not a string, or an element/document/fragment node.'
      )
    }

    if (input === '') {
      return ''
    }

    var root = new RootNode(input);
    return this.postProcess(this.process(root))
  },

  /**
   * Reduces a DOM node down to its Markdown string equivalent
   */

  process: function process (parentNode) {
    var _this = this;
    return reduce.call(parentNode.childNodes, function (output, node) {
      node = new Node(node);

      // Nodes that are neither text nor element contribute nothing.
      var replacement = '';
      if (node.nodeType === 3) {
        replacement = _this.escape(node.nodeValue);
      } else if (node.nodeType === 1) {
        replacement = _this.replacementForNode(node);
      }

      return join(output, replacement)
    }, '')
  },

  /**
   * Escapes Markdown syntax
   */

  escape: function escape (string) {
    return (
      string
        // Escape hr
        .replace(/^([-*_] *){3,}$/gm, function (match, character) {
          return match.split(character).join('\\' + character)
        })

        // Escape ol bullet points
        .replace(/^(\W* {0,3})(\d+)\. /gm, '$1$2\\. ')

        // Escape ul bullet points
        .replace(/^([^\\\w]*)([*+-]) /gm, '$1\\$2 ')

        // Escape blockquote indents
        .replace(/^(\W* {0,3})> /gm, '$1\\> ')

        // Escape em/strong *
        .replace(/\*{1,2}([^\W*]+\W*)+\*{1,2}/g, function (match) {
          return match.replace(/\*/g, '\\*')
        })

        // Escape em/strong _
        .replace(/_{1,2}([^\W_]+\W*)+_{1,2}/g, function (match) {
          return match.replace(/_/g, '\\_')
        })

        // Escape `
        .replace(/`([^\W`]+\W*)+`/g, function (match) {
          return match.replace(/`/g, '\\`')
        })

        // Escape link brackets
        .replace(/\[([^\]]*)\]/g, '\\[$1\\]') // eslint-disable-line no-useless-escape
    )
  },

  /**
   * Converts an element node to its Markdown equivalent
   */

  replacementForNode: function replacementForNode (node) {
    var converter = this.converterForNode(node);
    var content = this.process(node);
    var whitespace = node.flankingWhitespace;
    // Flanking whitespace is re-emitted outside the delimiters, so strip it
    // from the content itself.
    if (whitespace.leading || whitespace.trailing) content = content.trim();
    return (
      whitespace.leading +
      converter.replacement(content, node, this.options) +
      whitespace.trailing
    )
  },

  /**
   * Finds a converter for a given node
   *
   * Order of precedence: keep, remove, blank, the configured converters in
   * enumeration order, then the default converter.
   */

  converterForNode: function converterForNode (node) {
    if (this.filterValue(this.options.keepConverter, node)) {
      return this.options.keepConverter
    }

    if (this.filterValue(this.options.removeConverter, node)) {
      return this.options.removeConverter
    }

    if (node.isBlank) return this.options.blankConverter

    for (var key in this.options.converters) {
      var converter = this.options.converters[key];
      if (this.filterValue(converter, node)) return converter
    }

    return this.options.defaultConverter
  },

  /**
   * Evaluates a converter's filter against a node.
   *
   * @return {Boolean|undefined} true on a match, undefined otherwise
   * @throws {TypeError} when the filter is not a string, array or function
   */
  filterValue: function filterValue (converter, node) {
    var filter = converter.filter;
    if (typeof filter === 'string') {
      if (filter === node.nodeName.toLowerCase()) return true
    } else if (Array.isArray(filter)) {
      if (filter.indexOf(node.nodeName.toLowerCase()) > -1) return true
    } else if (typeof filter === 'function') {
      if (filter.call(converter, node, this.options)) return true
    } else {
      throw new TypeError('`filter` needs to be a string, array, or function')
    }
  },

  /**
   * Appends the output of every converter that defines `append`, then
   * trims leading tabs/newlines and all trailing whitespace.
   */
  postProcess: function postProcess (output) {
    for (var key in this.options.converters) {
      var converter = this.options.converters[key];
      if (typeof converter.append === 'function') {
        output = join(output, converter.append(this.options));
      }
    }

    return output.replace(/^[\t\r\n]+/, '').replace(/[\t\r\n\s]+$/, '')
  }
};

/**
 * Picks the longer of the newline run ending `output` and the newline run
 * starting `replacement`.
 *
 * @param {String} output
 * @param {String} replacement
 * @return {String}
 */
function separatingNewlines (output, replacement) {
  // Both entries contain only "\n", so sorting puts the longer run last.
  var newlines = [
    output.match(trailingNewLinesRegExp)[0],
    replacement.match(leadingNewLinesRegExp)[0]
  ].sort();
  return newlines[newlines.length - 1]
}

/**
 * Concatenates two Markdown fragments, merging the newlines at the seam
 * into the longer of the two runs.
 *
 * @param {String} string1
 * @param {String} string2
 * @return {String}
 */
function join (string1, string2) {
  var separator = separatingNewlines(string1, string2);

  // Remove trailing/leading newlines and replace with separator
  string1 = string1.replace(trailingNewLinesRegExp, '');
  string2 = string2.replace(leadingNewLinesRegExp, '');

  return string1 + separator + string2
}

/**
 * Determines whether an input can be converted
 *
 * @param {*} input
 * @return {Boolean} truthy for strings and for nodes whose nodeType is
 *   1 (element), 9 (document) or 11 (document fragment)
 */
function canConvert (input) {
  return (
    input != null && (
      typeof input === 'string' ||
      input.nodeType && (
        input.nodeType === 1 || input.nodeType === 9 || input.nodeType === 11
      )
    )
  )
}

export default TurndownService;