turndown
Version:
A library that converts HTML to Markdown
838 lines (714 loc) • 20.1 kB
JavaScript
/**
 * Copies the own enumerable properties of each source object onto
 * `destination`, in argument order, so later sources overwrite earlier ones.
 *
 * @param {Object} destination Object that receives the properties (mutated).
 * @param {...Object} sources Every further argument; null and undefined
 * sources contribute nothing.
 * @return {Object} The same `destination` object.
 */
function extend (destination) {
  // Borrow hasOwnProperty from Object.prototype so that sources created with
  // Object.create(null), or sources that shadow `hasOwnProperty`, still work.
  var hasOwn = Object.prototype.hasOwnProperty;
  for (var i = 1; i < arguments.length; i++) {
    var source = arguments[i];
    for (var key in source) {
      if (hasOwn.call(source, key)) destination[key] = source[key];
    }
  }
  return destination
}
/**
 * Builds a string consisting of `character` repeated `count` times.
 *
 * @param {String} character Text to repeat.
 * @param {Number} count Number of repetitions.
 * @return {String}
 */
function repeat (character, count) {
  // Joining count + 1 empty slots yields exactly `count` separators.
  var slots = new Array(count + 1);
  return slots.join(character)
}
/**
 * Tells whether a node's tag is in this file's list of block-level elements.
 *
 * @param {Node} node Any object exposing `nodeName`.
 * @return {Boolean}
 */
function isBlock (node) {
  var blockTagNames = [
    'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas',
    'center', 'dd', 'dir', 'div', 'dl', 'dt', 'fieldset', 'figcaption',
    'figure', 'footer', 'form', 'frameset', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'header', 'hgroup', 'hr', 'html', 'isindex', 'li', 'main', 'menu', 'nav',
    'noframes', 'noscript', 'ol', 'output', 'p', 'pre', 'section', 'table',
    'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', 'ul'
  ];
  // nodeName casing varies, so compare in lowercase.
  var tagName = node.nodeName.toLowerCase();
  return blockTagNames.indexOf(tagName) > -1
}
// Registry of the built-in converters, keyed by name. Each entry has a
// `filter` (tag name, list of tag names, or predicate) and a `replacement`
// function that returns the Markdown for a matching node.
var converters = {};

// <p>: the content set off by a blank line on either side.
converters.paragraph = {
  filter: 'p',
  replacement: function (content) {
    var blankLine = '\n\n';
    return blankLine + content + blankLine
  }
};
// <br>: a Markdown hard line break. The line must end with at least two
// spaces before the newline; a single trailing space is only a soft break.
converters.lineBreak = {
  filter: 'br',
  replacement: function () {
    return '  \n'
  }
};
// <h1>-<h6>: setext underlines for levels 1 and 2 when
// options.headingStyle is 'setext'; ATX '#' prefixes in every other case.
converters.heading = {
  filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
  replacement: function (content, node, options) {
    // The heading level is the digit in the tag name (H1 -> 1).
    var level = Number(node.nodeName.charAt(1));
    var useSetext = options.headingStyle === 'setext' && level < 3;
    if (!useSetext) {
      return '\n\n' + repeat('#', level) + ' ' + content + '\n\n'
    }
    var underlineChar = level === 1 ? '=' : '-';
    var underline = repeat(underlineChar, content.length);
    return '\n\n' + content + '\n' + underline + '\n\n'
  }
};
// <blockquote>: every line of the content prefixed with '> '.
converters.blockquote = {
  filter: 'blockquote',
  replacement: function (content) {
    // Drop surrounding newlines first so they do not become empty quote lines.
    var trimmed = content.replace(/^\n+|\n+$/g, '');
    var quoted = trimmed.replace(/^/gm, '> ');
    return '\n\n' + quoted + '\n\n'
  }
};
// <ul>/<ol>: the already converted items. A list that is the last element
// child of an <li> gets only one leading newline; any other list is
// surrounded by blank lines.
converters.list = {
  filter: ['ul', 'ol'],
  replacement: function (content, node) {
    var container = node.parentNode;
    var isTrailingNestedList =
      container.nodeName === 'LI' && container.lastElementChild === node;
    return isTrailingNestedList ? '\n' + content : '\n\n' + content + '\n\n'
  }
};
// <li>: the item marker followed by the content. Ordered lists are numbered
// from the parent's `start` attribute (or 1); unordered lists use
// options.bulletListMarker.
converters.listItem = {
  filter: 'li',
  replacement: function (content, node, options) {
    content = content
      .replace(/^\n+/, '') // remove leading newlines
      .replace(/\n+$/, '\n') // replace trailing newlines with just a single one
      // Indent continuation lines by four spaces: they must reach at least
      // the column where the item's text starts ("* ", "1. ", "10. ") to
      // remain part of the list item.
      .replace(/\n/gm, '\n    ');
    var prefix = options.bulletListMarker + ' ';
    var parent = node.parentNode;
    if (parent.nodeName === 'OL') {
      var start = parent.getAttribute('start');
      var index = Array.prototype.indexOf.call(parent.children, node);
      prefix = (start ? Number(start) + index : index + 1) + '. ';
    }
    // Terminate the item with a newline when another node follows and the
    // content does not already end with one.
    return (
      prefix + content + (node.nextSibling && !/\n$/.test(content) ? '\n' : '')
    )
  }
};
// <pre><code>…</code></pre> when options.codeBlockStyle is 'indented':
// the code text with every line indented by four spaces.
converters.indentedCodeBlock = {
  filter: function (node, options) {
    return (
      options.codeBlockStyle === 'indented' &&
      node.nodeName === 'PRE' &&
      // An empty <pre> has no first child; it must not match (or throw).
      !!node.firstChild &&
      node.firstChild.nodeName === 'CODE'
    )
  },
  replacement: function (content, node, options) {
    // The raw text of the <code> child is used rather than `content`, and
    // each line gets the four-space indent an indented code block needs.
    return (
      '\n\n    ' +
      node.firstChild.textContent.replace(/\n/g, '\n    ') +
      '\n\n'
    )
  }
};
// <pre><code>…</code></pre> when options.codeBlockStyle is 'fenced': the
// code text between options.fence lines, with the language taken from a
// `language-xxx` class on the <code> element when present.
converters.fencedCodeBlock = {
  filter: function (node, options) {
    return (
      options.codeBlockStyle === 'fenced' &&
      node.nodeName === 'PRE' &&
      // An empty <pre> has no first child; it must not match (or throw).
      !!node.firstChild &&
      node.firstChild.nodeName === 'CODE'
    )
  },
  replacement: function (content, node, options) {
    var className = node.firstChild.className || '';
    // Falls back to an empty info string when no language class is found.
    var language = (className.match(/language-(\S+)/) || [null, ''])[1];
    return (
      '\n\n' + options.fence + language + '\n' +
      node.firstChild.textContent +
      '\n' + options.fence + '\n\n'
    )
  }
};
// <hr>: the configured rule text (options.hr) surrounded by blank lines.
converters.horizontalRule = {
  filter: 'hr',
  replacement: function (content, node, options) {
    var rule = options.hr;
    return '\n\n' + rule + '\n\n'
  }
};
// <a href>: an inline link `[text](href "title")`, used when
// options.linkStyle is 'inlined'.
converters.inlineLink = {
  filter: function (node, options) {
    if (options.linkStyle !== 'inlined') return false
    if (node.nodeName !== 'A') return false
    // Anchors without an href yield a falsy value and so do not match.
    return node.getAttribute('href')
  },
  replacement: function (content, node) {
    var titlePart = '';
    if (node.title) {
      titlePart = ' "' + node.title + '"';
    }
    return '[' + content + '](' + node.getAttribute('href') + titlePart + ')'
  }
};
// <a href>: a reference-style link, used when options.linkStyle is
// 'referenced'. Each replacement queues a definition line in `references`;
// `append` returns the queued definitions as one block and empties the queue.
converters.referenceLink = {
  filter: function (node, options) {
    if (options.linkStyle !== 'referenced') return false
    if (node.nodeName !== 'A') return false
    // Anchors without an href yield a falsy value and so do not match.
    return node.getAttribute('href')
  },
  replacement: function (content, node, options) {
    var destination = node.getAttribute('href');
    if (node.title) {
      destination += ' "' + node.title + '"';
    }
    var style = options.linkReferenceStyle;
    var label;
    var replacement;
    if (style === 'collapsed') {
      // [text][] with the link text itself as the label
      label = content;
      replacement = '[' + content + '][]';
    } else if (style === 'shortcut') {
      // [text] with the link text itself as the label
      label = content;
      replacement = '[' + content + ']';
    } else {
      // [text][n] where n is the 1-based position of the queued definition
      label = this.references.length + 1;
      replacement = '[' + content + '][' + label + ']';
    }
    this.references.push('[' + label + ']: ' + destination);
    return replacement
  },
  references: [],
  append: function (options) {
    if (!this.references.length) return ''
    var definitions = '\n\n' + this.references.join('\n') + '\n\n';
    this.references = []; // Reset references
    return definitions
  }
};
// <em>/<i>: the content wrapped in options.emDelimiter.
converters.emphasis = {
  filter: ['em', 'i'],
  replacement: function (content, node, options) {
    var delimiter = options.emDelimiter;
    return delimiter + content + delimiter
  }
};
// <strong>/<b>: the content wrapped in options.strongDelimiter.
converters.strong = {
  filter: ['strong', 'b'],
  replacement: function (content, node, options) {
    var delimiter = options.strongDelimiter;
    return delimiter + content + delimiter
  }
};
// <code>: an inline code span. A <code> that is the only child of a <pre>
// is excluded here, so that it is not rendered as an inline span.
converters.code = {
  filter: function (node) {
    var isOnlyChild = !(node.previousSibling || node.nextSibling);
    var insidePre = node.parentNode.nodeName === 'PRE';
    return node.nodeName === 'CODE' && !(insidePre && isOnlyChild)
  },
  replacement: function (content) {
    var tick = '`';
    return tick + content + tick
  }
};
// <img>: `![alt](src "title")`, or nothing at all when there is no src.
converters.image = {
  filter: 'img',
  replacement: function (content, node) {
    var src = node.getAttribute('src') || '';
    if (!src) return ''
    var alt = node.alt || '';
    var titlePart = node.title ? ' "' + node.title + '"' : '';
    return '![' + alt + '](' + src + titlePart + ')'
  }
};
/**
 * Lookup table of void element tag names (lowercase), each mapped to true.
 *
 * Upstream note: automatically generated from `pre-publish.js`; do not edit
 * manually.
 */
var index = {
  area: true,
  base: true,
  br: true,
  col: true,
  embed: true,
  hr: true,
  img: true,
  input: true,
  keygen: true,
  link: true,
  menuitem: true,
  meta: true,
  param: true,
  source: true,
  track: true,
  wbr: true
};
/**
 * List of block-level element tag names (lowercase).
 *
 * Upstream note: automatically generated from `build.js`; do not edit
 * manually.
 */
var index$2 = [
  'address', 'article', 'aside', 'audio', 'blockquote',
  'canvas', 'dd', 'div', 'dl', 'fieldset',
  'figcaption', 'figure', 'footer', 'form',
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  'header', 'hgroup', 'hr', 'main', 'nav',
  'noscript', 'ol', 'output', 'p', 'pre',
  'section', 'table', 'tfoot', 'ul', 'video'
];
// `voidElements` is the `index` table itself, extended in place with an
// uppercase copy of every key so it can be probed with an uppercase nodeName.
var voidElements = Object.keys(index).reduce(function (table, name) {
  table[name.toUpperCase()] = 1;
  return table
}, index);

// Uppercase lookup table built from the block element name list.
var blockElements = index$2.reduce(function (table, name) {
  table[name.toUpperCase()] = 1;
  return table
}, {});
/**
 * isBlockElem(node) reports whether the node's nodeName has an entry in the
 * `blockElements` lookup table.
 *
 * @param {Node} node May be null or undefined, which yields false.
 * @return {Boolean}
 */
function isBlockElem(node) {
  if (!node) return false;
  return Boolean(blockElements[node.nodeName]);
}
/**
 * isVoid(node) reports whether the node's nodeName has an entry in the
 * `voidElements` lookup table.
 *
 * @param {Node} node May be null or undefined, which yields false.
 * @return {Boolean}
 */
function isVoid(node) {
  if (!node) return false;
  return Boolean(voidElements[node.nodeName]);
}
/**
 * collapseWhitespace(elem [, isBlock]) removes extraneous whitespace from
 * the given element by editing its descendant text nodes in place. The
 * function isBlock may optionally be passed in to determine whether or not
 * an element is a block element; if none is provided, `isBlockElem` is used.
 *
 * Runs of spaces, tabs, carriage returns and newlines become a single space;
 * spaces at the start of the element, after another space, and next to block
 * elements or <br> are dropped. Text nodes left empty, and nodes that are
 * neither text nor element, are removed from the tree. Nothing is done when
 * `elem` has no children or is a <pre>.
 *
 * @param {Node} elem
 * @param {Function} isBlock
 */
function collapseWhitespace(elem, isBlock) {
  if (!elem.firstChild || elem.nodeName === 'PRE') return;
  if (typeof isBlock !== 'function') {
    isBlock = isBlockElem;
  }
  // Last text node kept so far, or null right after a block, <br> or void element.
  var prevText = null;
  // True when the most recently visited element was a non-block void element.
  var prevVoid = false;
  var prev = null;
  var node = next(prev, elem);
  // The walk ends when next()/remove() hands back `elem` itself.
  while (node !== elem) {
    if (node.nodeType === 3) {
      // Node.TEXT_NODE
      var text = node.data.replace(/[ \r\n\t]+/g, ' ');
      // Drop the leading space when there is no preceding text (or it already
      // ends in a space), unless a void element came immediately before.
      if ((!prevText || / $/.test(prevText.data)) && !prevVoid && text[0] === ' ') {
        text = text.substr(1);
      }
      // `text` might be empty at this point.
      if (!text) {
        node = remove(node);
        continue;
      }
      node.data = text;
      prevText = node;
    } else if (node.nodeType === 1) {
      // Node.ELEMENT_NODE
      if (isBlock(node) || node.nodeName === 'BR') {
        // Text directly before a block element or <br> loses its trailing space.
        if (prevText) {
          prevText.data = prevText.data.replace(/ $/, '');
        }
        prevText = null;
        prevVoid = false;
      } else if (isVoid(node)) {
        // Avoid trimming space around non-block, non-BR void elements.
        prevText = null;
        prevVoid = true;
      }
    } else {
      // Any other node type is removed from the tree.
      node = remove(node);
      continue;
    }
    var nextNode = next(prev, node);
    prev = node;
    node = nextNode;
  }
  // Trim the trailing space of the final text node; remove it if that empties it.
  if (prevText) {
    prevText.data = prevText.data.replace(/ $/, '');
    if (!prevText.data) {
      remove(prevText);
    }
  }
}
/**
 * remove(node) detaches the given node from its parent and returns the node
 * the traversal should continue with: the next sibling if there is one,
 * otherwise the parent.
 *
 * @param {Node} node
 * @return {Node} node
 */
function remove(node) {
  var parent = node.parentNode;
  // Pick the successor before detaching, while the sibling link still exists.
  var successor = node.nextSibling || parent;
  parent.removeChild(node);
  return successor;
}
/**
 * next(prev, current) returns the next node in the sequence, given the
 * current and previous nodes. Children are entered first, except when the
 * walk has just come back up from a child of `current` or `current` is a
 * <pre>; in those cases it moves on to the next sibling, or to the parent.
 *
 * @param {Node} prev
 * @param {Node} current
 * @return {Node}
 */
function next(prev, current) {
  var cameUpFromChild = prev && prev.parentNode === current;
  if (cameUpFromChild || current.nodeName === 'PRE') {
    return current.nextSibling || current.parentNode;
  }
  return current.firstChild || current.nextSibling || current.parentNode;
}
// Alias under which the rest of the file calls the whitespace collapser.
var whitespace = collapseWhitespace;

/*
 * Set up window for Node.js: `root` is the browser's `window` when one
 * exists, and an empty object otherwise.
 */
var root = typeof window === 'undefined' ? {} : window;
/*
 * Parsing HTML strings
 */

/**
 * Probes whether `root.DOMParser` exists and can parse 'text/html'.
 * Adapted from https://gist.github.com/1129031
 *
 * @return {Boolean}
 */
function canParseHTMLNatively () {
  var Parser = root.DOMParser;
  try {
    // WebKit returns null on unsupported types
    if (new Parser().parseFromString('', 'text/html')) {
      return true
    }
  } catch (e) {
    // Firefox/Opera/IE throw errors on unsupported types; a missing
    // DOMParser also ends up here. Either way the answer is "no".
  }
  return false
}
/**
 * Builds a fallback parser constructor whose instances expose
 * `parseFromString(string)` and return a document for the given HTML.
 * The implementation is picked once, when this function runs:
 * jsdom when there is no global `document`, an ActiveX 'htmlfile' document
 * when shouldUseActiveX() says so, and `document.implementation` otherwise.
 *
 * @return {Function} The parser constructor.
 */
function createHTMLParser () {
  var Parser = function () {};
  var parseFromString;

  if (typeof document === 'undefined') {
    // For Node.js environments
    var jsdom = require('jsdom');
    parseFromString = function (string) {
      return jsdom.jsdom(string, {
        features: {
          FetchExternalResources: [],
          ProcessExternalResources: false
        }
      })
    };
  } else if (shouldUseActiveX()) {
    parseFromString = function (string) {
      var doc = new window.ActiveXObject('htmlfile');
      doc.designMode = 'on'; // disable on-page scripts
      doc.open();
      doc.write(string);
      doc.close();
      return doc
    };
  } else {
    parseFromString = function (string) {
      var doc = document.implementation.createHTMLDocument('');
      doc.open();
      doc.write(string);
      doc.close();
      return doc
    };
  }

  Parser.prototype.parseFromString = parseFromString;
  return Parser
}
/**
 * Decides whether the ActiveX 'htmlfile' document has to be used: true only
 * when opening a document made by `document.implementation` throws and
 * `window.ActiveXObject` is available.
 *
 * @return {Boolean}
 */
function shouldUseActiveX () {
  try {
    document.implementation.createHTMLDocument('').open();
  } catch (e) {
    if (window.ActiveXObject) return true
  }
  return false
}
// Parser constructor used by htmlParser(): the platform DOMParser when
// canParseHTMLNatively() succeeds, otherwise the fallback built by
// createHTMLParser(). Chosen once, when this statement runs.
var HTMLParser = canParseHTMLNatively() ? root.DOMParser : createHTMLParser();
/**
 * Produces the root node for a conversion: the parsed <body> for a string
 * input, or a deep clone for a node input (so the caller's node is left
 * untouched). Whitespace is collapsed on the result before it is returned.
 *
 * @param {String|Node} input
 * @return {Node}
 */
function RootNode (input) {
  var tree = typeof input === 'string'
    ? htmlParser().parseFromString(input, 'text/html').body
    : input.cloneNode(true);
  whitespace(tree, isBlock);
  return tree
}
// Parser instance shared by all conversions; created on first use.
var _htmlParser;

/**
 * Returns the shared HTMLParser instance, creating it on the first call.
 *
 * @return {Object}
 */
function htmlParser () {
  if (!_htmlParser) {
    _htmlParser = new HTMLParser();
  }
  return _htmlParser
}
/**
 * Decorates a DOM node in place with the flags the converters rely on
 * (`isBlock`, `isVoid`, `isBlank`, `flankingWhitespace`) and returns that
 * same node, also when invoked with `new`.
 *
 * @param {Node} node
 * @return {Node}
 */
function Node (node) {
  var blockLevel = isBlock(node);
  var voidElement = isVoid$1(node);
  var blank = isBlank(node);
  var flanking = flankingWhitespace(node);

  node.isBlock = blockLevel;
  node.isVoid = voidElement;
  node.isBlank = blank;
  node.flankingWhitespace = flanking;
  return node
}
/**
 * Tells whether a node's tag is in this file's list of void elements.
 *
 * @param {Node} node Any object exposing `nodeName`.
 * @return {Boolean}
 */
function isVoid$1 (node) {
  var voidTagNames = [
    'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input',
    'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
  ];
  // nodeName casing varies, so compare in lowercase.
  var tagName = node.nodeName.toLowerCase();
  return voidTagNames.indexOf(tagName) > -1
}
/**
 * Tells whether a node has no convertible content. A node is blank when it
 * is not a void element, is not an A, TH or TD element (never treated as
 * blank here), has whitespace-only text content and contains no void
 * element such as <img>.
 *
 * @param {Node} node
 * @return {Boolean}
 */
function isBlank (node) {
  return (
    !isVoid$1(node) &&
    // Exact tag comparison: an unanchored /A|TH|TD/ test would also match
    // every tag name that merely contains an "A" (SPAN, TABLE, …).
    ['A', 'TH', 'TD'].indexOf(node.nodeName) === -1 &&
    /^\s*$/i.test(node.textContent) &&
    // An element holding only e.g. an <img> has empty text content but is
    // not blank; treating it as blank would drop the image.
    !hasVoidDescendant(node)
  )
}

/**
 * Reports whether any element below `node` is a void element.
 *
 * @param {Node} node
 * @return {Boolean}
 */
function hasVoidDescendant (node) {
  var children = node.childNodes || [];
  for (var i = 0; i < children.length; i++) {
    var child = children[i];
    if (child.nodeType !== 1) continue
    if (isVoid$1(child) || hasVoidDescendant(child)) return true
  }
  return false
}
/**
 * Works out which single spaces must be emitted around a non-block node's
 * replacement: one on a side where the node's own text starts or ends with
 * whitespace and the neighbour on that side does not already provide it.
 * Block nodes always get empty strings.
 *
 * @param {Node} node
 * @return {{leading: String, trailing: String}}
 */
function flankingWhitespace (node) {
  var edges = { leading: '', trailing: '' };
  if (isBlock(node)) return edges

  var text = node.textContent;
  if (/^[ \r\n\t]/.test(text) && !isFlankedByWhitespace('left', node)) {
    edges.leading = ' ';
  }
  if (/[ \r\n\t]$/.test(text) && !isFlankedByWhitespace('right', node)) {
    edges.trailing = ' ';
  }
  return edges
}
/**
 * Checks whether the sibling on the given side already supplies a space at
 * the edge facing `node`. Text siblings are tested on their value and
 * non-block element siblings on their text content.
 *
 * @param {String} side 'left' for the previous sibling; anything else
 * selects the next sibling.
 * @param {Node} node
 * @return {Boolean|undefined} undefined when there is no sibling, or when
 * the sibling is a block element or another node type.
 */
function isFlankedByWhitespace (side, node) {
  var onLeft = side === 'left';
  var sibling = onLeft ? node.previousSibling : node.nextSibling;
  // A left neighbour must end with a space, a right neighbour start with one.
  var edgeSpace = onLeft ? / $/ : /^ /;

  if (!sibling) return undefined
  if (sibling.nodeType === 3) {
    return edgeSpace.test(sibling.nodeValue)
  }
  if (sibling.nodeType === 1 && !isBlock(sibling)) {
    return edgeSpace.test(sibling.textContent)
  }
  return undefined
}
// Borrowed so array-like `childNodes` collections can be reduced via .call().
var reduce = Array.prototype.reduce;
// Match the (possibly empty) run of newlines at the start / end of a string.
var leadingNewLinesRegExp = /^\n*/;
var trailingNewLinesRegExp = /\n*$/;
/**
 * Creates a converter instance. `options` is merged over the defaults below
 * and the result is stored on `this.options`.
 *
 * Defaults: setext headings, '* * *' rules, '*' bullets, indented code
 * blocks, '```' fences, '_' emphasis, '**' strong, inlined links and 'full'
 * reference style, plus four special converters:
 *   - blankConverter: used for nodes flagged `isBlank`
 *   - defaultConverter: used when no other converter matches
 *   - keepConverter: emits the node's outerHTML unchanged
 *   - removeConverter: emits nothing
 *
 * @param {Object} [options] Overrides for any of the defaults.
 */
function TurndownService (options) {
  var defaults = {
    converters: converters,
    headingStyle: 'setext',
    hr: '* * *',
    bulletListMarker: '*',
    codeBlockStyle: 'indented',
    fence: '```',
    emDelimiter: '_',
    strongDelimiter: '**',
    linkStyle: 'inlined',
    linkReferenceStyle: 'full',
    blankConverter: {
      replacement: function (content, node) {
        return node.isBlock ? '\n\n' : ''
      }
    },
    defaultConverter: {
      replacement: function (content, node) {
        return node.isBlock ? '\n\n' + content + '\n\n' : content
      }
    },
    keepConverter: {
      filter: function (node) {
        switch (node.nodeName) {
          case 'TABLE':
            return true
          case 'PRE':
            // Keep every <pre> that is not a <pre><code> block. An empty
            // <pre> has no first child, so guard before reading nodeName.
            return !node.firstChild || node.firstChild.nodeName !== 'CODE'
          default:
            return false
        }
      },
      replacement: function (content, node) {
        return node.isBlock ? '\n\n' + node.outerHTML + '\n\n' : node.outerHTML
      }
    },
    removeConverter: {
      filter: ['head'],
      replacement: function () {
        return ''
      }
    }
  };
  this.options = extend({}, defaults, options);
}
TurndownService.prototype = {
  /**
   * Converts HTML to Markdown.
   *
   * @param {String|Node} input An HTML string, or an element, document or
   * document fragment node.
   * @return {String} The Markdown output; '' for an empty string input.
   * @throws {TypeError} When `input` fails the canConvert() check.
   */
  turndown: function (input) {
    if (!canConvert(input)) {
      throw new TypeError(
        input + ' is not a string, or an element/document/fragment node.'
      )
    }
    if (input === '') return ''
    var root = new RootNode(input);
    return this.postProcess(this.process(root))
  },
  /**
   * Reduces a DOM node down to its Markdown string equivalent
   *
   * Text children are escaped, element children are converted, and the
   * pieces are combined with join().
   *
   * @param {Node} parentNode Node whose child nodes are converted.
   * @return {String}
   */
  process: function process (parentNode) {
    var _this = this;
    return reduce.call(parentNode.childNodes, function (output, node) {
      node = new Node(node);
      // Start from '' so that node types other than text and element (for
      // example a comment inside <pre>, which whitespace collapsing leaves
      // in place) contribute nothing instead of passing undefined to join().
      var replacement = '';
      if (node.nodeType === 3) {
        replacement = _this.escape(node.nodeValue);
      } else if (node.nodeType === 1) {
        replacement = _this.replacementForNode(node);
      }
      return join(output, replacement)
    }, '')
  },
  /**
   * Escapes Markdown syntax
   *
   * @param {String} string Plain text taken from a text node.
   * @return {String} The text with Markdown-looking sequences backslashed.
   */
  escape: function escape (string) {
    return (
      string
        // Escape hr
        .replace(/^([-*_] *){3,}$/gm, function (match, character) {
          return match.split(character).join('\\' + character)
        })
        // Escape ol bullet points
        .replace(/^(\W* {0,3})(\d+)\. /gm, '$1$2\\. ')
        // Escape ul bullet points
        .replace(/^([^\\\w]*)([*+-]) /gm, '$1\\$2 ')
        // Escape blockquote indents
        .replace(/^(\W* {0,3})> /gm, '$1\\> ')
        // Escape em/strong *
        .replace(/\*{1,2}([^\W*]+\W*)+\*{1,2}/g, function (match) {
          return match.replace(/\*/g, '\\*')
        })
        // Escape em/strong _
        .replace(/_{1,2}([^\W_]+\W*)+_{1,2}/g, function (match) {
          return match.replace(/_/g, '\\_')
        })
        // Escape `
        .replace(/`([^\W`]+\W*)+`/g, function (match) {
          return match.replace(/`/g, '\\`')
        })
        // Escape link brackets
        .replace(/\[([^\]]*)\]/g, '\\[$1\\]') // eslint-disable-line no-useless-escape
    )
  },
  /**
   * Converts an element node to its Markdown equivalent
   *
   * @param {Node} node Element already decorated by Node().
   * @return {String}
   */
  replacementForNode: function replacementForNode (node) {
    var converter = this.converterForNode(node);
    var content = this.process(node);
    var whitespace = node.flankingWhitespace;
    // When a flanking space is emitted outside the replacement, the content
    // is trimmed so the space is not duplicated inside it.
    if (whitespace.leading || whitespace.trailing) content = content.trim();
    return (
      whitespace.leading +
      converter.replacement(content, node, this.options) +
      whitespace.trailing
    )
  },
  /**
   * Finds a converter for a given node
   *
   * Precedence: keepConverter, removeConverter, blankConverter (for nodes
   * flagged isBlank), the first match in options.converters, and finally
   * defaultConverter.
   *
   * @param {Node} node
   * @return {Object} The converter to use.
   */
  converterForNode: function converterForNode (node) {
    if (this.filterValue(this.options.keepConverter, node)) {
      return this.options.keepConverter
    }
    if (this.filterValue(this.options.removeConverter, node)) {
      return this.options.removeConverter
    }
    if (node.isBlank) return this.options.blankConverter
    for (var key in this.options.converters) {
      var converter = this.options.converters[key];
      if (this.filterValue(converter, node)) return converter
    }
    return this.options.defaultConverter
  },
  /**
   * Tests a converter's `filter` against a node. A string filter matches one
   * lowercase tag name, an array matches any listed tag name, and a function
   * is called with the converter as `this` and (node, options).
   *
   * @param {Object} converter
   * @param {Node} node
   * @return {Boolean|undefined} true on a match, undefined otherwise.
   * @throws {TypeError} When `filter` is not a string, array or function.
   */
  filterValue: function filterValue (converter, node) {
    var filter = converter.filter;
    if (typeof filter === 'string') {
      if (filter === node.nodeName.toLowerCase()) return true
    } else if (Array.isArray(filter)) {
      if (filter.indexOf(node.nodeName.toLowerCase()) > -1) return true
    } else if (typeof filter === 'function') {
      if (filter.call(converter, node, this.options)) return true
    } else {
      throw new TypeError('`filter` needs to be a string, array, or function')
    }
  },
  /**
   * Appends whatever each converter's optional `append` function returns
   * and trims the result: leading tabs/newlines and all trailing whitespace.
   *
   * @param {String} output
   * @return {String}
   */
  postProcess: function postProcess (output) {
    for (var key in this.options.converters) {
      var converter = this.options.converters[key];
      if (typeof converter.append === 'function') {
        output = join(output, converter.append(this.options));
      }
    }
    return output.replace(/^[\t\r\n]+/, '').replace(/[\t\r\n\s]+$/, '')
  }
};
/**
 * Picks the newline run to place between two pieces of output: the longer
 * of the newlines ending `output` and the newlines starting `replacement`.
 *
 * @param {String} output
 * @param {String} replacement
 * @return {String}
 */
function separatingNewlines (output, replacement) {
  var trailing = output.match(trailingNewLinesRegExp)[0];
  var leading = replacement.match(leadingNewLinesRegExp)[0];
  // Both runs consist solely of '\n', so the greater string is the longer one.
  return trailing > leading ? trailing : leading
}
/**
 * Concatenates two pieces of output so that the newlines where they meet
 * collapse into the single run chosen by separatingNewlines().
 *
 * @param {String} string1
 * @param {String} string2
 * @return {String}
 */
function join (string1, string2) {
  var separator = separatingNewlines(string1, string2);
  // Strip the newlines on both sides of the seam; the separator replaces them.
  var head = string1.replace(trailingNewLinesRegExp, '');
  var tail = string2.replace(leadingNewLinesRegExp, '');
  return head + separator + tail
}
/**
 * Determines whether an input can be converted: a string, or a node whose
 * nodeType is 1 (element), 9 (document) or 11 (document fragment).
 *
 * @param {*} input
 * @return {*} true for convertible input; otherwise a falsy value (false,
 * or the falsy `nodeType` itself, e.g. undefined).
 */
function canConvert (input) {
  if (input == null) return false
  if (typeof input === 'string') return true
  var type = input.nodeType;
  return type && (type === 1 || type === 9 || type === 11)
}
export default TurndownService;