html2markdown
Version:
An HTML to Markdown converter.
563 lines (511 loc) • 11.8 kB
JavaScript
/**
* html2markdown - An HTML to Markdown converter.
*
* This implementation uses HTML or DOM parsing for conversion. Parsing code was
* abstracted out in a parsing function which should be easy to remove in favor
* of other parsing libraries.
*
* Converted MarkDown was tested with ShowDown library for HTML rendering. And
* it tries to create MarkDown that does not confuse ShowDown when certain
* combination of HTML tags come together.
*
* @author Himanshu Gilani
* @author Kates Gasis (original author)
*
*/
/**
* html2markdown
* @param html - html string to convert
* @return converted markdown text
*/
/*
Universal JavaScript Module, supports AMD (RequireJS), Node.js, and the browser.
https://gist.github.com/kirel/1268753
*/
(function (name, definition) {
if (typeof define === 'function') { // AMD
define(definition);
} else if (typeof module !== 'undefined' && module.exports) { // Node.js
module.exports = definition();
} else { // Browser
var theModule = definition(), global = this, old = global[name];
theModule.noConflict = function () {
global[name] = old;
return theModule;
};
global[name] = theModule;
}
})('html2markdown', function() {
function trim(value) {
return value.replace(/^\s+|\s+$/g,"");
}
function endsWith(value, suffix) {
return value.match(suffix+"$") == suffix;
}
function startsWith(value, str) {
return value.indexOf(str) == 0;
}
function html2markdown(html, opts) {
opts = opts || {};
var nodeList = [];
var listTagStack = [];
var linkAttrStack = [];
var blockquoteStack = [];
var preStack = [];
var codeStack = [];
var links = [];
var inlineStyle = opts['inlineStyle'] || false;
var parser = opts['parser'];
var markdownTags = {
"hr": "- - -\n\n",
"br": " \n",
"title": "# ",
"h1": "# ",
"h2": "## ",
"h3": "### ",
"h4": "#### ",
"h5": "##### ",
"h6": "###### ",
"b": "**",
"strong": "**",
"i": "_",
"em": "_",
"dfn": "_",
"var": "_",
"cite": "_",
"span": " ",
"ul": "* ",
"ol": "1. ",
"dl": "- ",
"blockquote": "> "
};
if(!parser && typeof markdownDOMParser !== 'undefined')
parser = markdownDOMParser;
function getListMarkdownTag() {
var listItem = "";
if(listTagStack) {
for ( var i = 0; i < listTagStack.length - 1; i++) {
listItem += " ";
}
}
listItem += peek(listTagStack);
return listItem;
}
function convertAttrs(attrs) {
var attributes = {};
for(var k in attrs) {
var attr = attrs[k];
attributes[attr.name] = attr;
}
return attributes;
}
function peek(list) {
if(list && list.length > 0) {
return list.slice(-1)[0];
}
return "";
}
function peekTillNotEmpty(list) {
if(!list) {
return "";
}
for(var i = list.length - 1; i>=0; i-- ){
if(list[i] != "") {
return list[i];
}
}
return "";
}
function removeIfEmptyTag(start) {
var cleaned = false;
if(start == peekTillNotEmpty(nodeList)) {
while(peek(nodeList) != start) {
nodeList.pop();
}
nodeList.pop();
cleaned = true;
}
return cleaned;
}
function sliceText(start) {
var text = [];
while(nodeList.length > 0 && peek(nodeList) != start) {
var t = nodeList.pop();
text.unshift(t);
}
return text.join("");
}
function block(isEndBlock) {
var lastItem = nodeList.pop();
if (!lastItem) {
return;
}
if(!isEndBlock) {
var block;
if(/\s*\n\n\s*$/.test(lastItem)) {
lastItem = lastItem.replace(/\s*\n\n\s*$/, "\n\n");
block = "";
} else if(/\s*\n\s*$/.test(lastItem)) {
lastItem = lastItem.replace(/\s*\n\s*$/, "\n");
block = "\n";
} else if(/\s+$/.test(lastItem)) {
block = "\n\n";
} else {
block = "\n\n";
}
nodeList.push(lastItem);
nodeList.push(block);
} else {
nodeList.push(lastItem);
if(!endsWith(lastItem, "\n")) {
nodeList.push("\n\n");
}
}
}
function listBlock() {
if(nodeList.length > 0) {
var li = peek(nodeList);
if(!endsWith(li, "\n")) {
nodeList.push("\n");
}
} else {
nodeList.push("\n");
}
}
parser(html,{
start: function(tag, attrs, unary) {
tag = tag.toLowerCase();
if(unary && (tag != "br" && tag != "hr" && tag != "img")) {
return;
}
switch (tag) {
case "br":
nodeList.push(markdownTags[tag]);
break;
case "hr":
block();
nodeList.push(markdownTags[tag]);
break;
case "title":
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
block();
nodeList.push(markdownTags[tag]);
break;
case "b":
case "strong":
case "i":
case "em":
case "dfn":
case "var":
case "cite":
nodeList.push(markdownTags[tag]);
break;
case "code":
case "span":
if(preStack.length > 0)
{
break;
} else if(! /\s+$/.test(peek(nodeList))) {
nodeList.push(markdownTags[tag]);
}
break;
case "p":
case "div":
//case "td":
block();
break;
case "ul":
case "ol":
case "dl":
listTagStack.push(markdownTags[tag]);
// lists are block elements
if(listTagStack.length > 1) {
listBlock();
} else {
block();
}
break;
case "li":
case "dt":
var li = getListMarkdownTag();
nodeList.push(li);
break;
case "a":
var attribs = convertAttrs(attrs);
linkAttrStack.push(attribs);
nodeList.push("[");
break;
case "img":
var attribs = convertAttrs(attrs);
var alt, title, url;
attribs["src"] ? url = attribs["src"].value : url = "";
if(!url) {
break;
}
attribs['alt'] ? alt = trim(attribs['alt'].value) : alt = "";
attribs['title'] ? title = trim(attribs['title'].value) : title = "";
// if parent of image tag is nested in anchor tag use inline style
if(!inlineStyle && !startsWith(peekTillNotEmpty(nodeList), "[")) {
var l = links.indexOf(url);
if(l == -1) {
links.push(url);
l=links.length-1;
}
block();
nodeList.push("![");
if(alt!= "") {
nodeList.push(alt);
} else if (title != null) {
nodeList.push(title);
}
nodeList.push("][" + l + "]");
block();
} else {
//if image is not a link image then treat images as block elements
if(!startsWith(peekTillNotEmpty(nodeList), "[")) {
block();
}
nodeList.push(" + ")");
if(!startsWith(peekTillNotEmpty(nodeList), "[")) {
block(true);
}
}
break;
case "blockquote":
//listBlock();
block();
blockquoteStack.push(markdownTags[tag]);
break;
case "pre":
block();
preStack.push(true);
nodeList.push(" ");
break;
case "table":
nodeList.push("<table>");
break;
case "thead":
nodeList.push("<thead>");
break;
case "tbody":
nodeList.push("<tbody>");
break;
case "tr":
nodeList.push("<tr>");
break;
case "td":
nodeList.push("<td>");
break;
}
},
chars: function(text) {
if(preStack.length > 0) {
text = text.replace(/\n/g,"\n ");
} else if(trim(text) != "") {
text = text.replace(/\s+/g, " ");
var prevText = peekTillNotEmpty(nodeList);
if(/\s+$/.test(prevText)) {
text = text.replace(/^\s+/g, "");
}
} else {
nodeList.push("");
return;
}
//if(blockquoteStack.length > 0 && peekTillNotEmpty(nodeList).endsWith("\n")) {
if(blockquoteStack.length > 0) {
nodeList.push(blockquoteStack.join(""));
}
nodeList.push(text);
},
end: function(tag) {
tag = tag.toLowerCase();
switch (tag) {
case "title":
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
if(!removeIfEmptyTag(markdownTags[tag])) {
block(true);
}
break;
case "p":
case "div":
//case "td":
while(nodeList.length > 0 && trim(peek(nodeList)) == "") {
nodeList.pop();
}
block(true);
break;
case "b":
case "strong":
case "i":
case "em":
case "dfn":
case "var":
case "cite":
if(!removeIfEmptyTag(markdownTags[tag])) {
nodeList.push(trim(sliceText(markdownTags[tag])));
nodeList.push(markdownTags[tag]);
}
break;
case "a":
var text = sliceText("[");
text = text.replace(/\s+/g, " ");
text = trim(text);
if(text == "") {
nodeList.pop();
break;
}
var attrs = linkAttrStack.pop();
var url;
attrs["href"] && attrs["href"].value != "" ? url = attrs["href"].value : url = "";
if(url == "") {
nodeList.pop();
nodeList.push(text);
break;
}
nodeList.push(text);
if(!inlineStyle && !startsWith(peek(nodeList), "!")){
var l = links.indexOf(url);
if(l == -1) {
links.push(url);
l=links.length-1;
}
nodeList.push("][" + l + "]");
} else {
if(startsWith(peek(nodeList), "!")){
var text = nodeList.pop();
text = nodeList.pop() + text;
block();
nodeList.push(text);
}
var title = attrs["title"];
nodeList.push("](" + url + (title ? " \"" + trim(title.value).replace(/\s+/g, " ") + "\"" : "") + ")");
if(startsWith(peek(nodeList), "!")){
block(true);
}
}
break;
case "ul":
case "ol":
case "dl":
listBlock();
listTagStack.pop();
break;
case "li":
case "dt":
var li = getListMarkdownTag();
if(!removeIfEmptyTag(li)) {
var text = trim(sliceText(li));
if(startsWith(text, "[![")) {
nodeList.pop();
block();
nodeList.push(text);
block(true);
} else {
nodeList.push(text);
listBlock();
}
}
break;
case "blockquote":
blockquoteStack.pop();
break;
case "pre":
//uncomment following experimental code to discard line numbers when syntax highlighters are used
//notes this code thorough testing before production user
/*
var p=[];
var flag = true;
var count = 0, whiteSpace = 0, line = 0;
console.log(">> " + peek(nodeList));
while(peek(nodeList).startsWith(" ") || flag == true)
{
//console.log('inside');
var text = nodeList.pop();
p.push(text);
if(flag == true && !text.startsWith(" ")) {
continue;
} else {
flag = false;
}
//var result = parseInt(text.trim());
if(!isNaN(text.trim())) {
count++;
} else if(text.trim() == ""){
whiteSpace++;
} else {
line++;
}
flag = false;
}
console.log(line);
if(line != 0)
{
while(p.length != 0) {
nodeList.push(p.pop());
}
}
*/
block(true);
preStack.pop();
break;
case "code":
case "span":
if(preStack.length > 0)
{
break;
} else if(trim(peek(nodeList)) == "") {
nodeList.pop();
nodeList.push(markdownTags[tag]);
} else {
var text = nodeList.pop();
nodeList.push(trim(text));
nodeList.push(markdownTags[tag]);
}
break;
case "table":
nodeList.push("</table>");
break;
case "thead":
nodeList.push("</thead>");
break;
case "tbody":
nodeList.push("</tbody>");
break;
case "tr":
nodeList.push("</tr>");
break;
case "td":
nodeList.push("</td>");
break;
case "br":
case "hr":
case "img":
break;
}
}
}, {"nodesToIgnore": ["script", "noscript", "object", "iframe", "frame", "head", "style", "label"]});
if(!inlineStyle) {
for ( var i = 0; i < links.length; i++) {
if(i == 0) {
var lastItem = nodeList.pop();
nodeList.push(lastItem.replace(/\s+$/g, ""));
nodeList.push("\n\n[" + i + "]: " + links[i]);
} else {
nodeList.push("\n[" + i + "]: " + links[i]);
}
}
}
return nodeList.join("");
}
return html2markdown;
});