html2markdown
Version:
An HTML to Markdown converter.
433 lines (398 loc) • 8.42 kB
JavaScript
/*
* HTMLParser - This implementation of the parser assumes we are parsing HTML in a browser
* and uses the DOM methods available in the browser for parsing HTML.
*
* @author Himanshu Gilani
*
*/
/*
Universal JavaScript Module, supports AMD (RequireJS), Node.js, and the browser.
https://gist.github.com/kirel/1268753
*/
// Module bootstrap. Receives the export name and a factory (`definition`) and
// publishes the factory's result through whichever module system is detected:
//   - a global `define` function       -> the factory itself is handed to define()
//   - `module.exports` is available    -> the factory result becomes module.exports
//   - otherwise                        -> the result is stored on `this[name]`
// The factory passed on the last line of this header returns the parser function.
(function (name, definition) {
if (typeof define === 'function') { // AMD
define(definition);
} else if (typeof module !== 'undefined' && module.exports) { // Node.js
module.exports = definition();
} else { // Browser
// `old` remembers whatever was stored under `name` before this module replaced it.
// NOTE(review): `global = this` presumably relies on non-strict code, where `this`
// inside a plain function call is the global object — confirm before enabling strict mode.
var theModule = definition(), global = this, old = global[name];
// noConflict() puts the previous value back under `name` and returns this module,
// so the caller can keep a private reference to it.
theModule.noConflict = function () {
global[name] = old;
return theModule;
};
global[name] = theModule;
}
})('markdownDOMParser', function() {
return function(html, handler, opts) {
opts = opts || {};
var e = document.createElement('div');
e.innerHTML = html;
var node = e;
var nodesToIgnore = opts['nodesToIgnore'] || [];
var parseHiddenNodes = opts['parseHiddenNodes'] || 'false';
var c = node.childNodes;
for ( var i = 0; i < c.length; i++) {
try {
var ignore = false;
for(var k=0; k< nodesToIgnore.length; k++) {
if(c[i].nodeName.toLowerCase() == nodesToIgnore[k]) {
ignore= true;
break;
}
}
//NOTE hidden node testing is expensive in FF.
if (ignore || (!parseHiddenNodes && isHiddenNode(c[i])) ){
continue;
}
if (c[i].nodeName.toLowerCase() != "#text" && c[i].nodeName.toLowerCase() != "#comment") {
var attrs = [];
if (c[i].hasAttributes()) {
var attributes = c[i].attributes;
for ( var a = 0; a < attributes.length; a++) {
var attribute = attributes.item(a);
attrs.push({
name : attribute.nodeName,
value : attribute.nodeValue,
});
}
}
if (handler.start) {
if (c[i].hasChildNodes()) {
handler.start(c[i].nodeName, attrs, false);
//if (c[i].nodeName.toLowerCase() == "pre" || c[i].nodeName.toLowerCase() == "code") {
// handler.chars(c[i].innerHTML);
//} else
if (c[i].nodeName.toLowerCase() == "iframe" || c[i].nodeName.toLowerCase() == "frame") {
if (c[i].contentDocument && c[i].contentDocument.documentElement) {
return HTMLParser(c[i].contentDocument.documentElement, handler, opts);
}
} else if (c[i].hasChildNodes()) {
HTMLParser(c[i], handler, opts);
}
if (handler.end) {
handler.end(c[i].nodeName);
}
} else {
handler.start(c[i].nodeName, attrs, true);
}
}
} else if (c[i].nodeName.toLowerCase() == "#text") {
if (handler.chars) {
handler.chars(c[i].nodeValue);
}
} else if (c[i].nodeName.toLowerCase() == "#comment") {
if (handler.comment) {
handler.comment(c[i].nodeValue);
}
}
} catch (e) {
//properly log error
console.error(e);
console.log("error while parsing node: " + c[i].nodeName.toLowerCase());
}
}
};
/**
 * Reports whether `node` is styled with `display: none`.
 *
 * @param {Node} node DOM node to test; only `nodeName` is read directly, the rest
 *   is delegated to `window.getComputedStyle`.
 * @returns {boolean} true only when the computed `display` value is "none".
 *   Always a boolean: false is returned when computed styles are unavailable
 *   (no `window`, no `getComputedStyle`) or cannot be read for this node.
 */
function isHiddenNode(node) {
  // <title> is never reported as hidden.
  // NOTE(review): presumably because browsers do not render head content — confirm.
  if (node.nodeName.toLowerCase() === "title") {
    return false;
  }
  if (typeof window === "undefined" || !window.getComputedStyle) {
    return false;
  }
  try {
    var style = window.getComputedStyle(node, null);
    return !!(style.getPropertyValue && style.getPropertyValue('display') === 'none');
  } catch (e) {
    // Styles are not accessible for this node (e.g. it is not an element, or the
    // style object is missing); treat it as visible.
    return false;
  }
}
//http://blogs.msdn.com/b/aoakley/archive/2003/11/12/49645.aspx
/**
 * Decodes HTML entities by letting the browser parse `str` inside a scratch <div>.
 *
 * @param {string} str Text containing HTML entities.
 * @returns {?string} The `nodeValue` of the first node the browser produced for
 *   `str` (the decoded text when `str` is plain text with entities; null when the
 *   first node is an element), or "" when `str` produces no nodes at all.
 */
function HTMLDecode(str) {
  // The scratch element is never inserted into the document, so there is nothing to
  // hide or detach afterwards. The former `div.parentNode.removeChild(div)` call
  // dereferenced a null parentNode and made every call throw.
  // NOTE(review): markup assigned to innerHTML is parsed by the browser; do not feed
  // untrusted HTML through this helper without confirming that is acceptable.
  var div = document.createElement('div');
  div.innerHTML = str;
  var first = div.firstChild;
  // Empty input yields no child node at all.
  return first ? first.nodeValue : "";
}
// HTMLEncode (@author Ulrich Jensen, http://www.htmlescape.net)
// Hexadecimal digits, indexed by value; read by escapeCharOther() when it builds a
// numeric character reference ("&#x....;").
// NOTE(review): this statement sits after the `return` at the top of the enclosing
// factory function, so the initializer appears never to run and `hex` would still be
// `undefined` when it is read — confirm and move it above the `return` if so.
var hex = new Array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f');
/**
 * Escapes a value for use in HTML, one UTF-16 code unit at a time.
 *
 * Each unit is passed through escapeCharOther(), then escapeTags(), then
 * escapeBR(), and the results are concatenated in order.
 *
 * @param {*} originalText Value to escape; it is converted to a string first.
 * @returns {string} The escaped text.
 */
function HTMLEncode(originalText) {
  var source = "" + originalText;
  var pieces = [];
  for (var idx = 0; idx < source.length; idx += 1) {
    var piece = "" + escapeCharOther(source.charAt(idx));
    piece = "" + escapeTags(piece);
    piece = "" + escapeBR(piece);
    pieces.push(piece);
  }
  return pieces.join("");
}
/**
 * Escapes a value the same way as HTMLEncode() but leaves line breaks alone:
 * each UTF-16 code unit goes through escapeCharOther() and then escapeTags();
 * escapeBR() is not applied.
 *
 * @param {*} originalText Value to escape; it is converted to a string first.
 * @returns {string} The escaped text.
 */
function escapeHtmlTextArea(originalText) {
  var source = "" + originalText;
  var pieces = [];
  for (var idx = 0; idx < source.length; idx += 1) {
    var piece = "" + escapeCharOther(source.charAt(idx));
    pieces.push("" + escapeTags(piece));
  }
  return pieces.join("");
}
/**
 * Turns a line feed into an HTML line break.
 *
 * @param {string} original Text whose first UTF-16 code unit is inspected.
 * @returns {string} "<br/>" when that unit is a line feed (code 10); otherwise
 *   `original` unchanged. Carriage returns are passed through untouched.
 */
function escapeBR(original) {
  var LINE_FEED = 10;
  var startsWithLineFeed = original.charCodeAt(0) === LINE_FEED;
  return startsWithLineFeed ? "<br/>" : original;
}
/**
 * Replaces a space with the HTML non-breaking-space entity.
 *
 * The replacement is spelled as the ASCII entity reference so the result is
 * visibly different from the input; a whitespace literal here is
 * indistinguishable from the space it replaces.
 *
 * @param {string} original Text whose first UTF-16 code unit is inspected.
 * @returns {string} "&nbsp;" when that unit is a space (code 32); otherwise
 *   `original` unchanged.
 */
function escapeNBSP(original) {
  var SPACE = 32;
  if (original.charCodeAt(0) === SPACE) {
    return "&nbsp;";
  }
  return original;
}
/**
 * Escapes the characters that delimit HTML tags and attribute values.
 *
 * @param {string} original Text whose first UTF-16 code unit is inspected.
 * @returns {string} "&lt;" for "<" (code 60), "&gt;" for ">" (code 62),
 *   "&quot;" for a double quote (code 34); otherwise `original` unchanged.
 */
function escapeTags(original) {
  // The replacements must be entity references: returning the raw character is a
  // no-op, and a raw double quote inside the literal does not even parse.
  switch (original.charCodeAt(0)) {
    case 60:
      return "&lt;";
    case 62:
      return "&gt;";
    case 34:
      return "&quot;";
    default:
      return original;
  }
}
/**
 * Escapes "&" and non-ASCII characters as HTML character references.
 *
 * @param {string} original Text whose first UTF-16 code unit is inspected.
 * @returns {string}
 *   - the named entity for "&", the cent sign and the accented Latin-1 letters
 *     listed in the table below (e.g. "&amp;", "&eacute;");
 *   - "&#xhhhh;" (four lower-case hex digits) for any other code unit above 127;
 *   - `original` unchanged for everything else, including the empty string.
 */
function escapeCharOther(original) {
  // The table is created on first use and cached on the function object. A `var`
  // initializer placed next to this function would never run, because this code
  // follows the `return` statement of the enclosing factory; only function
  // declarations are hoisted with their bodies.
  var entities = escapeCharOther.namedEntities;
  if (!entities) {
    entities = escapeCharOther.namedEntities = {
      38: "&amp;",
      162: "&cent;",
      192: "&Agrave;",
      193: "&Aacute;",
      194: "&Acirc;",
      195: "&Atilde;",
      196: "&Auml;",
      197: "&Aring;",
      198: "&AElig;",
      199: "&Ccedil;",
      200: "&Egrave;",
      201: "&Eacute;",
      202: "&Ecirc;",
      203: "&Euml;",
      204: "&Igrave;",
      205: "&Iacute;",
      206: "&Icirc;",
      207: "&Iuml;",
      208: "&ETH;",
      209: "&Ntilde;",
      210: "&Ograve;",
      211: "&Oacute;",
      212: "&Ocirc;",
      213: "&Otilde;",
      214: "&Ouml;",
      216: "&Oslash;",
      217: "&Ugrave;",
      218: "&Uacute;",
      219: "&Ucirc;",
      220: "&Uuml;",
      221: "&Yacute;",
      222: "&THORN;",
      223: "&szlig;",
      224: "&agrave;",
      225: "&aacute;",
      226: "&acirc;",
      227: "&atilde;",
      228: "&auml;",
      229: "&aring;",
      230: "&aelig;",
      231: "&ccedil;",
      232: "&egrave;",
      233: "&eacute;",
      234: "&ecirc;",
      235: "&euml;",
      236: "&igrave;",
      237: "&iacute;",
      238: "&icirc;",
      239: "&iuml;",
      240: "&eth;",
      241: "&ntilde;",
      242: "&ograve;",
      243: "&oacute;",
      244: "&ocirc;",
      245: "&otilde;",
      246: "&ouml;",
      248: "&oslash;",
      249: "&ugrave;",
      250: "&uacute;",
      251: "&ucirc;",
      252: "&uuml;",
      253: "&yacute;",
      254: "&thorn;",
      255: "&yuml;"
    };
  }

  var code = original.charCodeAt(0);
  var named = entities[code];
  if (typeof named === "string") {
    return named;
  }

  if (code > 127) {
    // charCodeAt() yields at most 0xFFFF, so four zero-padded hex digits always fit.
    // Formatting the number directly avoids depending on an external digit table.
    return "&#x" + ("000" + code.toString(16)).slice(-4) + ";";
  }

  // ASCII other than "&", or an empty string (code is NaN): nothing to escape.
  return original;
}
});