UNPKG

mhtml2html

Version:

Converts .mhtml to a single .html page.

620 lines (519 loc) 26.3 kB
/**
 * mhtml2html
 *
 * @Author : Mayank Sindwani
 * @Date   : 2016-09-05
 * @Description : Converts mhtml to html.
 *
 * The MIT License(MIT)
 * Copyright(c) 2016 Mayank Sindwani
 **/

'use strict';

// Babel helper: `typeof` that also reports "symbol" for polyfilled Symbols.
var _typeof = typeof Symbol === "function" && typeof Symbol.iterator === "symbol" ? function (obj) {
    return typeof obj;
} : function (obj) {
    return obj && typeof Symbol === "function" && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj;
};

(function (root) {
    // Avoid preprocessors from bundling runtime dependencies.
    var _require = void 0;
    if (typeof require !== 'undefined') {
        _require = require;
    }

    // Token that introduces an external reference inside CSS text.
    var CSS_URL_RULE = "url(";

    // Localize the existing namespace so noConflict() can restore it.
    var previous_mymodule = void 0,
        quotedPrintable = void 0;

    if (root != null) {
        previous_mymodule = root.mhtml2html;
    }

    /**
     * Asserts a condition.
     *
     * @param {*} condition - Value that must be truthy.
     * @param {string} error - Message of the Error thrown otherwise.
     * @throws {Error} When `condition` is falsy.
     */
    function assert(condition, error) {
        if (!condition) {
            throw new Error(error);
        }
    }

    /**
     * Escapes non-ASCII characters and returns the ASCII representation.
     * Characters listed in the substitution table (\b, \t, \n, \f, \r, `"`
     * and `\`) are kept as they are; every other control character or
     * character >= 0x7f becomes a backslash followed by four hex digits.
     * http://stackoverflow.com/questions/834316/how-to-convert-large-utf-8-strings-into-ascii
     *
     * @param {string} string - Text to escape.
     * @returns {string} The escaped text.
     */
    function quote(string) {
        var escapable = /[\\\"\x00-\x1f\x7f-\uffff]/g,
            meta = { // table of character substitutions
                '\b': '\b',
                '\t': '\t',
                '\n': '\n',
                '\f': '\f',
                '\r': '\r',
                '"': '"',
                '\\': '\\'
            };

        escapable.lastIndex = 0;
        return escapable.test(string) ? string.replace(escapable, function (a) {
            var c = meta[a];
            return typeof c === 'string' ? c : '\\' + ('0000' + a.charCodeAt(0).toString(16)).slice(-4);
        }) : string;
    }

    /**
     * Parses an HTML string into a document, using the browser's DOMParser
     * when one exists and jsdom (loaded through `require`) otherwise.
     *
     * @param {string} html - Markup to parse.
     * @returns {Object} The parsed document.
     * @throws {Error} When neither DOMParser nor `require` is available.
     */
    function parseDocument(html) {
        var parser = void 0;

        if (typeof DOMParser === 'undefined') {
            assert(typeof _require !== 'undefined', 'Require is not defined.');

            // Use jsdom to parse the html.
            parser = _require('jsdom').jsdom;
            return parser(html, {});
        }

        // Use the browser's dom parser.
        parser = new DOMParser();
        return parser.parseFromString(html, "text/html");
    }

    // Quoted printable
    // Obtained from the quoted-printable package by @mathias
    // https://mths.be/quoted-printable v1.0.0 by @mathias | MIT license
    quotedPrintable = function () {
        var stringFromCharCode = String.fromCharCode;

        var decode = function decode(input) {
            return input
            // https://tools.ietf.org/html/rfc2045#section-6.7, rule 3:
            // "Therefore, when decoding a `Quoted-Printable` body, any trailing white
            // space on a line must be deleted, as it will necessarily have been added
            // by intermediate transport agents."
            .replace(/[\t\x20]$/gm, '')
            // Remove hard line breaks preceded by `=`. Proper `Quoted-Printable`-
            // encoded data only contains CRLF line endings, but for compatibility
            // reasons we support separate CR and LF too.
            .replace(/=(?:\r\n?|\n|$)/g, '')
            // Decode escape sequences of the form `=XX` where `XX` is any
            // combination of two hexidecimal digits. For optimal compatibility,
            // lowercase hexadecimal digits are supported as well. See
            // https://tools.ietf.org/html/rfc2045#section-6.7, note 1.
            .replace(/=([a-fA-F0-9]{2})/g, function ($0, $1) {
                var codePoint = parseInt($1, 16);
                return stringFromCharCode(codePoint);
            });
        };

        var handleTrailingCharacters = function handleTrailingCharacters(string) {
            return string.replace(/\x20$/, '=20') // Handle trailing space.
            .replace(/\t$/, '=09'); // Handle trailing tab.
        };

        var regexUnsafeSymbols = /[\0-\x08\n-\x1F=\x7F-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]/g;

        var encode = function encode(string) {
            // Encode symbols that are definitely unsafe (i.e. unsafe in any context).
            var encoded = string.replace(regexUnsafeSymbols, function (symbol) {
                if (symbol > '\xFF') {
                    throw RangeError('`quotedPrintable.encode()` expects extended ASCII input only. ' + 'Don\u2019t forget to encode the input first using a character ' + 'encoding like UTF-8.');
                }
                var codePoint = symbol.charCodeAt(0);
                var hexadecimal = codePoint.toString(16).toUpperCase();
                return '=' + ('0' + hexadecimal).slice(-2);
            });

            // Limit lines to 76 characters (not counting the CRLF line endings).
            var lines = encoded.split(/\r\n?|\n/g);
            var lineIndex = -1;
            var lineCount = lines.length;
            var result = [];
            while (++lineIndex < lineCount) {
                var line = lines[lineIndex];
                // Leave room for the trailing `=` for soft line breaks.
                var LINE_LENGTH = 75;
                var index = 0;
                var length = line.length;
                while (index < length) {
                    var buffer = encoded.slice(index, index + LINE_LENGTH);
                    // If this line ends with `=`, optionally followed by a single uppercase
                    // hexadecimal digit, we broke an escape sequence in half. Fix it by
                    // moving these characters to the next line.
                    if (/=$/.test(buffer)) {
                        buffer = buffer.slice(0, LINE_LENGTH - 1);
                        index += LINE_LENGTH - 1;
                    } else if (/=[A-F0-9]$/.test(buffer)) {
                        buffer = buffer.slice(0, LINE_LENGTH - 2);
                        index += LINE_LENGTH - 2;
                    } else {
                        index += LINE_LENGTH;
                    }
                    result.push(buffer);
                }
            }

            // Encode space and tab characters at the end of encoded lines. Note that
            // with the current implementation, this can only occur at the very end of
            // the encoded string - every other line ends with `=` anyway.
            // NOTE(review): `buffer` is still undefined when `string` is empty, so
            // encode('') throws a TypeError on the next line. Left as vendored.
            var lastLineLength = buffer.length;
            if (/[\t\x20]$/.test(buffer)) {
                // There's a space or a tab at the end of the last encoded line. Remove
                // this line from the `result` array, as it needs to change.
                result.pop();
                if (lastLineLength + 2 <= LINE_LENGTH + 1) {
                    // It's possible to encode the character without exceeding the line
                    // length limit.
                    result.push(handleTrailingCharacters(buffer));
                } else {
                    // It's not possible to encode the character without exceeding the line
                    // length limit. Remove the character from the line, and insert a new
                    // line that contains only the encoded character.
                    result.push(buffer.slice(0, lastLineLength - 1), handleTrailingCharacters(buffer.slice(lastLineLength - 1, lastLineLength)));
                }
            }

            // `Quoted-Printable` uses CRLF.
            return result.join('=\r\n');
        };

        return {
            'encode': encode,
            'decode': decode,
            'version': '1.0.0'
        };
    }();

    // Main module.
    var mhtml2html = {

        /**
         * Restores the value `mhtml2html` had on the root object before this
         * module was loaded (if any) for conflict resolution.
         *
         * @returns {Object} This module.
         */
        noConflict: function noConflict() {
            // Without a root object there is nothing to restore.
            if (root != null) {
                root.mhtml2html = previous_mymodule;
            }
            return mhtml2html;
        },

        /**
         * Returns an object representing the mhtml and its resources.
         *
         * @param {string} mhtml - The MHTML document text.
         * @param {boolean} [html_only=false] - When exactly `true`, stop after
         *     the first part and return it as a parsed document.
         * @returns {Object} `{ frames, media, index }`: assets keyed by
         *     Content-ID, assets keyed by Content-Location, and the location
         *     of the first part. With `html_only`, a document instead.
         * @throws {Error} On malformed input (message includes the line count).
         */
        parse: function parse(mhtml) {
            var html_only = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;

            var MHTML_FSM = {
                MHTML_HEADERS: 0,
                MHTML_CONTENT: 1,
                MHTML_DATA: 2,
                MHTML_END: 3
            };

            var asset = void 0,
                headers = void 0,
                content = void 0,
                media = void 0,
                frames = void 0; // Record-keeping.
            var location = void 0,
                encoding = void 0,
                type = void 0,
                id = void 0; // Content properties.
            var state = void 0,
                next = void 0,
                index = void 0,
                i = void 0,
                j = void 0,
                l = void 0; // States.
            var boundary = void 0; // Boundaries.

            headers = {};
            content = {};
            media = {};
            frames = {};

            // Initial state and index.
            state = MHTML_FSM.MHTML_HEADERS;
            i = l = 0;

            // Discards characters until a non-whitespace character is encountered.
            function trim() {
                for (;;) {
                    assert(i < mhtml.length - 1, 'Unexpected EOF');
                    if (!/\s/.test(mhtml[i])) {
                        break;
                    }
                    i++;
                    if (mhtml[i] === '\n') {
                        l++;
                    }
                }
            }

            // Returns the next line (including its newline) from the index,
            // decoded according to the given transfer encoding.
            function getLine(encoding) {
                var line;

                j = i;

                // Wait until a newline character is encountered or when we exceed
                // the str length.
                for (;;) {
                    if (mhtml[i] === '\n') {
                        i++;
                        l++;
                        break;
                    }
                    assert(i++ < mhtml.length - 1, 'Unexpected EOF');
                }

                line = mhtml.substring(j, i);

                // Return the (decoded) line.
                switch (encoding) {
                    case "quoted-printable":
                        return quotedPrintable.decode(line);
                    case "base64":
                        return line.trim();
                    default:
                        return line;
                }
            }

            // A header section ends at a line that coerces to the number zero,
            // i.e. an empty or whitespace-only line such as "\n" or "\r\n".
            function endsHeaders(line) {
                return Number(line) === 0;
            }

            // Splits headers from the first instance of ':' or '='.
            function splitHeaders(line, obj) {
                var kv = line.split(/[:=](.+)?/);
                assert(kv.length >= 2, 'Invalid header; Line ' + l);
                // The capture group is undefined for a header with nothing after
                // the separator (e.g. "Subject:"); store an empty value instead
                // of failing on `undefined.trim()`.
                obj[kv[0].trim()] = (kv[1] === undefined ? '' : kv[1]).trim();
            }

            while (state !== MHTML_FSM.MHTML_END) {
                switch (state) {

                    // Fetch document headers including the boundary to use.
                    case MHTML_FSM.MHTML_HEADERS:
                        {
                            next = getLine();

                            if (!endsHeaders(next)) {
                                splitHeaders(next, headers);
                            } else {
                                boundary = headers['boundary'];

                                // Ensure the extracted boundary exists.
                                assert(boundary !== undefined, 'Missing boundary from document headers; Line ' + l);
                                boundary = boundary.replace(/\"/g, '');

                                trim();
                                next = getLine();

                                // Expect the next boundary to appear.
                                assert(next.includes(boundary), 'Expected boundary; Line ' + l);
                                content = {};
                                state = MHTML_FSM.MHTML_CONTENT;
                            }
                            break;
                        }

                    // Parse and store content headers.
                    case MHTML_FSM.MHTML_CONTENT:
                        {
                            next = getLine();

                            if (!endsHeaders(next)) {
                                splitHeaders(next, content);
                            } else {
                                encoding = content['Content-Transfer-Encoding'];
                                type = content['Content-Type'];
                                id = content['Content-ID'];
                                location = content['Content-Location'];

                                // Assume the first boundary to be the document.
                                if (index === undefined) {
                                    index = location;
                                    assert(index !== undefined && type === "text/html", 'Index not found; Line ' + l);
                                }

                                // Ensure the extracted information exists.
                                assert(id !== undefined || location !== undefined, 'ID or location header not provided;  Line ' + l);
                                assert(encoding !== undefined, 'Content-Transfer-Encoding not provided;  Line ' + l);
                                assert(type !== undefined, 'Content-Type not provided; Line ' + l);

                                asset = {
                                    encoding: encoding,
                                    type: type,
                                    data: '',
                                    id: id
                                };

                                // Keep track of frames by ID.
                                if (id !== undefined) {
                                    frames[id] = asset;
                                }

                                // Associate the first frame with the location.
                                if (location !== undefined && media[location] === undefined) {
                                    media[location] = asset;
                                }

                                trim();
                                content = {};
                                state = MHTML_FSM.MHTML_DATA;
                            }
                            break;
                        }

                    // Map data to content.
                    case MHTML_FSM.MHTML_DATA:
                        {
                            next = getLine(encoding);

                            // Build the decoded string.
                            while (!next.includes(boundary)) {
                                asset.data += next;
                                next = getLine(encoding);
                            }

                            try {
                                // Decode unicode.
                                asset.data = decodeURIComponent(escape(asset.data));
                            } catch (e) {
                                // Best effort: data that is not valid UTF-8
                                // (binary payloads, for instance) is kept as is.
                            }

                            // Ignore assets if 'html_only' is set.
                            if (html_only === true && index !== undefined) {
                                return parseDocument(asset.data);
                            }

                            // Set the finishing state if there are no more characters.
                            state = i >= mhtml.length - 1 ? MHTML_FSM.MHTML_END : MHTML_FSM.MHTML_CONTENT;
                            break;
                        }
                }
            }

            return {
                frames: frames,
                media: media,
                index: index
            };
        },

        /**
         * Accepts an mhtml string or parsed object and returns the converted html.
         *
         * @param {string|Object} mhtml - MHTML text, or the object returned by
         *     `parse` (`{ frames, media, index }`).
         * @returns {Object} The index document with linked stylesheets, images
         *     and CSS `url(...)` references replaced by embedded data.
         * @throws {Error} On invalid input, or when a required runtime
         *     facility (`btoa`, DOM parser) cannot be obtained.
         */
        convert: function convert(mhtml) {
            var index = void 0,
                media = void 0,
                frames = void 0; // Record-keeping.
            var reference = void 0; // Current reference / replacement text.
            var i = void 0,
                j = void 0; // Scan cursor and start of the current `url(`.

            if ((typeof mhtml === 'undefined' ? 'undefined' : _typeof(mhtml)) === _typeof('')) {
                mhtml = mhtml2html.parse(mhtml);
            } else {
                assert((typeof mhtml === 'undefined' ? 'undefined' : _typeof(mhtml)) === _typeof({}), 'Expected argument of type string or object');
            }

            frames = mhtml.frames;
            media = mhtml.media;
            index = mhtml.index;

            assert((typeof frames === 'undefined' ? 'undefined' : _typeof(frames)) === _typeof({}), 'MHTML error: invalid frames');
            assert((typeof media === 'undefined' ? 'undefined' : _typeof(media)) === _typeof({}), 'MHTML error: invalid media');
            assert((typeof index === 'undefined' ? 'undefined' : _typeof(index)) === _typeof(' '), 'MHTML error: invalid index');
            assert(media[index] && media[index].type === "text/html", 'MHTML error: invalid index');

            var _btoa = void 0;
            if (typeof btoa === 'undefined') {
                // `typeof` is required: comparing the function itself with the
                // string 'undefined' is always true and never caught the error.
                assert(typeof _require !== 'undefined', 'Require is not defined.');
                _btoa = _require('btoa');
            } else {
                _btoa = btoa;
            }

            // Resolves `relative` against `base` and returns an absolute path.
            // http://stackoverflow.com/questions/14780350/convert-relative-path-to-absolute-using-javascript
            function resolve(base, relative) {
                var splitUrl, stack, parts, path;

                // Ignore paths that start with http, https, or ftp protocols.
                if (/^((http|https|ftp):\/\/)/.test(relative)) {
                    return relative;
                }

                if (relative[0] === '/') {
                    splitUrl = base.match(new RegExp("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"));

                    // Prefix the path with the base protocol and domain.
                    return splitUrl[2] + '://' + splitUrl[4] + relative;
                }

                // Get the absolute path.
                function getPath(pop) {
                    stack = base.split("/");
                    parts = relative.split("/");

                    // TODO: Find a better way to determine whether or not we need
                    // to pop the last filename.
                    if (pop) {
                        stack.pop();
                    }

                    for (var p = 0; p < parts.length; p++) {
                        if (parts[p] === "..") {
                            stack.pop();
                        } else if (parts[p] !== ".") {
                            stack.push(parts[p]);
                        }
                    }
                    return stack.join("/");
                }

                // Try the base as a directory first, then as a file whose name
                // has to be dropped.
                path = getPath();
                if (media[path] == null) {
                    return getPath(true);
                }

                return path;
            }

            // Replaces the reference that follows the `url(` found at `j` (with
            // the cursor `i` just past it) by the corresponding data URL.
            // Returns the updated text, or null when the asset is unknown.
            function replaceReference(url, asset) {
                var path, k;

                // Get the absolute path of the referenced asset.
                reference = asset.substring(i, asset.indexOf(')', i));
                i += reference.length;
                path = resolve(url, reference.replace(/(\"|\')/g, ''));

                if (media[path] == null) {
                    return null;
                }

                // Replace the reference with an encoded version of the resource.
                reference = 'url(\'data:' + media[path].type + ';base64,' + (media[path].encoding === 'base64' ? media[path].data : _btoa(unescape(encodeURIComponent(quote(media[path].data))))) + '\')';

                k = i;
                i = j + reference.length;

                // Replace the url with the base64 encoded string.
                return '' + asset.substring(0, j) + reference + asset.substring(k + 1);
            }

            // Embeds the assets referenced by the inline style of a node.
            function inlineStyleReferences(child) {
                var property;

                for (property in child.style) {
                    if (typeof child.style[property] === 'string') {
                        // Every value is scanned from its beginning; a match at
                        // offset 0 (the usual `url(...)` value) counts too.
                        i = 0;

                        // Find the next css rule with an external reference.
                        while ((i = child.style[property].indexOf(CSS_URL_RULE, i)) !== -1) {
                            j = i;
                            i += CSS_URL_RULE.length;

                            // Try to resolve the reference.
                            reference = replaceReference(index, child.style[property]);
                            if (reference != null) {
                                child.style[property] = reference;
                            }
                        }
                    }
                }
            }

            // Merge resources into the document.
            function mergeResources(documentElem) {
                var childNode = void 0,
                    children = void 0;
                var nodes = void 0;
                var c = void 0;

                // Breadth-first walk over the whole tree.
                nodes = [documentElem];

                while (nodes.length) {
                    childNode = nodes.shift();

                    // Snapshot the children: nodes may be replaced while iterating.
                    children = new Array(Object.keys(childNode.childNodes).length);
                    for (c = 0; c < children.length; c++) {
                        children[c] = childNode.childNodes[c];
                    }

                    // Resolve each node.
                    children.forEach(function (child) {
                        var href = null,
                            src = null;
                        var base = void 0,
                            style = void 0;

                        if (child.getAttribute) {
                            href = child.getAttribute('href');
                            src = child.getAttribute('src');
                        }

                        switch (child.tagName) {
                            case 'HEAD':
                                // Link targets should be directed to the outer frame.
                                base = documentElem.createElement("base");
                                base.setAttribute("target", "_parent");
                                child.insertBefore(base, child.firstChild);
                                break;

                            case 'LINK':
                                if (typeof media[href] !== 'undefined' && media[href].type === 'text/css') {
                                    style = documentElem.createElement('style');
                                    style.type = 'text/css';

                                    i = 0;

                                    // Find the next css rule with an external reference.
                                    while ((i = media[href].data.indexOf(CSS_URL_RULE, i)) !== -1) {
                                        j = i;
                                        i += CSS_URL_RULE.length;

                                        // Try to resolve the reference.
                                        reference = replaceReference(href, media[href].data);
                                        if (reference != null) {
                                            media[href].data = reference;
                                        }
                                    }

                                    style.appendChild(documentElem.createTextNode(quote(media[href].data)));
                                    childNode.replaceChild(style, child);
                                }
                                break;

                            case 'IMG':
                                if (typeof media[src] !== 'undefined' && media[src].type.includes('image')) {
                                    switch (media[src].encoding) {
                                        case 'quoted-printable':
                                            // parse() decodes quoted-printable data line by
                                            // line already; decoding it a second time would
                                            // corrupt any `=XX` sequence in the payload.
                                            reference = 'data:' + media[src].type + ';utf8,' + media[src].data;
                                            break;
                                        case 'base64':
                                            reference = 'data:' + media[src].type + ';base64,' + media[src].data;
                                            break;
                                        default:
                                            reference = 'data:' + media[src].type + ';base64,' + _btoa(unescape(encodeURIComponent(quote(media[src].data))));
                                            break;
                                    }
                                    child.setAttribute('src', reference);
                                }
                                // Images get their inline style processed as well.
                                inlineStyleReferences(child);
                                break;

                            default:
                                inlineStyleReferences(child);
                                break;
                        }

                        // The embedded content no longer matches any integrity hash.
                        if (child.removeAttribute) {
                            child.removeAttribute('integrity');
                        }

                        nodes.push(child);
                    });
                }

                return documentElem;
            }

            // Return the parsed HTML with resources
            return mergeResources(parseDocument(media[index].data));
        }
    };

    // Export
    if (typeof exports !== 'undefined') {
        if (typeof module !== 'undefined' && module.exports) {
            exports = module.exports = mhtml2html;
        }
        exports.mhtml2html = mhtml2html;
    } else if (root != null) {
        root.mhtml2html = mhtml2html;
    }
})(
    // Prefer `window`; fall back to the worker / universal global object so the
    // module stays reachable where neither `window` nor `exports` exists.
    typeof window !== 'undefined' ? window :
    typeof self !== 'undefined' ? self :
    typeof globalThis !== 'undefined' ? globalThis : null
);