UNPKG

@salesflare/planer

Version:

Remove reply quotations from emails

340 lines (305 loc) 13.9 kB
// Generated by CoffeeScript 2.5.1 (function() { var CONTENT_CHUNK_SIZE, MAX_LINES_COUNT, MAX_LINE_LENGTH, REGEXES, SPLITTER_MAX_LINES, _CRLF_to_LF, _restore_CRLF, getDelimiter, htmlPlaner, isSplitter, postprocess, preprocess, setReturnFlags; htmlPlaner = require('./htmlPlaner'); REGEXES = require('./regexes'); SPLITTER_MAX_LINES = 4; MAX_LINES_COUNT = 1000; MAX_LINE_LENGTH = 200000; // Extract actual message from email. // Will use provided `contentType` to decide which algorithm to use (plain text or html). // @param msgBody [String] the html content of the email // @param contentType [String] the contentType of the email. Only `text/plain` and `text/html` are supported. // @param dom [Document] the document object to use for html parsing. // @return [String] the text/html of the actual message without quotations exports.extractFrom = function(msgBody, contentType = 'text/plain', dom = null) { if (contentType === 'text/plain') { return exports.extractFromPlain(msgBody); } else if (contentType === 'text/html') { return exports.extractFromHtml(msgBody, dom); } else { console.warn('Unknown contentType', contentType); } return msgBody; }; // Extract actual message from provided textual email. // Store delimiter used by the email (\n or \r\n), // split the email into lines, // use regexes to mark each line as either part of the message or quotation, // remove lines that are part of the quotation, // put message back together using the saved delimeter, // remove changes made by algorithm. // @param msgBody [String] the html content of the email // @return [String] the text of the message without quotations exports.extractFromPlain = function(msgBody) { var delimiter, lines, markers; delimiter = getDelimiter(msgBody); msgBody = preprocess(msgBody, delimiter); lines = msgBody.split(delimiter, MAX_LINES_COUNT); markers = exports.markMessageLines(lines); lines = exports.processMarkedLines(lines, markers); msgBody = lines.join(delimiter); msgBody = postprocess(msgBody); return msgBody; }; // Extract actual message from provided html message body // using tags and plain text algorithm. // Cut out the 'blockquote', 'gmail_quote' tags. // Cut out Microsoft (Outlook, Windows mail) quotations. // Then use plain text algorithm to cut out splitter or // leftover quotation. // This works by adding checkpoint text to all html tags, // then converting html to text, // then extracting quotations from text, // then checking deleted checkpoints, // then deleting necessary tags. // Will use the document provided to create a new document using: // Document.implementation.createHTMLDocument() // @param msgBody [String] the html content of the email // @param dom [Document] a document object or equivalent implementation. // Must respond to `DOMImplementation.createHTMLDocument()`. // @see https://developer.mozilla.org/en-US/docs/Web/API/DOMImplementation/createHTMLDocument exports.extractFromHtml = function(msgBody, dom) { var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, returnFlags; if (dom == null) { console.error("No dom provided to parse html."); return msgBody; } if (msgBody.trim() === '') { return msgBody; } [msgBody, crlfReplaced] = _CRLF_to_LF(msgBody); emailDocument = htmlPlaner.createEmailDocument(msgBody, dom); // TODO: this check does not handle cases of emails between various email providers well because // it will find whichever splitter comes first in this list, not necessarily the top-most and stop // checking for others. Possible solution is to use something like compareByDomPosition from htmlPlaner // to find the earliest splitter in the DOM. haveCutQuotations = htmlPlaner.cutGmailQuote(emailDocument) || htmlPlaner.cutBlockQuote(emailDocument) || htmlPlaner.cutMicrosoftQuote(emailDocument) || htmlPlaner.cutById(emailDocument) || htmlPlaner.cutFromBlock(emailDocument); // Create unaltered copy of email document emailDocumentCopy = htmlPlaner.createEmailDocument(emailDocument.documentElement.outerHTML, dom); // Add checkpoints to html document numberOfCheckpoints = htmlPlaner.addCheckpoints(emailDocument.body, 0); quotationCheckpoints = Array.apply(null, Array(numberOfCheckpoints)).map(function() { return false; }); // Get plain text version to put through plain text algorithm htmlPlaner.replaceBreakTagsWithLineFeeds(emailDocument); plainTextMsg = emailDocument.body.textContent; plainTextMsg = preprocess(plainTextMsg, "\n", 'text/html'); lines = plainTextMsg.split('\n'); if (lines.length > MAX_LINES_COUNT) { return msgBody; } // Collect checkpoints for each line lineCheckpoints = new Array(lines.length); for (index = k = 0, len = lines.length; k < len; index = ++k) { line = lines[index]; matches = line.match(htmlPlaner.CHECKPOINT_PATTERN) || []; lineCheckpoints[index] = matches.map(function(match) { return parseInt(match.slice(4, -4)); }); } // Remove checkpoints from lines to pass through plain text algorithm lines = lines.map(function(line) { return line.replace(htmlPlaner.CHECKPOINT_PATTERN, ''); }); markers = exports.markMessageLines(lines); returnFlags = {}; exports.processMarkedLines(lines, markers, returnFlags); if (!returnFlags.wereLinesDeleted) { if (haveCutQuotations) { // If we cut a quotation element out of the html, return the html output of the copied document. return _restore_CRLF(emailDocumentCopy.documentElement.outerHTML, crlfReplaced); } else { // There was nothing to remove, return original message. return msgBody; } } // Set quotationCheckpoints to true for checkpoints on lines that were removed for (i = l = ref = returnFlags.firstLine, ref1 = returnFlags.lastLine; (ref <= ref1 ? l <= ref1 : l >= ref1); i = ref <= ref1 ? ++l : --l) { if (!lineCheckpoints[i]) { continue; } ref2 = lineCheckpoints[i]; for (m = 0, len1 = ref2.length; m < len1; m++) { checkpoint = ref2[m]; quotationCheckpoints[checkpoint] = true; } } // Remove the element that have been identified as part of the quoted message htmlPlaner.deleteQuotationTags(emailDocumentCopy.body, 0, quotationCheckpoints); return emailDocumentCopy.documentElement.outerHTML; }; // Mark message lines with markers to distinguish quotation lines. // Markers: // * e - empty line // * f - Forwarded message line, see REGEXES.FWD // * m - line that starts with quotation marker '>' // * s - splitter line // * t - presumably lines from the last message in the conversation // $> markMessageLines(['answer', 'From: foo@bar.com', '', '> question']) // 'tsem' exports.markMessageLines = function(lines) { var i, j, k, markers, ref, splitter, splitterLines; markers = []; i = 0; while (i < lines.length) { if (lines[i].trim() === '') { markers[i] = 'e'; // empty line } else if (REGEXES.QUOT_PATTERN.test(lines[i])) { markers[i] = 'm'; // line with quotation marker } else if (REGEXES.FWD.test(lines[i])) { markers[i] = 'f'; // ---- Forwarded message ---- } else { splitter = isSplitter(lines.slice(i, i + SPLITTER_MAX_LINES).join("\n")); if (splitter) { // splitter[0] is the entire match splitterLines = splitter[0].split("\n"); for (j = k = 0, ref = splitterLines.length; (0 <= ref ? k <= ref : k >= ref); j = 0 <= ref ? ++k : --k) { markers[i + j] = 's'; } i += splitterLines.length - 1; } else { markers[i] = 't'; } } i++; } return markers.join(''); }; // Check the line for each splitter regex. isSplitter = function(line) { var k, len, matchArray, pattern, ref; if (line.length > MAX_LINE_LENGTH) { return null; } ref = REGEXES.SPLITTER_PATTERNS; for (k = 0, len = ref.length; k < len; k++) { pattern = ref[k]; matchArray = pattern.exec(line); if (matchArray && matchArray.index === 0) { return matchArray; } } return null; }; // Run regexes against message's marked lines to strip quotations. // Return only last message lines. // $> processMarkedLines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem']) // ['Hello'] // Will also modify the provided returnFlags object and set the following properties: // returnFlags = { wereLinesDeleted: (true|false), firstLine: (Number), lastLine: (Number) } // @see setReturnFlags exports.processMarkedLines = function(lines, markers, returnFlags = {}) { var inlineMatchRegex, inlineReplyIndex, inlineReplyMatch, isInlineReplyLink, quotationEnd, quotationMatch; // If there are no splitters there should be no markers if (markers.indexOf('s') < 0 && !/(me*){3}/.test(markers)) { markers = markers.replace(/m/g, 't'); } // If the message is a forward do nothing. if (/^[te]*f/.test(markers)) { setReturnFlags(returnFlags, false, -1, -1); return lines; } // Find inline replies (tm's following the first m in markers string) inlineMatchRegex = new RegExp('m(?=e*((?:t+e*)+)m)', 'g'); while (inlineReplyMatch = inlineMatchRegex.exec(lines)) { inlineReplyIndex = markers.indexOf(inlineReplyMatch[1], inlineReplyMatch.index); isInlineReplyLink = false; if (inlineReplyIndex > -1) { isInlineReplyLink = REGEXES.PARENTHESIS_LINK.test(lines[inlineReplyIndex - 1]) || lines[inlineReplyIndex].trim().search(REGEXES.PARENTHESIS_LINK) === 0; } if (!isInlineReplyLink) { setReturnFlags(returnFlags, false, -1, -1); return lines; } } // Cut out text lines coming after splitter if there are no markers there quotationMatch = new RegExp('(se*)+((t|f)+e*)+', 'g').exec(markers); if (quotationMatch) { setReturnFlags(returnFlags, true, quotationMatch.index, lines.length); return lines.slice(0, quotationMatch.index); } // Handle the case with markers quotationMatch = REGEXES.QUOTATION.exec(markers) || REGEXES.EMPTY_QUOTATION.exec(markers); if (quotationMatch) { quotationEnd = quotationMatch.index + quotationMatch[1].length; setReturnFlags(returnFlags, true, quotationMatch.index, quotationEnd); return lines.slice(0, quotationMatch.index).concat(lines.slice(quotationEnd)); } setReturnFlags(returnFlags, false, -1, -1); return lines; }; setReturnFlags = function(returnFlags, wereLinesDeleted, firstLine, lastLine) { returnFlags.wereLinesDeleted = wereLinesDeleted; returnFlags.firstLine = firstLine; return returnFlags.lastLine = lastLine; }; // Prepares msgBody for being stripped. // Replaces link brackets so that they couldn't be taken for quotation marker. // Splits line in two if splitter pattern preceded by some text on the same // line (done only for 'On <date> <person> wrote:' pattern). preprocess = function(msgBody, delimiter, contentType = 'text/plain') { // Normalize links i.e. replace '<', '>' wrapping the link with some symbols // so that '>' closing the link couldn't be mistakenly taken for quotation // marker. // REGEXES.LINK has 1 captured group msgBody = msgBody.replace(REGEXES.LINK, function(entireMatch, groupMatch1, matchIndex) { var newLineIndex; // Look for closest newline character newLineIndex = msgBody.lastIndexOf("\n", matchIndex); // If the new current line starts with a '>' quotation marker, don't mess with the link if (newLineIndex > 0 && msgBody[newLineIndex + 1] === '>') { return entireMatch; } else { return `@@${groupMatch1}@@`; } }); if (contentType === 'text/plain' && msgBody.length < MAX_LINE_LENGTH) { // ON_DATE_SMB_WROTE has 4 captured groups msgBody = msgBody.replace(REGEXES.ON_DATE_SMB_WROTE, function(entireMatch, groupMatch1, groupMatch2, groupMatch3, groupMatch4, matchIndex) { if (matchIndex && msgBody[matchIndex - 1] !== "\n") { return `${delimiter}${entireMatch}`; } else { return entireMatch; } }); } return msgBody; }; // Make up for changes done at preprocessing message. // Replace link brackets back to '<' and '>'. postprocess = function(msgBody) { return msgBody.replace(REGEXES.NORMALIZED_LINK, '<$1>').trim(); }; CONTENT_CHUNK_SIZE = 100; getDelimiter = function(msgBody) { var bodyChunk, contentLength, currentIndex, delimiterMatch; contentLength = msgBody.length; currentIndex = 0; bodyChunk = msgBody.substr(currentIndex, CONTENT_CHUNK_SIZE); while (!(delimiterMatch = REGEXES.DELIMITER.exec(bodyChunk)) && currentIndex < contentLength) { currentIndex += CONTENT_CHUNK_SIZE; bodyChunk = msgBody.substr(currentIndex, CONTENT_CHUNK_SIZE); } if (delimiterMatch) { return delimiterMatch[0]; } else { return "\n"; } }; _CRLF_to_LF = function(msgBody) { var delimiter; delimiter = getDelimiter(msgBody); if (delimiter === '\r\n') { return [msgBody.replace(new RegExp(delimiter, 'g'), '\n'), true]; } return [msgBody, false]; }; _restore_CRLF = function(msgBody, replaced = true) { if (replaced) { return msgBody.replace(new RegExp('\n', 'g'), '\r\n'); } return msgBody; }; }).call(this);