@salesflare/planer
Version:
Remove reply quotations from emails
340 lines (305 loc) • 13.9 kB
JavaScript
// Generated by CoffeeScript 2.5.1
(function() {
// Hoisted declarations for every module-private binding assigned further down in this file.
var CONTENT_CHUNK_SIZE, MAX_LINES_COUNT, MAX_LINE_LENGTH, REGEXES, SPLITTER_MAX_LINES, _CRLF_to_LF, _restore_CRLF, getDelimiter, htmlPlaner, isSplitter, postprocess, preprocess, setReturnFlags;
// Project-local helpers used by the html extraction path (document creation, quote cutting, checkpoints).
htmlPlaner = require('./htmlPlaner');
// Project-local table of the regular expressions referenced throughout this module.
REGEXES = require('./regexes');
// Number of consecutive lines joined together and handed to isSplitter when probing for a splitter.
SPLITTER_MAX_LINES = 4;
// Plain text bodies are split into at most this many lines;
// html bodies whose text has more lines than this are returned without quotation stripping.
MAX_LINES_COUNT = 1000;
// isSplitter refuses inputs longer than this, and preprocess only applies the
// 'On <date> <person> wrote:' line split to bodies shorter than this.
MAX_LINE_LENGTH = 200000;
// Extract actual message from email.
// Will use provided `contentType` to decide which algorithm to use (plain text or html).
// @param msgBody [String] the html content of the email
// @param contentType [String] the contentType of the email. Only `text/plain` and `text/html` are supported.
// @param dom [Document] the document object to use for html parsing.
// @return [String] the text/html of the actual message without quotations
exports.extractFrom = function(msgBody, contentType = 'text/plain', dom = null) {
if (contentType === 'text/plain') {
return exports.extractFromPlain(msgBody);
} else if (contentType === 'text/html') {
return exports.extractFromHtml(msgBody, dom);
} else {
console.warn('Unknown contentType', contentType);
}
return msgBody;
};
// Extract actual message from provided textual email.
// Store delimiter used by the email (\n or \r\n),
// split the email into lines,
// use regexes to mark each line as either part of the message or quotation,
// remove lines that are part of the quotation,
// put message back together using the saved delimeter,
// remove changes made by algorithm.
// @param msgBody [String] the html content of the email
// @return [String] the text of the message without quotations
exports.extractFromPlain = function(msgBody) {
var delimiter, lines, markers;
delimiter = getDelimiter(msgBody);
msgBody = preprocess(msgBody, delimiter);
lines = msgBody.split(delimiter, MAX_LINES_COUNT);
markers = exports.markMessageLines(lines);
lines = exports.processMarkedLines(lines, markers);
msgBody = lines.join(delimiter);
msgBody = postprocess(msgBody);
return msgBody;
};
// Extract actual message from provided html message body
// using tags and plain text algorithm.
// Cut out the 'blockquote', 'gmail_quote' tags.
// Cut out Microsoft (Outlook, Windows mail) quotations.
// Then use plain text algorithm to cut out splitter or
// leftover quotation.
// This works by adding checkpoint text to all html tags,
// then converting html to text,
// then extracting quotations from text,
// then checking deleted checkpoints,
// then deleting necessary tags.
// Will use the document provided to create a new document using:
// Document.implementation.createHTMLDocument()
// @param msgBody [String] the html content of the email
// @param dom [Document] a document object or equivalent implementation.
// Must respond to `DOMImplementation.createHTMLDocument()`.
// @see https://developer.mozilla.org/en-US/docs/Web/API/DOMImplementation/createHTMLDocument
exports.extractFromHtml = function(msgBody, dom) {
var checkpoint, crlfReplaced, emailDocument, emailDocumentCopy, haveCutQuotations, i, index, k, l, len, len1, line, lineCheckpoints, lines, m, markers, matches, numberOfCheckpoints, plainTextMsg, quotationCheckpoints, ref, ref1, ref2, returnFlags;
if (dom == null) {
console.error("No dom provided to parse html.");
return msgBody;
}
if (msgBody.trim() === '') {
return msgBody;
}
[msgBody, crlfReplaced] = _CRLF_to_LF(msgBody);
emailDocument = htmlPlaner.createEmailDocument(msgBody, dom);
// TODO: this check does not handle cases of emails between various email providers well because
// it will find whichever splitter comes first in this list, not necessarily the top-most and stop
// checking for others. Possible solution is to use something like compareByDomPosition from htmlPlaner
// to find the earliest splitter in the DOM.
haveCutQuotations = htmlPlaner.cutGmailQuote(emailDocument) || htmlPlaner.cutBlockQuote(emailDocument) || htmlPlaner.cutMicrosoftQuote(emailDocument) || htmlPlaner.cutById(emailDocument) || htmlPlaner.cutFromBlock(emailDocument);
// Create unaltered copy of email document
emailDocumentCopy = htmlPlaner.createEmailDocument(emailDocument.documentElement.outerHTML, dom);
// Add checkpoints to html document
numberOfCheckpoints = htmlPlaner.addCheckpoints(emailDocument.body, 0);
quotationCheckpoints = Array.apply(null, Array(numberOfCheckpoints)).map(function() {
return false;
});
// Get plain text version to put through plain text algorithm
htmlPlaner.replaceBreakTagsWithLineFeeds(emailDocument);
plainTextMsg = emailDocument.body.textContent;
plainTextMsg = preprocess(plainTextMsg, "\n", 'text/html');
lines = plainTextMsg.split('\n');
if (lines.length > MAX_LINES_COUNT) {
return msgBody;
}
// Collect checkpoints for each line
lineCheckpoints = new Array(lines.length);
for (index = k = 0, len = lines.length; k < len; index = ++k) {
line = lines[index];
matches = line.match(htmlPlaner.CHECKPOINT_PATTERN) || [];
lineCheckpoints[index] = matches.map(function(match) {
return parseInt(match.slice(4, -4));
});
}
// Remove checkpoints from lines to pass through plain text algorithm
lines = lines.map(function(line) {
return line.replace(htmlPlaner.CHECKPOINT_PATTERN, '');
});
markers = exports.markMessageLines(lines);
returnFlags = {};
exports.processMarkedLines(lines, markers, returnFlags);
if (!returnFlags.wereLinesDeleted) {
if (haveCutQuotations) {
// If we cut a quotation element out of the html, return the html output of the copied document.
return _restore_CRLF(emailDocumentCopy.documentElement.outerHTML, crlfReplaced);
} else {
// There was nothing to remove, return original message.
return msgBody;
}
}
// Set quotationCheckpoints to true for checkpoints on lines that were removed
for (i = l = ref = returnFlags.firstLine, ref1 = returnFlags.lastLine; (ref <= ref1 ? l <= ref1 : l >= ref1); i = ref <= ref1 ? ++l : --l) {
if (!lineCheckpoints[i]) {
continue;
}
ref2 = lineCheckpoints[i];
for (m = 0, len1 = ref2.length; m < len1; m++) {
checkpoint = ref2[m];
quotationCheckpoints[checkpoint] = true;
}
}
// Remove the element that have been identified as part of the quoted message
htmlPlaner.deleteQuotationTags(emailDocumentCopy.body, 0, quotationCheckpoints);
return emailDocumentCopy.documentElement.outerHTML;
};
// Mark message lines with markers to distinguish quotation lines.
// Markers:
// * e - empty line
// * f - Forwarded message line, see REGEXES.FWD
// * m - line that starts with quotation marker '>'
// * s - splitter line
// * t - presumably lines from the last message in the conversation
// $> markMessageLines(['answer', 'From: foo@bar.com', '', '> question'])
// 'tsem'
exports.markMessageLines = function(lines) {
var i, j, k, markers, ref, splitter, splitterLines;
markers = [];
i = 0;
while (i < lines.length) {
if (lines[i].trim() === '') {
markers[i] = 'e'; // empty line
} else if (REGEXES.QUOT_PATTERN.test(lines[i])) {
markers[i] = 'm'; // line with quotation marker
} else if (REGEXES.FWD.test(lines[i])) {
markers[i] = 'f'; // ---- Forwarded message ----
} else {
splitter = isSplitter(lines.slice(i, i + SPLITTER_MAX_LINES).join("\n"));
if (splitter) {
// splitter[0] is the entire match
splitterLines = splitter[0].split("\n");
for (j = k = 0, ref = splitterLines.length; (0 <= ref ? k <= ref : k >= ref); j = 0 <= ref ? ++k : --k) {
markers[i + j] = 's';
}
i += splitterLines.length - 1;
} else {
markers[i] = 't';
}
}
i++;
}
return markers.join('');
};
// Try each regex in REGEXES.SPLITTER_PATTERNS, in order, against the given text.
// Only a match that begins at the very start of the text counts.
// Text longer than MAX_LINE_LENGTH is rejected without running any regex.
// @param line [String] the text to test (callers may pass several lines joined by "\n")
// @return [Array, null] the match array of the first pattern matching at index 0, or null
isSplitter = function(line) {
  if (line.length > MAX_LINE_LENGTH) {
    return null;
  }
  for (const splitterPattern of REGEXES.SPLITTER_PATTERNS) {
    const found = splitterPattern.exec(line);
    if (!found || found.index !== 0) {
      continue;
    }
    return found;
  }
  return null;
};
// Run regexes against message's marked lines to strip quotations.
// Return only last message lines.
// $> processMarkedLines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem'])
// ['Hello']
// Will also modify the provided returnFlags object and set the following properties:
// returnFlags = { wereLinesDeleted: (true|false), firstLine: (Number), lastLine: (Number) }
// @see setReturnFlags
exports.processMarkedLines = function(lines, markers, returnFlags = {}) {
var inlineMatchRegex, inlineReplyIndex, inlineReplyMatch, isInlineReplyLink, quotationEnd, quotationMatch;
// If there are no splitters there should be no markers
if (markers.indexOf('s') < 0 && !/(me*){3}/.test(markers)) {
markers = markers.replace(/m/g, 't');
}
// If the message is a forward do nothing.
if (/^[te]*f/.test(markers)) {
setReturnFlags(returnFlags, false, -1, -1);
return lines;
}
// Find inline replies (tm's following the first m in markers string)
inlineMatchRegex = new RegExp('m(?=e*((?:t+e*)+)m)', 'g');
while (inlineReplyMatch = inlineMatchRegex.exec(lines)) {
inlineReplyIndex = markers.indexOf(inlineReplyMatch[1], inlineReplyMatch.index);
isInlineReplyLink = false;
if (inlineReplyIndex > -1) {
isInlineReplyLink = REGEXES.PARENTHESIS_LINK.test(lines[inlineReplyIndex - 1]) || lines[inlineReplyIndex].trim().search(REGEXES.PARENTHESIS_LINK) === 0;
}
if (!isInlineReplyLink) {
setReturnFlags(returnFlags, false, -1, -1);
return lines;
}
}
// Cut out text lines coming after splitter if there are no markers there
quotationMatch = new RegExp('(se*)+((t|f)+e*)+', 'g').exec(markers);
if (quotationMatch) {
setReturnFlags(returnFlags, true, quotationMatch.index, lines.length);
return lines.slice(0, quotationMatch.index);
}
// Handle the case with markers
quotationMatch = REGEXES.QUOTATION.exec(markers) || REGEXES.EMPTY_QUOTATION.exec(markers);
if (quotationMatch) {
quotationEnd = quotationMatch.index + quotationMatch[1].length;
setReturnFlags(returnFlags, true, quotationMatch.index, quotationEnd);
return lines.slice(0, quotationMatch.index).concat(lines.slice(quotationEnd));
}
setReturnFlags(returnFlags, false, -1, -1);
return lines;
};
// Record the outcome of processMarkedLines on the caller-supplied flags object.
// @param returnFlags [Object] object to write the three properties on
// @param wereLinesDeleted [Boolean] stored as returnFlags.wereLinesDeleted
// @param firstLine [Number] stored as returnFlags.firstLine
// @param lastLine [Number] stored as returnFlags.lastLine
// @return [Number] the lastLine value that was stored
setReturnFlags = function(returnFlags, wereLinesDeleted, firstLine, lastLine) {
  Object.assign(returnFlags, { wereLinesDeleted, firstLine, lastLine });
  return lastLine;
};
// Prepares msgBody for being stripped.
// Replaces link brackets so that they couldn't be taken for quotation marker.
// Splits line in two if splitter pattern preceded by some text on the same
// line (done only for 'On <date> <person> wrote:' pattern, only for `text/plain`
// content and only when the body is shorter than MAX_LINE_LENGTH).
// @param msgBody [String] the text to prepare
// @param delimiter [String] the line delimiter inserted in front of a mid-line splitter
// @param contentType [String] the contentType of the email the text comes from
// @return [String] the prepared text
preprocess = function(msgBody, delimiter, contentType = 'text/plain') {
  // A replace callback receives (match, ...captures, offset, wholeString[, groups]).
  // Captures are strings or undefined, so the first numeric argument is the match offset
  // no matter how many capture groups the regex declares.
  const offsetOf = function(replacerArgs) {
    return replacerArgs.find(function(arg) {
      return typeof arg === 'number';
    });
  };
  // Normalize links i.e. replace '<', '>' wrapping the link with some symbols
  // so that '>' closing the link couldn't be mistakenly taken for quotation
  // marker.
  // The first captured group of REGEXES.LINK is the link itself.
  const withNormalizedLinks = msgBody.replace(REGEXES.LINK, function(entireMatch, groupMatch1, ...rest) {
    // Start of the line holding the match: one past the closest preceding newline,
    // or 0 when the match is on the first line (lastIndexOf yields -1 there).
    const lineStart = msgBody.lastIndexOf("\n", offsetOf(rest)) + 1;
    // If the current line starts with a '>' quotation marker, don't mess with the link
    if (msgBody[lineStart] === '>') {
      return entireMatch;
    }
    return `@@${groupMatch1}@@`;
  });
  if (contentType !== 'text/plain' || withNormalizedLinks.length >= MAX_LINE_LENGTH) {
    return withNormalizedLinks;
  }
  return withNormalizedLinks.replace(REGEXES.ON_DATE_SMB_WROTE, function(entireMatch, ...rest) {
    const matchIndex = offsetOf(rest);
    // Already at the start of the text or of a line: nothing to split.
    if (matchIndex === 0 || withNormalizedLinks[matchIndex - 1] === "\n") {
      return entireMatch;
    }
    return `${delimiter}${entireMatch}`;
  });
};
// Undo the link normalisation applied while preprocessing:
// every REGEXES.NORMALIZED_LINK match becomes its first captured group wrapped in '<' and '>' again.
// The result is trimmed of leading and trailing whitespace.
// @param msgBody [String] the processed message text
// @return [String] the text with link brackets restored, trimmed
postprocess = function(msgBody) {
  const withBrackets = msgBody.replace(REGEXES.NORMALIZED_LINK, '<$1>');
  return withBrackets.trim();
};
// Step, in characters, between the starts of the consecutive chunks that getDelimiter scans.
CONTENT_CHUNK_SIZE = 100;
// Find the line delimiter used by the message.
// The body is scanned chunk by chunk with REGEXES.DELIMITER and the text of the first match is returned.
// Each chunk reaches one character past the next chunk's start, so that a two-character delimiter
// ("\r\n") sitting exactly on a chunk boundary is seen whole instead of being reported as "\n".
// @param msgBody [String] the message text
// @return [String] the first delimiter found, or "\n" when the text contains none
getDelimiter = function(msgBody) {
  for (let start = 0; start < msgBody.length; start += CONTENT_CHUNK_SIZE) {
    const bodyChunk = msgBody.slice(start, start + CONTENT_CHUNK_SIZE + 1);
    const delimiterMatch = REGEXES.DELIMITER.exec(bodyChunk);
    if (delimiterMatch) {
      return delimiterMatch[0];
    }
  }
  return "\n";
};
// Convert a CRLF-delimited message to LF.
// The conversion only happens when getDelimiter reports "\r\n" for the message;
// in that case every "\r\n" sequence is replaced by "\n".
// @param msgBody [String] the message text
// @return [Array] a pair [text, replaced]: the (possibly converted) text and whether a conversion was done
_CRLF_to_LF = function(msgBody) {
  const usesCrlf = getDelimiter(msgBody) === '\r\n';
  return usesCrlf ? [msgBody.split('\r\n').join('\n'), true] : [msgBody, false];
};
// Turn every "\n" of the text back into "\r\n" when the `replaced` flag is truthy;
// otherwise hand the text back unchanged.
// @param msgBody [String] the message text
// @param replaced [Boolean] whether the conversion should be applied (defaults to true)
// @return [String] the resulting text
_restore_CRLF = function(msgBody, replaced = true) {
  return replaced ? msgBody.split('\n').join('\r\n') : msgBody;
};
}).call(this);