@salesflare/planer

htmlPlaner = require './htmlPlaner' REGEXES = require './regexes' SPLITTER_MAX_LINES = 4 MAX_LINES_COUNT = 1000 MAX_LINE_LENGTH = 200000 # Extract actual message from email. # # Will use provided `contentType` to decide which algorithm to use (plain text or html). # # @param msgBody [String] the html content of the email # @param contentType [String] the contentType of the email. Only `text/plain` and `text/html` are supported. # @param dom [Document] the document object to use for html parsing. # @return [String] the text/html of the actual message without quotations exports.extractFrom = (msgBody, contentType= 'text/plain', dom = null) -> if contentType == 'text/plain' return exports.extractFromPlain msgBody else if contentType == 'text/html' return exports.extractFromHtml msgBody, dom else console.warn('Unknown contentType', contentType) return msgBody # Extract actual message from provided textual email. # # Store delimiter used by the email (\n or \r\n), # split the email into lines, # use regexes to mark each line as either part of the message or quotation, # remove lines that are part of the quotation, # put message back together using the saved delimeter, # remove changes made by algorithm. # # @param msgBody [String] the html content of the email # @return [String] the text of the message without quotations exports.extractFromPlain = (msgBody) -> delimiter = getDelimiter msgBody msgBody = preprocess msgBody, delimiter lines = msgBody.split delimiter, MAX_LINES_COUNT markers = exports.markMessageLines lines lines = exports.processMarkedLines lines, markers msgBody = lines.join delimiter msgBody = postprocess msgBody return msgBody # Extract actual message from provided html message body # using tags and plain text algorithm. # # Cut out the 'blockquote', 'gmail_quote' tags. # Cut out Microsoft (Outlook, Windows mail) quotations. # # Then use plain text algorithm to cut out splitter or # leftover quotation. # This works by adding checkpoint text to all html tags, # then converting html to text, # then extracting quotations from text, # then checking deleted checkpoints, # then deleting necessary tags. # # Will use the document provided to create a new document using: # Document.implementation.createHTMLDocument() # # @param msgBody [String] the html content of the email # @param dom [Document] a document object or equivalent implementation. # Must respond to `DOMImplementation.createHTMLDocument()`. # @see https://developer.mozilla.org/en-US/docs/Web/API/DOMImplementation/createHTMLDocument exports.extractFromHtml = (msgBody, dom) -> unless dom? console.error("No dom provided to parse html.") return msgBody if msgBody.trim() == '' return msgBody [msgBody, crlfReplaced] = _CRLF_to_LF msgBody emailDocument = htmlPlaner.createEmailDocument msgBody, dom # TODO: this check does not handle cases of emails between various email providers well because # it will find whichever splitter comes first in this list, not necessarily the top-most and stop # checking for others. Possible solution is to use something like compareByDomPosition from htmlPlaner # to find the earliest splitter in the DOM. haveCutQuotations = ( htmlPlaner.cutGmailQuote(emailDocument) || htmlPlaner.cutBlockQuote(emailDocument) || htmlPlaner.cutMicrosoftQuote(emailDocument) || htmlPlaner.cutById(emailDocument) || htmlPlaner.cutFromBlock(emailDocument) ) # Create unaltered copy of email document emailDocumentCopy = htmlPlaner.createEmailDocument emailDocument.documentElement.outerHTML, dom # Add checkpoints to html document numberOfCheckpoints = htmlPlaner.addCheckpoints emailDocument.body, 0 quotationCheckpoints = Array.apply(null, Array(numberOfCheckpoints)).map(-> false) # Get plain text version to put through plain text algorithm htmlPlaner.replaceBreakTagsWithLineFeeds(emailDocument) plainTextMsg = emailDocument.body.textContent plainTextMsg = preprocess plainTextMsg, "\n", 'text/html' lines = plainTextMsg.split '\n' if lines.length > MAX_LINES_COUNT return msgBody # Collect checkpoints for each line lineCheckpoints = new Array(lines.length) for line, index in lines matches = line.match(htmlPlaner.CHECKPOINT_PATTERN) || [] lineCheckpoints[index] = matches.map((match) -> parseInt(match.slice(4, -4))) # Remove checkpoints from lines to pass through plain text algorithm lines = lines.map((line) -> line.replace(htmlPlaner.CHECKPOINT_PATTERN, '')) markers = exports.markMessageLines lines returnFlags = {} exports.processMarkedLines(lines, markers, returnFlags) # No lines deleted by plain text algorithm, ready to return if !returnFlags.wereLinesDeleted if haveCutQuotations # If we cut a quotation element out of the html, return the html output of the copied document. return _restore_CRLF(emailDocumentCopy.documentElement.outerHTML, crlfReplaced) else # There was nothing to remove, return original message. return msgBody # Set quotationCheckpoints to true for checkpoints on lines that were removed for i in [returnFlags.firstLine..returnFlags.lastLine] continue unless lineCheckpoints[i] for checkpoint in lineCheckpoints[i] quotationCheckpoints[checkpoint] = true # Remove the element that have been identified as part of the quoted message htmlPlaner.deleteQuotationTags emailDocumentCopy.body, 0, quotationCheckpoints return emailDocumentCopy.documentElement.outerHTML # Mark message lines with markers to distinguish quotation lines. # # Markers: # * e - empty line # * f - Forwarded message line, see REGEXES.FWD # * m - line that starts with quotation marker '>' # * s - splitter line # * t - presumably lines from the last message in the conversation # # $> markMessageLines(['answer', 'From: foo@bar.com', '', '> question']) # 'tsem' # exports.markMessageLines = (lines) -> markers = [] i = 0 while i < lines.length if lines[i].trim() == '' markers[i] = 'e' # empty line else if REGEXES.QUOT_PATTERN.test(lines[i]) markers[i] = 'm' # line with quotation marker else if REGEXES.FWD.test(lines[i]) markers[i] = 'f' # ---- Forwarded message ---- else splitter = isSplitter(lines.slice(i, i + SPLITTER_MAX_LINES).join("\n")) if splitter # splitter[0] is the entire match splitterLines = splitter[0].split("\n") for j in [0..splitterLines.length] markers[i + j] = 's' i += (splitterLines.length - 1) else markers[i] = 't' i++ return markers.join('') # Check the line for each splitter regex. isSplitter = (line) -> return null if line.length > MAX_LINE_LENGTH for pattern in REGEXES.SPLITTER_PATTERNS matchArray = pattern.exec line if matchArray && matchArray.index == 0 return matchArray return null # Run regexes against message's marked lines to strip quotations. # # Return only last message lines. # $> processMarkedLines(['Hello', 'From: foo@bar.com', '', '> Hi'], 'tsem']) # ['Hello'] # # Will also modify the provided returnFlags object and set the following properties: # returnFlags = { wereLinesDeleted: (true|false), firstLine: (Number), lastLine: (Number) } # @see setReturnFlags exports.processMarkedLines = (lines, markers, returnFlags = {}) -> # If there are no splitters there should be no markers if markers.indexOf('s') < 0 && !/(me*){3}/.test(markers) markers = markers.replace(/m/g, 't') # If the message is a forward do nothing. if /^[te]*f/.test(markers) setReturnFlags returnFlags, false, -1, -1 return lines # Find inline replies (tm's following the first m in markers string) inlineMatchRegex = new RegExp('m(?=e*((?:t+e*)+)m)', 'g') while inlineReplyMatch = inlineMatchRegex.exec(lines) inlineReplyIndex = markers.indexOf(inlineReplyMatch[1], inlineReplyMatch.index) isInlineReplyLink = false if inlineReplyIndex > -1 isInlineReplyLink = (REGEXES.PARENTHESIS_LINK.test(lines[inlineReplyIndex - 1]) || lines[inlineReplyIndex].trim().search(REGEXES.PARENTHESIS_LINK) == 0) if !isInlineReplyLink setReturnFlags returnFlags, false, -1, -1 return lines # Cut out text lines coming after splitter if there are no markers there quotationMatch = new RegExp('(se*)+((t|f)+e*)+', 'g').exec(markers) if quotationMatch setReturnFlags returnFlags, true, quotationMatch.index, lines.length return lines.slice(0, quotationMatch.index) # Handle the case with markers quotationMatch = REGEXES.QUOTATION.exec(markers) || REGEXES.EMPTY_QUOTATION.exec(markers) if quotationMatch quotationEnd = quotationMatch.index + quotationMatch[1].length setReturnFlags returnFlags, true, quotationMatch.index, quotationEnd return lines.slice(0, quotationMatch.index).concat(lines.slice(quotationEnd)) setReturnFlags returnFlags, false, -1, -1 return lines setReturnFlags = (returnFlags, wereLinesDeleted, firstLine, lastLine) -> returnFlags.wereLinesDeleted = wereLinesDeleted returnFlags.firstLine = firstLine returnFlags.lastLine = lastLine # Prepares msgBody for being stripped. # # Replaces link brackets so that they couldn't be taken for quotation marker. # Splits line in two if splitter pattern preceded by some text on the same # line (done only for 'On <date> <person> wrote:' pattern). # preprocess = (msgBody, delimiter, contentType = 'text/plain') -> # Normalize links i.e. replace '<', '>' wrapping the link with some symbols # so that '>' closing the link couldn't be mistakenly taken for quotation # marker. # REGEXES.LINK has 1 captured group msgBody = msgBody.replace REGEXES.LINK, (entireMatch, groupMatch1, matchIndex) -> # Look for closest newline character newLineIndex = msgBody.lastIndexOf("\n", matchIndex) # If the new current line starts with a '>' quotation marker, don't mess with the link if newLineIndex > 0 && msgBody[newLineIndex + 1] == '>' return entireMatch else return "@@#{ groupMatch1 }@@" if contentType == 'text/plain' && msgBody.length < MAX_LINE_LENGTH # ON_DATE_SMB_WROTE has 4 captured groups msgBody = msgBody.replace REGEXES.ON_DATE_SMB_WROTE, (entireMatch, groupMatch1, groupMatch2, groupMatch3, groupMatch4, matchIndex) -> if matchIndex && msgBody[matchIndex - 1] != "\n" return "#{ delimiter }#{ entireMatch }" else return entireMatch return msgBody # Make up for changes done at preprocessing message. # Replace link brackets back to '<' and '>'. postprocess = (msgBody) -> return msgBody.replace(REGEXES.NORMALIZED_LINK, '<$1>').trim() CONTENT_CHUNK_SIZE = 100 getDelimiter = (msgBody) -> contentLength = msgBody.length currentIndex = 0 bodyChunk = msgBody.substr(currentIndex, CONTENT_CHUNK_SIZE) while !(delimiterMatch = REGEXES.DELIMITER.exec(bodyChunk)) && currentIndex < contentLength currentIndex += CONTENT_CHUNK_SIZE bodyChunk = msgBody.substr(currentIndex, CONTENT_CHUNK_SIZE) if delimiterMatch return delimiterMatch[0] else return "\n" _CRLF_to_LF = (msgBody) -> delimiter = getDelimiter msgBody if delimiter == '\r\n' return [msgBody.replace(new RegExp(delimiter, 'g'), '\n'), true] return [msgBody, false] _restore_CRLF = (msgBody, replaced = true) -> if replaced return msgBody.replace(new RegExp('\n', 'g'), '\r\n') return msgBody