UNPKG

node-email-reply-parser

Version:

Parses emails to remove replies and other potentially unwanted data

229 lines (193 loc) 8.47 kB
/**
 * This file is part of node-email-reply-parser.
 * For the full license information, please see the LICENSE file distributed with this package.
 */

const Email = require("./Email");
const Fragment = require("./Fragment");
const reverse = require("lodash/reverse");
const filter = require("lodash/filter");
const esrever = require("esrever");

/**
 * Regular Expression to match signatures
 * @type {RegExp}
 */
const SIGNATURE_REGEX = /(?:^\s*--|^\s*__|^-\w|^-- $)|(?:^Sent from my (?:\s*\w+){1,4}$)|(?:^={30,}$)$/;

/**
 * Regular Expression to match quoted material.
 * It is applied to *reversed* lines, which is why the quote marker is anchored at the end.
 * @type {RegExp}
 */
const QUOTE_REGEX = />+$/;

/**
 * Regular Expressions for quoted reply headers ("On <date>, <someone> wrote:")
 * @type {RegExp[]}
 */
const QUOTE_HEADERS_REGEX = [
    /^\s*(On(?:(?!.*On\b|\bwrote:)[\s\S])+wrote:)$/m, // On DATE, NAME <EMAIL> wrote:
    /^\s*(Le(?:(?!.*Le\b|\bécrit:)[\s\S])+écrit(\s|\xc2\xa0):)$/m, // Le DATE, NAME <EMAIL> a écrit :
    /^\s*(El(?:(?!.*El\b|\bescribió:)[\s\S])+escribi(ó|eron):)$/m, // El DATE, NAME <EMAIL> escribió:
    /^\s*(Il(?:(?!.*Il\b|\bscritto:)[\s\S])+scritto:)$/m, // Il DATE, NAME <EMAIL> ha scritto:
    /^\s*(Op\s[\S\s]+?(schreef|geschreven)[\S\s]+:)$/m, // Op DATE schreef NAME <EMAIL>:, Op DATE heeft NAME <EMAIL> het volgende geschreven:
    /^\s*(Em(?:(?!.*Em\b|\bescreveu:)[\s\S])+escreveu:)$/m, // Em DATE, NAME <EMAIL> escreveu:
    /^\s*((W\sdniu|Dnia)\s[\S\s]+?(pisze|napisał(\(a\))?):)$/mu, // W dniu DATE, NAME <EMAIL> pisze|napisał:
    /^\s*(Den\s.+\sskrev\s.+:)$/m, // Den DATE skrev NAME <EMAIL>:
    /^\s*(Am\s.+\sum\s.+\sschrieb\s.+:)$/m, // Am DATE um TIME schrieb NAME:
    /^(在[\S\s]+写道:)$/m, // > 在 DATE, TIME, NAME 写道:
    /^(20[0-9]{2}\..+\s작성:)$/m, // DATE TIME NAME 작성:
    /^(20[0-9]{2}\/.+のメッセージ:)$/m, // DATE TIME、NAME のメッセージ:
    /^(.+\s<.+>\sschrieb:)$/m, // NAME <EMAIL> schrieb:
    /^\s*(From\s?:.+\s?(\[|<).+(\]|>))/mu, // "From: NAME <EMAIL>" OR "From : NAME <EMAIL>" OR "From : NAME<EMAIL>"(With support whitespace before start and before <)
    /^\s*(De\s?:.+\s?(\[|<).+(\]|>))/mu, // "De: NAME <EMAIL>" OR "De : NAME <EMAIL>" OR "De : NAME<EMAIL>" (With support whitespace before start and before <)
    /^\s*(Van\s?:.+\s?(\[|<).+(\]|>))/mu, // "Van: NAME <EMAIL>" OR "Van : NAME <EMAIL>" OR "Van : NAME<EMAIL>" (With support whitespace before start and before <)
    /^\s*(Da\s?:.+\s?(\[|<).+(\]|>))/mu, // "Da: NAME <EMAIL>" OR "Da : NAME <EMAIL>" OR "Da : NAME<EMAIL>" (With support whitespace before start and before <)
    /^(20[0-9]{2}-(?:0?[1-9]|1[012])-(?:0?[0-9]|[1-2][0-9]|3[01]|[1-9])\s[0-2]?[0-9]:\d{2}\s[\S\s]+?:)$/m, // 20YY-MM-DD HH:II GMT+01:00 NAME <EMAIL>:
    /^\s*([a-z]{3,4}\.[\s\S]+\sskrev[\s\S]+:)$/m, // DATE skrev NAME <EMAIL>:
    /^\s*(El(?:(?!^>*\s*El\b|\bha escrit:).){0,1000}ha escrit:)$/ms, // El DATE, NAME <EMAIL> ha escrit:
];

/**
 * Represents a fragment that hasn't been constructed (yet).
 * Lines are collected in reversed form (reversed characters, bottom-up order)
 * and only turned back into readable text by {@link FragmentDto#toFragment}.
 * @license MIT License
 */
class FragmentDto {
    /**
     * Creates a new fragment DTO
     */
    constructor() {
        this.lines = [];
        this.isHidden = false;
        this.isSignature = false;
        this.isQuoted = false;
    }

    /**
     * Converts this container to a real fragment
     * @returns {Fragment} the created fragment
     */
    toFragment() {
        // Reversing the joined text restores both the character order of every
        // line and the top-to-bottom order of the lines in a single pass.
        const content = esrever.reverse(this.lines.join('\n')).replace(/^\n/, '');
        return new Fragment(content, this.isHidden, this.isSignature, this.isQuoted);
    }
}

/**
 * Represents a parser to get fragments out of an email
 * @license MIT License
 */
class Parser {
    /**
     * Creates a new parser. If any of the parameters are not given then the defaults will be used
     * @param {RegExp} [signatureRegex] the regular expression used to match signatures
     * @param {RegExp} [quotedLineRegex] the regular expression used to match quoted lines
     * @param {RegExp[]} [quoteHeadersRegex] the regular expressions used to find quoted sections based on the header
     */
    constructor(signatureRegex, quotedLineRegex, quoteHeadersRegex) {
        this._signatureRegex = signatureRegex || SIGNATURE_REGEX;
        this._quotedLineRegex = quotedLineRegex || QUOTE_REGEX;
        this._quoteHeadersRegex = quoteHeadersRegex || QUOTE_HEADERS_REGEX;
    }

    /**
     * Parses an email body and converts it to Fragments.
     * @param {string} text the input text to parse
     * @returns {Email} the parsed email; an email without fragments when `text` is not a string
     */
    parse(text) {
        if (typeof text !== "string") return new Email([]);

        const fragments = [];

        // Normalise every CRLF. A string pattern would only replace the first
        // occurrence and leave stray "\r" characters on all following lines.
        text = text.replace(/\r\n/g, "\n");

        // Clean up quoted headers: collapse headers that wrap over several
        // lines into a single line so the line-based scan below can see them.
        for (const exp of this._quoteHeadersRegex) {
            const matches = text.match(exp);
            if (matches && matches.length >= 2) {
                const header = matches[1];
                const flattened = header.replace(/\n/g, ' ');
                // A replacer function is used so that "$&", "$$", "$'" etc. inside
                // the (untrusted) header are not treated as substitution patterns.
                text = text.replace(header, () => flattened);
            }
        }

        // The text is walked bottom-up on the reversed string: signatures and
        // quoted blocks are recognised by what follows them in the email.
        let fragment = null;
        for (let line of esrever.reverse(text).split('\n')) {
            if (!this._isSignature(line)) {
                // Left trim of the reversed line, i.e. trailing whitespace of the
                // original line. Signature lines are kept intact ("-- ").
                line = line.replace(/^\s+/, '');
            }

            if (fragment) {
                // The previously collected line decides whether the fragment
                // gathered so far is complete (signature or quote header reached).
                const lastLine = fragment.lines[fragment.lines.length - 1];
                if (this._isSignature(lastLine)) {
                    fragment.isSignature = true;
                    this._addFragment(fragment, fragments);
                    fragment = null;
                } else if (this._isQuoteHeader(lastLine)) {
                    fragment.isQuoted = true;
                    this._addFragment(fragment, fragments);
                    fragment = null;
                }
            }

            const isQuoted = this._isQuote(line);
            if (fragment === null || !this._isFragmentLine(fragment, line, isQuoted)) {
                if (fragment) {
                    this._addFragment(fragment, fragments);
                }

                fragment = new FragmentDto();
                fragment.isQuoted = isQuoted;
            }

            fragment.lines.push(line);
        }

        if (fragment) {
            this._addFragment(fragment, fragments);
        }

        const emailFragments = [];
        for (const frag of fragments) {
            emailFragments.push(frag.toFragment());
        }

        // Fragments were collected bottom-up; restore reading order.
        return new Email(reverse(emailFragments));
    }

    /**
     * Adds a fragment to the given collection, doing some basic state processing on it before pushing.
     * Quoted, signature and empty fragments are marked as hidden.
     * @param {FragmentDto} fragment the fragment to add and update the state of
     * @param {FragmentDto[]} collection the collection to add the fragment to
     * @private
     */
    _addFragment(fragment, collection) {
        if (fragment.isQuoted || fragment.isSignature || fragment.lines.join('').length === 0) {
            fragment.isHidden = true;
        }

        collection.push(fragment);
    }

    /**
     * Determines if the given line is part of the given fragment
     * @param {FragmentDto} fragment the fragment to check
     * @param {string} line the (reversed) line to check
     * @param {boolean} isQuoted true if the line is quoted, false otherwise
     * @returns {boolean} true if the line belongs to the fragment
     * @private
     */
    _isFragmentLine(fragment, line, isQuoted) {
        if (fragment.isQuoted === isQuoted) return true;

        if (fragment.isQuoted) {
            // Quote headers and blank lines stay attached to a quoted fragment.
            return this._isQuoteHeader(line) || line.length === 0;
        }

        return false;
    }

    /**
     * Determines if the given line is a signature
     * @param {string} text the (reversed) text to check
     * @returns {boolean} true if the line is a signature
     * @private
     */
    _isSignature(text) {
        return this._signatureRegex.test(esrever.reverse(text));
    }

    /**
     * Determines if the given line is a quoted line
     * @param {string} text the (reversed) text to check
     * @returns {boolean} true if the line is a quoted line
     * @private
     */
    _isQuote(text) {
        return this._quotedLineRegex.test(text);
    }

    /**
     * Determines if the given line is a quote header
     * @param {string} text the (reversed) text to check
     * @returns {boolean} true if the line is a quote header
     * @private
     */
    _isQuoteHeader(text) {
        // Reverse once instead of once per candidate expression.
        const forward = esrever.reverse(text);
        return filter(this._quoteHeadersRegex, exp => exp.test(forward)).length > 0;
    }
}

module.exports = Parser;