UNPKG

whatsapp-chat-parser

Version:

A package to parse WhatsApp chats with Node.js or in the browser 💬

328 lines (319 loc) • 12.7 kB
(function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : typeof define === 'function' && define.amd ? define(['exports'], factory) : (global = typeof globalThis !== 'undefined' ? globalThis : global || self, factory(global.whatsappChatParser = {})); }(this, (function (exports) { 'use strict'; /** * Checks that the number at a certain index of an array is greater than a * certain value. */ function indexAboveValue(index, value) { return (array) => array[index] > value; } /** * Returns `true` for a negative number, `false` otherwise. * * `0` and `-0` are considered positive. */ function isNegative(number) { return number < 0; } /** * Takes an array of arrays and an index and groups the inner arrays by the * value at the index provided. * @see `utils.test.ts` for a better understanding of this function. */ function groupArrayByValueAtIndex(array, index) { return Object.values(array.reduce((obj, item) => { /* * Keys that are numbers (even strings containing a number) get sorted * when using `Object.values()`. * Adding a prefix avoids this issue. */ const key = `_${item[index]}`; if (!obj[key]) obj[key] = []; obj[key].push(item); return obj; }, {})); } /** * Takes an array of numeric dates and tries to understand if the days come * before the month or the other way around by checking if numbers go above * `12`. * * Output is `true` if days are first, `false` if they are second, or `null` if * it failed to understand the order. */ function checkAbove12(numericDates) { const daysFirst = numericDates.some(indexAboveValue(0, 12)); if (daysFirst) return true; const daysSecond = numericDates.some(indexAboveValue(1, 12)); if (daysSecond) return false; return null; } /** * Takes an array of numeric dates and tries to understand if the days come * before the month or the other way around by checking if a set of numbers * during the same year decrease at some point. * * If it does it's probably the days since months can only increase in a given * year. * * Output is `true` if days are first, `false` if they are second, or `null` if * it failed to understand the order. */ function checkDecreasing(numericDates) { const datesByYear = groupArrayByValueAtIndex(numericDates, 2); const results = datesByYear.map(dates => { const daysFirst = dates.slice(1).some((date, i) => { const [first1] = dates[i]; const [first2] = date; return isNegative(first2 - first1); }); if (daysFirst) return true; const daysSecond = dates.slice(1).some((date, i) => { const [, second1] = dates[i]; const [, second2] = date; return isNegative(second2 - second1); }); if (daysSecond) return false; return null; }); const anyTrue = results.some(value => value === true); if (anyTrue) return true; const anyFalse = results.some(value => value === false); if (anyFalse) return false; return null; } /** * Takes an array of numeric dates and tries to understand if the days come * before the month or the other way around by looking at which number changes * more frequently. * * Output is `true` if days are first, `false` if they are second, or `null` if * it failed to understand the order. */ function changeFrequencyAnalysis(numericDates) { const diffs = numericDates .slice(1) .map((date, i) => date.map((num, j) => Math.abs(numericDates[i][j] - num))); const [first, second] = diffs.reduce((total, diff) => { const [first1, second1] = total; const [first2, second2] = diff; return [first1 + first2, second1 + second2]; }, [0, 0]); if (first > second) return true; if (first < second) return false; return null; } /** * Takes an array of numeric dates and tries to understand if the days come * before the month or the other way around by running the dates through various * checks. * * Output is `true` if days are first, `false` if they are second, or `null` if * it failed to understand the order. */ function daysBeforeMonths(numericDates) { const firstCheck = checkAbove12(numericDates); if (firstCheck !== null) return firstCheck; const secondCheck = checkDecreasing(numericDates); if (secondCheck !== null) return secondCheck; return changeFrequencyAnalysis(numericDates); } /** * Takes `year`, `month` and `day` as strings and pads them to `4`, `2`, `2` * digits respectively. */ function normalizeDate(year, month, day) { return [ // 2 digit years are assumed to be in the 2000-2099 range year.padStart(4, '2000'), month.padStart(2, '0'), day.padStart(2, '0'), ]; } /** * Pushes the longest number in a date to the end, if there is one. Necessary to * ensure the year is the last number. */ function orderDateComponents(date) { const regexSplitDate = /[-/.] ?/; const [a, b, c] = date.split(regexSplitDate); const maxLength = Math.max(a.length, b.length, c.length); if (c.length === maxLength) return [a, b, c]; if (b.length === maxLength) return [a, c, b]; return [b, c, a]; } const regexSplitTime = /[:.]/; /** * Converts time from 12 hour format to 24 hour format. * * Reference: * {@link https://stackoverflow.com/a/40197728/5303634} */ function convertTime12to24(time, ampm) { // eslint-disable-next-line prefer-const let [hours, minutes, seconds] = time.split(regexSplitTime); if (hours === '12') hours = '00'; if (ampm === 'PM') hours = String(parseInt(hours, 10) + 12); return `${hours}:${minutes}${seconds ? `:${seconds}` : ''}`; } /** * Normalizes a time string to have the following format: `hh:mm:ss`. */ function normalizeTime(time) { const [hours, minutes, seconds] = time.split(regexSplitTime); return `${hours.padStart(2, '0')}:${minutes}:${seconds || '00'}`; } /** * Normalizes `am` / `a.m.` / etc. to `AM` (uppercase, no other characters). */ function normalizeAMPM(ampm) { return ampm.replace(/[^apm]/gi, '').toUpperCase(); } const regexParser = /^(?:\u200E|\u200F)*\[?(\d{1,4}[-/.] ?\d{1,4}[-/.] ?\d{1,4})[,.]? \D*?(\d{1,2}[.:]\d{1,2}(?:[.:]\d{1,2})?)(?: ([ap]\.?\s?m\.?))?\]?(?: -|:)? (.+?): ([^]*)/i; const regexParserSystem = /^(?:\u200E|\u200F)*\[?(\d{1,4}[-/.] ?\d{1,4}[-/.] ?\d{1,4})[,.]? \D*?(\d{1,2}[.:]\d{1,2}(?:[.:]\d{1,2})?)(?: ([ap]\.?\s?m\.?))?\]?(?: -|:)? ([^]+)/i; const regexAttachment = /<.+:(.+)>|([A-Z\d-]+\.\w+)\s\(.+\)/; /** * Takes an array of lines and detects the lines that are part of a previous * message (multiline messages) and merges them. * * It also labels messages without an author as system messages. */ function makeArrayOfMessages(lines) { return lines.reduce((acc, line) => { /* * If the line doesn't match the regex it's probably part of the previous * message or a "WhatsApp event" */ if (!regexParser.test(line)) { /* * If it doesn't match the first regex but still matches the system regex * it should be considered a "WhatsApp event" so it gets labeled "system" */ if (regexParserSystem.test(line)) { acc.push({ system: true, msg: line }); } // Else it's part of the previous message and it should be concatenated else if (typeof acc[acc.length - 1] !== 'undefined') { // eslint-disable-next-line @typescript-eslint/no-non-null-assertion const prevMessage = acc.pop(); acc.push({ system: prevMessage.system, msg: `${prevMessage.msg}\n${line}`, }); } } else { acc.push({ system: false, msg: line }); } return acc; }, []); } /** * Parses a message extracting the attachment if it's present. */ function parseMessageAttachment(message) { const attachmentMatch = message.match(regexAttachment); if (!attachmentMatch) return null; return { fileName: (attachmentMatch[1] || attachmentMatch[2]).trim(), }; } /** * Parses and array of raw messages into an array of structured objects. */ function parseMessages(messages, options = {}) { let { daysFirst } = options; const { parseAttachments } = options; // Parse messages with regex const parsed = messages.map(obj => { const { system, msg } = obj; // If it's a system message another regex should be used to parse it if (system) { const [, date, time, ampm, message] = regexParserSystem.exec(msg); return { date, time, ampm: ampm || null, author: 'System', message }; } const [, date, time, ampm, author, message] = regexParser.exec(msg); return { date, time, ampm: ampm || null, author, message }; }); // Understand date format if not supplied (do days come first?) if (typeof daysFirst !== 'boolean') { const numericDates = Array.from(new Set(parsed.map(({ date }) => date)), date => orderDateComponents(date).map(Number)); daysFirst = daysBeforeMonths(numericDates); } // Convert date and time in a `Date` object, return the final object return parsed.map(({ date, time, ampm, author, message }) => { let day; let month; let year; const splitDate = orderDateComponents(date); if (daysFirst === false) { [month, day, year] = splitDate; } else { [day, month, year] = splitDate; } [year, month, day] = normalizeDate(year, month, day); const [hours, minutes, seconds] = normalizeTime(ampm ? convertTime12to24(time, normalizeAMPM(ampm)) : time).split(regexSplitTime); const finalObject = { date: new Date(+year, +month - 1, +day, +hours, +minutes, +seconds), author, message, }; // Optionally parse attachments if (parseAttachments) { const attachment = parseMessageAttachment(message); if (attachment) finalObject.attachment = attachment; } return finalObject; }); } const newlinesRegex = /(?:\r\n|\r|\n)/; /** * Parses a string containing a WhatsApp chat log. * * Returns a promise that will contain the parsed messages. * * @since 1.2.0 */ function parseString(string, options = { parseAttachments: false }) { return Promise.resolve(string) .then(data => data.split(newlinesRegex)) .then(makeArrayOfMessages) .then(messages => parseMessages(messages, options)); } /** * Parses a string containing a WhatsApp chat log. * * Returns an array of parsed messages. * * @since 3.2.0 */ function parseStringSync(string, options = { parseAttachments: false }) { const lines = string.split(newlinesRegex); return parseMessages(makeArrayOfMessages(lines), options); } exports.parseString = parseString; exports.parseStringSync = parseStringSync; Object.defineProperty(exports, '__esModule', { value: true }); }))); //# sourceMappingURL=whatsapp-chat-parser.js.map