/*
 * whatsapp-chat-parser
 * Version:
 * A package to parse WhatsApp chats with Node.js or in the browser 💬
 * 328 lines (319 loc) • 12.7 kB
 * JavaScript
 */
(function (global, factory) {
typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) :
typeof define === 'function' && define.amd ? define(['exports'], factory) :
(global = typeof globalThis !== 'undefined' ? globalThis : global || self, factory(global.whatsappChatParser = {}));
}(this, (function (exports) { 'use strict';
/**
 * Builds a predicate that tells whether the element stored at position
 * `index` of an array is strictly greater than `value`.
 *
 * @param index Position to inspect in the array handed to the predicate.
 * @param value Threshold the inspected element is compared against.
 * @returns A function that takes an array and returns the comparison result.
 */
function indexAboveValue(index, value) {
  return function isAboveThreshold(candidate) {
    const element = candidate[index];
    return element > value;
  };
}
/**
 * Tells whether a number lies strictly below zero.
 *
 * Both `0` and `-0` are reported as not negative.
 *
 * @param number Value to check.
 * @returns `true` when `number` is less than zero, `false` otherwise.
 */
function isNegative(number) {
  return 0 > number;
}
/**
 * Takes an array of arrays and an index and groups the inner arrays by the
 * value found at that index.
 *
 * Groups are returned in the order in which their value is first met, and the
 * items inside each group keep their original relative order. Values are
 * compared by their string form, so `1` and `'1'` end up in the same group.
 *
 * @param array Array of arrays to group.
 * @param index Position, inside every inner array, of the value to group by.
 * @returns An array of groups, each group being an array of inner arrays.
 * @see `utils.test.ts` for a better understanding of this function.
 */
function groupArrayByValueAtIndex(array, index) {
  /*
   * A `Map` iterates in insertion order whatever the keys look like (a plain
   * object would move integer-like keys to the front) and has no inherited
   * keys such as `__proto__` that could clash with a grouped value.
   */
  const groups = new Map();
  array.forEach((item) => {
    const key = `${item[index]}`;
    const group = groups.get(key);
    if (group) {
      group.push(item);
    } else {
      groups.set(key, [item]);
    }
  });
  return Array.from(groups.values());
}
/**
 * Tries to work out whether days are written before months by looking for a
 * number greater than `12`, which cannot be a month.
 *
 * @param numericDates Array of dates, each one an array of numbers whose
 * first two entries are the day and the month in an unknown order.
 * @returns `true` if days come first, `false` if they come second, `null` if
 * no number above `12` was found in either position.
 */
function checkAbove12(numericDates) {
  // A first component above 12 cannot be a month, so it has to be the day
  if (numericDates.some(indexAboveValue(0, 12))) {
    return true;
  }
  // Same reasoning applied to the second component
  return numericDates.some(indexAboveValue(1, 12)) ? false : null;
}
/**
 * Tries to work out whether days are written before months by checking, year
 * by year, which of the two leading numbers decreases from one date to the
 * next.
 *
 * Months can only increase during a given year, so a number that decreases is
 * probably the day.
 *
 * @param numericDates Array of dates, each one an array of numbers with the
 * year at index `2`.
 * @returns `true` if days come first, `false` if they come second, `null` if
 * neither number ever decreases within a year.
 */
function checkDecreasing(numericDates) {
  // Does the number at `position` ever drop between two consecutive dates?
  const hasDrop = (dates, position) => {
    for (let i = 1; i < dates.length; i += 1) {
      if (isNegative(dates[i][position] - dates[i - 1][position])) {
        return true;
      }
    }
    return false;
  };

  // One verdict per year; the first position is inspected before the second
  const verdicts = groupArrayByValueAtIndex(numericDates, 2).map((sameYearDates) => {
    if (hasDrop(sameYearDates, 0)) {
      return true;
    }
    if (hasDrop(sameYearDates, 1)) {
      return false;
    }
    return null;
  });

  // A single "days first" verdict wins over any "days second" verdict
  if (verdicts.includes(true)) {
    return true;
  }
  if (verdicts.includes(false)) {
    return false;
  }
  return null;
}
/**
 * Tries to work out whether days are written before months by measuring which
 * of the two leading numbers changes the most from one date to the next.
 *
 * The absolute differences between consecutive dates are summed separately
 * for the first and the second number; the larger total is taken to be the
 * day.
 *
 * @param numericDates Array of dates, each one an array of numbers.
 * @returns `true` if days come first, `false` if they come second, `null` if
 * the two totals cannot be told apart.
 */
function changeFrequencyAnalysis(numericDates) {
  let firstTotal = 0;
  let secondTotal = 0;
  for (let i = 1; i < numericDates.length; i += 1) {
    const previous = numericDates[i - 1];
    const current = numericDates[i];
    firstTotal += Math.abs(previous[0] - current[0]);
    secondTotal += Math.abs(previous[1] - current[1]);
  }
  if (firstTotal > secondTotal) {
    return true;
  }
  if (firstTotal < secondTotal) {
    return false;
  }
  return null;
}
/**
 * Tries to work out whether days are written before months by running the
 * dates through a series of checks, stopping at the first conclusive one.
 *
 * @param numericDates Array of dates, each one an array of numbers.
 * @returns `true` if days come first, `false` if they come second, `null` if
 * every check was inconclusive.
 */
function daysBeforeMonths(numericDates) {
  // Checks are tried in this order; a later one only runs if needed
  const decisiveChecks = [checkAbove12, checkDecreasing];
  for (const check of decisiveChecks) {
    const verdict = check(numericDates);
    if (verdict !== null) {
      return verdict;
    }
  }
  // Last resort: its answer is returned as is, even when inconclusive
  return changeFrequencyAnalysis(numericDates);
}
/**
 * Left-pads the textual components of a date so that the year has at least
 * `4` characters and both the month and the day have at least `2`.
 *
 * @param year Year as a string.
 * @param month Month as a string.
 * @param day Day as a string.
 * @returns The padded `[year, month, day]` strings.
 */
function normalizeDate(year, month, day) {
  const zeroPad = (part) => part.padStart(2, '0');
  // Padding with '2000' places 2 digit years in the 2000-2099 range
  const fullYear = year.padStart(4, '2000');
  return [fullYear, zeroPad(month), zeroPad(day)];
}
/**
 * Splits a date string into its three components and moves the longest one
 * to the end, so that a 4 digit year always comes last.
 *
 * When the last component is already (one of) the longest, the order is left
 * untouched.
 *
 * @param date Date string whose components are separated by `-`, `/` or `.`,
 * each separator being optionally followed by a space.
 * @returns The three components as strings, longest one last.
 */
function orderDateComponents(date) {
  const parts = date.split(/[-/.] ?/);
  const [head, middle, tail] = parts;
  const longest = Math.max(head.length, middle.length, tail.length);
  if (tail.length === longest) {
    return [head, middle, tail];
  }
  return middle.length === longest
    ? [head, tail, middle]
    : [middle, tail, head];
}
const regexSplitTime = /[:.]/;
/**
 * Converts a time expressed in 12 hour format into 24 hour format.
 *
 * The hours are not zero-padded by this function and the seconds are only
 * present in the output when they are present in the input.
 *
 * Reference:
 * {@link https://stackoverflow.com/a/40197728/5303634}
 *
 * @param time Time string with components separated by `:` or `.`.
 * @param ampm Either `'AM'` or `'PM'`; only an exact `'PM'` adds 12 hours.
 * @returns The converted time, with components joined by `:`.
 */
function convertTime12to24(time, ampm) {
  const [rawHours, minutes, seconds] = time.split(regexSplitTime);
  // 12 AM is midnight and 12 PM is noon: in both cases counting restarts at 0
  const baseHours = rawHours === '12' ? '00' : rawHours;
  const hours = ampm === 'PM' ? String(parseInt(baseHours, 10) + 12) : baseHours;
  const secondsSuffix = seconds ? `:${seconds}` : '';
  return `${hours}:${minutes}${secondsSuffix}`;
}
/**
 * Rewrites a time string as `hh:mm:ss`: the hours are zero-padded to two
 * characters, the minutes are kept as they are and missing seconds default
 * to `00`.
 *
 * @param time Time string with components separated by `:` or `.`.
 * @returns The normalized time, with components joined by `:`.
 */
function normalizeTime(time) {
  const parts = time.split(regexSplitTime);
  const hours = parts[0].padStart(2, '0');
  const minutes = parts[1];
  const seconds = parts[2] || '00';
  return `${hours}:${minutes}:${seconds}`;
}
/**
 * Reduces an am/pm marker such as `am`, `a.m.` or `P. M.` to its bare
 * uppercase letters (`AM` / `PM`).
 *
 * @param ampm Marker as found in the chat log.
 * @returns The marker stripped of every character other than `a`, `p` and
 * `m` (in any case), converted to uppercase.
 */
function normalizeAMPM(ampm) {
  const lettersOnly = ampm.replace(/[^apm]/gi, '');
  return lettersOnly.toUpperCase();
}
/**
 * Matches the first line of a message written by a chat participant.
 *
 * Expected layout, in order:
 * - optional left-to-right / right-to-left marks (U+200E / U+200F) and an
 *   optional opening square bracket;
 * - the date: three groups of 1 to 4 digits separated by `-`, `/` or `.`
 *   (each separator may be followed by a space), optionally followed by `,`
 *   or `.`;
 * - a space, then any run of non-digit characters;
 * - the time: hours and minutes, optionally followed by seconds, separated
 *   by `.` or `:`;
 * - an optional am/pm marker preceded by a space (`am`, `a.m.`, `p m`, ...);
 * - an optional closing square bracket, an optional ` -` or `:`, a space;
 * - the author: the shortest text that is followed by a colon and a space;
 * - the message, which may be empty and may contain line breaks.
 *
 * The match is case-insensitive.
 *
 * Capture groups: 1 = date, 2 = time, 3 = am/pm marker (undefined when
 * absent), 4 = author, 5 = message.
 */
const regexParser = /^(?:\u200E|\u200F)*\[?(\d{1,4}[-/.] ?\d{1,4}[-/.] ?\d{1,4})[,.]? \D*?(\d{1,2}[.:]\d{1,2}(?:[.:]\d{1,2})?)(?: ([ap]\.?\s?m\.?))?\]?(?: -|:)? (.+?): ([^]*)/i;
/**
 * Matches the first line of a message that has no author (system message).
 *
 * The date / time prefix is the same as in `regexParser`, but everything
 * after it is captured as the message, which must contain at least one
 * character and may contain line breaks.
 *
 * Since it puts no constraint on the author, this pattern is looser than
 * `regexParser`; `makeArrayOfMessages` only tries it on lines that
 * `regexParser` rejected.
 *
 * Capture groups: 1 = date, 2 = time, 3 = am/pm marker (undefined when
 * absent), 4 = message.
 */
const regexParserSystem = /^(?:\u200E|\u200F)*\[?(\d{1,4}[-/.] ?\d{1,4}[-/.] ?\d{1,4})[,.]? \D*?(\d{1,2}[.:]\d{1,2}(?:[.:]\d{1,2})?)(?: ([ap]\.?\s?m\.?))?\]?(?: -|:)? ([^]+)/i;
/**
 * Finds an attachment reference inside a message. Two notations are
 * recognized:
 * - some text and a colon followed by the file name, all wrapped in angle
 *   brackets; the file name is captured in group 1;
 * - a file name made of uppercase letters, digits and hyphens, then a dot
 *   and an extension, followed by a whitespace character and some text in
 *   parentheses; the file name is captured in group 2.
 *
 * Unlike the two patterns above, this one is case-sensitive.
 */
const regexAttachment = /<.+:(.+)>|([A-Z\d-]+\.\w+)\s\(.+\)/;
/**
 * Turns the lines of a chat log into an array of raw messages of the form
 * `{ system, msg }`.
 *
 * - A line matching `regexParser` starts a new message from a participant.
 * - A line matching only `regexParserSystem` starts a new message without an
 *   author (a "WhatsApp event"), which gets labeled as a system message.
 * - Any other line belongs to the previous message (multiline messages) and
 *   is appended to it after a line break. Such a line is dropped when there
 *   is no previous message.
 *
 * @param lines Lines of the chat log.
 * @returns The raw messages, in the order they appear in the log.
 */
function makeArrayOfMessages(lines) {
  const messages = [];
  lines.forEach((line) => {
    if (regexParser.test(line)) {
      messages.push({ system: false, msg: line });
      return;
    }
    if (regexParserSystem.test(line)) {
      messages.push({ system: true, msg: line });
      return;
    }
    // Continuation line: glue it to the message it belongs to, if any
    const lastIndex = messages.length - 1;
    const previous = messages[lastIndex];
    if (typeof previous === 'undefined') {
      return;
    }
    messages[lastIndex] = {
      system: previous.system,
      msg: `${previous.msg}\n${line}`,
    };
  });
  return messages;
}
/**
 * Looks for an attachment reference inside a message.
 *
 * @param message Text of the message.
 * @returns An object holding the trimmed `fileName` of the attachment, or
 * `null` when the message does not reference any attachment.
 */
function parseMessageAttachment(message) {
  const found = message.match(regexAttachment);
  if (found === null) {
    return null;
  }
  // Only one of the two groups is set, depending on the notation that matched
  const [, bracketedName, plainName] = found;
  const fileName = (bracketedName || plainName).trim();
  return { fileName };
}
/**
 * Parses an array of raw messages into an array of structured objects.
 *
 * @param messages Raw messages of the form `{ system, msg }`, as produced by
 * `makeArrayOfMessages`.
 * @param options Parsing options:
 * - `daysFirst`: `true` if days are written before months, `false` for the
 *   opposite; any non-boolean value triggers automatic detection;
 * - `parseAttachments`: when truthy, attachment references are extracted.
 * @returns One object per message with a `date` (built from local date and
 * time components), an `author` (`'System'` for messages without an author),
 * the `message` text and, when requested and found, an `attachment`.
 */
function parseMessages(messages, options = {}) {
  let dayComesFirst = options.daysFirst;
  const shouldParseAttachments = options.parseAttachments;

  // Step 1: split every raw message into its textual components
  const entries = messages.map((raw) => {
    // Messages without an author need the dedicated regex
    const pattern = raw.system ? regexParserSystem : regexParser;
    const [, date, time, ampm, ...rest] = pattern.exec(raw.msg);
    const [author, message] = raw.system ? ['System', rest[0]] : rest;
    return { date, time, ampm: ampm || null, author, message };
  });

  // Step 2: guess the date format if it was not supplied
  if (typeof dayComesFirst !== 'boolean') {
    const distinctDates = new Set(entries.map((entry) => entry.date));
    const numericDates = Array.from(
      distinctDates,
      (date) => orderDateComponents(date).map(Number),
    );
    dayComesFirst = daysBeforeMonths(numericDates);
  }

  // Step 3: build the final objects, converting date and time into a `Date`
  return entries.map((entry) => {
    const [leading, following, rawYear] = orderDateComponents(entry.date);
    // Days are assumed to come first unless explicitly determined otherwise
    const monthLeads = dayComesFirst === false;
    const [year, month, day] = normalizeDate(
      rawYear,
      monthLeads ? leading : following,
      monthLeads ? following : leading,
    );

    const time24 = entry.ampm
      ? convertTime12to24(entry.time, normalizeAMPM(entry.ampm))
      : entry.time;
    const [hours, minutes, seconds] = normalizeTime(time24).split(regexSplitTime);

    const result = {
      // Months are zero-based in the `Date` constructor
      date: new Date(
        Number(year),
        Number(month) - 1,
        Number(day),
        Number(hours),
        Number(minutes),
        Number(seconds),
      ),
      author: entry.author,
      message: entry.message,
    };

    if (shouldParseAttachments) {
      const attachment = parseMessageAttachment(entry.message);
      if (attachment) {
        result.attachment = attachment;
      }
    }
    return result;
  });
}
/*
 * Matches one line break: CRLF, a lone CR or a lone LF. CRLF is listed first
 * so that it is consumed as a single break rather than as two.
 * Used by `parseString` and `parseStringSync` to split a chat log into lines.
 */
const newlinesRegex = /(?:\r\n|\r|\n)/;
/**
 * Parses a string containing a WhatsApp chat log.
 *
 * The work is done in three chained steps: splitting the log into lines,
 * merging the lines into raw messages and parsing the raw messages.
 *
 * @param string The chat log.
 * @param options Options forwarded to `parseMessages`; attachments are not
 * parsed by default.
 * @returns A promise that will contain the parsed messages. An error thrown
 * by any of the steps rejects the promise.
 *
 * @since 1.2.0
 */
function parseString(string, options = { parseAttachments: false }) {
  const splitIntoLines = (text) => text.split(newlinesRegex);
  const toStructuredMessages = (rawMessages) => parseMessages(rawMessages, options);
  return Promise.resolve(string)
    .then(splitIntoLines)
    .then(makeArrayOfMessages)
    .then(toStructuredMessages);
}
/**
 * Synchronously parses a string containing a WhatsApp chat log.
 *
 * @param string The chat log.
 * @param options Options forwarded to `parseMessages`; attachments are not
 * parsed by default.
 * @returns An array of parsed messages.
 *
 * @since 3.2.0
 */
function parseStringSync(string, options = { parseAttachments: false }) {
  const rawMessages = makeArrayOfMessages(string.split(newlinesRegex));
  return parseMessages(rawMessages, options);
}
exports.parseString = parseString;
exports.parseStringSync = parseStringSync;
Object.defineProperty(exports, '__esModule', { value: true });
})));
//# sourceMappingURL=whatsapp-chat-parser.js.map