xenforo-dl
Version:
XenForo Forum Downloader
217 lines • 9.98 kB
JavaScript
/**
 * Compiler-emitted helper that writes a downlevelled private class member.
 *
 * @param receiver Object whose private member is being written.
 * @param state    WeakMap/WeakSet-like store keyed by instance, or the class constructor itself for statics.
 * @param value    Value to store.
 * @param kind     "f" for a field, "a" for an accessor, "m" for a method.
 * @param f        Setter function (kind "a") or `{ value }` slot (static field); omitted for instance fields.
 * @returns The value that was written.
 * @throws {TypeError} When the member is a method, the accessor has no setter, or `receiver` was never registered.
 */
var __classPrivateFieldSet = (this && this.__classPrivateFieldSet) || function (receiver, state, value, kind, f) {
    if (kind === "m") {
        throw new TypeError("Private method is not writable");
    }
    const isAccessor = kind === "a";
    if (isAccessor && !f) {
        throw new TypeError("Private accessor was defined without a setter");
    }
    // Statics are keyed by the constructor itself; instances by membership in the store.
    const isDeclared = typeof state === "function"
        ? receiver === state && Boolean(f)
        : state.has(receiver);
    if (!isDeclared) {
        throw new TypeError("Cannot write private member to an object whose class did not declare it");
    }
    if (isAccessor) {
        f.call(receiver, value);
    }
    else if (f) {
        f.value = value;
    }
    else {
        state.set(receiver, value);
    }
    return value;
};
/**
 * Compiler-emitted helper that reads a downlevelled private class member.
 *
 * @param receiver Object whose private member is being read.
 * @param state    WeakMap/WeakSet-like store keyed by instance, or the class constructor itself for statics.
 * @param kind     "f" for a field, "a" for an accessor, "m" for a method.
 * @param f        Method function (kind "m"), getter (kind "a") or `{ value }` slot (static field).
 * @returns The method itself, the getter's result, or the stored field value.
 * @throws {TypeError} When the accessor has no getter or `receiver` was never registered.
 */
var __classPrivateFieldGet = (this && this.__classPrivateFieldGet) || function (receiver, state, kind, f) {
    if (kind === "a" && !f) {
        throw new TypeError("Private accessor was defined without a getter");
    }
    // Statics are keyed by the constructor itself; instances by membership in the store.
    const isDeclared = typeof state === "function"
        ? receiver === state && Boolean(f)
        : state.has(receiver);
    if (!isDeclared) {
        throw new TypeError("Cannot read private member from an object whose class did not declare it");
    }
    switch (kind) {
        case "m":
            return f;
        case "a":
            return f.call(receiver);
        default:
            return f ? f.value : state.get(receiver);
    }
};
// Module-level holders for Parser's downlevelled private members. They are declared here so the
// class body can reference them; each one is assigned after the class definition.
var _Parser_instances, _Parser_logger, _Parser_findForumLinks, _Parser_parseNav, _Parser_checkNumber, _Parser_htmlToText;
import { convert as htmlToText } from 'html-to-text';
import { load as cheerioLoad } from 'cheerio';
import { commonLog } from '../utils/logging/Logger.js';
import { trimNewlines } from 'trim-newlines';
import URLHelper from '../utils/URLHelper.js';
/**
 * Turns forum HTML pages into plain data objects: thread pages (messages and
 * attachments), forum pages (subforums and threads) and generic pages (forum links).
 */
export default class Parser {
    /**
     * @param logger Logger forwarded to `commonLog` for every message this parser emits.
     */
    constructor(logger) {
        _Parser_instances.add(this);
        this.name = 'Parser';
        _Parser_logger.set(this, void 0);
        __classPrivateFieldSet(this, _Parser_logger, logger, "f");
    }

    /**
     * Sends a message to the configured logger, tagged with this parser's name.
     *
     * @param level Log level passed through to `commonLog`.
     * @param msg   Message parts passed through to `commonLog`.
     */
    log(level, ...msg) {
        const logger = __classPrivateFieldGet(this, _Parser_logger, "f");
        commonLog(logger, level, this.name, ...msg);
    }

    /**
     * Parses one page of a thread.
     *
     * @param html      Page markup.
     * @param originURL URL the markup came from; only used in error messages.
     * @returns Thread `id`, canonical `url`, `breadcrumbs`, `title`, `messages`, plus the
     *          page-navigation fields produced by the private nav parser.
     * @throws {Error} When the thread ID, canonical URL or title cannot be found.
     */
    parseThreadPage(html, originURL) {
        const $ = cheerioLoad(html);
        const toText = (markup) => __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_htmlToText).call(this, markup);

        const contentKey = $('html').attr('data-content-key') || '';
        const id = contentKey.startsWith('thread-') ? Number(contentKey.substring(7)) : null;
        if (!id) {
            throw Error(`Failed to obtain thread ID from "${originURL}"`);
        }

        const siteName = $('meta[property="og:site_name"]').attr('content');
        const url = $('link[rel="canonical"]').attr('href') || '';
        const title = $('meta[property="og:title"]').attr('content') || '';
        if (!url || !title) {
            throw Error(`Failed to obtain 'url' and 'title' from "${originURL}"`);
        }

        const crumbItems = $('ul.p-breadcrumbs').first().find('li[itemprop="itemListElement"]');
        const breadcrumbs = crumbItems
            .map((position, node) => {
                const anchor = $(node).find('a[itemprop="item"]');
                const href = anchor.attr('href');
                const label = toText(anchor.html());
                if (!href || !label) {
                    return null;
                }
                const crumbURL = new URL(href, url).toString();
                // The first crumb is labelled with the site name (or the host as a fallback).
                const crumbTitle = position > 0 ? label : siteName || new URL(url).host;
                return { url: crumbURL, title: crumbTitle };
            })
            .toArray()
            .filter((crumb) => crumb !== null);

        const messages = $('article.message')
            .map((_position, node) => {
                const messageEl = $(node);
                const author = messageEl.attr('data-author');
                const postKey = messageEl.find('div.message-userContent').attr('data-lb-id') || '';
                const postID = postKey.startsWith('post-') ? Number(postKey.substring(5)) : null;
                if (!postID) {
                    this.log('warn', 'Message skipped: failed to obtain ID.');
                    return null;
                }
                const index = messageEl.find('ul.message-attribution-opposite li').last().text().trim();

                const attachmentLinks = messageEl
                    .find('a')
                    .map((_linkPosition, linkNode) => {
                        const linkEl = $(linkNode);
                        const href = linkEl.attr('href');
                        if (!href) {
                            return null;
                        }
                        const matches = /\/attachments\/(.+)\.(\d+)/g.exec(href);
                        if (!matches || isNaN(Number(matches[2]))) {
                            return null;
                        }
                        const imgEl = linkEl.find('img');
                        return {
                            id: Number(matches[2]),
                            url: new URL(href, url).toString(),
                            filename: imgEl.attr('alt') || imgEl.attr('title'),
                            el: linkEl
                        };
                    })
                    .toArray()
                    .filter((link) => link !== null);

                // Detach attachment anchors first so they do not end up in the message body text.
                for (const { el: linkEl } of attachmentLinks) {
                    linkEl.remove();
                }
                const attachments = attachmentLinks.map(({ id: attachmentID, url: attachmentURL, filename }, position) => ({
                    id: attachmentID,
                    index: position,
                    url: attachmentURL,
                    filename
                }));

                const bodyText = toText(messageEl.find('article.message-body').html());
                const body = trimNewlines(bodyText.trim());
                const publishedAt = messageEl
                    .find('ul.message-attribution-main li.u-concealed time.u-dt')
                    .attr('datetime');

                return { id: postID, index, author, publishedAt, body, attachments };
            })
            .toArray()
            .filter((message) => message !== null);

        const nav = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_parseNav).call(this, $, url);
        return { id, url, breadcrumbs, title, messages, ...nav };
    }

    /**
     * Parses one page of a forum listing.
     *
     * @param html      Page markup.
     * @param originURL URL the markup came from; only used in error messages.
     * @returns Forum `id`, canonical `url`, `title`, `subforums`, de-duplicated `threads`, plus the
     *          page-navigation fields produced by the private nav parser.
     * @throws {Error} When the forum ID, canonical URL or title cannot be found.
     */
    parseForumPage(html, originURL) {
        const $ = cheerioLoad(html);
        const toText = (markup) => __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_htmlToText).call(this, markup);

        const contentKey = $('html').attr('data-content-key') || '';
        const id = contentKey.startsWith('forum-') ? Number(contentKey.substring(6)) : null;
        if (!id) {
            throw Error(`Failed to obtain forum ID from "${originURL}"`);
        }

        const url = $('link[rel="canonical"]').attr('href') || '';
        const title = $('meta[property="og:title"]').attr('content') || '';
        if (!url || !title) {
            throw Error(`Failed to obtain 'url' and 'title' from "${originURL}"`);
        }

        const subforums = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_findForumLinks).call(this, $('div.node--forum'), $, url);

        const threadCandidates = $('div.structItem--thread div.structItem-title')
            .find('a')
            .map((_position, node) => {
                const anchor = $(node);
                const href = anchor.attr('href');
                if (!href) {
                    return null;
                }
                const threadLink = URLHelper.parseThreadURL(href);
                const threadTitle = toText(anchor.html()).trim();
                if (!threadLink?.id || !threadTitle) {
                    return null;
                }
                const threadURL = new URL(href, url).toString();
                // Drop a trailing "/unread" so every link to the same thread compares equal.
                const cleanURL = threadURL.endsWith('/unread')
                    ? threadURL.substring(0, threadURL.length - 7)
                    : threadURL;
                return { title: threadTitle, url: cleanURL };
            })
            .toArray();

        // Keep only the first entry seen for each URL, preserving document order.
        const threads = [];
        const seenURLs = new Set();
        for (const candidate of threadCandidates) {
            if (candidate === null || seenURLs.has(candidate.url)) {
                continue;
            }
            seenURLs.add(candidate.url);
            threads.push(candidate);
        }

        const nav = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_parseNav).call(this, $, url);
        return { id, url, title, subforums, threads, ...nav };
    }

    /**
     * Collects forum links from an arbitrary page.
     *
     * @param html Page markup.
     * @param url  Base URL handed to the private forum-link finder.
     * @returns Object with a `forums` array.
     */
    parseGenericPage(html, url) {
        const $ = cheerioLoad(html);
        const nodeTitles = $('.node-title');
        const forums = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_findForumLinks).call(this, nodeTitles, $, url);
        return { forums };
    }
}
// Backing stores for Parser's private state: the logger per instance, and the set of
// constructed instances used to brand-check private method calls.
_Parser_logger = new WeakMap();
_Parser_instances = new WeakSet();

/**
 * Finds anchors under `el` that point at forums and returns them as `{ title, url }`,
 * keeping only the first entry for each resolved URL.
 *
 * @param el      Cheerio selection to search for anchors.
 * @param $       Cheerio root used to wrap each anchor node.
 * @param baseURL Base against which each href is resolved.
 */
_Parser_findForumLinks = function _Parser_findForumLinks(el, $, baseURL) {
    const candidates = el
        .find('a')
        .map((_position, node) => {
            const anchor = $(node);
            const href = anchor.attr('href');
            if (!href) {
                return null;
            }
            const forumLink = URLHelper.parseForumURL(href);
            const label = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_htmlToText).call(this, anchor.html()).trim();
            if (!forumLink?.id || !label) {
                return null;
            }
            return {
                title: label,
                url: new URL(href, baseURL).toString()
            };
        })
        .toArray();

    const forums = [];
    const seenURLs = new Set();
    for (const candidate of candidates) {
        if (candidate === null || seenURLs.has(candidate.url)) {
            continue;
        }
        seenURLs.add(candidate.url);
        forums.push(candidate);
    }
    return forums;
};

/**
 * Reads the page-navigation widget.
 *
 * @param $   Cheerio root of the page.
 * @param url Base against which the "next" link's href is resolved.
 * @returns `{ currentPage, totalPages, nextURL }`; both page numbers fall back to 1 and
 *          `nextURL` is `undefined` when there is no next link.
 */
_Parser_parseNav = function _Parser_parseNav($, url) {
    const toNumber = (text) => __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_checkNumber).call(this, text);
    const pageNavEl = $('div.pageNav ul.pageNav-main');

    const currentText = pageNavEl.find('li.pageNav-page--current a').first().text();
    const currentPage = toNumber(currentText) || 1;

    const lastText = pageNavEl.find('li').last().text();
    const totalPages = toNumber(lastText) || 1;

    const nextHref = $('div.pageNav a.pageNav-jump--next').attr('href');
    let nextURL;
    if (nextHref) {
        nextURL = new URL(nextHref, url).toString();
    }
    return { currentPage, totalPages, nextURL };
};

/**
 * Converts `value` with `Number()`; returns the result, or `undefined` when it is NaN.
 */
_Parser_checkNumber = function _Parser_checkNumber(value) {
    const parsed = Number(value);
    return Number.isNaN(parsed) ? undefined : parsed;
};

/**
 * Converts markup to plain text via `htmlToText`; `null`/`undefined` yield an empty string.
 */
_Parser_htmlToText = function _Parser_htmlToText(value) {
    return value == null ? '' : htmlToText(value);
};
//# sourceMappingURL=Parser.js.map