UNPKG

xenforo-dl

Version:
217 lines 9.98 kB
import { convert as htmlToText } from 'html-to-text';
import { load as cheerioLoad } from 'cheerio';
import { commonLog } from '../utils/logging/Logger.js';
import { trimNewlines } from 'trim-newlines';
import URLHelper from '../utils/URLHelper.js';

/*
 * Helper that writes a private member. It refuses to write private methods,
 * refuses accessors that have no setter, and throws a TypeError when
 * `receiver` was never registered in `state`.
 */
var __classPrivateFieldSet = (this && this.__classPrivateFieldSet) || function (receiver, state, value, kind, f) {
    if (kind === "m") {
        throw new TypeError("Private method is not writable");
    }
    if (kind === "a" && !f) {
        throw new TypeError("Private accessor was defined without a setter");
    }
    if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) {
        throw new TypeError("Cannot write private member to an object whose class did not declare it");
    }
    return (kind === "a" ? f.call(receiver, value) : f ? f.value = value : state.set(receiver, value)), value;
};

/*
 * Helper that reads a private member. For kind "m" it returns the method `f`
 * itself, for kind "a" it invokes the getter, otherwise it returns the stored
 * value. Throws a TypeError when `receiver` was never registered in `state`.
 */
var __classPrivateFieldGet = (this && this.__classPrivateFieldGet) || function (receiver, state, kind, f) {
    if (kind === "a" && !f) {
        throw new TypeError("Private accessor was defined without a getter");
    }
    if (typeof state === "function" ? receiver !== state || !f : !state.has(receiver)) {
        throw new TypeError("Cannot read private member from an object whose class did not declare it");
    }
    return kind === "m" ? f : kind === "a" ? f.call(receiver) : f ? f.value : state.get(receiver);
};

// Per-instance storage for the logger handed to the constructor.
const _Parser_logger = new WeakMap();
// Brand set: every constructed Parser is added here; private methods check membership.
const _Parser_instances = new WeakSet();

/**
 * Extracts structured data from thread, forum and generic page HTML.
 */
export default class Parser {
    /**
     * @param logger - Logger forwarded to `commonLog` by {@link Parser#log}.
     */
    constructor(logger) {
        _Parser_instances.add(this);
        this.name = 'Parser';
        _Parser_logger.set(this, void 0);
        __classPrivateFieldSet(this, _Parser_logger, logger, "f");
    }

    /**
     * Forwards a message to `commonLog`, tagged with this parser's `name`.
     *
     * @param level - Log level passed through to `commonLog`.
     * @param msg - Message parts passed through to `commonLog`.
     */
    log(level, ...msg) {
        const logger = __classPrivateFieldGet(this, _Parser_logger, "f");
        commonLog(logger, level, this.name, ...msg);
    }

    /**
     * Parses the HTML of a thread page.
     *
     * @param {string} html - Page markup.
     * @param {string} originURL - Where the markup came from; only used in error messages.
     * @returns An object with `id`, `url`, `breadcrumbs`, `title`, `messages`,
     *   plus `currentPage`, `totalPages` and `nextURL` from the page navigation.
     * @throws {Error} When the thread ID, canonical URL or title cannot be found.
     */
    parseThreadPage(html, originURL) {
        const $ = cheerioLoad(html);

        const contentKey = $('html').attr('data-content-key') || '';
        const id = contentKey.startsWith('thread-') ? Number(contentKey.substring(7)) : null;
        // Rejects a missing key as well as an ID that parses to NaN or 0.
        if (!id) {
            throw new Error(`Failed to obtain thread ID from "${originURL}"`);
        }

        const siteName = $('meta[property="og:site_name"]').attr('content');
        const url = $('link[rel="canonical"]').attr('href') || '';
        const title = $('meta[property="og:title"]').attr('content') || '';
        if (!url || !title) {
            throw new Error(`Failed to obtain 'url' and 'title' from "${originURL}"`);
        }

        const breadcrumbs = [];
        const crumbNodes = $('ul.p-breadcrumbs').first().find('li[itemprop="itemListElement"]').toArray();
        for (const [position, crumbNode] of crumbNodes.entries()) {
            const crumbEl = $(crumbNode).find('a[itemprop="item"]');
            const href = crumbEl.attr('href');
            const crumbTitle = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_htmlToText).call(this, crumbEl.html());
            if (!href || !crumbTitle) {
                continue;
            }
            breadcrumbs.push({
                url: new URL(href, url).toString(),
                // The first crumb is labelled with the site name (or the host as a fallback).
                title: position > 0 ? crumbTitle : siteName || new URL(url).host
            });
        }

        const messages = [];
        for (const messageNode of $('article.message').toArray()) {
            const messageEl = $(messageNode);
            const author = messageEl.attr('data-author');
            const lightboxId = messageEl.find('div.message-userContent').attr('data-lb-id') || '';
            const messageId = lightboxId.startsWith('post-') ? Number(lightboxId.substring(5)) : null;
            if (!messageId) {
                this.log('warn', 'Message skipped: failed to obtain ID.');
                continue;
            }
            const index = messageEl.find('ul.message-attribution-opposite li').last().text().trim();

            const attachmentLinks = [];
            for (const anchorNode of messageEl.find('a').toArray()) {
                const linkEl = $(anchorNode);
                const href = linkEl.attr('href');
                if (!href) {
                    continue;
                }
                // A fresh literal per link, so the `g` flag carries no state between links.
                const attachmentLinkRegex = /\/attachments\/(.+)\.(\d+)/g;
                const matches = attachmentLinkRegex.exec(href);
                if (!matches || isNaN(Number(matches[2]))) {
                    continue;
                }
                const imgEl = linkEl.find('img');
                attachmentLinks.push({
                    id: Number(matches[2]),
                    url: new URL(href, url).toString(),
                    filename: imgEl.attr('alt') || imgEl.attr('title'),
                    el: linkEl
                });
            }

            // Detach attachment anchors before reading the body so they do not show up in its text.
            for (const link of attachmentLinks) {
                link.el.remove();
            }
            const attachments = attachmentLinks.map((link, position) => ({
                id: link.id,
                index: position,
                url: link.url,
                filename: link.filename
            }));

            const toText = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_htmlToText);
            const bodyText = toText.call(this, messageEl.find('article.message-body').html());
            const body = trimNewlines(bodyText.trim());
            const publishedAt = messageEl.find('ul.message-attribution-main li.u-concealed time.u-dt').attr('datetime');

            messages.push({ id: messageId, index, author, publishedAt, body, attachments });
        }

        const nav = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_parseNav).call(this, $, url);
        return { id, url, breadcrumbs, title, messages, ...nav };
    }

    /**
     * Parses the HTML of a forum page.
     *
     * @param {string} html - Page markup.
     * @param {string} originURL - Where the markup came from; only used in error messages.
     * @returns An object with `id`, `url`, `title`, `subforums`, `threads`,
     *   plus `currentPage`, `totalPages` and `nextURL` from the page navigation.
     * @throws {Error} When the forum ID, canonical URL or title cannot be found.
     */
    parseForumPage(html, originURL) {
        const $ = cheerioLoad(html);

        const contentKey = $('html').attr('data-content-key') || '';
        const id = contentKey.startsWith('forum-') ? Number(contentKey.substring(6)) : null;
        if (!id) {
            throw new Error(`Failed to obtain forum ID from "${originURL}"`);
        }

        const url = $('link[rel="canonical"]').attr('href') || '';
        const title = $('meta[property="og:title"]').attr('content') || '';
        if (!url || !title) {
            throw new Error(`Failed to obtain 'url' and 'title' from "${originURL}"`);
        }

        const subforums = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_findForumLinks).call(this, $('div.node--forum'), $, url);

        const threads = [];
        const threadAnchors = $('div.structItem--thread div.structItem-title').find('a').toArray();
        for (const anchorNode of threadAnchors) {
            const anchorEl = $(anchorNode);
            const href = anchorEl.attr('href');
            if (!href) {
                continue;
            }
            const threadLink = URLHelper.parseThreadURL(href);
            const threadTitle = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_htmlToText).call(this, anchorEl.html()).trim();
            if (!threadLink?.id || !threadTitle) {
                continue;
            }
            const absoluteURL = new URL(href, url).toString();
            // Drop a trailing '/unread' (7 characters) from the resolved URL.
            const threadURL = absoluteURL.endsWith('/unread') ? absoluteURL.slice(0, -7) : absoluteURL;
            // Keep only the first entry seen for any given URL.
            if (threads.some((known) => known.url === threadURL)) {
                continue;
            }
            threads.push({ title: threadTitle, url: threadURL });
        }

        const nav = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_parseNav).call(this, $, url);
        return { id, url, title, subforums, threads, ...nav };
    }

    /**
     * Collects forum links found under `.node-title` elements of an arbitrary page.
     *
     * @param {string} html - Page markup.
     * @param {string} url - Base against which link hrefs are resolved.
     * @returns {{ forums: Array<{ title: string, url: string }> }}
     */
    parseGenericPage(html, url) {
        const $ = cheerioLoad(html);
        const findForumLinks = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_findForumLinks);
        const forums = findForumLinks.call(this, $('.node-title'), $, url);
        return { forums };
    }
}

/**
 * Private method (invoked with `this` bound to a Parser): gathers the anchors
 * below `el` whose href `URLHelper.parseForumURL` yields an `id` for and whose
 * text is non-empty. Results are de-duplicated by resolved URL, first one wins.
 *
 * @param el - Cheerio selection to search.
 * @param $ - Cheerio root function for the loaded document.
 * @param baseURL - Base against which hrefs are resolved.
 * @returns {Array<{ title: string, url: string }>}
 */
function _Parser_findForumLinks(el, $, baseURL) {
    const forums = [];
    for (const anchorNode of el.find('a').toArray()) {
        const linkEl = $(anchorNode);
        const href = linkEl.attr('href');
        if (!href) {
            continue;
        }
        const forumLink = URLHelper.parseForumURL(href);
        const title = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_htmlToText).call(this, linkEl.html()).trim();
        if (!forumLink?.id || !title) {
            continue;
        }
        const forumURL = new URL(href, baseURL).toString();
        if (forums.some((known) => known.url === forumURL)) {
            continue;
        }
        forums.push({ title, url: forumURL });
    }
    return forums;
}

/**
 * Private method: reads pagination details from `div.pageNav`.
 *
 * @param $ - Cheerio root function for the loaded document.
 * @param url - Base against which the "next" href is resolved.
 * @returns {{ currentPage: number, totalPages: number, nextURL: (string|undefined) }}
 *   `currentPage` and `totalPages` fall back to 1 when no usable number is found.
 */
function _Parser_parseNav($, url) {
    const pageNavEl = $('div.pageNav ul.pageNav-main');
    const currentPageText = pageNavEl.find('li.pageNav-page--current a').first().text();
    const currentPage = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_checkNumber).call(this, currentPageText) || 1;
    const lastItemText = pageNavEl.find('li').last().text();
    const totalPages = __classPrivateFieldGet(this, _Parser_instances, "m", _Parser_checkNumber).call(this, lastItemText) || 1;
    const nextHref = $('div.pageNav a.pageNav-jump--next').attr('href');
    const nextURL = nextHref ? new URL(nextHref, url).toString() : undefined;
    return { currentPage, totalPages, nextURL };
}

/**
 * Private method: converts `value` with `Number()`.
 *
 * @param value - Value to convert.
 * @returns {(number|undefined)} The converted number, or `undefined` when it is NaN.
 */
function _Parser_checkNumber(value) {
    const parsed = Number(value);
    return isNaN(parsed) ? undefined : parsed;
}

/**
 * Private method: converts an HTML fragment to plain text via `html-to-text`.
 *
 * @param value - HTML string; may be `null` or `undefined`.
 * @returns {string} The converted text, or `''` for a `null`/`undefined` input.
 */
function _Parser_htmlToText(value) {
    return value === undefined || value === null ? '' : htmlToText(value);
}
//# sourceMappingURL=Parser.js.map