UNPKG

social-butterfly

Version:

Incorporate federated social network protocols easily. Used with Hello, world federated blog.

226 lines (187 loc) 7.28 kB
"use strict";

Object.defineProperty(exports, "__esModule", {
  value: true
});
exports.discoverAndParseFeedFromUrl = discoverAndParseFeedFromUrl;
exports.parseFeedAndInsertIntoDb = parseFeedAndInsertIntoDb;
exports.mapFeedAndInsertIntoDb = mapFeedAndInsertIntoDb;
exports.retrieveFeed = retrieveFeed;
exports.parseFeed = parseFeed;

const _crawler = require("./crawler");
const _feedparser = _interopRequireDefault(require("feedparser"));
const _exceptions = require("./exceptions");
const _stream = require("stream");
const _cheerio = _interopRequireDefault(require("cheerio"));

/**
 * Babel interop helper: wraps a CommonJS export so it can be read via `.default`.
 *
 * @param {*} obj - The required module.
 * @returns {*} `obj` itself when it is flagged `__esModule`, otherwise `{ default: obj }`.
 */
function _interopRequireDefault(obj) {
  return obj && obj.__esModule ? obj : { default: obj };
}

// HTML attributes whose values are URLs; used when rewriting root-relative links.
const HTML_ATTRIBUTES_WITH_LINKS = [
  'action',
  'background',
  'cite',
  'classid',
  'codebase',
  'href',
  'longdesc',
  'profile',
  'src',
  'usemap',
];

// Matches `attr="/` or `attr='/`, but NOT `attr="//` (a protocol-relative URL is already absolute).
const RELATIVE_REGEXP = new RegExp(`(${HTML_ATTRIBUTES_WITH_LINKS.join('|')})(=['"])/(?!/)`, 'gi');

/**
 * Fetches `url`, locates the feed it refers to and parses that feed.
 *
 * @param {string} url - Either a feed URL or an HTML page that links to a feed.
 * @returns {Promise<{feedEntries: Array, feedMeta: *, feedUrl: string}>} Parsed entries, the parser's
 *   feed metadata, and the URL the feed content was actually retrieved from.
 */
async function discoverAndParseFeedFromUrl(url) {
  const { content, feedUrl } = await discoverAndRetrieveFeedFromUrl(url);
  const { feedEntries, feedMeta } = await parseFeed(content);

  return { feedEntries, feedMeta, feedUrl };
}

/**
 * Fetches `url` and returns feed text, following an HTML page's feed link when needed.
 *
 * @param {string} url - URL to fetch.
 * @returns {Promise<{content: string, feedUrl: string}>} Feed text and the URL it came from.
 */
async function discoverAndRetrieveFeedFromUrl(url) {
  const response = await (0, _crawler.fetchUrl)(url);
  const content = await response.text();
  const contentType = response.headers.get('content-type');

  if (contentType?.includes('text/html')) {
    return await parseHtmlAndRetrieveFeed(url, content);
  }

  // The url is the feed already, just send that back.
  return { content, feedUrl: url };
}

/**
 * Finds the first `<link rel="alternate">` whose `type` mentions rss or atom and retrieves it.
 *
 * @param {string} websiteUrl - URL the HTML was loaded from; used to resolve the link's href.
 * @param {string} html - The page's HTML.
 * @returns {Promise<{content: string, feedUrl: string}>} Feed text and its absolute URL.
 * @throws {HTTPError} 404 when the page advertises no rss/atom feed link.
 */
async function parseHtmlAndRetrieveFeed(websiteUrl, html) {
  const $ = _cheerio.default.load(html);
  const links = $('link[rel="alternate"]').filter((index, el) => ($(el).attr('type') || '').match(/(rss|atom)/));
  const href = links.first().attr('href');

  if (!href) {
    throw new _exceptions.HTTPError(404, websiteUrl, 'feed: no feed url');
  }

  const feedUrl = (0, _crawler.createAbsoluteUrl)(websiteUrl, href);
  const content = await retrieveFeed(feedUrl);

  return { content, feedUrl };
}

/**
 * Builds the "<local_username> - <profile_url>" prefix shared by this module's log lines.
 *
 * @param {object} userRemote - Remote user record.
 * @returns {string} Log prefix identifying the remote user.
 */
function describeRemote(userRemote) {
  return `${userRemote.local_username} - ${userRemote.profile_url}`;
}

/**
 * Parses feed text and stores the resulting entries. Failures are logged, never thrown.
 *
 * @param {object} options - Must provide `getRemoteContent`, `saveRemoteContent` and
 *   `constants.feedMaxDaysOld` (all read further down the call chain).
 * @param {object} userRemote - Remote user the feed belongs to.
 * @param {string} feedResponseText - Raw feed document.
 * @param {object} [logger] - Optional logger with `info` / `error` methods.
 * @returns {Promise<void>}
 */
async function parseFeedAndInsertIntoDb(options, userRemote, feedResponseText, logger) {
  try {
    const { feedEntries } = await parseFeed(feedResponseText);
    await mapFeedAndInsertIntoDb(options, userRemote, feedEntries, logger);
  } catch (ex) {
    logger?.error(`${describeRemote(userRemote)}: parseFeed FAILED.\n${ex}`);
  }
}

/**
 * Converts parsed feed entries to model entries and saves them via `options.saveRemoteContent`.
 * Mapping and saving failures are logged, never thrown.
 *
 * @param {object} options - See `parseFeedAndInsertIntoDb`.
 * @param {object} userRemote - Remote user the entries belong to.
 * @param {Array} feedEntries - Entries as produced by `parseFeed`.
 * @param {object} [logger] - Optional logger with `info` / `error` methods.
 * @returns {Promise<void>}
 */
async function mapFeedAndInsertIntoDb(options, userRemote, feedEntries, logger) {
  const who = describeRemote(userRemote);
  let newEntries;
  let skippedCount;

  try {
    [newEntries, skippedCount] = await mapFeedEntriesToModelEntries(options, feedEntries, userRemote);
    logger?.info(`${who}: parsed ${newEntries.length} entries, skipped ${skippedCount}.`);
  } catch (ex) {
    logger?.error(`${who}: mapFeed FAILED.\n${ex}`);
    return;
  }

  try {
    // Skip the save call entirely when there is nothing new.
    if (newEntries.length) {
      await options.saveRemoteContent(newEntries);
    }
    logger?.info(`${who}: inserted ${newEntries.length} entries into db.`);
  } catch (ex) {
    logger?.error(`${who}: db insertion failed.\n${ex.stack}`);
  }
}

/**
 * Retrieves the text of a feed.
 *
 * @param {string} feedUrl - Absolute feed URL.
 * @returns {Promise<string>} Whatever the crawler's `fetchText` resolves to.
 */
async function retrieveFeed(feedUrl) {
  return await (0, _crawler.fetchText)(feedUrl);
}

/**
 * Parses a feed document with FeedParser.
 *
 * @param {string} content - Raw feed document.
 * @returns {Promise<{feedEntries: Array, feedMeta: *}>} All entries read from the parser plus the
 *   parser's `meta` object.
 * @throws {Error} Rejects with an Error (never a bare string) when the parser emits `error` or
 *   reading an entry throws.
 */
async function parseFeed(content) {
  return new Promise((resolve, reject) => {
    const feedEntries = [];

    // Handlers use `function` (not arrows) because `this` is the FeedParser stream.
    new TextStream({}, content)
      .pipe(new _feedparser.default())
      .on('error', function (error) {
        reject(new Error(`FeedParser failed to parse feed: ${error}`, { cause: error }));
      })
      .on('readable', function () {
        try {
          // Drain everything currently buffered; read() returns a falsy value once empty.
          for (let feedEntry = this.read(); feedEntry; feedEntry = this.read()) {
            feedEntries.push(feedEntry);
          }
        } catch (ex) {
          reject(ex);
        }
      })
      .on('end', function () {
        resolve({ feedEntries, feedMeta: this.meta });
      });
  });
}

/**
 * Maps every feed entry through `handleEntry` concurrently and drops the skipped ones.
 *
 * @param {object} options - See `parseFeedAndInsertIntoDb`.
 * @param {Array} feedEntries - Entries as produced by `parseFeed`.
 * @param {object} userRemote - Remote user the entries belong to.
 * @returns {Promise<[Array<object>, number]>} `[entriesToSave, skippedCount]`.
 */
async function mapFeedEntriesToModelEntries(options, feedEntries, userRemote) {
  const entries = await Promise.all(feedEntries.map((feedEntry) => handleEntry(options, feedEntry, userRemote)));
  const filteredEntries = entries.filter((entry) => entry);
  const skippedCount = entries.length - filteredEntries.length;

  return [filteredEntries, skippedCount];
}

/**
 * Prefixes root-relative URLs (`href="/x"`) in link-bearing attributes with `baseUrl`.
 * Protocol-relative URLs (`src="//host/x"`) are left untouched.
 *
 * @param {string} html - Sanitized HTML.
 * @param {string} baseUrl - Prefix to put in front of the leading slash.
 * @returns {string} HTML with root-relative URLs rewritten.
 */
function absolutizeRootRelativeUrls(html, baseUrl) {
  // A replacer function keeps `$` sequences in baseUrl from being read as replacement patterns.
  return html.replace(RELATIVE_REGEXP, (match, attribute, opener) => `${attribute}${opener}${baseUrl}/`);
}

/**
 * Converts one feed entry into a model entry, or skips it.
 *
 * @param {object} options - Uses `options.getRemoteContent(local_username, entryId)` and
 *   `options.constants.feedMaxDaysOld`.
 * @param {object} feedEntry - A single entry as produced by `parseFeed`.
 * @param {object} userRemote - Remote user the entry belongs to.
 * @returns {Promise<object|undefined>} The model entry, or `undefined` when the entry is skipped
 *   (stored as a comment, unchanged since it was stored, or too old).
 */
async function handleEntry(options, feedEntry, userRemote) {
  const entryId = feedEntry.guid || feedEntry.link || feedEntry.permalink;
  const link = feedEntry.link || feedEntry.permalink;
  const existingModelEntry = await options.getRemoteContent(userRemote.local_username, entryId);

  const rawDate = feedEntry.date || feedEntry.pubdate;
  const dateUpdated = rawDate ? new Date(rawDate) : new Date();

  // We ignore if we already have the item in our DB.
  // Also, we don't keep items that are over options.feedMaxDaysOld.
  // NOTE(review): feedMaxDaysOld is subtracted from a millisecond timestamp, so it is treated as a
  // duration in milliseconds despite its name — confirm against where the constant is defined.
  const isStoredComment = existingModelEntry?.type === 'comment';
  const isUnchanged = Boolean(existingModelEntry) && +existingModelEntry.updatedAt === +dateUpdated;
  const isTooOld = dateUpdated < new Date(Date.now() - options.constants.feedMaxDaysOld);
  if (isStoredComment || isUnchanged || isTooOld) {
    return;
  }

  let view = feedEntry.description || feedEntry.summary;
  // Optional chaining all the way down: a thumbnail node without attributes must not abort the batch.
  const thumbnail = feedEntry['media:group']?.['media:thumbnail']?.['@']?.['url'];
  if (!view && thumbnail) {
    view = `<a href="${link}" target="_blank" rel="noopener noreferrer"><img src="${thumbnail}" alt="thumbnail" /></a>`;
  }
  view = (0, _crawler.sanitizeHTML)(view);

  // XXX(mime): A shortcoming of feedparser currently is that it doesn't resolve relative urls for
  // feeds that have urls in the content, e.g. kottke.org
  // Fix this hackily for now. It should really be looking at xml:base in the XML.
  view = absolutizeRootRelativeUrls(view, userRemote.profile_url);

  // Comments and threads
  let comments_count = 0;
  let comments_updated;
  // 'atom:link' may be a single object or an array; normalize to an array.
  const atomLinks = feedEntry['atom:link'] ? [feedEntry['atom:link']].flat(1) : [];
  const replies = atomLinks.find((el) => el?.['@']?.rel === 'replies');
  if (replies) {
    comments_count = Number.parseInt(replies['@'].count, 10);
    comments_updated = new Date(replies['@'].updated);
  }
  const thread = feedEntry['thr:in-reply-to']?.['@']?.ref;

  // Avatar
  const avatar = feedEntry['atom:author']?.['poco:photos']?.['poco:value']?.['#'];

  return {
    id: existingModelEntry?.id || undefined,
    avatar,
    comments_count,
    comments_updated,
    content: '',
    createdAt: feedEntry.pubdate || new Date(),
    creator: feedEntry.author,
    from_user: userRemote.profile_url,
    from_user_remote_id: userRemote.id,
    link,
    post_id: entryId,
    thread,
    title: feedEntry.title || 'untitled',
    to_username: userRemote.local_username,
    type: 'post',
    updatedAt: dateUpdated,
    username: userRemote.username,
    view,
  };
}

/**
 * Readable stream that emits a single string and then ends; used to feed text into FeedParser.
 */
class TextStream extends _stream.Readable {
  /**
   * @param {object} options - Passed through to `stream.Readable`.
   * @param {string} text - The text to emit.
   */
  constructor(options, text) {
    super(options);
    this.text = text;
  }

  _read() {
    this.push(this.text);
    // Pushing null signals end-of-stream.
    this.push(null);
  }
}