social-butterfly
Version:
Incorporate federated social network protocols easily. Used with Hello, world federated blog.
226 lines (187 loc) • 7.28 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", {
value: true
});
exports.discoverAndParseFeedFromUrl = discoverAndParseFeedFromUrl;
exports.parseFeedAndInsertIntoDb = parseFeedAndInsertIntoDb;
exports.mapFeedAndInsertIntoDb = mapFeedAndInsertIntoDb;
exports.retrieveFeed = retrieveFeed;
exports.parseFeed = parseFeed;
var _crawler = require("./crawler");
var _feedparser = _interopRequireDefault(require("feedparser"));
var _exceptions = require("./exceptions");
var _stream = require("stream");
var _cheerio = _interopRequireDefault(require("cheerio"));
function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; }
async function discoverAndParseFeedFromUrl(url) {
const {
content,
feedUrl
} = await discoverAndRetrieveFeedFromUrl(url);
const {
feedEntries,
feedMeta
} = await parseFeed(content);
return {
feedEntries,
feedMeta,
feedUrl
};
}
async function discoverAndRetrieveFeedFromUrl(url) {
const response = await (0, _crawler.fetchUrl)(url);
const content = await response.text();
const contentType = response.headers.get('content-type');
if (contentType?.includes('text/html')) {
return await parseHtmlAndRetrieveFeed(url, content);
} // The url is the feed already, just send that back.
return {
content,
feedUrl: url
};
}
async function parseHtmlAndRetrieveFeed(websiteUrl, html) {
const $ = _cheerio.default.load(html);
const links = $('link[rel="alternate"]').filter((index, el) => ($(el).attr('type') || '').match(/(rss|atom)/));
let feedUrl = links.first().attr('href');
if (!feedUrl) {
throw new _exceptions.HTTPError(404, websiteUrl, 'feed: no feed url');
}
feedUrl = (0, _crawler.createAbsoluteUrl)(websiteUrl, feedUrl);
const content = await retrieveFeed(feedUrl);
return {
content,
feedUrl
};
}
async function parseFeedAndInsertIntoDb(options, userRemote, feedResponseText, logger) {
try {
const {
feedEntries
} = await parseFeed(feedResponseText);
await mapFeedAndInsertIntoDb(options, userRemote, feedEntries, logger);
} catch (ex) {
logger && logger.error(`${userRemote.local_username} - ${userRemote.profile_url}: parseFeed FAILED.\n${ex}`);
}
}
async function mapFeedAndInsertIntoDb(options, userRemote, feedEntries, logger) {
let newEntries, skippedCount;
try {
[newEntries, skippedCount] = await mapFeedEntriesToModelEntries(options, feedEntries, userRemote);
logger && logger.info(`${userRemote.local_username} - ${userRemote.profile_url}: ` + `parsed ${newEntries.length} entries, skipped ${skippedCount}.`);
} catch (ex) {
logger && logger.error(`${userRemote.local_username} - ${userRemote.profile_url}: mapFeed FAILED.\n${ex}`);
return;
}
try {
newEntries.length && (await options.saveRemoteContent(newEntries));
logger && logger.info(`${userRemote.local_username} - ${userRemote.profile_url}: inserted ${newEntries.length} entries into db.`);
} catch (ex) {
logger && logger.error(`${userRemote.local_username} - ${userRemote.profile_url}: db insertion failed.\n${ex.stack}`);
}
}
async function retrieveFeed(feedUrl) {
return await (0, _crawler.fetchText)(feedUrl);
}
async function parseFeed(content) {
const {
feedEntries,
feedMeta
} = await new Promise((resolve, reject) => {
const feedEntries = [];
new TextStream({}, content).pipe(new _feedparser.default()).on('error', function (error) {
reject(`FeedParser failed to parse feed: ${error}`);
}).on('readable', function () {
try {
let feedEntry = this.read();
while (feedEntry) {
feedEntries.push(feedEntry);
feedEntry = this.read();
}
} catch (ex) {
reject(ex.message);
}
}).on('end', function () {
resolve({
feedEntries,
feedMeta: this.meta
});
});
});
return {
feedEntries,
feedMeta
};
}
async function mapFeedEntriesToModelEntries(options, feedEntries, userRemote) {
const entries = await Promise.all(feedEntries.map(async feedEntry => await handleEntry(options, feedEntry, userRemote)));
const filteredEntries = entries.filter(entry => entry);
const skippedCount = entries.length - filteredEntries.length;
return [filteredEntries, skippedCount];
}
async function handleEntry(options, feedEntry, userRemote) {
const entryId = feedEntry.guid || feedEntry.link || feedEntry.permalink;
const link = feedEntry.link || feedEntry.permalink;
const existingModelEntry = await options.getRemoteContent(userRemote.local_username, entryId);
let dateUpdated = new Date();
if (feedEntry.date) {
dateUpdated = new Date(feedEntry.date);
} else if (feedEntry.pubdate) {
dateUpdated = new Date(feedEntry.pubdate);
} // We ignore if we already have the item in our DB.
// Also, we don't keep items that are over options.feedMaxDaysOld.
if (existingModelEntry?.type === 'comment' || existingModelEntry && +existingModelEntry.updatedAt === +dateUpdated || dateUpdated < new Date(Date.now() - options.constants.feedMaxDaysOld)) {
return;
}
let view = feedEntry.description || feedEntry.summary;
const thumbnail = feedEntry['media:group']?.['media:thumbnail']?.['@']['url'];
if (!view && thumbnail) {
view = `<a href="${link}" target="_blank" rel="noopener noreferrer"><img src="${thumbnail}" alt="thumbnail" /></a>`;
}
view = (0, _crawler.sanitizeHTML)(view); // XXX(mime): A shortcoming of feedparser currently is that it doesn't resolve relative urls for feeds that have
// urls in the content, e.g. kottke.org Fix this hackily for now. It should really be looking at xml:base in the XML.
const HTML_ATTRIBUTES_WITH_LINKS = ['action', 'background', 'cite', 'classid', 'codebase', 'href', 'longdesc', 'profile', 'src', 'usemap'];
const RELATIVE_REGEXP = new RegExp(`(${HTML_ATTRIBUTES_WITH_LINKS.join('|')})(=['"])/`, 'gi');
view = view.replace(RELATIVE_REGEXP, `$1$2${userRemote.profile_url}/`); // Comments and threads
let comments_count = 0;
let comments_updated;
const atomLinks = feedEntry['atom:link'] ? [feedEntry['atom:link']].flat(1) : [];
const replies = atomLinks.find(el => el['@'].rel === 'replies');
if (replies) {
comments_count = parseInt(replies['@'].count);
comments_updated = new Date(replies['@'].updated);
}
const thread = feedEntry['thr:in-reply-to']?.['@'].ref; // Avatar
const pocoPhotos = feedEntry['atom:author']?.['poco:photos'];
const avatar = pocoPhotos && pocoPhotos['poco:value']['#'];
return {
id: existingModelEntry?.id || undefined,
avatar,
comments_count,
comments_updated,
content: '',
createdAt: feedEntry.pubdate || new Date(),
creator: feedEntry.author,
from_user: userRemote.profile_url,
from_user_remote_id: userRemote.id,
link,
post_id: entryId,
thread,
title: feedEntry.title || 'untitled',
to_username: userRemote.local_username,
type: 'post',
updatedAt: dateUpdated,
username: userRemote.username,
view
};
}
class TextStream extends _stream.Readable {
constructor(options, text) {
super(options);
this.text = text;
}
_read() {
this.push(this.text);
this.push(null);
}
}