@rowanmanning/feed-parser
Version:
A well-tested and resilient parser for RSS and Atom feeds
392 lines (350 loc) • 9.2 kB
JavaScript
'use strict';
const { Feed } = require('./base');
const { InvalidFeedError } = require('../errors/invalid-feed');
const { isNotNull } = require('../utils/is-not-null');
const { parseContactString } = require('../utils/parse-contact-string');
const { RssFeedItem } = require('./item/rss');
/**
* @import { FeedAuthor, FeedCategory, FeedGenerator, FeedImage, FeedMeta } from './base'
*/
// Matches strings that begin with an `http://` or `https://` scheme (case-insensitive).
const httpRegExp = /^https?:\/\//i;

/** @type {string} Version identifier reported for RSS 0.9x feeds. */
const RSS_VERSION_0_9 = '0.9';

/** @type {string} Version identifier reported for RSS 1.0 feeds. */
const RSS_VERSION_1_0 = '1.0';

/** @type {string} Version identifier reported for RSS 2.0 feeds. */
const RSS_VERSION_2_0 = '2.0';

/** @type {Array<string>} Every version identifier that is accepted verbatim. */
const SUPPORTED_RSS_VERSIONS = [
  RSS_VERSION_0_9,
  RSS_VERSION_1_0,
  RSS_VERSION_2_0
];
/**
 * Class representing an RSS feed.
 *
 * Reads feed-level data from the `channel` element of a document whose root element is
 * named `rss` or `rdf`. Each getter falls back to the base `Feed` implementation when
 * the RSS-specific lookups find nothing.
 */
class RssFeed extends Feed {
  /**
   * The root element of the feed (named `rss` or `rdf`).
   *
   * @type {import('../xml/element').Element}
   */
  #root;

  /**
   * The `channel` element found under the root element.
   *
   * @type {import('../xml/element').Element}
   */
  #channel;

  /**
   * Lazily populated list of feed items; `null` until `items` is first read.
   *
   * @type {Array<import('./item/rss').RssFeedItem> | null}
   */
  #itemCache = null;

  /**
   * Class constructor.
   *
   * @param {import('../xml/document').Document} document
   *     The XML document to extract data from.
   * @throws {InvalidFeedError}
   *     Throws an invalid feed error if the document has no `rss`/`rdf` root element,
   *     or if that root element has no `channel` element.
   */
  constructor(document) {
    super(document);

    const root =
      this.document.findElementWithName('rss') || this.document.findElementWithName('rdf');
    if (!root) {
      throw new InvalidFeedError('The RSS feed does not have a root element');
    }
    this.#root = root;

    const channel = this.#root.findElementWithName('channel');
    if (!channel) {
      throw new InvalidFeedError('The RSS feed does not have a channel element');
    }
    this.#channel = channel;
  }

  /**
   * @override
   * @returns {import('../xml/element').Element}
   *     Returns the XML element which represents the feed (the `channel` element).
   */
  get element() {
    return this.#channel;
  }

  /**
   * Resolution order: an exactly supported `version` attribute, then any `version`
   * starting with `0.9` (for example `0.91`), then the root element's `xmlns` value.
   *
   * @returns {string | null}
   *     Returns the version of RSS the feed uses, or `null` when it cannot be determined.
   *     Exposed publicly in the `meta` property.
   */
  get #version() {
    const version = this.#root.getAttribute('version');
    if (version && SUPPORTED_RSS_VERSIONS.includes(version)) {
      return version;
    }
    if (version?.startsWith(RSS_VERSION_0_9)) {
      return RSS_VERSION_0_9;
    }

    // No usable version attribute: infer the version from the default namespace
    const namespace = this.#root.getAttribute('xmlns');
    if (
      namespace === 'http://channel.netscape.com/rdf/simple/0.9/' ||
      namespace === 'http://my.netscape.com/rdf/simple/0.9/'
    ) {
      return RSS_VERSION_0_9;
    }
    if (namespace === 'http://purl.org/rss/1.0/') {
      return RSS_VERSION_1_0;
    }
    return null;
  }

  /**
   * @override
   * @returns {FeedMeta}
   *     Returns meta information about the feed: the root element name as `type` and
   *     the detected RSS version as `version`.
   */
  get meta() {
    return {
      type: this.#root.name,
      version: this.#version
    };
  }

  /**
   * @override
   * @returns {string | null}
   *     Returns the feed language. Checks the channel's `language` element first, then
   *     the root element's `xml:lang` and `lang` attributes.
   */
  get language() {
    return (
      this.element.findElementWithName('language')?.textContentNormalized ||
      this.#root.getAttribute('xml:lang') ||
      this.#root.getAttribute('lang') ||
      super.language
    );
  }

  /**
   * @override
   * @returns {string | null}
   *     Returns the feed description, taken from the `description` element or, failing
   *     that, the `subtitle` element.
   */
  get description() {
    return (
      this.element.findElementWithName('description')?.textContentNormalized ||
      this.element.findElementWithName('subtitle')?.textContentNormalized ||
      super.description
    );
  }

  /**
   * @override
   * @returns {string | null}
   *     Returns the feed copyright information, taken from the `copyright` element or,
   *     failing that, the `rights` element.
   */
  get copyright() {
    return (
      this.element.findElementWithName('copyright')?.textContentNormalized ||
      this.element.findElementWithName('rights')?.textContentNormalized ||
      super.copyright
    );
  }

  /**
   * @override
   * @returns {string | null}
   *     Returns the feed URL: the text of the first `link` element that has any text.
   */
  get url() {
    const links = this.element.findElementsWithName('link');
    if (!links?.length) {
      return super.url;
    }
    return links.find((link) => link.textContentNormalized)?.textContentAsUrl || super.url;
  }

  /**
   * @override
   * @returns {string | null}
   *     Returns the feed's link to itself: the `href` of the first `link` element whose
   *     `rel` attribute is `self`.
   */
  get self() {
    // Note: we don't namespace this because many feeds have some
    // weird namespacing going on
    const links = this.element.findElementsWithName('link');
    if (!links?.length) {
      return super.self;
    }
    return (
      links.find((link) => link.getAttribute('rel') === 'self')?.getAttributeAsUrl('href') ||
      super.self
    );
  }

  /**
   * @override
   * @returns {Date | null}
   *     Returns the date that the feed was published on, read from `pubdate`.
   */
  get published() {
    return this.element.findElementWithName('pubdate')?.textContentAsDate || super.published;
  }

  /**
   * @override
   * @returns {Date | null}
   *     Returns the date that the feed was last updated on, read from `lastbuilddate`
   *     or, failing that, `date`.
   */
  get updated() {
    return (
      this.element.findElementWithName('lastbuilddate')?.textContentAsDate ||
      this.element.findElementWithName('date')?.textContentAsDate ||
      super.updated
    );
  }

  /**
   * @override
   * @returns {FeedGenerator | null}
   *     Returns information about the software that generated the feed. Only the label
   *     is read from the `generator` element; `version` and `url` are always `null`.
   */
  get generator() {
    const label = this.element.findElementWithName('generator')?.textContentNormalized;
    if (label) {
      return {
        label,
        version: null,
        url: null
      };
    }
    return super.generator;
  }

  /**
   * @override
   * @returns {FeedImage | null}
   *     Returns an image representing the feed. A regular `image` element (with a `url`
   *     child) is preferred; an iTunes image (`href` attribute) is the fallback.
   */
  get image() {
    const images = this.element.findElementsWithName('image');

    // Use the shared iTunes check so that images are classified the same way as
    // categories, including feeds that bind the iTunes namespace URI to a prefix
    // other than `itunes`
    const image = images.find((img) => !RssFeed.#isItunesElement(img));
    const itunesImage = images.find((img) => RssFeed.#isItunesElement(img));
    if (!image && !itunesImage) {
      return super.image;
    }

    let title = null;
    let url = null;

    // Try a regular image first
    if (image) {
      title = image.findElementWithName('title')?.textContentNormalized || null;
      url = image.findElementWithName('url')?.textContentAsUrl || null;
    }

    // If that fails, check for an itunes image
    if (!url && itunesImage) {
      url = itunesImage.getAttributeAsUrl('href');
    }

    if (url) {
      return {
        title,
        url
      };
    }
    return super.image;
  }

  /**
   * @override
   * @returns {Array<FeedAuthor>}
   *     Returns the authors of the feed, parsed from the `managingeditor`, `author`,
   *     and `creator` elements (in that order). Entries that fail to parse are dropped.
   */
  get authors() {
    // NOTE: we explicitly ignore the webmaster property here:
    // the webmaster is not an author
    return [
      ...this.element.findElementsWithName('managingeditor'),
      ...this.element.findElementsWithName('author'),
      ...this.element.findElementsWithName('creator')
    ]
      .map((author) => parseContactString(author.textContentNormalized))
      .filter(isNotNull);
  }

  /**
   * @override
   * @returns {Array<FeedCategory>}
   *     Returns the categories the feed belongs to: regular `category` elements first,
   *     then iTunes categories, then `subject` elements.
   */
  get categories() {
    const categoryElements = this.element.findElementsWithName('category');

    // Regular categories: the element text is the term, and the `domain` attribute is
    // only used as a URL when it starts with an HTTP(S) scheme
    const categories = categoryElements
      .filter((category) => !RssFeed.#isItunesElement(category))
      .map((category) => {
        const term = category.textContentNormalized;
        if (!term) {
          return null;
        }
        const domain = category.getAttribute('domain') || '';
        const url = httpRegExp.test(domain) ? category.getAttributeAsUrl('domain') : null;
        return {
          label: term,
          term,
          url
        };
      });

    // iTunes categories: the name lives in the `text` attribute and categories may be
    // nested one level deep. Nested categories are reported as `Parent/Child`; a parent
    // is only reported on its own when it has no valid children
    const itunesCategories = categoryElements
      .filter((category) => RssFeed.#isItunesElement(category))
      .flatMap((category) => {
        const url = null;
        const level1Category = category.getAttribute('text');
        if (!level1Category) {
          return null;
        }
        const childCategories = category
          .findElementsWithName('category')
          .filter((child) => RssFeed.#isItunesElement(child))
          .map((childCategory) => {
            const level2Category = childCategory.getAttribute('text');
            if (!level2Category) {
              return null;
            }
            return {
              label: `${level1Category}/${level2Category}`,
              term: `${level1Category}/${level2Category}`,
              url
            };
          })
          .filter(isNotNull);
        if (childCategories.length) {
          return childCategories;
        }
        return {
          label: level1Category,
          term: level1Category,
          url
        };
      });

    const subjects = this.element.findElementsWithName('subject').map((subject) => {
      const term = subject.textContentNormalized;
      return term ? { term, label: term, url: null } : null;
    });

    // The mappers above use `null` to mark entries to skip; strip them all here
    return [...categories, ...itunesCategories, ...subjects].filter(isNotNull);
  }

  /**
   * @override
   * @returns {Array<import('./item/rss').RssFeedItem>}
   *     Returns all content items in the feed. The list is built on first access and
   *     the same array is returned on later reads.
   */
  get items() {
    if (this.#itemCache) {
      return this.#itemCache;
    }

    // Items are collected from both the channel and the root element.
    // NOTE(review): presumably this covers RSS 1.0, where items sit beside the
    // channel rather than inside it; confirm against the XML element helpers
    const items = [
      ...this.element.findElementsWithName('item'),
      ...this.#root.findElementsWithName('item')
    ].map((itemElement) => new RssFeedItem(this, itemElement));

    this.#itemCache = items;
    return items;
  }

  /**
   * Check whether an element is an iTunes element, either by its namespace URI or by
   * the conventional `itunes` namespace prefix.
   *
   * @param {import('../xml/element').Element} element
   *     The element to check.
   * @returns {boolean}
   *     Returns `true` when the element belongs to the iTunes namespace.
   */
  static #isItunesElement(element) {
    return (
      element.namespaceUri === 'http://www.itunes.com/dtds/podcast-1.0.dtd' ||
      element.namespace === 'itunes'
    );
  }
}
// Expose the RssFeed class as a named export of this module.
exports.RssFeed = RssFeed;