rss-parser-browser
Version:
A simple, lightweight RSS parser for the browser. Parse strings, URLs, or files and get a JS object back.
286 lines (258 loc) • 7.75 kB
JavaScript
;
var Entities = require("entities");
var url = require('url');
var XML2JS = require('xml2js');
var HTTP = require('http');
var HTTPS = require('https');
// Public API object: it is the module's export, and the parse* entry points
// (Parser.parseString, Parser.parseURL) are attached to it further down.
var Parser = module.exports = {};
// Channel-level fields copied from the parsed XML onto the output feed object
// (consumed by copyFromXML via parseRSS).
// A plain string is copied under the same key; a [from, to] pair is read from
// the XML key `from` and stored under the output key `to`.
// Entries are applied in order, so when several entries share a target key
// (e.g. 'creator'), the last one that is present in the XML wins.
var FEED_FIELDS = [
['author', 'creator'],
['dc:publisher', 'publisher'],
['dc:creator', 'creator'],
['dc:source', 'source'],
['dc:title', 'title'],
['dc:type', 'type'],
'title',
'description',
'author',
'pubDate',
'webMaster',
'managingEditor',
'generator',
'link',
];
// Item-level fields copied from each parsed <item> onto its output entry
// (consumed by copyFromXML via parseRSS).
// A plain string is copied under the same key; a [from, to] pair is read from
// the XML key `from` and stored under the output key `to`.
// Entries are applied in order, so a later entry overwrites an earlier one
// that targets the same output key. Some tags are listed twice on purpose:
// once aliased (e.g. 'dc:creator' -> 'creator') and once under their own name.
var ITEM_FIELDS = [
['author', 'creator'],
['dc:creator', 'creator'],
['dc:date', 'date'],
['dc:language', 'language'],
['dc:rights', 'rights'],
['dc:source', 'source'],
['dc:title', 'title'],
'title',
'link',
'pubDate',
'author',
'content:encoded',
'enclosure',
'dc:creator',
'dc:date',
];
/**
 * Build the copy mapping for a single iTunes tag.
 *
 * @param {string} f tag name without the `itunes:` namespace prefix
 * @returns {Array} two-element pair: [namespaced XML key, output key]
 */
var mapItunesField = function(f) {
  var xmlKey = 'itunes:' + f;
  return [xmlKey, f];
};
// iTunes channel-level tags. Each bare name is expanded by mapItunesField into
// an ['itunes:<name>', '<name>'] pair, i.e. read with the namespace prefix and
// stored without it.
var PODCAST_FEED_FIELDS = ([
'author',
'subtitle',
'summary',
'explicit'
]).map(mapItunesField);
// iTunes item-level tags, expanded into ['itunes:<name>', '<name>'] pairs the
// same way as the channel-level list.
var PODCAST_ITEM_FIELDS = ([
'author',
'subtitle',
'summary',
'explicit',
'duration',
'image'
]).map(mapItunesField);
/**
 * Remove everything that looks like a markup tag (`<` up to the next `>`).
 *
 * The negated character class matches every character except `>`, including
 * `\r`, `\n` and the Unicode line separators, so tags that span lines with
 * CRLF line endings are removed too. A `<` with no closing `>` is left alone.
 *
 * @param {string} str text that may contain markup
 * @returns {string} the text with all tags removed
 */
var stripHtml = function(str) {
  return str.replace(/<[^>]*>/g, '');
};
/**
 * Produce a plain-text snippet from a markup string: tags are stripped first,
 * then HTML entities are decoded, then surrounding whitespace is trimmed.
 *
 * @param {string} str content that may contain markup and entities
 * @returns {string} plain-text version of the content
 */
var getSnippet = function(str) {
  var withoutTags = stripHtml(str);
  var decoded = Entities.decode(withoutTags);
  return decoded.trim();
};
/**
 * Turn a parsed content node into a string.
 *
 * - If the node carries its text under `_`, that text is returned.
 * - Any other object is serialized back to XML with the xml2js Builder,
 *   wrapped in a `div` root element and without an XML declaration.
 * - Non-object values are returned unchanged.
 *
 * @param {string|object} content parsed node (must not be null/undefined)
 * @returns {*} the node text, the rebuilt XML string, or the input itself
 */
var getContent = function(content) {
  var text = content._;
  if (typeof text === 'string') {
    return text;
  }
  if (typeof content !== 'object') {
    return content;
  }
  var builderOptions = {
    headless: true,
    explicitRoot: true,
    rootName: 'div',
    renderOpts: {pretty: false}
  };
  return new XML2JS.Builder(builderOptions).buildObject(content);
};
/**
 * Convert a parsed Atom document into the `{feed: {entries: [...]}}` shape.
 *
 * The first feed-level <link> is reported as `feed.link` and the second as
 * `feed.feedUrl` (purely positional). Malformed pieces are skipped rather
 * than throwing: a <link> without attributes yields no href, an <updated>
 * value that is not a parseable date leaves `pubDate` unset, and an <author>
 * without a <name> leaves `author` unset.
 *
 * @param {object} xmlObj parsed document with a `feed` root
 * @param {object} options parser options (not used by this function)
 * @param {function} callback called synchronously as callback(null, json)
 */
var parseAtomFeed = function(xmlObj, options, callback) {
  // href attribute of a parsed <link>, or undefined when it has no attributes.
  var hrefOf = function(link) {
    return link && link.$ ? link.$.href : undefined;
  };
  // A text node is either a bare string or an object carrying its text in `_`.
  var textOf = function(node) {
    var text = node || '';
    if (text._) text = text._;
    return text;
  };

  var feed = xmlObj.feed;
  var json = {feed: {entries: []}};

  if (feed.link) {
    var siteLink = hrefOf(feed.link[0]);
    var selfLink = hrefOf(feed.link[1]);
    if (siteLink) json.feed.link = siteLink;
    if (selfLink) json.feed.feedUrl = selfLink;
  }
  if (feed.title) {
    var feedTitle = textOf(feed.title[0]);
    if (feedTitle) json.feed.title = feedTitle;
  }

  (feed.entry || []).forEach(function(entry) {
    var item = {};
    if (entry.title) {
      var title = textOf(entry.title[0]);
      if (title) item.title = title;
    }
    if (entry.link && entry.link.length) item.link = hrefOf(entry.link[0]);
    if (entry.updated && entry.updated.length) {
      try {
        item.pubDate = new Date(entry.updated[0]).toISOString();
      } catch (e) {
        // toISOString() throws RangeError for an invalid date; keep the
        // entry and simply omit pubDate, as parseRSS does for isoDate.
      }
    }
    var author = entry.author && entry.author[0];
    if (author && author.name && author.name.length) item.author = author.name[0];
    if (entry.content && entry.content.length) {
      item.content = getContent(entry.content[0]);
      item.contentSnippet = getSnippet(item.content);
    }
    if (entry.id) {
      item.id = entry.id[0];
    }
    json.feed.entries.push(item);
  });

  callback(null, json);
};
/**
 * Convert a parsed RSS 1.0 (RDF) document: the channel and the items are
 * both direct children of the `rdf:RDF` root.
 *
 * A document without an `rdf:RDF` root or without a channel is reported
 * through the callback instead of throwing a TypeError.
 *
 * @param {object} xmlObj parsed document, expected to have an `rdf:RDF` root
 * @param {object} options parser options, forwarded to parseRSS
 * @param {function} callback callback(err, json)
 */
var parseRSS1 = function(xmlObj, options, callback) {
  var root = xmlObj['rdf:RDF'];
  if (!root || !root.channel || !root.channel.length) {
    return callback(new Error('Feed not recognized as RSS 1: missing rdf:RDF root or channel'));
  }
  return parseRSS(root.channel[0], root.item, options, callback);
};
/**
 * Convert a parsed RSS 2.x document: items live inside the first channel.
 * When the <rss> element declares the `xmlns:itunes` namespace, the result
 * is additionally decorated with iTunes podcast fields.
 *
 * A document without a channel is reported through the callback instead of
 * throwing, and an <rss> element without attributes is treated as having no
 * iTunes namespace.
 *
 * @param {object} xmlObj parsed document, expected to have an `rss` root
 * @param {object} options parser options, forwarded to parseRSS
 * @param {function} callback callback(err, json)
 */
var parseRSS2 = function(xmlObj, options, callback) {
  var rss = xmlObj.rss;
  if (!rss || !rss.channel || !rss.channel.length) {
    return callback(new Error('Feed not recognized as RSS 2: missing channel'));
  }
  var channel = rss.channel[0];
  var hasItunes = Boolean(rss.$ && rss.$['xmlns:itunes']);
  return parseRSS(channel, channel.item, options, function(err, data) {
    if (err) return callback(err);
    if (hasItunes) {
      decorateItunes(data, channel);
    }
    callback(null, data);
  });
};
/**
 * Shared RSS 1.0 / 2.x conversion: copy channel fields onto `feed` and build
 * one entry per item.
 *
 * Per item, on top of the plain field copy: `enclosure` becomes the
 * enclosure's attribute map, `content`/`contentSnippet` come from
 * <description>, `guid` is unwrapped to its text, `categories` is the raw
 * category list, and `isoDate` is derived from `pubDate` (or `date`) when it
 * parses as a date.
 *
 * The caller's `options` object is only read, never modified.
 *
 * @param {object} channel parsed <channel> node
 * @param {Array} [items] parsed <item> nodes; treated as empty when missing
 * @param {object} options parser options; `customFields.item` and
 *   `customFields.feed` add extra fields to copy
 * @param {function} callback called synchronously as callback(null, json)
 */
var parseRSS = function(channel, items, options, callback) {
  items = items || [];
  var customFields = options.customFields || {};
  var itemFields = ITEM_FIELDS.concat(customFields.item || []);
  var feedFields = FEED_FIELDS.concat(customFields.feed || []);
  var json = {feed: {entries: []}};

  // An <atom:link> without attributes has no `$`; skip it rather than throw.
  var atomLink = channel['atom:link'] && channel['atom:link'][0];
  if (atomLink && atomLink.$) json.feed.feedUrl = atomLink.$.href;

  copyFromXML(channel, json.feed, feedFields);

  items.forEach(function(item) {
    var entry = {};
    copyFromXML(item, entry, itemFields);
    if (item.enclosure) {
      entry.enclosure = item.enclosure[0].$;
    }
    if (item.description) {
      entry.content = getContent(item.description[0]);
      entry.contentSnippet = getSnippet(entry.content);
    }
    if (item.guid) {
      entry.guid = item.guid[0];
      if (entry.guid._) entry.guid = entry.guid._;
    }
    if (item.category) entry.categories = item.category;
    var date = entry.pubDate || entry.date;
    if (date) {
      try {
        entry.isoDate = new Date(date.trim()).toISOString();
      } catch (e) {
        // Deliberate best effort: a date that is not a string or does not
        // parse leaves isoDate unset; the raw value stays on the entry.
      }
    }
    json.feed.entries.push(entry);
  });

  callback(null, json);
};
/**
 * Copy the first value of each listed field from a parsed XML node onto a
 * destination object.
 *
 * @param {object} xml parsed node whose properties are arrays of values
 * @param {object} dest object that receives the copied values (mutated)
 * @param {Array} fields field specs: a string (same key on both sides) or a
 *   [from, to] pair (read `from` from the node, write `to` on dest)
 */
var copyFromXML = function(xml, dest, fields) {
  fields.forEach(function(spec) {
    var pair = Array.isArray(spec) ? spec : [spec, spec];
    var values = xml[pair[0]];
    if (values === undefined) return;
    dest[pair[1]] = values[0];
  });
};
/**
 * Add iTunes-specific fields from the parsed XML to the extracted JSON.
 *
 * Feed level: `json.feed.itunes` receives `image` (href of <itunes:image>),
 * `owner` ({name, email} from <itunes:owner>) and the plain podcast fields.
 * The image is read independently of the owner, so a feed that has an image
 * but no owner element still gets `itunes.image`.
 *
 * Item level: each entry gets an `itunes` object with the plain podcast item
 * fields; `itunes.image` is replaced by the image href when one is present.
 * Entries are matched to items by position.
 *
 * @param {object} json extracted feed, mutated in place
 * @param {object} channel parsed <channel> node
 */
var decorateItunes = function decorateItunes(json, channel) {
  var items = channel.item || [];
  json.feed.itunes = {};

  var feedImage = channel['itunes:image'];
  if (feedImage && feedImage[0] && feedImage[0].$ && feedImage[0].$.href) {
    json.feed.itunes.image = feedImage[0].$.href;
  }

  if (channel['itunes:owner']) {
    var ownerNode = channel['itunes:owner'][0] || {};
    var owner = {};
    if (ownerNode['itunes:name']) {
      owner.name = ownerNode['itunes:name'][0];
    }
    if (ownerNode['itunes:email']) {
      owner.email = ownerNode['itunes:email'][0];
    }
    json.feed.itunes.owner = owner;
  }

  copyFromXML(channel, json.feed.itunes, PODCAST_FEED_FIELDS);

  items.forEach(function(item, index) {
    var entry = json.feed.entries[index];
    entry.itunes = {};
    copyFromXML(item, entry.itunes, PODCAST_ITEM_FIELDS);
    // The field copy stores the raw image node; prefer its href when present.
    var image = item['itunes:image'];
    if (image && image[0] && image[0].$ && image[0].$.href) {
      entry.itunes.image = image[0].$.href;
    }
  });
};
/**
 * Parse a feed from an XML string.
 *
 * The document type is picked from the parsed root: `feed` is handled as
 * Atom, `rss` with a version attribute starting with "2" as RSS 2, and
 * `rdf:RDF` as RSS 1. Anything else, including input that parses to nothing,
 * is reported through the callback as an error instead of throwing from
 * inside the XML parser's callback.
 *
 * @param {string} xml feed document
 * @param {object} [options] parser options; may be omitted
 * @param {function} callback callback(err, json)
 */
Parser.parseString = function(xml, options, callback) {
  if (!callback) {
    callback = options;
    options = {};
  }
  options = options || {};
  XML2JS.parseString(xml, function(err, result) {
    if (err) return callback(err);
    if (!result) return callback(new Error('Unable to parse XML.'));
    if (result.feed) {
      return parseAtomFeed(result, options, callback);
    }
    // An <rss> element without attributes has no `$`.
    var version = result.rss && result.rss.$ && result.rss.$.version;
    if (version && version.indexOf('2') === 0) {
      return parseRSS2(result, options, callback);
    }
    if (result['rdf:RDF']) {
      return parseRSS1(result, options, callback);
    }
    return callback(new Error('Feed not recognized as Atom, RSS 1 or RSS 2.'));
  });
};
/**
 * Download a feed over HTTP(S) and parse it.
 *
 * Redirects (3xx with a Location header) are followed up to
 * `options.maxRedirects` times (default 1; 0 turns a redirect into a
 * "Status code" error). Relative Location values are resolved against the
 * URL that produced them. Any other response with a status of 300 or above
 * is reported as a "Status code" error rather than being parsed as XML.
 *
 * The caller's `options` object is not modified: redirect bookkeeping lives
 * on a private shallow copy, so one options object can be reused for many
 * calls.
 *
 * @param {string} feedUrl absolute http: or https: URL of the feed
 * @param {object} [options] parser options plus `maxRedirects`; may be omitted
 * @param {function} callback callback(err, json), invoked at most once
 */
Parser.parseURL = function(feedUrl, options, callback) {
  if (!callback) {
    callback = options;
    options = {};
  }
  var opts = {};
  Object.keys(options || {}).forEach(function(key) {
    opts[key] = options[key];
  });
  opts.__redirectCount = opts.__redirectCount || 0;
  if (opts.maxRedirects === undefined) opts.maxRedirects = 1;

  // A request error and a response event could otherwise both report.
  var finished = false;
  var done = function(err, data) {
    if (finished) return;
    finished = true;
    callback(err, data);
  };
  // Discard an unread body so the connection can be released.
  var discard = function(res) {
    if (typeof res.resume === 'function') res.resume();
  };

  var xml = '';
  var parsedUrl = url.parse(feedUrl);
  // url.parse lower-cases the protocol, so this also handles "HTTPS://...".
  var get = parsedUrl.protocol === 'https:' ? HTTPS.get : HTTP.get;
  var req = get({
    auth: parsedUrl.auth,
    protocol: parsedUrl.protocol,
    hostname: parsedUrl.hostname,
    port: parsedUrl.port,
    path: parsedUrl.path,
    headers: {'User-Agent': 'rss-parser'}
  }, function(res) {
    var status = res.statusCode;
    var location = res.headers['location'];
    if (status >= 300 && status < 400 && location) {
      discard(res);
      if (opts.maxRedirects === 0) return done(new Error("Status code " + status));
      if (opts.__redirectCount === opts.maxRedirects) return done(new Error("Too many redirects"));
      opts.__redirectCount++;
      return Parser.parseURL(url.resolve(feedUrl, location), opts, done);
    }
    if (status >= 300) {
      discard(res);
      return done(new Error("Status code " + status));
    }
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
      xml += chunk;
    });
    res.on('error', done);
    res.on('end', function() {
      return Parser.parseString(xml, opts, done);
    });
  });
  req.on('error', done);
};