opmlparser
Version:
OPML parsing using sax js
331 lines (299 loc) • 10.8 kB
JavaScript
/**********************************************************************
node-opmlparser - OPML parser for node.
http://github.com/danmactough/node-opmlparser
Copyright (c) 2011 Dan MacTough
http://yabfog.com
**********************************************************************/
/*jshint sub:true, laxcomma:true */
/**
* Module dependencies.
*/
var sax = require('sax')
, util = require('util')
, TransformStream = require('readable-stream').Transform
, utils = require('./utils')
;
/**
* OpmlParser constructor. Most apps will only use one instance.
*
* @api public
*/
function OpmlParser (options) {
if (!(this instanceof OpmlParser)) return new OpmlParser(options);
TransformStream.call(this);
this._readableState.objectMode = true;
this._readableState.highWaterMark = 16; // max. # of output nodes buffered
this.init();
this.parseOpts(options);
// See https://github.com/isaacs/sax-js for more info
this.stream = sax.createStream(this.options.strict /* strict mode - no by default */, {lowercase: true, xmlns: false });
this.stream.on('error', this.handleSaxError.bind(this));
this.stream.on('processinginstruction', this.handleProcessingInstruction.bind(this));
this.stream.on('opentag', this.handleOpenTag.bind(this));
this.stream.on('closetag',this.handleCloseTag.bind(this));
this.stream.on('text', this.handleText.bind(this));
this.stream.on('cdata', this.handleText.bind(this));
this.stream.on('end', this.handleEnd.bind(this));
}
util.inherits(OpmlParser, TransformStream);
/*
* Initializes the SAX stream
*
* Initializes the class-variables
*/
OpmlParser.prototype.init = function (){
this.meta = {
'#ns': []
, '@': []
, '#xml': {}
};
this.stack = [];
this.xmlbase = [];
this.errors = [];
this.counter = 0;
};
/*
* Parse options
*/
OpmlParser.prototype.parseOpts = function (options) {
this.options = options || {};
if (!('strict' in this.options)) this.options.strict = false;
if (!('resume_saxerror' in this.options)) this.options.resume_saxerror = true;
if ('MAX_BUFFER_LENGTH' in this.options) {
sax.MAX_BUFFER_LENGTH = this.options.MAX_BUFFER_LENGTH; // set to Infinity to have unlimited buffers
} else {
sax.MAX_BUFFER_LENGTH = 16 * 1024 * 1024; // 16M versus the 64K default
}
if (this.options.opmlurl) this.xmlbase.unshift({ '#name': 'xml', '#': this.options.opmlurl});
};
OpmlParser.prototype.handleEnd = function () {
// We made it to the end without throwing, but let's make sure we were actually
// parsing a feed
if (!(this.meta && this.meta['#type'] === 'opml')) {
var e = new Error('Not an outline');
return this.handleError(e);
}
this.push(null);
};
OpmlParser.prototype.handleSaxError = function (e) {
this.emit('error', e);
if (this.options.resume_saxerror) {
this.resumeSaxError();
}
};
OpmlParser.prototype.resumeSaxError = function () {
if (this.stream._parser) {
this.stream._parser.error = null;
this.stream._parser.resume();
}
};
OpmlParser.prototype.handleError = function (e) {
this.emit('error', e);
};
OpmlParser.prototype.handleProcessingInstruction = function (node) {
if (node.name !== 'xml') return;
this.meta['#xml'] = node.body.trim().split(' ').reduce(function (map, attr) {
var parts = attr.split('=');
map[parts[0]] = parts[1] && parts[1].length > 2 && parts[1].match(/^.(.*?).$/)[1];
return map;
}, {});
};
OpmlParser.prototype.handleOpenTag = function (node) {
// First, update the current xml:base so that URI resolutions are correct
this.joinXmlBase(node);
var n = {
'#name': node.name, // Avoid namespace collissions later...
'#prefix': node.prefix, // The current ns prefix
'#local': node.local, // The current element name, sans prefix
'#uri': node.uri, // The current ns uri
'#type': utils.nslookup(node.uri, 'opml') || utils.nslookup(node.uri, undefined) ? 'opml' : null, // make it easier to check if this is a standard opml node
'@': this.handleAttributes(node),
'#': ''
};
// Handle root node
if (this.stack.length === 0 &&
(n['#name'] === 'opml' || (n['#local'] === 'opml' && n['#type'] === 'opml'))) {
this.meta['#type'] = 'opml';
this.meta['#ns'] = {};
this.meta['@'] = {};
Object.keys(n['@']).forEach(function (name) {
if (name.indexOf('xmlns') === 0) {
this.meta['#ns'][name] = n['@'][name];
} else if (name !== 'version') {
this.meta['@'][name] = n['@'][name];
}
}, this);
this.meta['#version'] = n['@']['version'] || '1.1';
}
// Track the outline count so we can later walk the tree
if (n['#name'] === 'outline' || (n['#local'] === 'outline' && n['#type'] === 'opml')) {
n['#isoutline'] = true;
n['@']['#id'] = ++this.counter;
n['@']['#parentid'] = this.getParentId();
if ('category' in n['@']) n['@']['categories'] = this.getCategories(n['@']);
if ('xmlurl' in n['@'] || n['@']['type'] === 'rss') { // a feed is found
n['@']['#type'] = 'feed';
n['@']['folder'] = this.getFolderName(this.stack[0]);
}
}
this.stack.unshift(n);
};
OpmlParser.prototype.handleCloseTag = function (el) {
var n = this.stack.shift()
, baseurl
, stdEl;
if (this.xmlbase && this.xmlbase.length) {
baseurl = this.xmlbase[0]['#'];
if (n['#name'] === this.xmlbase[0]['#name']) {
void this.xmlbase.shift();
}
}
// Normalize the text node
if ('#' in n) {
if (n['#'].match(/^\s*$/)) {
// Delete text nodes with nothing by whitespace
delete n['#'];
} else {
n['#'] = n['#'].trim();
// If this is a bare text node with no attributes, set the property value
// as the value of the text node, unless it's an outline element
if (!n['#isoutline'] && Object.keys(n).filter(function (key) { return key !== '@' || !Object.keys(n[key]).length; }).length === 1) {
n = n['#'];
// I'm through with this guy...
return;
}
}
}
if (n['#isoutline']) { // We have an outline node
if (!this.meta.title && this.stack[1] && this.stack[1].head) { // We haven't yet parsed all the metadata
utils.merge(this.meta, this.handleMeta(this.stack[1].head), true);
}
if (!baseurl && this.xmlbase && this.xmlbase.length) { // handleMeta was able to infer a baseurl without xml:base or options.feedurl
n = utils.reresolve(n, this.xmlbase[0]['#']);
}
// All the information in a outline element is in the attributes.
this.push(n['@']);
} else if ((n['#name'] === 'head' ||
(n['#local'] === 'head' && n['#type'] === 'opml')) &&
!this.meta.title) { // We haven't yet parsed all the metadata
utils.merge(this.meta, this.handleMeta(n), true);
}
if (this.stack.length > 0) {
if (n['#prefix'] && n['#local'] && !n['#type']) {
stdEl = n['#prefix'] + ':' + n['#local'];
} else if (n['#name'] && n['#type'] && n['#type'] !== this.meta['#type']) {
stdEl = n['#name'];
} else {
stdEl = n['#local'] || n['#name'];
}
if (!this.stack[0].hasOwnProperty(stdEl)) {
this.stack[0][stdEl] = n;
} else if (this.stack[0][stdEl] instanceof Array) {
this.stack[0][stdEl].push(n);
} else {
this.stack[0][stdEl] = [this.stack[0][stdEl], n];
}
}
};
OpmlParser.prototype.handleText = function (text) {
if (this.stack.length) {
if (this.stack[0] && '#' in this.stack[0]) {
this.stack[0]['#'] += text;
} else {
this.stack[0]['#'] = text;
}
}
};
OpmlParser.prototype.joinXmlBase = function (node) {
if ('xml:base' in node.attributes) {
if (this.xmlbase.length) {
node.attributes['xml:base'] = utils.resolve(this.xmlbase[0]['#'], node.attributes['xml:base'].trim());
}
this.xmlbase.unshift({ '#name': node.name, '#': node.attributes['xml:base']});
}
};
OpmlParser.prototype.handleAttributes = function (node) {
var attrs = {}
, names = Object.keys(node.attributes);
if (names.length) {
names.forEach(function (name) {
if (this.xmlbase.length && (name === 'href' || name === 'src')) {
// Apply xml:base to these elements as they appear
// rather than leaving it to the ultimate parser
attrs[name] = node.attributes[name] ? utils.resolve(this.xmlbase[0]['#'], node.attributes[name].trim()) : '';
}
else {
attrs[name] = node.attributes[name] ? node.attributes[name].trim() : '';
}
}, this);
}
return attrs;
};
OpmlParser.prototype.handleMeta = function (node) {
if (!node) return {};
var meta = {};
// Set all the meta keys to null
['title', 'datecreated', 'datemodified', 'ownername', 'owneremail', 'owneremail', 'docs', 'expansionstate', 'vertscrollstate', 'windowtop', 'windowleft', 'windowbottom', 'windowright'].forEach(function (property){
meta[property] = null;
});
Object.keys(node).forEach(function(name){
var el = node[name];
switch(name){
case('title'):
meta.title = utils.get(el);
break;
case('datecreated'):
case('datemodified'):
meta[name] = utils.get(el) ? new Date(el['#']) : null;
break;
case('ownername'):
case('ownerid'):
case('docs'):
case('expansionstate'):
case('vertscrollstate'):
case('windowtop'):
case('windowleft'):
case('windowbottom'):
case('windowright'):
meta[name] = utils.get(el);
break;
}
// Fill with all native other namespaced properties
if (name.indexOf('#') !== 0) {
if (~name.indexOf(':')) meta[name] = el;
else if (!(name in meta)) {
// allow arbitrary elements in head and assume the #text is the value
meta[name] = utils.get(el);
}
}
// if (name.indexOf('#') !== 0 && ~name.indexOf(':')) meta[name] = el;
});
return meta;
};
OpmlParser.prototype.getParentId = function () {
var parent = this.stack.length && this.stack[0];
return ((parent && (parent['#name'] === 'outline' || (parent['#local'] === 'outline' && utils.nslookup([parent['#uri']], 'opml')))) ?
parent['@']['#id'] :
0);
};
OpmlParser.prototype.getFolderName = function (node) {
if (!node) return '';
if (utils.get(node, '#name') == 'outline' && utils.get(node, '@') && utils.get(node['@'], 'text'))
return utils.get(node['@'], 'text');
else
return '';
};
OpmlParser.prototype.getCategories = function (node) {
if (!node || !('category' in node)) return [];
else return utils.unique(utils.get(node, 'category').split(',').map(function (cat){ return cat.trim(); }));
};
// Naive Stream API
OpmlParser.prototype._transform = function (data, encoding, done) {
this.stream.write(data);
done();
};
OpmlParser.prototype._flush = function (done) {
this.stream.end();
done();
};
exports = module.exports = OpmlParser;