UNPKG

gurkha

Version:
257 lines (233 loc) 7.32 kB
'use strict'; var cheerio = require('cheerio'); function gurkha (schema, options) { /*jshint validthis: true */ var cheerioOpts, externalVars; if (typeof (schema) !== 'object' && typeof (schema) !== 'string') { throw new Error('Illegal argument: constructor must receive a schema' + 'object, string or array'); } if (options !== undefined) { if (typeof (options) !== 'object') { throw new Error('Illegal argument: if options are present, they must be an object.'); } cheerioOpts = options.options; externalVars = options.params; } this._schema = schema; this._options = cheerioOpts || {}; this._extvars = externalVars || {}; } // reserved object members gurkha.prototype._reserved = { '$sanitizer': true, '$rule': true, '$topLevel': true, '$post': true, '$ignore': true, '$constant': true }; // traverses the schema recursively in order to build the object gurkha.prototype._parse = function ($currentElement, sch, sanitizer) { var _this = this; if (sch instanceof Array) { return _this._parseArray($currentElement, sch, sanitizer); } else if (typeof (sch) === 'object' && sch !== null) { return _this._parseObject($currentElement, sch, sanitizer); } else if (typeof (sch) === 'string') { return _this._parseString($currentElement, sch, sanitizer); } else { throw new Error('Illegal argument: schema values must be object, string or array. Got ' + sch); } }; gurkha.prototype._parseArray = function ($currentElement, sch, sanitizer) { var _this = this; var $ = _this._$; var resultArray = []; var i; if (!$currentElement) { $currentElement = $(_this._html); } for (i = 0; i < sch.length; i += 1) { var value = sch[i]; resultArray.push(_this._parse($currentElement, value, sanitizer)); } if (resultArray.length === 1) { resultArray._ignore = true; } return resultArray; }; gurkha.prototype._parseObject = function ($currentElement, sch, sanitizer) { var _this = this; var $ = _this._$; var rule = sch.$rule; var resultArray = []; // options var topLevel = sch.$topLevel; var post = sch.$post; var ignore = sch.$ignore || function () { return false; }; var constant = sch.$constant; // ignore everything else in the object if a constant is specified if (constant !== undefined) { return [constant]; } if (rule) { if (typeof (rule) !== 'string') { throw new Error('Illegal type: Rules must be in String format'); } // $currentElement is null if the schema object is not nested, meaning the rule should select from the entire DOM // topLevel indicates that the rule should select from the entire DOM, // ignoring any previous rules in outer schema objects if (!$currentElement || topLevel) { $currentElement = $(rule); } else { $currentElement = $($currentElement).find(rule); } // build object for each element selected by the rule $currentElement.each(function (index, el) { var $el = $(el); // only build the object if the element doesn't meet the schema's ignore criteria if (typeof (ignore) !== 'function') { throw new Error('Illegal type: Filters must be in function format'); } if (!ignore($el, _this._extvars)) { resultArray.push(_this._build($el, sch, sanitizer)); } }); // no basic rule specified } else { if (!$currentElement || topLevel) { $currentElement = $(_this._html); } // if there is no rule we build only one object resultArray.push(_this._build($currentElement, sch, sanitizer)); } // post-processing if (post) { if (typeof (post) !== 'function') { throw new Error('Illegal type: Post-processing functions must be in function format'); } else { return resultArray.map(function (result) { // we must flatten the object in order for the post-processing function to work properly return post(_this._flatten2(result), _this._extvars); }); } } else { return resultArray; } }; gurkha.prototype._parseString = function ($currentElement, sch, sanitizer) { var _this = this; var $ = _this._$; var resultArray = []; var $subElement; // a single string is a rule, so we select the elements that match it if (!$currentElement) { $subElement = $(sch); } else { $subElement = $currentElement.find(sch); } // push result for each element selected by the rule $subElement.each(function (index, el) { var $el = $(el); if (!sanitizer) { resultArray.push($el.text()); } else { resultArray.push(sanitizer($el, _this._extvars)); } }); return resultArray; }; // auxiliary function to build the object gurkha.prototype._build = function ($el, sch, sanitizer) { var _this = this; var value; var result = {}; var keyCount = 0; // override the previous sanitizer if a new one exists in the object sanitizer = sch.$sanitizer || sanitizer; if (sanitizer) { if (typeof (sanitizer) !== 'function') { throw new Error('Illegal type: Sanitizers must be in function format'); } } for (var key in sch) { // skip reserved keys if (_this._reserved[key] || !key) { continue; } else { keyCount += 1; value = sch[key]; // unreserved object members must ignore previous sanitizer functions result[key] = _this._parse($el, value, sch.$sanitizer); } } // if the object has no members other than the reserved ones, // return the result of the selection rather than an object if (keyCount === 0) { if (sanitizer) { return sanitizer($el, _this._extvars); } else { return $el.text(); } } else { return result; } }; // wrapper function to avoid flattening the outer array gurkha.prototype._flatten = function (val) { var array = []; var i; for (i = 0; i < val.length; i += 1) { array.push(this._flatten2(val[i])); } return array; }; // flatten any inner arrays with only one value gurkha.prototype._flatten2 = function (val) { var array = []; // flatten recursively if (val instanceof Array) { // if the array is flagged to be ignored we don't flatten it if (val.length === 1 && !val._ignore) { return this._flatten2(val[0]); } else { var i; // if the array was flagged to be ignored, we clone it to get rid of the property without using delete if (val._ignore) { val = [this._flatten2(val[0])]; } for (i = 0; i < val.length; i += 1) { array.push(this._flatten2(val[i])); } return array; } } else { var result = {}; // flatten recursively if (typeof (val) === 'object') { for (var key in val) { if (!key) { continue; } else { result[key] = this._flatten2(val[key]); } } return result; // single values are returned as-is } else { return val; } } }; // exposed parsing function, wrapper for _parse gurkha.prototype.parse = function (html, params) { this._$ = cheerio.load(html, this._options); this._html = html; // override parameters if (params) { this._extvars = params; } return this._flatten(this._parse(null, this._schema)); }; module.exports = gurkha;