UNPKG

artoo-js

Version:

The client-side scraping companion.

1,843 lines (1,528 loc) 42.9 kB
;(function(undefined) { 'use strict'; /** * artoo core * =========== * * The main artoo namespace and its vital properties. */ // Checking whether a body exists var body; if ('document' in this) { body = document.getElementsByTagName('body')[0]; if (!body) { body = document.createElement('body'); document.documentElement.appendChild(body); } } // Main object var artoo = { // Standard properties $: {}, jquery: { applyPlugins: function() { artoo.jquery.plugins.map(function(p) { p(artoo.$); }); }, plugins: [] }, mountNode: body, stylesheets: {}, templates: {}, // Emitter shim properties _enabled: true, _children: [], _handlers: {}, _handlersAll: [] }; // Non-writable version Object.defineProperty(artoo, 'version', { value: '0.4.4' }); // Exporting to global scope this.artoo = artoo; }).call(this); /** * artoo node.js shim * =================== * * Make it possible to require artoo through node. */ var artoo = this.artoo; (function() { 'use strict'; /** * Here is the list of every allowed parameter when using Emitter#on: * @type {Object} */ var __allowedOptions = { once: 'boolean', scope: 'object' }; /** * The emitter's constructor. It initializes the handlers-per-events store and * the global handlers store. * * Emitters are useful for non-DOM events communication. Read its methods * documentation for more information about how it works. * * @return {Emitter} The fresh new instance. */ var Emitter = function() { this._enabled = true; this._children = []; this._handlers = {}; this._handlersAll = []; }; /** * This method binds one or more functions to the emitter, handled to one or a * suite of events. So, these functions will be executed anytime one related * event is emitted. * * It is also possible to bind a function to any emitted event by not * specifying any event to bind the function to. * * Recognized options: * ******************* * - {?boolean} once If true, the handlers will be unbound after the first * execution. Default value: false. * - {?object} scope If a scope is given, then the listeners will be called * with this scope as "this". * * Variant 1: * ********** * > myEmitter.on('myEvent', function(e) { console.log(e); }); * > // Or: * > myEmitter.on('myEvent', function(e) { console.log(e); }, { once: true }); * * @param {string} event The event to listen to. * @param {function} handler The function to bind. * @param {?object} options Eventually some options. * @return {Emitter} Returns this. * * Variant 2: * ********** * > myEmitter.on( * > ['myEvent1', 'myEvent2'], * > function(e) { console.log(e); } * >); * > // Or: * > myEmitter.on( * > ['myEvent1', 'myEvent2'], * > function(e) { console.log(e); } * > { once: true }} * >); * * @param {array} events The events to listen to. * @param {function} handler The function to bind. * @param {?object} options Eventually some options. * @return {Emitter} Returns this. * * Variant 3: * ********** * > myEmitter.on({ * > myEvent1: function(e) { console.log(e); }, * > myEvent2: function(e) { console.log(e); } * > }); * > // Or: * > myEmitter.on({ * > myEvent1: function(e) { console.log(e); }, * > myEvent2: function(e) { console.log(e); } * > }, { once: true }); * * @param {object} bindings An object containing pairs event / function. * @param {?object} options Eventually some options. * @return {Emitter} Returns this. * * Variant 4: * ********** * > myEmitter.on(function(e) { console.log(e); }); * > // Or: * > myEmitter.on(function(e) { console.log(e); }, { once: true}); * * @param {function} handler The function to bind to every events. * @param {?object} options Eventually some options. * @return {Emitter} Returns this. */ Emitter.prototype.on = function(a, b, c) { var i, l, k, event, eArray, bindingObject; // Variant 1 and 2: if (typeof b === 'function') { eArray = typeof a === 'string' ? [a] : a; for (i = 0, l = eArray.length; i !== l; i += 1) { event = eArray[i]; // Check that event is not '': if (!event) continue; if (!this._handlers[event]) this._handlers[event] = []; bindingObject = { handler: b }; for (k in c || {}) if (__allowedOptions[k]) bindingObject[k] = c[k]; else throw new Error( 'The option "' + k + '" is not recognized by Emmett.' ); this._handlers[event].push(bindingObject); } // Variant 3: } else if (a && typeof a === 'object' && !Array.isArray(a)) for (event in a) Emitter.prototype.on.call(this, event, a[event], b); // Variant 4: else if (typeof a === 'function') { bindingObject = { handler: a }; for (k in c || {}) if (__allowedOptions[k]) bindingObject[k] = c[k]; else throw new Error( 'The option "' + k + '" is not recognized by Emmett.' ); this._handlersAll.push(bindingObject); } // No matching variant: else throw new Error('Wrong arguments.'); return this; }; /** * This method works exactly as the previous #on, but will add an options * object if none is given, and set the option "once" to true. * * The polymorphism works exactly as with the #on method. */ Emitter.prototype.once = function(a, b, c) { // Variant 1 and 2: if (typeof b === 'function') { c = c || {}; c.once = true; this.on(a, b, c); // Variants 3 and 4: } else if ( // Variant 3: (a && typeof a === 'object' && !Array.isArray(a)) || // Variant 4: (typeof a === 'function') ) { b = b || {}; b.once = true; this.on(a, b); // No matching variant: } else throw new Error('Wrong arguments.'); return this; }; /** * This method unbinds one or more functions from events of the emitter. So, * these functions will no more be executed when the related events are * emitted. If the functions were not bound to the events, nothing will * happen, and no error will be thrown. * * Variant 1: * ********** * > myEmitter.off('myEvent', myHandler); * * @param {string} event The event to unbind the handler from. * @param {function} handler The function to unbind. * @return {Emitter} Returns this. * * Variant 2: * ********** * > myEmitter.off(['myEvent1', 'myEvent2'], myHandler); * * @param {array} events The events to unbind the handler from. * @param {function} handler The function to unbind. * @return {Emitter} Returns this. * * Variant 3: * ********** * > myEmitter.off({ * > myEvent1: myHandler1, * > myEvent2: myHandler2 * > }); * * @param {object} bindings An object containing pairs event / function. * @return {Emitter} Returns this. * * Variant 4: * ********** * > myEmitter.off(myHandler); * * @param {function} handler The function to unbind from every events. * @return {Emitter} Returns this. */ Emitter.prototype.off = function(events, handler) { var i, n, j, m, k, a, event, eArray = typeof events === 'string' ? [events] : events; if (arguments.length === 1 && typeof eArray === 'function') { handler = arguments[0]; // Handlers bound to events: for (k in this._handlers) { a = []; for (i = 0, n = this._handlers[k].length; i !== n; i += 1) if (this._handlers[k][i].handler !== handler) a.push(this._handlers[k][i]); this._handlers[k] = a; } a = []; for (i = 0, n = this._handlersAll.length; i !== n; i += 1) if (this._handlersAll[i].handler !== handler) a.push(this._handlersAll[i]); this._handlersAll = a; } else if (arguments.length === 2) { for (i = 0, n = eArray.length; i !== n; i += 1) { event = eArray[i]; if (this._handlers[event]) { a = []; for (j = 0, m = this._handlers[event].length; j !== m; j += 1) if (this._handlers[event][j].handler !== handler) a.push(this._handlers[event][j]); this._handlers[event] = a; } if (this._handlers[event] && this._handlers[event].length === 0) delete this._handlers[event]; } } return this; }; /** * This method unbinds every handlers attached to every or any events. So, * these functions will no more be executed when the related events are * emitted. If the functions were not bound to the events, nothing will * happen, and no error will be thrown. * * Usage: * ****** * > myEmitter.unbindAll(); * * @return {Emitter} Returns this. */ Emitter.prototype.unbindAll = function() { var k; this._handlersAll = []; for (k in this._handlers) delete this._handlers[k]; return this; }; /** * This method emits the specified event(s), and executes every handlers bound * to the event(s). * * Use cases: * ********** * > myEmitter.emit('myEvent'); * > myEmitter.emit('myEvent', myData); * > myEmitter.emit(['myEvent1', 'myEvent2']); * > myEmitter.emit(['myEvent1', 'myEvent2'], myData); * * @param {string|array} events The event(s) to emit. * @param {object?} data The data. * @return {Emitter} Returns this. */ Emitter.prototype.emit = function(events, data) { var i, n, j, m, z, a, event, child, handlers, eventName, self = this, eArray = typeof events === 'string' ? [events] : events; // Check that the emitter is enabled: if (!this._enabled) return this; data = data === undefined ? {} : data; for (i = 0, n = eArray.length; i !== n; i += 1) { eventName = eArray[i]; handlers = (this._handlers[eventName] || []).concat(this._handlersAll); if (handlers.length) { event = { type: eventName, data: data || {}, target: this }; a = []; for (j = 0, m = handlers.length; j !== m; j += 1) { // We have to verify that the handler still exists in the array, // as it might have been mutated already if ( ( this._handlers[eventName] && this._handlers[eventName].indexOf(handlers[j]) >= 0 ) || this._handlersAll.indexOf(handlers[j]) >= 0 ) { handlers[j].handler.call( 'scope' in handlers[j] ? handlers[j].scope : this, event ); // Since the listener callback can mutate the _handlers, // we register the handlers we want to remove, not the ones // we want to keep if (handlers[j].once) a.push(handlers[j]); } } // Go through handlers to remove for (z = 0; z < a.length; z++) { this._handlers[eventName].splice(a.indexOf(a[z]), 1); } } } // Events propagation: for (i = 0, n = this._children.length; i !== n; i += 1) { child = this._children[i]; child.emit.apply(child, arguments); } return this; }; /** * This method creates a new instance of Emitter and binds it as a child. Here * is what children do: * - When the parent emits an event, the children will emit the same later * - When a child is killed, it is automatically unreferenced from the parent * - When the parent is killed, all children will be killed as well * * @return {Emitter} Returns the fresh new child. */ Emitter.prototype.child = function() { var self = this, child = new Emitter(); child.on('emmett:kill', function() { if (self._children) for (var i = 0, l = self._children.length; i < l; i++) if (self._children[i] === child) { self._children.splice(i, 1); break; } }); this._children.push(child); return child; }; /** * This returns an array of handler functions corresponding to the given * event or every handler functions if an event were not to be given. * * @param {?string} event Name of the event. * @return {Emitter} Returns this. */ function mapHandlers(a) { var i, l, h = []; for (i = 0, l = a.length; i < l; i++) h.push(a[i].handler); return h; } Emitter.prototype.listeners = function(event) { var handlers = [], k, i, l; // If no event is passed, we return every handlers if (!event) { handlers = mapHandlers(this._handlersAll); for (k in this._handlers) handlers = handlers.concat(mapHandlers(this._handlers[k])); // Retrieving handlers per children for (i = 0, l = this._children.length; i < l; i++) handlers = handlers.concat(this._children[i].listeners()); } // Else we only retrieve the needed handlers else { handlers = mapHandlers(this._handlers[event]); // Retrieving handlers per children for (i = 0, l = this._children.length; i < l; i++) handlers = handlers.concat(this._children[i].listeners(event)); } return handlers; }; /** * This method will first dispatch a "emmett:kill" event, and then unbinds all * listeners and make it impossible to ever rebind any listener to any event. */ Emitter.prototype.kill = function() { this.emit('emmett:kill'); this.unbindAll(); this._handlers = null; this._handlersAll = null; this._enabled = false; if (this._children) for (var i = 0, l = this._children.length; i < l; i++) this._children[i].kill(); this._children = null; }; /** * This method disabled the emitter, which means its emit method will do * nothing. * * @return {Emitter} Returns this. */ Emitter.prototype.disable = function() { this._enabled = false; return this; }; /** * This method enables the emitter. * * @return {Emitter} Returns this. */ Emitter.prototype.enable = function() { this._enabled = true; return this; }; /** * Version: */ Emitter.version = '2.1.2'; // Export: artoo.emitter = Emitter; }).call(this); ;(function(undefined) { 'use strict'; /** * artoo Node.js utilities * ======================== * * Some useful utilities when using artoo.js within node. */ var cheerio = require('cheerio'), path = require('path'); // Setting initial context artoo.$ = cheerio.load(''); // Methods artoo.bootstrap = function(cheerioInstance) { ['scrape', 'scrapeOne', 'scrapeTable'].forEach(function(m) { cheerioInstance.prototype[m] = function() { return artoo[m].apply( artoo, [artoo.$(this)].concat(Array.prototype.slice.call(arguments))); }; }); }; artoo.bootstrap(cheerio); artoo.setContext = function($) { // Fixing context artoo.$ = $; }; // Giving paths to alternative lib versions so they can be used afterwards artoo.paths = { browser: path.join(__dirname, 'artoo.concat.js'), chrome: path.join(__dirname, 'artoo.chrome.js'), phantom: path.join(__dirname, 'artoo.phantom.js') }; }).call(this); ;(function(undefined) { 'use strict'; /** * artoo helpers * ============== * * Some useful helpers. */ var _root = this; // Extending Emmett Object.setPrototypeOf = Object.setPrototypeOf || function (obj, proto) { obj.__proto__ = proto; return obj; }; var ee = new artoo.emitter(); Object.setPrototypeOf(artoo, Object.getPrototypeOf(ee)); // Legacy support // TODO: drop this asap artoo.hooks = { trigger: function(name) { artoo.emit(name); } }; /** * Generic Helpers * ---------------- * * Some basic helpers from collection handling to type checking. */ // Useless function function noop() {} // Recursively extend objects function extend() { var i, k, res = {}, l = arguments.length; for (i = l - 1; i >= 0; i--) for (k in arguments[i]) if (res[k] && isPlainObject(arguments[i][k])) res[k] = extend(arguments[i][k], res[k]); else res[k] = arguments[i][k]; return res; } // Is the var an array? function isArray(v) { return v instanceof Array; } // Is the var an object? function isObject(v) { return v instanceof Object; } // Is the var a real NaN function isRealNaN(v) { return isNaN(v) && (typeof v === 'number'); } // Is the var a plain object? function isPlainObject(v) { return v instanceof Object && !(v instanceof Array) && !(v instanceof Function); } // Is a var non primitive? function isNonPrimitive(v) { return isPlainObject(v) || isArray(v); } // Is a var primitive? function isPrimitive(v) { return !isNonScalar(v); } // Get first item of array returning true to given function function first(a, fn, scope) { for (var i = 0, l = a.length; i < l; i++) { if (fn.call(scope || null, a[i])) return a[i]; } return; } // Get the index of an element in an array by function function indexOf(a, fn, scope) { for (var i = 0, l = a.length; i < l; i++) { if (fn.call(scope || null, a[i])) return i; } return -1; } // Retrieve a file extenstion from filename or url function getExtension(url) { var a = url.split('.'); if (a.length === 1 || (a[0] === '' && a.length === 2)) return ''; return a.pop(); } /** * Document Helpers * ----------------- * * Functions to deal with DOM selection and the current document. */ // Checking whether a variable is a jQuery selector function isSelector(v) { return (artoo.$ && v instanceof artoo.$) || (jQuery && v instanceof jQuery) || ($ && v instanceof $); } // Checking whether a variable is a DOM document function isDocument(v) { return v instanceof HTMLDocument || v instanceof XMLDocument; } // Get either string or document and return valid jQuery selection function jquerify(v) { var $ = artoo.$; if (isDocument(v)) return $(v); return $('<div />').append(v); } // Creating an HTML or XML document function createDocument(root, namespace) { if (!root) return document.implementation.createHTMLDocument(); else return document.implementation.createDocument( namespace || null, root, null ); } // Loading an external file the same way the browser would load it from page function getScript(url, async, cb) { if (typeof async === 'function') { cb = async; async = false; } var el = document.createElement('script'); // Script attributes el.type = 'text/javascript'; el.src = url; // Should the script be loaded asynchronously? if (async) el.async = true; // Defining callbacks el.onload = el.onreadystatechange = function() { if ((!this.readyState || this.readyState == 'loaded' || this.readyState == 'complete')) { el.onload = el.onreadystatechange = null; // Removing element from head artoo.mountNode.removeChild(el); if (typeof cb === 'function') cb(); } }; // Appending the script to head artoo.mountNode.appendChild(el); } // Loading an external stylesheet function getStylesheet(data, isUrl, cb) { var el = document.createElement(isUrl ? 'link' : 'style'), head = document.getElementsByTagName('head')[0]; el.type = 'text/css'; if (isUrl) { el.href = data; el.rel = 'stylesheet'; // Waiting for script to load el.onload = el.onreadystatechange = function() { if ((!this.readyState || this.readyState == 'loaded' || this.readyState == 'complete')) { el.onload = el.onreadystatechange = null; if (typeof cb === 'function') cb(); } }; } else { el.innerHTML = data; } // Appending the stylesheet to head head.appendChild(el); } var globalsBlackList = [ '__commandLineAPI', 'applicationCache', 'chrome', 'closed', 'console', 'crypto', 'CSS', 'defaultstatus', 'defaultStatus', 'devicePixelRatio', 'document', 'external', 'frameElement', 'history', 'indexedDB', 'innerHeight', 'innerWidth', 'length', 'localStorage', 'location', 'name', 'offscreenBuffering', 'opener', 'outerHeight', 'outerWidth', 'pageXOffset', 'pageYOffset', 'performance', 'screen', 'screenLeft', 'screenTop', 'screenX', 'screenY', 'scrollX', 'scrollY', 'sessionStorage', 'speechSynthesis', 'status', 'styleMedia' ]; function getGlobalVariables() { var p = Object.getPrototypeOf(_root), o = {}, i; for (i in _root) if (!~i.indexOf('webkit') && !(i in p) && _root[i] !== _root && !(_root[i] instanceof BarProp) && !(_root[i] instanceof Navigator) && !~globalsBlackList.indexOf(i)) o[i] = _root[i]; return o; } /** * Async Helpers * -------------- * * Some helpful functions to deal with asynchronous matters. */ // Waiting for something to happen function waitFor(check, cb, params) { params = params || {}; if (typeof cb === 'object') { params = cb; cb = params.done; } var milliseconds = params.interval || 30, j = 0; var i = setInterval(function() { if (check()) { clearInterval(i); cb(null); } if (params.timeout && params.timeout - (j * milliseconds) <= 0) { clearInterval(i); cb(new Error('timeout')); } j++; }, milliseconds); } // Dispatch asynchronous function function async() { var args = Array.prototype.slice.call(arguments); return setTimeout.apply(null, [args[0], 0].concat(args.slice(1))); } // Launching tasks in parallel with an optional limit function parallel(tasks, params, last) { var onEnd = (typeof params === 'function') ? params : params.done || last, running = [], results = [], d = 0, t, l, i; if (typeof onEnd !== 'function') onEnd = noop; function cleanup() { running.forEach(function(r) { clearTimeout(r); }); } function onTaskEnd(err, result) { // Adding results to accumulator results.push(result); if (err) { cleanup(); return onEnd(err, results); } if (++d >= tasks.length) { // Parallel action is finished, returning return onEnd(null, results); } // Adding on stack t = tasks[i++]; running.push(async(t, onTaskEnd)); } for (i = 0, l = params.limit || tasks.length; i < l; i++) { t = tasks[i]; // Dispatching the function asynchronously running.push(async(t, onTaskEnd)); } } /** * Monkey Patching * ---------------- * * Some monkey patching shortcuts. Useful for sniffers and overriding * native functions. */ function before(targetFunction, beforeFunction) { // Replacing the target function return function() { // Applying our function beforeFunction.apply(this, Array.prototype.slice.call(arguments)); // Applying the original function return targetFunction.apply(this, Array.prototype.slice.call(arguments)); }; } /** * Exportation * ------------ */ // Exporting to artoo root artoo.injectScript = function(url, cb) { getScript(url, cb); }; artoo.injectStyle = function(url, cb) { getStylesheet(url, true, cb); }; artoo.injectInlineStyle = function(text) { getStylesheet(text, false); }; artoo.waitFor = waitFor; artoo.getGlobalVariables = getGlobalVariables; // Exporting to artoo helpers artoo.helpers = { before: before, createDocument: createDocument, extend: extend, first: first, getExtension: getExtension, indexOf: indexOf, isArray: isArray, isDocument: isDocument, isObject: isObject, isPlainObject: isPlainObject, isRealNaN: isRealNaN, isSelector: isSelector, isNonPrimitive: isNonPrimitive, isPrimitive: isPrimitive, jquerify: jquerify, noop: noop, parallel: parallel }; }).call(this); ;(function(undefined) { 'use strict'; /** * artoo parsers * ============== * * Compilation of small parsers aim at understanding some popular web * string formats such as querystrings, headers etc. */ function parseQueryString(s) { var data = {}; s.split('&').forEach(function(item) { var pair = item.split('='); data[decodeURIComponent(pair[0])] = pair[1] ? decodeURIComponent(pair[1]) : true; }); return data; } function parseUrl(url) { var data = {href: url}; // Searching for a protocol var ps = url.split('://'); if (ps.length > 1) data.protocol = ps[0]; url = ps[ps.length > 1 ? 1 : 0]; // Searching for an authentification var a = url.split('@'); if (a.length > 1) { var as = a[0].split(':'); if (as.length > 1) { data.auth = { user: as[0], password: as[1] }; } else { data.auth = { user: as[0] }; } url = a[1]; } // Searching for origin var m = url.match(/([^\/:]+)(.*)/); data.host = m[1]; data.hostname = m[1]; if (m[2]) { var f = m[2].trim(); // Port if (f.charAt(0) === ':') { data.port = +f.match(/\d+/)[0]; data.host += ':' + data.port; } // Path data.path = '/' + f.split('/').slice(1).join('/'); data.pathname = data.path.split('?')[0].split('#')[0]; } // Tld if (~data.hostname.search('.')) { var ds = data.hostname.split('.'); // Check for IP if (!(ds.length === 4 && ds.every(function(i) { return !isNaN(+i); }))) { // Checking TLD-less urls if (ds.length > 1) { // TLD data.tld = ds[ds.length - 1]; // Domain data.domain = ds[ds.length - 2]; // Subdomains if (ds.length > 2) { data.subdomains = []; for (var i = 0, l = ds.length - 2; i < l; i++) data.subdomains.unshift(ds[i]); } } else { // TLD-less url data.domain = ds[0]; } } else { // This is an IP data.domain = data.hostname; } } // Hash var hs = url.split('#'); if (hs.length > 1) { data.hash = '#' + hs[1]; } // Querystring var qs = url.split('?'); if (qs.length > 1) { data.search = '?' + qs[1]; data.query = parseQueryString(qs[1]); } // Extension var ss = data.pathname.split('/'), es = ss[ss.length - 1].split('.'); if (es.length > 1) data.extension = es[es.length - 1]; return data; } function parseHeaders(headers) { var data = {}; headers.split('\n').filter(function(item) { return item.trim(); }).forEach(function(item) { if (item) { var pair = item.split(': '); data[pair[0]] = pair[1]; } }); return data; } function parseCookie(s) { var cookie = { httpOnly: false, secure: false }; if (!s.trim()) return; s.split('; ').forEach(function(item) { // Path if (~item.search(/path=/i)) { cookie.path = item.split('=')[1]; } else if (~item.search(/expires=/i)) { cookie.expires = item.split('=')[1]; } else if (~item.search(/httponly/i) && !~item.search('=')) { cookie.httpOnly = true; } else if (~item.search(/secure/i) && !~item.search('=')) { cookie.secure = true; } else { var is = item.split('='); cookie.key = is[0]; cookie.value = decodeURIComponent(is[1]); } }); return cookie; } function parseCookies(s) { var cookies = {}; if (!s.trim()) return cookies; s.split('; ').forEach(function(item) { var pair = item.split('='); cookies[pair[0]] = decodeURIComponent(pair[1]); }); return cookies; } /** * Exporting */ artoo.parsers = { cookie: parseCookie, cookies: parseCookies, headers: parseHeaders, queryString: parseQueryString, url: parseUrl }; }).call(this); ;(function(undefined) { 'use strict'; /** * artoo writers * ============== * * Compilation of writers for popular formats such as CSV or YAML. */ // Dependencies var isPlainObject = artoo.helpers.isPlainObject, isArray = artoo.helpers.isArray, isPrimitive = artoo.helpers.isPrimitive, isNonPrimitive = artoo.helpers.isNonPrimitive, isRealNaN = artoo.helpers.isRealNaN; /** * CSV * --- * * Converts an array of array or array of objects into a correct * CSV string for exports purposes. * * Exposes some handful options such as choice of delimiters or order * of keys to handle. */ // Convert an object into an array of its properties function objectToArray(o, order) { order = order || Object.keys(o); return order.map(function(k) { return o[k]; }); } // Retrieve an index of keys present in an array of objects function keysIndex(a) { var keys = [], l, k, i; for (i = 0, l = a.length; i < l; i++) for (k in a[i]) if (!~keys.indexOf(k)) keys.push(k); return keys; } // Escape a string for a RegEx function rescape(s) { return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); } // Converting an array of arrays into a CSV string function toCSVString(data, params) { if (data.length === 0) { return ''; } params = params || {}; var header = params.headers || [], plainObject = isPlainObject(data[0]), keys = plainObject && (params.order || keysIndex(data)), oData, i; // Defaults var escape = params.escape || '"', delimiter = params.delimiter || ','; // Dealing with headers polymorphism if (!header.length) if (plainObject && params.headers !== false) header = keys; // Should we append headers oData = (header.length ? [header] : []).concat( plainObject ? data.map(function(e) { return objectToArray(e, keys); }) : data ); // Converting to string return oData.map(function(row) { return row.map(function(item) { // Wrapping escaping characters var i = ('' + (typeof item === 'undefined' ? '' : item)).replace( new RegExp(rescape(escape), 'g'), escape + escape ); // Escaping if needed return ~i.indexOf(delimiter) || ~i.indexOf(escape) || ~i.indexOf('\n') ? escape + i + escape : i; }).join(delimiter); }).join('\n'); } /** * YAML * ---- * * Converts JavaScript data into a YAML string for export purposes. */ // Characters to escape in YAML var ymlEscape = /[:#,\-\[\]\{\}&%]|!{1,2}/; // Creating repeating sequences function repeatString(string, nb) { var s = string, l, i; if (nb <= 0) return ''; for (i = 1, l = nb | 0; i < l; i++) s += string; return s; } // YAML conversion var yml = { string: function(string) { return (~string.search(ymlEscape)) ? '\'' + string.replace(/'/g, '\'\'') + '\'' : string; }, number: function(nb) { return '' + nb; }, array: function(a, lvl) { lvl = lvl || 0; if (!a.length) return '[]'; var string = '', l, i; for (i = 0, l = a.length; i < l; i++) { string += repeatString(' ', lvl); if (isPrimitive(a[i])) { string += '- ' + processYAMLVariable(a[i]) + '\n'; } else { if (isPlainObject(a[i])) string += '-' + processYAMLVariable(a[i], lvl + 1, true); else string += processYAMLVariable(a[i], lvl + 1); } } return string; }, object: function(o, lvl, indent) { lvl = lvl || 0; if (!Object.keys(o).length) return (lvl ? '- ' : '') + '{}'; var string = '', key, c = 0, i; for (i in o) { key = yml.string(i); string += repeatString(' ', lvl); if (indent && !c) string = string.slice(0, -1); string += key + ': ' + (isNonPrimitive(o[i]) ? '\n' : '') + processYAMLVariable(o[i], lvl + 1) + '\n'; c++; } return string; }, fn: function(fn) { return yml.string(fn.toString()); }, boolean: function(v) { return '' + v; }, nullValue: function(v) { return '~'; } }; // Get the correct handler corresponding to variable type function processYAMLVariable(v, lvl, indent) { // Scalars if (typeof v === 'string') return yml.string(v); else if (typeof v === 'number') return yml.number(v); else if (typeof v === 'boolean') return yml.boolean(v); else if (typeof v === 'undefined' || v === null || isRealNaN(v)) return yml.nullValue(v); // Nonscalars else if (isPlainObject(v)) return yml.object(v, lvl, indent); else if (isArray(v)) return yml.array(v, lvl); else if (typeof v === 'function') return yml.fn(v); // Error else throw TypeError('artoo.writers.processYAMLVariable: wrong type.'); } // Converting JavaScript variables to a YAML string function toYAMLString(data) { return '---\n' + processYAMLVariable(data); } /** * Web Formats * ------------ * * Converts JavaScript data into standard web formats such as querystrings. */ function toQueryString(o, fn) { if (!isPlainObject(o)) throw Error('artoo.writers.queryString: wrong arguments.'); var s = '', k; for (k in o) { s += (s ? '&' : '') + k + '=' + encodeURIComponent(typeof fn === 'function' ? fn(o[k]) : o[k]); } return s; } function toCookie(key, value, params) { params = params || {}; var cookie = key + '=' + encodeURIComponent(value); if (params.days) { var date = new Date(); date.setTime(date.getTime() + (params.days * 24 * 60 * 60 * 1000)); cookie += '; expires=' + date.toGMTString(); } if (params.path) cookie += '; path=' + params.path; if (params.domain) cookie += '; domain=' + params.domain; if (params.httpOnly) cookie += '; HttpOnly'; if (params.secure) cookie += '; Secure'; return cookie; } /** * Exporting */ artoo.writers = { cookie: toCookie, csv: toCSVString, queryString: toQueryString, yaml: toYAMLString }; }).call(this); ;(function(undefined) { 'use strict'; /** * artoo helpers * ============== * * Replacing some helpers by their node.js counterparts. */ var _root = this; // False function artoo.helpers.isDocument = function(v) { return false; }; // Is this a cheerio selector? artoo.helpers.isSelector = function(v) { return !!(v && v.prototype && v.prototype.cheerio && v.prototype.cheerio === '[cheerio object]') || !!(v._root && v.options && 'normalizeWhitespace' in v.options); }; }).call(this); ;(function(undefined) { 'use strict'; /** * artoo scrape methods * ===================== * * Some scraping helpers. */ var _root = this, extend = artoo.helpers.extend; /** * Helpers */ function step(o, scope) { var $ = artoo.$, $sel = o.sel ? $(scope).find(o.sel) : $(scope), val; // Polymorphism if (typeof o === 'function') { val = o.call(scope, $, $sel.get()); } else if (typeof o.method === 'function') val = o.method.call($sel.get(), $, $sel.get()); else if (typeof o === 'string') { if (typeof $sel[o] === 'function') val = $sel[o](); else val = $sel.attr(o); } else { val = (o.attr !== undefined) ? $sel.attr(o.attr) : $sel[o.method || 'text'](); } // Default value? if (o.defaultValue && !val) val = o.defaultValue; return val; } // Scraping function after polymorphism has been taken care of function scrape(iterator, data, params, cb) { var $ = artoo.$, scraped = [], loneSelector = !!data.attr || !!data.method || data.scrape || typeof data === 'string' || typeof data === 'function'; params = params || {}; // Transforming to selector var $iterator; if (typeof iterator === 'function') $iterator = $(iterator($)); else $iterator = $(iterator); // Iteration $iterator.each(function(i) { var item = {}, p; // TODO: figure iteration scope elsewhere for scrape recursivity if (loneSelector) item = (typeof data === 'object' && 'scrape' in data) ? scrape( (data.sel ? $(this).find(data.sel) : $(this)) .find(data.scrape.iterator), data.scrape.data, data.scrape.params ) : step(data, this); else for (p in data) { item[p] = (typeof data[p] === 'object' && 'scrape' in data[p]) ? scrape( (data[p].sel ? $(this).find(data[p].sel) : $(this)) .find(data[p].scrape.iterator), data[p].scrape.data, data[p].scrape.params ) : step(data[p], this); } scraped.push(item); // Breaking if limit i attained return !params.limit || i < params.limit - 1; }); scraped = params.one ? scraped[0] : scraped; // Triggering callback if (typeof cb === 'function') cb(scraped); // Returning data return scraped; } // Function taking care of harsh polymorphism function polymorphism(iterator, data, params, cb) { var h = artoo.helpers, i, d, p, c; if (h.isPlainObject(iterator) && !h.isSelector(iterator) && !h.isDocument(iterator) && (iterator.iterator || iterator.data || iterator.params)) { d = iterator.data; p = h.isPlainObject(iterator.params) ? iterator.params : {}; i = iterator.iterator; } else { d = data; p = h.isPlainObject(params) ? params : {}; i = iterator; } // Default values d = d || 'text'; c = typeof cb === 'function' ? cb : typeof params === 'function' ? params : p.done; return [i, d, p, c]; } /** * Public interface */ artoo.scrape = function(iterator, data, params, cb) { var args = polymorphism(iterator, data, params, cb); // Warn if no iterator or no data if (!args[0] || !args[1]) throw TypeError('artoo.scrape: wrong arguments.'); return scrape.apply(this, args); }; // Scrape only the first corresponding item artoo.scrapeOne = function(iterator, data, params, cb) { var args = polymorphism(iterator, data, params, cb); // Extending parameters args[2] = artoo.helpers.extend(args[2], {limit: 1, one: true}); return scrape.apply(this, args); }; // Scrape a table // TODO: handle different contexts // TODO: better header handle artoo.scrapeTable = function(root, params, cb) { var $ = artoo.$; params = params || {}; var sel = root, headers; if (!params.headers) { return artoo.scrape($(sel).find('tr:has(td)'), { scrape: { iterator: 'td', data: params.data || 'text' } }, params, cb); } else { var headerType = params.headers.type || params.headers.method && 'first' || params.headers, headerFn = params.headers.method; if (headerType === 'th') { headers = artoo.scrape( $(sel).find('th'), headerFn || 'text' ); } else if (headerType === 'first') { headers = artoo.scrape( $(sel).find(' tr:has(td):first-of-type td'), headerFn || 'text' ); } else if (artoo.helpers.isArray(headerType)) { headers = headerType; } else { throw TypeError('artoo.scrapeTable: wrong headers type.'); } // Scraping return artoo.scrape( $(sel).find('tr:has(td)' + (headerType === 'first' ? ':not(:first-of-type)' : '')), function() { var o = {}; headers.forEach(function(h, i) { o[h] = step( params.data || 'text', $(this).find('td:nth-of-type(' + (i + 1) + ')') ); }, this); return o; }, params, cb); } }; /** * jQuery plugin */ function _scrape($) { var methods = ['scrape', 'scrapeOne', 'scrapeTable']; methods.forEach(function(method) { $.fn[method] = function() { return artoo[method].apply( artoo, [$(this)].concat(Array.prototype.slice.call(arguments))); }; }); } // Exporting artoo.jquery.plugins.push(_scrape); }).call(this); /** * artoo node.js require * ====================== * * Simply exporting artoo through a node module. */ module.exports = artoo;