UNPKG

website-to-json

Version:

Converts all websites to JSON data

151 lines (129 loc) 3.62 kB
const _ = require('lodash'); const Promise = require('bluebird'); const request = Promise.promisifyAll(require('request')); const fs = Promise.promisifyAll(require('fs')); const converter = require('./src/html-to-data') const Horseman = require('node-horseman'); const logger = require('winston'); exports.processUrlWithRequestAsync = function(url, options) { options = options || {} return request.getAsync({ url: url, jar: true, encoding: options.encoding, gzip: true, timeout: options.timeout || 8000, time: !!options.time, followAllRedirects: options.followAllRedirects || true, headers: options.headers || {'accept-languages': 'en'}, forever: options.forever === false ? false : true }) .then(function(res) { if (res.statusCode === 429) { throw new Error('Request blocked with: 429'); } else if (res.statusCode !== 200) { throw new Error('Non 200 status code: ' + res.statusCode); } return { body: res.body, headers: res.headers, timingPhases: res.timingPhases, http: { statusCode: res.statusCode }, url: res.request.href, originalUrl: url }; }) } exports.processUrlWithPhantomAsync = function(url, options) { options = options || {} return new Promise(function(resolve, reject) { var horseman = new Horseman({ loadImages: false, timeout: options.timeout || 5000, injectJquery: true, //proxy: data.proxy, //phantomPath: phantomjs.path }); horseman //.userAgent(data.userAgent) .open(url) /*.on('error', function(msg) { throw new Error('Unexpected error') })*/ .html() .then(function(body) { return resolve({ body: body, originalUrl: url }); }) }) .catch((err) => { throw new Error('Unexpected error') }) } exports.processUrlAsync = function(url, data) { if (data.type === 'phantomjs') { return exports.processUrlWithPhantomAsync(url, data); } return exports.processUrlWithRequestAsync(url, data); } /** * recipes * timeout * type * fields * keywords */ exports.extractUrl = function(url, options) { options = options || {} var DEBUG = !!(process.env.DEBUG || options.debug); if (!url) { throw new Error('url is required') } if (url.indexOf('http') === -1) { url = 'http://' + url } if (options.nightmare) { return new Promise(function (resolve, reject) { options.nightmare.goto(url) .wait('body') .evaluate(function(){ return document.documentElement.innerHTML }) .then(function(title) { return resolve(converter.convert(url, title, options)); }) }).timeout(10000) } if (DEBUG) logger.profile('get html ' + url); return exports.processUrlAsync(url, options) .then((result) => { if (DEBUG) logger.profile('get html ' + url); if (DEBUG) logger.profile('process html ' + url); var output = _.merge(converter.convert(url, result.body, options), { url: result.url, originalUrl: result.originalUrl }); if (options.headers === true) { output.headers = result.headers; } if (options.time === true) { output.timings = result.timingPhases; } if (options.http === true) { output.http = result.http; } if (output.id === 'id_not_specified') { delete output.id; } if (options.stringify) { return JSON.stringify(output, null, 2); } if (DEBUG) logger.profile('process html ' + url); return output; }) } exports.extractData = exports.extractUrl;