UNPKG

ispider

Version:

iSpider is a simple and fast web spider written in Node.js!

251 lines (243 loc) 7.67 kB
/**
 * Created by yuexing on 2016/06/03.
 * For API details see http://phantomjs.org/api/
 * This script is the connector between node and phantomjs.
 * <p>
 * Set isNode to false to test this js file on its own.
 * For example: phantomjs --load-images=true loadspeed.js http://www.163.com
 * </p>
 */
'use strict';

// Default [true] means the script is started by node; false means it is started
// directly from the command line.
var isNode = true;
var page = require('webpage').create(),
    system = require('system'),
    urlInfo,
    url,
    startTime = Date.now();

// Basic result codes reporting whether the phantomjs run succeeded.
var MsgCode = {
    SUCCESS: 2000,
    FAIL: 50000000,
    NAVIGATE_EXCEPTION: 40000000
};

/**
 * Basic envelope for the data sent to node.
 * @param {number} [code] one of MsgCode; falls back to MsgCode.SUCCESS when falsy
 * @param {string} [msg] message text; falls back to ""
 * @param {object} [data] payload; falls back to {}
 */
var JsonData = function (code, msg, data) {
    this.code = code || MsgCode.SUCCESS;
    this.msg = msg || "";
    this.data = data || {};
};

(function (logic) {
    /**
     * Configures the page and reads the target URL from the command line.
     * @returns {boolean} false when the script has requested phantom.exit() and
     *          start-up must not continue, true otherwise
     */
    var initPage = function () {
        page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36';
        page.viewportSize = {width: 1280, height: 980};

        if (system.args.length === 1) {
            logic.sendToNode(new JsonData(MsgCode.NAVIGATE_EXCEPTION, 'Usage: phantom_spec.js <some URL>', {}));
            phantom.exit();
            // phantom.exit() does not stop the current script synchronously, so
            // bail out explicitly instead of falling through to JSON.parse(undefined).
            return false;
        }

        try {
            urlInfo = JSON.parse(system.args[1]);
            url = decodeURIComponent(urlInfo.url);
            page.settings.resourceTimeout = urlInfo.timeout;
        } catch (e) {
            console.log(system.args);
            if (isNode) {
                logic.sendToNode(new JsonData(MsgCode.NAVIGATE_EXCEPTION, 'js parse MsgCode.FAIL', {errData: e}));
                phantom.exit();
                return false;
            }
            // Not started by node: treat the raw argument as the URL and use
            // test defaults.
            urlInfo = {isTest: true, isLoadFinish: false, isLoadTime: 3000};
            url = system.args[1];
            console.log(url);
        }
        return true;
    };

    /**
     * Registers the page event handlers.
     * More events are listed at http://phantomjs.org/api/webpage/
     */
    var pageEvent = function () {
        // Page loading events.

        /**
         * Called before the page is initialised. Intentionally empty.
         */
        page.onInitialized = function () {
        };

        /**
         * Called when the page starts loading. Intentionally empty.
         */
        page.onLoadStarted = function () {
        };

        // Page "event" callbacks.

        /**
         * Called when the URL of the current page changes. Intentionally empty.
         * @param {string} targetUrl the new URL
         */
        page.onUrlChanged = function (targetUrl) {
        };

        /**
         * Navigation event of the current page. TODO: not fully understood yet.
         * @param {string} url the URL being navigated to
         * @param {string} type ['Undefined', 'LinkClicked', 'FormSubmitted', 'BackOrForward', 'Reload', 'FormResubmitted', 'Other']
         * @param {boolean} willNavigate [true] navigation will happen, [false] it is locked
         * @param {boolean} main [true] the event comes from the current frame, [false] from a child frame
         */
        page.onNavigationRequested = function (url, type, willNavigate, main) {
            //page.customHeaders = {
            //    "client_pid": pid,
            //    "page": url
            //};
        };

        /**
         * Called when a resource request is issued.
         * This hook can be used to intercept unnecessary requests, e.g. logging requests.
         * @param {object} requestData has [id], [method] (http method), [url],
         *        [time] (Date of the request) and [headers]
         * @param {object} networkRequest the request object itself; offers
         *        [abort()] (aborting triggers onResourceError),
         *        [changeUrl(newUrl)] and [setHeader(key, value)]
         */
        page.onResourceRequested = function (requestData, networkRequest) {
            //console.log('requested: ' + JSON.stringify(requestData, undefined, 4));
        };

        /**
         * Called when a resource has been received.
         * @param {object} res mostly the same fields as in onResourceRequested:
         *        id, url, time, headers, bodySize, contentType, redirectURL,
         *        stage ("start" / "end"), status (e.g. 200), statusText (e.g. OK)
         */
        page.onResourceReceived = function (res) {
            //console.log('received: ' + JSON.stringify(res, undefined, 4));
        };

        /**
         * Called when loading a resource fails. Only a failure of the page's own
         * URL is reported to node.
         * @param {object} err error details (url, errorCode, errorString)
         */
        page.onResourceError = function (err) {
            // Compare against the callback argument; the previous code read an
            // undeclared `resourceError`, which throws in strict mode.
            if (err.url === page.url) {
                logic.sendToNode(new JsonData(MsgCode.FAIL, 'Unable to load resourc', {
                    "url": err.url,
                    "errorCode": err.errorCode,
                    "description": err.errorString
                }));
            }
        };

        /**
         * Called when a resource request times out; reports the failure and exits.
         * @param {object} err timeout details (url, errorCode, errorString)
         */
        page.onResourceTimeout = function (err) {
            logic.sendToNode(new JsonData(MsgCode.FAIL, 'Network timeout on resource', {
                "url": err.url,
                "errorCode": err.errorCode,
                "description": err.errorString
            }));
            phantom.exit(1);
        };
    };

    // Start-up.
    (function () {
        if (!initPage()) {
            // An exit was already requested; do not register handlers or open a page.
            return;
        }
        pageEvent();
        // Open the page.
        page.open(url, function (status) {
            logic.main(status);
        });
    }());
}({
    /**
     * Sends a message to the program that launched phantomjs.
     * @param {JsonData} jsonData the message; urlInfo is attached to its data
     */
    sendToNode: function (jsonData) {
        jsonData.data.urlInfo = urlInfo;
        // Original author's note: writeLine has a limit on the size of the data
        // that can be transferred.
        // The trailing #phantomjs-data-end# tells node that the data is complete.
        system.stdout.writeLine(JSON.stringify(jsonData) + '#phantomjs-data-end#');
    },

    /**
     * Collects the page data after page.open() has finished.
     * @param {string} status the status passed to the page.open callback
     */
    main: function (status) {
        var result = {
            "url": page.url,
            // NOTE(review): confirm that the webpage object really exposes `status`.
            "statusCode": page.status,
            "content": page.content,
            "requestDate": Date.now() - startTime,
            "cookies": page.cookies
        };
        this.nodeData(status, result);
    },

    /**
     * Reports the outcome to node and exits phantomjs.
     * @param {string} status 'success' when the page was opened
     * @param {object} result the data collected by main()
     */
    nodeData: function (status, result) {
        if (status === 'success') {
            this.testRecordData();
            this.sendToNode(new JsonData(null, null, result));
        } else {
            this.sendToNode(new JsonData(MsgCode.FAIL, 'Open page failed', {errData: url}));
        }
        phantom.exit();
    },

    /**
     * In test mode, saves the html and a rendered image of the page.
     * Does nothing unless urlInfo.isGenerateImg is truthy.
     */
    testRecordData: function () {
        if (urlInfo.isGenerateImg) {
            var fs = require('fs');
            // When no path is given, default to the current directory.
            var test_path = urlInfo.generatePath || '.';
            // Use the platform path separator instead of a hard-coded backslash.
            var name = fs.separator + this.getDomain(page.url) + new Date()._getDate();
            console.log(name);
            // Original author's note: quality 1 is enough; with 100 a single
            // image grows to tens of MB.
            page.render(test_path + name + '.png', {format: 'PNG', quality: '1'});
            fs.write(test_path + name + '.html', page.content, 'w');
        }
    },

    /**
     * Extracts "<label>.<suffix>" from a URL for the known suffixes below.
     * @param {string} url the URL to inspect
     * @returns {string} e.g. "163.com" for "http://www.163.com", or "" when the
     *          URL is empty or no known suffix is found
     */
    getDomain: function (url) {
        if (!url) {
            return "";
        }
        var domains = [
            'com', 'cn', 'org', 'net', 'hk', 'cc', 'top',
            'wang', 'tv', 'de', 'com.cn', 'com.hk', 'co.jp'
        ];
        // Longest suffix first so that "com.cn" wins over "com", and dots
        // escaped so that they only match a literal ".".
        var alternatives = domains.slice().sort(function (a, b) {
            return b.length - a.length;
        }).map(function (suffix) {
            return suffix.replace(/\./g, '\\.');
        }).join('|');
        // The negative lookahead keeps a suffix from matching the beginning of a
        // longer label (e.g. "com" inside "community").
        var domain = new RegExp('([-\\w]+\\.(?:' + alternatives + '))(?![-\\w])').exec(url);
        if (domain && domain.length > 1) {
            return domain[1];
        }
        return "";
    }
}));

/**
 * Formats a date as the concatenation of year, month, day, hours, minutes and
 * seconds, each of the last five passed through _padLeft().
 * @param {Date} [d] the date to format; defaults to the receiver
 * @returns {string} e.g. "20160603091500"
 */
Date.prototype._getDate = function (d) {
    var when = d || this;
    return [
        when.getFullYear(),
        (when.getMonth() + 1)._padLeft(),
        (when.getDate())._padLeft(),
        (when.getHours())._padLeft(),
        (when.getMinutes())._padLeft(),
        (when.getSeconds())._padLeft()].join('');
};

/**
 * Left-pads the number up to the number of digits of `base`.
 * @param {number} [base] number whose digit count is the target width; defaults to 10
 * @param {string} [chr] padding character; defaults to '0'
 * @returns {string|number} the padded string, or the number itself when it is
 *          already wider than the target width
 */
Number.prototype._padLeft = function (base, chr) {
    var len = (String(base || 10).length - String(this).length) + 1;
    return len > 0 ? new Array(len).join(chr || '0') + this : this;
};