web-spider
Version:
web-spider is a simple and fast web spider written with Nodejs!
128 lines (116 loc) • 4.23 kB
JavaScript
/**
* Created by yuexing on 2016/06/03.
* Dynamic download module of the spider.
* It drives PhantomJS; see http://phantomjs.org/api for details.
*/
;
const util = require('util');
const child_process = require('child_process');
const path = require('path');
const osType = require('os').type();
/**
 * Downloads a page by running PhantomJS as a child process and forwards the
 * outcome through `_baseDownloader.sendData(err, data)`.
 *
 * The command line is assembled from `spiderCore.spiderConf.urlInfo`
 * (config file, extra PhantomJS switches, proxy settings, bridge script) and
 * executed with `child_process.exec` inside `<cwdPath>/../src/_lib`.
 * The bridge script is expected to print one JSON object on stdout that
 * starts with `{` and is terminated by `}#phantomjs-data-end#`. When the
 * object's `code` equals `baseMsgCode.success` its `data` field is delivered,
 * otherwise its `msg` field is delivered (both with a `null` error).
 *
 * Only the first outcome (data, stderr output, spawn error, or the child
 * closing without data) is processed; later events are ignored.
 *
 * @param {object} _baseDownloader downloader object; must expose `spiderCore`,
 *        `cwdPath` and `sendData(err, data)`
 */
let dynamicDownloader = function (_baseDownloader) {
    const spiderCore = _baseDownloader.spiderCore;
    const urlInfo = spiderCore.spiderConf.urlInfo;
    const logger = spiderCore.logger;
    const baseMsgCode = spiderCore._config.baseMsgCode;
    const cwdPath = path.resolve(_baseDownloader.cwdPath, '../src/_lib');
    // Terminator the bridge script appends after the JSON payload.
    const endMarker = '#phantomjs-data-end#';

    // Windows resolves the binary through PATH; elsewhere it is run from cwdPath.
    const cmd = /Windows/.test(osType) ? 'phantomjs' : './phantomjs';

    // 1. Start the PhantomJS command line.
    const cmdLines = [cmd];

    // 2. Static PhantomJS config file (falls back to the bundled one).
    const configJson = urlInfo.phantomConfig || path.resolve(cwdPath, 'phantom-config.json');
    cmdLines.push('--config=' + configJson);

    // 3. Extra PhantomJS switches supplied by the caller.
    if (urlInfo.phantomLines && urlInfo.phantomLines.length > 0) {
        cmdLines.push(urlInfo.phantomLines.join(' '));
    }

    // Proxy settings, if any.
    if (urlInfo.proxy) {
        cmdLines.push('--proxy=' + urlInfo.proxy.host + ":" + urlInfo.proxy.port);
        cmdLines.push('--proxy-type=' + urlInfo.proxy.protocol);
        if (urlInfo.proxy.username) {
            cmdLines.push('--proxy-auth=' + urlInfo.proxy.username + ":" + urlInfo.proxy.password);
        }
    }

    // 4. Script that bridges PhantomJS and this process.
    const phantomBridgeJs = urlInfo.phantomBridgeJs || 'phantom_spec.js';
    cmdLines.push(phantomBridgeJs);

    // 5. Script argument: the serialized urlInfo.
    // The URL is encoded so the shell cannot truncate it. The encoding is
    // applied to a copy: urlInfo belongs to the shared spider config and
    // must keep the original URL for every other reader (and for retries).
    const phantomArgs = Object.assign({}, urlInfo, {url: encodeURIComponent(urlInfo.url)});
    // exec() passes the command through a shell, which strips one level of
    // quoting, hence the double stringify (spawn would need only one).
    // NOTE(review): JSON quoting does not neutralise `$` or backticks for a
    // POSIX shell; urlInfo fields other than `url` are assumed to be trusted.
    cmdLines.push(JSON.stringify(JSON.stringify(phantomArgs)));

    // 6. Run PhantomJS.
    const command = cmdLines.join(' ');
    logger.info(command);
    const phantomChild = child_process.exec(
        command,
        {cwd: cwdPath, stdio: 'pipe', maxBuffer: 2000 * 1024, timeout: urlInfo.timeout || 5000}
    );

    let finished = false;
    /**
     * Delivers the outcome to the downloader and destroys the PhantomJS
     * process. Only the first call has any effect.
     * @param {*} err error description, or a falsy value on success
     * @param {string} [data] raw JSON text produced by the bridge script
     */
    const killPhantomjs = function (err, data) {
        if (finished) {
            return;
        }
        finished = true;
        try {
            if (err) {
                _baseDownloader.sendData(err, data);
                return;
            }
            try {
                const message = JSON.parse(data);
                const payload = baseMsgCode.success !== message.code ? message.msg : message.data;
                _baseDownloader.sendData(null, payload);
            } catch (e) {
                _baseDownloader.sendData(e, data);
            }
        } finally {
            // Always release the child, even if sendData throws.
            phantomChild.kill();
        }
    };

    phantomChild.on('error', function (err) {
        killPhantomjs('phantomChild error: ' + err);
    });

    // stdout arrives in arbitrary chunks, so the protocol checks are made on
    // the accumulated text rather than on the individual chunk: the end
    // marker may be split across two chunks, and trimming single chunks
    // would eat whitespace that belongs to the payload.
    let receivedData = '';
    phantomChild.stdout.on('data', function (chunk) {
        if (finished) {
            return;
        }
        receivedData += typeof chunk === 'string' ? chunk : chunk.toString();
        const text = receivedData.trim();
        if (text === '') {
            // Nothing but whitespace so far; wait for more output.
            return;
        }
        if (!text.startsWith('{')) {
            // Anything that is not a JSON object is treated as an error message.
            receivedData = '';
            killPhantomjs('phantomChild: ' + text);
            return;
        }
        if (text.endsWith('}' + endMarker)) {
            receivedData = '';
            killPhantomjs(null, text.slice(0, -endMarker.length));
        }
    });

    // Any stderr output is treated as a failure.
    phantomChild.stderr.on('data', function (data) {
        killPhantomjs('phantomChild stderr: ' + data.toString('utf8'));
    });

    phantomChild.on('exit', function (code) {
        if (code !== 0) {
            logger.error('child process exited with code ' + code);
        }
    });

    // 'close' fires after the stdio streams have ended, so no more data can
    // arrive: if nothing was delivered yet (exec timeout, crash, silent
    // exit) report it instead of leaving the downloader waiting forever.
    phantomChild.on('close', function (code, signal) {
        if (code !== 0) {
            logger.error('child process closed with code ' + code + ', signal ' + signal);
        }
        killPhantomjs(util.format('phantomChild closed without returning data (code: %s, signal: %s)', code, signal));
    });
};
module.exports = dynamicDownloader;