// spiking
// Version:
// A lightweight web crawler.
// 349 lines (346 loc) • 12.1 kB
// JavaScript
/*
Copyright © 2019 Ciyang. All rights reserved.
*/
const request = require('request');
const fs = require('fs');
const { JSDOM } = require("jsdom");
const MD5 = require("crypto-js/md5");
const puppeteer = require('puppeteer-core');
const { TimeoutError } = require('puppeteer-core/Errors');
/**
 * Builds the option object passed along with every HTTP request issued by the
 * crawler: a desktop-Chrome User-Agent header and a 10 second timeout.
 *
 * A new object (including a new `headers` object) is created on every call.
 *
 * @returns {{headers: Object<string, string>, timeout: number}} request options
 */
function requestOpt() {
    const headers = {};
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36';
    const timeout = 10000;
    return { headers, timeout };
}
/**
 * Crawler that starts from one URL, renders pages in a puppeteer-driven
 * browser, collects `<a>` and `<img>` targets from the rendered DOM and saves
 * every image it finds into `path`, named by the MD5 of the final image URL.
 *
 * State overview:
 *  - `waitQueue`      : page URLs waiting to be fetched
 *  - `normalImgQueue` : image URLs waiting to be fetched
 *  - `urlSet`         : URLs that were already requested (or are in flight)
 *  - `errorQueue`     : URLs whose request failed
 *  - `pageList`       : browser tabs as `{ page, inUsing }` entries
 *
 * `download` / `downloadImg` resolve with a numeric code:
 *  1 = HTML page rendered and scanned, 2 = image accepted for saving,
 *  3 = skipped (already seen, foreign host, or unsupported content type).
 */
class DynamicMultipleTasks {
    /**
     * @param {string} url - Start URL. It also becomes the base used to
     *   resolve relative URLs handed to `push` / `pushImg`.
     */
    constructor(url = '') {
        this.urlSet = new Set();
        this.waitQueue = [];
        this.path = './';
        this.multipleNum = 10;
        this.errorQueue = [];
        this.runningNum = 0;
        this.finishNum = 0;
        this.req = request.defaults();
        this.browserCreating = 0;
        this.browserRunning = 0;
        this.normalImgQueue = [];
        this.displayWindow = true;
        this.chromePath = '';
        this.pageList = [];
        this.stop = 1;
        this.browser = null;
        try {
            this.firstUrl = new URL(url);
        } catch (e) {
            console.log('URL parsing error.');
        }
        this.push(url);
    }

    /**
     * Extracts a file extension from a Content-Type header value:
     * the part after the first `/`, with any `;parameter` suffix removed.
     *
     * @param {string} contentType - e.g. `image/jpeg; charset=utf-8`
     * @returns {string} e.g. `jpeg`
     */
    static extensionFromContentType(contentType) {
        const subtype = String(contentType).slice(String(contentType).indexOf('/') + 1);
        const paramStart = subtype.indexOf(';');
        return (paramStart === -1 ? subtype : subtype.slice(0, paramStart)).trim();
    }

    /**
     * Resolves `url` against the start URL.
     *
     * @param {string} url - absolute or relative URL
     * @returns {?string} the absolute href, or null (after logging) when the
     *   value cannot be parsed
     */
    resolveUrl(url) {
        try {
            return new URL(url, this.firstUrl).href;
        } catch (e) {
            console.log('Error URL : ' + url);
            return null;
        }
    }

    /**
     * Queues a page URL unless it has already been requested.
     * @param {string} url
     */
    push(url = '') {
        const href = this.resolveUrl(url);
        if (href !== null && !this.urlSet.has(href)) this.waitQueue.push(href);
    }

    /**
     * Queues an image URL unless it has already been requested.
     * @param {string} url
     */
    pushImg(url = '') {
        const href = this.resolveUrl(url);
        if (href !== null && !this.urlSet.has(href)) this.normalImgQueue.push(href);
    }

    /**
     * Sets the directory images are written to (and logs it).
     * @param {string} path
     */
    setPath(path = './') {
        this.path = path;
        console.log(path);
    }

    /**
     * Sets how many pages may be processed at once (also the number of
     * browser tabs `workMultiple` opens). Values that do not parse to an
     * integer >= 1 are ignored, because NaN or 0 would stall the scheduler.
     *
     * @param {number|string} multiplenum
     */
    setMultipleNum(multiplenum = 10) {
        const parsed = Number.parseInt(multiplenum, 10);
        if (Number.isNaN(parsed) || parsed < 1) {
            console.log('Invalid concurrency value : ' + multiplenum);
            return;
        }
        this.multipleNum = parsed;
    }

    /**
     * Routes all `request` traffic through the given proxy.
     * NOTE: the proxy is not handed to the browser launched by `openBrowser`.
     *
     * @param {string} porxy - proxy URL
     */
    setProxy(porxy = '') {
        this.proxy = porxy;
        try {
            this.req = request.defaults({
                'proxy': porxy
            });
        } catch (e) {
            console.log('Proxy error.');
        }
    }

    /**
     * @param {boolean} disp - true to show the browser window (non-headless)
     */
    setDisplay(disp = true) {
        this.displayWindow = disp;
    }

    /**
     * @param {string} path - browser executable passed to puppeteer
     */
    setChromePath(path = '') {
        this.chromePath = path;
    }

    /**
     * Stores the window used for progress reporting in `workMultiple`.
     * NOTE(review): the default value references `BrowserWindow`, which this
     * file does not import, so calling this without an argument throws unless
     * that name is provided globally - always pass the window explicitly.
     *
     * @param {object} mainWindow - window exposing `setProgressBar` and
     *   `webContents.send`
     */
    setMainWindow(mainWindow = new BrowserWindow()) {
        this.mainWindow = mainWindow;
    }

    /**
     * Opens a new tab in the current browser.
     *
     * @returns {Promise<{page: object, inUsing: number}>} pool entry for the
     *   tab; rejects when the tab cannot be created (including when no
     *   browser is open)
     */
    newPage() {
        // Wrapping in `new Promise` turns a synchronous failure (e.g. no
        // browser yet) into a rejection instead of a thrown exception.
        return new Promise((resolve) => resolve(this.browser.newPage()))
            .then((page) => ({ page, inUsing: 0 }));
    }

    /**
     * Launches the browser and opens a first tab, unless a browser is already
     * marked as running.
     *
     * @returns {Promise<number>} resolves with 0 once a browser is available;
     *   rejects when launching or opening the first tab fails
     */
    openBrowser() {
        if (this.browserRunning) return Promise.resolve(0);
        console.log('Create browser');
        this.browserCreating = 1;
        const opt = {
            executablePath: this.chromePath,
            headless: !this.displayWindow
        };
        return puppeteer.launch(opt).then((browser) => {
            this.browser = browser;
            this.browserRunning = 1;
            // Tabs left over from a previous browser are unusable.
            this.pageList = [];
            return this.newPage().then((entry) => {
                this.pageList.push(entry);
                this.browserCreating = 0;
                return 0;
            }, (err) => {
                // The browser is being torn down: it must not be reported as
                // running, otherwise the next openBrowser() would reuse it.
                const clearCreating = () => { this.browserCreating = 0; };
                this.browserRunning = 0;
                this.browser = null;
                browser.close().then(clearCreating, clearCreating);
                throw err;
            });
        }, (err) => {
            this.browserCreating = 0;
            console.log('Browser error');
            throw err;
        });
    }

    /**
     * Renders `url` in a free tab and returns the resulting HTML. When every
     * tab is busy the call polls every 100 ms until one is free.
     * A navigation timeout is tolerated: whatever content has loaded by then
     * is returned.
     *
     * @param {string} url
     * @returns {Promise<string>} serialized page content
     */
    loadDynamically(url = '') {
        return new Promise((resolve, reject) => {
            const acquire = () => {
                const slot = this.pageList.find((entry) => !entry.inUsing);
                if (!slot) {
                    setTimeout(acquire, 100);
                    return;
                }
                slot.inUsing = 1;
                // The tab is released on every exit path so it can never
                // stay marked as busy after a failure.
                const release = () => { slot.inUsing = 0; };
                const readContent = () => slot.page.content().then((html) => {
                    release();
                    resolve(html);
                }, (err) => {
                    console.log('An unexpected error when loading page, url : ' + url);
                    release();
                    reject(err);
                });
                slot.page.goto(url, { waitUntil: 'networkidle2', timeout: 10000 }).then(readContent, (err) => {
                    if (err instanceof TimeoutError) {
                        readContent();
                    } else {
                        release();
                        reject(err);
                    }
                });
            };
            acquire();
        });
    }

    /**
     * Streams `url2` into the file `path2`. A failed request is retried after
     * 50 ms, at most `retries` more times.
     *
     * @param {string} url2 - image URL
     * @param {string} path2 - destination file
     * @param {number} [retries=3] - remaining retry attempts
     */
    downloadingImg(url2 = '', path2 = '', retries = 3) {
        const stream = fs.createWriteStream(path2);
        // Without a listener a write failure would be an unhandled 'error' event.
        stream.on('error', (err) => {
            console.log(`An unexpected error when downloading pictures, url : ${url2}, path : ${path2}, error : ${err.message}`);
        });
        this.req.get(url2, (error2) => {
            if (!error2) return;
            if (retries > 0) {
                // Arrow function keeps `this`; passing the bare method would lose it.
                setTimeout(() => this.downloadingImg(url2, path2, retries - 1), 50);
            } else {
                console.log(`An unexpected error when downloading pictures, url : ${url2}, path : ${path2}, error : ${error2.message}`);
            }
        }).pipe(stream);
    }

    /**
     * Starts saving the image described by `response` into `this.path`.
     * Failures are logged, never thrown.
     *
     * @param {string} url - URL that was requested
     * @param {object} response - `request` response of that URL
     */
    saveImage(url, response) {
        try {
            const name = MD5(response.request.href).toString();
            const ext = DynamicMultipleTasks.extensionFromContentType(response.headers['content-type']);
            this.downloadingImg(url, this.path + '/' + name + '.' + ext);
        } catch (e) {
            console.log(`An unexpected error when downloading pictures, url : ${url} , error : ${e.message}`);
        }
    }

    /**
     * Queues every `<img src>` and `<a href>` found in `html`.
     *
     * @param {string} html - rendered page markup
     * @param {string} pageUrl - URL the markup was loaded from; used as the
     *   document URL so relative links resolve against the page itself
     */
    collectLinks(html, pageUrl) {
        const doc = new JSDOM(html, { url: pageUrl }).window.document;
        for (const img of doc.getElementsByTagName('img')) {
            if (img.src) this.pushImg(img.src);
        }
        for (const anchor of doc.getElementsByTagName('a')) {
            if (anchor.href) this.push(anchor.href);
        }
    }

    /**
     * Fetches a queued page URL. Images are saved, text responses are
     * rendered in the browser and scanned for further links, anything else
     * (and non-image content on a host other than the start host) is skipped.
     *
     * @param {string} url
     * @returns {Promise<number>} 1, 2 or 3 (see class description); rejects
     *   when the request or the rendering fails, after removing `url` from
     *   `urlSet` so it may be attempted again
     */
    download(url = '') {
        if (this.urlSet.has(url)) return Promise.resolve(3);
        this.urlSet.add(url);
        return new Promise((resolve, reject) => {
            this.req(url, requestOpt(), (error, response) => {
                if (error || response.statusCode !== 200) {
                    this.urlSet.delete(url);
                    reject(error || new Error(`Unexpected status ${response.statusCode} for ${url}`));
                    return;
                }
                // The header may be absent; treat that as "unknown type".
                const contentType = response.headers['content-type'] || '';
                const isImage = contentType.includes('image');
                if (this.firstUrl && new URL(url).host !== this.firstUrl.host && !isImage) {
                    resolve(3);
                    return;
                }
                if (isImage) {
                    resolve(2);
                    this.saveImage(url, response);
                    return;
                }
                if (!contentType.includes('text')) {
                    resolve(3);
                    return;
                }
                this.loadDynamically(url).then((html) => {
                    resolve(1);
                    try {
                        this.collectLinks(html, url);
                    } catch (e) {
                        console.log(`An unexpected error when parsing page, url : ${url} , error : ${e.message}`);
                    }
                }, (err) => {
                    this.urlSet.delete(url);
                    reject(err);
                });
            });
        });
    }

    /**
     * Fetches a queued image URL. If the response turns out not to be an
     * image, the URL is moved to the page queue instead.
     *
     * @param {string} url
     * @returns {Promise<number>} 2 when the image is being saved, 3 when
     *   skipped or re-queued; rejects when the request fails
     */
    downloadImg(url = '') {
        if (this.urlSet.has(url)) return Promise.resolve(3);
        this.urlSet.add(url);
        return new Promise((resolve, reject) => {
            this.req(url, requestOpt(), (error, response) => {
                if (error || response.statusCode !== 200) {
                    this.urlSet.delete(url);
                    reject(error || new Error(`Unexpected status ${response.statusCode} for ${url}`));
                    return;
                }
                const contentType = response.headers['content-type'] || '';
                if (!contentType.includes('image')) {
                    this.urlSet.delete(url);
                    this.push(url);
                    resolve(3);
                    return;
                }
                resolve(2);
                this.saveImage(url, response);
            });
        });
    }

    /**
     * Runs the crawl: opens the browser, fills the tab pool up to
     * `multipleNum`, then every 100 ms reports progress to `mainWindow` and
     * dispatches queued image and page URLs. The run ends when both queues are
     * empty and nothing is in flight, or when the browser/tabs cannot be
     * created.
     *
     * @param {function(DynamicMultipleTasks): void} callback - invoked exactly
     *   once with this instance when the run ends, after the browser has been
     *   closed
     */
    workMultiple(callback) {
        this.stop = 0;
        this.mainWindow.setProgressBar(2);
        const tasks = this;
        let doneCount = 0;
        let imageCount = 0;
        let pendingImages = 0;
        let finished = false;

        const send = (channel, payload) => tasks.mainWindow.webContents.send(channel, payload);

        const tally = (resp) => {
            if (resp === 2) imageCount++;
            if (resp !== 3) doneCount++;
        };

        // Single exit point so `callback` can never fire more than once.
        const finish = () => {
            if (finished) return;
            finished = true;
            tasks.stop = 1;
            tasks.browserRunning = 0;
            tasks.mainWindow.setProgressBar(-1);
            const browser = tasks.browser;
            tasks.browser = null;
            const done = () => { callback(tasks); };
            if (browser) {
                browser.close().then(done, done);
            } else {
                done();
            }
        };

        const loop = () => {
            if (tasks.stop) return;
            const queued = tasks.waitQueue.length + tasks.normalImgQueue.length;
            const total = (queued || 1) + doneCount;
            send('setProgress', { a: doneCount, b: total });
            send('setImageNum', imageCount);
            send('setErrorNum', tasks.errorQueue.length);
            // Queued or in-flight images also keep the run alive; a rejected
            // "image" may still be pushed back onto the page queue.
            if (!tasks.runningNum && !pendingImages && !queued) {
                finish();
                return;
            }
            tasks.mainWindow.setProgressBar(doneCount / total);
            if (!queued || tasks.runningNum >= tasks.multipleNum) {
                setTimeout(loop, 100);
                return;
            }
            const imageBatch = Math.min(tasks.normalImgQueue.length, tasks.multipleNum);
            for (let i = 0; i < imageBatch; i++) {
                const url = tasks.normalImgQueue.shift();
                ++pendingImages;
                tasks.downloadImg(url).then((resp) => {
                    tally(resp);
                    --pendingImages;
                }, () => {
                    tasks.errorQueue.push(url);
                    --pendingImages;
                });
            }
            const pageBatch = Math.min(tasks.waitQueue.length, tasks.multipleNum - tasks.runningNum);
            for (let i = 0; i < pageBatch; i++) {
                const url = tasks.waitQueue.shift();
                ++tasks.runningNum;
                tasks.download(url).then((resp) => {
                    tally(resp);
                    tasks.runningNum--;
                }, () => {
                    tasks.errorQueue.push(url);
                    tasks.runningNum--;
                });
            }
            setTimeout(loop, 100);
        };

        const initPage = () => {
            if (tasks.pageList.length >= tasks.multipleNum) {
                setTimeout(loop, 100);
                return;
            }
            for (let i = tasks.pageList.length; i < tasks.multipleNum; i++) {
                tasks.newPage().then((entry) => {
                    tasks.pageList.push(entry);
                    if (tasks.pageList.length === tasks.multipleNum) setTimeout(loop, 100);
                }, () => {
                    console.log('Browser page error');
                    finish();
                });
            }
        };

        tasks.openBrowser().then(initPage, (err) => {
            // Report the failure to the caller instead of leaving it waiting.
            console.log('Browser start failed : ' + (err && err.message ? err.message : err));
            finish();
        });
    }

    /**
     * Marks the browser as stopped and closes it.
     *
     * @returns {Promise} settles when the browser has closed; resolves
     *   immediately when no browser is open
     */
    close() {
        this.browserRunning = 0;
        const browser = this.browser;
        this.browser = null;
        return browser ? browser.close() : Promise.resolve();
    }
}
// Public API of this module: the crawler class, exposed as a named CommonJS export.
module.exports.DynamicMultipleTasks = DynamicMultipleTasks;