jarviscrawlercore
Version:
jarvis crawler core
213 lines (168 loc) • 4.98 kB
JavaScript
// const {sleep} = require('../utils');
const log = require('../log');
const {disableDownloadOthersEx} = require('../page.utils');
const {WaitAllResponse} = require('../waitallresponse');
const {string2float, string2int} = require('../string.utils');
const {getSubobjectID} = require('./utils');
// const {WaitFrameNavigated} = require('../waitframenavigated');
// const {waitForFunction} = require('../waitutils');
/**
* douban book
* @param {object} browser - browser
* @param {string} id - id
* @param {number} timeout - timeout in microseconds
* @return {object} result - {error: err, ret: ret}
*/
async function book(browser, id, timeout) {
const page = await browser.newPage();
let awaiterr;
const baseurl = 'https://book.douban.com/subject/' + id + '/';
await disableDownloadOthersEx(page, (req)=>{
const url = req.url();
if (url.indexOf('rtb.openx.net') >= 0) {
return true;
}
return false;
});
const waitAllResponse = new WaitAllResponse(page);
// const mainframe = await page.mainFrame();
// const waitchgpage = new WaitFrameNavigated(page, mainframe, async (frame) => {
// const cururl = frame.url();
// return cururl.indexOf(searchurl) == 0;
// });
await page
.setViewport({
width: 1280,
height: 600,
deviceScaleFactor: 1,
})
.catch((err) => {
awaiterr = err;
});
if (awaiterr) {
log.error('douban.book.setViewport', awaiterr);
await page.close();
return {error: awaiterr.toString()};
}
await page
.goto(baseurl, {
timeout: timeout,
})
.catch((err) => {
awaiterr = err;
});
if (awaiterr) {
log.error('douban.book.goto', awaiterr);
await page.close();
return {error: awaiterr.toString()};
}
const isdone = await waitAllResponse.waitDone(timeout);
if (!isdone) {
const err = new Error('douban.book.waitDone timeout');
log.error('douban.book.waitDone ' + baseurl, err);
await page.close();
return {error: err.toString()};
}
const ret = {};
ret.title = await page.$$eval('h1', (eles) => {
if (eles.length > 0) {
return eles[0].innerText;
}
return undefined;
});
ret.cover = await page.$$eval('#mainpic', (eles) => {
if (eles.length > 0) {
const imgs = eles[0].getElementsByTagName('img');
if (imgs.length > 0) {
return imgs[0].src;
}
}
return undefined;
});
ret.authors = await page.$$eval('#info', (eles) => {
if (eles.length > 0) {
const lstpl = eles[0].getElementsByClassName('pl');
if (lstpl.length > 0) {
const lsta = lstpl[0].parentElement.getElementsByTagName('a');
if (lsta.length > 0) {
const lst = [];
for (let i = 0; i < lsta.length; ++i) {
const arr = lsta[i].innerText.split(':');
lst.push(arr[arr.length - 1]);
}
return lst;
}
}
}
return undefined;
});
ret.score = await page.$$eval('.ll.rating_num', (eles) => {
if (eles.length > 0) {
return eles[0].innerText;
}
return undefined;
});
ret.ratingNums = await page.$$eval('.rating_people', (eles) => {
if (eles.length > 0) {
const lstspan = eles[0].getElementsByTagName('span');
if (lstspan.length > 0) {
return lstspan[0].innerText;
}
}
return undefined;
});
ret.intro = await page.$$eval('.intro', (eles) => {
if (eles.length > 0) {
return eles[0].innerText;
}
return undefined;
});
ret.lstLink = await page.$$eval('#db-rec-section', (eles) => {
if (eles.length > 0) {
const lstdl = eles[0].getElementsByTagName('dl');
const lst = [];
for (let i = 0; i < lstdl.length; ++i) {
if (lstdl[i].className != 'clear') {
const lsta = lstdl[i].getElementsByTagName('a');
if (lsta.length > 1) {
lst.push({url: lsta[1].href, title: lsta[1].innerText});
}
}
}
return lst;
}
return undefined;
});
ret.lstTag = await page.$$eval('a.tag', (eles) => {
if (eles.length > 0) {
const lst = [];
for (let i = 0; i < eles.length; ++i) {
lst.push(eles[i].innerText);
}
return lst;
}
return undefined;
});
await page.close();
if (ret.score) {
const scoreret = string2float(ret.score);
if (scoreret.error) {
return {error: 'string2float(ret.score) ' + scoreret.error.toString()};
}
ret.score = scoreret.num;
}
if (ret.ratingNums) {
const siret = string2int(ret.ratingNums);
if (siret.error) {
return {error: 'string2int(ret.ratingNums) ' + siret.error.toString()};
}
ret.ratingNums = siret.num;
}
if (ret.lstLink) {
for (let i = 0; i < ret.lstLink.length; ++i) {
ret.lstLink[i].id = getSubobjectID(ret.lstLink[i].url);
}
}
return {ret: ret};
}
exports.book = book;