UNPKG

jarviscrawlercore

Version:
397 lines (315 loc) 7.93 kB
const {sleep, hashMD5} = require('../utils');
const messages = require('../../pbjs/result_pb');
const {getImageInfo} = require('../imgutils');
const {IPMgr} = require('../ipmgr');
const log = require('../log');

/**
 * getURL - normalize a request url into the key used to track it.
 * `data:` urls are replaced by 'local:data-' plus the MD5 of the whole url;
 * every other url is cut at the first '?', so requests that differ only in
 * their query string share one key.
 * @param {string} url - url
 * @return {string} url - normalized url
 */
function getURL(url) {
  if (url.startsWith('data:')) {
    return 'local:data-' + hashMD5(url);
  }

  return url.split('?')[0];
}

/**
 * findReq - find a request by its normalized url
 * @param {array} reqs - request list
 * @param {string} url - url (as returned by getURL)
 * @return {object} req - request, or undefined when there is no match
 */
function findReq(reqs, url) {
  return reqs.find((req) => req.url === url);
}

/**
 * isReqFinished - is every request finished?
 * A request counts as pending while its status is still 0.
 * @param {array} reqs - request list
 * @return {bool} isfinished - true when no request is pending
 */
function isReqFinished(reqs) {
  return reqs.every((req) => req.status !== 0);
}

/**
 * waitForRequests - wait until the request list has settled.
 * The list is considered settled once isReqFinished() is true on two checks
 * that are 3 seconds apart; while requests are pending it is polled once per
 * second.
 * @param {array} reqs - request list (mutated elsewhere by the page events)
 * @param {number} timeout - maximum time to wait, in milliseconds
 * @return {bool} issettled - false when the timeout expired first
 */
async function waitForRequests(reqs, timeout) {
  const startwaittime = Date.now();
  let isdone = false;

  while (true) {
    if (Date.now() - startwaittime >= timeout) {
      return false;
    }

    if (isReqFinished(reqs)) {
      if (isdone) {
        return true;
      }

      await sleep(3000);
      isdone = true;
    } else {
      isdone = false;
      await sleep(1000);
    }
  }
}

/**
 * analyze page - open url in a new page, record every request the page makes
 * and, optionally, take a full-page JPEG screenshot.
 * @param {object} browser - browser
 * @param {string} url - url
 * @param {object} viewport - {width, height, deviceScaleFactor,
 *                            isMobile, hasTouch, isLandscape}
 * @param {object} options - {screenshots, logs, timeout, screenshotsDelay};
 *   timeout and screenshotsDelay are given in seconds, timeout defaults to
 *   3 minutes
 * @return {object} result - {error: err} when navigation fails or a wait
 *   times out (error is then the string 'timeout'), otherwise {ret: ret}
 *   with ret = {pageTime, pageBytes, errs, logs, screenshots, reqs}; reqs is
 *   only present when at least one request was recorded
 */
async function analyzePage(browser, url, viewport, options) {
  const ipmgr = new IPMgr();

  let needscreenshots = false;
  let needlogs = false;
  let timeout = 3 * 60 * 1000;
  let screenshotsDelay = 0;

  if (options) {
    if (options.screenshots) {
      needscreenshots = true;
    }

    if (options.logs) {
      needlogs = true;
    }

    if (options.timeout > 0) {
      timeout = options.timeout * 1000;
    }

    if (options.screenshotsDelay > 0) {
      screenshotsDelay = options.screenshotsDelay * 1000;
    }
  }

  const page = await browser.newPage();

  // The page is closed explicitly at the same points as before (so no page
  // event can touch the request list while the result is being built); the
  // flag lets the `finally` below act as a safety net on unexpected errors
  // without closing twice.
  let pageClosed = false;
  const closePage = async () => {
    if (pageClosed) {
      return;
    }

    pageClosed = true;
    await page.close();
  };

  try {
    await page.setCacheEnabled(false);
    if (viewport) {
      await page.setViewport(viewport);
    }

    const pagebt = Date.now();
    const lstErr = [];
    const lstReq = [];
    const lstLogs = [];
    const lstScreenshots = [];
    // Once set, request/response events are ignored.
    let waitend = false;

    page.on('console', (msg) => {
      if (needlogs) {
        lstLogs.push(msg.text());
      }

      if (msg.type() === 'error') {
        lstErr.push(msg.text());
      }
    });

    page.on('error', (err) => {
      lstErr.push(err.toString());

      log.error('ERROR - ' + err.toString());
    });

    page.on('request', (req) => {
      if (waitend) {
        return;
      }

      const requrl = getURL(req.url());
      if (findReq(lstReq, requrl)) {
        // Only the first request per normalized url is tracked.
        return;
      }

      log.info('request - ', requrl);

      lstReq.push({
        url: requrl,
        st: Date.now(),
        et: -1,
        status: 0,
        buflen: 0,
        contentType: '',
        isGZip: false,
        imgWidth: 0,
        imgHeight: 0,
        hasres: false,
      });
    });

    page.on('requestfailed', (req) => {
      if (waitend) {
        return;
      }

      const requrl = getURL(req.url());
      const curreq = findReq(lstReq, requrl);
      if (curreq && curreq.status === 0) {
        curreq.status = -1;
        curreq.et = Date.now();
      }

      log.info('requestfailed - ', requrl);
    });

    page.on('response', async (res) => {
      if (waitend) {
        return;
      }

      const resurl = getURL(res.url());

      log.info('response - ', resurl);

      const req = findReq(lstReq, resurl);
      if (!req) {
        log.info('no response', resurl);

        return;
      }

      req.hasres = true;

      let status = -1;
      try {
        status = res.status();

        const headers = res.headers();
        const contentType = headers['content-type'];
        const contentEncoding = headers['content-encoding'];

        // Video bodies and 302 responses are not downloaded; only the
        // status and end time are recorded.
        const isVideo = Boolean(contentType) && contentType.startsWith('video');
        if (isVideo || status === 302) {
          req.et = Date.now();
          req.status = status;

          return;
        }

        const buf = await res.buffer();

        req.buflen = buf.byteLength;
        req.et = Date.now();
        req.status = status;

        if (contentType) {
          req.contentType = contentType;
        }

        if (contentEncoding && contentEncoding.indexOf('gzip') >= 0) {
          req.isGZip = true;
        }

        if (req.contentType.indexOf('image/') >= 0) {
          const ir = await getImageInfo(buf);
          if (ir) {
            req.imgWidth = ir.w;
            req.imgHeight = ir.h;
          }
        }

        const remoteaddr = res.remoteAddress();
        if (remoteaddr) {
          req.remoteaddr = remoteaddr.ip + ':' + remoteaddr.port;
        }
      } catch (err) {
        // The body may be unavailable (for example on redirect responses).
        // Without this handler the rejection would be unhandled and the
        // request would stay pending (status 0), stalling the wait loop
        // until the global timeout.
        log.error('analyzePage.response', resurl, err);

        if (req.status === 0) {
          req.et = Date.now();
          req.status = status || -1;
        }
      }
    });

    try {
      await page.goto(url, {timeout: timeout});
    } catch (err) {
      log.error('analyzePage.goto', url, err);

      await closePage();

      return {error: err};
    }

    if (!(await waitForRequests(lstReq, timeout))) {
      await closePage();

      return {error: 'timeout'};
    }

    const pageet = Date.now();

    if (needscreenshots) {
      if (screenshotsDelay > 0) {
        await sleep(screenshotsDelay);
      }

      const screenshotOptions = {
        fullPage: true,
        type: 'jpeg',
        quality: 60,
      };

      // The first capture is thrown away and the request list is allowed to
      // settle again before the real one.
      // NOTE(review): presumably this warm-up pass exists to trigger
      // lazy-loaded content - confirm.
      await page.screenshot(screenshotOptions);

      // Bounded by the same timeout as the initial wait so that a request
      // which never completes cannot hang the analysis forever.
      if (!(await waitForRequests(lstReq, timeout))) {
        await closePage();

        return {error: 'timeout'};
      }

      waitend = true;

      const buf = await page.screenshot(screenshotOptions);

      lstScreenshots.push({
        name: 'screenshot.jpg',
        type: messages.AnalyzeScreenshotType.AST_JPG,
        buf: buf,
      });
    }

    await closePage();

    const ret = {
      pageTime: pageet - pagebt,
      pageBytes: 0,
      errs: lstErr,
      logs: lstLogs,
      screenshots: lstScreenshots,
    };

    if (lstReq.length > 0) {
      ret.reqs = [];

      // Resolved one at a time, in request order.
      for (const req of lstReq) {
        const curreq = {
          url: req.url,
          downloadTime: req.et - req.st,
          status: req.status,
          bufBytes: req.buflen,
          startTime: req.st,
          isGZip: req.isGZip,
          contentType: req.contentType,
          imgWidth: req.imgWidth,
          imgHeight: req.imgHeight,
        };

        curreq.ipaddr = await ipmgr.getIP(req.url);

        ret.reqs.push(curreq);
      }
    }

    log.info('page time is ', ret.pageTime);

    for (const req of lstReq) {
      ret.pageBytes += req.buflen;
    }

    log.info('page bytes is ', ret.pageBytes);

    log.info('err - ', JSON.stringify(lstErr));
    log.info('request - ', JSON.stringify(lstReq));

    return {
      ret: ret,
    };
  } finally {
    // Safety net: make sure the page is released even when something above
    // threw. A failure to close here is logged rather than thrown so it
    // cannot mask the original error.
    await closePage().catch((err) => {
      log.error('analyzePage.close', url, err);
    });
  }
}

exports.analyzePage = analyzePage;