UNPKG

jarviscrawlercore

Version:
588 lines (482 loc) 14.6 kB
const {WaitFrameNavigated} = require('../waitframenavigated'); const {closeDialog} = require('./utils'); const {sleep} = require('../utils'); const log = require('../log'); // const {getElementPropertyString} = require('../eleutils'); /** * parseURL - parse URL * @param {string} url - url * @return {string} url - url */ function parseURL(url) { const url1 = url.split('.com/'); if (url1.length > 1) { const url2 = url1[1].split('&ti='); return url2[0]; } return url; } /** * getFirstProductURL - get first product url * @param {object} page - page * @param {number} timeout - timeout in microseconds * @return {string} url - url */ async function getFirstProductURL(page) { let awaiterr; const ret = await page .$$eval('.plp-products-wrap', (eles) => { console.log(eles); if (eles.length > 0) { eles = eles[0].getElementsByClassName('product'); if (eles.length > 0) { const link = eles[0].getElementsByTagName('a'); if (link.length > 0) { return link[0].href; } } } return ''; }) .catch((err) => { awaiterr = err; }); if (awaiterr) { log.error('getFirstProductURL.$$eval .plp-products-wrap', awaiterr); return ''; } return ret; } // /** // * getPageURL - get page url // * @param {object} page - page // * @param {number} pageid - pageid, is like 1, 2, 3 // * @return {object} ret - {error, url} // */ // async function getPageURL(page, pageid) { // let awaiterr; // const url = await page // .evaluate((pageid) => { // const lstpages = document.getElementsByClassName('page-link'); // if (lstpages.length > 0) { // const lsta = lstpages[pageid - 2].getElementsByTagName('a'); // if (lsta.length > 0) { // return lsta[0].href; // } // } // return ''; // }, pageid) // .catch((err) => { // awaiterr = err; // }); // if (awaiterr) { // return {error: awaiterr}; // } // return {url: url}; // } /** * getPageURLWithIndex - get page url * @param {object} page - page * @param {number} pageindex - pageindex, is like 0, 1, 2, 3 * @return {object} ret - {error, url} */ async function getPageURLWithIndex(page, pageindex) { let awaiterr; const url = await page .evaluate((pageindex) => { const lstpages = document.getElementsByClassName('page-link'); if ( lstpages.length > 0 && pageindex >= 0 && pageindex < lstpages.length ) { const lsta = lstpages[pageindex].getElementsByTagName('a'); if (lsta.length > 0) { return lsta[0].href; } } return ''; }, pageindex) .catch((err) => { awaiterr = err; }); if (awaiterr) { return {error: awaiterr}; } return {url: url}; } /** * countPageObjIndex - count pageobj index * @param {object} page - page * @param {number} pageid - pageid, is like 1, 2, 3 * @return {object} ret - {error, pi} */ async function countPageObjIndex(page, pageid) { let awaiterr; const pi = await page .evaluate((pageid) => { const lstpages = document.getElementsByClassName('page-link'); if (lstpages.length > 0) { let mini = 1; let maxi = 1; for (let i = 0; i < lstpages.length; ++i) { try { const curtext = lstpages[i].innerText; const curpi = parseInt(curtext); if (i == 1) { mini = curpi; } else if (i == lstpages.length - 2) { maxi = curpi; } if (curpi == pageid) { return i; } } catch (err) {} } if (pageid > maxi) { if (maxi >= 95) { return -(lstpages.length - 3); } return -(lstpages.length - 2); } if (pageid < mini) { return -1; } } return -99999; }, pageid) .catch((err) => { awaiterr = err; }); if (awaiterr) { return {error: awaiterr}; } return {pi: pi}; } /** * chgPage - change to page * @param {object} page - page * @param {number} pageid - pageid, is like 1, 2, 3 * @param {string} baseurl - baseurl * @param {string} firsturl - firsturl * @param {number} timeout - timeout in microseconds * @return {error} err - error */ async function chgPage(page, pageid, baseurl, firsturl, timeout) { if (pageid > 1) { await sleep(3 * 1000); let awaiterr; await page .waitForSelector('.page-link', {timeout: timeout}) .catch((err) => { awaiterr = err; }); if (awaiterr) { return awaiterr; } const mainframe = await page.mainFrame(); const waitchgpage = new WaitFrameNavigated(page, mainframe, async (frame) => { const url = frame.url(); return url.indexOf(baseurl) == 0; }); let cpi = -1; while (true) { await page .waitForSelector('.page-link', {timeout: timeout}) .catch((err) => { awaiterr = err; }); if (awaiterr) { return awaiterr; } const cpoi = await countPageObjIndex(page, pageid); if (cpoi.error) { return cpoi.error; } if (cpoi.pi == -99999) { return new Error('chgPage invalid pi(-99999)'); } const lstpages = await page.$$('.page-link').catch((err) => { awaiterr = err; }); if (awaiterr) { return awaiterr; } if (cpoi.pi >= 0 && cpoi.pi >= lstpages.length) { return new Error( 'chgPage invalid pi(' + cpoi.pi + ',' + lstpages.length + ')', ); } if (cpoi.pi >= 0) { cpi = cpoi.pi; break; } const ccpi = -cpoi.pi; await lstpages[ccpi].hover().catch((err) => { awaiterr = err; }); if (awaiterr) { return awaiterr; } const urlret = await getPageURLWithIndex(page, ccpi); if (urlret.error) { return urlret.error; } baseurl = urlret.url; await lstpages[ccpi].click().catch((err) => { awaiterr = err; }); if (awaiterr) { return awaiterr; } waitchgpage.resetex(); const isok = await waitchgpage.waitDone(timeout); if (!isok) { return new Error('chgPage.waitDone timeout'); } } if (cpi < 0) { return new Error('chgPage invalid cpi'); } const lstpages = await page.$$('.page-link').catch((err) => { awaiterr = err; }); if (awaiterr) { return awaiterr; } await lstpages[cpi].hover().catch((err) => { awaiterr = err; }); if (awaiterr) { return awaiterr; } const urlret = await getPageURLWithIndex(page, cpi); if (urlret.error) { return urlret.error; } baseurl = urlret.url; await lstpages[cpi].click().catch((err) => { awaiterr = err; }); if (awaiterr) { return awaiterr; } const isok = await waitchgpage.waitDone(timeout); if (!isok) { return new Error('chgPage.waitDone timeout'); } waitchgpage.release(); let curms = 0; while (true) { const cururl = await getFirstProductURL(page); if (cururl != '' && cururl != firsturl) { break; } await sleep(1000); curms += 1000; if (curms > timeout) { break; } } } return undefined; } /** * steepandcheapProducts - steepandcheap products * @param {object} browser - browser * @param {string} url - url * @param {number} pageid - pageid, is like 1, 2, 3 * @param {number} timeout - timeout in microseconds * @return {object} ret - {error, ret} */ async function steepandcheapProducts(browser, url, pageid, timeout) { let awaiterr = undefined; const page = await browser.newPage(); await page .setViewport({ width: 1280, height: 600, deviceScaleFactor: 1, }) .catch((err) => { awaiterr = err; }); if (awaiterr) { log.error('steepandcheapProducts.setViewport', awaiterr); await page.close(); return {error: awaiterr.toString()}; } // await page.setRequestInterception(true); // page.on('request', async (req) => { // const rt = req.resourceType(); // if (rt == 'image' || rt == 'media' || rt == 'font') { // await req.abort(); // return; // } // await req.continue(); // }); // if (pageid > 1) { // url += '&page=' + (pageid - 1).toString(); // } const furl = 'https://www.steepandcheap.com/' + url + '?sort=-price'; await page .goto(furl, { timeout: timeout, }) .catch((err) => { awaiterr = err; }); if (awaiterr) { log.error('steepandcheapProducts.goto', awaiterr); await page.close(); return {error: awaiterr.toString()}; } awaiterr = await closeDialog(page); if (awaiterr) { log.error('steepandcheapProducts.chgPage ', awaiterr); await page.close(); return {error: 'steepandcheapProducts.chgPage ' + awaiterr.toString()}; } const firsturl = await getFirstProductURL(page); awaiterr = await chgPage(page, pageid, furl, firsturl, timeout); if (awaiterr) { log.error('steepandcheapProducts.chgPage ', awaiterr); await page.close(); return {error: awaiterr.toString()}; } await page.waitForSelector('.plp-products-wrap').catch((err) => { awaiterr = err; }); if (awaiterr) { log.error( 'steepandcheapProducts.waitForSelector .plp-products-wrap', awaiterr, ); await page.close(); return {error: awaiterr.toString()}; } const ret = await page .$$eval('.plp-products-wrap', (eles) => { console.log(eles); if (eles.length > 0) { eles = eles[0].getElementsByClassName('product'); const ret = []; for (let i = 0; i < eles.length; ++i) { const curret = {currency: 'USD'}; const curele = eles[i]; const isnew = curele.getElementsByClassName('pli-new-icon'); if (isnew.length > 0) { curret.isNew = true; } const stocklevel = curele.getElementsByClassName('pli-stock-level'); if (stocklevel.length > 0) { const stocklevelarr = stocklevel[0].innerText.trim().split(' ', -1); if (stocklevelarr.length != 3) { console.log('invalid stock level ' + stocklevel[0].innerText); } else { try { curret.stockLevel = parseInt(stocklevelarr[1]); } catch (err) { console.log('invalid stock level ' + stocklevel[0].innerText); } } } const brandname = curele.getElementsByClassName('ui-pl-name-brand'); if (brandname.length > 0) { curret.brandName = brandname[0].innerText; } const titlename = curele.getElementsByClassName('ui-pl-name-title'); if (titlename.length > 0) { curret.productName = titlename[0].innerText.split('-', -1); } const lowprice = curele.getElementsByClassName( 'ui-pl-pricing-low-price', ); if (lowprice.length > 0) { const lowpricearr = lowprice[0].innerText.split('$', -1); if (lowpricearr.length != 2) { console.log('invalid low price ' + lowprice[0].innerText); } else { try { curret.curPrice = parseFloat(lowpricearr[1]); } catch (err) { console.log('invalid low price ' + lowprice[0].innerText); } } } const highprice = curele.getElementsByClassName( 'ui-pl-pricing-high-price', ); if (highprice.length > 0) { const highpricearr = highprice[0].innerText.split('$', -1); if (highpricearr.length != 2) { console.log('invalid high price ' + highprice[0].innerText); } else { try { curret.price = parseFloat(highpricearr[1]); } catch (err) { console.log('invalid high price ' + highprice[0].innerText); } } } const ratingbase = curele.getElementsByClassName('rating-base'); if (ratingbase.length > 0) { try { curret.ratingValue = parseInt( ratingbase[0].children[0].innerText, ); } catch (err) { console.log('invalid rating-base ' + ratingbase[0]); } } const reviews = curele.getElementsByClassName('ui-pl-reviews'); if (reviews.length > 0) { try { curret.reviews = parseInt(reviews[0].children[2].innerText); } catch (err) { console.log('invalid reviews ' + reviews[0]); } } const link = curele.getElementsByTagName('a'); if (link.length > 0) { curret.url = link[0].href; } ret.push(curret); } return ret; } return []; }) .catch((err) => { awaiterr = err; }); if (awaiterr) { log.error('steepandcheapProducts.$$eval .product', awaiterr); await page.close(); return {error: awaiterr.toString()}; } const ret1 = await page .$$eval('.page-number', (eles) => { const ret1 = {}; try { ret1.maxPage = parseInt(eles[eles.length - 1].innerText); } catch (err) { console.log('invalid page number ' + eles[eles.length - 1]); } return ret1; }) .catch((err) => { awaiterr = err; }); if (awaiterr) { log.error('steepandcheapProducts.$$eval .page-number', awaiterr); await page.close(); return {error: awaiterr.toString()}; } await page.close(); for (let i = 0; i < ret.length; ++i) { ret[i].url = parseURL(ret[i].url); } return {ret: {maxPage: ret1.maxPage, products: ret}}; } exports.steepandcheapProducts = steepandcheapProducts;