easy_web_crawler
Version:
Web crawler wrapper around puppeteer module to simply the crawling on ajax/java script enabled pages.
49 lines (47 loc) • 1.73 kB
JavaScript
var Scraper = require('../index.js')
var isArrayEqual = function(x, y) {
return _(x).xorWith(y, _.isEqual).isEmpty();
};
var _ = require('lodash');
async function main(searchfor) {
var scraper = new Scraper();
scraper.startWithURLs("https://www.youtube.com/results?search_query=" + searchfor)
scraper.waitBetweenPageLoad(0)
scraper.callbackOnPageLoad(async function (page) {
var result = []
var prev = []
for (var j = 0; j < 150; j++) {
var now = []
await page.evaluate(x => {
window.scrollBy(0, document.body.scrollHeight);
}, 0);
await page.waitFor(1000);
try {
var elems = await page.$$('ytd-video-renderer')
for (var i = 0; i < elems.length; i++) {
var elem = elems[i]
var title = await elem.$('#video-title')
var img = await elem.$('#img')
title = await page.evaluate(body => body.innerHTML, title);
img = await page.evaluate(body => body.getAttribute("src"), img);
now.push({ title: title.trim(), img: img })
}
} catch (e) { }
/* check if new items loaded */
if (false && isArrayEqual(now, prev)) {
break;
} else {
prev = now;
result = result.concat(now)
}
}
result = _.uniqBy(result,_.isEqual)
page.write_text_to_file(JSON.stringify(result), "./example/result.json")
});
scraper.callbackOnFinish(function (result) {
;
})
await scraper.start()
}
var search=""
main(search)