UNPKG

easy_web_crawler

Version:

Web crawler wrapper around the puppeteer module to simplify crawling of AJAX/JavaScript-enabled pages.

164 lines (143 loc) 4.74 kB
// This file is used to generate documentation only; every method body below is an empty stub.

/**
 * Main Scraper class.
 * @example
 * // npm install easy_web_crawler
 * const Scaper = require('easy_web_crawler');
 * const scraper = new Scaper();
 */
class Scaper {

    constructor() {
    }

    /**
     * This is mandatory.<br>
     * Takes the list of URLs used as the starting point.
     * @param {(string|string[])} listOfURLs - a single URL or an array of URLs
     * @example
     * // add the urls as the starting point
     * scraper.startWithURLs(['www.google.com', 'www.bing.com'])
     * scraper.startWithURLs('www.google.com')
     */
    startWithURLs(listOfURLs) {
    }

    /**
     * Takes a non-async callback function as argument; a URL is added to the
     * processing queue only if the function returns a true value.<br>
     * This is optional.<br>
     * By default all URLs added to the processing queue are accepted.<br>
     * @param {function} nonAsyncFunction - synchronous predicate called with the URL
     * @example
     * // accept urls containing www.google.com
     * scraper.allowIfMatches(function(url) {
     *     return url.indexOf('www.google.com') > -1
     * })
     */
    allowIfMatches(nonAsyncFunction) {
    }

    /**
     * This is an optional setting.<br>
     * This will save your progress in the file, so you can stop the scraper and
     * start it again from the previous state.<br>
     * The file is a SQLite db file; you can modify its content using SQLite clients.<br>
     * If no file is specified, the state is stored in memory.
     * @param {string} filePath
     * @example
     * // state stored in state.db file
     * scraper.saveProgressInFile("./state.db")
     */
    saveProgressInFile(filePath) {
    }

    /**
     * This will allow the scraper to automatically download all the links from
     * the page and add them to the processing queue.<br>
     * Note that the URLs will be filtered out if the allowIfMatches function returns 'false'.
     * @param {boolean} flag - true to enable
     * @example
     * scraper.enableAutoCrawler(true)
     */
    enableAutoCrawler(flag) {
    }

    /**
     * Time delay between each page load, in milliseconds.
     * @param {number} [delayInMilliSeconds=0]
     * @example
     * // wait for 90 milliseconds between page loads
     * scraper.waitBetweenPageLoad(90)
     */
    waitBetweenPageLoad(delayInMilliSeconds) {
    }

    /**
     * Final callback, invoked when scraping is completed.
     * @param {function} asyncFunction - callback that receives the result
     * @example
     * scraper.callbackOnFinish(function(result){
     *     console.log(result)
     * })
     */
    callbackOnFinish(asyncFunction) {
    }

    /**
     * This is the main function. Your scraping logic is to be defined in this function.<br>
     * It is called for each page in the processing queue.<br>
     * It is called with the puppeteer page object as input.<br>
     * The page object has additional methods to support scraping.
     * @see Page
     * @param {function} asyncFunction - an async function with a single input argument, page.
     * @example
     * scraper.callbackOnPageLoad(async function(page){
     *     const article = await page.$eval('article', tag => tag.innerText);
     *     page.saveResult(article)
     * })
     */
    callbackOnPageLoad(asyncFunction) {
    }

    /**
     * Starts the scraping process.
     * The callbackOnFinish function is called once the scraping is completed.
     * @example
     * scraper.start()
     */
    start() {
    }
}

/**
 * Puppeteer page class.
 * Enhanced with the supporting functions detailed below.
 */
class Page {

    /**
     * Download an image from a URL and save it to the local disk.
     * @param {string} image_download_url
     * @param {string} where_to_full_file_path
     * @example
     * scraper.callbackOnPageLoad(async function(page){
     *     const img = await page.$('img')
     *     const img_src = await page.evaluate(img => img.getAttribute("src"), img);
     *     page.download_image(img_src, "usr/test/profile.png")
     * })
     */
    download_image(image_download_url, where_to_full_file_path) {
    }

    /**
     * Save the text result; this will be returned as input to the callbackOnFinish function.<br>
     * Each url can store one result.
     * @param {string} text
     * @example
     * scraper.callbackOnPageLoad(async function(page){
     *     const article = await page.$eval('article', tag => tag.innerText);
     *     page.saveResult(article)
     * })
     */
    saveResult(text) {
    }

    /**
     * Write text content to a local file.
     * @param {string} content
     * @param {string} filename
     * @example
     * scraper.callbackOnPageLoad(async function(page){
     *     const article = await page.$eval('article', tag => tag.innerText);
     *     page.write_text_to_file(article, "usr/test/article.txt")
     * });
     */
    write_text_to_file(content, filename) {
    }

    /**
     * Add the url to the processing queue.
     * @param {string} url
     * @example
     * scraper.callbackOnPageLoad(async function(page){
     *     const a = await page.$('a')
     *     const url = await page.evaluate(a => a.getAttribute("href"), a);
     *     page.add_url_to_queue(url)
     * });
     */
    add_url_to_queue(url) {
    }
}