UNPKG

hammer-scrape

Version:

Unifies Cheerio and Puppeteer for the most streamlined scraping experience

144 lines 6.66 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const web_scraping_engine_1 = require("../web_scraping_engine");
const engine_type_1 = require("../engine_type");
const engine_core_type_1 = require("../engine_core_type");
const cheerio_parsing_1 = require("../cores/cheerio_parsing");
const puppeteer_parsing_1 = require("../cores/puppeteer_parsing");
const puppeteer_manipulate_1 = require("../cores/puppeteer_manipulate");
const engine_mode_1 = require("../engine_mode");
const engine_errors_1 = require("../engine_errors");
const core_errors_1 = require("../core_errors");

/**
 * Builds and initializes a Puppeteer manipulation core for `url`.
 *
 * When the engine is currently parsing with Puppeteer, the parsing core's
 * request object is handed over as `sharedRequest`; otherwise `null` is passed.
 *
 * @param {HammerEngine} engine - engine whose `sharedManager`, `usingPuppeteer`
 *   and `parsingCore` are read.
 * @param {string} url - address given to the manipulation core's constructor.
 * @returns {Promise<object>} the initialized manipulation core.
 */
async function createManipulationCore(engine, url) {
    const core = new puppeteer_manipulate_1.default(url, engine.sharedManager);
    await core.initialize({
        sharedRequest: engine.usingPuppeteer ? engine.parsingCore.getRequest() : null,
    });
    return core;
}

/**
 * Disposes whichever cores the engine currently holds and resets both slots to `null`.
 *
 * The manipulation core is still disposed when disposing the parsing core fails,
 * and both slots are cleared even on failure so that a broken core is never reused.
 *
 * @param {HammerEngine} engine - engine whose cores should be released.
 * @returns {Promise<void>}
 */
async function releaseCores(engine) {
    try {
        // `!= null` also covers `undefined`, i.e. an engine on which load() never ran.
        if (engine.parsingCore != null) {
            await engine.parsingCore.dispose();
        }
    } finally {
        engine.parsingCore = null;
        try {
            if (engine.manipulationCore != null) {
                await engine.manipulationCore.dispose();
            }
        } finally {
            engine.manipulationCore = null;
        }
    }
}

/**
 * Scraping engine that combines a Cheerio parsing core with Puppeteer parsing and
 * manipulation cores. For each processed URL it first probes the page with Cheerio
 * and only falls back to the Puppeteer parsing core when the ping selector cannot
 * be found (or when the browser is forced).
 */
class HammerEngine extends web_scraping_engine_1.WebScrapingEngine {
    /**
     * @param {string} elementPingSelector - selector probed through the Cheerio core's
     *   `elementExist`; when it is found the Cheerio core is used for parsing.
     * @param {boolean} [lazy=true] - when true, the manipulation core is not created by
     *   `process()` but on the first `manipulate()` call that needs it.
     * @param {object} [sharedManager] - forwarded as the second constructor argument of
     *   the Puppeteer cores (NOTE(review): presumably a shared browser manager — confirm).
     * @param {boolean} [forceBrowser=false] - when true, the Cheerio probe is skipped and
     *   the Puppeteer parsing core is always used.
     */
    constructor(elementPingSelector, lazy = true, sharedManager, forceBrowser = false) {
        super(engine_type_1.default.Dynamic, engine_core_type_1.default.CheerioAndPuppeteer);
        this.elementPingSelector = elementPingSelector;
        this.lazy = lazy;
        this.usingPuppeteer = false;
        this.sharingManager = sharedManager !== undefined;
        this.sharedManager = sharedManager;
        this.forceBrowser = forceBrowser;
    }

    /**
     * Resets both core slots to `null` (nothing is disposed here).
     *
     * @returns {Promise<void>}
     */
    async load() {
        this.parsingCore = null;
        this.manipulationCore = null;
    }

    /**
     * Gets a boolean that indicates if we are using puppeteer to additionally parse the page
     *
     * @returns {boolean}
     */
    usingPuppeteerParse() {
        return Boolean(this.usingPuppeteer);
    }

    /**
     * Loads `url` and selects the parsing core for it.
     *
     * Cores left over from a previous `process()` call are disposed first so they are
     * neither leaked nor reused for the wrong page. Unless the browser is forced, the
     * page is probed with Cheerio; when the ping selector is missing, the Puppeteer
     * parsing core is used for this URL only (the `forceBrowser` option is not altered).
     * In non-lazy mode the manipulation core is created as well.
     *
     * @param {string} url - address to load.
     * @returns {Promise<void>} resolves once the cores are ready; rejects when a core
     *   fails to initialize. The engine is put back into Idling mode either way.
     * @throws {EngineCannotSwitchModeError} synchronously, when the engine is not idling.
     */
    process(url) {
        if (!this.isCorrectEngineMode(engine_mode_1.default.Idling)) {
            throw new engine_errors_1.EngineCannotSwitchModeError();
        }
        return (async () => {
            this.setEngineMode(engine_mode_1.default.Loading);
            try {
                await releaseCores(this);
                this.usingPuppeteer = false;

                // Decided per call: a failed Cheerio probe must not flip the
                // constructor option for every later URL.
                let useBrowser = this.forceBrowser;
                if (!useBrowser) {
                    const cheerioCore = new cheerio_parsing_1.default(url);
                    await cheerioCore.initialize();
                    if (await cheerioCore.elementExist(this.elementPingSelector)) {
                        this.parsingCore = cheerioCore;
                    } else {
                        useBrowser = true;
                    }
                }
                if (useBrowser) {
                    const puppeteerCore = new puppeteer_parsing_1.default(url, this.sharedManager);
                    await puppeteerCore.initialize();
                    this.usingPuppeteer = true;
                    this.parsingCore = puppeteerCore;
                }

                // In lazy mode the manipulation core is deferred until manipulate() needs it.
                if (!this.lazy) {
                    this.manipulationCore = await createManipulationCore(this, url);
                }
            } finally {
                // Never leave the engine stuck in Loading when initialization fails.
                this.setEngineMode(engine_mode_1.default.Idling);
            }
        })();
    }

    /**
     * Runs `callback` against the current parsing core while the engine is in Parsing mode.
     *
     * @param {function(object): (Promise<void>|void)} callback - receives the value of
     *   `getParsingCore()`.
     * @returns {Promise<void>} settles with the callback's outcome; the engine returns to
     *   Idling mode whether the callback succeeds or fails.
     * @throws {EngineCannotSwitchModeError} synchronously, when the engine is not idling.
     */
    parse(callback) {
        if (!this.isCorrectEngineMode(engine_mode_1.default.Idling)) {
            throw new engine_errors_1.EngineCannotSwitchModeError();
        }
        return (async () => {
            this.setEngineMode(engine_mode_1.default.Parsing);
            try {
                await callback(this.getParsingCore());
            } finally {
                this.setEngineMode(engine_mode_1.default.Idling);
            }
        })();
    }

    /**
     * Runs `callback` through the base class' `manipulate`, creating the manipulation
     * core on demand in lazy mode.
     *
     * When Cheerio is the parsing core, the document HTML is read back from the
     * manipulation core afterwards and fed to the parsing core's request via
     * `runFromHtml`, so that later parsing works on the manipulated page.
     *
     * @param {function} callback - forwarded unchanged to `super.manipulate`.
     * @returns {Promise<void>} rejects with `CoreNotInitializedError` in lazy mode when no
     *   parsing core exists yet, or with whatever the cores / base class reject with.
     * @throws {EngineCannotSwitchModeError} synchronously, when the engine is not idling.
     */
    manipulate(callback) {
        if (!this.isCorrectEngineMode(engine_mode_1.default.Idling)) {
            throw new engine_errors_1.EngineCannotSwitchModeError();
        }
        return (async () => {
            if (this.lazy) {
                if (this.parsingCore == null) {
                    throw new core_errors_1.CoreNotInitializedError();
                }
                // Create the core only once; later calls reuse it instead of failing.
                if (this.manipulationCore == null) {
                    this.manipulationCore = await createManipulationCore(this, this.parsingCore.getUrl());
                }
            }

            // call the original manipulate method which will handle setting the proper modes
            await super.manipulate(callback);

            // if we are using cheerio as our parsing engine we need to rebuild the page data that we are working on
            if (!this.usingPuppeteer) {
                const html = await this.manipulationCore.getDocumentHtml();
                // Awaiting is harmless if runFromHtml is synchronous and closes a race if it is not.
                await this.parsingCore.getRequest().runFromHtml(html);
            }
        })();
    }

    /**
     * Disposes the parsing and manipulation cores (if any) and clears both slots.
     *
     * @returns {Promise<void>} rejects when a core fails to dispose; the other core is
     *   still disposed in that case.
     */
    async shutoff() {
        await releaseCores(this);
    }
}
exports.HammerEngine = HammerEngine;
exports.default = HammerEngine;
//# sourceMappingURL=hammer_engine.js.map