hammer-scrape
Version:
Unifies Cheerio and Puppeteer for the most streamlined scraping experience
144 lines • 6.66 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const web_scraping_engine_1 = require("../web_scraping_engine");
const engine_type_1 = require("../engine_type");
const engine_core_type_1 = require("../engine_core_type");
const cheerio_parsing_1 = require("../cores/cheerio_parsing");
const puppeteer_parsing_1 = require("../cores/puppeteer_parsing");
const puppeteer_manipulate_1 = require("../cores/puppeteer_manipulate");
const engine_mode_1 = require("../engine_mode");
const engine_errors_1 = require("../engine_errors");
const core_errors_1 = require("../core_errors");
/**
 * Scraping engine that combines a Cheerio parsing core with Puppeteer cores.
 *
 * `process(url)` first probes the page with the Cheerio parsing core; when the
 * configured "ping" selector cannot be found there (or the browser is forced),
 * the Puppeteer parsing core is used instead. Page manipulation always goes
 * through the Puppeteer manipulation core, which in lazy mode is only created
 * the first time `manipulate()` is called.
 *
 * Every asynchronous method returns a promise that settles on both success and
 * failure, and the engine mode is restored to Idling when an operation fails.
 */
class HammerEngine extends web_scraping_engine_1.WebScrapingEngine {
    /**
     * @param elementPingSelector selector probed through the Cheerio core to
     *        decide whether Cheerio alone is enough to parse the page.
     * @param lazy when true (default) the manipulation core is created on the
     *        first `manipulate()` call instead of inside `process()`.
     * @param sharedManager optional manager handed to every Puppeteer core.
     * @param forceBrowser when true the Cheerio probe is skipped and the
     *        Puppeteer parsing core is always used. Defaults to false.
     */
    constructor(elementPingSelector, lazy = true, sharedManager, forceBrowser = false) {
        super(engine_type_1.default.Dynamic, engine_core_type_1.default.CheerioAndPuppeteer);
        this.elementPingSelector = elementPingSelector;
        this.lazy = lazy;
        this.usingPuppeteer = false;
        this.sharingManager = sharedManager !== undefined;
        this.sharedManager = sharedManager;
        this.forceBrowser = forceBrowser;
    }
    /**
     * Resets both core references to null.
     * @returns a promise that resolves once the references are cleared.
     */
    load() {
        this.parsingCore = null;
        this.manipulationCore = null;
        return Promise.resolve();
    }
    /**
     * Gets a boolean that indicates if we are using puppeteer to additionally parse the page
     */
    usingPuppeteerParse() {
        return Boolean(this.usingPuppeteer);
    }
    /**
     * Loads `url` into a parsing core (and, when not lazy, a manipulation core).
     *
     * @param url address of the page to load.
     * @returns a promise that resolves once the cores are ready and rejects if
     *          any core fails to dispose or initialize.
     * @throws EngineCannotSwitchModeError synchronously when the engine is not idling.
     */
    process(url) {
        if (!this.isCorrectEngineMode(engine_mode_1.default.Idling)) {
            throw new engine_errors_1.EngineCannotSwitchModeError();
        }
        return (async () => {
            this.setEngineMode(engine_mode_1.default.Loading);
            try {
                // Release cores left over from an earlier process() call so they are not
                // leaked and a lazily created manipulation core is never reused for another url.
                await disposeEngineCores(this);
                this.usingPuppeteer = false;
                if (!this.forceBrowser) {
                    // Try to find the element we want using cheerio first; if cheerio
                    // cannot find it we fall through to the puppeteer parsing core.
                    const cheerioParsingCore = new cheerio_parsing_1.default(url);
                    await cheerioParsingCore.initialize();
                    const elementExist = await cheerioParsingCore.elementExist(this.elementPingSelector);
                    if (elementExist) {
                        this.parsingCore = cheerioParsingCore;
                    }
                    else {
                        // NOTE: this flips the instance-wide flag, so every later process()
                        // call on this engine skips the cheerio probe as well.
                        this.forceBrowser = true;
                    }
                }
                if (this.forceBrowser) {
                    const puppeteerParsingCore = new puppeteer_parsing_1.default(url, this.sharedManager);
                    await puppeteerParsingCore.initialize();
                    this.usingPuppeteer = true;
                    this.parsingCore = puppeteerParsingCore;
                }
                // In lazy mode the manipulation core is not created until manipulate() needs it.
                if (!this.lazy) {
                    this.manipulationCore = await createManipulationCore(url, this.sharedManager, this.usingPuppeteer ? this.parsingCore : null);
                }
            }
            finally {
                // Always return to Idling so a failed load does not lock the engine in Loading mode.
                this.setEngineMode(engine_mode_1.default.Idling);
            }
        })();
    }
    /**
     * Runs `callback` against the current parsing core while the engine is in Parsing mode.
     *
     * @param callback receives the parsing core; its result is awaited.
     * @returns a promise that resolves when the callback finishes and rejects if it fails.
     * @throws EngineCannotSwitchModeError synchronously when the engine is not idling.
     */
    parse(callback) {
        if (!this.isCorrectEngineMode(engine_mode_1.default.Idling)) {
            throw new engine_errors_1.EngineCannotSwitchModeError();
        }
        return (async () => {
            this.setEngineMode(engine_mode_1.default.Parsing);
            try {
                await callback(this.getParsingCore());
            }
            finally {
                // Restore Idling even when the callback throws or rejects.
                this.setEngineMode(engine_mode_1.default.Idling);
            }
        })();
    }
    /**
     * Runs `callback` through the base class's manipulate(), creating the
     * manipulation core first when lazy mode has deferred it. When Cheerio is
     * the parsing core, the parsing core's request is afterwards re-run from
     * the manipulated document's HTML.
     *
     * @param callback forwarded unchanged to the base class's manipulate().
     * @returns a promise that resolves when manipulation (and the Cheerio
     *          refresh, if any) has finished, and rejects on any failure.
     * @throws EngineCannotSwitchModeError synchronously when the engine is not idling.
     */
    manipulate(callback) {
        if (!this.isCorrectEngineMode(engine_mode_1.default.Idling)) {
            throw new engine_errors_1.EngineCannotSwitchModeError();
        }
        return (async () => {
            // Lazy mode: build the manipulation core on first use and reuse it afterwards.
            if (this.lazy && this.manipulationCore == null) {
                if (this.parsingCore == null) {
                    // process() has not produced a parsing core yet, so there is no url to load.
                    throw new core_errors_1.CoreNotInitializedError();
                }
                this.manipulationCore = await createManipulationCore(this.parsingCore.getUrl(), this.sharedManager, this.usingPuppeteer ? this.parsingCore : null);
            }
            // call the original manipulate method which will handle setting the proper modes
            await super.manipulate(callback);
            // if we are using cheerio as our parsing engine we need to rebuild the page data that we are working on
            if (!this.usingPuppeteer) {
                const html = await this.manipulationCore.getDocumentHtml();
                // awaiting is harmless if runFromHtml is synchronous and surfaces failures if it is not
                await this.parsingCore.getRequest().runFromHtml(html);
            }
        })();
    }
    /**
     * Disposes whichever cores exist and clears their references.
     *
     * @returns a promise that resolves once disposal is finished and rejects
     *          if a core fails to dispose (the other core is still attempted).
     */
    shutoff() {
        return disposeEngineCores(this);
    }
}
/**
 * Creates and initializes a Puppeteer manipulation core for `url`.
 * `requestOwner` is the Puppeteer parsing core whose request should be shared,
 * or null when there is no request to share.
 */
async function createManipulationCore(url, sharedManager, requestOwner) {
    const puppeteerManipulatingCore = new puppeteer_manipulate_1.default(url, sharedManager);
    await puppeteerManipulatingCore.initialize({
        sharedRequest: requestOwner !== null ? requestOwner.getRequest() : null,
    });
    return puppeteerManipulatingCore;
}
/**
 * Disposes the engine's parsing and manipulation cores, nulling each reference
 * after a successful dispose. The manipulation core is attempted even when
 * disposing the parsing core fails.
 */
async function disposeEngineCores(engine) {
    try {
        // `!= null` also skips `undefined`, i.e. an engine on which load() was never called.
        if (engine.parsingCore != null) {
            await engine.parsingCore.dispose();
            engine.parsingCore = null;
        }
    }
    finally {
        if (engine.manipulationCore != null) {
            await engine.manipulationCore.dispose();
            engine.manipulationCore = null;
        }
    }
}
// Expose the engine both as a named export and as the module's default export.
exports.HammerEngine = HammerEngine;
exports.default = HammerEngine;
//# sourceMappingURL=hammer_engine.js.map