UNPKG

ayakashi

Version:

The next generation web scraping framework

259 lines (258 loc) 13.4 kB
"use strict";
// Compiler-emitted helper (this file looks like TypeScript output): drives a
// generator as an async function. Every yielded value is adopted as a promise;
// its fulfilment/rejection is fed back into the generator via next()/throw().
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// Compiler-emitted helper: wraps a non-ES module so it can be read via `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const core_1 = __importDefault(require("@ayakashi/request/core"));
const createConnection_1 = require("../engine/createConnection");
const prelude_1 = require("../prelude/prelude");
const yield_1 = require("../prelude/actions/yield");
const request_1 = require("../prelude/actions/request");
const cookies_1 = require("../prelude/actions/cookies");
const path_1 = require("path");
const pipeproc_1 = require("pipeproc");
const compiler_1 = require("../preloaderCompiler/compiler");
const opLog_1 = require("../opLog/opLog");
const client_1 = require("../bridge/client");
const cookies_2 = require("./cookies");
const cookieHelpers_1 = require("../utils/cookieHelpers");
const loaders_1 = require("./loaders");
const debug_1 = __importDefault(require("debug"));
const d = debug_1.default("ayakashi:scraperWrapper");
/**
 * Runs a single scraper module against a freshly created browser connection.
 *
 * Steps, in order: obtain a target from the bridge, create a connection,
 * apply the certificate/user-agent/cookie setup, build the ayakashi instance
 * via the prelude, attach the request/cookie/yield APIs, inject the built-in
 * preloaders, load external and (optionally) local extensions, activate the
 * connection, require the scraper module and call it.
 *
 * The connection is released on every exit path once it has been created.
 *
 * @param {object} log - Job descriptor. Only `log.body` is read; the fields
 *   used here are `module`, `connectionConfig.bridgePort`, `config`,
 *   `ignoreCertificateErrors`, `persistentSession`, `proxyUrl`,
 *   `storeProjectFolder`, `projectFolder`, `appRoot`, `saveTopic`,
 *   `selfTopic`, `load`, `input` and `params`.
 * @returns {Promise<void>} Resolves once the scraper has run and the
 *   connection has been released.
 * @throws Rethrows any error raised during setup, while loading the scraper
 *   module, or while running it.
 */
function scraperWrapper(log) {
    return __awaiter(this, void 0, void 0, function* () {
        // Declared outside the try so the outer catch can release a connection
        // that was created before a later setup step failed.
        let connection;
        // Set right before every release() call so release is never attempted twice.
        let releaseAttempted = false;
        try {
            const opLog = opLog_1.getOpLog();
            opLog.info("running scraper", log.body.module);
            const bridgeClient = client_1.getBridgeClient(log.body.connectionConfig.bridgePort);
            //get a tab and create a connection
            let tab;
            try {
                tab = yield bridgeClient.getTarget();
                if (!tab) {
                    throw new Error("no_target");
                }
            }
            catch (e) {
                d(e);
                opLog.error("Could not create a chrome target");
                throw e;
            }
            try {
                connection = yield createConnection_1.createConnection(tab, log.body.connectionConfig.bridgePort, log.body.config.emulatorOptions);
                if (!connection) {
                    throw new Error("no_connection");
                }
            }
            catch (e) {
                d(e);
                opLog.error("Could not create a connection");
                throw e;
            }
            //ignoreCertificateErrors option
            if (log.body.ignoreCertificateErrors) {
                yield connection.client.Security.setIgnoreCertificateErrors({ ignore: true });
            }
            //user-agent setup
            const userAgentData = yield bridgeClient.getUserAgentData({
                agent: (log.body.config.emulatorOptions && log.body.config.emulatorOptions.userAgent) || undefined,
                platform: (log.body.config.emulatorOptions && log.body.config.emulatorOptions.platform) || undefined,
                persistentSession: log.body.persistentSession
            });
            if (!userAgentData) {
                throw new Error("could not generate userAgent");
            }
            const acceptLanguage = (log.body.config.emulatorOptions && log.body.config.emulatorOptions.acceptLanguage) || "en-US";
            yield connection.client.Emulation.setUserAgentOverride({
                userAgent: userAgentData.userAgent,
                platform: userAgentData.platform,
                acceptLanguage: acceptLanguage
            });
            //get cookie jar
            const { jar, cookies } = yield cookies_2.getCookieJar(log.body.connectionConfig.bridgePort, {
                persistentSession: log.body.persistentSession
            });
            //add all cookies from the jar to chrome
            if (cookies.length > 0) {
                yield connection.client.Network.setCookies({ cookies: cookieHelpers_1.toChromeCookies(cookies) });
            }
            //add all chrome cookies to jar and to the persistent store after every page load
            connection.unsubscribers.push(connection.client.Page.domContentEventFired(function () {
                return __awaiter(this, void 0, void 0, function* () {
                    try {
                        const chromeCookies = yield connection.client.Network.getCookies();
                        cookieHelpers_1.toRequestCookies(chromeCookies.cookies).forEach(function (cookie) {
                            jar.setCookie(cookieHelpers_1.toCookieString(cookie), cookieHelpers_1.getCookieUrl(cookie));
                        });
                        yield cookies_2.updateCookieJar(log.body.connectionConfig.bridgePort, jar, {
                            persistentSession: log.body.persistentSession
                        });
                    }
                    catch (e) {
                        // Nothing awaits this event handler, so a failure here would
                        // otherwise surface as an unhandled promise rejection.
                        d(e);
                    }
                });
            }));
            //check pipes and initialize the instance using the prelude
            if (log.body.config.pipeConsole !== false) {
                connection.pipe.console(function (text) {
                    // messages tagged "[Ayakashi]" are not forwarded to the op log
                    if (text && text.indexOf("[Ayakashi]") === -1) {
                        opLog.debug(`<Scraper:${log.body.module}:Browser>`, text);
                    }
                });
            }
            if (log.body.config.pipeExceptions !== false) {
                connection.pipe.uncaughtException(function (exception) {
                    opLog.debug(`<Scraper:${log.body.module}:Browser:Exception>`, JSON.stringify(exception, null, 2));
                });
            }
            const ayakashiInstance = yield prelude_1.prelude(connection);
            //attach the request API
            const myRequest = core_1.default.defaults({
                headers: {
                    "User-Agent": userAgentData.userAgent,
                    //tslint:disable max-line-length
                    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                    //tslint:enable max-line-length
                    "accept-language": acceptLanguage,
                    "cache-control": "no-cache",
                    pragma: "no-cache"
                },
                proxy: log.body.proxyUrl || undefined,
                strictSSL: !log.body.ignoreCertificateErrors,
                gzipOrBrotli: true,
                timeout: 10000,
                jar: jar
            });
            // Pushes the request jar's cookies to chrome and to the persistent store.
            function cookieSync() {
                return __awaiter(this, void 0, void 0, function* () {
                    const requestCookies = cookieHelpers_1.getAllCookiesFromRequestJar(jar);
                    //sync request cookies with chrome
                    if (requestCookies.length > 0) {
                        yield connection.client.Network.setCookies({ cookies: cookieHelpers_1.toChromeCookies(requestCookies) });
                    }
                    //sync request cookies with the persistent store
                    yield cookies_2.updateCookieJar(log.body.connectionConfig.bridgePort, jar, {
                        persistentSession: log.body.persistentSession
                    });
                });
            }
            request_1.attachRequest(ayakashiInstance, myRequest, cookieSync);
            cookies_1.attachCookieActions(ayakashiInstance, jar, connection, cookieSync);
            //connect to pipeproc
            const pipeprocClient = pipeproc_1.PipeProc();
            yield pipeprocClient.connect({ socket: `ipc://${path_1.resolve(log.body.storeProjectFolder, "ipc")}` });
            //attach the yield methods
            const yieldWatcher = { yieldedAtLeastOnce: false };
            yield_1.attachYields(ayakashiInstance, pipeprocClient, log.body.saveTopic, log.body.selfTopic, yieldWatcher);
            //built-in preloaders, compiled and injected one after another in this order
            const preloaderCacheDir = `${log.body.storeProjectFolder}/.cache/preloaders/`;
            const builtinPreloaders = [
                //domQL
                { source: "./lib/domQL/domQL", as: "domQL" },
                //findCssSelector
                { source: "@ayakashi/get-node-selector", as: "getNodeSelector" },
                //the old detection patches
                { source: "./lib/detection/patch", as: "detectionPatches" },
                //stealth patches
                { source: "./lib/detection/stealth.js", as: "stealthPatches" },
                //the marshalling helpers
                { source: "./lib/utils/marshalling", as: "marshalling" }
            ];
            for (const preloader of builtinPreloaders) {
                const compiled = yield compiler_1.compile(log.body.appRoot, preloader.source, "ayakashi", preloaderCacheDir, true);
                yield connection.injectPreloader({ compiled: compiled, as: preloader.as, waitForDOM: false });
            }
            //load external actions/extractors/preloaders
            yield loadExternals(connection, ayakashiInstance, log);
            //autoLoad local actions/extractors/preloaders
            if (log.body.config.localAutoLoad !== false) {
                yield loadLocals(connection, ayakashiInstance, log);
            }
            //activate the connection and load the scraper
            yield connection.activate();
            let scraperModule;
            try {
                if (log.body.config.simple) {
                    scraperModule = require(path_1.resolve(log.body.projectFolder, log.body.module));
                }
                else {
                    scraperModule = require(path_1.resolve(log.body.projectFolder, "scrapers", log.body.module));
                }
                // Fall back to a default export. The truthiness guard keeps a
                // null/undefined export on the "is not a function" error path
                // instead of failing with a TypeError on `.default`.
                if (scraperModule && typeof scraperModule !== "function") {
                    scraperModule = scraperModule.default;
                }
                if (typeof scraperModule !== "function") {
                    throw new Error(`Scraper <${log.body.module}> is not a function`);
                }
            }
            catch (e) {
                opLog.error(e.message);
                releaseAttempted = true;
                yield connection.release();
                throw e;
            }
            //run the scraper
            let result;
            try {
                // `log.body.input` may be absent, so guard before reading `.value`.
                const input = (log.body.input && log.body.input.value) || {};
                // the `continue: true` marker is removed before the scraper sees its input
                if (input.continue === true) {
                    delete input.continue;
                }
                result = yield scraperModule(ayakashiInstance, input, log.body.params || {});
            }
            catch (e) {
                opLog.error(`There was an error while running scraper <${log.body.module}> -`, e.message, e.stack);
                releaseAttempted = true;
                yield connection.release();
                throw e;
            }
            if (result) {
                yield ayakashiInstance.yield(result);
            }
            // nothing returned and nothing yielded: emit the `continue` marker
            if (!result && !yieldWatcher.yieldedAtLeastOnce) {
                yield ayakashiInstance.yield({ continue: true });
            }
            releaseAttempted = true;
            yield connection.release();
        }
        catch (e) {
            d(e);
            // A failure between createConnection() and the scraper run would
            // otherwise leave the connection unreleased. Release is best-effort
            // here so the original error is the one that propagates.
            if (connection && !releaseAttempted) {
                releaseAttempted = true;
                try {
                    yield connection.release();
                }
                catch (releaseError) {
                    d(releaseError);
                }
            }
            throw e;
        }
    });
}
exports.default = scraperWrapper;
/**
 * Loads the externally declared actions, extractors and preloaders listed in
 * `log.body.load`.
 *
 * @param {object} connection - Connection passed to the preloader loader.
 * @param {object} ayakashiInstance - Instance passed to the action and extractor loaders.
 * @param {object} log - Job descriptor; reads `body.projectFolder`,
 *   `body.storeProjectFolder` and `body.load.{actions,extractors,preloaders}`.
 * @returns {Promise<void>} Resolves when the external preloaders are loaded.
 */
function loadExternals(connection, ayakashiInstance, log) {
    return __awaiter(this, void 0, void 0, function* () {
        loaders_1.loadExternalActions(ayakashiInstance, log.body.projectFolder, log.body.load.actions);
        loaders_1.loadExternalExtractors(ayakashiInstance, log.body.projectFolder, log.body.load.extractors);
        yield loaders_1.loadExternalPreloaders(connection, log.body.projectFolder, log.body.storeProjectFolder, log.body.load.preloaders);
    });
}
/**
 * Loads the project's local props, actions, extractors and preloaders.
 *
 * @param {object} connection - Connection passed to the preloader loader.
 * @param {object} ayakashiInstance - Instance passed to the props, action and extractor loaders.
 * @param {object} log - Job descriptor; reads `body.projectFolder` and
 *   `body.storeProjectFolder`.
 * @returns {Promise<void>} Resolves when the local preloaders are loaded.
 */
function loadLocals(connection, ayakashiInstance, log) {
    return __awaiter(this, void 0, void 0, function* () {
        loaders_1.loadLocalProps(ayakashiInstance, log.body.projectFolder);
        loaders_1.loadLocalActions(ayakashiInstance, log.body.projectFolder);
        loaders_1.loadLocalExtractors(ayakashiInstance, log.body.projectFolder);
        yield loaders_1.loadLocalPreloaders(connection, log.body.projectFolder, log.body.storeProjectFolder);
    });
}