UNPKG

ayakashi

Version:

The next generation web scraping framework

340 lines (339 loc) 15.6 kB
"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.run = void 0;
const browser_1 = require("../engine/browser");
const bridge_1 = require("../bridge/bridge");
const connection_1 = require("../bridge/connection");
const userAgent_1 = require("../bridge/userAgent");
const cookies_1 = require("../bridge/cookies");
const path_1 = require("path");
const pipeproc_1 = require("pipeproc");
const uuid_1 = require("uuid");
const dayjs_1 = __importDefault(require("dayjs"));
const opLog_1 = require("../opLog/opLog");
const os_1 = require("os");
const fs_1 = require("fs");
const parseConfig_1 = require("./parseConfig");
const downloader_1 = require("../chromeDownloader/downloader");
const chromium_1 = require("../store/chromium");
const project_1 = require("../store/project");
const sessionDb_1 = require("../sessionDb/sessionDb");
const getRandomPort_1 = require("../utils/getRandomPort");
const debug_1 = __importDefault(require("debug"));
const d = debug_1.default("ayakashi:runner");
const SIGINT = "SIGINT";
/**
 * Runs a project: validates the config, starts the bridge, optionally spawns
 * chrome, spawns pipeproc, registers (or resumes) the pipeline procs and waits
 * for them to finish.
 *
 * Every resource acquired after the bridge is started (chrome, pipeproc, the
 * bridge itself and both SIGINT listeners) is released on failure, and the
 * SIGINT listeners are also removed on success, so repeated calls in the same
 * process do not accumulate listeners.
 *
 * @param {string} projectFolder - Folder of the project being run.
 * @param {object} config - The project configuration. `config.config` is
 *   created if missing and its `bridgePort`/`protocolPort` are filled in.
 * @param {object} options - Run options read here: `simpleScraper`,
 *   `sessionKey`, `resume`, `clean`, `restartDisabledSteps`.
 * @returns {Promise<void>} Resolves when the run has finished and been cleaned up.
 * @throws {Error} "Deprecated configuration option", "Deprecated folder structure"
 *   or "Invalid run parameters" on validation failures; otherwise rethrows
 *   whatever error made the run fail.
 */
function run(projectFolder, config, options) {
    return __awaiter(this, void 0, void 0, function* () {
        const opLog = opLog_1.getOpLog();
        const pipeprocClient = pipeproc_1.PipeProc();
        let headlessChrome = null;
        //simple scrapers get their own store folder under the project folder
        const storeKey = options.simpleScraper ? `${projectFolder}/${options.simpleScraper}` : projectFolder;
        let storeProjectFolder = yield project_1.getOrCreateStoreProjectFolder(storeKey, options.sessionKey);
        if (process.platform === "win32") {
            storeProjectFolder = storeProjectFolder.replace(/\\/g, "/");
        }
        const steps = parseConfig_1.firstPass(config);
        parseConfig_1.checkStepLevels(steps);
        parseConfig_1.validateStepFormat(steps);
        if (parseConfig_1.hasTypo(steps, config)) {
            opLog.error("The configuration still uses one of scrapper/renderlessScrapper/apiScrapper.");
            opLog.error("This was a typo and has been deprecated.");
            opLog.error("Please use scraper/renderlessScraper/apiScraper (with a single 'p') instead.");
            throw new Error("Deprecated configuration option");
        }
        if (!options.simpleScraper &&
            fs_1.existsSync(path_1.resolve(projectFolder, "scrappers")) &&
            !fs_1.existsSync(path_1.resolve(projectFolder, "scrapers"))) {
            opLog.error("This project still uses a 'scrappers' folder.");
            opLog.error("This was a typo and has been deprecated.");
            opLog.error("Please move all your scraper files to a 'scrapers' folder (with a single 'p').");
            throw new Error("Deprecated folder structure");
        }
        //@ts-ignore
        if (config.config && config.config.userAgent) {
            opLog.error("The global userAgent option has been deprecated.");
            opLog.error("You can configure the userAgent in the emulatorOptions of each pipeline step");
            opLog.error("Read more here: https://ayakashi-io.github.io/docs/reference/ayakashi-config-file.html#emulator-options");
            throw new Error("Deprecated configuration option");
        }
        if (options.resume && options.clean) {
            opLog.error("Cannot use both --resume and --clean");
            throw new Error("Invalid run parameters");
        }
        const hasPrevious = yield project_1.hasPreviousRun(storeProjectFolder);
        if (!options.resume && options.clean && hasPrevious) {
            opLog.warn("cleaning previous run");
            yield project_1.clearPreviousRun(storeProjectFolder);
            //re-create the project folder after clean
            yield project_1.getOrCreateStoreProjectFolder(storeKey, options.sessionKey);
        }
        else if (!options.resume && !options.clean && hasPrevious) {
            opLog.error("Cannot start a new run while a previous unfinished run exists.");
            opLog.error("Use --resume to resume the previous run or --clean to clear it and start a new one.");
            throw new Error("Invalid run parameters");
        }
        else if (options.resume && hasPrevious) {
            opLog.warn("resuming previous run");
            const [changed, lastConfig] = yield project_1.configChanged(config, storeProjectFolder);
            if (changed) {
                opLog.warn("restoring old config");
                //NOTE(review): `steps` was parsed from the config passed in, not from
                //the restored one - confirm that this is intended.
                //tslint:disable no-parameter-reassignment
                config = lastConfig;
                //tslint:enable no-parameter-reassignment
            }
        }
        config.config = config.config || {};
        //a port of exactly 0 asks for a random port, any other falsy value gets the default
        if (config.config.bridgePort === 0) {
            config.config.bridgePort = yield getRandomPort_1.getRandomPort();
        }
        else if (!config.config.bridgePort) {
            config.config.bridgePort = 9731;
        }
        if (config.config.protocolPort === 0) {
            config.config.protocolPort = yield getRandomPort_1.getRandomPort();
        }
        else if (!config.config.protocolPort) {
            config.config.protocolPort = 9730;
        }
        const { procGenerators, initializers } = parseConfig_1.createProcGenerators(config, steps, {
            bridgePort: config.config.bridgePort,
            protocolPort: config.config.protocolPort,
            persistentSession: config.config.persistentSession === true,
            projectFolder: projectFolder,
            storeProjectFolder: storeProjectFolder,
            operationId: uuid_1.v4(),
            startDate: dayjs_1.default().format("YYYY-MM-DD-HH-mm-ss")
        });
        //start bridge
        const { bridge, closeBridge } = yield bridge_1.startBridge(config.config.bridgePort);
        function bridgeSigintListener() {
            return __awaiter(this, void 0, void 0, function* () {
                d("trap SIGINT, closing bridge");
                yield closeBridge();
                process.removeListener(SIGINT, bridgeSigintListener);
            });
        }
        //declared here (registered only after pipeproc is spawned) so the
        //finally block below can always remove it
        function sigintListener() {
            return __awaiter(this, void 0, void 0, function* () {
                d("trap SIGINT, closing pipeproc");
                yield pipeprocClient.shutdown();
                process.removeListener(SIGINT, sigintListener);
            });
        }
        process.on(SIGINT, bridgeSigintListener);
        try {
            //resolve the chrome binary inside the guarded section: the bridge is
            //already listening, so a failed download must still close it
            let chromePath;
            if (config.config.chromePath) {
                chromePath = config.config.chromePath;
            }
            else {
                if (!(yield chromium_1.isChromiumAlreadyInstalled()) || !(yield chromium_1.isCfT())) {
                    yield downloader_1.downloadChromium({
                        useExact: true,
                        revision: yield downloader_1.getRecommendedChromiumRevision(),
                        useChannel: false,
                        channel: ""
                    }, "");
                }
                chromePath = yield chromium_1.getChromePath();
            }
            //launch chrome
            if (parseConfig_1.isUsingNormalScraper(steps, config)) {
                d("using normal scraper(s), chrome will be spawned");
                headlessChrome = yield launch(config, storeProjectFolder, chromePath);
                //add bridge connection routes
                connection_1.addConnectionRoutes(bridge, headlessChrome);
            }
            else {
                d("using renderless scraper(s) only, chrome will not be spawned");
            }
            //finalize systemProcs
            const procs = procGenerators.map(function (generator) {
                return {
                    name: generator.name,
                    offset: ">",
                    maxReclaims: generator.config.retries || 1,
                    reclaimTimeout: -1,
                    onMaxReclaimsReached: "disable",
                    from: generator.from,
                    to: generator.to,
                    processor: generator.processor
                };
            });
            //initialize sessionDb
            const { sessionDb, UserAgentDataModel, CookieModel } = yield sessionDb_1.sessionDbInit(storeProjectFolder, { create: true });
            //add bridge userAgent routes
            userAgent_1.addUserAgentRoutes(bridge, sessionDb, UserAgentDataModel);
            //add bridge cookie routes
            cookies_1.addCookiesRoutes(bridge, sessionDb, CookieModel);
            //launch pipeproc
            let workers;
            if (config.config.workers && config.config.workers > 0) {
                workers = config.config.workers;
            }
            else {
                //never use more workers than there are cpus
                const stepCount = steps.length <= 4 ? 1 : parseConfig_1.countSteps(steps) - 3;
                workers = Math.min(stepCount, os_1.cpus().length);
            }
            let workerConcurrency;
            if (config.config.workerConcurrency && config.config.workerConcurrency > 0) {
                workerConcurrency = config.config.workerConcurrency;
                opLog.info(`using workers: ${workers} (concurrency: ${workerConcurrency})`);
            }
            else {
                workerConcurrency = 1;
                opLog.info(`using workers: ${workers}`);
            }
            const waiter = opLog.waiter("initializing");
            yield pipeprocClient.spawn({
                socket: `ipc://${path_1.resolve(storeProjectFolder, "ipc")}`,
                location: project_1.getPipeprocFolder(storeProjectFolder),
                workers: workers,
                workerConcurrency: workerConcurrency,
                workerRestartAfter: 100
            });
            process.on(SIGINT, sigintListener);
            if (options.resume && hasPrevious) {
                //best effort: a proc that cannot be reclaimed/resumed must not abort
                //the resume, but the reason is kept in the debug log
                yield Promise.all(procs.map(function (proc) {
                    return __awaiter(this, void 0, void 0, function* () {
                        try {
                            yield pipeprocClient.reclaimProc(proc.name);
                        }
                        catch (_e) {
                            d(_e);
                        }
                        if (options.restartDisabledSteps) {
                            try {
                                yield pipeprocClient.resumeProc(proc.name);
                            }
                            catch (_e) {
                                d(_e);
                            }
                        }
                    });
                }));
            }
            else {
                yield project_1.saveLastConfig(config, storeProjectFolder);
                //register the systemProcs and init the project
                //@ts-ignore
                yield Promise.all(procs.map(proc => pipeprocClient.systemProc(proc)));
                yield pipeprocClient.commit(initializers.map(init => {
                    return {
                        topic: init,
                        body: {}
                    };
                }));
            }
            waiter.succeed("running");
            //close
            yield pipeprocClient.waitForProcs();
            let procWithError = false;
            for (const pr of procs) {
                const proc = yield pipeprocClient.inspectProc(pr.name);
                if (proc.status === "disabled") {
                    procWithError = true;
                }
            }
            if (headlessChrome) {
                yield headlessChrome.close();
            }
            yield closeBridge();
            yield pipeprocClient.shutdown();
            if (!procWithError) {
                opLog.info("cleaning run state");
                yield project_1.clearPreviousRun(storeProjectFolder);
            }
            else {
                opLog.warn("Run finished but some steps were disabled due to errors, run state will not be cleared");
                opLog.warn("You can try re-running them by passing --resume and --restartDisabledSteps");
            }
        }
        catch (e) {
            //tear down whatever was started; each step is isolated so one failing
            //cleanup does not prevent the others from running
            try {
                yield pipeprocClient.shutdown();
            }
            catch (_e) {
                d(_e);
            }
            try {
                if (headlessChrome) {
                    yield headlessChrome.close();
                }
            }
            catch (_e) {
                d(_e);
            }
            try {
                yield closeBridge();
            }
            catch (_e) {
                d(_e);
            }
            opLog.error("Failed to run project");
            throw e;
        }
        finally {
            //the resources these listeners guard are gone by now; removing a
            //listener that was never registered is a no-op
            process.removeListener(SIGINT, bridgeSigintListener);
            process.removeListener(SIGINT, sigintListener);
        }
    });
}
exports.run = run;
/**
 * Spawns and initializes the chrome instance from the top-level config options.
 *
 * `headless` and `openDevTools` default to true and are disabled only by an
 * explicit `false`; `persistentSession` is enabled only by an explicit `true`.
 * Falsy `proxyUrl`/`windowHeight`/`windowWidth` values are passed as undefined.
 *
 * @param {object} config - The project configuration; options are read from `config.config`.
 * @param {string} storeProjectFolder - Store folder that holds the "chromium_session"
 *   directory when `persistentSession` is on.
 * @param {string} chromePath - Path of the chrome executable to use.
 * @returns {Promise<object>} The initialized instance returned by `getInstance()`.
 */
function launch(config, storeProjectFolder, chromePath) {
    return __awaiter(this, void 0, void 0, function* () {
        //check top level config options
        const settings = config.config || {};
        const persistentSession = settings.persistentSession === true;
        //spawn the chrome instance
        const headlessChrome = browser_1.getInstance();
        yield headlessChrome.init({
            headless: settings.headless !== false,
            chromePath: chromePath,
            autoOpenDevTools: settings.openDevTools !== false,
            protocolPort: settings.protocolPort,
            sessionDir: persistentSession ? path_1.resolve(storeProjectFolder, "chromium_session") : undefined,
            proxyUrl: settings.proxyUrl || undefined,
            windowHeight: settings.windowHeight || undefined,
            windowWidth: settings.windowWidth || undefined
        });
        return headlessChrome;
    });
}