/*
 * ayakashi
 * Version:
 * The next generation web scraping framework
 * 340 lines (339 loc) • 15.6 kB
 * JavaScript
 */
;
/**
 * Drives a generator function as if it were an async function.
 *
 * The generator is invoked with `thisArg` and `_arguments`; every yielded
 * value is turned into a promise of type `P` (defaulting to the global
 * `Promise`) and its outcome is fed back into the generator through
 * `next`/`throw`. The returned promise resolves with the generator's return
 * value and rejects with any error that escapes the generator.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    //reuse a yielded value that is already a P, wrap anything else
    const adopt = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((resolve) => { resolve(value); });
    };
    return new P((resolve, reject) => {
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            adopt(result.value).then(onFulfilled, onRejected);
        };
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        };
        const onRejected = (reason) => {
            try {
                advance(generator.throw(reason));
            }
            catch (err) {
                reject(err);
            }
        };
        //from here on `generator` holds the iterator, not the generator function
        generator = generator.apply(thisArg, _arguments || []);
        advance(generator.next());
    });
};
/**
 * Normalizes a required module so that `.default` can always be read from it:
 * modules flagged with `__esModule` are returned untouched, anything else is
 * wrapped as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.run = void 0;
const browser_1 = require("../engine/browser");
const bridge_1 = require("../bridge/bridge");
const connection_1 = require("../bridge/connection");
const userAgent_1 = require("../bridge/userAgent");
const cookies_1 = require("../bridge/cookies");
const path_1 = require("path");
const pipeproc_1 = require("pipeproc");
const uuid_1 = require("uuid");
const dayjs_1 = __importDefault(require("dayjs"));
const opLog_1 = require("../opLog/opLog");
const os_1 = require("os");
const fs_1 = require("fs");
const parseConfig_1 = require("./parseConfig");
const downloader_1 = require("../chromeDownloader/downloader");
const chromium_1 = require("../store/chromium");
const project_1 = require("../store/project");
const sessionDb_1 = require("../sessionDb/sessionDb");
const getRandomPort_1 = require("../utils/getRandomPort");
const debug_1 = __importDefault(require("debug"));
//debug logger for this module, created under the "ayakashi:runner" namespace
const d = debug_1.default("ayakashi:runner");
//name of the process signal that run() listens for in order to close the bridge and pipeproc
const SIGINT = "SIGINT";
/**
 * Logs an explanation and throws when the project still uses a deprecated
 * configuration key, folder name or global option.
 *
 * @param {object} opLog - Logger used to report the deprecation details.
 * @param {string} projectFolder - Folder checked for a legacy "scrappers" directory.
 * @param {object} config - The project configuration.
 * @param {Array} steps - Steps produced by the first parsing pass of `config`.
 * @param {object} options - Run options; the folder check is skipped when `simpleScraper` is set.
 * @throws {Error} "Deprecated configuration option" or "Deprecated folder structure".
 */
function rejectDeprecatedUsage(opLog, projectFolder, config, steps, options) {
    if (parseConfig_1.hasTypo(steps, config)) {
        opLog.error("The configuration still uses one of scrapper/renderlessScrapper/apiScrapper.");
        opLog.error("This was a typo and has been deprecated.");
        opLog.error("Please use scraper/renderlessScraper/apiScraper (with a single 'p') instead.");
        throw new Error("Deprecated configuration option");
    }
    if (!options.simpleScraper && fs_1.existsSync(path_1.resolve(projectFolder, "scrappers")) &&
        !fs_1.existsSync(path_1.resolve(projectFolder, "scrapers"))) {
        opLog.error("This project still uses a 'scrappers' folder.");
        opLog.error("This was a typo and has been deprecated.");
        opLog.error("Please move all your scraper files to a 'scrapers' folder (with a single 'p').");
        throw new Error("Deprecated folder structure");
    }
    if (config.config && config.config.userAgent) {
        opLog.error("The global userAgent option has been deprecated.");
        opLog.error("You can configure the userAgent in the emulatorOptions of each pipeline step");
        opLog.error("Read more here: https://ayakashi-io.github.io/docs/reference/ayakashi-config-file.html#emulator-options");
        throw new Error("Deprecated configuration option");
    }
}
/**
 * Resolves the chrome executable to use: the configured `chromePath` when one
 * is set, otherwise the path reported by the chromium store (downloading the
 * recommended revision first when the store checks fail).
 *
 * @param {object} config - The project configuration.
 * @returns {Promise<string>} Path of the chrome executable.
 */
function resolveChromePath(config) {
    return __awaiter(this, void 0, void 0, function* () {
        if (config.config && config.config.chromePath) {
            return config.config.chromePath;
        }
        if (!(yield chromium_1.isChromiumAlreadyInstalled()) || !(yield chromium_1.isCfT())) {
            yield downloader_1.downloadChromium({
                useExact: true,
                revision: yield downloader_1.getRecommendedChromiumRevision(),
                useChannel: false,
                channel: ""
            }, "");
        }
        return yield chromium_1.getChromePath();
    });
}
/**
 * Picks the number of pipeproc workers: the configured `workers` value when it
 * is positive, otherwise a count derived from the steps and capped at the
 * number of CPUs.
 *
 * @param {object} config - The project configuration.
 * @param {Array} steps - Steps produced by the first parsing pass of `config`.
 * @returns {number} Worker count passed to pipeproc.
 */
function resolveWorkerCount(config, steps) {
    if (config.config && config.config.workers && config.config.workers > 0) {
        return config.config.workers;
    }
    const stepCount = steps.length <= 4 ? 1 : parseConfig_1.countSteps(steps) - 3;
    return Math.min(stepCount, os_1.cpus().length);
}
/**
 * Runs a project: validates the configuration, starts the bridge, optionally
 * launches chrome, spawns pipeproc, registers (or resumes) the pipeline procs,
 * waits for them to finish and tears everything down again.
 *
 * @param {string} projectFolder - Folder of the project being run.
 * @param {object} config - The project configuration. Its `config` sub-object is
 *     created when missing and receives the resolved `bridgePort`/`protocolPort`.
 * @param {object} options - Run options read here: `simpleScraper`, `sessionKey`,
 *     `resume`, `clean` and `restartDisabledSteps`.
 * @returns {Promise<void>} Settles once the run has finished and been cleaned up.
 * @throws {Error} On deprecated configuration, on invalid `resume`/`clean`
 *     combinations ("Invalid run parameters"), or with whatever error made the
 *     run fail (re-thrown after a best-effort shutdown).
 */
function run(projectFolder, config, options) {
    return __awaiter(this, void 0, void 0, function* () {
        const opLog = opLog_1.getOpLog();
        const pipeprocClient = pipeproc_1.PipeProc();
        let headlessChrome = null;
        //a simple scraper is looked up in the store as <projectFolder>/<simpleScraper>
        const storeKey = options.simpleScraper ? `${projectFolder}/${options.simpleScraper}` : projectFolder;
        let storeProjectFolder = yield project_1.getOrCreateStoreProjectFolder(storeKey, options.sessionKey);
        if (process.platform === "win32") {
            storeProjectFolder = storeProjectFolder.replace(/\\/g, "/");
        }
        const steps = parseConfig_1.firstPass(config);
        parseConfig_1.checkStepLevels(steps);
        parseConfig_1.validateStepFormat(steps);
        rejectDeprecatedUsage(opLog, projectFolder, config, steps, options);
        if (options.resume && options.clean) {
            opLog.error("Cannot use both --resume and --clean");
            throw new Error("Invalid run parameters");
        }
        const hasPrevious = yield project_1.hasPreviousRun(storeProjectFolder);
        if (!options.resume && options.clean && hasPrevious) {
            opLog.warn("cleaning previous run");
            yield project_1.clearPreviousRun(storeProjectFolder);
            //re-create the project folder after clean
            yield project_1.getOrCreateStoreProjectFolder(storeKey, options.sessionKey);
        }
        else if (!options.resume && !options.clean && hasPrevious) {
            opLog.error("Cannot start a new run while a previous unfinished run exists.");
            opLog.error("Use --resume to resume the previous run or --clean to clear it and start a new one.");
            throw new Error("Invalid run parameters");
        }
        else if (options.resume && hasPrevious) {
            opLog.warn("resuming previous run");
            const [changed, lastConfig] = yield project_1.configChanged(config, storeProjectFolder);
            if (changed) {
                opLog.warn("restoring old config");
                //NOTE(review): `steps` was parsed from the config passed in, not from the restored one - confirm this is intended
                //tslint:disable no-parameter-reassignment
                config = lastConfig;
                //tslint:enable no-parameter-reassignment
            }
        }
        config.config = config.config || {};
        //a port of 0 asks for a random port, a missing port falls back to the default
        if (config.config.bridgePort === 0) {
            config.config.bridgePort = yield getRandomPort_1.getRandomPort();
        }
        else if (!config.config.bridgePort) {
            config.config.bridgePort = 9731;
        }
        if (config.config.protocolPort === 0) {
            config.config.protocolPort = yield getRandomPort_1.getRandomPort();
        }
        else if (!config.config.protocolPort) {
            config.config.protocolPort = 9730;
        }
        const { procGenerators, initializers } = parseConfig_1.createProcGenerators(config, steps, {
            bridgePort: config.config.bridgePort,
            protocolPort: config.config.protocolPort,
            persistentSession: config.config.persistentSession === true,
            projectFolder: projectFolder,
            storeProjectFolder: storeProjectFolder,
            operationId: uuid_1.v4(),
            startDate: dayjs_1.default().format("YYYY-MM-DD-HH-mm-ss")
        });
        //start bridge
        const { bridge, closeBridge } = yield bridge_1.startBridge(config.config.bridgePort);
        function bridgeSigintListener() {
            return __awaiter(this, void 0, void 0, function* () {
                d("trap SIGINT, closing bridge");
                yield closeBridge();
                process.removeListener(SIGINT, bridgeSigintListener);
            });
        }
        function pipeprocSigintListener() {
            return __awaiter(this, void 0, void 0, function* () {
                d("trap SIGINT, closing pipeproc");
                yield pipeprocClient.shutdown();
                process.removeListener(SIGINT, pipeprocSigintListener);
            });
        }
        process.on(SIGINT, bridgeSigintListener);
        try {
            //resolved inside the try so that a failed download still closes the bridge
            const chromePath = yield resolveChromePath(config);
            //launch chrome
            if (parseConfig_1.isUsingNormalScraper(steps, config)) {
                d("using normal scraper(s), chrome will be spawned");
                headlessChrome = yield launch(config, storeProjectFolder, chromePath);
                //add bridge connection routes
                connection_1.addConnectionRoutes(bridge, headlessChrome);
            }
            else {
                d("using renderless scraper(s) only, chrome will not be spawned");
            }
            //finalize systemProcs
            const procs = procGenerators.map(function (generator) {
                return {
                    name: generator.name,
                    offset: ">",
                    maxReclaims: generator.config.retries || 1,
                    reclaimTimeout: -1,
                    onMaxReclaimsReached: "disable",
                    from: generator.from,
                    to: generator.to,
                    processor: generator.processor
                };
            });
            //initialize sessionDb
            const { sessionDb, UserAgentDataModel, CookieModel } = yield sessionDb_1.sessionDbInit(storeProjectFolder, { create: true });
            //add bridge userAgent routes
            userAgent_1.addUserAgentRoutes(bridge, sessionDb, UserAgentDataModel);
            //add bridge cookie routes
            cookies_1.addCookiesRoutes(bridge, sessionDb, CookieModel);
            //launch pipeproc
            const workers = resolveWorkerCount(config, steps);
            let workerConcurrency;
            if (config.config && config.config.workerConcurrency && config.config.workerConcurrency > 0) {
                workerConcurrency = config.config.workerConcurrency;
                opLog.info(`using workers: ${workers} (concurrency: ${workerConcurrency})`);
            }
            else {
                workerConcurrency = 1;
                opLog.info(`using workers: ${workers}`);
            }
            const waiter = opLog.waiter("initializing");
            yield pipeprocClient.spawn({
                socket: `ipc://${path_1.resolve(storeProjectFolder, "ipc")}`,
                location: project_1.getPipeprocFolder(storeProjectFolder),
                workers: workers,
                workerConcurrency: workerConcurrency,
                workerRestartAfter: 100
            });
            process.on(SIGINT, pipeprocSigintListener);
            if (options.resume && hasPrevious) {
                yield Promise.all(procs.map(function (proc) {
                    return __awaiter(this, void 0, void 0, function* () {
                        //both calls are best-effort: a failure is logged for debugging and otherwise ignored
                        try {
                            yield pipeprocClient.reclaimProc(proc.name);
                        }
                        catch (_e) {
                            d(_e);
                        }
                        if (options.restartDisabledSteps) {
                            try {
                                yield pipeprocClient.resumeProc(proc.name);
                            }
                            catch (_e) {
                                d(_e);
                            }
                        }
                    });
                }));
            }
            else {
                yield project_1.saveLastConfig(config, storeProjectFolder);
                //register the systemProcs and init the project
                yield Promise.all(procs.map(proc => pipeprocClient.systemProc(proc)));
                yield pipeprocClient.commit(initializers.map(init => {
                    return {
                        topic: init,
                        body: {}
                    };
                }));
            }
            waiter.succeed("running");
            //close
            yield pipeprocClient.waitForProcs();
            //a proc that ended up disabled marks the run as having errors
            let procWithError = false;
            for (const pr of procs) {
                const proc = yield pipeprocClient.inspectProc(pr.name);
                if (proc.status === "disabled") {
                    procWithError = true;
                }
            }
            if (headlessChrome) {
                yield headlessChrome.close();
            }
            yield closeBridge();
            yield pipeprocClient.shutdown();
            if (!procWithError) {
                opLog.info("cleaning run state");
                yield project_1.clearPreviousRun(storeProjectFolder);
            }
            else {
                opLog.warn("Run finished but some steps were disabled due to errors, run state will not be cleared");
                opLog.warn("You can try re-running them by passing --resume and --restartDisabledSteps");
            }
        }
        catch (e) {
            //best-effort teardown: each resource is closed independently so that one failure does not skip the others
            try {
                yield pipeprocClient.shutdown();
            }
            catch (_e) {
                d(_e);
            }
            try {
                if (headlessChrome) {
                    yield headlessChrome.close();
                }
            }
            catch (_e) {
                d(_e);
            }
            try {
                yield closeBridge();
            }
            catch (_e) {
                d(_e);
            }
            opLog.error("Failed to run project");
            throw e;
        }
        finally {
            //the run no longer owns the bridge or pipeproc, so stop trapping SIGINT;
            //removing a listener that was never added (or already removed itself) is a no-op
            process.removeListener(SIGINT, bridgeSigintListener);
            process.removeListener(SIGINT, pipeprocSigintListener);
        }
    });
}
exports.run = run;
//tslint:disable cyclomatic-complexity
/**
 * Creates the browser instance and initializes it from the top level
 * `config.config` options.
 *
 * `headless` and `openDevTools` stay enabled unless explicitly set to `false`,
 * a session directory inside `storeProjectFolder` is only used when
 * `persistentSession` is exactly `true`, and `proxyUrl`/`windowHeight`/
 * `windowWidth` are forwarded only when they hold a truthy value.
 *
 * @param {object} config - The project configuration; `config.config.protocolPort` is read unconditionally.
 * @param {string} storeProjectFolder - Folder under which "chromium_session" is resolved.
 * @param {string} chromePath - Path of the chrome executable handed to the browser.
 * @returns {Promise<object>} The initialized browser instance.
 */
function launch(config, storeProjectFolder, chromePath) {
    return __awaiter(this, void 0, void 0, function* () {
        //check top level config options
        const topLevel = config.config || {};
        const usePersistentSession = topLevel.persistentSession === true;
        //spawn the chrome instance
        const chrome = browser_1.getInstance();
        const initOptions = {
            headless: topLevel.headless !== false,
            chromePath: chromePath,
            autoOpenDevTools: topLevel.openDevTools !== false,
            protocolPort: config.config.protocolPort,
            sessionDir: usePersistentSession ? path_1.resolve(storeProjectFolder, "chromium_session") : undefined,
            proxyUrl: topLevel.proxyUrl || undefined,
            windowHeight: topLevel.windowHeight || undefined,
            windowWidth: topLevel.windowWidth || undefined
        };
        yield chrome.init(initOptions);
        return chrome;
    });
}
//tslint:enable cyclomatic-complexity