UNPKG

ayakashi

Version:

The next generation web scraping framework

419 lines (418 loc) 16.9 kB
"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const yargs_1 = __importDefault(require("yargs")); const prompts_1 = __importDefault(require("prompts")); const runner_1 = require("../runner/runner"); const opLog_1 = require("../opLog/opLog"); const downloader_1 = require("../chromeDownloader/downloader"); const chromium_1 = require("../store/chromium"); const getDirectory_1 = require("./getDirectory"); const prepareStandard_1 = require("./prepareStandard"); const prepareSimple_1 = require("./prepareSimple"); const prepareFromJson_1 = require("./prepareFromJson"); const getName_1 = require("./scaffold/getName"); const generateProp_1 = require("./scaffold/generateProp"); const generateAction_1 = require("./scaffold/generateAction"); const generateExtractor_1 = require("./scaffold/generateExtractor"); const generatePreloader_1 = require("./scaffold/generatePreloader"); const generateScraper_1 = require("./scaffold/generateScraper"); const generateRenderlessScraper_1 = require("./scaffold/generateRenderlessScraper"); const generateApiScraper_1 = require("./scaffold/generateApiScraper"); const generateScript_1 = require("./scaffold/generateScript"); const generateProject_1 = require("./scaffold/generateProject"); const refreshUA_1 = require("./refreshUA"); const updateStealth_1 = require("./updateStealth"); const tsHelpers_1 = require("./tsHelpers"); const packageJson = require("../../package.json"); yargs_1.default //@ts-ignore .command("run [dir]", "Runs a project", (_argv) => { yargs_1.default .positional("dir", { describe: "The root directory of a project or a scraper file when --simple mode is used", default: "." }) .option("configFile", { describe: "Use an alternative configFile", alias: "c", default: "" }) .option("jsonConfig", { describe: "Use a json string as config", alias: "jc" }) .option("sessionKey", { describe: "Use a specific run session", default: "default", coerce: function (v) { return String(v); } }) .option("simple", { type: "boolean", describe: "Run a single scraper" }) .option("simpleRenderless", { type: "boolean", describe: "Run a single renderlessScraper" }) .option("simpleApi", { type: "boolean", describe: "Run a single apiScraper" }) .option("resume", { type: "boolean", describe: "Resume execution of a previous unfinished run" }) .option("restartDisabledSteps", { type: "boolean", describe: "Will restart all steps that terminated due to an error. Only works when --resume is used" }) .option("clean", { type: "boolean", describe: "Clear the previous run if it exists and start from the beginning" }) .option("out", { describe: "Select the saving format when --simple mode is used", default: "stdout", choices: ["sqlite", "csv", "json", "stdout"] }) .option("skipTsBuild", { type: "boolean", describe: "Skip automatic typescript compilation" }) .epilogue("Learn more at https://ayakashi-io.github.io/docs/reference/cli-commands.html#run"); }, function (argv) { return __awaiter(this, void 0, void 0, function* () { const opLog = opLog_1.getOpLog(); opLog.info("Ayakashi version:", packageJson.version); const resume = argv.resume || false; const restartDisabledSteps = argv.restartDisabledSteps || false; const clean = argv.clean || false; let directory; let config; let simpleScraper = null; if (argv.jsonConfig) { const fromJson = prepareFromJson_1.prepareFromJson(argv.dir, argv.jsonConfig); config = fromJson.config; directory = fromJson.directory; } else { if (argv.simple) { const simple = prepareSimple_1.prepareSimple(argv.dir, argv.out, "scraper"); config = simple.config; directory = simple.directory; simpleScraper = simple.scraper; } else if (argv.simpleRenderless) { const simple = prepareSimple_1.prepareSimple(argv.dir, argv.out, "renderlessScraper"); config = simple.config; directory = simple.directory; simpleScraper = simple.scraper; } else if (argv.simpleApi) { const simple = prepareSimple_1.prepareSimple(argv.dir, argv.out, "apiScraper"); config = simple.config; directory = simple.directory; simpleScraper = simple.scraper; } else { const standard = yield prepareStandard_1.prepareStandard(argv.dir, argv.configFile, argv.skipTsBuild || false); config = standard.config; directory = standard.directory; } } runner_1.run(directory, config, { resume: resume, restartDisabledSteps: restartDisabledSteps, clean: clean, simpleScraper: simpleScraper, sessionKey: argv.sessionKey }).then(function () { return __awaiter(this, void 0, void 0, function* () { opLog.info("Nothing more to do!"); }); }).catch(function (err) { opLog.error("Something went wrong", err); process.exit(1); }); }); }) //@ts-ignore .command("new [dir]", "Generates a new project", (_argv) => { yargs_1.default .positional("dir", { describe: "Where to place the generated files", default: "." }) .option("project", { type: "boolean", describe: "Generate a new project" }) .option("scraper", { type: "boolean", describe: "Generate a new scraper" }) .option("renderlessScraper", { type: "boolean", describe: "Generate a new renderlessScraper" }) .option("apiScraper", { type: "boolean", describe: "Generate a new apiScraper" }) .option("script", { type: "boolean", describe: "Generate a new script" }) .option("prop", { type: "boolean", describe: "Generate a new prop" }) .option("action", { type: "boolean", describe: "Generate a new action" }) .option("extractor", { type: "boolean", describe: "Generate a new extractor" }) .option("preloader", { type: "boolean", describe: "Generate a new preloader" }) .option("name", { type: "string", describe: "The name of the new scraper|renderlessScraper|script|prop|action|extractor|preloader" }) .option("ts", { type: "boolean", describe: "Generate a typescript project" }) .option("js", { type: "boolean", describe: "Generate a javascript project" }) .conflicts("ts", "js") .epilogue("Learn more at https://ayakashi-io.github.io/docs/reference/cli-commands.html#new"); //@ts-ignore }, function (argv) { return __awaiter(this, void 0, void 0, function* () { //tslint:disable cyclomatic-complexity const opLog = opLog_1.getOpLog(); if ((!argv.prop && !argv.project && !argv.action && !argv.extractor && !argv.preloader && !argv.scraper && !argv.renderlessScraper && !argv.apiScraper && !argv.script) || argv.project) { let ts; if (argv.ts === undefined && argv.js === undefined) { const response = yield prompts_1.default({ type: "select", name: "projectType", message: "Do you want to generate a javascript or typescript project?", choices: [{ title: "Javascript", value: "js" }, { title: "Typescript", value: "ts" }], instructions: false }); if (!response.projectType) { opLog.error("Select a project type to continue"); process.exit(1); } if (response.projectType === "ts") { ts = true; } else { ts = false; } } else if (argv.ts) { ts = true; } else { ts = false; } if (argv.dir === ".") { yield generateProject_1.generateProject(getDirectory_1.getDirectory(argv.dir), true, ts); } else { yield generateProject_1.generateProject(getDirectory_1.getDirectory(argv.dir, false), false, ts); } } else if (argv.prop) { const name = yield getName_1.getName(argv.name, "prop"); let directory = getDirectory_1.getDirectory(argv.dir); const ts = yield tsHelpers_1.isTypescriptProject(directory); if (ts) directory = yield tsHelpers_1.getTypescriptRoot(directory); yield generateProp_1.generateProp(directory, name, ts); } else if (argv.action) { const name = yield getName_1.getName(argv.name, "action"); let directory = getDirectory_1.getDirectory(argv.dir); const ts = yield tsHelpers_1.isTypescriptProject(directory); if (ts) directory = yield tsHelpers_1.getTypescriptRoot(directory); yield generateAction_1.generateAction(directory, name, ts); } else if (argv.extractor) { const name = yield getName_1.getName(argv.name, "extractor"); let directory = getDirectory_1.getDirectory(argv.dir); const ts = yield tsHelpers_1.isTypescriptProject(directory); if (ts) directory = yield tsHelpers_1.getTypescriptRoot(directory); yield generateExtractor_1.generateExtractor(directory, name, ts); } else if (argv.preloader) { const name = yield getName_1.getName(argv.name, "preloader"); let directory = getDirectory_1.getDirectory(argv.dir); const ts = yield tsHelpers_1.isTypescriptProject(directory); if (ts) directory = yield tsHelpers_1.getTypescriptRoot(directory); yield generatePreloader_1.generatePreloader(directory, name, ts); } else if (argv.scraper) { const name = yield getName_1.getName(argv.name, "scraper"); let directory = getDirectory_1.getDirectory(argv.dir); const ts = yield tsHelpers_1.isTypescriptProject(directory); if (ts) directory = yield tsHelpers_1.getTypescriptRoot(directory); yield generateScraper_1.generateScraper(directory, name, ts); } else if (argv.renderlessScraper) { const name = yield getName_1.getName(argv.name, "renderlessScraper"); let directory = getDirectory_1.getDirectory(argv.dir); const ts = yield tsHelpers_1.isTypescriptProject(directory); if (ts) directory = yield tsHelpers_1.getTypescriptRoot(directory); yield generateRenderlessScraper_1.generateRenderlessScraper(directory, name, ts); } else if (argv.apiScraper) { const name = yield getName_1.getName(argv.name, "apiScraper"); let directory = getDirectory_1.getDirectory(argv.dir); const ts = yield tsHelpers_1.isTypescriptProject(directory); if (ts) directory = yield tsHelpers_1.getTypescriptRoot(directory); yield generateApiScraper_1.generateApiScraper(directory, name, ts); } else if (argv.script) { const name = yield getName_1.getName(argv.name, "script"); let directory = getDirectory_1.getDirectory(argv.dir); const ts = yield tsHelpers_1.isTypescriptProject(directory); if (ts) directory = yield tsHelpers_1.getTypescriptRoot(directory); yield generateScript_1.generateScript(directory, name, ts); } }); }) //@ts-ignore .command("update-chrome", "Downloads the recommended, latest or specified chrome revision", (_argv) => { yargs_1.default .option("revision", { describe: "Download a specific revision. Format: '114.0.5735.133'", type: "string", alias: "r" }) .option("stable", { describe: "Download the latest stable revision", type: "boolean" }) .option("beta", { describe: "Download the latest beta revision", type: "boolean" }) .option("dev", { describe: "Download the latest dev revision", type: "boolean" }) .option("canary", { describe: "Download the latest canary revision", type: "boolean" }) .epilogue("Learn more at https://ayakashi-io.github.io/docs/reference/cli-commands.html#update-chrome"); //@ts-ignore }, function (argv) { return __awaiter(this, void 0, void 0, function* () { const storedRevision = yield chromium_1.getStoredRevision(); const options = { useExact: false, revision: "", useChannel: false, channel: "" }; if (argv.revision) { options.revision = argv.revision; options.useExact = true; } else if (argv.stable) { options.channel = "Stable"; options.useChannel = true; } else if (argv.beta) { options.channel = "Beta"; options.useChannel = true; } else if (argv.dev) { options.channel = "Dev"; options.useChannel = true; } else if (argv.canary) { options.channel = "Canary"; options.useChannel = true; } else { options.revision = yield downloader_1.getRecommendedChromiumRevision(); options.useExact = true; } yield downloader_1.downloadChromium(options, storedRevision); }); }) //@ts-ignore .command("update-ua", "Updates the builtin database of user agent strings", (_argv) => { yargs_1.default .epilogue("Learn more at https://ayakashi-io.github.io/docs/installation#updating-subcomponents"); //@ts-ignore }, function (argv) { return __awaiter(this, void 0, void 0, function* () { yield refreshUA_1.refreshUA(); }); }) //@ts-ignore .command("update-stealth", "Updates the headless chrome stealth patches", (_argv) => { yargs_1.default .epilogue("Learn more at https://ayakashi-io.github.io/docs/installation#updating-subcomponents"); //@ts-ignore }, function (argv) { return __awaiter(this, void 0, void 0, function* () { yield updateStealth_1.updateStealthPatches(); }); }) //@ts-ignore .command("info", "System information", (_argv) => { //@ts-ignore }, function (argv) { return __awaiter(this, void 0, void 0, function* () { const opLog = opLog_1.getOpLog(); const storedRevision = yield chromium_1.getStoredRevision(); opLog.info(`Ayakashi version: ${packageJson.version}`); if ((yield chromium_1.isChromiumAlreadyInstalled()) && (yield chromium_1.isCfT())) { opLog.info(`Chrome revision: ${storedRevision}`); } else { opLog.info(`Chrome revision: none`); } }); }) .demandCommand().recommendCommands().strict() .epilogue("Learn more at https://ayakashi-io.github.io/docs/reference/cli-commands.html") .argv;