UNPKG

ayakashi

Version:

The next generation web scraping framework

608 lines (607 loc) 29.4 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.createProcGenerators = exports.getObjectReference = exports.hasTypo = exports.isUsingNormalScraper = exports.countSteps = exports.validateStepFormat = exports.checkStepLevels = exports.firstPass = void 0;
const path_1 = require("path");

// Root of the installed package: this file's directory with the trailing
// "/lib/runner" segment removed. On Windows the separators are normalised to
// "/" because the value is later pasted into generated JavaScript source,
// where a backslash would be read as an escape character.
const fullPath = path_1.resolve(__dirname);
let appRoot = fullPath.replace(path_1.sep + "lib" + path_1.sep + "runner", "");
if (process.platform === "win32") {
    appRoot = appRoot.replace(/\\/g, "/");
}

// The two container keys a config level may carry, in the order they are walked.
const LEVEL_KINDS = ["waterfall", "parallel"];

// Wrapper script (relative to appRoot) for every recognised step type.
// A Map is used so inherited property names such as "constructor" never match.
const WRAPPER_BY_TYPE = new Map([
    ["scraper", "lib/runner/scraperWrapper.js"],
    ["renderlessScraper", "lib/runner/renderlessScraperWrapper.js"],
    ["apiScraper", "lib/runner/apiScraperWrapper.js"]
]);

// Wrapper used for every step whose type is not in WRAPPER_BY_TYPE.
const SCRIPT_WRAPPER = "lib/runner/scriptWrapper.js";

// Misspelled step types that hasTypo() reports.
const MISSPELLED_TYPES = new Set(["scrapper", "apiScrapper", "renderlessScrapper"]);

/**
 * Walks a config object and lists the name of every step it declares.
 *
 * A step name is the underscore-joined path to the step, for example
 * "waterfall_1_parallel_0". Whenever a step itself carries a `waterfall` or
 * `parallel` array, the names found below it are appended as a nested array
 * right after the step's own name (once per container key that is an array).
 *
 * @param {object} config - Config (or step) object to walk.
 * @param {string} [previous] - Name prefix of the enclosing step, if any.
 * @returns {Array} Step names, with nested arrays for nested levels.
 * @throws {Error} When config is falsy.
 */
function firstPass(config, previous) {
    if (!config) {
        throw new Error("The config must be an object");
    }
    const firstPassArray = [];
    LEVEL_KINDS.forEach(function (kind) {
        collectLevel(config, kind, previous, firstPassArray);
    });
    return firstPassArray;
}
exports.firstPass = firstPass;

// Appends the names of the steps listed under config[kind] (and of their children) to `collected`.
function collectLevel(config, kind, previous, collected) {
    const level = config[kind];
    if (!Array.isArray(level)) {
        return;
    }
    level.forEach(function (step, stepIndex) {
        const name = previous ? `${previous}_${kind}_${stepIndex}` : `${kind}_${stepIndex}`;
        collected.push(name);
        // One nested entry per container key present on the step; each entry
        // is a full walk of the step, exactly as the step's children appear.
        LEVEL_KINDS.forEach(function (nestedKind) {
            if (Array.isArray(step[nestedKind])) {
                collected.push(firstPass(step, name));
            }
        });
    });
}

/**
 * Rejects empty levels and step names that are nested deeper than two levels.
 *
 * @param {Array} steps - Output of firstPass().
 * @throws {Error} On an empty (sub)array or a name with more than four "_"-separated parts.
 */
function checkStepLevels(steps) {
    if (steps.length === 0) {
        throw new Error("Can't have an empty level");
    }
    steps.forEach(function (step) {
        if (!Array.isArray(step)) {
            if (step.split("_").length > 4) {
                throw new Error("Can't have more than two nested levels");
            }
            return;
        }
        if (step.length === 0) {
            throw new Error("Can't have an empty level");
        }
        checkStepLevels(step);
    });
}
exports.checkStepLevels = checkStepLevels;

/**
 * Checks that the pipeline starts with a waterfall or parallel level and that
 * no level is nested directly inside a level of the same kind.
 *
 * @param {Array} steps - Output of firstPass().
 * @throws {Error} When the first element is neither "waterfall_0" nor
 *   "parallel_0", or when a nested name repeats its parent's kind.
 */
function validateStepFormat(steps) {
    if (steps[0] !== "waterfall_0" && steps[0] !== "parallel_0") {
        throw new Error("Top level element must be parallel or waterfall");
    }
    steps.forEach(function (step) {
        if (Array.isArray(step)) {
            validateNestedSteps(step);
        }
    });
}
exports.validateStepFormat = validateStepFormat;

// Checks every name of a nested level; recurses so deeper arrays are validated instead of crashing on .split().
function validateNestedSteps(nested) {
    nested.forEach(function (stepN) {
        if (Array.isArray(stepN)) {
            validateNestedSteps(stepN);
            return;
        }
        const parts = stepN.split("_");
        const ownKind = parts[parts.length - 2];
        const parentKind = parts[parts.length - 4];
        if (ownKind === "parallel" && parentKind === "parallel") {
            throw new Error("Can't nest a parallel inside a parallel");
        }
        if (ownKind === "waterfall" && parentKind === "waterfall") {
            throw new Error("Can't nest a waterfall inside a waterfall");
        }
    });
}

/**
 * Counts the step names in a (possibly nested) step list.
 *
 * @param {Array} steps - Output of firstPass().
 * @returns {number} Number of non-array entries at any depth.
 */
function countSteps(steps) {
    return steps.reduce(function (count, step) {
        return count + (Array.isArray(step) ? countSteps(step) : 1);
    }, 0);
}
exports.countSteps = countSteps;

// True when any step (top level or one level down) resolves to a config entry whose type satisfies `matchesType`.
function someStepHasType(steps, config, matchesType) {
    for (const step of steps) {
        const group = Array.isArray(step) ? step : [step];
        for (const st of group) {
            if (matchesType(getObjectReference(config, st).type)) {
                return true;
            }
        }
    }
    return false;
}

/**
 * Tells whether at least one step is declared with type "scraper".
 *
 * @param {Array} steps - Output of firstPass().
 * @param {object} config - Config the step names refer to.
 * @returns {boolean}
 */
function isUsingNormalScraper(steps, config) {
    return someStepHasType(steps, config, type => type === "scraper");
}
exports.isUsingNormalScraper = isUsingNormalScraper;

/**
 * Tells whether at least one step uses a misspelled type
 * ("scrapper", "apiScrapper" or "renderlessScrapper").
 *
 * @param {Array} steps - Output of firstPass().
 * @param {object} config - Config the step names refer to.
 * @returns {boolean}
 */
function hasTypo(steps, config) {
    return someStepHasType(steps, config, type => MISSPELLED_TYPES.has(type));
}
exports.hasTypo = hasTypo;

/**
 * Resolves a step name to the config entry it points at.
 *
 * "subwaterfall" segments are read as "parallel" and the name is then followed
 * key by key ("parallel_0_waterfall_1" -> config.parallel[0].waterfall[1]).
 *
 * @param {object} config - Config to look into.
 * @param {string} stepName - Underscore-joined step name.
 * @returns {object} The referenced entry, or an empty object when the path
 *   does not exist or resolves to a falsy value.
 */
function getObjectReference(config, stepName) {
    const formatedStepName = stepName.replace(/subwaterfall/g, "parallel");
    const resolved = formatedStepName.split("_").reduce(function (acc, key) {
        // A missing intermediate segment means "not found", not a crash.
        return acc === undefined || acc === null ? undefined : acc[key];
    }, config);
    return resolved || {};
}
exports.getObjectReference = getObjectReference;

/**
 * Builds the list of processor definitions ("proc generators") for a pipeline.
 *
 * With a top-level parallel, every top-level branch becomes its own chain
 * ("init_<i>" -> "subwaterfall_<i>" ... -> "end"). Otherwise a single chain
 * "init" -> ... -> "end" is built and, as before, "init" and "end" are added
 * to the passed `steps` array in place.
 *
 * @param {object} config - Pipeline config.
 * @param {Array} steps - Output of firstPass(config).
 * @param {object} options - Values copied into every generated payload. Read
 *   here: bridgePort, protocolPort, projectFolder, storeProjectFolder,
 *   persistentSession, operationId, startDate.
 * @returns {{procGenerators: Array<object>, initializers: Array<string>}}
 */
function createProcGenerators(config, steps, options) {
    const procGenerators = [];
    const initializers = [];
    const top = steps[0].split("_")[0];
    if (top !== "parallel") {
        initializers.push("init");
        steps.unshift("init");
        steps.push("end");
        _createProcGenerators(config, steps, options, procGenerators);
        return { procGenerators, initializers };
    }
    steps
        .map(function (step, i) {
            // A branch without children becomes a one-element chain. A branch
            // followed by its children array is dropped by the filter below and
            // re-added in front of that array as "subwaterfall_<i>".
            const next = steps[i + 1];
            const standsAlone = typeof step === "string" && (typeof next === "string" || !next);
            return standsAlone ? [step] : step;
        })
        .filter(step => Array.isArray(step))
        .map(function (branch) {
            return branch.map(st => st.replace(/parallel/g, "subwaterfall"));
        })
        .forEach(function (branch, i) {
            if (branch[0] !== `subwaterfall_${i}`) {
                branch.unshift(`subwaterfall_${i}`);
            }
            initializers.push(`init_${i}`);
            branch.unshift(`init_${i}`);
            branch.push("end");
            _createProcGenerators(config, branch, options, procGenerators);
        });
    return { procGenerators, initializers };
}
exports.createProcGenerators = createProcGenerators;

// Walks one chain and registers, for every step, the pre-step processors feeding it and the step processor itself.
function _createProcGenerators(config, steps, options, procGenerators) {
    steps.forEach(function (step, index) {
        const previousStep = steps[index - 1];
        const previousPreviousStep = steps[index - 2];
        const targets = Array.isArray(step) ? step : [step];
        targets.forEach(function (st) {
            if (previousStep) {
                const sources = Array.isArray(previousStep) ? previousStep : [previousStep];
                sources.forEach(function (pst) {
                    const isParallel = checkParallel(pst);
                    addPreStep(config, pst, st, isParallel, options, procGenerators);
                    // A step that follows a parallel group is fed from the
                    // step(s) that came before the group.
                    if (isParallel && previousPreviousStep) {
                        addParallelPreStep(config, previousPreviousStep, st, options, procGenerators);
                    }
                });
            }
            addStep(config, st, procGenerators);
        });
    });
}

// True when the step name's last container segment is "parallel".
function checkParallel(step) {
    const parts = step.split("_");
    return parts[parts.length - 2] === "parallel";
}

// Registers the processor that runs a step's module ("pre_<step>" -> "<step>"), unless it exists or the step has no module.
function addStep(config, step, procGenerators) {
    if (step.match("init")) {
        return;
    }
    const from = `pre_${step}`;
    if (procGenerators.find(pr => pr.from === from && pr.to === step)) {
        return;
    }
    const objectRef = getObjectReference(config, step);
    let wrapper = WRAPPER_BY_TYPE.get(objectRef.type);
    if (!wrapper) {
        // Unknown or missing types are treated as scripts; the type is written
        // back onto the referenced entry, even when the entry has no module.
        objectRef.type = "script";
        wrapper = SCRIPT_WRAPPER;
    }
    if (!objectRef.module) {
        return;
    }
    procGenerators.push({
        name: `proc_from_${from}_to_${step}`,
        from,
        to: step,
        processor: path_1.resolve(appRoot, wrapper),
        config: objectRef.config || {}
    });
}

// Registers "<previousStep>" -> "pre_<step>" unless the previous step belongs to a parallel group.
function addPreStep(config, previousStep, step, isParallel, options, procGenerators) {
    if (step.match("init")) {
        return;
    }
    if (isParallel) {
        return;
    }
    pushPreStepGenerator(config, previousStep, step, options, procGenerators);
}

// Registers "<source>" -> "pre_<step>" for the step (or each step of the group) that preceded a parallel group.
function addParallelPreStep(config, previousPreviousStep, step, options, procGenerators) {
    if (step.match("init")) {
        return;
    }
    const sources = Array.isArray(previousPreviousStep) ? previousPreviousStep : [previousPreviousStep];
    sources.forEach(function (source) {
        pushPreStepGenerator(config, source, step, options, procGenerators);
    });
}

// Adds the "<from>" -> "pre_<step>" processor definition once per (from, step) pair.
function pushPreStepGenerator(config, from, step, options, procGenerators) {
    const to = `pre_${step}`;
    if (procGenerators.find(pr => pr.from === from && pr.to === to)) {
        return;
    }
    procGenerators.push({
        name: `proc_from_${from}_to_${to}`,
        from,
        to,
        config: {},
        processor: buildPreStepProcessor(config, step, from, options)
    });
}

// Builds the function that turns an incoming `log` into the payload handed to the wrapper of `step`.
// Every caller goes through this single builder so all payloads carry the same
// fields; in particular `persistentSession` is now present for steps that
// follow a parallel group, where one hand-written copy used to omit it.
function buildPreStepProcessor(config, step, selfTopic, options) {
    const objectRefLiteral = JSON.stringify(getObjectReference(config, step));
    const globalConfig = config.config;
    const proxyUrl = (globalConfig && globalConfig.proxyUrl) || "";
    const ignoreCertificateErrors = (globalConfig && globalConfig.ignoreCertificateErrors) || false;

    const browserConnection = `connectionConfig: ${JSON.stringify({ bridgePort: options.bridgePort, protocolPort: options.protocolPort })}`;
    const bridgeConnection = `connectionConfig: ${JSON.stringify({ bridgePort: options.bridgePort })}`;

    const input = "input: log.body";
    const stepConfig = "config: (obj && obj.config) || {}";
    const params = "params: (obj && obj.params) || {}";
    const load = "load: (obj && obj.load) || {}";
    const moduleName = "module: (obj && obj.module) || \"\"";

    // NOTE(review): the values below are pasted into JavaScript source as-is.
    // A quote, backslash or line break inside any of them produces broken (or
    // different) code, so callers must pass values that are free of those.
    const target = [
        `saveTopic: "${step}"`,
        `projectFolder: "${options.projectFolder}"`,
        `storeProjectFolder: "${options.storeProjectFolder}"`
    ];
    const session = `persistentSession: ${options.persistentSession}`;
    const run = [
        `operationId: "${options.operationId}"`,
        `startDate: "${options.startDate}"`,
        `procName: "proc_from_pre_${step}_to_${step}"`,
        `selfTopic: "${selfTopic}"`,
        `appRoot: "${appRoot}"`
    ];
    const network = [
        `proxyUrl: "${proxyUrl}"`,
        `ignoreCertificateErrors: ${ignoreCertificateErrors}`
    ];

    const resolveWith = fields => `return Promise.resolve({\n${fields.join(",\n")}\n});`;

    const source = [
        `const obj = ${objectRefLiteral};`,
        "if (obj.type === \"scraper\") {",
        resolveWith([input, stepConfig, params, load, moduleName, browserConnection, ...target, session, ...run, ...network]),
        "} else if (obj.type === \"renderlessScraper\") {",
        resolveWith([input, stepConfig, params, load, moduleName, bridgeConnection, ...target, session, ...run, ...network]),
        "} else if (obj.type === \"apiScraper\") {",
        resolveWith([input, stepConfig, params, moduleName, bridgeConnection, ...target, session, ...run, ...network]),
        "} else {",
        resolveWith([input, params, moduleName, bridgeConnection, ...target, ...run]),
        "}"
    ].join("\n");

    // The processor is produced as a Function built from source text; the
    // embedded step entry is a JSON snapshot taken at build time.
    return new Function("log", source);
}