// ayakashi
// Version:
// The next generation web scraping framework
// 608 lines (607 loc) • 29.4 kB
// JavaScript
;
// Tag the exports object with `__esModule: true` (interop flag emitted by the transpiler).
Object.defineProperty(exports, "__esModule", { value: true });
// Initialise every exported name to undefined; the real functions are assigned
// right after each definition further down in this file.
exports.createProcGenerators = exports.getObjectReference = exports.hasTypo = exports.isUsingNormalScraper = exports.countSteps = exports.validateStepFormat = exports.checkStepLevels = exports.firstPass = void 0;
const path_1 = require("path");
// Absolute path of the directory that contains this file.
const fullPath = path_1.resolve(__dirname);
// Application root: this file's directory with the first "<sep>lib<sep>runner"
// occurrence removed. Used below to locate the wrapper scripts and embedded
// into the generated pre-step processors.
let appRoot = fullPath.replace(path_1.sep + "lib" + path_1.sep + "runner", "");
if (process.platform === "win32") {
    // Convert backslashes to forward slashes on Windows.
    // NOTE(review): presumably needed because appRoot is pasted into generated
    // source text, where a backslash would start an escape sequence - confirm.
    appRoot = appRoot.replace(/\\/g, "/");
}
/**
 * Walks a pipeline config and lists the name of every step it contains.
 *
 * A step's name is its position inside its `waterfall` / `parallel` array
 * (`waterfall_0`, `parallel_2`, ...), prefixed with the name of the enclosing
 * step when there is one (`waterfall_1_parallel_0`). When a step itself holds
 * a `waterfall` or `parallel` array, the names of its children are appended
 * as a nested array right after the step's own name.
 *
 * @param {object} config - Object that may carry `waterfall` and/or `parallel` arrays.
 * @param {string} [previous] - Name of the enclosing step, used as a prefix.
 * @returns {Array} Step names, with nested arrays for nested levels.
 * @throws {Error} When `config` is falsy.
 */
function firstPass(config, previous) {
    if (!config) {
        throw new Error("The config must be an object");
    }
    const names = [];
    // Each row: the level kind to scan, followed by the order in which its
    // children are probed for nested levels.
    const levels = [
        ["waterfall", ["parallel", "waterfall"]],
        ["parallel", ["waterfall", "parallel"]]
    ];
    for (const [kind, nestedKinds] of levels) {
        const children = config[kind];
        if (!(children && Array.isArray(children))) {
            continue;
        }
        children.forEach((child, position) => {
            const name = previous
                ? `${previous}_${kind}_${position}`
                : `${kind}_${position}`;
            names.push(name);
            // A child that declares both kinds is descended into once per kind.
            for (const nestedKind of nestedKinds) {
                if (child[nestedKind] && Array.isArray(child[nestedKind])) {
                    names.push(firstPass(child, name));
                }
            }
        });
    }
    return names;
}
exports.firstPass = firstPass;
/**
 * Validates the nesting of a step-name list (as produced by `firstPass`).
 *
 * @param {Array} steps - Step names, possibly containing nested arrays.
 * @throws {Error} "Can't have an empty level" when `steps` or any nested array is empty.
 * @throws {Error} "Can't have more than two nested levels" when a name has
 *   more than four `_`-separated segments.
 */
function checkStepLevels(steps) {
    if (steps.length === 0) {
        throw new Error("Can't have an empty level");
    }
    steps.forEach((entry) => {
        if (Array.isArray(entry)) {
            // The recursive call rejects an empty nested level with the same error.
            checkStepLevels(entry);
            return;
        }
        // Two levels of nesting give at most four segments, e.g. "waterfall_0_parallel_1".
        const segmentCount = entry.split("_").length;
        if (segmentCount > 4) {
            throw new Error("Can't have more than two nested levels");
        }
    });
}
exports.checkStepLevels = checkStepLevels;
/**
 * Checks the overall shape of a step-name list.
 *
 * The first entry must be `waterfall_0` or `parallel_0`, and no name inside a
 * nested array may place a level directly inside a level of the same kind
 * (the kind is the second-to-last segment, the enclosing kind the fourth-to-last).
 *
 * @param {Array} steps - Step names, possibly containing nested arrays of names.
 * @throws {Error} When the first entry is not a top-level waterfall/parallel step.
 * @throws {Error} When a parallel is nested in a parallel, or a waterfall in a waterfall.
 */
function validateStepFormat(steps) {
    const first = steps[0];
    const startsAtTopLevel = first === "waterfall_0" || first === "parallel_0";
    if (!startsAtTopLevel) {
        throw new Error("Top level element must be parallel or waterfall");
    }
    const sameKindNestingErrors = new Map([
        ["parallel", "Can't nest a parallel inside a parallel"],
        ["waterfall", "Can't nest a waterfall inside a waterfall"]
    ]);
    steps.forEach((entry) => {
        // Only names inside nested arrays are inspected.
        if (!Array.isArray(entry)) {
            return;
        }
        entry.forEach((name) => {
            const segments = name.split("_");
            const kind = segments[segments.length - 2];
            const enclosingKind = segments[segments.length - 4];
            if (kind === enclosingKind && sameKindNestingErrors.has(kind)) {
                throw new Error(sameKindNestingErrors.get(kind));
            }
        });
    });
}
exports.validateStepFormat = validateStepFormat;
/**
 * Counts the step names in a (possibly nested) step list.
 *
 * @param {Array} steps - Step names, possibly containing nested arrays.
 * @returns {number} Number of non-array entries at any depth.
 */
function countSteps(steps) {
    return steps.reduce(
        (total, entry) => total + (Array.isArray(entry) ? countSteps(entry) : 1),
        0
    );
}
exports.countSteps = countSteps;
/**
 * Tells whether any step in the list resolves to a config entry of type "scraper".
 *
 * Every step is looked up (one level of nested arrays is expanded); the scan
 * does not stop at the first match.
 *
 * @param {Iterable} steps - Step names, possibly containing arrays of names.
 * @param {object} config - Pipeline config the names are resolved against.
 * @returns {boolean} True when at least one step has `type === "scraper"`.
 */
function isUsingNormalScraper(steps, config) {
    const isScraper = (name) => getObjectReference(config, name).type === "scraper";
    let found = false;
    for (const entry of steps) {
        const names = Array.isArray(entry) ? entry : [entry];
        for (const name of names) {
            if (isScraper(name)) {
                found = true;
            }
        }
    }
    return found;
}
exports.isUsingNormalScraper = isUsingNormalScraper;
/**
 * Tells whether any step in the list uses one of the misspelled type names
 * "scrapper", "apiScrapper" or "renderlessScrapper".
 *
 * Every step is looked up (one level of nested arrays is expanded); the scan
 * does not stop at the first match.
 *
 * @param {Iterable} steps - Step names, possibly containing arrays of names.
 * @param {object} config - Pipeline config the names are resolved against.
 * @returns {boolean} True when at least one step has a misspelled type.
 */
function hasTypo(steps, config) {
    const misspelledTypes = new Set(["scrapper", "apiScrapper", "renderlessScrapper"]);
    let found = false;
    for (const entry of steps) {
        const names = Array.isArray(entry) ? entry : [entry];
        for (const name of names) {
            const { type } = getObjectReference(config, name);
            if (misspelledTypes.has(type)) {
                found = true;
            }
        }
    }
    return found;
}
exports.hasTypo = hasTypo;
/**
 * Resolves a step name to the object it designates inside the config.
 *
 * The name is split on "_" and each segment is used as a property key, so
 * "waterfall_1_parallel_0" reads `config.waterfall[1].parallel[0]`. Every
 * "subwaterfall" in the name is first mapped back to "parallel".
 *
 * @param {object} config - Pipeline config to walk.
 * @param {string} stepName - Step name such as "waterfall_0" or "subwaterfall_1_waterfall_0".
 * @returns {object} The referenced value, or a fresh empty object when the
 *   path does not exist or the value found is falsy.
 */
function getObjectReference(config, stepName) {
    const keys = stepName.replace(/subwaterfall/g, "parallel").split("_");
    let current = config;
    for (const key of keys) {
        // A missing intermediate level means the step does not exist; fall back
        // to {} exactly as for a missing final key instead of throwing.
        if (current === null || current === undefined) {
            return {};
        }
        current = current[key];
    }
    return current || {};
}
exports.getObjectReference = getObjectReference;
/**
 * Builds the processor descriptors for a whole pipeline.
 *
 * For a top-level waterfall, "init" and "end" are added around `steps`
 * (the caller's array is modified in place) and the list is processed as one chain.
 * For a top-level parallel, every top-level entry becomes its own chain:
 * "parallel" is renamed to "subwaterfall" in its names and the chain is
 * wrapped as `init_<i>`, `subwaterfall_<i>`, ..., "end".
 *
 * @param {object} config - Pipeline config.
 * @param {Array} steps - Step names as produced by `firstPass`.
 * @param {object} options - Passed through to `_createProcGenerators`.
 * @returns {{procGenerators: Array, initializers: string[]}} The collected
 *   descriptors and the names of the init topics that were created.
 */
function createProcGenerators(config, steps, options) {
    const procGenerators = [];
    const initializers = [];
    const topKind = steps[0].split("_")[0];
    if (topKind !== "parallel") {
        initializers.push("init");
        steps.unshift("init");
        steps.push("end");
        _createProcGenerators(config, steps, options, procGenerators);
        return { procGenerators, initializers };
    }
    // A top-level name followed by another name (or by nothing) has no children
    // and becomes a one-element chain; a name followed by an array is dropped
    // by the filter below and re-added as `subwaterfall_<i>` further down.
    const grouped = steps.map((entry, position) => {
        const next = steps[position + 1];
        const standsAlone = typeof entry === "string" && (typeof next === "string" || !next);
        return standsAlone ? [entry] : entry;
    });
    const chains = grouped
        .filter((entry) => Array.isArray(entry))
        .map((chain) => chain.map((name) => name.replace(/parallel/g, "subwaterfall")));
    chains.forEach((chain, chainIndex) => {
        const head = `subwaterfall_${chainIndex}`;
        if (chain[0] !== head) {
            chain.unshift(head);
        }
        const initName = `init_${chainIndex}`;
        initializers.push(initName);
        chain.unshift(initName);
        chain.push("end");
        _createProcGenerators(config, chain, options, procGenerators);
    });
    return { procGenerators, initializers };
}
exports.createProcGenerators = createProcGenerators;
/**
 * Registers the processors for one chain of steps.
 *
 * For every step (or every member of a step group), a pre-step is requested
 * from each member of the preceding entry, then the step itself is registered.
 * When a preceding member is a parallel step and an entry exists two positions
 * back, an additional pre-step is requested from that earlier entry.
 *
 * @param {object} config - Pipeline config.
 * @param {Array} steps - Chain entries: step names or arrays of step names.
 * @param {object} options - Passed through to the pre-step builders.
 * @param {Array} procGenerators - Output list, appended to in place.
 */
function _createProcGenerators(config, steps, options, procGenerators) {
    const asList = (entry) => (Array.isArray(entry) ? entry : [entry]);
    steps.forEach((entry, index) => {
        const previous = steps[index - 1];
        const beforePrevious = steps[index - 2];
        asList(entry).forEach((target) => {
            if (previous) {
                asList(previous).forEach((source) => {
                    const isParallel = checkParallel(source);
                    addPreStep(config, source, target, isParallel, options, procGenerators);
                    if (isParallel && beforePrevious) {
                        addParallelPreStep(config, beforePrevious, target, options, procGenerators);
                    }
                });
            }
            addStep(config, target, procGenerators);
        });
    });
}
/**
 * Tells whether a step name designates a parallel step, i.e. whether its
 * second-to-last `_`-separated segment is "parallel".
 *
 * @param {string} step - Step name such as "waterfall_0_parallel_1".
 * @returns {boolean} True for a parallel step.
 */
function checkParallel(step) {
    const segments = step.split("_");
    return segments[segments.length - 2] === "parallel";
}
/**
 * Registers the processor that runs a step, reading from `pre_<step>` and
 * writing to `<step>`.
 *
 * Nothing is added for steps whose name contains "init", for steps already
 * registered, or for steps whose config entry has no `module`. The wrapper
 * script is picked from the entry's `type`; any type other than "scraper",
 * "renderlessScraper" or "apiScraper" is rewritten to "script" on the entry.
 *
 * @param {object} config - Pipeline config.
 * @param {string} step - Step name.
 * @param {Array} procGenerators - Output list, appended to in place.
 */
function addStep(config, step, procGenerators) {
    if (step.match("init")) {
        return;
    }
    const from = `pre_${step}`;
    const alreadyRegistered = procGenerators.some((pr) => pr.from === from && pr.to === step);
    if (alreadyRegistered) {
        return;
    }
    const objectRef = getObjectReference(config, step);
    const wrapperByType = new Map([
        ["scraper", "lib/runner/scraperWrapper.js"],
        ["renderlessScraper", "lib/runner/renderlessScraperWrapper.js"],
        ["apiScraper", "lib/runner/apiScraperWrapper.js"]
    ]);
    let wrapperFile = wrapperByType.get(objectRef.type);
    if (wrapperFile === undefined) {
        // Unknown or missing types are treated as plain scripts; the type is
        // written back before the module check, so it is set even when no
        // processor ends up being registered.
        objectRef.type = "script";
        wrapperFile = "lib/runner/scriptWrapper.js";
    }
    if (!objectRef.module) {
        return;
    }
    procGenerators.push({
        name: `proc_from_pre_${step}_to_${step}`,
        from,
        to: step,
        processor: path_1.resolve(appRoot, wrapperFile),
        config: objectRef.config || {}
    });
}
/**
 * Builds the processor function of a pre-step: given a `log`, it resolves to
 * the payload handed to the step's wrapper (`input` is `log.body`, the rest
 * comes from the step's config entry, `options` and the global `config.config`).
 *
 * The function is compiled from source text with every value embedded as a
 * literal, so it does not close over anything from this module.
 * NOTE(review): presumably the processor has to be self-contained because it
 * is serialised or run elsewhere - confirm before replacing `new Function`
 * with a closure.
 *
 * All embedded strings go through JSON.stringify so that quotes, backslashes
 * (e.g. Windows paths) or line breaks in a value cannot corrupt or break the
 * generated code.
 *
 * @param {object} config - Pipeline config; `config.config` may hold `proxyUrl`
 *   and `ignoreCertificateErrors`.
 * @param {string} step - Name of the step the payload is built for.
 * @param {string} selfTopic - Name of the step the pre-step reads from.
 * @param {object} options - Reads `bridgePort`, `protocolPort`, `projectFolder`,
 *   `storeProjectFolder`, `persistentSession`, `operationId` and `startDate`.
 * @returns {Function} `(log) => Promise<object>`.
 */
function buildPreStepProcessor(config, step, selfTopic, options) {
    // String(...) keeps the text the old `"${value}"` interpolation produced
    // (including "undefined" for a missing option).
    const quote = (value) => JSON.stringify(String(value));
    const globalConfig = config.config;
    const proxyUrl = (globalConfig && globalConfig.proxyUrl) || "";
    const ignoreCertificateErrors = (globalConfig && globalConfig.ignoreCertificateErrors) || false;
    const bridgeOnly = JSON.stringify({ bridgePort: options.bridgePort });
    const bridgeAndProtocol = JSON.stringify({
        bridgePort: options.bridgePort,
        protocolPort: options.protocolPort
    });
    const locationFields = [
        `saveTopic: ${quote(step)}`,
        `projectFolder: ${quote(options.projectFolder)}`,
        `storeProjectFolder: ${quote(options.storeProjectFolder)}`
    ];
    const runFields = [
        `operationId: ${quote(options.operationId)}`,
        `startDate: ${quote(options.startDate)}`,
        `procName: ${quote(`proc_from_pre_${step}_to_${step}`)}`,
        `selfTopic: ${quote(selfTopic)}`,
        `appRoot: ${quote(appRoot)}`
    ];
    const networkFields = [
        `proxyUrl: ${quote(proxyUrl)}`,
        `ignoreCertificateErrors: ${JSON.stringify(ignoreCertificateErrors)}`
    ];
    // Payload shared by the three scraper types; only the connection settings
    // and the presence of `load` differ between them.
    const scraperFields = (connectionConfig, withLoad) => [
        "input: log.body",
        "config: (obj && obj.config) || {}",
        "params: (obj && obj.params) || {}",
        ...(withLoad ? ["load: (obj && obj.load) || {}"] : []),
        'module: (obj && obj.module) || ""',
        `connectionConfig: ${connectionConfig}`,
        ...locationFields,
        `persistentSession: ${JSON.stringify(options.persistentSession)}`,
        ...runFields,
        ...networkFields
    ].join(",\n");
    // Plain scripts get neither config, session nor network settings.
    const scriptFields = [
        "input: log.body",
        "params: (obj && obj.params) || {}",
        'module: (obj && obj.module) || ""',
        `connectionConfig: ${bridgeOnly}`,
        ...locationFields,
        ...runFields
    ].join(",\n");
    const source = `
        const obj = ${JSON.stringify(getObjectReference(config, step))};
        if (obj.type === "scraper") {
            return Promise.resolve({
                ${scraperFields(bridgeAndProtocol, true)}
            });
        } else if (obj.type === "renderlessScraper") {
            return Promise.resolve({
                ${scraperFields(bridgeOnly, true)}
            });
        } else if (obj.type === "apiScraper") {
            return Promise.resolve({
                ${scraperFields(bridgeOnly, false)}
            });
        } else {
            return Promise.resolve({
                ${scriptFields}
            });
        }
    `;
    return new Function("log", source);
}
/**
 * Registers the pre-step that feeds `step` from `previousStep`
 * (topic `previousStep` -> topic `pre_<step>`).
 *
 * Nothing is added when the step name contains "init", when `isParallel` is
 * true, or when the same from/to pair is already registered.
 *
 * @param {object} config - Pipeline config.
 * @param {string} previousStep - Name of the step to read from.
 * @param {string} step - Name of the step being prepared.
 * @param {boolean} isParallel - Whether `previousStep` is a parallel step.
 * @param {object} options - Run options embedded into the payload.
 * @param {Array} procGenerators - Output list, appended to in place.
 */
function addPreStep(config, previousStep, step, isParallel, options, procGenerators) {
    if (step.match("init")) {
        return;
    }
    if (isParallel) {
        return;
    }
    const to = `pre_${step}`;
    if (procGenerators.find((pr) => pr.from === previousStep && pr.to === to)) {
        return;
    }
    procGenerators.push({
        name: `proc_from_${previousStep}_to_pre_${step}`,
        from: previousStep,
        to,
        config: {},
        processor: buildPreStepProcessor(config, step, previousStep, options)
    });
}
/**
 * Registers pre-steps that feed `step` from the entry two positions before it
 * in the chain. `previousPreviousStep` may be one step name or an array of
 * names; one pre-step is added per name unless that from/to pair already exists.
 *
 * Nothing is added when the step name contains "init".
 *
 * @param {object} config - Pipeline config.
 * @param {string|string[]} previousPreviousStep - Name(s) of the step(s) to read from.
 * @param {string} step - Name of the step being prepared.
 * @param {object} options - Run options embedded into the payload.
 * @param {Array} procGenerators - Output list, appended to in place.
 */
function addParallelPreStep(config, previousPreviousStep, step, options, procGenerators) {
    if (step.match("init")) {
        return;
    }
    const sources = Array.isArray(previousPreviousStep)
        ? previousPreviousStep
        : [previousPreviousStep];
    const to = `pre_${step}`;
    sources.forEach((source) => {
        if (procGenerators.find((pr) => pr.from === source && pr.to === to)) {
            return;
        }
        procGenerators.push({
            name: `proc_from_${source}_to_pre_${step}`,
            from: source,
            to,
            config: {},
            // Same payload as addPreStep, so scrapers reached through this path
            // also receive `persistentSession`.
            processor: buildPreStepProcessor(config, step, source, options)
        });
    });
}