/*
 * ayakashi
 * Version:
 * The next generation web scraping framework
 * 259 lines (258 loc) • 13.4 kB
 * JavaScript
 */
;
/**
 * Drives a generator function as if it were an async function: every yielded
 * value is awaited and the settled result is fed back into the generator.
 * An already defined `this.__awaiter` is reused when present.
 * NOTE(review): looks like the standard helper emitted by the TypeScript
 * compiler for async functions - confirm before hand-editing.
 *
 * @param thisArg - `this` value the generator function is applied with
 * @param _arguments - arguments forwarded to the generator function (an empty list when falsy)
 * @param P - promise constructor to use; replaced by the global `Promise` when falsy
 * @param generator - generator function to run
 * @returns a promise resolved with the generator's return value, or rejected
 *          with whatever the generator (or a yielded promise) throws uncaught
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// wraps a yielded value in a P promise unless it already is an instance of P
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// resumes the generator with the fulfilled value; a synchronous throw rejects the outer promise
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// throws the rejection reason into the generator so its own try/catch blocks can handle it
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// finishes when the generator is done, otherwise waits for the yielded value and continues
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// `generator` is rebound from the generator function to the iterator it creates
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
/**
 * Normalizes a required module so that it can always be read through `.default`.
 * An already defined `this.__importDefault` is reused when present.
 *
 * @param mod - the value returned by `require`
 * @returns `mod` itself when it is flagged with a truthy `__esModule`,
 *          otherwise a fresh wrapper object of the form `{ default: mod }`
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    const isEsModule = Boolean(mod && mod.__esModule);
    if (isEsModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const core_1 = __importDefault(require("@ayakashi/request/core"));
const createConnection_1 = require("../engine/createConnection");
const prelude_1 = require("../prelude/prelude");
const yield_1 = require("../prelude/actions/yield");
const request_1 = require("../prelude/actions/request");
const cookies_1 = require("../prelude/actions/cookies");
const path_1 = require("path");
const pipeproc_1 = require("pipeproc");
const compiler_1 = require("../preloaderCompiler/compiler");
const opLog_1 = require("../opLog/opLog");
const client_1 = require("../bridge/client");
const cookies_2 = require("./cookies");
const cookieHelpers_1 = require("../utils/cookieHelpers");
const loaders_1 = require("./loaders");
const debug_1 = __importDefault(require("debug"));
const d = debug_1.default("ayakashi:scraperWrapper");
/**
 * Runs one scraper module against a dedicated chrome target.
 *
 * Steps, in order: get a target from the bridge and open a connection, apply
 * the certificate and user-agent settings, copy the cookie jar into chrome,
 * build the ayakashi instance (prelude, request API, cookie actions, yields),
 * inject the built-in preloaders, load external/local extensions, activate
 * the connection and finally require and execute the scraper.
 *
 * Once the connection has been created it is released on every exit path,
 * including failures during setup and while yielding the final result.
 *
 * @param log - operation message; only `log.body` is read (module, config,
 *              connectionConfig, load, input, params, proxyUrl, the folder
 *              fields and the save/self topics)
 * @returns {Promise<void>} settles after the scraper has finished and the
 *          connection has been released
 * @throws rethrows any setup, load or scraper error after logging it
 */
function scraperWrapper(log) {
    return __awaiter(this, void 0, void 0, function* () {
        try {
            const opLog = opLog_1.getOpLog();
            opLog.info("running scraper", log.body.module);
            const bridgeClient = client_1.getBridgeClient(log.body.connectionConfig.bridgePort);
            //get a tab and create a connection
            let tab;
            try {
                tab = yield bridgeClient.getTarget();
                if (!tab) {
                    throw new Error("no_target");
                }
            }
            catch (e) {
                d(e);
                opLog.error("Could not create a chrome target");
                throw e;
            }
            let connection;
            try {
                connection = yield createConnection_1.createConnection(tab, log.body.connectionConfig.bridgePort, log.body.config.emulatorOptions);
                if (!connection) {
                    throw new Error("no_connection");
                }
            }
            catch (e) {
                d(e);
                opLog.error("Could not create a connection");
                throw e;
            }
            //the connection exists from here on, the finally block below is the
            //single place that releases it so no failure path can leak it
            try {
                //ignoreCertificateErrors option
                if (log.body.ignoreCertificateErrors) {
                    yield connection.client.Security.setIgnoreCertificateErrors({ ignore: true });
                }
                //user-agent setup
                const emulatorOptions = log.body.config.emulatorOptions || {};
                const userAgentData = yield bridgeClient.getUserAgentData({
                    agent: emulatorOptions.userAgent || undefined,
                    platform: emulatorOptions.platform || undefined,
                    persistentSession: log.body.persistentSession
                });
                if (!userAgentData) {
                    throw new Error("could not generate userAgent");
                }
                const acceptLanguage = emulatorOptions.acceptLanguage || "en-US";
                yield connection.client.Emulation.setUserAgentOverride({
                    userAgent: userAgentData.userAgent,
                    platform: userAgentData.platform,
                    acceptLanguage: acceptLanguage
                });
                //get cookie jar
                const { jar, cookies } = yield cookies_2.getCookieJar(log.body.connectionConfig.bridgePort, {
                    persistentSession: log.body.persistentSession
                });
                //add all cookies from the jar to chrome
                if (cookies.length > 0) {
                    yield connection.client.Network.setCookies({
                        cookies: cookieHelpers_1.toChromeCookies(cookies)
                    });
                }
                //add all chrome cookies to jar and to the persistent store after every page load
                connection.unsubscribers.push(connection.client.Page.domContentEventFired(function () {
                    return __awaiter(this, void 0, void 0, function* () {
                        const chromeCookies = yield connection.client.Network.getCookies();
                        cookieHelpers_1.toRequestCookies(chromeCookies.cookies).forEach(function (cookie) {
                            jar.setCookie(cookieHelpers_1.toCookieString(cookie), cookieHelpers_1.getCookieUrl(cookie));
                        });
                        yield cookies_2.updateCookieJar(log.body.connectionConfig.bridgePort, jar, {
                            persistentSession: log.body.persistentSession
                        });
                    });
                }));
                //check pipes and initialize the instance using the prelude
                if (log.body.config.pipeConsole !== false) {
                    connection.pipe.console(function (text) {
                        //messages tagged with [Ayakashi] are not forwarded to the opLog
                        if (text && text.indexOf("[Ayakashi]") === -1) {
                            opLog.debug(`<Scraper:${log.body.module}:Browser>`, text);
                        }
                    });
                }
                if (log.body.config.pipeExceptions !== false) {
                    connection.pipe.uncaughtException(function (exception) {
                        opLog.debug(`<Scraper:${log.body.module}:Browser:Exception>`, JSON.stringify(exception, null, 2));
                    });
                }
                const ayakashiInstance = yield prelude_1.prelude(connection);
                //attach the request API
                const myRequest = core_1.default.defaults({
                    headers: {
                        "User-Agent": userAgentData.userAgent,
                        //tslint:disable max-line-length
                        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                        //tslint:enable max-line-length
                        "accept-language": acceptLanguage,
                        "cache-control": "no-cache",
                        pragma: "no-cache"
                    },
                    proxy: log.body.proxyUrl || undefined,
                    strictSSL: !log.body.ignoreCertificateErrors,
                    gzipOrBrotli: true,
                    timeout: 10000,
                    jar: jar
                });
                //pushes the cookies of the request jar to chrome and to the persistent store
                const cookieSync = function () {
                    return __awaiter(this, void 0, void 0, function* () {
                        const requestCookies = cookieHelpers_1.getAllCookiesFromRequestJar(jar);
                        //sync request cookies with chrome
                        if (requestCookies.length > 0) {
                            yield connection.client.Network.setCookies({
                                cookies: cookieHelpers_1.toChromeCookies(requestCookies)
                            });
                        }
                        //sync request cookies with the persistent store
                        yield cookies_2.updateCookieJar(log.body.connectionConfig.bridgePort, jar, {
                            persistentSession: log.body.persistentSession
                        });
                    });
                };
                request_1.attachRequest(ayakashiInstance, myRequest, cookieSync);
                cookies_1.attachCookieActions(ayakashiInstance, jar, connection, cookieSync);
                //connect to pipeproc
                const pipeprocClient = pipeproc_1.PipeProc();
                yield pipeprocClient.connect({ socket: `ipc://${path_1.resolve(log.body.storeProjectFolder, "ipc")}` });
                //attach the yield methods
                const yieldWatcher = { yieldedAtLeastOnce: false };
                yield_1.attachYields(ayakashiInstance, pipeprocClient, log.body.saveTopic, log.body.selfTopic, yieldWatcher);
                //compile and inject the built-in preloaders, one after the other and in this order:
                //domQL, findCssSelector, the old detection patches, the stealth patches, the marshalling helpers
                const preloaderCacheFolder = `${log.body.storeProjectFolder}/.cache/preloaders/`;
                const builtinPreloaders = [
                    { source: "./lib/domQL/domQL", as: "domQL" },
                    { source: "@ayakashi/get-node-selector", as: "getNodeSelector" },
                    { source: "./lib/detection/patch", as: "detectionPatches" },
                    { source: "./lib/detection/stealth.js", as: "stealthPatches" },
                    { source: "./lib/utils/marshalling", as: "marshalling" }
                ];
                for (const preloader of builtinPreloaders) {
                    const compiled = yield compiler_1.compile(log.body.appRoot, preloader.source, "ayakashi", preloaderCacheFolder, true);
                    yield connection.injectPreloader({ compiled: compiled, as: preloader.as, waitForDOM: false });
                }
                //load external actions/extractors/preloaders
                yield loadExternals(connection, ayakashiInstance, log);
                //autoLoad local actions/extractors/preloaders
                if (log.body.config.localAutoLoad !== false) {
                    yield loadLocals(connection, ayakashiInstance, log);
                }
                //activate the connection and load the scraper
                yield connection.activate();
                let scraperModule;
                try {
                    //simple scrapers live directly in the project folder, the rest under scrapers/
                    if (log.body.config.simple) {
                        scraperModule = require(path_1.resolve(log.body.projectFolder, log.body.module));
                    }
                    else {
                        scraperModule = require(path_1.resolve(log.body.projectFolder, "scrapers", log.body.module));
                    }
                    //accept both a directly exported function and a default export
                    if (typeof scraperModule !== "function") {
                        scraperModule = scraperModule.default;
                    }
                    if (typeof scraperModule !== "function") {
                        throw new Error(`Scraper <${log.body.module}> is not a function`);
                    }
                }
                catch (e) {
                    opLog.error(e.message);
                    throw e;
                }
                //run the scraper
                let result;
                try {
                    //a missing input (or input value) is treated as an empty object
                    //instead of crashing on log.body.input.value
                    const input = (log.body.input && log.body.input.value) || {};
                    //the continue marker is removed before the input reaches the scraper
                    if (input.continue === true) {
                        delete input.continue;
                    }
                    result = yield scraperModule(ayakashiInstance, input, log.body.params || {});
                }
                catch (e) {
                    opLog.error(`There was an error while running scraper <${log.body.module}> -`, e.message, e.stack);
                    throw e;
                }
                if (result) {
                    yield ayakashiInstance.yield(result);
                }
                //nothing was returned or yielded, yield the continue marker instead
                if (!result && !yieldWatcher.yieldedAtLeastOnce) {
                    yield ayakashiInstance.yield({ continue: true });
                }
            }
            finally {
                yield connection.release();
            }
        }
        catch (e) {
            d(e);
            throw e;
        }
    });
}
exports.default = scraperWrapper;
/**
 * Registers the externally configured actions, extractors and preloaders.
 * Actions and extractors are attached to the ayakashi instance with plain
 * calls, the preloaders call is awaited.
 *
 * @param connection - connection handed to the preloaders loader
 * @param ayakashiInstance - instance that receives the actions and extractors
 * @param log - operation message; reads `projectFolder`, `storeProjectFolder`
 *              and `load.{actions,extractors,preloaders}` from `log.body`
 * @returns {Promise<void>} settles once the preloaders loader has finished
 */
function loadExternals(connection, ayakashiInstance, log) {
    return __awaiter(this, void 0, void 0, function* () {
        const { projectFolder, storeProjectFolder, load } = log.body;
        loaders_1.loadExternalActions(ayakashiInstance, projectFolder, load.actions);
        loaders_1.loadExternalExtractors(ayakashiInstance, projectFolder, load.extractors);
        yield loaders_1.loadExternalPreloaders(
            connection,
            projectFolder,
            storeProjectFolder,
            load.preloaders
        );
    });
}
/**
 * Auto-loads the props, actions, extractors and preloaders found in the
 * project folder. Props, actions and extractors are attached to the ayakashi
 * instance with plain calls, the preloaders call is awaited.
 *
 * @param connection - connection handed to the preloaders loader
 * @param ayakashiInstance - instance that receives props, actions and extractors
 * @param log - operation message; reads `projectFolder` and
 *              `storeProjectFolder` from `log.body`
 * @returns {Promise<void>} settles once the preloaders loader has finished
 */
function loadLocals(connection, ayakashiInstance, log) {
    return __awaiter(this, void 0, void 0, function* () {
        const { projectFolder, storeProjectFolder } = log.body;
        loaders_1.loadLocalProps(ayakashiInstance, projectFolder);
        loaders_1.loadLocalActions(ayakashiInstance, projectFolder);
        loaders_1.loadLocalExtractors(ayakashiInstance, projectFolder);
        yield loaders_1.loadLocalPreloaders(connection, projectFolder, storeProjectFolder);
    });
}