ayakashi
Version:
The next generation web scraping framework
167 lines (166 loc) • 8.32 kB
JavaScript
;
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const core_1 = __importDefault(require("@ayakashi/request/core"));
const request_1 = require("../prelude/actions/request");
const path_1 = require("path");
const jsdom_1 = require("jsdom");
const loaders_1 = require("./loaders");
const pipeproc_1 = require("pipeproc");
const renderlessPrelude_1 = require("../prelude/renderlessPrelude");
const yield_1 = require("../prelude/actions/yield");
const cookies_1 = require("../prelude/actions/cookies");
const opLog_1 = require("../opLog/opLog");
const client_1 = require("../bridge/client");
const cookies_2 = require("./cookies");
const debug_1 = __importDefault(require("debug"));
const d = debug_1.default("ayakashi:renderlessScraperWrapper");
function renderlessScraperWrapper(log) {
return __awaiter(this, void 0, void 0, function* () {
try {
const opLog = opLog_1.getOpLog();
opLog.info("running renderlessScraper", log.body.module);
const bridgeClient = client_1.getBridgeClient(log.body.connectionConfig.bridgePort);
const ayakashiInstance = yield renderlessPrelude_1.renderlessPrelude();
//user-agent setup
const userAgentData = yield bridgeClient.getUserAgentData({
agent: (log.body.config.emulatorOptions && log.body.config.emulatorOptions.userAgent) || undefined,
platform: (log.body.config.emulatorOptions && log.body.config.emulatorOptions.platform) || undefined,
persistentSession: log.body.persistentSession
});
if (!userAgentData) {
throw new Error("could not generate userAgent");
}
const acceptLanguage = (log.body.config.emulatorOptions && log.body.config.emulatorOptions.acceptLanguage) || "en-US";
//get cookie jar
const { jar } = yield cookies_2.getCookieJar(log.body.connectionConfig.bridgePort, {
persistentSession: log.body.persistentSession
});
//attach the request API
const myRequest = core_1.default.defaults({
headers: {
"User-Agent": userAgentData.userAgent,
//tslint:disable max-line-length
Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
//tslint:enable max-line-length
"accept-language": acceptLanguage,
"cache-control": "no-cache",
pragma: "no-cache"
},
proxy: log.body.proxyUrl || undefined,
strictSSL: !log.body.ignoreCertificateErrors,
gzipOrBrotli: true,
timeout: 10000,
jar: jar
});
function cookieSync() {
return __awaiter(this, void 0, void 0, function* () {
//sync request cookies with the persistent store
yield cookies_2.updateCookieJar(log.body.connectionConfig.bridgePort, jar, {
persistentSession: log.body.persistentSession
});
});
}
request_1.attachRequest(ayakashiInstance, myRequest, cookieSync);
cookies_1.attachCookieActions(ayakashiInstance, jar, null, cookieSync);
//define the load methods
ayakashiInstance.load = function (url, timeout) {
return __awaiter(this, void 0, void 0, function* () {
d("loading url: ", url);
const html = yield ayakashiInstance.get(url, {
timeout: timeout || 10000
});
d("url loaded");
d("building DOM");
if (html) {
yield this.__attachDOM(new jsdom_1.JSDOM(html));
loaders_1.loadLocalProps(ayakashiInstance, log.body.projectFolder);
}
else {
yield ayakashiInstance.__connection.release();
throw new Error("Invalid page");
}
d("DOM built");
});
};
ayakashiInstance.loadHtml = function (html) {
return __awaiter(this, void 0, void 0, function* () {
d("building DOM");
if (html) {
yield this.__attachDOM(new jsdom_1.JSDOM(html));
loaders_1.loadLocalProps(ayakashiInstance, log.body.projectFolder);
}
else {
yield ayakashiInstance.__connection.release();
throw new Error("Invalid page");
}
d("DOM built");
});
};
//connect to pipeproc
const pipeprocClient = pipeproc_1.PipeProc();
yield pipeprocClient.connect({ socket: `ipc://${path_1.resolve(log.body.storeProjectFolder, "ipc")}` });
//attach the yield methods
const yieldWatcher = { yieldedAtLeastOnce: false };
yield_1.attachYields(ayakashiInstance, pipeprocClient, log.body.saveTopic, log.body.selfTopic, yieldWatcher);
loaders_1.loadExternalExtractors(ayakashiInstance, log.body.projectFolder, log.body.load.extractors);
loaders_1.loadLocalExtractors(ayakashiInstance, log.body.projectFolder);
let scraperModule;
try {
if (log.body.config.simple) {
scraperModule = require(path_1.resolve(log.body.projectFolder, log.body.module));
}
else {
scraperModule = require(path_1.resolve(log.body.projectFolder, "scrapers", log.body.module));
}
if (typeof scraperModule !== "function") {
scraperModule = scraperModule.default;
}
if (typeof scraperModule !== "function") {
throw new Error(`Scraper <${log.body.module}> is not a function`);
}
}
catch (e) {
opLog.error(e.message);
yield ayakashiInstance.__connection.release();
throw e;
}
//run the scraper
let result;
try {
//@ts-ignore
if (log.body.input && log.body.input.value && log.body.input.value.continue === true)
delete log.body.input.value.continue;
result = yield scraperModule(ayakashiInstance, log.body.input.value || {}, log.body.params || {});
}
catch (e) {
opLog.error(`There was an error while running scraper <${log.body.module}> -`, e.message, e.stack);
yield ayakashiInstance.__connection.release();
throw e;
}
if (result) {
yield ayakashiInstance.yield(result);
}
if (!result && !yieldWatcher.yieldedAtLeastOnce) {
yield ayakashiInstance.yield({ continue: true });
}
yield ayakashiInstance.__connection.release();
}
catch (e) {
d(e);
throw e;
}
});
}
exports.default = renderlessScraperWrapper;