// maxun-core
// Version:
// Core package for Maxun, responsible for data extraction
// 911 lines • 57.6 kB
// JavaScript
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// Module-interop helper for namespace imports: returns `mod` untouched when it is flagged as an
// ES module, otherwise builds a namespace object that re-exports every own, non-"default" key
// of `mod` and carries `mod` itself as `default`.
var __importStar = (this && this.__importStar) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var key in mod) {
            if (key === "default") {
                continue;
            }
            if (!Object.prototype.hasOwnProperty.call(mod, key)) {
                continue;
            }
            __createBinding(namespace, mod, key);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
// Async/await emulation helper: runs the generator obtained from
// `generator.apply(thisArg, _arguments || [])` and returns a promise (built with `P`, which
// defaults to `Promise`) that resolves with the generator's final value.
// An already-installed helper on `this` takes precedence.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wraps a yielded value in a `P` promise unless it already is an instance of `P`.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with a settled value; anything it throws rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw a rejection reason into the generator; anything it throws rejects the outer promise.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done, otherwise wait for the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and kick off the first step.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const adblocker_playwright_1 = require("@cliqz/adblocker-playwright");
const cross_fetch_1 = __importDefault(require("cross-fetch"));
const path_1 = __importDefault(require("path"));
const events_1 = require("events");
const logic_1 = require("./types/logic");
const utils_1 = require("./utils/utils");
const concurrency_1 = __importDefault(require("./utils/concurrency"));
const preprocessor_1 = __importDefault(require("./preprocessor"));
const logger_1 = __importStar(require("./utils/logger"));
/**
* Class for running the Smart Workflows.
*/
class Interpreter extends events_1.EventEmitter {
constructor(workflow, options) {
var _a;
super();
this.stopper = null;
this.blocker = null;
this.cumulativeResults = [];
this.workflow = workflow.workflow;
this.initializedWorkflow = null;
this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => {
(0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN);
}, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options);
this.concurrency = new concurrency_1.default(this.options.maxConcurrency);
this.log = (...args) => (0, logger_1.default)(...args);
const error = preprocessor_1.default.validateWorkflow(workflow);
if (error) {
throw (error);
}
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.debugMessage) {
const oldLog = this.log;
// @ts-ignore
this.log = (...args) => {
if (args[1] !== logger_1.Level.LOG) {
this.options.debugChannel.debugMessage(typeof args[0] === 'string' ? args[0] : args[0].message);
}
oldLog(...args);
};
}
adblocker_playwright_1.PlaywrightBlocker.fromLists(cross_fetch_1.default, ['https://easylist.to/easylist/easylist.txt']).then(blocker => {
this.blocker = blocker;
}).catch(err => {
this.log(`Failed to initialize ad-blocker:`, logger_1.Level.ERROR);
});
}
applyAdBlocker(page) {
return __awaiter(this, void 0, void 0, function* () {
if (this.blocker) {
try {
yield this.blocker.enableBlockingInPage(page);
}
catch (err) {
this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR);
}
}
});
}
disableAdBlocker(page) {
return __awaiter(this, void 0, void 0, function* () {
if (this.blocker) {
try {
yield this.blocker.disableBlockingInPage(page);
}
catch (err) {
this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR);
}
}
});
}
// private getSelectors(workflow: Workflow, actionId: number): string[] {
// const selectors: string[] = [];
// // Validate actionId
// if (actionId <= 0) {
// console.log("No previous selectors to collect.");
// return selectors; // Empty array as there are no previous steps
// }
// // Iterate from the start up to (but not including) actionId
// for (let index = 0; index < actionId; index++) {
// const currentSelectors = workflow[index]?.where?.selectors;
// console.log(`Selectors at step ${index}:`, currentSelectors);
// if (currentSelectors && currentSelectors.length > 0) {
// currentSelectors.forEach((selector) => {
// if (!selectors.includes(selector)) {
// selectors.push(selector); // Avoid duplicates
// }
// });
// }
// }
// console.log("Collected Selectors:", selectors);
// return selectors;
// }
getSelectors(workflow) {
var _a, _b;
const selectorsSet = new Set();
if (workflow.length === 0) {
return [];
}
for (let index = workflow.length - 1; index >= 0; index--) {
const currentSelectors = (_b = (_a = workflow[index]) === null || _a === void 0 ? void 0 : _a.where) === null || _b === void 0 ? void 0 : _b.selectors;
if (currentSelectors && currentSelectors.length > 0) {
currentSelectors.forEach((selector) => selectorsSet.add(selector));
return Array.from(selectorsSet);
}
}
return [];
}
/**
* Returns the context object from given Page and the current workflow.\
* \
* `workflow` is used for selector extraction - function searches for used selectors to
* look for later in the page's context.
* @param page Playwright Page object
* @param workflow Current **initialized** workflow (array of where-what pairs).
* @returns {PageState} State of the current page.
*/
getState(page, workflowCopy, selectors) {
return __awaiter(this, void 0, void 0, function* () {
/**
* All the selectors present in the current Workflow
*/
// const selectors = Preprocessor.extractSelectors(workflow);
// console.log("Current selectors:", selectors);
/**
* Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability).
* @param selector Selector to be queried
* @returns True if the targetted element is actionable, false otherwise.
*/
// const actionable = async (selector: string): Promise<boolean> => {
// try {
// const proms = [
// page.isEnabled(selector, { timeout: 10000 }),
// page.isVisible(selector, { timeout: 10000 }),
// ];
// return await Promise.all(proms).then((bools) => bools.every((x) => x));
// } catch (e) {
// // log(<Error>e, Level.ERROR);
// return false;
// }
// };
/**
* Object of selectors present in the current page.
*/
// const presentSelectors: SelectorArray = await Promise.all(
// selectors.map(async (selector) => {
// if (await actionable(selector)) {
// return [selector];
// }
// return [];
// }),
// ).then((x) => x.flat());
const presentSelectors = yield Promise.all(selectors.map((selector) => __awaiter(this, void 0, void 0, function* () {
try {
yield page.waitForSelector(selector, { state: 'attached' });
return [selector];
}
catch (e) {
return [];
}
}))).then((x) => x.flat());
const action = workflowCopy[workflowCopy.length - 1];
// console.log("Next action:", action)
let url = page.url();
if (action && action.where.url !== url && action.where.url !== "about:blank") {
url = action.where.url;
}
return {
url,
cookies: (yield page.context().cookies([page.url()]))
.reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}),
selectors: presentSelectors,
};
});
}
/**
* Tests if the given action is applicable with the given context.
* @param where Tested *where* condition
* @param context Current browser context.
* @returns True if `where` is applicable in the given context, false otherwise
*/
applicable(where, context, usedActions = []) {
/**
* Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\
* \
* For every key in `subset`, there must be a corresponding key with equal scalar
* value in `superset`, or `inclusive(subset[key], superset[key])` must hold.
* @param subset Arbitrary non-cyclic JS object (where clause)
* @param superset Arbitrary non-cyclic JS object (browser context)
* @returns `true` if `subset <= superset`, `false` otherwise.
*/
const inclusive = (subset, superset) => (Object.entries(subset).every(([key, value]) => {
/**
* Arrays are compared without order (are transformed into objects before comparison).
*/
const parsedValue = Array.isArray(value) ? (0, utils_1.arrayToObject)(value) : value;
const parsedSuperset = {};
parsedSuperset[key] = Array.isArray(superset[key])
? (0, utils_1.arrayToObject)(superset[key])
: superset[key];
if ((key === 'url' || key === 'selectors') &&
Array.isArray(value) && Array.isArray(superset[key]) &&
value.length === 0 && superset[key].length === 0) {
return true;
}
if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
return value.some(selector => superset[key].includes(selector));
}
// Every `subset` key must exist in the `superset` and
// have the same value (strict equality), or subset[key] <= superset[key]
return parsedSuperset[key]
&& ((parsedSuperset[key] === parsedValue)
|| ((parsedValue).constructor.name === 'RegExp' && parsedValue.test(parsedSuperset[key]))
|| ((parsedValue).constructor.name !== 'RegExp'
&& typeof parsedValue === 'object' && inclusive(parsedValue, parsedSuperset[key])));
}));
// Every value in the "where" object should be compliant to the current state.
return Object.entries(where).every(([key, value]) => {
if (logic_1.operators.includes(key)) {
const array = Array.isArray(value)
? value
: Object.entries(value).map((a) => Object.fromEntries([a]));
// every condition is treated as a single context
switch (key) {
case '$and':
return array === null || array === void 0 ? void 0 : array.every((x) => this.applicable(x, context));
case '$or':
return array === null || array === void 0 ? void 0 : array.some((x) => this.applicable(x, context));
case '$not':
return !this.applicable(value, context); // $not should be a unary operator
default:
throw new Error('Undefined logic operator.');
}
}
else if (logic_1.meta.includes(key)) {
const testRegexString = (x) => {
if (typeof value === 'string') {
return x === value;
}
return value.test(x);
};
switch (key) {
case '$before':
return !usedActions.find(testRegexString);
case '$after':
return !!usedActions.find(testRegexString);
default:
throw new Error('Undefined meta operator.');
}
}
else {
// Current key is a base condition (url, cookies, selectors)
return inclusive({ [key]: value }, context);
}
});
}
/**
* Given a Playwright's page object and a "declarative" list of actions, this function
* calls all mentioned functions on the Page object.\
* \
* Manipulates the iterator indexes (experimental feature, likely to be removed in
* the following versions of maxun-core)
* @param page Playwright Page object
* @param steps Array of actions.
*/
carryOutSteps(page, steps) {
var _a;
return __awaiter(this, void 0, void 0, function* () {
/**
* Defines overloaded (or added) methods/actions usable in the workflow.
* If a method overloads any existing method of the Page class, it accepts the same set
* of parameters *(but can override some!)*\
* \
* Also, following piece of code defines functions to be run in the browser's context.
* Beware of false linter errors - here, we know better!
*/
const wawActions = {
screenshot: (params) => __awaiter(this, void 0, void 0, function* () {
var _b;
if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.setActionType) {
this.options.debugChannel.setActionType('screenshot');
}
const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined }));
yield this.options.binaryCallback(screenshotBuffer, 'image/png');
}),
enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () {
var _c;
if ((_c = this.options.debugChannel) === null || _c === void 0 ? void 0 : _c.setActionType) {
this.options.debugChannel.setActionType('enqueueLinks');
}
const links = yield page.locator(selector)
.evaluateAll(
// @ts-ignore
(elements) => elements.map((a) => a.href).filter((x) => x));
const context = page.context();
for (const link of links) {
// eslint-disable-next-line
this.concurrency.addJob(() => __awaiter(this, void 0, void 0, function* () {
try {
const newPage = yield context.newPage();
yield newPage.goto(link);
yield newPage.waitForLoadState('networkidle');
yield this.runLoop(newPage, this.initializedWorkflow);
}
catch (e) {
// `runLoop` uses soft mode, so it recovers from it's own exceptions
// but newPage(), goto() and waitForLoadState() don't (and will kill
// the interpreter by throwing).
this.log(e, logger_1.Level.ERROR);
}
}));
}
yield page.close();
}),
scrape: (selector) => __awaiter(this, void 0, void 0, function* () {
var _d;
if ((_d = this.options.debugChannel) === null || _d === void 0 ? void 0 : _d.setActionType) {
this.options.debugChannel.setActionType('scrape');
}
yield this.ensureScriptsLoaded(page);
const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector);
yield this.options.serializableCallback(scrapeResults);
}),
scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () {
var _e;
if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) {
this.options.debugChannel.setActionType('scrapeSchema');
}
if (this.options.mode && this.options.mode === 'editor') {
yield this.options.serializableCallback({});
return;
}
yield this.ensureScriptsLoaded(page);
const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
this.cumulativeResults = [];
}
if (this.cumulativeResults.length === 0) {
this.cumulativeResults.push({});
}
const mergedResult = this.cumulativeResults[0];
const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;
Object.entries(resultToProcess).forEach(([key, value]) => {
if (value !== undefined) {
mergedResult[key] = value;
}
});
console.log("Updated merged result:", mergedResult);
yield this.options.serializableCallback([mergedResult]);
}),
scrapeList: (config) => __awaiter(this, void 0, void 0, function* () {
var _f;
if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) {
this.options.debugChannel.setActionType('scrapeList');
}
if (this.options.mode && this.options.mode === 'editor') {
yield this.options.serializableCallback({});
return;
}
yield this.ensureScriptsLoaded(page);
if (!config.pagination) {
const scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
yield this.options.serializableCallback(scrapeResults);
}
else {
const scrapeResults = yield this.handlePagination(page, config);
yield this.options.serializableCallback(scrapeResults);
}
}),
scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () {
var _g;
if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.setActionType) {
this.options.debugChannel.setActionType('scrapeListAuto');
}
yield this.ensureScriptsLoaded(page);
const scrapeResults = yield page.evaluate((listSelector) => {
return window.scrapeListAuto(listSelector);
}, config.listSelector);
yield this.options.serializableCallback(scrapeResults);
}),
scroll: (pages) => __awaiter(this, void 0, void 0, function* () {
var _h;
if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) {
this.options.debugChannel.setActionType('scroll');
}
yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () {
for (let i = 1; i <= (pagesInternal !== null && pagesInternal !== void 0 ? pagesInternal : 1); i += 1) {
// @ts-ignore
window.scrollTo(0, window.scrollY + window.innerHeight);
}
}), pages !== null && pages !== void 0 ? pages : 1);
}),
script: (code) => __awaiter(this, void 0, void 0, function* () {
var _j;
if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) {
this.options.debugChannel.setActionType('script');
}
const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor;
const x = new AsyncFunction('page', 'log', code);
yield x(page, this.log);
}),
flag: () => __awaiter(this, void 0, void 0, function* () {
return new Promise((res) => {
var _a;
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
this.options.debugChannel.setActionType('flag');
}
this.emit('flag', page, res);
});
}),
};
const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () {
console.log("Executing action:", methodName, args);
if (methodName === 'press' || methodName === 'type') {
// Extract only the first two arguments for these methods
const limitedArgs = Array.isArray(args) ? args.slice(0, 2) : [args];
yield invokee[methodName](...limitedArgs);
return;
}
if (!args || Array.isArray(args)) {
yield invokee[methodName](...(args !== null && args !== void 0 ? args : []));
}
else {
yield invokee[methodName](args);
}
});
for (const step of steps) {
this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG);
if (step.action in wawActions) {
// "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
yield wawActions[step.action](...(params !== null && params !== void 0 ? params : []));
}
else {
if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) {
this.options.debugChannel.setActionType(String(step.action));
}
// Implements the dot notation for the "method name" in the workflow
const levels = String(step.action).split('.');
const methodName = levels[levels.length - 1];
let invokee = page;
for (const level of levels.splice(0, levels.length - 1)) {
invokee = invokee[level];
}
if (methodName === 'waitForLoadState') {
try {
yield executeAction(invokee, methodName, step.args);
}
catch (error) {
yield executeAction(invokee, methodName, 'domcontentloaded');
}
}
else if (methodName === 'click') {
try {
yield executeAction(invokee, methodName, step.args);
}
catch (error) {
try {
yield executeAction(invokee, methodName, [step.args[0], { force: true }]);
}
catch (error) {
continue;
}
}
}
else {
yield executeAction(invokee, methodName, step.args);
}
}
yield new Promise((res) => { setTimeout(res, 500); });
}
});
}
/**
 * Scrapes a paginated list: repeatedly runs `window.scrapeList(config)` in the page and
 * advances according to `config.pagination.type` ('scrollDown', 'scrollUp', 'clickNext' or
 * 'clickLoadMore'; any other value scrapes the current page once).
 *
 * Items are de-duplicated by their JSON serialization. When `config.limit` is set, the
 * result is truncated to that many items. Errors thrown inside the main loop are logged and
 * the items collected so far are returned.
 *
 * @param page Playwright Page object.
 * @param config List configuration; this method reads `pagination.type`,
 *   `pagination.selector` (comma-separated list of selectors), `limit` and `listSelector`.
 * @returns Array of the collected items.
 */
handlePagination(page, config) {
return __awaiter(this, void 0, void 0, function* () {
let allResults = [];
let previousHeight = 0;
// JSON serializations of items already collected (used for de-duplication).
let scrapedItems = new Set();
let visitedUrls = new Set();
const MAX_RETRIES = 3;
const RETRY_DELAY = 1000; // 1 second delay between retries
const MAX_UNCHANGED_RESULTS = 5;
const debugLog = (message, ...args) => {
console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
};
// Scrapes the current DOM and appends only the items not seen before.
const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () {
const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config);
const newResults = results.filter(item => {
const uniqueKey = JSON.stringify(item);
if (scrapedItems.has(uniqueKey))
return false;
scrapedItems.add(uniqueKey);
return true;
});
allResults = allResults.concat(newResults);
debugLog("Results collected:", allResults.length);
});
// Truncates the results to `config.limit` and reports whether the limit has been reached.
const checkLimit = () => {
if (config.limit && allResults.length >= config.limit) {
allResults = allResults.slice(0, config.limit);
return true;
}
return false;
};
// Enhanced button finder with retry mechanism
// Tries the selectors in order, each up to MAX_RETRIES times; a selector that fails every
// attempt is dropped from the returned `updatedSelectors`.
const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () {
let updatedSelectors = [...selectors];
for (let i = 0; i < selectors.length; i++) {
const selector = selectors[i];
let retryCount = 0;
let selectorSuccess = false;
while (retryCount < MAX_RETRIES && !selectorSuccess) {
try {
const button = yield page.waitForSelector(selector, {
state: 'attached',
timeout: 10000
});
if (button) {
debugLog('Found working selector:', selector);
return {
button,
workingSelector: selector,
updatedSelectors
};
}
}
catch (error) {
retryCount++;
debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`);
if (retryCount < MAX_RETRIES) {
yield page.waitForTimeout(RETRY_DELAY);
}
else {
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
updatedSelectors = updatedSelectors.filter(s => s !== selector);
}
}
}
}
return {
button: null,
workingSelector: null,
updatedSelectors
};
});
// Runs `operation`, retrying (after RETRY_DELAY) only when it throws; resolves to `false`
// once MAX_RETRIES retries have failed. A falsy return value is not retried.
const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () {
try {
return yield operation();
}
catch (error) {
if (retryCount < MAX_RETRIES) {
debugLog(`Retrying operation. Attempt ${retryCount + 1} of ${MAX_RETRIES}`);
yield page.waitForTimeout(RETRY_DELAY);
return retryOperation(operation, retryCount + 1);
}
debugLog(`Operation failed after ${MAX_RETRIES} retries`);
return false;
}
});
// Candidate pagination selectors; note this line runs outside the try block below.
let availableSelectors = config.pagination.selector.split(',');
let unchangedResultCounter = 0;
try {
while (true) {
switch (config.pagination.type) {
case 'scrollDown': {
let previousResultCount = allResults.length;
yield scrapeCurrentPage();
if (checkLimit()) {
return allResults;
}
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
yield page.waitForTimeout(2000);
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
const currentResultCount = allResults.length;
// Stop after MAX_UNCHANGED_RESULTS consecutive passes that added no new items.
if (currentResultCount === previousResultCount) {
unchangedResultCounter++;
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
return allResults;
}
}
else {
unchangedResultCounter = 0;
}
// Stop when scrolling no longer changes the document height.
if (currentHeight === previousHeight) {
return allResults;
}
previousHeight = currentHeight;
break;
}
case 'scrollUp': {
let previousResultCount = allResults.length;
yield scrapeCurrentPage();
if (checkLimit()) {
return allResults;
}
yield page.evaluate(() => window.scrollTo(0, 0));
yield page.waitForTimeout(2000);
const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop);
const currentResultCount = allResults.length;
if (currentResultCount === previousResultCount) {
unchangedResultCounter++;
if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
return allResults;
}
}
else {
unchangedResultCounter = 0;
}
// NOTE(review): the page was just scrolled to (0, 0), so scrollTop is presumably 0 here
// and this would return after the first pass - confirm this is intended.
if (currentTopHeight === 0) {
return allResults;
}
previousHeight = currentTopHeight;
break;
}
case 'clickNext': {
const currentUrl = page.url();
visitedUrls.add(currentUrl);
yield scrapeCurrentPage();
if (checkLimit())
return allResults;
const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
availableSelectors = updatedSelectors;
if (!button || !workingSelector) {
// Final retry for navigation when no selectors work
// NOTE(review): the operation below catches its own errors and returns false, so
// `retryOperation` (which retries only on a throw) appears to run it once - confirm.
const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () {
try {
yield page.evaluate(() => window.history.forward());
const newUrl = page.url();
return !visitedUrls.has(newUrl);
}
catch (_a) {
return false;
}
}));
if (!success)
return allResults;
break;
}
let retryCount = 0;
let paginationSuccess = false;
// Capture basic content signature before click
const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () {
return yield page.evaluate((selector) => {
const items = document.querySelectorAll(selector);
return {
url: window.location.href,
itemCount: items.length,
firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
};
}, config.listSelector);
});
const beforeSignature = yield captureContentSignature();
debugLog(`Before click: ${beforeSignature.itemCount} items`);
while (retryCount < MAX_RETRIES && !paginationSuccess) {
try {
// Attempt 1: regular click together with a navigation wait.
try {
yield Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}).catch(e => {
throw e;
}),
button.click()
]);
debugLog("Navigation successful after regular click");
yield page.waitForTimeout(2000);
paginationSuccess = true;
}
catch (navError) {
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
// Attempt 2: dispatched click event together with a navigation wait.
try {
yield Promise.all([
page.waitForNavigation({
waitUntil: 'networkidle',
timeout: 15000
}).catch(e => {
throw e;
}),
button.dispatchEvent('click')
]);
debugLog("Navigation successful after dispatch event");
yield page.waitForTimeout(2000);
paginationSuccess = true;
}
catch (dispatchNavError) {
// Attempt 3: click (or dispatch) without waiting for a navigation; success is then
// decided by the URL/content comparison below.
try {
yield button.click();
yield page.waitForTimeout(2000);
}
catch (clickError) {
yield button.dispatchEvent('click');
yield page.waitForTimeout(2000);
}
}
}
yield page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => { });
if (!paginationSuccess) {
// No navigation was observed: accept the click if the URL, the first items' text or
// the item count changed.
const newUrl = page.url();
const afterSignature = yield captureContentSignature();
if (newUrl !== currentUrl) {
debugLog(`URL changed to ${newUrl}`);
visitedUrls.add(newUrl);
paginationSuccess = true;
}
else if (afterSignature.firstItems !== beforeSignature.firstItems) {
debugLog("Content changed without URL change");
paginationSuccess = true;
}
else if (afterSignature.itemCount !== beforeSignature.itemCount) {
debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
paginationSuccess = true;
}
}
}
catch (error) {
debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
}
if (!paginationSuccess) {
retryCount++;
if (retryCount < MAX_RETRIES) {
debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
yield page.waitForTimeout(RETRY_DELAY);
}
}
}
if (!paginationSuccess) {
debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
return allResults;
}
break;
}
case 'clickLoadMore': {
yield scrapeCurrentPage();
if (checkLimit())
return allResults;
let loadMoreCounter = 0;
let previousResultCount = allResults.length;
let noNewItemsCounter = 0;
const MAX_NO_NEW_ITEMS = 2;
// This inner loop only ends by returning from the method.
while (true) {
// Find working button with retry mechanism
const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors);
availableSelectors = updatedSelectors;
if (!workingSelector || !loadMoreButton) {
debugLog('No working Load More selector found after retries');
return allResults;
}
// Implement retry mechanism for clicking the button
let retryCount = 0;
let clickSuccess = false;
while (retryCount < MAX_RETRIES && !clickSuccess) {
try {
try {
yield loadMoreButton.click();
clickSuccess = true;
}
catch (error) {
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
// If regular click fails, try dispatchEvent
try {
yield loadMoreButton.dispatchEvent('click');
clickSuccess = true;
}
catch (dispatchError) {
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
throw dispatchError; // Propagate error to trigger retry
}
}
if (clickSuccess) {
yield page.waitForTimeout(1000);
loadMoreCounter++;
debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
}
}
catch (error) {
debugLog(`Click attempt ${retryCount + 1} failed completely.`);
retryCount++;
if (retryCount < MAX_RETRIES) {
debugLog(`Retrying click - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
yield page.waitForTimeout(RETRY_DELAY);
}
}
}
if (!clickSuccess) {
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
return allResults;
}
// Wait for content to load and check scroll height
yield page.waitForTimeout(2000);
yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
yield page.waitForTimeout(2000);
const currentHeight = yield page.evaluate(() => document.body.scrollHeight);
const heightChanged = currentHeight !== previousHeight;
previousHeight = currentHeight;
yield scrapeCurrentPage();
const currentResultCount = allResults.length;
const newItemsAdded = currentResultCount > previousResultCount;
if (!newItemsAdded) {
noNewItemsCounter++;
debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
return allResults;
}
}
else {
noNewItemsCounter = 0;
previousResultCount = currentResultCount;
}
if (checkLimit())
return allResults;
if (!heightChanged) {
debugLog('No more items loaded after Load More');
return allResults;
}
}
}
// Unknown pagination type: scrape the current page once.
default: {
yield scrapeCurrentPage();
return allResults;
}
}
if (checkLimit())
break;
}
}
catch (error) {
// Any error inside the loop ends pagination; what was collected so far is returned.
debugLog(`Fatal error: ${error.message}`);
return allResults;
}
return allResults;
});
}
getMatchingActionId(workflow, pageState, usedActions) {
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
const step = workflow[actionId];
const isApplicable = this.applicable(step.where, pageState, usedActions);
console.log("-------------------------------------------------------------");
console.log(`Where:`, step.where);
console.log(`Page state:`, pageState);
console.log(`Match result: ${isApplicable}`);
console.log("-------------------------------------------------------------");
if (isApplicable) {
return actionId;
}
}
}
removeShadowSelectors(workflow) {
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
const step = workflow[actionId];
// Check if step has where and selectors
if (step.where && Array.isArray(step.where.selectors)) {
// Filter out selectors that contain ">>"
step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>'));
}
}
return workflow;
}
removeSpecialSelectors(workflow) {
for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
const step = workflow[actionId];
if (step.where && Array.isArray(step.where.selectors)) {
// Filter out if selector has EITHER ":>>" OR ">>"
step.where.selectors = step.where.selectors.filter(selector => !(selector.includes(':>>') || selector.includes('>>')));
}
}
return workflow;
}
runLoop(p, workflow) {
var _a, _b;
return __awaiter(this, void 0, void 0, function* () {
let workflowCopy = JSON.parse(JSON.stringify(workflow));
workflowCopy = this.removeSpecialSelectors(workflowCopy);
// apply ad-blocker to the curren