UNPKG

maxun-core

Version:

Core package for Maxun, responsible for data extraction

911 lines 57.6 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const adblocker_playwright_1 = require("@cliqz/adblocker-playwright"); const cross_fetch_1 = __importDefault(require("cross-fetch")); const path_1 = __importDefault(require("path")); const events_1 = require("events"); const logic_1 = require("./types/logic"); const utils_1 = require("./utils/utils"); const concurrency_1 = __importDefault(require("./utils/concurrency")); const preprocessor_1 = __importDefault(require("./preprocessor")); const logger_1 = __importStar(require("./utils/logger")); /** * Class for running the Smart Workflows. */ class Interpreter extends events_1.EventEmitter { constructor(workflow, options) { var _a; super(); this.stopper = null; this.blocker = null; this.cumulativeResults = []; this.workflow = workflow.workflow; this.initializedWorkflow = null; this.options = Object.assign({ maxRepeats: 5, maxConcurrency: 5, serializableCallback: (data) => { (0, logger_1.default)(JSON.stringify(data), logger_1.Level.WARN); }, binaryCallback: () => { (0, logger_1.default)('Received binary data, thrashing them.', logger_1.Level.WARN); }, debug: false, debugChannel: {} }, options); this.concurrency = new concurrency_1.default(this.options.maxConcurrency); this.log = (...args) => (0, logger_1.default)(...args); const error = preprocessor_1.default.validateWorkflow(workflow); if (error) { throw (error); } if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.debugMessage) { const oldLog = this.log; // @ts-ignore this.log = (...args) => { if (args[1] !== logger_1.Level.LOG) { this.options.debugChannel.debugMessage(typeof args[0] === 'string' ? args[0] : args[0].message); } oldLog(...args); }; } adblocker_playwright_1.PlaywrightBlocker.fromLists(cross_fetch_1.default, ['https://easylist.to/easylist/easylist.txt']).then(blocker => { this.blocker = blocker; }).catch(err => { this.log(`Failed to initialize ad-blocker:`, logger_1.Level.ERROR); }); } applyAdBlocker(page) { return __awaiter(this, void 0, void 0, function* () { if (this.blocker) { try { yield this.blocker.enableBlockingInPage(page); } catch (err) { this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR); } } }); } disableAdBlocker(page) { return __awaiter(this, void 0, void 0, function* () { if (this.blocker) { try { yield this.blocker.disableBlockingInPage(page); } catch (err) { this.log(`Ad-blocker operation failed:`, logger_1.Level.ERROR); } } }); } // private getSelectors(workflow: Workflow, actionId: number): string[] { // const selectors: string[] = []; // // Validate actionId // if (actionId <= 0) { // console.log("No previous selectors to collect."); // return selectors; // Empty array as there are no previous steps // } // // Iterate from the start up to (but not including) actionId // for (let index = 0; index < actionId; index++) { // const currentSelectors = workflow[index]?.where?.selectors; // console.log(`Selectors at step ${index}:`, currentSelectors); // if (currentSelectors && currentSelectors.length > 0) { // currentSelectors.forEach((selector) => { // if (!selectors.includes(selector)) { // selectors.push(selector); // Avoid duplicates // } // }); // } // } // console.log("Collected Selectors:", selectors); // return selectors; // } getSelectors(workflow) { var _a, _b; const selectorsSet = new Set(); if (workflow.length === 0) { return []; } for (let index = workflow.length - 1; index >= 0; index--) { const currentSelectors = (_b = (_a = workflow[index]) === null || _a === void 0 ? void 0 : _a.where) === null || _b === void 0 ? void 0 : _b.selectors; if (currentSelectors && currentSelectors.length > 0) { currentSelectors.forEach((selector) => selectorsSet.add(selector)); return Array.from(selectorsSet); } } return []; } /** * Returns the context object from given Page and the current workflow.\ * \ * `workflow` is used for selector extraction - function searches for used selectors to * look for later in the page's context. * @param page Playwright Page object * @param workflow Current **initialized** workflow (array of where-what pairs). * @returns {PageState} State of the current page. */ getState(page, workflowCopy, selectors) { return __awaiter(this, void 0, void 0, function* () { /** * All the selectors present in the current Workflow */ // const selectors = Preprocessor.extractSelectors(workflow); // console.log("Current selectors:", selectors); /** * Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability). * @param selector Selector to be queried * @returns True if the targetted element is actionable, false otherwise. */ // const actionable = async (selector: string): Promise<boolean> => { // try { // const proms = [ // page.isEnabled(selector, { timeout: 10000 }), // page.isVisible(selector, { timeout: 10000 }), // ]; // return await Promise.all(proms).then((bools) => bools.every((x) => x)); // } catch (e) { // // log(<Error>e, Level.ERROR); // return false; // } // }; /** * Object of selectors present in the current page. */ // const presentSelectors: SelectorArray = await Promise.all( // selectors.map(async (selector) => { // if (await actionable(selector)) { // return [selector]; // } // return []; // }), // ).then((x) => x.flat()); const presentSelectors = yield Promise.all(selectors.map((selector) => __awaiter(this, void 0, void 0, function* () { try { yield page.waitForSelector(selector, { state: 'attached' }); return [selector]; } catch (e) { return []; } }))).then((x) => x.flat()); const action = workflowCopy[workflowCopy.length - 1]; // console.log("Next action:", action) let url = page.url(); if (action && action.where.url !== url && action.where.url !== "about:blank") { url = action.where.url; } return { url, cookies: (yield page.context().cookies([page.url()])) .reduce((p, cookie) => (Object.assign(Object.assign({}, p), { [cookie.name]: cookie.value })), {}), selectors: presentSelectors, }; }); } /** * Tests if the given action is applicable with the given context. * @param where Tested *where* condition * @param context Current browser context. * @returns True if `where` is applicable in the given context, false otherwise */ applicable(where, context, usedActions = []) { /** * Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\ * \ * For every key in `subset`, there must be a corresponding key with equal scalar * value in `superset`, or `inclusive(subset[key], superset[key])` must hold. * @param subset Arbitrary non-cyclic JS object (where clause) * @param superset Arbitrary non-cyclic JS object (browser context) * @returns `true` if `subset <= superset`, `false` otherwise. */ const inclusive = (subset, superset) => (Object.entries(subset).every(([key, value]) => { /** * Arrays are compared without order (are transformed into objects before comparison). */ const parsedValue = Array.isArray(value) ? (0, utils_1.arrayToObject)(value) : value; const parsedSuperset = {}; parsedSuperset[key] = Array.isArray(superset[key]) ? (0, utils_1.arrayToObject)(superset[key]) : superset[key]; if ((key === 'url' || key === 'selectors') && Array.isArray(value) && Array.isArray(superset[key]) && value.length === 0 && superset[key].length === 0) { return true; } if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) { return value.some(selector => superset[key].includes(selector)); } // Every `subset` key must exist in the `superset` and // have the same value (strict equality), or subset[key] <= superset[key] return parsedSuperset[key] && ((parsedSuperset[key] === parsedValue) || ((parsedValue).constructor.name === 'RegExp' && parsedValue.test(parsedSuperset[key])) || ((parsedValue).constructor.name !== 'RegExp' && typeof parsedValue === 'object' && inclusive(parsedValue, parsedSuperset[key]))); })); // Every value in the "where" object should be compliant to the current state. return Object.entries(where).every(([key, value]) => { if (logic_1.operators.includes(key)) { const array = Array.isArray(value) ? value : Object.entries(value).map((a) => Object.fromEntries([a])); // every condition is treated as a single context switch (key) { case '$and': return array === null || array === void 0 ? void 0 : array.every((x) => this.applicable(x, context)); case '$or': return array === null || array === void 0 ? void 0 : array.some((x) => this.applicable(x, context)); case '$not': return !this.applicable(value, context); // $not should be a unary operator default: throw new Error('Undefined logic operator.'); } } else if (logic_1.meta.includes(key)) { const testRegexString = (x) => { if (typeof value === 'string') { return x === value; } return value.test(x); }; switch (key) { case '$before': return !usedActions.find(testRegexString); case '$after': return !!usedActions.find(testRegexString); default: throw new Error('Undefined meta operator.'); } } else { // Current key is a base condition (url, cookies, selectors) return inclusive({ [key]: value }, context); } }); } /** * Given a Playwright's page object and a "declarative" list of actions, this function * calls all mentioned functions on the Page object.\ * \ * Manipulates the iterator indexes (experimental feature, likely to be removed in * the following versions of maxun-core) * @param page Playwright Page object * @param steps Array of actions. */ carryOutSteps(page, steps) { var _a; return __awaiter(this, void 0, void 0, function* () { /** * Defines overloaded (or added) methods/actions usable in the workflow. * If a method overloads any existing method of the Page class, it accepts the same set * of parameters *(but can override some!)*\ * \ * Also, following piece of code defines functions to be run in the browser's context. * Beware of false linter errors - here, we know better! */ const wawActions = { screenshot: (params) => __awaiter(this, void 0, void 0, function* () { var _b; if ((_b = this.options.debugChannel) === null || _b === void 0 ? void 0 : _b.setActionType) { this.options.debugChannel.setActionType('screenshot'); } const screenshotBuffer = yield page.screenshot(Object.assign(Object.assign({}, params), { path: undefined })); yield this.options.binaryCallback(screenshotBuffer, 'image/png'); }), enqueueLinks: (selector) => __awaiter(this, void 0, void 0, function* () { var _c; if ((_c = this.options.debugChannel) === null || _c === void 0 ? void 0 : _c.setActionType) { this.options.debugChannel.setActionType('enqueueLinks'); } const links = yield page.locator(selector) .evaluateAll( // @ts-ignore (elements) => elements.map((a) => a.href).filter((x) => x)); const context = page.context(); for (const link of links) { // eslint-disable-next-line this.concurrency.addJob(() => __awaiter(this, void 0, void 0, function* () { try { const newPage = yield context.newPage(); yield newPage.goto(link); yield newPage.waitForLoadState('networkidle'); yield this.runLoop(newPage, this.initializedWorkflow); } catch (e) { // `runLoop` uses soft mode, so it recovers from it's own exceptions // but newPage(), goto() and waitForLoadState() don't (and will kill // the interpreter by throwing). this.log(e, logger_1.Level.ERROR); } })); } yield page.close(); }), scrape: (selector) => __awaiter(this, void 0, void 0, function* () { var _d; if ((_d = this.options.debugChannel) === null || _d === void 0 ? void 0 : _d.setActionType) { this.options.debugChannel.setActionType('scrape'); } yield this.ensureScriptsLoaded(page); const scrapeResults = yield page.evaluate((s) => window.scrape(s !== null && s !== void 0 ? s : null), selector); yield this.options.serializableCallback(scrapeResults); }), scrapeSchema: (schema) => __awaiter(this, void 0, void 0, function* () { var _e; if ((_e = this.options.debugChannel) === null || _e === void 0 ? void 0 : _e.setActionType) { this.options.debugChannel.setActionType('scrapeSchema'); } if (this.options.mode && this.options.mode === 'editor') { yield this.options.serializableCallback({}); return; } yield this.ensureScriptsLoaded(page); const scrapeResult = yield page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema); if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) { this.cumulativeResults = []; } if (this.cumulativeResults.length === 0) { this.cumulativeResults.push({}); } const mergedResult = this.cumulativeResults[0]; const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult; Object.entries(resultToProcess).forEach(([key, value]) => { if (value !== undefined) { mergedResult[key] = value; } }); console.log("Updated merged result:", mergedResult); yield this.options.serializableCallback([mergedResult]); }), scrapeList: (config) => __awaiter(this, void 0, void 0, function* () { var _f; if ((_f = this.options.debugChannel) === null || _f === void 0 ? void 0 : _f.setActionType) { this.options.debugChannel.setActionType('scrapeList'); } if (this.options.mode && this.options.mode === 'editor') { yield this.options.serializableCallback({}); return; } yield this.ensureScriptsLoaded(page); if (!config.pagination) { const scrapeResults = yield page.evaluate((cfg) => window.scrapeList(cfg), config); yield this.options.serializableCallback(scrapeResults); } else { const scrapeResults = yield this.handlePagination(page, config); yield this.options.serializableCallback(scrapeResults); } }), scrapeListAuto: (config) => __awaiter(this, void 0, void 0, function* () { var _g; if ((_g = this.options.debugChannel) === null || _g === void 0 ? void 0 : _g.setActionType) { this.options.debugChannel.setActionType('scrapeListAuto'); } yield this.ensureScriptsLoaded(page); const scrapeResults = yield page.evaluate((listSelector) => { return window.scrapeListAuto(listSelector); }, config.listSelector); yield this.options.serializableCallback(scrapeResults); }), scroll: (pages) => __awaiter(this, void 0, void 0, function* () { var _h; if ((_h = this.options.debugChannel) === null || _h === void 0 ? void 0 : _h.setActionType) { this.options.debugChannel.setActionType('scroll'); } yield page.evaluate((pagesInternal) => __awaiter(this, void 0, void 0, function* () { for (let i = 1; i <= (pagesInternal !== null && pagesInternal !== void 0 ? pagesInternal : 1); i += 1) { // @ts-ignore window.scrollTo(0, window.scrollY + window.innerHeight); } }), pages !== null && pages !== void 0 ? pages : 1); }), script: (code) => __awaiter(this, void 0, void 0, function* () { var _j; if ((_j = this.options.debugChannel) === null || _j === void 0 ? void 0 : _j.setActionType) { this.options.debugChannel.setActionType('script'); } const AsyncFunction = Object.getPrototypeOf(() => __awaiter(this, void 0, void 0, function* () { })).constructor; const x = new AsyncFunction('page', 'log', code); yield x(page, this.log); }), flag: () => __awaiter(this, void 0, void 0, function* () { return new Promise((res) => { var _a; if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) { this.options.debugChannel.setActionType('flag'); } this.emit('flag', page, res); }); }), }; const executeAction = (invokee, methodName, args) => __awaiter(this, void 0, void 0, function* () { console.log("Executing action:", methodName, args); if (methodName === 'press' || methodName === 'type') { // Extract only the first two arguments for these methods const limitedArgs = Array.isArray(args) ? args.slice(0, 2) : [args]; yield invokee[methodName](...limitedArgs); return; } if (!args || Array.isArray(args)) { yield invokee[methodName](...(args !== null && args !== void 0 ? args : [])); } else { yield invokee[methodName](args); } }); for (const step of steps) { this.log(`Launching ${String(step.action)}`, logger_1.Level.LOG); if (step.action in wawActions) { // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not) const params = !step.args || Array.isArray(step.args) ? step.args : [step.args]; yield wawActions[step.action](...(params !== null && params !== void 0 ? params : [])); } else { if ((_a = this.options.debugChannel) === null || _a === void 0 ? void 0 : _a.setActionType) { this.options.debugChannel.setActionType(String(step.action)); } // Implements the dot notation for the "method name" in the workflow const levels = String(step.action).split('.'); const methodName = levels[levels.length - 1]; let invokee = page; for (const level of levels.splice(0, levels.length - 1)) { invokee = invokee[level]; } if (methodName === 'waitForLoadState') { try { yield executeAction(invokee, methodName, step.args); } catch (error) { yield executeAction(invokee, methodName, 'domcontentloaded'); } } else if (methodName === 'click') { try { yield executeAction(invokee, methodName, step.args); } catch (error) { try { yield executeAction(invokee, methodName, [step.args[0], { force: true }]); } catch (error) { continue; } } } else { yield executeAction(invokee, methodName, step.args); } } yield new Promise((res) => { setTimeout(res, 500); }); } }); } handlePagination(page, config) { return __awaiter(this, void 0, void 0, function* () { let allResults = []; let previousHeight = 0; let scrapedItems = new Set(); let visitedUrls = new Set(); const MAX_RETRIES = 3; const RETRY_DELAY = 1000; // 1 second delay between retries const MAX_UNCHANGED_RESULTS = 5; const debugLog = (message, ...args) => { console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args); }; const scrapeCurrentPage = () => __awaiter(this, void 0, void 0, function* () { const results = yield page.evaluate((cfg) => window.scrapeList(cfg), config); const newResults = results.filter(item => { const uniqueKey = JSON.stringify(item); if (scrapedItems.has(uniqueKey)) return false; scrapedItems.add(uniqueKey); return true; }); allResults = allResults.concat(newResults); debugLog("Results collected:", allResults.length); }); const checkLimit = () => { if (config.limit && allResults.length >= config.limit) { allResults = allResults.slice(0, config.limit); return true; } return false; }; // Enhanced button finder with retry mechanism const findWorkingButton = (selectors) => __awaiter(this, void 0, void 0, function* () { let updatedSelectors = [...selectors]; for (let i = 0; i < selectors.length; i++) { const selector = selectors[i]; let retryCount = 0; let selectorSuccess = false; while (retryCount < MAX_RETRIES && !selectorSuccess) { try { const button = yield page.waitForSelector(selector, { state: 'attached', timeout: 10000 }); if (button) { debugLog('Found working selector:', selector); return { button, workingSelector: selector, updatedSelectors }; } } catch (error) { retryCount++; debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`); if (retryCount < MAX_RETRIES) { yield page.waitForTimeout(RETRY_DELAY); } else { debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`); updatedSelectors = updatedSelectors.filter(s => s !== selector); } } } } return { button: null, workingSelector: null, updatedSelectors }; }); const retryOperation = (operation, retryCount = 0) => __awaiter(this, void 0, void 0, function* () { try { return yield operation(); } catch (error) { if (retryCount < MAX_RETRIES) { debugLog(`Retrying operation. Attempt ${retryCount + 1} of ${MAX_RETRIES}`); yield page.waitForTimeout(RETRY_DELAY); return retryOperation(operation, retryCount + 1); } debugLog(`Operation failed after ${MAX_RETRIES} retries`); return false; } }); let availableSelectors = config.pagination.selector.split(','); let unchangedResultCounter = 0; try { while (true) { switch (config.pagination.type) { case 'scrollDown': { let previousResultCount = allResults.length; yield scrapeCurrentPage(); if (checkLimit()) { return allResults; } yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); yield page.waitForTimeout(2000); const currentHeight = yield page.evaluate(() => document.body.scrollHeight); const currentResultCount = allResults.length; if (currentResultCount === previousResultCount) { unchangedResultCounter++; if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) { return allResults; } } else { unchangedResultCounter = 0; } if (currentHeight === previousHeight) { return allResults; } previousHeight = currentHeight; break; } case 'scrollUp': { let previousResultCount = allResults.length; yield scrapeCurrentPage(); if (checkLimit()) { return allResults; } yield page.evaluate(() => window.scrollTo(0, 0)); yield page.waitForTimeout(2000); const currentTopHeight = yield page.evaluate(() => document.documentElement.scrollTop); const currentResultCount = allResults.length; if (currentResultCount === previousResultCount) { unchangedResultCounter++; if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) { return allResults; } } else { unchangedResultCounter = 0; } if (currentTopHeight === 0) { return allResults; } previousHeight = currentTopHeight; break; } case 'clickNext': { const currentUrl = page.url(); visitedUrls.add(currentUrl); yield scrapeCurrentPage(); if (checkLimit()) return allResults; const { button, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors); availableSelectors = updatedSelectors; if (!button || !workingSelector) { // Final retry for navigation when no selectors work const success = yield retryOperation(() => __awaiter(this, void 0, void 0, function* () { try { yield page.evaluate(() => window.history.forward()); const newUrl = page.url(); return !visitedUrls.has(newUrl); } catch (_a) { return false; } })); if (!success) return allResults; break; } let retryCount = 0; let paginationSuccess = false; // Capture basic content signature before click const captureContentSignature = () => __awaiter(this, void 0, void 0, function* () { return yield page.evaluate((selector) => { const items = document.querySelectorAll(selector); return { url: window.location.href, itemCount: items.length, firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|') }; }, config.listSelector); }); const beforeSignature = yield captureContentSignature(); debugLog(`Before click: ${beforeSignature.itemCount} items`); while (retryCount < MAX_RETRIES && !paginationSuccess) { try { try { yield Promise.all([ page.waitForNavigation({ waitUntil: 'networkidle', timeout: 15000 }).catch(e => { throw e; }), button.click() ]); debugLog("Navigation successful after regular click"); yield page.waitForTimeout(2000); paginationSuccess = true; } catch (navError) { debugLog("Regular click with navigation failed, trying dispatch event with navigation"); try { yield Promise.all([ page.waitForNavigation({ waitUntil: 'networkidle', timeout: 15000 }).catch(e => { throw e; }), button.dispatchEvent('click') ]); debugLog("Navigation successful after dispatch event"); yield page.waitForTimeout(2000); paginationSuccess = true; } catch (dispatchNavError) { try { yield button.click(); yield page.waitForTimeout(2000); } catch (clickError) { yield button.dispatchEvent('click'); yield page.waitForTimeout(2000); } } } yield page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => { }); if (!paginationSuccess) { const newUrl = page.url(); const afterSignature = yield captureContentSignature(); if (newUrl !== currentUrl) { debugLog(`URL changed to ${newUrl}`); visitedUrls.add(newUrl); paginationSuccess = true; } else if (afterSignature.firstItems !== beforeSignature.firstItems) { debugLog("Content changed without URL change"); paginationSuccess = true; } else if (afterSignature.itemCount !== beforeSignature.itemCount) { debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`); paginationSuccess = true; } } } catch (error) { debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`); } if (!paginationSuccess) { retryCount++; if (retryCount < MAX_RETRIES) { debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`); yield page.waitForTimeout(RETRY_DELAY); } } } if (!paginationSuccess) { debugLog(`Pagination failed after ${MAX_RETRIES} attempts`); return allResults; } break; } case 'clickLoadMore': { yield scrapeCurrentPage(); if (checkLimit()) return allResults; let loadMoreCounter = 0; let previousResultCount = allResults.length; let noNewItemsCounter = 0; const MAX_NO_NEW_ITEMS = 2; while (true) { // Find working button with retry mechanism const { button: loadMoreButton, workingSelector, updatedSelectors } = yield findWorkingButton(availableSelectors); availableSelectors = updatedSelectors; if (!workingSelector || !loadMoreButton) { debugLog('No working Load More selector found after retries'); return allResults; } // Implement retry mechanism for clicking the button let retryCount = 0; let clickSuccess = false; while (retryCount < MAX_RETRIES && !clickSuccess) { try { try { yield loadMoreButton.click(); clickSuccess = true; } catch (error) { debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`); // If regular click fails, try dispatchEvent try { yield loadMoreButton.dispatchEvent('click'); clickSuccess = true; } catch (dispatchError) { debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`); throw dispatchError; // Propagate error to trigger retry } } if (clickSuccess) { yield page.waitForTimeout(1000); loadMoreCounter++; debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`); } } catch (error) { debugLog(`Click attempt ${retryCount + 1} failed completely.`); retryCount++; if (retryCount < MAX_RETRIES) { debugLog(`Retrying click - attempt ${retryCount + 1} of ${MAX_RETRIES}`); yield page.waitForTimeout(RETRY_DELAY); } } } if (!clickSuccess) { debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`); return allResults; } // Wait for content to load and check scroll height yield page.waitForTimeout(2000); yield page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); yield page.waitForTimeout(2000); const currentHeight = yield page.evaluate(() => document.body.scrollHeight); const heightChanged = currentHeight !== previousHeight; previousHeight = currentHeight; yield scrapeCurrentPage(); const currentResultCount = allResults.length; const newItemsAdded = currentResultCount > previousResultCount; if (!newItemsAdded) { noNewItemsCounter++; debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`); if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) { debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`); return allResults; } } else { noNewItemsCounter = 0; previousResultCount = currentResultCount; } if (checkLimit()) return allResults; if (!heightChanged) { debugLog('No more items loaded after Load More'); return allResults; } } } default: { yield scrapeCurrentPage(); return allResults; } } if (checkLimit()) break; } } catch (error) { debugLog(`Fatal error: ${error.message}`); return allResults; } return allResults; }); } getMatchingActionId(workflow, pageState, usedActions) { for (let actionId = workflow.length - 1; actionId >= 0; actionId--) { const step = workflow[actionId]; const isApplicable = this.applicable(step.where, pageState, usedActions); console.log("-------------------------------------------------------------"); console.log(`Where:`, step.where); console.log(`Page state:`, pageState); console.log(`Match result: ${isApplicable}`); console.log("-------------------------------------------------------------"); if (isApplicable) { return actionId; } } } removeShadowSelectors(workflow) { for (let actionId = workflow.length - 1; actionId >= 0; actionId--) { const step = workflow[actionId]; // Check if step has where and selectors if (step.where && Array.isArray(step.where.selectors)) { // Filter out selectors that contain ">>" step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>')); } } return workflow; } removeSpecialSelectors(workflow) { for (let actionId = workflow.length - 1; actionId >= 0; actionId--) { const step = workflow[actionId]; if (step.where && Array.isArray(step.where.selectors)) { // Filter out if selector has EITHER ":>>" OR ">>" step.where.selectors = step.where.selectors.filter(selector => !(selector.includes(':>>') || selector.includes('>>'))); } } return workflow; } runLoop(p, workflow) { var _a, _b; return __awaiter(this, void 0, void 0, function* () { let workflowCopy = JSON.parse(JSON.stringify(workflow)); workflowCopy = this.removeSpecialSelectors(workflowCopy); // apply ad-blocker to the curren