maxun-core
Version:
Core package for Maxun, responsible for data extraction
107 lines (106 loc) • 3.73 kB
TypeScript
/// <reference types="node" />
import { Page } from 'playwright';
import { EventEmitter } from 'events';
import { WorkflowFile, ParamType } from './types/workflow';
/**
* Extending the Window interface for custom scraping functions.
*
* The members below are added to the global `Window` type so that code referring to
* `window.scrape(...)` etc. type-checks. This file only declares them.
* NOTE(review): they are presumably installed in the page by scripts the interpreter
* injects (see `ensureScriptsLoaded`) - confirm against the implementation.
*/
declare global {
interface Window {
/** Takes a selector (or `null`) and returns an array of string-to-string records. */
scrape: (selector: string | null) => Record<string, string>[];
/**
* Takes a map from a key to a `{ selector, tag, attribute }` descriptor and returns a
* single record. NOTE(review): result keys presumably mirror the schema keys - confirm.
*/
scrapeSchema: (schema: Record<string, {
selector: string;
tag: string;
attribute: string;
}>) => Record<string, any>;
/**
* Takes a list configuration and returns an array of records. `limit` is the only
* optional field; `fields` and `pagination` are typed `any`, so their shape is not
* checked by this declaration.
*/
scrapeList: (config: {
listSelector: string;
fields: any;
limit?: number;
pagination: any;
}) => Record<string, any>[];
/** Takes a list selector and returns an array of `{ selector, innerText }` entries. */
scrapeListAuto: (listSelector: string) => {
selector: string;
innerText: string;
}[];
/**
* `pages` is optional; nothing is returned.
* NOTE(review): the unit of `pages` and its default are not visible here - confirm.
*/
scrollDown: (pages?: number) => void;
/**
* `pages` is optional; nothing is returned.
* NOTE(review): the unit of `pages` and its default are not visible here - confirm.
*/
scrollUp: (pages?: number) => void;
}
}
/**
* Defines optional interpreter options (passed in constructor).
*
* The `Interpreter` constructor accepts `Partial<InterpreterOptions>`, so callers may
* omit any of these fields. Within this interface itself only `mode` is declared
* optional. NOTE(review): default values are not visible in this declaration.
*/
interface InterpreterOptions {
/** Free-form string. NOTE(review): accepted values are not visible here - check the implementation. */
mode?: string;
/** NOTE(review): presumably an upper bound on repeated executions of an action - confirm. */
maxRepeats: number;
/** NOTE(review): presumably an upper bound on work running in parallel - confirm. */
maxConcurrency: number;
/** Callback receiving one `output` value; may return nothing or a promise. */
serializableCallback: (output: any) => (void | Promise<void>);
/** Callback receiving an `output` value together with its MIME type; may return nothing or a promise. */
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
/** Boolean flag. NOTE(review): presumably enables debug output - confirm. */
debug: boolean;
/**
* Optional hooks; because of `Partial`, each handler may be left out.
* NOTE(review): when each hook is invoked is not visible here - confirm.
*/
debugChannel: Partial<{
activeId: (id: number) => void;
debugMessage: (msg: string) => void;
setActionType: (type: string) => void;
}>;
}
/**
* Class for running the Smart Workflows.
*
* Extends Node's `EventEmitter`. The public surface declared here is the constructor,
* `run` and `stop`. Private members are listed without types in this declaration, so
* their signatures have to be read from the implementation.
* NOTE(review): which events are emitted is not visible here - confirm.
*/
export default class Interpreter extends EventEmitter {
// Private state. The notes below are inferred from names only - verify against the implementation.
private workflow;
private initializedWorkflow;
private options;
private concurrency;
private stopper;
private log;
private blocker;
private cumulativeResults;
/**
* @param workflow Workflow definition to run.
* @param options Optional; any subset of `InterpreterOptions` may be supplied.
*/
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>);
// NOTE(review): presumably toggle an ad blocker kept in `blocker` - confirm.
private applyAdBlocker;
private disableAdBlocker;
private getSelectors;
/**
* Returns the context object from given Page and the current workflow.\
* \
* `workflow` is used for selector extraction - function searches for used selectors to
* look for later in the page's context.
* @param page Playwright Page object
* @param workflow Current **initialized** workflow (array of where-what pairs).
* @returns {PageState} State of the current page.
*/
private getState;
/**
* Tests if the given action is applicable with the given context.
* @param where Tested *where* condition
* @param context Current browser context.
* @returns True if `where` is applicable in the given context, false otherwise
*/
private applicable;
/**
* Given a Playwright's page object and a "declarative" list of actions, this function
* calls all mentioned functions on the Page object.\
* \
* Manipulates the iterator indexes (experimental feature, likely to be removed in
* the following versions of maxun-core)
* @param page Playwright Page object
* @param steps Array of actions.
*/
private carryOutSteps;
// The remaining private helpers carry no documentation in this declaration.
private handlePagination;
private getMatchingActionId;
private removeShadowSelectors;
private removeSpecialSelectors;
private runLoop;
private ensureScriptsLoaded;
/**
* Spawns a browser context and runs given workflow.
* \
* Resolves after the playback is finished.
* @param page Page to run the workflow on. Required by the signature below.
* @param params Optional. Workflow specific, set of parameters
* for the `{$param: nameofparam}` fields.
* @returns Promise that resolves with no value.
*/
run(page: Page, params?: ParamType): Promise<void>;
/**
* Takes no arguments and returns a promise that resolves with no value.
* NOTE(review): presumably asks a running workflow to stop - confirm.
*/
stop(): Promise<void>;
}
// Empty export list: exports nothing by itself. The file is already a module because of
// the import and export declarations above.
export {};