UNPKG

@letsscrapedata/scraper

Version:

Web scraper that scrapes web pages using LetsScrapeData XML templates

249 lines (243 loc) 7.5 kB
import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOptions, LsdPage, LsdApiContext, BrowserStateData } from '@letsscrapedata/controller';
import { Proxy } from '@letsscrapedata/proxy';
import { LogFunction } from '@letsscrapedata/utils';

type TemplateId = number;
type HttpHeaders = Record<string, string>;

/** Scraper-level state persisted alongside the browser state. */
interface ScraperStateData extends BrowserStateData {
  /**
   * HTTP headers carried with the state.
   * @default {}
   */
  headers: HttpHeaders;
  /**
   * Arbitrary user-defined key/value data.
   * @default {}
   */
  userData: Record<string, string>;
  /** Expiration timestamp of this state data, if any. */
  expireTime?: number;
}

/**
 * Network context used to execute the task.
 */
interface TaskNetworkContext {
  /**
   * Proxy that is used to access the target website.
   * null only when domainId is 0, which means no network resources are required to execute the task.
   * @default null
   */
  proxy: Proxy | null;
  /**
   * Browser page that is used to open web pages when executing the task.
   * null when domainNum is less than 0, which means no browser page is required to execute the task.
   * @default null
   */
  page: LsdPage | null;
  /**
   * LsdApiContext that shares the state data within the same browser context.
   * null only when domainId is 0, which means no network resources are required to execute the task.
   * @default null
   */
  browserApiContext: LsdApiContext | null;
  /**
   * Standalone LsdApiContext that shares the state data between tasks that use the same standalone LsdApiContext.
   * null only when domainId is 0, which means no network resources are required to execute the task.
   * It is not recommended to use this context unless you never use a browser to access web pages.
   * @default null
   */
  standaloneApiContext: LsdApiContext | null;
}

type DataRecord = Record<string, string>;
type ExecData = Record<string, DataRecord[]>;

/** A subtask spawned by a template task. */
interface Subtask {
  tid: number;
  parasstr: string;
  idx?: number;
  sapFlag?: boolean;
}

/** Miscellaneous error/diagnostic information attached to a failed task. */
interface TaskMisc {
  taskId: number;
  message: string;
  stack: string;
  variables: Record<string, string>;
}

/** Data produced by executing one task. */
interface TaskData {
  templateId: TemplateId;
  parasStr: string;
  credits: number;
  execData: ExecData;
  subtasks: Subtask[];
  debugFiles?: Record<string, string>;
}

/** Result of performing one task, including any subtask results. */
interface TaskResult {
  taskData: TaskData;
  subtaskDatas?: TaskData[];
  /**
   * newStateData (misc_getstatedata) in TaskContext; the related stateData for cap or dom
   * will be replaced or updated in LSDAPP.
   * Included if credits >= 0.
   */
  newStateData?: ScraperStateData;
  /**
   * Included if credits < 0.
   */
  misc?: TaskMisc;
}

type TaskType = "indAsync" | "indSync" | "memSync";

/** A batch of parameter strings to run against one template. */
interface TemplateTasks {
  tid: number;
  parasstrs: string[];
}

/**
 * Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
 */
interface BrowserConfig {
  browserControllerType?: BrowserControllerType;
  /**
   * URL used to connect to the current browser.
   * The URL starts with "http://", such as "http://localhost:9222/".
   * browserUrl can be used when logging in manually in advance.
   */
  browserUrl?: string;
  /**
   * Proxy. No proxy will be used if proxyUrl is "".
   * Valid only if !browserUrl.
   * @default "none"
   */
  proxyUrl?: string;
  /**
   * Type of browser to be launched.
   * Valid only if !browserUrl.
   * @default "chromium"
   */
  browserType?: LsdBrowserType;
}

/** Per-template configuration. */
interface TemplatePara {
  templateId: number;
  /**
   * Code for reading or getting the template.
   * @default "" - only public templates can be got if readCode is ""
   */
  readCode?: string;
  /**
   * The maximum number of concurrent tasks that can execute the same template in a browserContext.
   * NOTE(review): property name spelling ("Conncurrency") is part of the published API and must be kept.
   * @default 1
   */
  maxConncurrency?: number;
}

type DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";

/** Global configuration of the scraper. */
interface ScraperConfig {
  /**
   * @default false
   */
  exitWhenCompleted?: boolean;
  /**
   * Whether to use the parasstr in XML if the parasstr of a task is "".
   * @default false
   */
  useParasstrInXmlIfNeeded?: boolean;
  /**
   * Whether to load unfinished tasks.
   * @default false
   */
  loadUnfinishedTasks?: boolean;
  /**
   * Unit: minutes.
   * @default 0
   */
  loadFailedTasksInterval?: number;
  /**
   * @default "", which will use the current directory of the process + "/data/".
   * If not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
   */
  baseDir?: string;
  /**
   * Where the templates are saved.
   * @default "", which means to get the templates from the LSD server
   */
  templateDir?: string;
  /**
   * The filename in action_setvar_get/get_file must include inputFileDirPart for security.
   * @default "LetsScrapeData"
   */
  inputFileDirPart?: string;
  /**
   * Whether to use puppeteer-extra-plugin-stealth; use patchright instead.
   * @default false
   */
  useStealthPlugin?: boolean;
  /**
   * Default browserControllerType of BrowserConfig.
   * @default "patchright"
   */
  browserControllerType?: BrowserControllerType;
  /**
   * Default browserType of BrowserConfig.
   * @default "chromium"
   */
  browserType?: LsdBrowserType;
  /**
   * @default { headless: false }
   */
  lsdLaunchOptions?: LsdLaunchOptions;
  /**
   * @default {browserUrl: ""}
   */
  lsdConnectOptions?: LsdConnectOptions;
  /**
   * Important: browsers to be launched or connected using proxyUrl.
   * @default [{proxyUrl: ""}], launch a default browser using the default type of browser controller, no proxy
   */
  browserConfigs?: BrowserConfig[];
  captcha?: {
    /**
     * clientKey of 2captcha.
     */
    clientKey: string;
    /**
     * Captchas to be solved; valid only in camoufox.
     * @default []
     */
    captchas?: string[];
  };
  urlPrefix?: string;
  /**
   * The default maximum number of concurrent tasks that can execute the same template in a browserContext.
   * @default 1
   */
  maxConcurrency?: number;
  /**
   * @default ""
   */
  readCode?: string;
  /**
   * @default []
   */
  templateParas?: TemplatePara[];
  /**
   * @default 10
   */
  totalMaxConcurrency?: number;
  /**
   * Minimum milliseconds between two tasks of the same template.
   * NOTE(review): property name spelling ("Miliseconds") is part of the published API and must be kept.
   * @default 2000
   */
  minMiliseconds?: number;
  /**
   * Whether to move all dat_* files into a new directory "yyyyMMddHHmmss".
   * @default false
   */
  moveDataWhenStart?: boolean;
  /**
   * DataFileFormat = "csv" | "jsonl" | "tsv" | "txt".
   * @default "jsonl"
   */
  dataFileFormat?: DataFileFormat;
  /**
   * Valid only when dataFileFormat is "txt".
   * NOTE(review): property name spelling ("Seperator") is part of the published API and must be kept.
   * @default "::"
   */
  columnSeperator?: string;
}

/** Installs a custom log function; returns true on success. */
declare function setScraperLogFun(logFun: LogFunction): boolean;
/** Executes a single task against a template and returns its result. */
declare function performOneTask(templateId: number, parasStr: string, taskNetworkContext: TaskNetworkContext, taskType?: TaskType, xmlStr?: string, taskId?: number): Promise<TaskResult>;
/** Applies a new scraper configuration; resolves to true on success. */
declare function updateScraperConfig(config: ScraperConfig): Promise<boolean>;
/** Runs the scraper over the given task batches with an optional configuration. */
declare function scraper(newTasks?: TemplateTasks[], config?: ScraperConfig): Promise<boolean>;

export { type BrowserConfig, type ExecData, type ScraperConfig, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };