UNPKG

@letsscrapedata/scraper

Version:

Web scraper that scrapes web pages using LetsScrapeData XML templates

742 lines (731 loc) 19.7 kB
import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOptions, LsdPage, LsdApiContext, BrowserStateData } from '@letsscrapedata/controller';
import { Proxy } from '@letsscrapedata/proxy';
import { LogFunction } from '@letsscrapedata/utils';

/**
 * Configuration of one template parameter as read from the XML template.
 * All fields are declared as strings; their individual meanings are not documented in this file.
 */
interface XmlParaCfg {
    paraname: string;
    name: string;
    desc: string;
    uitype: string;
    defval: string;
    listid: string;
    listparas: string;
    min: string;
    max: string;
    pattern: string;
    alert: string;
}

/*
 * XmlAction*Cfg interfaces: attribute shapes of the action elements of an XML template.
 *
 * According to the xml2js patch note further down in this file, `type` and `subtype` are derived
 * from the tag name (e.g. action_setvar_element gets type="setvar" subtype="element").
 * NOTE(review): presumably each interface below corresponds to one such action tag — confirm
 * against the template schema; the meaning of the individual attributes is not documented here.
 */

interface XmlActionBreakCfg {
    type: string;
    id: string;
}

interface XmlActionCaptchaCfg {
    type: string;
    try: string;
    errname: string;
    id: string;
}

interface XmlActionClickCfg {
    type: string;
    wait: string;
    gen: boolean;
    cap: boolean;
    clicktype: string;
    try: string;
    errname: string;
    datapage: string;
    popupsubtask: boolean;
    login: boolean;
    captcha: boolean;
    navigate: boolean;
    eurl: string;
    eloc: string;
    pn1: string;
    pv1: string;
    pn2: string;
    pv2: string;
    downloadpath: string;
    filename: string;
    pathvarname: string;
    id: string;
}

interface XmlActionContinueCfg {
    type: string;
    id: string;
}

interface XmlActionMiscCfg {
    type: string;
    id: string;
}

interface XmlActionExitCfg {
    type: string;
    errname: string;
    id: string;
}

interface XmlActionExtractCfg {
    type: string;
    tabname: string;
    id: string;
}

interface XmlActionGotoCfg {
    type: string;
    url: string;
    reuse: boolean;
    wait: string;
    encodeuri: boolean;
    gen: boolean;
    cap: boolean;
    datapage: string;
    popupsubtask: boolean;
    login: boolean;
    captcha: boolean;
    eurl: string;
    eloc: string;
    pn1: string;
    pv1: string;
    pn2: string;
    pv2: string;
    headers: string;
    referer: string;
    id: string;
}

interface XmlActionHoverCfg {
    type: string;
    try: string;
    gen: boolean;
    cap: boolean;
    wait: string;
    errname: string;
    id: string;
}

interface XmlActionIfelseCfg {
    type: string;
    id: string;
}

interface XmlActionInputCfg {
    type: string;
    content: string;
    enter: boolean;
    replace: boolean;
    gen: boolean;
    cap: boolean;
    try: string;
    wait: string;
    errname: string;
    datapage: string;
    popupsubtask: boolean;
    login: boolean;
    captcha: boolean;
    eurl: string;
    eloc: string;
    pn1: string;
    pv1: string;
    pn2: string;
    pv2: string;
    id: string;
}

// NOTE(review): unlike the other action configs, the two intercept configs declare no `id` field.
interface XmlActionInterceptClearCfg {
    type: string;
    subtype: string;
}

interface XmlActionInterceptSetCfg {
    type: string;
    subtype: string;
}

interface XmlActionLoopdowhileElementCfg {
    type: string;
    subtype: string;
    iswhile: boolean;
    varname: string;
    maxloops: string;
    click: boolean;
    navigate: boolean;
    gen: boolean;
    cap: boolean;
    errname: string;
    wait: string;
    id: string;
}

interface XmlActionLoopdowhileTemplstrCfg {
    type: string;
    subtype: string;
    iswhile: boolean;
    varname: string;
    maxloops: string;
    id: string;
}

interface XmlActionLoopforCfg {
    type: string;
    from: string;
    to: string;
    step: string;
    roundtype: string;
    varname: string;
    maxloops: string;
    errname: string;
    id: string;
}

interface XmlActionLoopinelesCfg {
    type: string;
    varname: string;
    maxloops: string;
    start: string;
    end: string;
    step: string;
    errname: string;
    id: string;
}

interface XmlActionLoopinstrCfg {
    type: string;
    list: string;
    split: string;
    varname: string;
    maxloops: string;
    trim: boolean;
    errname: string;
    id: string;
}

interface XmlActionScrollByCfg {
    type: string;
    subtype: string;
    height: string;
    unit: string;
    maxtimes: string;
    interval: string;
    gen: boolean;
    cap: boolean;
    datapage: string;
    popupsubtask: boolean;
    login: boolean;
    captcha: boolean;
    eurl: string;
    eloc: string;
    pn1: string;
    pv1: string;
    pn2: string;
    pv2: string;
    id: string;
}

interface XmlActionScrollIntoviewCfg {
    type: string;
    subtype: string;
    gen: boolean;
    cap: boolean;
    errname: string;
    datapage: string;
    popupsubtask: boolean;
    login: boolean;
    captcha: boolean;
    eurl: string;
    eloc: string;
    pn1: string;
    pv1: string;
    pn2: string;
    pv2: string;
    id: string;
}

interface XmlActionScrollToCfg {
    type: string;
    subtype: string;
    height: string;
    unit: string;
    gen: boolean;
    cap: boolean;
    datapage: string;
    popupsubtask: boolean;
    login: boolean;
    captcha: boolean;
    eurl: string;
    eloc: string;
    pn1: string;
    pv1: string;
    pn2: string;
    pv2: string;
    id: string;
}

interface XmlActionSelectCfg {
    type: string;
    selecttype: string;
    selectval: string;
    gen: boolean;
    cap: boolean;
    try: string;
    wait: string;
    errname: string;
    datapage: string;
    popupsubtask: boolean;
    login: boolean;
    captcha: boolean;
    eurl: string;
    eloc: string;
    pn1: string;
    pv1: string;
    pn2: string;
    pv2: string;
    id: string;
}

interface XmlActionSetvarDbqueryCfg {
    type: string;
    subtype: string;
    varname: string;
    defaultval: string;
    valerrname: string;
    pattern: string;
    flags: string;
    path: string;
    id: string;
}

interface XmlActionSetvarElementCfg {
    type: string;
    subtype: string;
    varname: string;
    defaultval: string;
    try: string;
    valerrname: string;
    pattern: string;
    flags: string;
    path: string;
    id: string;
}

interface XmlActionSetvarFileCfg {
    type: string;
    subtype: string;
    varname: string;
    defaultval: string;
    proxy: boolean;
    valerrname: string;
    pattern: string;
    flags: string;
    path: string;
    id: string;
}

interface XmlActionSetvarGetCfg {
    type: string;
    subtype: string;
    varname: string;
    defaultval: string;
    valerrname: string;
    pattern: string;
    flags: string;
    path: string;
    id: string;
}

interface XmlActionSetvarOcrCfg {
    type: string;
    subtype: string;
    varname: string;
    defaultval: string;
    valerrname: string;
    pattern: string;
    flags: string;
    path: string;
    id: string;
}

interface XmlActionSetvarSubtaskCfg {
    type: string;
    subtype: string;
    varname: string;
    defaultval: string;
    valerrname: string;
    pattern: string;
    flags: string;
    path: string;
    id: string;
}

interface XmlActionSetvarTemplstrCfg {
    type: string;
    subtype: string;
    varname: string;
    defaultval: string;
    valerrname: string;
    pattern: string;
    flags: string;
    path: string;
    id: string;
}

interface XmlActionSubtaskCfg {
    type: string;
    subtasks: string;
    sameasparent: boolean;
    id: string;
}

interface XmlActionWaitElementCfg {
    type: string;
    subtype: string;
    timeout: string;
    state: string;
    errname: string;
    wait: string;
    id: string;
}

interface XmlActionWaitNavigationCfg {
    type: string;
    subtype: string;
    timeout: string;
    waituntil: string;
    url: string;
    errname: string;
    wait: string;
    id: string;
}

interface XmlActionWaitSleepCfg {
    type: string;
    subtype: string;
    minms: string;
    maxms: string;
    errname: string;
    wait: string;
    id: string;
}

/*
 * XmlFont*Cfg interfaces: font-related configuration shapes; they are referenced by FontsConfig below.
 * The meaning of the individual fields is not documented in this file.
 */

interface XmlFontsvgCfg {
    exloc: string;
    inloc: string;
    csmaptype: string;
    bsfilter: string;
}

interface XmlFontselectorCfg {
    name: string;
    fontfamily: string;
}

interface XmlFontfamilyCfg {
    name: string;
    fontcodes: string;
    fontchars: string;
}

interface XmlFontcodesCfg {
    name: string;
    codes: string;
}

interface XmlFontcharsCfg {
    name: string;
    chars: string;
}

/** Union of every action configuration shape declared above. */
type XmlActionConfig =
    | XmlActionBreakCfg
    | XmlActionCaptchaCfg
    | XmlActionClickCfg
    | XmlActionContinueCfg
    | XmlActionMiscCfg
    | XmlActionExitCfg
    | XmlActionExtractCfg
    | XmlActionGotoCfg
    | XmlActionHoverCfg
    | XmlActionIfelseCfg
    | XmlActionInputCfg
    | XmlActionInterceptClearCfg
    | XmlActionInterceptSetCfg
    | XmlActionLoopdowhileElementCfg
    | XmlActionLoopdowhileTemplstrCfg
    | XmlActionLoopforCfg
    | XmlActionLoopinelesCfg
    | XmlActionLoopinstrCfg
    | XmlActionScrollByCfg
    | XmlActionScrollIntoviewCfg
    | XmlActionScrollToCfg
    | XmlActionSelectCfg
    | XmlActionSetvarDbqueryCfg
    | XmlActionSetvarElementCfg
    | XmlActionSetvarFileCfg
    | XmlActionSetvarGetCfg
    | XmlActionSetvarOcrCfg
    | XmlActionSetvarSubtaskCfg
    | XmlActionSetvarTemplstrCfg
    | XmlActionSubtaskCfg
    | XmlActionWaitElementCfg
    | XmlActionWaitNavigationCfg
    | XmlActionWaitSleepCfg;

/** Names of the captcha kinds grouped as "token" captchas. */
type TokenCaptchaType = "amazon" | "funcaptcha" | "geetest" | "keycaptcha" | "mtcaptcha" | "recaptcha" | "turnstile";

/** Names of the captcha kinds grouped as "recognition" captchas. */
type RecognitionCaptchaType = "text" | "coordinate" | "grid" | "slider" | "rotation";

/** Any supported captcha kind: a token captcha or a recognition captcha. */
type CaptchaType = TokenCaptchaType | RecognitionCaptchaType;

/** Numeric identifier of a template. */
type TemplateId = number;

/** Numeric identifier of a domain. */
type DomainId = number;

/** HTTP headers as a name-to-value map. */
type HttpHeaders = Record<string, string>;

/** Browser state data (from the controller package) extended with headers and user data. */
interface ScraperStateData extends BrowserStateData {
    /**
     * @default {}
     */
    headers: HttpHeaders;
    /**
     * @default {}
     */
    userData: Record<string, string>;
}

/** Input parameters of a task as a name-to-value map (see TaskParser.getInParas). */
type InParas = Record<string, string>;

/** TTF font configuration; the meaning of the individual fields is not documented in this file. */
interface FontttfConfig {
    exloc: string;
    inloc: string;
    minuc: number;
    maxuc: number;
    startidx: number;
    fsfilter: string;
    fufilter: string;
    parsetype: string;
}

/**
 * fonts config in xml
 * * length: 0 ~ 20, default []
 */
interface FontsConfig {
    fontselectorConfig: Record<string, XmlFontselectorCfg>;
    fontfamilyConfig: Record<string, XmlFontfamilyCfg>;
    fontcodesConfig: Record<string, XmlFontcodesCfg>;
    fontcharsConfig: Record<string, XmlFontcharsCfg>;
    fontsvgCfg?: XmlFontsvgCfg;
    fontttfConfig?: FontttfConfig;
}

/** Where elements are taken from: the browser or cheerio. */
type ElementSource = "browser" | "cheerio";

/** Template record as held by the scraper. */
interface TemplateInScraper {
    templateId: TemplateId;
    domainId: DomainId;
    /**
     * @default "browser"
     */
    defaultElementSource: ElementSource;
    /**
     * @default 600 seconds
     */
    maxExecutionDuration: number;
    // NOTE(review): presumably the XML text of the template — confirm.
    configDetail: string;
    capName?: string;
}

/** Attributes read from the XML template, as a name-to-value map. */
type AttrsInXml = Record<string, string>;

/** String-to-string map for the columns of one data table (key/value semantics not documented here). */
type DatatableColumnMap = Map<string, string>;

/** Result of parsing an XML template (see TemplateManagerInScraper.parseXmlTemplate). */
interface ParsedTemplate {
    actionConfigs: XmlActionConfig[];
    paraCfgs: XmlParaCfg[];
    fontsConfig: FontsConfig | null;
    attrsInXml: AttrsInXml;
    captchaTypes: CaptchaType[];
    lastUsedTime: number;
    lastCheckTime: number;
    datatableMap: Map<string, DatatableColumnMap> | null;
    template?: TemplateInScraper;
}

/** ParsedTemplate with every property required, i.e. `template` is always present. */
type ParsedTemplateExt = Required<ParsedTemplate>;

/**
 * Network context used to execute the task
 */
interface TaskNetworkContext {
    /**
     * Proxy that is used to access the target website:
     * * null only when domainId is 0, which means no network resources are required to execute the task
     * @default null
     */
    proxy: Proxy | null;
    /**
     * browser page that is used to open web pages when executing the task
     * * null when domainNum is less than 0, which means no browser page is required to execute the task
     * @default null
     */
    page: LsdPage | null;
    /**
     * LsdApiContext that shares the state data within the same browser context:
     * * null only when domainId is 0, which means no network resources are required to execute the task
     * @default null
     */
    browserApiContext: LsdApiContext | null;
    /**
     * Standalone LsdApiContext that shares the state data between the tasks that use the same standalone LsdApiContext:
     * * null only when domainId is 0, which means no network resources are required to execute the task
     * * it is not recommended to use this context unless you never use a browser to access web pages.
     * @default null
     */
    standaloneApiContext: LsdApiContext | null;
}

/** One extracted record: column name to string value. */
type DataRecord = Record<string, string>;

/** Extracted data: a string key mapped to a list of records. */
type ExecData = Record<string, DataRecord[]>;

/** A subtask reference: a template id (`tid`) plus its parameter string. */
interface Subtask {
    tid: number;
    parasstr: string;
    idx?: number;
    sapFlag?: boolean;
}

/** Diagnostic information attached to a TaskResult when credits < 0 (see TaskResult.misc). */
interface TaskMisc {
    taskId: number;
    message: string;
    stack: string;
    variables: Record<string, string>;
}

/** Data produced by one task. */
interface TaskData {
    templateId: TemplateId;
    parasStr: string;
    credits: number;
    execData: ExecData;
    subtasks: Subtask[];
}

/** Result of performing one task (see performOneTask). */
interface TaskResult {
    taskData: TaskData;
    subtaskDatas?: TaskData[];
    /**
     * included if credits >= 0
     */
    newStateData?: ScraperStateData;
    /**
     * included if credits < 0
     */
    misc?: TaskMisc;
}

/** Task type accepted by performOneTask; the meaning of each value is not documented in this file. */
type TaskType = "indAsync" | "indSync" | "memSync";

/** Tasks of one template: the template id and one parameter string per task. */
interface TemplateTasks {
    tid: number;
    parasstrs: string[];
}

/**
 * Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
 */
interface BrowserConfig {
    browserControllerType?: BrowserControllerType;
    /**
     * URL used to connect to the current browser
     * - url starts with "http://", such as "http://localhost:9222/"
     * - browserUrl can be used when logging in manually in advance.
     */
    browserUrl?: string;
    /**
     * proxy
     * - no proxy will be used if proxyUrl is ""
     * - valid only if !browserUrl
     */
    proxyUrl?: string;
    /**
     * type of browser to be launched
     * valid only if !browserUrl
     * @default "chromium"
     */
    browserType?: LsdBrowserType;
}

/** Per-template settings used in ScraperConfig.templateParas. */
interface TemplatePara {
    templateId: number;
    /**
     * code for reading or getting the template
     * @default "" - only public templates can be got if readCode is ""
     */
    readCode?: string;
    /**
     * the maximum number of concurrent tasks that can execute the same template in a browserContext
     *
     * NOTE(review): the property name is spelled `maxConncurrency` (double "n"), unlike
     * `maxConcurrency` in ScraperConfig; it is part of the public interface, so it is kept as declared.
     * @default 1
     */
    maxConncurrency?: number;
}

/** Supported formats of the data files. */
type DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";

/** Configuration of the scraper (see scraper and updateScraperConfig). */
interface ScraperConfig {
    /**
     * @default false
     */
    exitWhenCompleted?: boolean;
    /**
     * whether to use the parasstr in XML if parasstr of a task is ""
     * @default false
     */
    useParasstrInXmlIfNeeded?: boolean;
    /**
     * whether to load unfinished tasks
     * @default false
     */
    loadUnfinishedTasks?: boolean;
    /**
     * @default "", which will use current directory of process + "/data/"
     * if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
     */
    baseDir?: string;
    /**
     * where the templates are saved
     * @default "", which means to get the templates from LSD server
     */
    templateDir?: string;
    /**
     * filename in action_setvar_get/get_file must include inputFileDirPart for security.
     * @default "LetsScrapeData"
     */
    inputFileDirPart?: string;
    /**
     * whether to use puppeteer-extra-plugin-stealth, use patchright instead
     * @default false
     */
    useStealthPlugin?: boolean;
    /**
     * default browserControllerType of BrowserConfig
     * @default "patchright"
     */
    browserControllerType?: BrowserControllerType;
    /**
     * default browserType of BrowserConfig
     * @default "chromium"
     */
    browserType?: LsdBrowserType;
    /**
     * @default { headless: false }
     */
    lsdLaunchOptions?: LsdLaunchOptions;
    /**
     * @default {browserUrl: ""}
     */
    lsdConnectOptions?: LsdConnectOptions;
    /**
     * Important: browsers to be launched or connected using proxyUrl
     * @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
     */
    browserConfigs?: BrowserConfig[];
    captcha?: {
        /**
         * clientKey of 2captcha
         */
        clientKey: string;
    };
    urlPrefix?: string;
    /**
     * the default maximum number of concurrent tasks that can execute the same template in a browserContext
     * @default 1
     */
    maxConcurrency?: number;
    /**
     * @default ""
     */
    readCode?: string;
    /**
     * @default []
     */
    templateParas?: TemplatePara[];
    /**
     * @default 10
     */
    totalMaxConcurrency?: number;
    /**
     * min milliseconds between two tasks of the same template
     * @default 2000
     */
    minMiliseconds?: number;
    /**
     * whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
     * @default false
     */
    moveDataWhenStart?: boolean;
    /**
     * DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
     * @default "jsonl"
     */
    dataFileFormat?: DataFileFormat;
    /**
     * valid only when dataFileFormat is "jsonl"
     * @default true
     */
    useNickName?: boolean;
    /**
     * valid only when dataFileFormat is "txt"
     * @default "::"
     */
    columnSeperator?: string;
}

/**
 * Sets the log function used by the scraper.
 *
 * @param logFun log function (type from @letsscrapedata/utils)
 * @returns a boolean; NOTE(review): presumably true on success — the implementation is not visible here.
 */
declare function setScraperLogFun(logFun: LogFunction): boolean;

/**
Modify node_modules/xml2js/lib/parser.js by adding the following content (it automatically adds the
type and subtype attributes based on the tag name, e.g. action_setvar_element gets type="setvar" subtype="element"):
          //////// start of LSD added by Joe ////////////////////////////////////////////////////////////////////////////////////////////
          // to be able to add attributes here and later(defaultElementCfg.js), set obj[attrkey] if undefined
          if(!obj[attrkey]){
            obj[attrkey] = {}
          }
          const subTags = node.name.split("_")
          if(subTags.length > 1 && typeof obj[attrkey]["type"] ==="undefined"){
            obj[attrkey]["type"] = subTags[1]
          }
          if(subTags.length > 2 && typeof obj[attrkey]["subtype"] ==="undefined"){
            obj[attrkey]["subtype"] = subTags[2]
          }
          //////// end of LSD added by Joe ////////////////////////////////////////////////////////////////////////////////////////////
          obj["#name"] = _this.options.tagNameProcessors ? processItem(_this.options.tagNameProcessors, node.name) : node.name;
 */
/**
 * TaskParser
 *
 * Static helpers working on the JSON form of an XML template and on task parameters/data.
 * Only the signatures are visible in this declaration file.
 */
declare class TaskParser {
    #private;
    /**
     *
     * @param xmlStr
     * @param defaultCfgFlag
     * @returns {$$:{id, version}, children: {paras: [...], depends: [...], actions: [...]}}}
     */
    static convertXmlToJson(xmlStr: string, defaultCfgFlag?: boolean): Promise<any>;
    /** Gets the part named `partName` from the JSON config; `optional` presumably controls whether a missing part is tolerated — confirm. */
    static getPartOfJsonCfg(jsonCfg: any, partName: string, optional?: boolean): any;
    /** Gets the parameter configurations from the JSON config. */
    static getParaCfgsFromJsonCfg(jsonCfg: any): XmlParaCfg[];
    /** Gets the attributes-in-XML map from the JSON config. */
    static getAttrsInXml(jsonCfg: any): AttrsInXml;
    /** Gets the captcha types from the JSON config. NOTE(review): method name is spelled `getCaptchTypes`; kept as declared. */
    static getCaptchTypes(jsonCfg: any): CaptchaType[];
    /** Gets the data-table map from the JSON config, or null. NOTE(review): method name is spelled `getDatableMapFromJsonCfg`; kept as declared. */
    static getDatableMapFromJsonCfg(jsonCfg: any): Map<string, DatatableColumnMap> | null;
    /** Builds the input-parameter map from a parameter string; presumably `splitStr` is the separator used to split `parasStr` — confirm. */
    static getInParas(parasStr: string, paraCfgs?: XmlParaCfg[], splitStr?: string): InParas;
    /** Converts exec data using the given data-table map (which may be null); the exact conversion is not visible here. */
    static convertExecData(origExecData: ExecData, datatableMap: Map<string, DatatableColumnMap> | null): ExecData;
}

/**
 * Static helpers for parsing and managing template configurations inside the scraper.
 * Only the signatures are visible in this declaration file.
 */
declare class TemplateManagerInScraper {
    #private;
    /** Parses an XML template string into a ParsedTemplate. */
    static parseXmlTemplate(xmlStr: string, datatableFlag?: boolean): Promise<ParsedTemplate>;
    /** Gets the parsed configuration (with `template` present) of a template, optionally from the given XML string. */
    static getTemplateConfig(templateId: number, xmlStr?: string): Promise<ParsedTemplateExt>;
    /** Clears template configuration; NOTE(review): presumably clears all templates when `templateId` is omitted — confirm. */
    static clearTemplateConfig(templateId?: number): boolean;
}

/**
 * Performs one task of a template and resolves with its TaskResult.
 * The implementation is not visible in this declaration file.
 *
 * @param templateId id of the template to execute
 * @param parasStr parameter string of the task
 * @param taskNetworkContext proxy/page/API contexts used to execute the task
 * @param taskType optional task type
 * @param xmlStr optional XML string; presumably the template XML to use instead of a stored one — confirm
 * @param taskId optional task id
 * @param useNickName optional flag; see ScraperConfig.useNickName
 */
declare function performOneTask(templateId: number, parasStr: string, taskNetworkContext: TaskNetworkContext, taskType?: TaskType, xmlStr?: string, taskId?: number, useNickName?: boolean): Promise<TaskResult>;

/**
 * Updates the scraper configuration.
 *
 * @param config new configuration values
 * @returns a promise of a boolean; NOTE(review): presumably true on success — confirm.
 */
declare function updateScraperConfig(config: ScraperConfig): Promise<boolean>;

/**
 * Entry point of the scraper.
 *
 * @param newTasks optional tasks, grouped by template
 * @param config optional scraper configuration
 * @returns a promise of a boolean; NOTE(review): presumably true on success — confirm.
 */
declare function scraper(newTasks?: TemplateTasks[], config?: ScraperConfig): Promise<boolean>;

export { type AttrsInXml, type BrowserConfig, type ExecData, type ParsedTemplate, type ScraperConfig, TaskParser, TemplateManagerInScraper, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };