@letsscrapedata/scraper
Version:
Web scraper that scrapes web pages using LetsScrapeData XML templates
742 lines (731 loc) • 19.7 kB
TypeScript
import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOptions, LsdPage, LsdApiContext, BrowserStateData } from '@letsscrapedata/controller';
import { Proxy } from '@letsscrapedata/proxy';
import { LogFunction } from '@letsscrapedata/utils';
/**
 * Configuration of one template parameter as declared in an XML template.
 * All fields are strings.
 * NOTE(review): `min`/`max`/`pattern`/`alert` look like validation settings
 * and `uitype`/`listid`/`listparas` like UI hints — confirm against the
 * template documentation.
 */
interface XmlParaCfg {
  paraname: string;
  name: string;
  desc: string;
  uitype: string;
  defval: string;
  listid: string;
  listparas: string;
  min: string;
  max: string;
  pattern: string;
  alert: string;
}
/**
 * Attributes of a `break` action config; a member of `XmlActionConfig`.
 */
interface XmlActionBreakCfg {
  type: string;
  id: string;
}
/**
 * Attributes of a `captcha` action config; a member of `XmlActionConfig`.
 * NOTE(review): `try` and `errname` presumably control retries and the error
 * name reported on failure — confirm.
 */
interface XmlActionCaptchaCfg {
  type: string;
  try: string;
  errname: string;
  id: string;
}
/**
 * Attributes of a `click` action config; a member of `XmlActionConfig`.
 * Flags are booleans; every other attribute is kept as a string.
 * NOTE(review): the meaning of `eurl`/`eloc`/`pn1`/`pv1`/`pn2`/`pv2` is not
 * visible in this declaration file — confirm against the template docs.
 */
interface XmlActionClickCfg {
  type: string;
  wait: string;
  gen: boolean;
  cap: boolean;
  clicktype: string;
  try: string;
  errname: string;
  datapage: string;
  popupsubtask: boolean;
  login: boolean;
  captcha: boolean;
  navigate: boolean;
  eurl: string;
  eloc: string;
  pn1: string;
  pv1: string;
  pn2: string;
  pv2: string;
  downloadpath: string;
  filename: string;
  pathvarname: string;
  id: string;
}
/**
 * Attributes of a `continue` action config; a member of `XmlActionConfig`.
 */
interface XmlActionContinueCfg {
  type: string;
  id: string;
}
/**
 * Attributes of a `misc` action config; a member of `XmlActionConfig`.
 */
interface XmlActionMiscCfg {
  type: string;
  id: string;
}
/**
 * Attributes of an `exit` action config; a member of `XmlActionConfig`.
 */
interface XmlActionExitCfg {
  type: string;
  errname: string;
  id: string;
}
/**
 * Attributes of an `extract` action config; a member of `XmlActionConfig`.
 * NOTE(review): `tabname` presumably names the target data table — confirm.
 */
interface XmlActionExtractCfg {
  type: string;
  tabname: string;
  id: string;
}
/**
 * Attributes of a `goto` action config; a member of `XmlActionConfig`.
 * Flags are booleans; every other attribute, including `headers`, is kept as
 * a string.
 * NOTE(review): the encoding of `headers` and the meaning of
 * `eurl`/`eloc`/`pn*`/`pv*` are not visible here — confirm.
 */
interface XmlActionGotoCfg {
  type: string;
  url: string;
  reuse: boolean;
  wait: string;
  encodeuri: boolean;
  gen: boolean;
  cap: boolean;
  datapage: string;
  popupsubtask: boolean;
  login: boolean;
  captcha: boolean;
  eurl: string;
  eloc: string;
  pn1: string;
  pv1: string;
  pn2: string;
  pv2: string;
  headers: string;
  referer: string;
  id: string;
}
/**
 * Attributes of a `hover` action config; a member of `XmlActionConfig`.
 */
interface XmlActionHoverCfg {
  type: string;
  try: string;
  gen: boolean;
  cap: boolean;
  wait: string;
  errname: string;
  id: string;
}
/**
 * Attributes of an `ifelse` action config; a member of `XmlActionConfig`.
 */
interface XmlActionIfelseCfg {
  type: string;
  id: string;
}
/**
 * Attributes of an `input` action config; a member of `XmlActionConfig`.
 * NOTE(review): `content` is presumably the text to type, and
 * `enter`/`replace` flags that modify how it is typed — confirm.
 */
interface XmlActionInputCfg {
  type: string;
  content: string;
  enter: boolean;
  replace: boolean;
  gen: boolean;
  cap: boolean;
  try: string;
  wait: string;
  errname: string;
  datapage: string;
  popupsubtask: boolean;
  login: boolean;
  captcha: boolean;
  eurl: string;
  eloc: string;
  pn1: string;
  pv1: string;
  pn2: string;
  pv2: string;
  id: string;
}
/**
 * Attributes of an `intercept`/`clear` action config; a member of
 * `XmlActionConfig`. Unlike most action configs it declares no `id`.
 */
interface XmlActionInterceptClearCfg {
  type: string;
  subtype: string;
}
/**
 * Attributes of an `intercept`/`set` action config; a member of
 * `XmlActionConfig`. Unlike most action configs it declares no `id`.
 */
interface XmlActionInterceptSetCfg {
  type: string;
  subtype: string;
}
/**
 * Attributes of a `loopdowhile`/`element` action config; a member of
 * `XmlActionConfig`.
 * NOTE(review): `iswhile` presumably selects while- vs do-while semantics and
 * `maxloops` caps the iteration count — confirm.
 */
interface XmlActionLoopdowhileElementCfg {
  type: string;
  subtype: string;
  iswhile: boolean;
  varname: string;
  maxloops: string;
  click: boolean;
  navigate: boolean;
  gen: boolean;
  cap: boolean;
  errname: string;
  wait: string;
  id: string;
}
/**
 * Attributes of a `loopdowhile`/`templstr` action config; a member of
 * `XmlActionConfig`.
 */
interface XmlActionLoopdowhileTemplstrCfg {
  type: string;
  subtype: string;
  iswhile: boolean;
  varname: string;
  maxloops: string;
  id: string;
}
/**
 * Attributes of a `loopfor` action config; a member of `XmlActionConfig`.
 * The bounds `from`/`to`/`step` are kept as strings.
 * NOTE(review): the allowed values of `roundtype` are not visible here.
 */
interface XmlActionLoopforCfg {
  type: string;
  from: string;
  to: string;
  step: string;
  roundtype: string;
  varname: string;
  maxloops: string;
  errname: string;
  id: string;
}
/**
 * Attributes of a `loopineles` action config; a member of `XmlActionConfig`.
 * NOTE(review): presumably iterates over matched elements, with
 * `start`/`end`/`step` selecting a slice — confirm.
 */
interface XmlActionLoopinelesCfg {
  type: string;
  varname: string;
  maxloops: string;
  start: string;
  end: string;
  step: string;
  errname: string;
  id: string;
}
/**
 * Attributes of a `loopinstr` action config; a member of `XmlActionConfig`.
 * NOTE(review): presumably `list` is split by `split` and optionally trimmed
 * (`trim`) to produce the loop values — confirm.
 */
interface XmlActionLoopinstrCfg {
  type: string;
  list: string;
  split: string;
  varname: string;
  maxloops: string;
  trim: boolean;
  errname: string;
  id: string;
}
/**
 * Attributes of a `scroll`/`by` action config; a member of `XmlActionConfig`.
 * NOTE(review): `height` + `unit` presumably give the scroll distance, and
 * `maxtimes`/`interval` the repetition — confirm.
 */
interface XmlActionScrollByCfg {
  type: string;
  subtype: string;
  height: string;
  unit: string;
  maxtimes: string;
  interval: string;
  gen: boolean;
  cap: boolean;
  datapage: string;
  popupsubtask: boolean;
  login: boolean;
  captcha: boolean;
  eurl: string;
  eloc: string;
  pn1: string;
  pv1: string;
  pn2: string;
  pv2: string;
  id: string;
}
/**
 * Attributes of a `scroll`/`intoview` action config; a member of
 * `XmlActionConfig`.
 */
interface XmlActionScrollIntoviewCfg {
  type: string;
  subtype: string;
  gen: boolean;
  cap: boolean;
  errname: string;
  datapage: string;
  popupsubtask: boolean;
  login: boolean;
  captcha: boolean;
  eurl: string;
  eloc: string;
  pn1: string;
  pv1: string;
  pn2: string;
  pv2: string;
  id: string;
}
/**
 * Attributes of a `scroll`/`to` action config; a member of `XmlActionConfig`.
 */
interface XmlActionScrollToCfg {
  type: string;
  subtype: string;
  height: string;
  unit: string;
  gen: boolean;
  cap: boolean;
  datapage: string;
  popupsubtask: boolean;
  login: boolean;
  captcha: boolean;
  eurl: string;
  eloc: string;
  pn1: string;
  pv1: string;
  pn2: string;
  pv2: string;
  id: string;
}
/**
 * Attributes of a `select` action config; a member of `XmlActionConfig`.
 * NOTE(review): `selecttype`/`selectval` presumably say how an option is
 * chosen and which one — confirm.
 */
interface XmlActionSelectCfg {
  type: string;
  selecttype: string;
  selectval: string;
  gen: boolean;
  cap: boolean;
  try: string;
  wait: string;
  errname: string;
  datapage: string;
  popupsubtask: boolean;
  login: boolean;
  captcha: boolean;
  eurl: string;
  eloc: string;
  pn1: string;
  pv1: string;
  pn2: string;
  pv2: string;
  id: string;
}
/**
 * Attributes of a `setvar`/`dbquery` action config; a member of
 * `XmlActionConfig`.
 * NOTE(review): `pattern` + `flags` look like a regular expression applied to
 * the value — confirm.
 */
interface XmlActionSetvarDbqueryCfg {
  type: string;
  subtype: string;
  varname: string;
  defaultval: string;
  valerrname: string;
  pattern: string;
  flags: string;
  path: string;
  id: string;
}
/**
 * Attributes of a `setvar`/`element` action config; a member of
 * `XmlActionConfig`. Of the setvar configs, only this one declares `try`.
 */
interface XmlActionSetvarElementCfg {
  type: string;
  subtype: string;
  varname: string;
  defaultval: string;
  try: string;
  valerrname: string;
  pattern: string;
  flags: string;
  path: string;
  id: string;
}
/**
 * Attributes of a `setvar`/`file` action config; a member of
 * `XmlActionConfig`. Of the setvar configs, only this one declares the
 * boolean `proxy` flag.
 */
interface XmlActionSetvarFileCfg {
  type: string;
  subtype: string;
  varname: string;
  defaultval: string;
  proxy: boolean;
  valerrname: string;
  pattern: string;
  flags: string;
  path: string;
  id: string;
}
/**
 * Attributes of a `setvar`/`get` action config; a member of
 * `XmlActionConfig`.
 */
interface XmlActionSetvarGetCfg {
  type: string;
  subtype: string;
  varname: string;
  defaultval: string;
  valerrname: string;
  pattern: string;
  flags: string;
  path: string;
  id: string;
}
/**
 * Attributes of a `setvar`/`ocr` action config; a member of
 * `XmlActionConfig`.
 */
interface XmlActionSetvarOcrCfg {
  type: string;
  subtype: string;
  varname: string;
  defaultval: string;
  valerrname: string;
  pattern: string;
  flags: string;
  path: string;
  id: string;
}
/**
 * Attributes of a `setvar`/`subtask` action config; a member of
 * `XmlActionConfig`.
 */
interface XmlActionSetvarSubtaskCfg {
  type: string;
  subtype: string;
  varname: string;
  defaultval: string;
  valerrname: string;
  pattern: string;
  flags: string;
  path: string;
  id: string;
}
/**
 * Attributes of a `setvar`/`templstr` action config; a member of
 * `XmlActionConfig`.
 */
interface XmlActionSetvarTemplstrCfg {
  type: string;
  subtype: string;
  varname: string;
  defaultval: string;
  valerrname: string;
  pattern: string;
  flags: string;
  path: string;
  id: string;
}
/**
 * Attributes of a `subtask` action config; a member of `XmlActionConfig`.
 * NOTE(review): the encoding of `subtasks` and the effect of `sameasparent`
 * are not visible in this declaration file.
 */
interface XmlActionSubtaskCfg {
  type: string;
  subtasks: string;
  sameasparent: boolean;
  id: string;
}
/**
 * Attributes of a `wait`/`element` action config; a member of
 * `XmlActionConfig`.
 * NOTE(review): the unit of `timeout` and the allowed `state` values are not
 * visible here.
 */
interface XmlActionWaitElementCfg {
  type: string;
  subtype: string;
  timeout: string;
  state: string;
  errname: string;
  wait: string;
  id: string;
}
/**
 * Attributes of a `wait`/`navigation` action config; a member of
 * `XmlActionConfig`.
 * NOTE(review): the unit of `timeout` and the allowed `waituntil` values are
 * not visible here.
 */
interface XmlActionWaitNavigationCfg {
  type: string;
  subtype: string;
  timeout: string;
  waituntil: string;
  url: string;
  errname: string;
  wait: string;
  id: string;
}
/**
 * Attributes of a `wait`/`sleep` action config; a member of
 * `XmlActionConfig`.
 * NOTE(review): `minms`/`maxms` presumably bound a sleep duration in
 * milliseconds — confirm.
 */
interface XmlActionWaitSleepCfg {
  type: string;
  subtype: string;
  minms: string;
  maxms: string;
  errname: string;
  wait: string;
  id: string;
}
/**
 * SVG font configuration from an XML template; used as the optional
 * `fontsvgCfg` of `FontsConfig`.
 * NOTE(review): field semantics (`exloc`, `inloc`, `csmaptype`, `bsfilter`)
 * are not visible in this declaration file.
 */
interface XmlFontsvgCfg {
  exloc: string;
  inloc: string;
  csmaptype: string;
  bsfilter: string;
}
/**
 * Font selector entry from an XML template; stored by key in
 * `FontsConfig.fontselectorConfig`.
 */
interface XmlFontselectorCfg {
  name: string;
  fontfamily: string;
}
/**
 * Font family entry from an XML template; stored by key in
 * `FontsConfig.fontfamilyConfig`.
 * NOTE(review): `fontcodes`/`fontchars` presumably name entries in the
 * font-codes and font-chars configs — confirm.
 */
interface XmlFontfamilyCfg {
  name: string;
  fontcodes: string;
  fontchars: string;
}
/**
 * Font codes entry from an XML template; stored by key in
 * `FontsConfig.fontcodesConfig`.
 */
interface XmlFontcodesCfg {
  name: string;
  codes: string;
}
/**
 * Font chars entry from an XML template; stored by key in
 * `FontsConfig.fontcharsConfig`.
 */
interface XmlFontcharsCfg {
  name: string;
  chars: string;
}
/**
 * Union of every action config shape that can appear in
 * `ParsedTemplate.actionConfigs`.
 */
type XmlActionConfig =
  | XmlActionBreakCfg
  | XmlActionCaptchaCfg
  | XmlActionClickCfg
  | XmlActionContinueCfg
  | XmlActionMiscCfg
  | XmlActionExitCfg
  | XmlActionExtractCfg
  | XmlActionGotoCfg
  | XmlActionHoverCfg
  | XmlActionIfelseCfg
  | XmlActionInputCfg
  | XmlActionInterceptClearCfg
  | XmlActionInterceptSetCfg
  | XmlActionLoopdowhileElementCfg
  | XmlActionLoopdowhileTemplstrCfg
  | XmlActionLoopforCfg
  | XmlActionLoopinelesCfg
  | XmlActionLoopinstrCfg
  | XmlActionScrollByCfg
  | XmlActionScrollIntoviewCfg
  | XmlActionScrollToCfg
  | XmlActionSelectCfg
  | XmlActionSetvarDbqueryCfg
  | XmlActionSetvarElementCfg
  | XmlActionSetvarFileCfg
  | XmlActionSetvarGetCfg
  | XmlActionSetvarOcrCfg
  | XmlActionSetvarSubtaskCfg
  | XmlActionSetvarTemplstrCfg
  | XmlActionSubtaskCfg
  | XmlActionWaitElementCfg
  | XmlActionWaitNavigationCfg
  | XmlActionWaitSleepCfg;
/**
 * Captcha kinds grouped under the "token" category; one half of
 * `CaptchaType`.
 */
type TokenCaptchaType =
  | "amazon"
  | "funcaptcha"
  | "geetest"
  | "keycaptcha"
  | "mtcaptcha"
  | "recaptcha"
  | "turnstile";
/**
 * Captcha kinds grouped under the "recognition" category; one half of
 * `CaptchaType`.
 */
type RecognitionCaptchaType =
  | "text"
  | "coordinate"
  | "grid"
  | "slider"
  | "rotation";
/** Any supported captcha kind: token-based or recognition-based. */
type CaptchaType = TokenCaptchaType | RecognitionCaptchaType;
/** Numeric identifier of a template. */
type TemplateId = number;
/** Numeric identifier of a domain. */
type DomainId = number;
/** HTTP headers as a header-name to header-value string map. */
type HttpHeaders = Record<string, string>;
/**
 * Scraper-level state: the controller's `BrowserStateData` plus HTTP headers
 * and free-form user data.
 */
interface ScraperStateData extends BrowserStateData {
  /**
   * @default {}
   */
  headers: HttpHeaders;
  /**
   * @default {}
   */
  userData: Record<string, string>;
}
/** Input parameters as a string-to-string map, as returned by `TaskParser.getInParas`. */
type InParas = Record<string, string>;
/**
 * TTF font configuration; used as the optional `fontttfConfig` of
 * `FontsConfig`.
 * NOTE(review): `minuc`/`maxuc` look like a Unicode code-point range and
 * `startidx` like a starting index — confirm against the template docs.
 */
interface FontttfConfig {
  exloc: string;
  inloc: string;
  minuc: number;
  maxuc: number;
  startidx: number;
  fsfilter: string;
  fufilter: string;
  parsetype: string;
}
/**
 * Fonts config in XML.
 * * length: 0 ~ 20, default []
 *
 * NOTE(review): the "length"/"default []" remark is carried over from the
 * original comment; this type is an object of keyed records, so confirm what
 * the remark refers to.
 */
interface FontsConfig {
  fontselectorConfig: Record<string, XmlFontselectorCfg>;
  fontfamilyConfig: Record<string, XmlFontfamilyCfg>;
  fontcodesConfig: Record<string, XmlFontcodesCfg>;
  fontcharsConfig: Record<string, XmlFontcharsCfg>;
  fontsvgCfg?: XmlFontsvgCfg;
  fontttfConfig?: FontttfConfig;
}
/** Where elements are read from: "browser" or "cheerio". */
type ElementSource =
  | "browser"
  | "cheerio";
/**
 * A template as held by the scraper: its ids, execution limits and the raw
 * configuration text.
 * NOTE(review): `configDetail` is presumably the template's XML string —
 * confirm.
 */
interface TemplateInScraper {
  templateId: TemplateId;
  domainId: DomainId;
  /**
   * @default "browser"
   */
  defaultElementSource: ElementSource;
  /**
   * @default 600 seconds
   */
  maxExecutionDuration: number;
  configDetail: string;
  capName?: string;
}
/** Attributes read from the XML template, as a name-to-value string map. */
type AttrsInXml = Record<string, string>;
/**
 * String-to-string map for the columns of one data table.
 * NOTE(review): key/value direction (e.g. column name to nickname) is not
 * visible in this declaration file.
 */
type DatatableColumnMap = Map<string, string>;
/**
 * Result of parsing an XML template (see
 * `TemplateManagerInScraper.parseXmlTemplate`): action and parameter configs,
 * optional font settings, XML attributes, captcha kinds and data-table
 * mappings, plus bookkeeping timestamps.
 * NOTE(review): the unit of `lastUsedTime`/`lastCheckTime` is not visible
 * here.
 */
interface ParsedTemplate {
  actionConfigs: XmlActionConfig[];
  paraCfgs: XmlParaCfg[];
  fontsConfig: FontsConfig | null;
  attrsInXml: AttrsInXml;
  captchaTypes: CaptchaType[];
  lastUsedTime: number;
  lastCheckTime: number;
  datatableMap: Map<string, DatatableColumnMap> | null;
  template?: TemplateInScraper;
}
/** `ParsedTemplate` with every optional property (i.e. `template`) made required. */
type ParsedTemplateExt = Required<ParsedTemplate>;
/**
 * Network context used to execute the task.
 */
interface TaskNetworkContext {
  /**
   * Proxy that is used to access the target website:
   * * null only when domainId is 0, which means no network resources are
   *   required to execute the task
   * @default null
   */
  proxy: Proxy | null;
  /**
   * Browser page that is used to open web pages when executing the task:
   * * null when domainNum is less than 0, which means no browser page is
   *   required to execute the task
   *
   * NOTE(review): the sibling fields phrase their condition as
   * "domainId is 0"; confirm whether "domainNum is less than 0" is intended.
   * @default null
   */
  page: LsdPage | null;
  /**
   * LsdApiContext that shares the state data within the same browser context:
   * * null only when domainId is 0, which means no network resources are
   *   required to execute the task
   * @default null
   */
  browserApiContext: LsdApiContext | null;
  /**
   * Standalone LsdApiContext that shares the state data between the tasks
   * that use the same standalone LsdApiContext:
   * * null only when domainId is 0, which means no network resources are
   *   required to execute the task
   * * it is not recommended to use this context unless you never use a
   *   browser to access web pages.
   * @default null
   */
  standaloneApiContext: LsdApiContext | null;
}
/** One data record as a string-to-string map. */
type DataRecord = Record<string, string>;
/**
 * Data produced by a task: lists of records grouped under string keys.
 * NOTE(review): keys are presumably data-table names — confirm.
 */
type ExecData = Record<string, DataRecord[]>;
/**
 * A subtask, identified by a template id (`tid`) and a parameter string
 * (`parasstr`).
 * NOTE(review): the meaning of `idx` and `sapFlag` is not visible in this
 * declaration file.
 */
interface Subtask {
  tid: number;
  parasstr: string;
  idx?: number;
  sapFlag?: boolean;
}
/**
 * Diagnostic details attached to a task result: the task id, a message and
 * stack, and a map of variables (see `TaskResult.misc`).
 */
interface TaskMisc {
  taskId: number;
  message: string;
  stack: string;
  variables: Record<string, string>;
}
/**
 * Outcome of one task: the template and parameter string it ran with, its
 * credits, the data it produced and the subtasks it generated.
 */
interface TaskData {
  templateId: TemplateId;
  parasStr: string;
  credits: number;
  execData: ExecData;
  subtasks: Subtask[];
}
/**
 * Result returned for a performed task (see `performOneTask`): the task's
 * own data, optional data of its subtasks, and either new state data or
 * diagnostic details depending on the sign of `credits`.
 */
interface TaskResult {
  taskData: TaskData;
  subtaskDatas?: TaskData[];
  /**
   * included if credits >= 0
   */
  newStateData?: ScraperStateData;
  /**
   * included if credits < 0
   */
  misc?: TaskMisc;
}
/**
 * Kind of task accepted by `performOneTask`.
 * NOTE(review): the meaning of each value is not visible in this declaration
 * file.
 */
type TaskType =
  | "indAsync"
  | "indSync"
  | "memSync";
/**
 * A batch of tasks for one template: the template id (`tid`) and one
 * parameter string per task (`parasstrs`). Accepted by `scraper`.
 */
interface TemplateTasks {
  tid: number;
  parasstrs: string[];
}
/**
 * Describes one browser to connect to or launch.
 * Only one of browserUrl and proxyUrl will take effect, and browserUrl has
 * higher priority.
 */
interface BrowserConfig {
  browserControllerType?: BrowserControllerType;
  /**
   * url used to connect to the current browser
   * * url starts with "http://", such as "http://localhost:9222/"
   * * browserUrl can be used when logging in manually in advance.
   */
  browserUrl?: string;
  /**
   * proxy
   * * no proxy will be used if proxyUrl is ""
   * * valid only if !browserUrl
   */
  proxyUrl?: string;
  /**
   * type of browser to be launched
   * * valid only if !browserUrl
   * @default "chromium"
   */
  browserType?: LsdBrowserType;
}
/**
 * Per-template settings supplied through `ScraperConfig.templateParas`.
 */
interface TemplatePara {
  templateId: number;
  /**
   * code for reading or getting the template
   * @default "" - only public templates can be got if readCode is ""
   */
  readCode?: string;
  /**
   * the maximum number of concurrent tasks that can execute the same template in a browserContext
   *
   * NOTE: the property name is spelled `maxConncurrency` (double "n"), unlike
   * `ScraperConfig.maxConcurrency`; it is kept as declared.
   * @default 1
   */
  maxConncurrency?: number;
}
/** Output data file formats selectable through `ScraperConfig.dataFileFormat`. */
type DataFileFormat =
  | "csv"
  | "jsonl"
  | "tsv"
  | "txt";
/**
 * Options accepted by `scraper` and `updateScraperConfig`. Every property is
 * optional; documented defaults apply when a property is omitted.
 */
interface ScraperConfig {
  /**
   * @default false
   */
  exitWhenCompleted?: boolean;
  /**
   * whether to use the parasstr in XML if parasstr of a task is ""
   * @default false
   */
  useParasstrInXmlIfNeeded?: boolean;
  /**
   * whether to load unfinished tasks
   * @default false
   */
  loadUnfinishedTasks?: boolean;
  /**
   * @default "", which will use current directory of process + "/data/"
   * if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
   */
  baseDir?: string;
  /**
   * where the templates are saved
   * @default "", which means to get the templates from LSD server
   */
  templateDir?: string;
  /**
   * filename in action_setvar_get/get_file must include inputFileDirPart for security.
   * @default "LetsScrapeData"
   */
  inputFileDirPart?: string;
  /**
   * whether to use puppeteer-extra-plugin-stealth (original note: use patchright instead)
   * @default false
   */
  useStealthPlugin?: boolean;
  /**
   * default browserControllerType of BrowserConfig
   * @default "patchright"
   */
  browserControllerType?: BrowserControllerType;
  /**
   * default browserType of BrowserConfig
   * @default "chromium"
   */
  browserType?: LsdBrowserType;
  /**
   * @default { headless: false }
   */
  lsdLaunchOptions?: LsdLaunchOptions;
  /**
   * @default {browserUrl: ""}
   */
  lsdConnectOptions?: LsdConnectOptions;
  /**
   * Important: browsers to be launched or connected using proxyUrl
   * @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
   */
  browserConfigs?: BrowserConfig[];
  captcha?: {
    /**
     * clientKey of 2captcha
     */
    clientKey: string;
  };
  urlPrefix?: string;
  /**
   * the default maximum number of concurrent tasks that can execute the same template in a browserContext
   * @default 1
   */
  maxConcurrency?: number;
  /**
   * @default ""
   */
  readCode?: string;
  /**
   * @default []
   */
  templateParas?: TemplatePara[];
  /**
   * @default 10
   */
  totalMaxConcurrency?: number;
  /**
   * minimum milliseconds between two tasks of the same template
   * @default 2000
   */
  minMiliseconds?: number;
  /**
   * whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
   * @default false
   */
  moveDataWhenStart?: boolean;
  /**
   * DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
   * @default "jsonl"
   */
  dataFileFormat?: DataFileFormat;
  /**
   * valid only when dataFileFormat is "jsonl"
   * @default true
   */
  useNickName?: boolean;
  /**
   * valid only when dataFileFormat is "txt"
   * @default "::"
   */
  columnSeperator?: string;
}
/**
 * Registers the log function for the scraper to use.
 *
 * @param logFun - log function to install
 * @returns a boolean; NOTE(review): presumably whether the function was
 * accepted — confirm against the implementation.
 */
declare function setScraperLogFun(
  logFun: LogFunction,
): boolean;
/** Modify the file node_modules/xml2js/lib/parser.js by adding the following code (it automatically adds the type and subtype attributes based on the tag name, e.g. action_setvar_element gets type="setvar" subtype="element"):
//////// start of LSD added by Joe ////////////////////////////////////////////////////////////////////////////////////////////
// to be able to add attributes here and later(defaultElementCfg.js), set obj[attrkey] if undefined
if(!obj[attrkey]){
obj[attrkey] = {}
}
const subTags = node.name.split("_")
if(subTags.length > 1 && typeof obj[attrkey]["type"] ==="undefined"){
obj[attrkey]["type"] = subTags[1]
}
if(subTags.length > 2 && typeof obj[attrkey]["subtype"] ==="undefined"){
obj[attrkey]["subtype"] = subTags[2]
}
//////// end of LSD added by Joe ////////////////////////////////////////////////////////////////////////////////////////////
obj["#name"] = _this.options.tagNameProcessors ? processItem(_this.options.tagNameProcessors, node.name) : node.name;
*/
/**
 * TaskParser: static helpers that turn an XML template into its JSON form
 * and read individual parts (parameter configs, XML attributes, captcha
 * types, data-table maps) out of that JSON.
 */
declare class TaskParser {
  #private;

  /**
   * Converts an XML template string to its JSON representation.
   *
   * @param xmlStr - XML template text
   * @param defaultCfgFlag - optional flag; NOTE(review): presumably controls
   * whether default config values are applied — confirm
   * @returns a promise of an object shaped like
   * `{$$: {id, version}, children: {paras: [...], depends: [...], actions: [...]}}`
   */
  static convertXmlToJson(xmlStr: string, defaultCfgFlag?: boolean): Promise<any>;

  /** Returns the part of `jsonCfg` named `partName`; `optional` marks the part as not required. */
  static getPartOfJsonCfg(jsonCfg: any, partName: string, optional?: boolean): any;

  /** Reads the parameter configs out of `jsonCfg`. */
  static getParaCfgsFromJsonCfg(jsonCfg: any): XmlParaCfg[];

  /** Reads the XML attributes out of `jsonCfg`. */
  static getAttrsInXml(jsonCfg: any): AttrsInXml;

  /** Reads the captcha types out of `jsonCfg`. */
  static getCaptchTypes(jsonCfg: any): CaptchaType[];

  /** Reads the data-table column maps out of `jsonCfg`; may be `null`. */
  static getDatableMapFromJsonCfg(jsonCfg: any): Map<string, DatatableColumnMap> | null;

  /**
   * Builds the input-parameter map from a parameter string.
   *
   * @param parasStr - parameter string of a task
   * @param paraCfgs - optional parameter configs of the template
   * @param splitStr - optional separator; NOTE(review): default not visible here
   */
  static getInParas(parasStr: string, paraCfgs?: XmlParaCfg[], splitStr?: string): InParas;

  /**
   * Converts execution data using the given data-table maps.
   *
   * @param origExecData - execution data to convert
   * @param datatableMap - data-table column maps, or `null`
   */
  static convertExecData(origExecData: ExecData, datatableMap: Map<string, DatatableColumnMap> | null): ExecData;
}
/**
 * Static helpers for parsing templates and for getting or clearing template
 * configs by template id.
 */
declare class TemplateManagerInScraper {
  #private;

  /**
   * Parses an XML template string.
   *
   * @param xmlStr - XML template text
   * @param datatableFlag - optional flag; NOTE(review): presumably controls
   * whether data-table maps are produced — confirm
   */
  static parseXmlTemplate(xmlStr: string, datatableFlag?: boolean): Promise<ParsedTemplate>;

  /**
   * Resolves the parsed config of a template, with `template` populated.
   *
   * @param templateId - id of the template
   * @param xmlStr - optional XML text of the template
   */
  static getTemplateConfig(templateId: number, xmlStr?: string): Promise<ParsedTemplateExt>;

  /**
   * Clears template config.
   *
   * @param templateId - optional id; NOTE(review): presumably all configs are
   * cleared when omitted — confirm
   */
  static clearTemplateConfig(templateId?: number): boolean;
}
/**
 * Performs one task of a template and resolves to its result.
 *
 * @param templateId - id of the template to execute
 * @param parasStr - parameter string of the task
 * @param taskNetworkContext - proxy, page and API contexts used by the task
 * @param taskType - optional task type
 * @param xmlStr - optional XML text of the template
 * @param taskId - optional task id
 * @param useNickName - optional flag; NOTE(review): presumably matches
 * `ScraperConfig.useNickName` — confirm
 */
declare function performOneTask(
  templateId: number,
  parasStr: string,
  taskNetworkContext: TaskNetworkContext,
  taskType?: TaskType,
  xmlStr?: string,
  taskId?: number,
  useNickName?: boolean,
): Promise<TaskResult>;
/**
 * Updates the scraper configuration.
 *
 * @param config - configuration values to apply
 * @returns a promise of a boolean; NOTE(review): presumably success — confirm
 */
declare function updateScraperConfig(
  config: ScraperConfig,
): Promise<boolean>;
/**
 * Main entry point of the scraper.
 *
 * @param newTasks - optional batches of tasks, grouped by template
 * @param config - optional scraper configuration
 * @returns a promise of a boolean; NOTE(review): presumably success — confirm
 */
declare function scraper(
  newTasks?: TemplateTasks[],
  config?: ScraperConfig,
): Promise<boolean>;
// Public API of the package: type-only exports are marked with `type`.
export {
  type AttrsInXml,
  type BrowserConfig,
  type ExecData,
  type ParsedTemplate,
  type ScraperConfig,
  TaskParser,
  TemplateManagerInScraper,
  type TemplatePara,
  type TemplateTasks,
  performOneTask,
  scraper,
  setScraperLogFun,
  updateScraperConfig,
};