@letsscrapedata/scraper
Version:
Web scraper that scrapes web pages according to a LetsScrapeData XML template
249 lines (243 loc) • 7.5 kB
text/typescript
import { BrowserControllerType, LsdBrowserType, LsdLaunchOptions, LsdConnectOptions, LsdPage, LsdApiContext, BrowserStateData } from '@letsscrapedata/controller';
import { Proxy } from '@letsscrapedata/proxy';
import { LogFunction } from '@letsscrapedata/utils';
type TemplateId = number;
type HttpHeaders = Record<string, string>;
interface ScraperStateData extends BrowserStateData {
/**
* @default {}
*/
headers: HttpHeaders;
/**
* @default {}
*/
userData: Record<string, string>;
expireTime?: number;
}
/**
* Network context used to execute the task
*/
interface TaskNetworkContext {
/**
* Proxy that is used to access the target website:
* * null only when domainId is 0, which means no network resource are required to execute the task
* @default null
*/
proxy: Proxy | null;
/**
* browser page that is used to open web pages when executing the task
* * null when domainNum is less than 0, which means no browser page is required to execute the task
* @default null
*/
page: LsdPage | null;
/**
* LsdApiContext that shares the state data within the same browser context:
* * null only when domainId is 0, which means no network resource are required to execute the task
* @default null
*/
browserApiContext: LsdApiContext | null;
/**
* Standalone LsdApiContext that shares the state data between the tasks that using the same standalone LsdApiContext:
* * null only when domainId is 0, which means no network resource are required to execute the task
* * it is not recommended to use this context unless you never use a browser to access web pages.
* @default null
*/
standaloneApiContext: LsdApiContext | null;
}
type DataRecord = Record<string, string>;
type ExecData = Record<string, DataRecord[]>;
interface Subtask {
tid: number;
parasstr: string;
idx?: number;
sapFlag?: boolean;
}
interface TaskMisc {
taskId: number;
message: string;
stack: string;
variables: Record<string, string>;
}
interface TaskData {
templateId: TemplateId;
parasStr: string;
credits: number;
execData: ExecData;
subtasks: Subtask[];
debugFiles?: Record<string, string>;
}
interface TaskResult {
taskData: TaskData;
subtaskDatas?: TaskData[];
/**
** newStateData(misc_getstatedata) in TaskContext, related stateData fo cap or dom will be replaced or updated in LSDAPP
** included if credits >= 0
*/
newStateData?: ScraperStateData;
/**
* included if credits < 0
*/
misc?: TaskMisc;
}
type TaskType = "indAsync" | "indSync" | "memSync";
interface TemplateTasks {
tid: number;
parasstrs: string[];
}
/**
* Only one of browserUrl and proxyUrl will take effect, and browserUrl has higher priority.
*/
interface BrowserConfig {
browserControllerType?: BrowserControllerType;
/**
* url used to connected the current browser
** url starts with "http://", such as "http://localhost:9222/"
** browserUrl can be used when mannaul login in advance.
*/
browserUrl?: string;
/**
* proxy
** no proxy will be used if proxyUrl is ""
** valid only if !browserUrl
** @default "none"
*/
proxyUrl?: string;
/**
* type of browser to be launched
* valid only if !browserUrl
* @default "chromium"
*/
browserType?: LsdBrowserType;
}
interface TemplatePara {
templateId: number;
/**
* code for reading or getting the template
* @default "" - only public templates can be got if readCode is ""
*/
readCode?: string;
/**
* the maximum number of concurrent tasks that can execute the same template in a browserContext
* @default 1
*/
maxConncurrency?: number;
}
type DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
interface ScraperConfig {
/**
* @default false
*/
exitWhenCompleted?: boolean;
/**
* whether to use the parasstr in XML if parasstr of a task is ""
* @default false
*/
useParasstrInXmlIfNeeded?: boolean;
/**
* whether to load unfinished tasks
* @default false
*/
loadUnfinishedTasks?: boolean;
/**
* unit: minutes
* @default 0
*/
loadFailedTasksInterval?: number;
/**
* @default "", which will use current directory of process + "/data/"
* if not empty, baseDir must be an absolute path, and the directory must exist and have read and write permissions.
*/
baseDir?: string;
/**
* where are the templates saved
* @default "", which means to get the templates from LSD server
*/
templateDir?: string;
/**
* filename in action_setvar_get/get_file must include inputFileDirePart for security.
* @default "LetsScrapeData"
*/
inputFileDirPart?: string;
/**
* wether to use puppeteer-extra-plugin-stealth, use patchright instead
* @default false
*/
useStealthPlugin?: boolean;
/**
* default browserControllerType of BrowserConfig
* @default "patchright"
*/
browserControllerType?: BrowserControllerType;
/**
* default browserType of BrowserConfig
* @default "chromium"
*/
browserType?: LsdBrowserType;
/**
* @default { headless: false }
*/
lsdLaunchOptions?: LsdLaunchOptions;
/**
* @default {browserUrl: ""}
*/
lsdConnectOptions?: LsdConnectOptions;
/**
* Important: browsers to be launched or connected using proxyUrl
* @default [{proxyUrl: ""}], launch a default browser using default type of browser controller, no proxy
*/
browserConfigs?: BrowserConfig[];
captcha?: {
/**
* clientKey of 2captcha
*/
clientKey: string;
/**
* captchas to be sovled: valid only in camoufox
* @default []
*/
captchas?: string[];
};
urlPrefix?: string;
/**
* the default maximum number of concurrent tasks that can execute the same template in a browserContext
* @default 1
*/
maxConcurrency?: number;
/**
* @default ""
*/
readCode?: string;
/**
* @default []
*/
templateParas?: TemplatePara[];
/**
* @default 10
*/
totalMaxConcurrency?: number;
/**
* min miliseconds between two tasks of the same template
* @default 2000
*/
minMiliseconds?: number;
/**
* whether to move all dat_* files into a new directory "yyyyMMddHHmmss"
* @default false
*/
moveDataWhenStart?: boolean;
/**
** DataFileFormat = "csv" | "jsonl" | "tsv" | "txt";
* @default "jsonl"
*/
dataFileFormat?: DataFileFormat;
/**
* valid only when dataFileFormat is "txt"
* @default "::"
*/
columnSeperator?: string;
}
declare function setScraperLogFun(logFun: LogFunction): boolean;
declare function performOneTask(templateId: number, parasStr: string, taskNetworkContext: TaskNetworkContext, taskType?: TaskType, xmlStr?: string, taskId?: number): Promise<TaskResult>;
declare function updateScraperConfig(config: ScraperConfig): Promise<boolean>;
declare function scraper(newTasks?: TemplateTasks[], config?: ScraperConfig): Promise<boolean>;
export { type BrowserConfig, type ExecData, type ScraperConfig, type TemplatePara, type TemplateTasks, performOneTask, scraper, setScraperLogFun, updateScraperConfig };