/*
 * x-crawl
 * Version:
 * x-crawl is a flexible Node.js AI-assisted crawler library.
 * 328 lines (276 loc) • 7.32 kB
 * TypeScript
 */
/// <reference types="node" />
import { IncomingHttpHeaders } from 'node:http'
import {
Viewport,
PuppeteerLaunchOptions,
HTTPResponse,
Page,
Browser,
Protocol
} from 'puppeteer'
export * from 'puppeteer'
/* Common */
/**
 * Loosely typed bag: any string, number or symbol key maps to a value of
 * any type. Used in this file for request `headers` and `params`.
 */
export interface AnyObject {
  [key: string]: any
  [key: number]: any
  [key: symbol]: any
}
/* API Config */
// API crawl config
// API crawl config other
/**
 * Interval setting: either a single number or a `{ max, min }` range in
 * which only `max` is mandatory.
 * NOTE(review): the unit (presumably milliseconds) is not visible here — confirm.
 */
export type IntervalTime = { max: number; min?: number } | number
/**
 * HTTP method names accepted by `CrawlDataDetailTargetConfig.method`.
 * Every name is allowed in both its lower-case and upper-case spelling.
 */
export type Method =
  // lower-case spellings
  | 'get'
  | 'delete'
  | 'head'
  | 'options'
  | 'post'
  | 'put'
  | 'patch'
  | 'purge'
  | 'link'
  | 'unlink'
  // upper-case spellings
  | 'GET'
  | 'DELETE'
  | 'HEAD'
  | 'OPTIONS'
  | 'POST'
  | 'PUT'
  | 'PATCH'
  | 'PURGE'
  | 'LINK'
  | 'UNLINK'
/**
 * Cookie input for a page crawl: a plain string, one Puppeteer
 * `Protocol.Network.CookieParam`, or an array of them.
 */
export type PageCookies =
  | string
  | Protocol.Network.CookieParam
  | Array<Protocol.Network.CookieParam>
/**
 * Platform names that may be set on a fingerprint's `platform` field
 * (see `DetailTargetFingerprintCommon`).
 */
export type Platform =
  // mobile
  | 'Android'
  | 'iOS'
  // desktop
  | 'Chrome OS'
  | 'Chromium OS'
  | 'Linux'
  | 'macOS'
  | 'Windows'
  // fallback
  | 'Unknown'
/**
 * Fingerprint fields shared by every crawl kind in this file. All fields are
 * optional.
 * NOTE(review): the field names suggest these feed the `sec-ch-ua*`,
 * `accept-language` and `user-agent` request headers — confirm against the
 * implementation, which is not visible here.
 */
export interface DetailTargetFingerprintCommon {
  ua?: string
  /** `'?0'`, `'?1'`, or `'random'`. */
  mobile?: '?0' | '?1' | 'random'
  platform?: Platform
  platformVersion?: string
  acceptLanguage?: string
  userAgent?: {
    value: string
    /** Optional per-name bounds on major / minor / patch version numbers. */
    versions?: Array<{
      name: string
      maxMajorVersion?: number
      minMajorVersion?: number
      maxMinorVersion?: number
      minMinorVersion?: number
      maxPatchVersion?: number
      minPatchVersion?: number
    }>
  }
}
/**
 * Options common to every configuration in this file: the create config,
 * the detail target configs and the advanced configs all extend it.
 * Each option may be omitted or explicitly set to `null`.
 */
export interface CrawlCommonConfig {
  // NOTE(review): unit (presumably milliseconds) is not visible here — confirm.
  timeout?: number | null
  maxRetry?: number | null
  proxy?: {
    /** Proxy URLs; the only mandatory field once `proxy` is given. */
    urls: string[]
    switchByHttpStatus?: number[]
    switchByErrorCount?: number
  } | null
}
// CreateCrawlConfig
/**
 * Options accepted by `createCrawl`, on top of the shared
 * `CrawlCommonConfig` options.
 */
export interface CreateCrawlConfig extends CrawlCommonConfig {
  mode?: 'async' | 'sync'
  enableRandomFingerprint?: boolean
  baseUrl?: string
  intervalTime?: IntervalTime
  /** A single boolean, or individual switches for start / process / result. */
  log?:
    | boolean
    | {
        start?: boolean
        process?: boolean
        result?: boolean
      }
  crawlPage?: {
    /** Puppeteer's own launch options type. */
    puppeteerLaunchOptions?: PuppeteerLaunchOptions
  }
}
// 1.Detail target
/**
 * Options for a single `crawlPage` target. Only `url` is required.
 */
export interface CrawlPageDetailTargetConfig extends CrawlCommonConfig {
  url: string
  priority?: number
  headers?: AnyObject | null
  cookies?: PageCookies | null
  viewport?: Viewport | null
  /** Common fingerprint fields plus optional width / height bounds. */
  fingerprint?:
    | (DetailTargetFingerprintCommon & {
        maxWidth?: number
        minWidth?: number
        maxHeight?: number
        // NOTE(review): `minHidth` looks like a misspelling of `minHeight`.
        // Kept verbatim because the code that reads it is not visible here;
        // confirm before renaming.
        minHidth?: number
      })
    | null
}
/**
 * Options for a single `crawlHTML` target. Only `url` is required.
 */
export interface CrawlHTMLDetailTargetConfig extends CrawlCommonConfig {
  url: string
  priority?: number
  headers?: AnyObject | null
  fingerprint?: DetailTargetFingerprintCommon | null
}
/**
 * Options for a single `crawlData` target. Only `url` is required.
 */
export interface CrawlDataDetailTargetConfig extends CrawlCommonConfig {
  url: string
  method?: Method
  priority?: number
  headers?: AnyObject | null
  params?: AnyObject
  /** Untyped payload (`any`). */
  data?: any
  fingerprint?: DetailTargetFingerprintCommon | null
}
/**
 * Options for a single `crawlFile` target. Only `url` is required; the three
 * storage-related fields may be omitted or set to `null`.
 */
export interface CrawlFileDetailTargetConfig extends CrawlCommonConfig {
  url: string
  priority?: number
  headers?: AnyObject | null
  fingerprint?: DetailTargetFingerprintCommon | null
  storeDir?: string | null
  fileName?: string | null
  extension?: string | null
}
// 2.Advanced
/**
 * Batch configuration for `crawlPage`: a list of targets plus settings
 * declared once at this level.
 * NOTE(review): how these top-level values combine with per-target values is
 * decided by the implementation, which is not visible here — confirm.
 */
export interface CrawlPageAdvancedConfig extends CrawlCommonConfig {
  /** Each entry is a plain string or a detail target config. */
  targets: Array<string | CrawlPageDetailTargetConfig>
  intervalTime?: IntervalTime
  headers?: AnyObject
  cookies?: PageCookies
  viewport?: Viewport
  /** Common fingerprint fields plus optional width / height bounds. */
  fingerprints?: Array<
    DetailTargetFingerprintCommon & {
      maxWidth?: number
      minWidth?: number
      maxHeight?: number
      // NOTE(review): `minHidth` looks like a misspelling of `minHeight`.
      // Kept verbatim because the code that reads it is not visible here;
      // confirm before renaming.
      minHidth?: number
    }
  >
  /** Callback typed to receive one page result. */
  onCrawlItemComplete?: (crawlPageSingleResult: CrawlPageSingleResult) => void
}
/**
 * Batch configuration for `crawlHTML`: a list of targets plus settings
 * declared once at this level.
 * NOTE(review): how these top-level values combine with per-target values is
 * decided by the implementation, which is not visible here — confirm.
 */
export interface CrawlHTMLAdvancedConfig extends CrawlCommonConfig {
  /** Each entry is a plain string or a detail target config. */
  targets: Array<string | CrawlHTMLDetailTargetConfig>
  intervalTime?: IntervalTime
  fingerprints?: DetailTargetFingerprintCommon[]
  headers?: AnyObject
  /**
   * Callback typed to receive one HTML result. The parameter name matches
   * its type, in line with the page / data / file variants.
   */
  onCrawlItemComplete?: (crawlHTMLSingleResult: CrawlHTMLSingleResult) => void
}
/**
 * Batch configuration for `crawlData`: a list of targets plus settings
 * declared once at this level. `T` is the payload type carried by each
 * `CrawlDataSingleResult<T>` passed to the callback.
 * NOTE(review): how these top-level values combine with per-target values is
 * decided by the implementation, which is not visible here — confirm.
 */
export interface CrawlDataAdvancedConfig<T> extends CrawlCommonConfig {
  /** Each entry is a plain string or a detail target config. */
  targets: Array<string | CrawlDataDetailTargetConfig>
  intervalTime?: IntervalTime
  headers?: AnyObject
  fingerprints?: DetailTargetFingerprintCommon[]
  /** Callback typed to receive one data result. */
  onCrawlItemComplete?: (crawlDataSingleResult: CrawlDataSingleResult<T>) => void
}
/**
 * Batch configuration for `crawlFile`: a list of targets plus settings
 * declared once at this level.
 * NOTE(review): how these top-level values combine with per-target values is
 * decided by the implementation, which is not visible here — confirm.
 */
export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
  /** Each entry is a plain string or a detail target config. */
  targets: Array<string | CrawlFileDetailTargetConfig>
  intervalTime?: IntervalTime
  headers?: AnyObject
  fingerprints?: DetailTargetFingerprintCommon[]
  /** One string, or an array whose entries may be `null`. */
  storeDirs?: string | Array<string | null>
  /** One string, or an array whose entries may be `null`. */
  extensions?: string | Array<string | null>
  /** Array only; entries may be `null`. */
  fileNames?: Array<string | null>
  /** Callback typed to receive one file result. */
  onCrawlItemComplete?: (crawlFileSingleResult: CrawlFileSingleResult) => void
  /**
   * Hook typed to receive the file's id, name, path and data buffer. It may
   * return a `Buffer` or nothing, synchronously or through a promise.
   * NOTE(review): presumably a returned buffer replaces the data that gets
   * written — confirm against the implementation.
   */
  onBeforeSaveItemFile?: (info: {
    id: number
    fileName: string
    filePath: string
    data: Buffer
  }) => Buffer | void | Promise<Buffer | void>
}
/**
 * Polling settings made of three optional numeric parts.
 * NOTE(review): `d` / `h` / `m` presumably stand for days / hours / minutes —
 * confirm against the implementation.
 */
export interface StartPollingConfig {
  m?: number
  h?: number
  d?: number
}
/* API Result */
/**
 * The object returned by `createCrawl`. It exposes four overloaded crawl
 * functions with a shared pattern: a string or a single detail target config
 * resolves to one result; an array of targets or an advanced config resolves
 * to an array of results.
 */
export interface CrawlApp {
  /** Page crawl; results carry the Puppeteer browser, page and response. */
  crawlPage: {
    (config: string): Promise<CrawlPageSingleResult>
    (config: CrawlPageDetailTargetConfig): Promise<CrawlPageSingleResult>
    (config: Array<string | CrawlPageDetailTargetConfig>): Promise<
      CrawlPageSingleResult[]
    >
    (config: CrawlPageAdvancedConfig): Promise<CrawlPageSingleResult[]>
  }

  /** HTML crawl; results carry status code, headers and the HTML string. */
  crawlHTML: {
    (config: string): Promise<CrawlHTMLSingleResult>
    (config: CrawlHTMLDetailTargetConfig): Promise<CrawlHTMLSingleResult>
    (config: Array<string | CrawlHTMLDetailTargetConfig>): Promise<
      CrawlHTMLSingleResult[]
    >
    (config: CrawlHTMLAdvancedConfig): Promise<CrawlHTMLSingleResult[]>
  }

  /**
   * Data crawl; `T` types the payload of each result and defaults to `any`.
   */
  crawlData: {
    <T = any>(config: CrawlDataDetailTargetConfig): Promise<
      CrawlDataSingleResult<T>
    >
    <T = any>(config: string): Promise<CrawlDataSingleResult<T>>
    <T = any>(config: Array<string | CrawlDataDetailTargetConfig>): Promise<
      CrawlDataSingleResult<T>[]
    >
    <T = any>(config: CrawlDataAdvancedConfig<T>): Promise<
      CrawlDataSingleResult<T>[]
    >
  }

  /** File crawl; results carry status code, headers and file details. */
  crawlFile: {
    (config: string): Promise<CrawlFileSingleResult>
    (config: CrawlFileDetailTargetConfig): Promise<CrawlFileSingleResult>
    (config: Array<string | CrawlFileDetailTargetConfig>): Promise<
      CrawlFileSingleResult[]
    >
    (config: CrawlFileAdvancedConfig): Promise<CrawlFileSingleResult[]>
  }
}
/**
 * Fields present on every crawl result; the page, HTML, data and file result
 * interfaces all extend this one.
 */
export interface CrawlCommonResult {
  id: number
  isSuccess: boolean
  maxRetry: number
  retryCount: number
  /** Errors recorded for this item. */
  crawlErrorQueue: Error[]
  /** One entry per proxy URL, each with a boolean `state`. */
  proxyDetails: Array<{
    url: string
    state: boolean
  }>
}
/**
 * Result of one `crawlPage` target. Unlike the HTML / data / file results,
 * `data` itself is not typed as nullable; only `response` may be `null`.
 */
export interface CrawlPageSingleResult extends CrawlCommonResult {
  data: {
    browser: Browser
    page: Page
    response: HTTPResponse | null
  }
}
/**
 * Result of one `crawlHTML` target. `data` is either `null` or an object
 * holding the status code, response headers and HTML text.
 */
export interface CrawlHTMLSingleResult extends CrawlCommonResult {
  data: {
    /** Always present as a key, but its value may be `undefined`. */
    statusCode: number | undefined
    headers: IncomingHttpHeaders
    html: string
  } | null
}
/**
 * Result of one `crawlData` target. `data` is either `null` or an object
 * holding the status code, response headers and a payload of type `D`.
 */
export interface CrawlDataSingleResult<D> extends CrawlCommonResult {
  data: {
    /** Always present as a key, but its value may be `undefined`. */
    statusCode: number | undefined
    headers: IncomingHttpHeaders
    data: D
  } | null
}
/**
 * Result of one `crawlFile` target. `data` is either `null` or an object
 * holding the status code, response headers and details of the file.
 */
export interface CrawlFileSingleResult extends CrawlCommonResult {
  data: {
    /** Always present as a key, but its value may be `undefined`. */
    statusCode: number | undefined
    headers: IncomingHttpHeaders
    data: {
      /** Separate from the `isSuccess` flag on the result itself. */
      isSuccess: boolean
      fileName: string
      fileExtension: string
      filePath: string
      mimeType: string
      // NOTE(review): unit (presumably bytes) is not visible here — confirm.
      size: number
    }
  } | null
}
/**
 * Creates a crawl application exposing `crawlPage`, `crawlHTML`, `crawlData`
 * and `crawlFile`.
 *
 * Declared with `declare` because no implementation lives in this file; this
 * keeps the signature valid whether the file is compiled as a `.d.ts` or as
 * a regular `.ts` module.
 *
 * @param config Optional create-time options.
 * @returns The crawl application object.
 */
export declare function createCrawl(config?: CreateCrawlConfig): CrawlApp