web-structure
Version:
A powerful and flexible web scraping library with concurrent processing and DOM hierarchy awareness
59 lines (58 loc) • 1.59 kB
TypeScript
/**
 * Result object produced for a scraped page; this is the type that
 * `scraping` resolves with.
 *
 * The type is recursive: `childPages` holds further results of the same
 * shape, so a result can describe a tree of pages.
 */
interface ScrapingResult {
  /** URL this result belongs to (presumably the address of the scraped page). */
  url: string;

  /** Title string recorded for the page. */
  title: string;

  /**
   * Extracted payload. Typed as `any`, so this declaration does not describe
   * its shape; consumers must validate it themselves.
   */
  data: any;

  /** Timestamp as a string; the exact format is not specified by this declaration. */
  timestamp: string;

  /**
   * Optional nested results, one entry per child page
   * (presumably only present when child links were crawled).
   */
  childPages?: Array<ScrapingResult>;
}
/**
 * Optional settings accepted by `scraping`. Every property may be omitted;
 * the `@default` tags below record the documented fallback values.
 */
export interface ScrapingOptions {
  /**
   * @description Maximum depth for recursive crawling. If a page contains child links, they will be crawled by default.
   * The more child links, the slower the crawling process.
   *
   * NOTE(review): "crawled by default" and the documented default of `0`
   * appear to conflict — confirm against the implementation.
   * @default 0
   */
  maxDepth?: number;

  /**
   * @description Predicate called with a child page URL. Presumably returning
   * `true` excludes that URL from crawling — confirm against the implementation.
   */
  excludeChildPage?: (url: string) => boolean;

  /**
   * @description Selectors to extract content from the page. Each key names a
   * result group and maps to one selector or a list of selectors.
   * @default
   * {
   *   headings: ['h1', 'h2', 'h3', 'h4', 'h5'],
   *   paragraphs: 'p',
   *   articles: 'article',
   *   spans: 'span',
   *   orderLists: 'ol',
   *   lists: 'ul',
   *   code: 'pre'
   * }
   */
  selectors?: {
    [key: string]: string | string[];
  };

  /**
   * @description Whether to show console information
   * @default true
   */
  withConsole?: boolean;

  /**
   * @description Whether to break when a page fails
   * @default false
   */
  breakWhenFailed?: boolean;

  /**
   * @description Number of retries when scraping a page fails
   * @default 3
   */
  retryCount?: number;

  /**
   * @description Timeout for waiting for a selector to be present
   * (unit not stated here; presumably milliseconds).
   * @default 12000
   */
  waitForSelectorTimeout?: number;

  /**
   * @description Timeout for waiting for a page to load
   * (unit not stated here; presumably milliseconds).
   * @default 12000
   */
  waitForPageLoadTimeout?: number;
}
/**
 * Ambient declaration of the library's scraping entry point; the
 * implementation lives elsewhere and is not visible from this file.
 *
 * @param url - URL passed to the scraper (presumably the page to start from).
 * @param _options - Optional `ScrapingOptions`; may be omitted entirely.
 * @returns A promise that resolves with a `ScrapingResult`. Rejection
 * behavior is not described by this declaration.
 */
export declare function scraping(url: string, _options?: ScrapingOptions): Promise<ScrapingResult>;
// Empty export statement: adds no further exports. Note that `ScrapingResult`
// above is declared without an `export` modifier.
export {};