web-structure

Version:

A powerful and flexible web scraping library with concurrent processing and DOM hierarchy awareness

59 lines (58 loc) • 1.59 kB

TypeScript

/**
 * Result produced for a single scraped page.
 *
 * When child pages are included, each one is described by the same shape,
 * so the result forms a tree.
 */
interface ScrapingResult {
  /** URL associated with this result. */
  url: string;
  /** Title associated with this result. */
  title: string;
  /**
   * Extracted payload. Its shape is not described by this declaration
   * (typed as `any`) — presumably keyed by the configured selector names;
   * verify against the implementation.
   */
  data: any;
  /** Timestamp as a string; the exact format is not specified here. */
  timestamp: string;
  /** Results for child pages, when present. */
  childPages?: ScrapingResult[];
}

/**
 * Optional settings accepted by {@link scraping}. Every field may be omitted.
 */
export interface ScrapingOptions {
  /**
   * @description Maximum depth for recursive crawling of child links found on a page.
   * The more child links there are, the slower the crawling process.
   *
   * NOTE(review): the upstream description also says child links "will be
   * crawled by default", while the documented default depth is 0 — confirm
   * the actual behavior against the implementation.
   * @default 0
   */
  maxDepth?: number;

  /**
   * @description Predicate invoked with a child page URL.
   * NOTE(review): presumably returning `true` excludes that page from
   * crawling — confirm against the implementation.
   */
  excludeChildPage?: (url: string) => boolean;

  /**
   * @description Selectors used to extract content from the page. Each key
   * maps to a single selector or a list of selectors.
   * @default
   * {
   *   headings: ['h1', 'h2', 'h3', 'h4', 'h5'],
   *   paragraphs: 'p',
   *   articles: 'article',
   *   spans: 'span',
   *   orderLists: 'ol',
   *   lists: 'ul',
   *   code: 'pre'
   * }
   */
  selectors?: Record<string, string | string[]>;

  /**
   * @description Whether to show console information.
   * @default true
   */
  withConsole?: boolean;

  /**
   * @description Whether to break when a page fails.
   * @default false
   */
  breakWhenFailed?: boolean;

  /**
   * @description Number of retries when scraping a page fails.
   * @default 3
   */
  retryCount?: number;

  /**
   * @description Timeout for waiting for a selector to be present.
   * NOTE(review): unit is presumably milliseconds — confirm.
   * @default 12000
   */
  waitForSelectorTimeout?: number;

  /**
   * @description Timeout for waiting for a page to load.
   * NOTE(review): unit is presumably milliseconds — confirm.
   * @default 12000
   */
  waitForPageLoadTimeout?: number;
}

/**
 * Scrapes the page at `url`.
 *
 * @param url - Address of the page to scrape.
 * @param _options - Optional settings; see {@link ScrapingOptions} for the
 * documented defaults.
 * @returns A promise resolving to the {@link ScrapingResult} for the page.
 */
export declare function scraping(
  url: string,
  _options?: ScrapingOptions,
): Promise<ScrapingResult>;

export {};