UNPKG

ts-web-scraper

Version:

A powerful web scraper for both static and client-side rendered sites using only Bun native APIs

83 lines 2.82 kB
/**
 * Lightweight web scraper using only Bun native APIs
 * No external dependencies required
 *
 * @example
 * ```ts
 * import { fetchHTML, parseHTML } from './web-scraper'
 *
 * // Simple usage
 * const doc = await fetchHTML('https://example.com')
 * const title = doc.querySelector('title')?.textContent
 *
 * // Advanced usage with custom options
 * const html = await fetch('https://example.com').then(r => r.text())
 * const parsed = parseHTML(html)
 * const links = parsed.querySelectorAll('a[href]')
 * links.forEach(link => console.log(link.getAttribute('href')))
 * ```
 */

/**
 * Parse HTML string into a queryable Document object
 *
 * @param html - HTML source text to parse.
 * @returns The parsed document, queryable through the {@link Document} interface.
 */
export declare function parseHTML(html: string): Document;

/**
 * Fetch and parse HTML from a URL
 *
 * @param url - Address of the page to fetch.
 * @param options - Optional request settings; see {@link FetchHTMLOptions}.
 * @returns A promise resolving to the parsed document.
 */
export declare function fetchHTML(url: string, options?: FetchHTMLOptions): Promise<Document>;

/**
 * Extract text content from HTML, stripping all tags
 *
 * @param html - HTML source text.
 * @returns The text content with tags removed.
 */
export declare function extractText(html: string): string;

/**
 * Extract all links from HTML
 *
 * @param html - HTML source text.
 * @param baseUrl - Optional base URL. NOTE(review): presumably used to resolve
 *   relative links; confirm against the implementation.
 * @returns The links found in the markup.
 */
export declare function extractLinks(html: string, baseUrl?: string): string[];

/**
 * Extract meta tags from HTML
 *
 * @param html - HTML source text.
 * @returns A string-to-string record of the extracted meta tags.
 */
export declare function extractMeta(html: string): Record<string, string>;

/**
 * Helper to wait for a condition (useful for client-side rendered content)
 * Note: This is a simple polling implementation since we can't execute JavaScript
 *
 * @param condition - Predicate to poll; may be synchronous or return a promise.
 * @param options - Optional polling settings: `timeout` and `interval`.
 *   NOTE(review): units are not stated in the declaration (presumably
 *   milliseconds); confirm against the implementation.
 * @returns A promise that resolves with no value.
 */
export declare function waitFor(
  condition: () => boolean | Promise<boolean>,
  options?: { timeout?: number, interval?: number },
): Promise<void>;

/**
 * Batch fetch multiple URLs in parallel
 *
 * @param urls - Addresses to fetch.
 * @param options - Optional request settings; see {@link FetchHTMLOptions}.
 * @returns A promise resolving to a map of string keys to parsed documents.
 *   NOTE(review): keys are presumably the requested URLs; confirm against the
 *   implementation.
 */
export declare function fetchMultiple(
  urls: string[],
  options?: FetchHTMLOptions,
): Promise<Map<string, Document>>;

/**
 * Extract structured data from common formats
 *
 * @param html - HTML source text.
 * @returns An object with four groups: `jsonLd`, `openGraph`, `twitter` and
 *   `microdata`.
 */
export declare function extractStructuredData(html: string): {
  jsonLd: any[];
  openGraph: Record<string, string>;
  twitter: Record<string, string>;
  microdata: any[];
};

/**
 * A parsed element node exposing a small, DOM-like query surface.
 *
 * This is the package's own interface; inside this module it shadows the
 * global DOM `HTMLElement` type of the same name.
 */
export declare interface HTMLElement {
  /** Tag name of the element. */
  tagName: string;
  /** Attribute names mapped to their string values. */
  attributes: Record<string, string>;
  /** Text content of the element. */
  textContent: string;
  /** Inner markup of the element as a string. */
  innerHTML: string;
  /** Child elements. */
  children: HTMLElement[];
  /** Parent element, or `null` when there is none. */
  parent: HTMLElement | null;
  /** Returns one element for the selector, or `null`. */
  querySelector: (selector: string) => HTMLElement | null;
  /** Returns the elements for the selector as an array. */
  querySelectorAll: (selector: string) => HTMLElement[];
  /** Returns the value of the named attribute, or `null`. */
  getAttribute: (name: string) => string | null;
  /** Reports whether the named attribute is present. */
  hasAttribute: (name: string) => boolean;
  /** Returns the element for the given id, or `null`. */
  getElementById: (id: string) => HTMLElement | null;
  /** Returns the elements for the given class name. */
  getElementsByClassName: (className: string) => HTMLElement[];
  /** Returns the elements for the given tag name. */
  getElementsByTagName: (tagName: string) => HTMLElement[];
}

/**
 * Root of a parsed page, as returned by `parseHTML` and `fetchHTML`.
 *
 * It has every member of {@link HTMLElement}; the two query methods are
 * restated here with the same signatures.
 */
export declare interface Document extends HTMLElement {
  /** Returns one element for the selector, or `null`. */
  querySelector: (selector: string) => HTMLElement | null;
  /** Returns the elements for the selector as an array. */
  querySelectorAll: (selector: string) => HTMLElement[];
}

/**
 * Request settings accepted by `fetchHTML` and `fetchMultiple`.
 */
export declare interface FetchHTMLOptions {
  /** Request timeout. NOTE(review): unit not stated here (presumably milliseconds). */
  timeout?: number;
  /** Extra request headers. */
  headers?: Record<string, string>;
  /** User agent string for the request. */
  userAgent?: string;
  /** Redirect handling mode. */
  redirect?: 'follow' | 'manual' | 'error';
  /** Abort signal for the request. */
  signal?: AbortSignal;
}