ts-web-scraper
Version:
A powerful web scraper for both static and client-side rendered sites using only Bun native APIs
83 lines • 2.82 kB
TypeScript
/**
* Parse an HTML string into a queryable Document object.
*
* The returned value is this module's own `Document` type (declared in this
* file), not the browser DOM `Document`.
*
* @param html - HTML source text to parse.
* @returns The parsed document.
*
* NOTE(review): behavior for empty or malformed input is not visible from
* this declaration — confirm against the implementation.
*/
export declare function parseHTML(html: string): Document;
/**
* Fetch and parse HTML from a URL.
*
* @param url - Address of the page to fetch.
* @param options - Optional request settings (`timeout`, `headers`,
* `userAgent`, `redirect`, `signal`); see `FetchHTMLOptions`.
* @returns A promise that resolves to the parsed `Document`.
*
* NOTE(review): how network errors, timeouts and non-success HTTP statuses
* are reported (rejection vs. otherwise) is not visible here — confirm.
*/
export declare function fetchHTML(url: string, options?: FetchHTMLOptions): Promise<Document>;
/**
* Extract text content from HTML, stripping all tags.
*
* @param html - HTML source text.
* @returns The text of the input with markup removed.
*
* NOTE(review): whitespace normalization, entity decoding and treatment of
* `<script>`/`<style>` contents are not specified by this declaration — confirm.
*/
export declare function extractText(html: string): string;
/**
* Extract all links from HTML.
*
* @param html - HTML source text to scan.
* @param baseUrl - Optional base URL; presumably used to resolve relative
* links to absolute ones — TODO confirm against the implementation.
* @returns The links found, as strings.
*
* NOTE(review): which elements/attributes count as links, and whether
* duplicates are removed, cannot be determined from this declaration.
*/
export declare function extractLinks(html: string, baseUrl?: string): string[];
/**
* Extract meta tags from HTML.
*
* @param html - HTML source text to scan.
* @returns A string-to-string record of the meta tags found; presumably keyed
* by each tag's `name`/`property` with its `content` as the value — TODO
* confirm, including what happens when a key occurs more than once.
*/
export declare function extractMeta(html: string): Record<string, string>;
/**
* Helper to wait for a condition (useful for client-side rendered content).
* Note: This is a simple polling implementation since we can't execute JavaScript.
*
* @param condition - Callback returning a boolean, or a promise of one,
* indicating whether the awaited state has been reached.
* @param options - Optional polling settings.
* `timeout` and `interval` are numbers; presumably milliseconds — TODO confirm
* units and default values, which are not visible from this declaration.
* @returns A promise with no value.
*
* NOTE(review): whether the promise rejects or simply resolves when `timeout`
* elapses is not visible here — confirm before relying on either.
*/
export declare function waitFor(condition: () => boolean | Promise<boolean>, options?: { timeout?: number, interval?: number }): Promise<void>;
/**
* Batch fetch multiple URLs in parallel.
*
* @param urls - Addresses of the pages to fetch.
* @param options - Optional request settings of the same `FetchHTMLOptions`
* type that `fetchHTML` accepts.
* @returns A promise resolving to a `Map` from string keys to parsed
* `Document`s; presumably keyed by the requested URL — TODO confirm.
*
* NOTE(review): the handling of individual failures (omit the entry vs.
* reject the whole batch) is not visible from this declaration — confirm.
*/
export declare function fetchMultiple(urls: string[], options?: FetchHTMLOptions): Promise<Map<string, Document>>;
/**
* Extract structured data from common formats.
*
* @param html - HTML source text to scan.
* @returns An object with one field per supported format: `jsonLd`,
* `openGraph`, `twitter` and `microdata`.
*
* NOTE(review): `jsonLd` and `microdata` are typed `any[]`, so their element
* shapes are unchecked; validate entries before use.
*/
export declare function extractStructuredData(html: string): {
/** JSON-LD entries; presumably parsed from `application/ld+json` scripts — TODO confirm. */
jsonLd: any[]
/** Open Graph values; presumably taken from `og:*` meta tags — TODO confirm key format. */
openGraph: Record<string, string>
/** Twitter card values; presumably taken from `twitter:*` meta tags — TODO confirm key format. */
twitter: Record<string, string>
/** Microdata items; element shape is not declared here — TODO confirm. */
microdata: any[]
};
/**
* Lightweight web scraper using only Bun native APIs
* No external dependencies required
*
* @example
* ```ts
* import { fetchHTML, parseHTML } from './web-scraper'
*
* // Simple usage
* const doc = await fetchHTML('https://example.com')
* const title = doc.querySelector('title')?.textContent
*
* // Advanced usage with custom options
* const html = await fetch('https://example.com').then(r => r.text())
* const parsed = parseHTML(html)
* const links = parsed.querySelectorAll('a[href]')
* links.forEach(link => console.log(link.getAttribute('href')))
* ```
*/
/**
* A node in the tree produced by this module's parser.
*
* This is the scraper's own element type. It shares its name with the browser
* DOM's `HTMLElement` but declares a different, smaller shape (for example,
* `attributes` is a plain record and the parent link is called `parent`).
*/
export declare interface HTMLElement {
/** Tag name of the element. NOTE(review): letter case is not specified here — confirm. */
tagName: string
/** Attribute values keyed by attribute name. */
attributes: Record<string, string>
/** Text content of the element. */
textContent: string
/** Inner markup of the element as a string. */
innerHTML: string
/** Child elements. */
children: HTMLElement[]
/** Parent element, or `null`; presumably `null` for the root — TODO confirm. */
parent: HTMLElement | null
/** Look up one element by selector; `null` when none is returned. Supported selector syntax is not declared here. */
querySelector: (selector: string) => HTMLElement | null
/** Look up elements by selector, returned as an array. */
querySelectorAll: (selector: string) => HTMLElement[]
/** Read an attribute by name; typed to return `null` as well as a string (presumably when absent — confirm). */
getAttribute: (name: string) => string | null
/** Report whether an attribute with the given name is present. */
hasAttribute: (name: string) => boolean
/** Look up one element by id; `null` when none is returned. */
getElementById: (id: string) => HTMLElement | null
/** Look up elements by class name, returned as an array. */
getElementsByClassName: (className: string) => HTMLElement[]
/** Look up elements by tag name, returned as an array. */
getElementsByTagName: (tagName: string) => HTMLElement[]
}
/**
* Root object returned by `parseHTML`, `fetchHTML` and `fetchMultiple`.
*
* Extends this module's `HTMLElement`, so every element member is available
* on the document as well. The two members below repeat the inherited
* signatures verbatim and add nothing at the type level.
*
* Note: this is the scraper's own type, distinct from the browser DOM's
* global `Document`.
*/
export declare interface Document extends HTMLElement {
/** Look up one element by selector; `null` when none is returned. */
querySelector: (selector: string) => HTMLElement | null
/** Look up elements by selector, returned as an array. */
querySelectorAll: (selector: string) => HTMLElement[]
}
/**
* Optional request settings accepted by `fetchHTML` and `fetchMultiple`.
* Every field may be omitted; defaults are not visible from this declaration.
*/
export declare interface FetchHTMLOptions {
/** Request time limit; presumably in milliseconds — TODO confirm unit and default. */
timeout?: number
/** Extra request headers as a name-to-value record. */
headers?: Record<string, string>
/** User agent string for the request. NOTE(review): precedence relative to a `User-Agent` entry in `headers` is not visible here — confirm. */
userAgent?: string
/** Redirect handling mode; the literals match the Fetch API's `redirect` request option. */
redirect?: 'follow' | 'manual' | 'error'
/** Abort signal for the request; presumably used to cancel it — TODO confirm. */
signal?: AbortSignal
}