@monostate/node-scraper

Intelligent web scraping with AI Q&A, PDF support, and a multi-level fallback system - 11x faster than traditional scrapers

// Type definitions for @bnca/smart-scraper
// Project: https://github.com/your-org/bnca-prototype
// Definitions by: BNCA Team

export interface ScrapingOptions {
  /** Scraping method to use: "auto" (default), "direct", "lightpanda", or "puppeteer" */
  method?: 'auto' | 'direct' | 'lightpanda' | 'puppeteer';
  /** Request timeout in milliseconds */
  timeout?: number;
  /** Number of retries per method */
  retries?: number;
  /** Enable detailed logging */
  verbose?: boolean;
  /** Path to Lightpanda binary */
  lightpandaPath?: string;
  /** Custom user agent string */
  userAgent?: string;
  /** BNCA API key for backend services */
  apiKey?: string;
  /** BNCA API URL (defaults to https://bnca-api.fly.dev) */
  apiUrl?: string;
  /** OpenRouter API key for AI processing */
  openRouterApiKey?: string;
  /** OpenAI API key for AI processing */
  openAIApiKey?: string;
  /** OpenAI base URL (for compatible endpoints) */
  openAIBaseUrl?: string;
  /** AI model to use */
  model?: string;
  /** AI temperature setting */
  temperature?: number;
  /** Maximum tokens for AI response */
  maxTokens?: number;
  /** HTTP referer for OpenRouter */
  referer?: string;
}

export interface ScrapingResult {
  /** Whether the scraping was successful */
  success: boolean;
  /** The extracted content as JSON string */
  content?: string;
  /** Raw HTML content (when available) */
  html?: string;
  /** Size of the content in bytes */
  size?: number;
  /** Method used for scraping */
  method: 'direct-fetch' | 'lightpanda' | 'puppeteer' | 'chrome-screenshot' | 'quickshot' | 'failed' | 'error';
  /** Whether browser rendering was needed */
  needsBrowser?: boolean;
  /** Content type from response headers */
  contentType?: string;
  /** Error message if scraping failed */
  error?: string;
  /** Error type for categorization */
  errorType?: 'network' | 'timeout' | 'parsing' | 'service_unavailable';
  /** Additional error details */
  details?: string;
  /** Base64 encoded screenshot (if captured) */
  screenshot?: string;
  /** Performance metrics */
  performance: {
    /** Total time taken in milliseconds */
    totalTime: number;
    /** Method used for scraping */
    method?: string;
    /** System metrics (if available) */
    systemMetrics?: SystemMetrics;
  };
  /** Browser requirement indicators */
  browserIndicators?: string[];
  /** Performance statistics */
  stats?: ScrapingStats;
  /** Methods attempted during auto mode (only populated in auto mode) */
  fallbackChain?: string[];
}

export interface SystemMetrics {
  /** Duration of monitoring in milliseconds */
  duration?: number;
  /** Number of samples collected */
  samples?: number;
  /** Memory usage statistics */
  memory?: {
    heapUsed: MetricStats;
    rss: MetricStats;
  };
  /** CPU usage statistics */
  cpu?: MetricStats;
  /** System memory usage */
  systemMemory?: MetricStats;
  /** Error message if metrics collection failed */
  error?: string;
}

export interface MetricStats {
  /** Minimum value */
  min: number;
  /** Maximum value */
  max: number;
  /** Average value */
  avg: number;
  /** Peak value */
  peak?: number;
}

export interface ScrapingStats {
  /** Direct fetch statistics */
  directFetch: MethodStats;
  /** Lightpanda statistics */
  lightpanda: MethodStats;
  /** Puppeteer statistics */
  puppeteer: MethodStats;
  /** Success rates for each method */
  successRates: {
    directFetch: string;
    lightpanda: string;
    puppeteer: string;
  };
}

export interface MethodStats {
  /** Number of attempts */
  attempts: number;
  /** Number of successes */
  successes: number;
}

export interface HealthCheckResult {
  /** Overall health status */
  status: 'healthy' | 'unhealthy';
  /** Availability of each method */
  methods: {
    directFetch: boolean;
    lightpanda: boolean;
    puppeteer: boolean;
  };
  /** Timestamp of health check */
  timestamp: string;
}
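/*
 * Usage sketch (not part of the published declarations): how ScrapingOptions
 * and ScrapingResult are typically consumed via the smartScrape convenience
 * export declared at the end of this file. The import path assumes the
 * package name shown on this page; the URL, option values, and the demo()
 * wrapper are placeholders.
 *
 *   import { smartScrape, type ScrapingResult } from '@monostate/node-scraper';
 *
 *   async function demo(): Promise<void> {
 *     const result: ScrapingResult = await smartScrape('https://example.com', {
 *       method: 'auto',   // try direct fetch first, fall back to browser methods
 *       timeout: 15000,   // per-request timeout in milliseconds
 *       retries: 2,       // retries per method
 *       verbose: true,    // enable detailed logging
 *     });
 *
 *     if (result.success) {
 *       const data = JSON.parse(result.content ?? '{}'); // content is a JSON string
 *       console.log(result.method, result.performance.totalTime, 'ms', data);
 *     } else {
 *       console.error(result.errorType, result.error);
 *     }
 *   }
 */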
export interface BulkScrapeOptions extends ScrapingOptions {
  /** Number of concurrent requests (default: 5) */
  concurrency?: number;
  /** Progress callback function */
  progressCallback?: (progress: BulkProgress) => void;
  /** Continue processing on error (default: true) */
  continueOnError?: boolean;
}

export interface BulkScrapeStreamOptions extends ScrapingOptions {
  /** Number of concurrent requests (default: 5) */
  concurrency?: number;
  /** Callback for each successful result */
  onResult: (result: BulkScrapeResultItem) => void | Promise<void>;
  /** Callback for errors */
  onError?: (error: BulkScrapeErrorItem) => void | Promise<void>;
  /** Progress callback function */
  progressCallback?: (progress: BulkProgress) => void;
}

export interface BulkProgress {
  /** Number of URLs processed */
  processed: number;
  /** Total number of URLs */
  total: number;
  /** Percentage complete */
  percentage: number;
  /** Current URL being processed */
  current: string;
}

export interface BulkScrapeResult {
  /** Successfully scraped results */
  success: BulkScrapeResultItem[];
  /** Failed scrapes */
  failed: BulkScrapeErrorItem[];
  /** Total number of URLs */
  total: number;
  /** Start timestamp */
  startTime: number;
  /** End timestamp */
  endTime: number;
  /** Aggregate statistics */
  stats: BulkScrapeStats;
}

export interface BulkScrapeResultItem extends ScrapingResult {
  /** The URL that was scraped */
  url: string;
  /** Time taken in milliseconds */
  duration: number;
  /** Timestamp of completion */
  timestamp: string;
}

export interface BulkScrapeErrorItem {
  /** The URL that failed */
  url: string;
  /** Success is always false for errors */
  success: false;
  /** Error message */
  error: string;
  /** Time taken in milliseconds */
  duration: number;
  /** Timestamp of failure */
  timestamp: string;
}

export interface BulkScrapeStats {
  /** Number of successful scrapes */
  successful: number;
  /** Number of failed scrapes */
  failed: number;
  /** Total time taken in milliseconds */
  totalTime: number;
  /** Average time per URL in milliseconds */
  averageTime: number;
  /** Count of methods used */
  methods: {
    direct: number;
    lightpanda: number;
    puppeteer: number;
    pdf: number;
  };
}

export interface BulkScrapeStreamStats {
  /** Total number of URLs */
  total: number;
  /** Number of URLs processed */
  processed: number;
  /** Number of successful scrapes */
  successful: number;
  /** Number of failed scrapes */
  failed: number;
  /** Start timestamp */
  startTime: number;
  /** End timestamp */
  endTime: number;
  /** Total time in milliseconds */
  totalTime: number;
  /** Average time per URL in milliseconds */
  averageTime: number;
  /** Count of methods used */
  methods: {
    direct: number;
    lightpanda: number;
    puppeteer: number;
    pdf: number;
  };
}
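/*
 * Usage sketch (not part of the published declarations): the bulk interfaces
 * above cover both the batch API (BulkScrapeOptions / BulkScrapeResult) and
 * the streaming API (BulkScrapeStreamOptions with onResult / onError
 * callbacks). The URLs, callback bodies, and demo() wrapper are placeholders.
 *
 *   import { bulkScrapeStream, type BulkScrapeStreamOptions } from '@monostate/node-scraper';
 *
 *   async function demo(): Promise<void> {
 *     const options: BulkScrapeStreamOptions = {
 *       concurrency: 5,             // declared default
 *       onResult: async (item) => { // BulkScrapeResultItem: ScrapingResult + url, duration, timestamp
 *         console.log(item.url, item.method, `${item.duration} ms`);
 *       },
 *       onError: (err) => {         // BulkScrapeErrorItem
 *         console.warn(err.url, err.error);
 *       },
 *       progressCallback: (p) => {  // BulkProgress
 *         console.log(`${p.processed}/${p.total} (${p.percentage}%) - ${p.current}`);
 *       },
 *     };
 *
 *     const stats = await bulkScrapeStream(['https://example.com', 'https://example.org'], options);
 *     console.log(`${stats.successful} ok, ${stats.failed} failed in ${stats.totalTime} ms`);
 *   }
 */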
/**
 * BNCA Smart Scraper - Intelligent web scraping with multi-level fallback
 */
export class BNCASmartScraper {
  /**
   * Create a new BNCA Smart Scraper instance
   * @param options Configuration options
   */
  constructor(options?: ScrapingOptions);

  /**
   * Scrape a URL with intelligent fallback system
   * @param url The URL to scrape
   * @param options Optional configuration overrides
   * @returns Promise resolving to scraping result
   */
  scrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;

  /**
   * Take a screenshot of a webpage
   * @param url The URL to capture
   * @param options Optional configuration overrides
   * @returns Promise resolving to screenshot result
   */
  screenshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;

  /**
   * Quick screenshot capture - optimized for speed
   * @param url The URL to capture
   * @param options Optional configuration overrides
   * @returns Promise resolving to screenshot result
   */
  quickshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;

  /**
   * Ask AI a question about a URL
   * @param url The URL to analyze
   * @param question The question to answer about the page
   * @param options Optional configuration overrides
   * @returns Promise resolving to AI answer
   */
  askAI(url: string, question: string, options?: ScrapingOptions): Promise<{
    success: boolean;
    answer?: string;
    error?: string;
    method?: string;
    scrapeTime?: number;
    processing?: 'openrouter' | 'openai' | 'backend' | 'local';
  }>;

  /**
   * Get performance statistics for all methods
   * @returns Current statistics
   */
  getStats(): ScrapingStats;

  /**
   * Perform health check on all scraping methods
   * @returns Promise resolving to health status
   */
  healthCheck(): Promise<HealthCheckResult>;

  /**
   * Clean up resources - closes all browser instances
   * @returns Promise that resolves when cleanup is complete
   */
  cleanup(): Promise<void>;

  /**
   * Try direct HTTP fetch method
   * @param url URL to fetch
   * @param config Configuration options
   * @returns Promise resolving to scraping result
   */
  private tryDirectFetch(url: string, config: ScrapingOptions): Promise<ScrapingResult>;

  /**
   * Try Lightpanda browser method
   * @param url URL to scrape
   * @param config Configuration options
   * @returns Promise resolving to scraping result
   */
  private tryLightpanda(url: string, config: ScrapingOptions): Promise<ScrapingResult>;

  /**
   * Try Puppeteer browser method
   * @param url URL to scrape
   * @param config Configuration options
   * @returns Promise resolving to scraping result
   */
  private tryPuppeteer(url: string, config: ScrapingOptions): Promise<ScrapingResult>;

  /**
   * Detect if a site requires browser rendering
   * @param html HTML content to analyze
   * @param url Original URL for context
   * @returns Whether browser rendering is needed
   */
  private detectBrowserRequirement(html: string, url: string): boolean;

  /**
   * Extract structured content from HTML
   * @param html Raw HTML content
   * @returns Extracted content as JSON string
   */
  private extractContentFromHTML(html: string): string;

  /**
   * Find Lightpanda binary on the system
   * @returns Path to binary or null if not found
   */
  private findLightpandaBinary(): string | null;

  /**
   * Get browser requirement indicators for debugging
   * @param html HTML content to analyze
   * @returns Array of detected indicators
   */
  private getBrowserIndicators(html: string): string[];

  /**
   * Log a message if verbose mode is enabled
   * @param message Message to log
   */
  private log(message: string): void;

  /**
   * Bulk scrape multiple URLs with optimized concurrency
   * @param urls Array of URLs to scrape
   * @param options Bulk scraping options
   * @returns Promise resolving to bulk scraping results
   */
  bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;

  /**
   * Bulk scrape with streaming results
   * @param urls Array of URLs to scrape
   * @param options Bulk scraping options with callbacks
   * @returns Promise resolving to summary statistics
   */
  bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;
}
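/*
 * Usage sketch (not part of the published declarations): driving the
 * BNCASmartScraper class directly. The environment variable names, the
 * question text, and the demo() wrapper are placeholders; which AI path is
 * used (openrouter / openai / backend / local) is reported by the declared
 * `processing` field rather than assumed here.
 *
 *   import BNCASmartScraper from '@monostate/node-scraper';
 *
 *   async function demo(): Promise<void> {
 *     const scraper = new BNCASmartScraper({
 *       apiKey: process.env.BNCA_API_KEY,             // BNCA backend key
 *       openRouterApiKey: process.env.OPENROUTER_KEY, // OpenRouter key for AI processing
 *       verbose: true,
 *     });
 *
 *     try {
 *       const health = await scraper.healthCheck();
 *       console.log(health.status, health.methods);
 *
 *       const answer = await scraper.askAI('https://example.com', 'What is this page about?');
 *       console.log(answer.success ? `${answer.answer} (via ${answer.processing})` : answer.error);
 *
 *       console.log(scraper.getStats().successRates);
 *     } finally {
 *       await scraper.cleanup(); // close any Lightpanda/Puppeteer instances
 *     }
 *   }
 */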
/**
 * Convenience function for quick web scraping
 * @param url The URL to scrape
 * @param options Optional configuration
 * @returns Promise resolving to scraping result
 */
export function smartScrape(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;

/**
 * Convenience function for taking screenshots
 * @param url The URL to capture
 * @param options Optional configuration
 * @returns Promise resolving to screenshot result
 */
export function smartScreenshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;

/**
 * Convenience function for quick screenshot capture
 * @param url The URL to capture
 * @param options Optional configuration
 * @returns Promise resolving to screenshot result
 */
export function quickShot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;

/**
 * Convenience function for asking AI questions about a webpage
 * @param url The URL to analyze
 * @param question The question to answer
 * @param options Optional configuration
 * @returns Promise resolving to AI answer
 */
export function askWebsiteAI(url: string, question: string, options?: ScrapingOptions): Promise<{
  success: boolean;
  answer?: string;
  error?: string;
  method?: string;
  scrapeTime?: number;
  processing?: 'openrouter' | 'openai' | 'backend' | 'local';
}>;

/**
 * Convenience function for bulk scraping multiple URLs
 * @param urls Array of URLs to scrape
 * @param options Bulk scraping options
 * @returns Promise resolving to bulk scraping results
 */
export function bulkScrape(urls: string[], options?: BulkScrapeOptions): Promise<BulkScrapeResult>;

/**
 * Convenience function for bulk scraping with streaming results
 * @param urls Array of URLs to scrape
 * @param options Bulk scraping options with callbacks
 * @returns Promise resolving to summary statistics
 */
export function bulkScrapeStream(urls: string[], options: BulkScrapeStreamOptions): Promise<BulkScrapeStreamStats>;

/**
 * Default export - same as BNCASmartScraper class
 */
export default BNCASmartScraper;
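/*
 * Usage sketch (not part of the published declarations): the convenience
 * exports above mirror the class methods for one-off calls. The URLs, file
 * name, and demo() wrapper are placeholders, and the screenshot is written
 * as PNG only as an assumption - the declarations describe it solely as a
 * base64 string.
 *
 *   import { writeFileSync } from 'node:fs';
 *   import { quickShot, askWebsiteAI, bulkScrape } from '@monostate/node-scraper';
 *
 *   async function demo(): Promise<void> {
 *     // Fast screenshot capture
 *     const shot = await quickShot('https://example.com');
 *     if (shot.success && shot.screenshot) {
 *       writeFileSync('screenshot.png', Buffer.from(shot.screenshot, 'base64'));
 *     }
 *
 *     // One-off AI question without constructing a scraper instance
 *     const qa = await askWebsiteAI('https://example.com/pricing', 'Which plans are listed?');
 *     console.log(qa.success ? qa.answer : qa.error);
 *
 *     // Batch scrape with aggregate stats
 *     const batch = await bulkScrape(['https://example.com', 'https://example.org'], { concurrency: 3 });
 *     console.log(`${batch.stats.successful} succeeded, ${batch.stats.failed} failed`);
 *   }
 */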