UNPKG

@monostate/node-scraper

Version:

Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers

1,656 lines (1,449 loc) 53.1 kB
import fetch from 'node-fetch'; import { spawn, execSync } from 'child_process'; import fs from 'fs/promises'; import { existsSync, statSync } from 'fs'; import path from 'path'; import { fileURLToPath } from 'url'; import { promises as fsPromises } from 'fs'; import pdfParse from 'pdf-parse/lib/pdf-parse.js'; import browserPool from './browser-pool.js'; let puppeteer = null; try { puppeteer = await import('puppeteer'); puppeteer = puppeteer.default || puppeteer; } catch (e) { // Puppeteer is optional } const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); /** * BNCA Smart Scraper - Intelligent Web Scraping with Multi-level Fallback * * This class implements a sophisticated fallback system: * 1. Direct Fetch - Fast HTML retrieval for simple sites * 2. Lightpanda - Lightning-fast browser for static/SSR sites * 3. Puppeteer - Full Chromium browser for complex JavaScript sites * * Performance: 10x+ faster than Firecrawl on average */ export class BNCASmartScraper { constructor(options = {}) { this.options = { timeout: options.timeout || 10000, userAgent: options.userAgent || 'Mozilla/5.0 (compatible; BNCA/1.0; +https://github.com/your-org/bnca)', lightpandaPath: options.lightpandaPath || this.findLightpandaBinary(), retries: options.retries || 2, verbose: options.verbose || false, ...options }; this.browser = null; this.stats = { directFetch: { attempts: 0, successes: 0 }, lightpanda: { attempts: 0, successes: 0 }, puppeteer: { attempts: 0, successes: 0 }, pdf: { attempts: 0, successes: 0 } }; } /** * Ask AI a question about a URL * Scrapes the URL and uses AI to answer the question * * @param {string} url - URL to analyze * @param {string} question - Question to answer * @param {object} options - Additional options * @returns {Promise<object>} AI response with answer */ async askAI(url, question, options = {}) { try { // First scrape the content const scrapeResult = await this.scrape(url, options); if (!scrapeResult.success) { return { success: false, error: `Failed to scrape URL: ${scrapeResult.error}`, method: scrapeResult.method }; } // Check for OpenRouter/OpenAI API key const openRouterKey = options.openRouterApiKey || this.options.openRouterApiKey || process.env.OPENROUTER_API_KEY; const openAIKey = options.openAIApiKey || this.options.openAIApiKey || process.env.OPENAI_API_KEY; // Priority: OpenRouter > OpenAI > Backend API > Local if (openRouterKey) { try { const answer = await this.processWithOpenRouter(question, scrapeResult.content, openRouterKey, options); return { success: true, answer, method: scrapeResult.method, scrapeTime: scrapeResult.stats.totalTime, processing: 'openrouter' }; } catch (error) { this.log(' ⚠️ OpenRouter API call failed, falling back...'); } } if (openAIKey) { try { const answer = await this.processWithOpenAI(question, scrapeResult.content, openAIKey, options); return { success: true, answer, method: scrapeResult.method, scrapeTime: scrapeResult.stats.totalTime, processing: 'openai' }; } catch (error) { this.log(' ⚠️ OpenAI API call failed, falling back...'); } } // If BNCA API key is provided, use the backend API if (this.options.apiKey) { try { const response = await fetch(`${this.options.apiUrl || 'https://bnca-api.fly.dev'}/aireply`, { method: 'POST', headers: { 'x-api-key': this.options.apiKey, 'Content-Type': 'application/json' }, body: JSON.stringify({ url, question }) }); if (response.ok) { const data = await response.json(); return { success: true, answer: data.answer, method: scrapeResult.method, scrapeTime: scrapeResult.stats.totalTime, processing: 'backend' }; } } catch (error) { this.log(' ⚠️ Backend API call failed, using local AI processing'); } } // Local AI processing fallback const answer = this.processLocally(question, scrapeResult.content); return { success: true, answer, method: scrapeResult.method, scrapeTime: scrapeResult.stats.totalTime, processing: 'local' }; } catch (error) { return { success: false, error: error.message || 'AI processing failed' }; } } /** * Process with OpenRouter API * @private */ async processWithOpenRouter(question, content, apiKey, options = {}) { const parsedContent = typeof content === 'string' ? JSON.parse(content) : content; const contentText = ` Title: ${parsedContent.title || 'Unknown'} Content: ${parsedContent.content || parsedContent.bodyText || 'No content available'} Meta Description: ${parsedContent.metaDescription || 'None'} ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h => `- ${h.text || h}`).join('\n')}` : ''} `.trim(); const response = await fetch('https://openrouter.ai/api/v1/chat/completions', { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${apiKey}`, 'HTTP-Referer': options.referer || 'https://github.com/monostate/node-scraper', 'X-Title': 'BNCA Node Scraper', }, body: JSON.stringify({ model: options.model || 'meta-llama/llama-4-scout:free', messages: [ { role: 'system', content: 'You are a helpful assistant that answers questions based on website content. Provide accurate, concise answers based only on the provided content.' }, { role: 'user', content: `Based on the following website content, please answer this question: ${question}\n\nWebsite content:\n${contentText}` } ], temperature: options.temperature || 0.3, max_tokens: options.maxTokens || 500, }), }); if (!response.ok) { throw new Error(`OpenRouter API error: ${response.status}`); } const data = await response.json(); return data.choices[0]?.message?.content || 'No response from AI'; } /** * Process with OpenAI API * @private */ async processWithOpenAI(question, content, apiKey, options = {}) { const parsedContent = typeof content === 'string' ? JSON.parse(content) : content; const contentText = ` Title: ${parsedContent.title || 'Unknown'} Content: ${parsedContent.content || parsedContent.bodyText || 'No content available'} Meta Description: ${parsedContent.metaDescription || 'None'} ${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h => `- ${h.text || h}`).join('\n')}` : ''} `.trim(); const baseUrl = options.openAIBaseUrl || 'https://api.openai.com'; const response = await fetch(`${baseUrl}/v1/chat/completions`, { method: 'POST', headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${apiKey}`, }, body: JSON.stringify({ model: options.model || 'gpt-3.5-turbo', messages: [ { role: 'system', content: 'You are a helpful assistant that answers questions based on website content. Provide accurate, concise answers based only on the provided content.' }, { role: 'user', content: `Based on the following website content, please answer this question: ${question}\n\nWebsite content:\n${contentText}` } ], temperature: options.temperature || 0.3, max_tokens: options.maxTokens || 500, }), }); if (!response.ok) { throw new Error(`OpenAI API error: ${response.status}`); } const data = await response.json(); return data.choices[0]?.message?.content || 'No response from AI'; } /** * Local AI processing (simple pattern matching) * @private */ processLocally(question, content) { const parsedContent = typeof content === 'string' ? JSON.parse(content) : content; const title = parsedContent.title || 'Unknown'; const text = parsedContent.content || parsedContent.bodyText || ''; const lowerQuestion = question.toLowerCase(); if (lowerQuestion.includes('title')) { return `The page title is "${title}".`; } if (lowerQuestion.includes('about') || lowerQuestion.includes('what')) { return `This page titled "${title}" contains: ${text.substring(0, 200)}...`; } if (lowerQuestion.includes('contact') || lowerQuestion.includes('email')) { const emailMatch = text.match(/[\w.-]+@[\w.-]+\.\w+/); return emailMatch ? `Found contact: ${emailMatch[0]}` : 'No contact information found.'; } return `Based on "${title}": ${text.substring(0, 150)}...`; } /** * Main scraping method with intelligent fallback */ async scrape(url, options = {}) { const startTime = Date.now(); const config = { ...this.options, ...options }; this.log(`🚀 Starting smart scrape for: ${url}`); let result = null; let method = 'unknown'; let lastError = null; const fallbackChain = []; // Check if a specific method is requested const requestedMethod = config.method; const isForced = requestedMethod && requestedMethod !== 'auto'; try { // Check if URL is a PDF (by extension or content-type check) const isPdfUrl = url.toLowerCase().endsWith('.pdf') || url.toLowerCase().includes('.pdf?') || url.toLowerCase().includes('/pdf/'); if (isPdfUrl) { this.log(' 📄 PDF detected, using PDF parser...'); result = await this.tryPDFParse(url, config); if (result.success) { method = 'pdf'; this.log(' ✅ PDF parsing successful'); const totalTime = Date.now() - startTime; return { ...result, method, performance: { totalTime, method }, stats: this.getStats() }; } else { this.log(' ❌ PDF parsing failed'); lastError = result.error; } } // Handle forced method requests if (isForced) { this.log(` 🎯 Method forced to: ${requestedMethod}`); switch (requestedMethod) { case 'direct': this.log(' 🔄 Attempting direct fetch...'); result = await this.tryDirectFetch(url, config); method = 'direct-fetch'; break; case 'lightpanda': this.log(' 🐼 Attempting Lightpanda...'); result = await this.tryLightpanda(url, config); method = 'lightpanda'; break; case 'puppeteer': this.log(' 🔵 Attempting Puppeteer...'); result = await this.tryPuppeteer(url, config); method = 'puppeteer'; break; default: return { success: false, error: `Invalid method: ${requestedMethod}. Valid methods are: auto, direct, lightpanda, puppeteer`, method: 'error', errorType: 'service_unavailable', performance: { totalTime: Date.now() - startTime } }; } // For forced methods, return immediately with no fallback if (!result.success) { this.log(` ❌ ${requestedMethod} failed`); return { success: false, error: result.error || `${requestedMethod} scraping failed`, method, errorType: this.categorizeError(result.error), details: result.error, performance: { totalTime: Date.now() - startTime, method }, stats: this.getStats() }; } this.log(` ✅ ${requestedMethod} successful`); const totalTime = Date.now() - startTime; return { ...result, method, performance: { totalTime, method }, stats: this.getStats() }; } // Step 1: Try direct fetch first (fastest) this.log(' 🔄 Attempting direct fetch...'); fallbackChain.push('direct-fetch'); result = await this.tryDirectFetch(url, config); if (result.success && !result.needsBrowser) { method = 'direct-fetch'; this.log(' ✅ Direct fetch successful'); } else if (result.isPdf) { // Direct fetch detected a PDF, try PDF parser this.log(' 📄 Direct fetch detected PDF content, using PDF parser...'); result = await this.tryPDFParse(url, config); if (result.success) { method = 'pdf'; this.log(' ✅ PDF parsing successful'); const totalTime = Date.now() - startTime; return { ...result, method, performance: { totalTime, method }, stats: this.getStats() }; } else { this.log(' ❌ PDF parsing failed'); lastError = result.error; } } else { this.log(result.needsBrowser ? ' ⚠️ Browser rendering required' : ' ❌ Direct fetch failed'); lastError = result.error; // Step 2: Try Lightpanda (fast browser) this.log(' 🐼 Attempting Lightpanda...'); fallbackChain.push('lightpanda'); result = await this.tryLightpanda(url, config); if (result.success) { method = 'lightpanda'; this.log(' ✅ Lightpanda successful'); } else { this.log(' ❌ Lightpanda failed, falling back to Puppeteer'); lastError = result.error; // Step 3: Fallback to Puppeteer (full browser) this.log(' 🔵 Attempting Puppeteer...'); fallbackChain.push('puppeteer'); result = await this.tryPuppeteer(url, config); if (result.success) { method = 'puppeteer'; this.log(' ✅ Puppeteer successful'); } else { method = 'failed'; this.log(' ❌ All methods failed'); lastError = result.error; } } } const totalTime = Date.now() - startTime; return { ...result, method, performance: { totalTime, method }, stats: this.getStats(), // Only include fallbackChain in auto mode ...((!requestedMethod || requestedMethod === 'auto') && { fallbackChain }) }; } catch (error) { return { success: false, method: 'error', error: error.message, performance: { totalTime: Date.now() - startTime } }; } } /** * Direct HTTP fetch - fastest method for simple sites */ async tryDirectFetch(url, config) { this.stats.directFetch.attempts++; try { const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), config.timeout); const response = await fetch(url, { headers: { 'User-Agent': config.userAgent, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1' }, signal: controller.signal }); clearTimeout(timeoutId); if (!response.ok) { return { success: false, error: `Direct fetch failed: HTTP ${response.status}: ${response.statusText}`, errorType: response.status === 404 ? 'service_unavailable' : 'network' }; } // Check if the response is actually a PDF const contentType = response.headers.get('content-type') || ''; if (contentType.includes('application/pdf')) { return { success: false, error: 'Content is PDF, should use PDF parser', isPdf: true }; } // Get response as array buffer to check magic bytes const buffer = await response.arrayBuffer(); const firstBytes = new Uint8Array(buffer.slice(0, 5)); const signature = Array.from(firstBytes).map(b => String.fromCharCode(b)).join(''); // Check for PDF magic bytes if (signature.startsWith('%PDF')) { return { success: false, error: 'Content is PDF (detected by magic bytes), should use PDF parser', isPdf: true }; } // Convert buffer back to text for HTML processing const html = new TextDecoder().decode(buffer); // Intelligent browser detection const needsBrowser = this.detectBrowserRequirement(html, url); if (!needsBrowser) { const content = this.extractContentFromHTML(html); this.stats.directFetch.successes++; return { success: true, needsBrowser: false, content, html, size: html.length, contentType: response.headers.get('content-type') || 'text/html' }; } else { return { success: true, needsBrowser: true, html, size: html.length, browserIndicators: this.getBrowserIndicators(html) }; } } catch (error) { const errorMsg = error.message || 'Unknown error'; return { success: false, error: `Direct fetch failed: ${errorMsg}`, errorType: this.categorizeError(errorMsg) }; } } /** * Lightpanda browser - fast browser engine for static/SSR sites */ async tryLightpanda(url, config) { this.stats.lightpanda.attempts++; if (!this.options.lightpandaPath) { return { success: false, error: 'Lightpanda scraping failed: Lightpanda binary not found. Please install Lightpanda or provide path.', errorType: 'service_unavailable' }; } try { // Check if binary exists const stats = statSync(this.options.lightpandaPath); if (!stats.isFile()) { return { success: false, error: 'Lightpanda scraping failed: Lightpanda binary is not a file', errorType: 'service_unavailable' }; } } catch { return { success: false, error: 'Lightpanda scraping failed: Lightpanda binary not accessible', errorType: 'service_unavailable' }; } return new Promise((resolve) => { const args = ['fetch', '--dump', url]; const process = spawn(this.options.lightpandaPath, args, { timeout: config.timeout + 1000 // Add buffer for process timeout only }); let output = ''; let errorOutput = ''; process.stdout.on('data', (data) => { output += data.toString(); }); process.stderr.on('data', (data) => { errorOutput += data.toString(); }); process.on('close', (code) => { if (code === 0 && output.length > 0) { const content = this.extractContentFromHTML(output); this.stats.lightpanda.successes++; resolve({ success: true, content, html: output, size: output.length, exitCode: code }); } else { const errorMsg = errorOutput || `Lightpanda exited with code ${code}`; resolve({ success: false, error: `Lightpanda scraping failed: ${errorMsg}`, errorType: this.categorizeError(errorMsg), exitCode: code }); } }); process.on('error', (error) => { resolve({ success: false, error: `Lightpanda scraping failed: ${error.message}`, errorType: this.categorizeError(error.message) }); }); }); } /** * Puppeteer browser - full Chromium for complex JavaScript sites */ async tryPuppeteer(url, config) { this.stats.puppeteer.attempts++; if (!puppeteer) { return { success: false, error: 'Puppeteer scraping failed: Puppeteer is not installed. Please install puppeteer package.', errorType: 'service_unavailable' }; } let browser = null; let page = null; try { // Get browser from pool browser = await browserPool.getBrowser(); page = await browser.newPage(); // Set user agent and viewport await page.setUserAgent(config.userAgent); await page.setViewport({ width: 1280, height: 720 }); // Block unnecessary resources for faster loading await page.setRequestInterception(true); page.on('request', (req) => { const resourceType = req.resourceType(); if (['image', 'stylesheet', 'font', 'media'].includes(resourceType)) { req.abort(); } else { req.continue(); } }); // Navigate with timeout await page.goto(url, { waitUntil: 'networkidle0', timeout: config.timeout }); // Extract content using browser APIs const content = await page.evaluate(() => { // Get basic page info const title = document.title; const metaDescription = document.querySelector('meta[name="description"]')?.content || ''; const canonical = document.querySelector('link[rel="canonical"]')?.href || ''; // Extract headings const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6')) .map(h => ({ level: h.tagName.toLowerCase(), text: h.textContent.trim() })) .filter(h => h.text.length > 0) .slice(0, 20); // Extract paragraphs const paragraphs = Array.from(document.querySelectorAll('p')) .map(p => p.textContent.trim()) .filter(text => text.length > 20) .slice(0, 10); // Extract links const links = Array.from(document.querySelectorAll('a[href]')) .map(a => ({ text: a.textContent.trim(), href: a.href })) .filter(link => link.text.length > 0) .slice(0, 15); // Extract JSON-LD structured data const structuredData = Array.from(document.querySelectorAll('script[type=\"application/ld+json\"]')) .map(script => { try { return JSON.parse(script.textContent); } catch { return null; } }) .filter(data => data !== null); // Get page text content (truncated) const bodyText = document.body.textContent .replace(/\\s+/g, ' ') .trim() .substring(0, 3000); return { title, metaDescription, canonical, headings, paragraphs, links, structuredData, bodyText, url: window.location.href }; }); this.stats.puppeteer.successes++; return { success: true, content: JSON.stringify(content, null, 2), size: JSON.stringify(content).length }; } catch (error) { const errorMsg = error.message || 'Unknown error'; return { success: false, error: `Puppeteer scraping failed: ${errorMsg}`, errorType: this.categorizeError(errorMsg) }; } finally { // Always clean up page if (page) { try { // Check if page is still connected before closing if (!page.isClosed()) { await page.close(); } } catch (e) { // Silently ignore protocol errors when page is already closed if (!e.message.includes('Protocol error') && !e.message.includes('Target closed')) { console.warn('Error closing page:', e.message); } } } // Release browser back to pool if (browser) { browserPool.releaseBrowser(browser); } } } /** * PDF parsing method - handles PDF documents */ async tryPDFParse(url, config) { this.stats.pdf.attempts++; try { // Download PDF with timeout const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), config.timeout); const response = await fetch(url, { headers: { 'User-Agent': config.userAgent, 'Accept': 'application/pdf,*/*' }, signal: controller.signal }); clearTimeout(timeoutId); if (!response.ok) { return { success: false, error: `HTTP ${response.status}: ${response.statusText}` }; } // Check content type (be lenient - accept various content types) const contentType = response.headers.get('content-type') || ''; const acceptableTypes = ['pdf', 'octet-stream', 'binary', 'download']; const isAcceptableType = acceptableTypes.some(type => contentType.includes(type)); if (!isAcceptableType && !url.toLowerCase().includes('.pdf')) { return { success: false, error: `Not a PDF document: ${contentType}` }; } // Get PDF buffer const arrayBuffer = await response.arrayBuffer(); const buffer = Buffer.from(arrayBuffer); // Check size limit (20MB) if (buffer.length > 20 * 1024 * 1024) { return { success: false, error: 'PDF too large (max 20MB)' }; } // Parse PDF const pdfData = await pdfParse(buffer); // Extract structured content const content = { title: pdfData.info?.Title || 'Untitled PDF', author: pdfData.info?.Author || '', subject: pdfData.info?.Subject || '', keywords: pdfData.info?.Keywords || '', creator: pdfData.info?.Creator || '', producer: pdfData.info?.Producer || '', creationDate: pdfData.info?.CreationDate || '', modificationDate: pdfData.info?.ModificationDate || '', pages: pdfData.numpages || 0, text: pdfData.text || '', metadata: pdfData.metadata || null, url: url }; this.stats.pdf.successes++; return { success: true, content: JSON.stringify(content, null, 2), size: buffer.length, contentType: 'application/pdf', pages: content.pages }; } catch (error) { return { success: false, error: `PDF parsing error: ${error.message}` }; } } /** * Intelligent detection of browser requirement */ detectBrowserRequirement(html, url) { // Whitelist simple sites that should always use direct fetch const simpleSites = [ 'example.com', 'httpbin.org', 'wikipedia.org', 'github.io', 'netlify.app', 'vercel.app' ]; if (simpleSites.some(site => url.includes(site))) { return false; // Always use direct fetch for these } // Check for common SPA patterns (be more specific) const spaIndicators = [ /<div[^>]*id=['"]?root['"]?[^>]*>\s*<\/div>/i, /<div[^>]*id=['"]?app['"]?[^>]*>\s*<\/div>/i, /<div[^>]*data-reactroot/i, /window\.__NEXT_DATA__/i, /window\.__NUXT__/i, /_next\/static/i, /__webpack_require__/i ]; // Check for protection systems (more specific patterns) const protectionIndicators = [ /cloudflare.*challenge/i, /cloudflare.*protection/i, /ray id.*cloudflare/i, /please enable javascript/i, /you need to enable javascript/i, /this site requires javascript/i, /jscript.*required/i, /security check.*cloudflare/i, /attention required.*cloudflare/i ]; // Domain-based checks for known SPA sites const domainIndicators = [ /instagram\.com/i, /twitter\.com/i, /facebook\.com/i, /linkedin\.com/i, /maps\.google/i, /gmail\.com/i, /youtube\.com/i ]; // Check if it's clearly a SPA or protected site const hasSpaIndicators = spaIndicators.some(pattern => pattern.test(html)); const hasProtection = protectionIndicators.some(pattern => pattern.test(html)); const isKnownSpa = domainIndicators.some(pattern => pattern.test(url)); // Check for minimal content BUT only if we also have SPA indicators const bodyContent = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)?.[1] || ''; const textContent = bodyContent .replace(/<script[\s\S]*?<\/script>/gi, '') .replace(/<style[\s\S]*?<\/style>/gi, '') .replace(/<[^>]+>/g, ' ') .replace(/\s+/g, ' ') .trim(); const hasMinimalContent = textContent.length < 200; // More conservative threshold const isLikelySpa = hasMinimalContent && hasSpaIndicators; // Only require browser if we have strong indicators const needsBrowser = hasProtection || isKnownSpa || isLikelySpa; return needsBrowser; } /** * Get browser requirement indicators for debugging */ getBrowserIndicators(html) { const indicators = []; if (/<div[^>]*id=['"]?root['"]?[^>]*>\s*<\/div>/i.test(html)) { indicators.push('React root div detected'); } if (/window\.__NEXT_DATA__/i.test(html)) { indicators.push('Next.js data detected'); } if (/cloudflare.*challenge/i.test(html)) { indicators.push('Cloudflare challenge detected'); } if (/cloudflare.*protection/i.test(html)) { indicators.push('Cloudflare protection detected'); } if (/please enable javascript/i.test(html)) { indicators.push('JavaScript required message detected'); } return indicators; } /** * Extract structured content from HTML */ extractContentFromHTML(html) { try { // Basic content extraction const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1] || ''; const metaDescription = html.match(/<meta[^>]*name=['"]description['"][^>]*content=['"]([^'"]*)['"]/i)?.[1] || ''; // Extract JSON-LD structured data const jsonLdMatches = [...html.matchAll(/<script[^>]*type=['"]application\/ld\+json['"][^>]*>([\s\S]*?)<\/script>/gi)]; const structuredData = []; jsonLdMatches.forEach(match => { try { const data = JSON.parse(match[1]); structuredData.push(data); } catch { // Ignore malformed JSON } }); // Extract window state data const windowDataMatch = html.match(/window\.__(?:INITIAL_STATE__|INITIAL_DATA__|NEXT_DATA__)__\s*=\s*({[\s\S]*?});/); let windowData = null; if (windowDataMatch) { try { windowData = JSON.parse(windowDataMatch[1]); } catch { windowData = 'Found but unparseable'; } } // Extract main content const bodyMatch = html.match(/<body[^>]*>([\s\S]*)<\/body>/i); let textContent = ''; if (bodyMatch) { textContent = bodyMatch[1] .replace(/<script[\s\S]*?<\/script>/gi, '') .replace(/<style[\s\S]*?<\/style>/gi, '') .replace(/<[^>]+>/g, ' ') .replace(/\s+/g, ' ') .trim() .substring(0, 2000); } // Extract meta tags const metaTags = {}; const metaMatches = [...html.matchAll(/<meta[^>]*(?:property|name)=['"]([^'"]+)['"][^>]*content=['"]([^'"]*)['"]/gi)]; metaMatches.slice(0, 15).forEach(match => { metaTags[match[1]] = match[2]; }); return JSON.stringify({ title, metaDescription, structuredData: structuredData.length > 0 ? structuredData : null, windowData, metaTags: Object.keys(metaTags).length > 0 ? metaTags : null, content: textContent, extractedAt: new Date().toISOString() }, null, 2); } catch (error) { return JSON.stringify({ error: 'Content extraction failed', message: error.message, rawLength: html.length }, null, 2); } } /** * Find Lightpanda binary */ findLightpandaBinary() { // First check the package's bin directory (installed by postinstall script) const packageDir = path.dirname(new URL(import.meta.url).pathname); const packageBinPath = path.join(packageDir, 'bin', 'lightpanda'); const possiblePaths = [ packageBinPath, // Package's bin directory (highest priority) './lightpanda', '../lightpanda', './lightpanda/lightpanda', '/usr/local/bin/lightpanda', path.join(process.cwd(), 'lightpanda'), path.join(process.cwd(), 'bin', 'lightpanda') ]; for (const binaryPath of possiblePaths) { try { // Synchronous check for binary existence and executability const fullPath = path.resolve(binaryPath); if (existsSync(fullPath)) { const stats = statSync(fullPath); if (stats.isFile()) { // Check if it's executable (on Unix-like systems including WSL) if (process.platform !== 'win32' || this.isWSL()) { const mode = stats.mode; const isExecutable = Boolean(mode & parseInt('111', 8)); if (isExecutable) { return fullPath; } } else { // On native Windows (not WSL), Lightpanda is not supported continue; } } } } catch { continue; } } return null; } /** * Check if running in WSL environment */ isWSL() { try { const uname = execSync('uname -r', { encoding: 'utf8', stdio: ['ignore', 'pipe', 'ignore'] }); return uname.toLowerCase().includes('microsoft') || uname.toLowerCase().includes('wsl'); } catch { return false; } } /** * Get performance statistics */ getStats() { return { ...this.stats, successRates: { directFetch: this.stats.directFetch.attempts > 0 ? (this.stats.directFetch.successes / this.stats.directFetch.attempts * 100).toFixed(1) + '%' : '0%', lightpanda: this.stats.lightpanda.attempts > 0 ? (this.stats.lightpanda.successes / this.stats.lightpanda.attempts * 100).toFixed(1) + '%' : '0%', puppeteer: this.stats.puppeteer.attempts > 0 ? (this.stats.puppeteer.successes / this.stats.puppeteer.attempts * 100).toFixed(1) + '%' : '0%', pdf: this.stats.pdf.attempts > 0 ? (this.stats.pdf.successes / this.stats.pdf.attempts * 100).toFixed(1) + '%' : '0%' } }; } /** * Categorize error types for better error handling */ categorizeError(errorMessage) { if (!errorMessage) return 'unknown'; const error = errorMessage.toLowerCase(); if (error.includes('timeout') || error.includes('timed out') || error.includes('abort')) { return 'timeout'; } else if (error.includes('network') || error.includes('enotfound') || error.includes('econnrefused') || error.includes('econnreset')) { return 'network'; } else if (error.includes('parse') || error.includes('parsing') || error.includes('invalid')) { return 'parsing'; } else if (error.includes('not found') || error.includes('404') || error.includes('unavailable')) { return 'service_unavailable'; } return 'unknown'; } /** * Logging helper */ log(message) { if (this.options.verbose) { console.log(message); } } /** * Cleanup resources */ async cleanup() { if (this.browser) { await this.browser.close(); this.browser = null; } } /** * Take a screenshot of a webpage */ async screenshot(url, options = {}) { const startTime = Date.now(); const config = { ...this.options, ...options }; this.log(`📸 Taking screenshot for: ${url}`); try { const screenshot = await this.takeScreenshotWithChrome(url, config); return { success: !!screenshot, screenshot, method: 'chrome-screenshot', performance: { totalTime: Date.now() - startTime } }; } catch (error) { return { success: false, error: error.message, method: 'chrome-screenshot', performance: { totalTime: Date.now() - startTime } }; } } /** * Quick screenshot capture - optimized for speed */ async quickshot(url, options = {}) { const startTime = Date.now(); const config = { ...this.options, ...options, timeout: options.timeout || 15000 // Longer timeout for screenshots }; this.log(`⚡ Taking quick screenshot for: ${url}`); try { const screenshot = await this.takeScreenshotOptimized(url, config); return { success: !!screenshot, screenshot, method: 'quickshot', performance: { totalTime: Date.now() - startTime } }; } catch (error) { return { success: false, error: error.message, method: 'quickshot', performance: { totalTime: Date.now() - startTime } }; } } /** * Take screenshot using Chrome CLI */ async takeScreenshotWithChrome(url, config) { const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`); try { const args = [ '--headless=new', '--disable-gpu', '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-blink-features=AutomationControlled', '--user-agent=' + config.userAgent, '--screenshot=' + tempFile, '--window-size=1280,800', '--hide-scrollbars', '--virtual-time-budget=10000', url ]; const chromePath = await this.findChromePath(); if (!chromePath) { throw new Error('Chrome/Chromium not found'); } return new Promise((resolve) => { const chrome = spawn(chromePath, args, { stdio: ['ignore', 'pipe', 'pipe'], detached: false }); let processExited = false; let stderr = ''; chrome.stderr.on('data', (data) => { stderr += data.toString(); }); const killTimeout = setTimeout(() => { if (!processExited) { this.log('Chrome timeout, sending SIGTERM...'); chrome.kill('SIGTERM'); setTimeout(() => { if (!processExited) { chrome.kill('SIGKILL'); } }, 1000); } }, config.timeout || 15000); chrome.on('exit', async (code, signal) => { processExited = true; clearTimeout(killTimeout); try { await new Promise(r => setTimeout(r, 500)); const screenshotBuffer = await fsPromises.readFile(tempFile); const base64 = screenshotBuffer.toString('base64'); await fsPromises.unlink(tempFile).catch(() => {}); resolve(`data:image/png;base64,${base64}`); } catch (error) { resolve(null); } }); chrome.on('error', (error) => { clearTimeout(killTimeout); resolve(null); }); }); } catch (error) { return null; } } /** * Optimized screenshot for speed */ async takeScreenshotOptimized(url, config, retryCount = 0) { const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`); try { const virtualTimeBudget = retryCount === 0 ? 5000 : 8000; const processTimeout = retryCount === 0 ? 8000 : 12000; const args = [ '--headless=new', '--disable-gpu', '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-blink-features=AutomationControlled', '--disable-features=TranslateUI', '--disable-extensions', '--disable-default-apps', '--disable-sync', '--metrics-recording-only', '--mute-audio', '--no-first-run', '--disable-background-timer-throttling', '--disable-backgrounding-occluded-windows', '--disable-renderer-backgrounding', '--user-agent=' + config.userAgent, '--screenshot=' + tempFile, '--window-size=1280,800', '--hide-scrollbars', '--run-all-compositor-stages-before-draw', `--virtual-time-budget=${virtualTimeBudget}`, url ]; const chromePath = await this.findChromePath(); if (!chromePath) { throw new Error('Chrome/Chromium not found'); } return new Promise((resolve) => { const chrome = spawn(chromePath, args, { stdio: ['ignore', 'pipe', 'pipe'], detached: false }); let processExited = false; const killTimeout = setTimeout(() => { if (!processExited) { chrome.kill('SIGTERM'); setTimeout(() => { if (!processExited) { chrome.kill('SIGKILL'); } }, 1000); } }, processTimeout); chrome.on('exit', async (code, signal) => { processExited = true; clearTimeout(killTimeout); try { await new Promise(r => setTimeout(r, 500)); const screenshotBuffer = await fsPromises.readFile(tempFile); const base64 = screenshotBuffer.toString('base64'); await fsPromises.unlink(tempFile).catch(() => {}); resolve(`data:image/png;base64,${base64}`); } catch (error) { if (retryCount === 0) { const retryResult = await this.takeScreenshotOptimized(url, config, 1); resolve(retryResult); } else { resolve(null); } } }); chrome.on('error', (error) => { clearTimeout(killTimeout); resolve(null); }); }); } catch (error) { if (retryCount === 0) { return this.takeScreenshotOptimized(url, config, 1); } return null; } } /** * Find Chrome/Chromium binary path */ async findChromePath() { const chromePaths = process.platform === 'darwin' ? [ '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', '/Applications/Chromium.app/Contents/MacOS/Chromium', ] : [ '/usr/bin/chromium-browser', '/usr/bin/chromium', '/usr/bin/google-chrome-stable', '/usr/bin/google-chrome', ]; for (const path of chromePaths) { try { await fsPromises.access(path); return path; } catch (e) { continue; } } return null; } /** * Health check for all scraping methods */ async healthCheck() { const testUrl = 'https://example.com'; const results = {}; // Test direct fetch try { const directResult = await this.tryDirectFetch(testUrl, this.options); results.directFetch = directResult.success; } catch { results.directFetch = false; } // Test Lightpanda try { const lightpandaResult = await this.tryLightpanda(testUrl, this.options); results.lightpanda = lightpandaResult.success; } catch { results.lightpanda = false; } // Test Puppeteer try { const puppeteerResult = await this.tryPuppeteer(testUrl, this.options); results.puppeteer = puppeteerResult.success; await this.cleanup(); // Clean up after test } catch { results.puppeteer = false; } return { status: Object.values(results).some(r => r) ? 'healthy' : 'unhealthy', methods: results, timestamp: new Date().toISOString() }; } /** * Clean up resources - closes all browser instances */ async cleanup() { await browserPool.closeAll(); } /** * Bulk scrape multiple URLs with optimized concurrency * @param {string[]} urls - Array of URLs to scrape * @param {Object} options - Scraping options * @returns {Promise<Object>} Bulk scraping results */ async bulkScrape(urls, options = {}) { const { concurrency = 5, progressCallback = null, continueOnError = true, ...scrapeOptions } = options; const results = { success: [], failed: [], total: urls.length, startTime: Date.now(), endTime: null, stats: { successful: 0, failed: 0, totalTime: 0, averageTime: 0, methods: { direct: 0, lightpanda: 0, puppeteer: 0, pdf: 0 } } }; // Process URLs in batches const batches = []; for (let i = 0; i < urls.length; i += concurrency) { batches.push(urls.slice(i, i + concurrency)); } let processedCount = 0; for (const batch of batches) { const batchPromises = batch.map(async (url) => { const startTime = Date.now(); try { const result = await this.scrape(url, scrapeOptions); const endTime = Date.now(); const duration = endTime - startTime; const successResult = { url, ...result, duration, timestamp: new Date(endTime).toISOString() }; results.success.push(successResult); results.stats.successful++; // Track method usage if (result.method) { results.stats.methods[result.method]++; } return successResult; } catch (error) { const endTime = Date.now(); const duration = endTime - startTime; const failedResult = { url, success: false, error: error.message, duration, timestamp: new Date(endTime).toISOString() }; results.failed.push(failedResult); results.stats.failed++; if (!continueOnError) { throw error; } return failedResult; } finally { processedCount++; if (progressCallback) { progressCallback({ processed: processedCount, total: urls.length, percentage: (processedCount / urls.length) * 100, current: url }); } } }); await Promise.all(batchPromises); } results.endTime = Date.now(); results.stats.totalTime = results.endTime - results.startTime; results.stats.averageTime = results.stats.totalTime / urls.length; return results; } /** * Bulk scrape with streaming results * @param {string[]} urls - Array of URLs to scrape * @param {Object} options - Scraping options with onResult callback * @returns {Promise<Object>} Summary statistics */ async bulkScrapeStream(urls, options = {}) { const { concurrency = 5, onResult = null, onError = null, progressCallback = null, ...scrapeOptions } = options; if (!onResult) { throw new Error('onResult callback is required for streaming bulk scrape'); } const stats = { total: urls.length, processed: 0, successful: 0, failed: 0, startTime: Date.now(), endTime: null, methods: { direct: 0, lightpanda: 0, puppeteer: 0, pdf: 0 } }; const queue = [...urls]; const inProgress = new Set(); const processNext = async () => { if (queue.length === 0 || inProgress.size >= concurrency) { return; } const url = queue.shift(); inProgress.add(url); const startTime = Date.now(); try { const result = await this.scrape(url, scrapeOptions); const duration = Date.now() - startTime; stats.successful++; if (result.method) { stats.methods[result.method]++; } await onResult({ url, ...result, duration, timestamp: new Date().to