@monostate/node-scraper
Version:
Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers
1,656 lines (1,449 loc) • 53.1 kB
JavaScript
import fetch from 'node-fetch';
import { spawn, execSync } from 'child_process';
import fs from 'fs/promises';
import { existsSync, statSync } from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { promises as fsPromises } from 'fs';
import pdfParse from 'pdf-parse/lib/pdf-parse.js';
import browserPool from './browser-pool.js';
let puppeteer = null;
try {
puppeteer = await import('puppeteer');
puppeteer = puppeteer.default || puppeteer;
} catch (e) {
// Puppeteer is optional
}
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
/**
* BNCA Smart Scraper - Intelligent Web Scraping with Multi-level Fallback
*
* This class implements a sophisticated fallback system:
* 1. Direct Fetch - Fast HTML retrieval for simple sites
* 2. Lightpanda - Lightning-fast browser for static/SSR sites
* 3. Puppeteer - Full Chromium browser for complex JavaScript sites
*
* Performance: 10x+ faster than Firecrawl on average
*/
export class BNCASmartScraper {
constructor(options = {}) {
this.options = {
timeout: options.timeout || 10000,
userAgent: options.userAgent || 'Mozilla/5.0 (compatible; BNCA/1.0; +https://github.com/your-org/bnca)',
lightpandaPath: options.lightpandaPath || this.findLightpandaBinary(),
retries: options.retries || 2,
verbose: options.verbose || false,
...options
};
this.browser = null;
this.stats = {
directFetch: { attempts: 0, successes: 0 },
lightpanda: { attempts: 0, successes: 0 },
puppeteer: { attempts: 0, successes: 0 },
pdf: { attempts: 0, successes: 0 }
};
}
/**
* Ask AI a question about a URL
* Scrapes the URL and uses AI to answer the question
*
* @param {string} url - URL to analyze
* @param {string} question - Question to answer
* @param {object} options - Additional options
* @returns {Promise<object>} AI response with answer
*/
async askAI(url, question, options = {}) {
try {
// First scrape the content
const scrapeResult = await this.scrape(url, options);
if (!scrapeResult.success) {
return {
success: false,
error: `Failed to scrape URL: ${scrapeResult.error}`,
method: scrapeResult.method
};
}
// Check for OpenRouter/OpenAI API key
const openRouterKey = options.openRouterApiKey || this.options.openRouterApiKey || process.env.OPENROUTER_API_KEY;
const openAIKey = options.openAIApiKey || this.options.openAIApiKey || process.env.OPENAI_API_KEY;
// Priority: OpenRouter > OpenAI > Backend API > Local
if (openRouterKey) {
try {
const answer = await this.processWithOpenRouter(question, scrapeResult.content, openRouterKey, options);
return {
success: true,
answer,
method: scrapeResult.method,
scrapeTime: scrapeResult.stats.totalTime,
processing: 'openrouter'
};
} catch (error) {
this.log(' ⚠️ OpenRouter API call failed, falling back...');
}
}
if (openAIKey) {
try {
const answer = await this.processWithOpenAI(question, scrapeResult.content, openAIKey, options);
return {
success: true,
answer,
method: scrapeResult.method,
scrapeTime: scrapeResult.stats.totalTime,
processing: 'openai'
};
} catch (error) {
this.log(' ⚠️ OpenAI API call failed, falling back...');
}
}
// If BNCA API key is provided, use the backend API
if (this.options.apiKey) {
try {
const response = await fetch(`${this.options.apiUrl || 'https://bnca-api.fly.dev'}/aireply`, {
method: 'POST',
headers: {
'x-api-key': this.options.apiKey,
'Content-Type': 'application/json'
},
body: JSON.stringify({ url, question })
});
if (response.ok) {
const data = await response.json();
return {
success: true,
answer: data.answer,
method: scrapeResult.method,
scrapeTime: scrapeResult.stats.totalTime,
processing: 'backend'
};
}
} catch (error) {
this.log(' ⚠️ Backend API call failed, using local AI processing');
}
}
// Local AI processing fallback
const answer = this.processLocally(question, scrapeResult.content);
return {
success: true,
answer,
method: scrapeResult.method,
scrapeTime: scrapeResult.stats.totalTime,
processing: 'local'
};
} catch (error) {
return {
success: false,
error: error.message || 'AI processing failed'
};
}
}
/**
* Process with OpenRouter API
* @private
*/
async processWithOpenRouter(question, content, apiKey, options = {}) {
const parsedContent = typeof content === 'string' ? JSON.parse(content) : content;
const contentText = `
Title: ${parsedContent.title || 'Unknown'}
Content: ${parsedContent.content || parsedContent.bodyText || 'No content available'}
Meta Description: ${parsedContent.metaDescription || 'None'}
${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h => `- ${h.text || h}`).join('\n')}` : ''}
`.trim();
const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${apiKey}`,
'HTTP-Referer': options.referer || 'https://github.com/monostate/node-scraper',
'X-Title': 'BNCA Node Scraper',
},
body: JSON.stringify({
model: options.model || 'meta-llama/llama-4-scout:free',
messages: [
{
role: 'system',
content: 'You are a helpful assistant that answers questions based on website content. Provide accurate, concise answers based only on the provided content.'
},
{
role: 'user',
content: `Based on the following website content, please answer this question: ${question}\n\nWebsite content:\n${contentText}`
}
],
temperature: options.temperature || 0.3,
max_tokens: options.maxTokens || 500,
}),
});
if (!response.ok) {
throw new Error(`OpenRouter API error: ${response.status}`);
}
const data = await response.json();
return data.choices[0]?.message?.content || 'No response from AI';
}
/**
* Process with OpenAI API
* @private
*/
async processWithOpenAI(question, content, apiKey, options = {}) {
const parsedContent = typeof content === 'string' ? JSON.parse(content) : content;
const contentText = `
Title: ${parsedContent.title || 'Unknown'}
Content: ${parsedContent.content || parsedContent.bodyText || 'No content available'}
Meta Description: ${parsedContent.metaDescription || 'None'}
${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h => `- ${h.text || h}`).join('\n')}` : ''}
`.trim();
const baseUrl = options.openAIBaseUrl || 'https://api.openai.com';
const response = await fetch(`${baseUrl}/v1/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${apiKey}`,
},
body: JSON.stringify({
model: options.model || 'gpt-3.5-turbo',
messages: [
{
role: 'system',
content: 'You are a helpful assistant that answers questions based on website content. Provide accurate, concise answers based only on the provided content.'
},
{
role: 'user',
content: `Based on the following website content, please answer this question: ${question}\n\nWebsite content:\n${contentText}`
}
],
temperature: options.temperature || 0.3,
max_tokens: options.maxTokens || 500,
}),
});
if (!response.ok) {
throw new Error(`OpenAI API error: ${response.status}`);
}
const data = await response.json();
return data.choices[0]?.message?.content || 'No response from AI';
}
/**
* Local AI processing (simple pattern matching)
* @private
*/
processLocally(question, content) {
const parsedContent = typeof content === 'string' ?
JSON.parse(content) : content;
const title = parsedContent.title || 'Unknown';
const text = parsedContent.content || parsedContent.bodyText || '';
const lowerQuestion = question.toLowerCase();
if (lowerQuestion.includes('title')) {
return `The page title is "${title}".`;
}
if (lowerQuestion.includes('about') || lowerQuestion.includes('what')) {
return `This page titled "${title}" contains: ${text.substring(0, 200)}...`;
}
if (lowerQuestion.includes('contact') || lowerQuestion.includes('email')) {
const emailMatch = text.match(/[\w.-]+@[\w.-]+\.\w+/);
return emailMatch ?
`Found contact: ${emailMatch[0]}` :
'No contact information found.';
}
return `Based on "${title}": ${text.substring(0, 150)}...`;
}
/**
* Main scraping method with intelligent fallback
*/
async scrape(url, options = {}) {
const startTime = Date.now();
const config = { ...this.options, ...options };
this.log(`🚀 Starting smart scrape for: ${url}`);
let result = null;
let method = 'unknown';
let lastError = null;
const fallbackChain = [];
// Check if a specific method is requested
const requestedMethod = config.method;
const isForced = requestedMethod && requestedMethod !== 'auto';
try {
// Check if URL is a PDF (by extension or content-type check)
const isPdfUrl = url.toLowerCase().endsWith('.pdf') ||
url.toLowerCase().includes('.pdf?') ||
url.toLowerCase().includes('/pdf/');
if (isPdfUrl) {
this.log(' 📄 PDF detected, using PDF parser...');
result = await this.tryPDFParse(url, config);
if (result.success) {
method = 'pdf';
this.log(' ✅ PDF parsing successful');
const totalTime = Date.now() - startTime;
return {
...result,
method,
performance: {
totalTime,
method
},
stats: this.getStats()
};
} else {
this.log(' ❌ PDF parsing failed');
lastError = result.error;
}
}
// Handle forced method requests
if (isForced) {
this.log(` 🎯 Method forced to: ${requestedMethod}`);
switch (requestedMethod) {
case 'direct':
this.log(' 🔄 Attempting direct fetch...');
result = await this.tryDirectFetch(url, config);
method = 'direct-fetch';
break;
case 'lightpanda':
this.log(' 🐼 Attempting Lightpanda...');
result = await this.tryLightpanda(url, config);
method = 'lightpanda';
break;
case 'puppeteer':
this.log(' 🔵 Attempting Puppeteer...');
result = await this.tryPuppeteer(url, config);
method = 'puppeteer';
break;
default:
return {
success: false,
error: `Invalid method: ${requestedMethod}. Valid methods are: auto, direct, lightpanda, puppeteer`,
method: 'error',
errorType: 'service_unavailable',
performance: {
totalTime: Date.now() - startTime
}
};
}
// For forced methods, return immediately with no fallback
if (!result.success) {
this.log(` ❌ ${requestedMethod} failed`);
return {
success: false,
error: result.error || `${requestedMethod} scraping failed`,
method,
errorType: this.categorizeError(result.error),
details: result.error,
performance: {
totalTime: Date.now() - startTime,
method
},
stats: this.getStats()
};
}
this.log(` ✅ ${requestedMethod} successful`);
const totalTime = Date.now() - startTime;
return {
...result,
method,
performance: {
totalTime,
method
},
stats: this.getStats()
};
}
// Step 1: Try direct fetch first (fastest)
this.log(' 🔄 Attempting direct fetch...');
fallbackChain.push('direct-fetch');
result = await this.tryDirectFetch(url, config);
if (result.success && !result.needsBrowser) {
method = 'direct-fetch';
this.log(' ✅ Direct fetch successful');
} else if (result.isPdf) {
// Direct fetch detected a PDF, try PDF parser
this.log(' 📄 Direct fetch detected PDF content, using PDF parser...');
result = await this.tryPDFParse(url, config);
if (result.success) {
method = 'pdf';
this.log(' ✅ PDF parsing successful');
const totalTime = Date.now() - startTime;
return {
...result,
method,
performance: {
totalTime,
method
},
stats: this.getStats()
};
} else {
this.log(' ❌ PDF parsing failed');
lastError = result.error;
}
} else {
this.log(result.needsBrowser ? ' ⚠️ Browser rendering required' : ' ❌ Direct fetch failed');
lastError = result.error;
// Step 2: Try Lightpanda (fast browser)
this.log(' 🐼 Attempting Lightpanda...');
fallbackChain.push('lightpanda');
result = await this.tryLightpanda(url, config);
if (result.success) {
method = 'lightpanda';
this.log(' ✅ Lightpanda successful');
} else {
this.log(' ❌ Lightpanda failed, falling back to Puppeteer');
lastError = result.error;
// Step 3: Fallback to Puppeteer (full browser)
this.log(' 🔵 Attempting Puppeteer...');
fallbackChain.push('puppeteer');
result = await this.tryPuppeteer(url, config);
if (result.success) {
method = 'puppeteer';
this.log(' ✅ Puppeteer successful');
} else {
method = 'failed';
this.log(' ❌ All methods failed');
lastError = result.error;
}
}
}
const totalTime = Date.now() - startTime;
return {
...result,
method,
performance: {
totalTime,
method
},
stats: this.getStats(),
// Only include fallbackChain in auto mode
...((!requestedMethod || requestedMethod === 'auto') && { fallbackChain })
};
} catch (error) {
return {
success: false,
method: 'error',
error: error.message,
performance: {
totalTime: Date.now() - startTime
}
};
}
}
/**
* Direct HTTP fetch - fastest method for simple sites
*/
async tryDirectFetch(url, config) {
this.stats.directFetch.attempts++;
try {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), config.timeout);
const response = await fetch(url, {
headers: {
'User-Agent': config.userAgent,
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Accept-Encoding': 'gzip, deflate',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
},
signal: controller.signal
});
clearTimeout(timeoutId);
if (!response.ok) {
return {
success: false,
error: `Direct fetch failed: HTTP ${response.status}: ${response.statusText}`,
errorType: response.status === 404 ? 'service_unavailable' : 'network'
};
}
// Check if the response is actually a PDF
const contentType = response.headers.get('content-type') || '';
if (contentType.includes('application/pdf')) {
return {
success: false,
error: 'Content is PDF, should use PDF parser',
isPdf: true
};
}
// Get response as array buffer to check magic bytes
const buffer = await response.arrayBuffer();
const firstBytes = new Uint8Array(buffer.slice(0, 5));
const signature = Array.from(firstBytes).map(b => String.fromCharCode(b)).join('');
// Check for PDF magic bytes
if (signature.startsWith('%PDF')) {
return {
success: false,
error: 'Content is PDF (detected by magic bytes), should use PDF parser',
isPdf: true
};
}
// Convert buffer back to text for HTML processing
const html = new TextDecoder().decode(buffer);
// Intelligent browser detection
const needsBrowser = this.detectBrowserRequirement(html, url);
if (!needsBrowser) {
const content = this.extractContentFromHTML(html);
this.stats.directFetch.successes++;
return {
success: true,
needsBrowser: false,
content,
html,
size: html.length,
contentType: response.headers.get('content-type') || 'text/html'
};
} else {
return {
success: true,
needsBrowser: true,
html,
size: html.length,
browserIndicators: this.getBrowserIndicators(html)
};
}
} catch (error) {
const errorMsg = error.message || 'Unknown error';
return {
success: false,
error: `Direct fetch failed: ${errorMsg}`,
errorType: this.categorizeError(errorMsg)
};
}
}
/**
* Lightpanda browser - fast browser engine for static/SSR sites
*/
async tryLightpanda(url, config) {
this.stats.lightpanda.attempts++;
if (!this.options.lightpandaPath) {
return {
success: false,
error: 'Lightpanda scraping failed: Lightpanda binary not found. Please install Lightpanda or provide path.',
errorType: 'service_unavailable'
};
}
try {
// Check if binary exists
const stats = statSync(this.options.lightpandaPath);
if (!stats.isFile()) {
return {
success: false,
error: 'Lightpanda scraping failed: Lightpanda binary is not a file',
errorType: 'service_unavailable'
};
}
} catch {
return {
success: false,
error: 'Lightpanda scraping failed: Lightpanda binary not accessible',
errorType: 'service_unavailable'
};
}
return new Promise((resolve) => {
const args = ['fetch', '--dump', url];
const process = spawn(this.options.lightpandaPath, args, {
timeout: config.timeout + 1000 // Add buffer for process timeout only
});
let output = '';
let errorOutput = '';
process.stdout.on('data', (data) => {
output += data.toString();
});
process.stderr.on('data', (data) => {
errorOutput += data.toString();
});
process.on('close', (code) => {
if (code === 0 && output.length > 0) {
const content = this.extractContentFromHTML(output);
this.stats.lightpanda.successes++;
resolve({
success: true,
content,
html: output,
size: output.length,
exitCode: code
});
} else {
const errorMsg = errorOutput || `Lightpanda exited with code ${code}`;
resolve({
success: false,
error: `Lightpanda scraping failed: ${errorMsg}`,
errorType: this.categorizeError(errorMsg),
exitCode: code
});
}
});
process.on('error', (error) => {
resolve({
success: false,
error: `Lightpanda scraping failed: ${error.message}`,
errorType: this.categorizeError(error.message)
});
});
});
}
/**
* Puppeteer browser - full Chromium for complex JavaScript sites
*/
async tryPuppeteer(url, config) {
this.stats.puppeteer.attempts++;
if (!puppeteer) {
return {
success: false,
error: 'Puppeteer scraping failed: Puppeteer is not installed. Please install puppeteer package.',
errorType: 'service_unavailable'
};
}
let browser = null;
let page = null;
try {
// Get browser from pool
browser = await browserPool.getBrowser();
page = await browser.newPage();
// Set user agent and viewport
await page.setUserAgent(config.userAgent);
await page.setViewport({ width: 1280, height: 720 });
// Block unnecessary resources for faster loading
await page.setRequestInterception(true);
page.on('request', (req) => {
const resourceType = req.resourceType();
if (['image', 'stylesheet', 'font', 'media'].includes(resourceType)) {
req.abort();
} else {
req.continue();
}
});
// Navigate with timeout
await page.goto(url, {
waitUntil: 'networkidle0',
timeout: config.timeout
});
// Extract content using browser APIs
const content = await page.evaluate(() => {
// Get basic page info
const title = document.title;
const metaDescription = document.querySelector('meta[name="description"]')?.content || '';
const canonical = document.querySelector('link[rel="canonical"]')?.href || '';
// Extract headings
const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'))
.map(h => ({
level: h.tagName.toLowerCase(),
text: h.textContent.trim()
}))
.filter(h => h.text.length > 0)
.slice(0, 20);
// Extract paragraphs
const paragraphs = Array.from(document.querySelectorAll('p'))
.map(p => p.textContent.trim())
.filter(text => text.length > 20)
.slice(0, 10);
// Extract links
const links = Array.from(document.querySelectorAll('a[href]'))
.map(a => ({
text: a.textContent.trim(),
href: a.href
}))
.filter(link => link.text.length > 0)
.slice(0, 15);
// Extract JSON-LD structured data
const structuredData = Array.from(document.querySelectorAll('script[type=\"application/ld+json\"]'))
.map(script => {
try {
return JSON.parse(script.textContent);
} catch {
return null;
}
})
.filter(data => data !== null);
// Get page text content (truncated)
const bodyText = document.body.textContent
.replace(/\\s+/g, ' ')
.trim()
.substring(0, 3000);
return {
title,
metaDescription,
canonical,
headings,
paragraphs,
links,
structuredData,
bodyText,
url: window.location.href
};
});
this.stats.puppeteer.successes++;
return {
success: true,
content: JSON.stringify(content, null, 2),
size: JSON.stringify(content).length
};
} catch (error) {
const errorMsg = error.message || 'Unknown error';
return {
success: false,
error: `Puppeteer scraping failed: ${errorMsg}`,
errorType: this.categorizeError(errorMsg)
};
} finally {
// Always clean up page
if (page) {
try {
// Check if page is still connected before closing
if (!page.isClosed()) {
await page.close();
}
} catch (e) {
// Silently ignore protocol errors when page is already closed
if (!e.message.includes('Protocol error') && !e.message.includes('Target closed')) {
console.warn('Error closing page:', e.message);
}
}
}
// Release browser back to pool
if (browser) {
browserPool.releaseBrowser(browser);
}
}
}
/**
* PDF parsing method - handles PDF documents
*/
async tryPDFParse(url, config) {
this.stats.pdf.attempts++;
try {
// Download PDF with timeout
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), config.timeout);
const response = await fetch(url, {
headers: {
'User-Agent': config.userAgent,
'Accept': 'application/pdf,*/*'
},
signal: controller.signal
});
clearTimeout(timeoutId);
if (!response.ok) {
return {
success: false,
error: `HTTP ${response.status}: ${response.statusText}`
};
}
// Check content type (be lenient - accept various content types)
const contentType = response.headers.get('content-type') || '';
const acceptableTypes = ['pdf', 'octet-stream', 'binary', 'download'];
const isAcceptableType = acceptableTypes.some(type => contentType.includes(type));
if (!isAcceptableType && !url.toLowerCase().includes('.pdf')) {
return {
success: false,
error: `Not a PDF document: ${contentType}`
};
}
// Get PDF buffer
const arrayBuffer = await response.arrayBuffer();
const buffer = Buffer.from(arrayBuffer);
// Check size limit (20MB)
if (buffer.length > 20 * 1024 * 1024) {
return {
success: false,
error: 'PDF too large (max 20MB)'
};
}
// Parse PDF
const pdfData = await pdfParse(buffer);
// Extract structured content
const content = {
title: pdfData.info?.Title || 'Untitled PDF',
author: pdfData.info?.Author || '',
subject: pdfData.info?.Subject || '',
keywords: pdfData.info?.Keywords || '',
creator: pdfData.info?.Creator || '',
producer: pdfData.info?.Producer || '',
creationDate: pdfData.info?.CreationDate || '',
modificationDate: pdfData.info?.ModificationDate || '',
pages: pdfData.numpages || 0,
text: pdfData.text || '',
metadata: pdfData.metadata || null,
url: url
};
this.stats.pdf.successes++;
return {
success: true,
content: JSON.stringify(content, null, 2),
size: buffer.length,
contentType: 'application/pdf',
pages: content.pages
};
} catch (error) {
return {
success: false,
error: `PDF parsing error: ${error.message}`
};
}
}
/**
* Intelligent detection of browser requirement
*/
detectBrowserRequirement(html, url) {
// Whitelist simple sites that should always use direct fetch
const simpleSites = [
'example.com',
'httpbin.org',
'wikipedia.org',
'github.io',
'netlify.app',
'vercel.app'
];
if (simpleSites.some(site => url.includes(site))) {
return false; // Always use direct fetch for these
}
// Check for common SPA patterns (be more specific)
const spaIndicators = [
/<div[^>]*id=['"]?root['"]?[^>]*>\s*<\/div>/i,
/<div[^>]*id=['"]?app['"]?[^>]*>\s*<\/div>/i,
/<div[^>]*data-reactroot/i,
/window\.__NEXT_DATA__/i,
/window\.__NUXT__/i,
/_next\/static/i,
/__webpack_require__/i
];
// Check for protection systems (more specific patterns)
const protectionIndicators = [
/cloudflare.*challenge/i,
/cloudflare.*protection/i,
/ray id.*cloudflare/i,
/please enable javascript/i,
/you need to enable javascript/i,
/this site requires javascript/i,
/jscript.*required/i,
/security check.*cloudflare/i,
/attention required.*cloudflare/i
];
// Domain-based checks for known SPA sites
const domainIndicators = [
/instagram\.com/i,
/twitter\.com/i,
/facebook\.com/i,
/linkedin\.com/i,
/maps\.google/i,
/gmail\.com/i,
/youtube\.com/i
];
// Check if it's clearly a SPA or protected site
const hasSpaIndicators = spaIndicators.some(pattern => pattern.test(html));
const hasProtection = protectionIndicators.some(pattern => pattern.test(html));
const isKnownSpa = domainIndicators.some(pattern => pattern.test(url));
// Check for minimal content BUT only if we also have SPA indicators
const bodyContent = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)?.[1] || '';
const textContent = bodyContent
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/\s+/g, ' ')
.trim();
const hasMinimalContent = textContent.length < 200; // More conservative threshold
const isLikelySpa = hasMinimalContent && hasSpaIndicators;
// Only require browser if we have strong indicators
const needsBrowser = hasProtection || isKnownSpa || isLikelySpa;
return needsBrowser;
}
/**
* Get browser requirement indicators for debugging
*/
getBrowserIndicators(html) {
const indicators = [];
if (/<div[^>]*id=['"]?root['"]?[^>]*>\s*<\/div>/i.test(html)) {
indicators.push('React root div detected');
}
if (/window\.__NEXT_DATA__/i.test(html)) {
indicators.push('Next.js data detected');
}
if (/cloudflare.*challenge/i.test(html)) {
indicators.push('Cloudflare challenge detected');
}
if (/cloudflare.*protection/i.test(html)) {
indicators.push('Cloudflare protection detected');
}
if (/please enable javascript/i.test(html)) {
indicators.push('JavaScript required message detected');
}
return indicators;
}
/**
* Extract structured content from HTML
*/
extractContentFromHTML(html) {
try {
// Basic content extraction
const title = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1] || '';
const metaDescription = html.match(/<meta[^>]*name=['"]description['"][^>]*content=['"]([^'"]*)['"]/i)?.[1] || '';
// Extract JSON-LD structured data
const jsonLdMatches = [...html.matchAll(/<script[^>]*type=['"]application\/ld\+json['"][^>]*>([\s\S]*?)<\/script>/gi)];
const structuredData = [];
jsonLdMatches.forEach(match => {
try {
const data = JSON.parse(match[1]);
structuredData.push(data);
} catch {
// Ignore malformed JSON
}
});
// Extract window state data
const windowDataMatch = html.match(/window\.__(?:INITIAL_STATE__|INITIAL_DATA__|NEXT_DATA__)__\s*=\s*({[\s\S]*?});/);
let windowData = null;
if (windowDataMatch) {
try {
windowData = JSON.parse(windowDataMatch[1]);
} catch {
windowData = 'Found but unparseable';
}
}
// Extract main content
const bodyMatch = html.match(/<body[^>]*>([\s\S]*)<\/body>/i);
let textContent = '';
if (bodyMatch) {
textContent = bodyMatch[1]
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<[^>]+>/g, ' ')
.replace(/\s+/g, ' ')
.trim()
.substring(0, 2000);
}
// Extract meta tags
const metaTags = {};
const metaMatches = [...html.matchAll(/<meta[^>]*(?:property|name)=['"]([^'"]+)['"][^>]*content=['"]([^'"]*)['"]/gi)];
metaMatches.slice(0, 15).forEach(match => {
metaTags[match[1]] = match[2];
});
return JSON.stringify({
title,
metaDescription,
structuredData: structuredData.length > 0 ? structuredData : null,
windowData,
metaTags: Object.keys(metaTags).length > 0 ? metaTags : null,
content: textContent,
extractedAt: new Date().toISOString()
}, null, 2);
} catch (error) {
return JSON.stringify({
error: 'Content extraction failed',
message: error.message,
rawLength: html.length
}, null, 2);
}
}
/**
* Find Lightpanda binary
*/
findLightpandaBinary() {
// First check the package's bin directory (installed by postinstall script)
const packageDir = path.dirname(new URL(import.meta.url).pathname);
const packageBinPath = path.join(packageDir, 'bin', 'lightpanda');
const possiblePaths = [
packageBinPath, // Package's bin directory (highest priority)
'./lightpanda',
'../lightpanda',
'./lightpanda/lightpanda',
'/usr/local/bin/lightpanda',
path.join(process.cwd(), 'lightpanda'),
path.join(process.cwd(), 'bin', 'lightpanda')
];
for (const binaryPath of possiblePaths) {
try {
// Synchronous check for binary existence and executability
const fullPath = path.resolve(binaryPath);
if (existsSync(fullPath)) {
const stats = statSync(fullPath);
if (stats.isFile()) {
// Check if it's executable (on Unix-like systems including WSL)
if (process.platform !== 'win32' || this.isWSL()) {
const mode = stats.mode;
const isExecutable = Boolean(mode & parseInt('111', 8));
if (isExecutable) {
return fullPath;
}
} else {
// On native Windows (not WSL), Lightpanda is not supported
continue;
}
}
}
} catch {
continue;
}
}
return null;
}
/**
* Check if running in WSL environment
*/
isWSL() {
try {
const uname = execSync('uname -r', { encoding: 'utf8', stdio: ['ignore', 'pipe', 'ignore'] });
return uname.toLowerCase().includes('microsoft') || uname.toLowerCase().includes('wsl');
} catch {
return false;
}
}
/**
* Get performance statistics
*/
getStats() {
return {
...this.stats,
successRates: {
directFetch: this.stats.directFetch.attempts > 0 ?
(this.stats.directFetch.successes / this.stats.directFetch.attempts * 100).toFixed(1) + '%' : '0%',
lightpanda: this.stats.lightpanda.attempts > 0 ?
(this.stats.lightpanda.successes / this.stats.lightpanda.attempts * 100).toFixed(1) + '%' : '0%',
puppeteer: this.stats.puppeteer.attempts > 0 ?
(this.stats.puppeteer.successes / this.stats.puppeteer.attempts * 100).toFixed(1) + '%' : '0%',
pdf: this.stats.pdf.attempts > 0 ?
(this.stats.pdf.successes / this.stats.pdf.attempts * 100).toFixed(1) + '%' : '0%'
}
};
}
/**
* Categorize error types for better error handling
*/
categorizeError(errorMessage) {
if (!errorMessage) return 'unknown';
const error = errorMessage.toLowerCase();
if (error.includes('timeout') || error.includes('timed out') || error.includes('abort')) {
return 'timeout';
} else if (error.includes('network') || error.includes('enotfound') || error.includes('econnrefused') || error.includes('econnreset')) {
return 'network';
} else if (error.includes('parse') || error.includes('parsing') || error.includes('invalid')) {
return 'parsing';
} else if (error.includes('not found') || error.includes('404') || error.includes('unavailable')) {
return 'service_unavailable';
}
return 'unknown';
}
/**
* Logging helper
*/
log(message) {
if (this.options.verbose) {
console.log(message);
}
}
/**
* Cleanup resources
*/
async cleanup() {
if (this.browser) {
await this.browser.close();
this.browser = null;
}
}
/**
* Take a screenshot of a webpage
*/
async screenshot(url, options = {}) {
const startTime = Date.now();
const config = { ...this.options, ...options };
this.log(`📸 Taking screenshot for: ${url}`);
try {
const screenshot = await this.takeScreenshotWithChrome(url, config);
return {
success: !!screenshot,
screenshot,
method: 'chrome-screenshot',
performance: {
totalTime: Date.now() - startTime
}
};
} catch (error) {
return {
success: false,
error: error.message,
method: 'chrome-screenshot',
performance: {
totalTime: Date.now() - startTime
}
};
}
}
/**
* Quick screenshot capture - optimized for speed
*/
async quickshot(url, options = {}) {
const startTime = Date.now();
const config = {
...this.options,
...options,
timeout: options.timeout || 15000 // Longer timeout for screenshots
};
this.log(`⚡ Taking quick screenshot for: ${url}`);
try {
const screenshot = await this.takeScreenshotOptimized(url, config);
return {
success: !!screenshot,
screenshot,
method: 'quickshot',
performance: {
totalTime: Date.now() - startTime
}
};
} catch (error) {
return {
success: false,
error: error.message,
method: 'quickshot',
performance: {
totalTime: Date.now() - startTime
}
};
}
}
/**
* Take screenshot using Chrome CLI
*/
async takeScreenshotWithChrome(url, config) {
const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`);
try {
const args = [
'--headless=new',
'--disable-gpu',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'--user-agent=' + config.userAgent,
'--screenshot=' + tempFile,
'--window-size=1280,800',
'--hide-scrollbars',
'--virtual-time-budget=10000',
url
];
const chromePath = await this.findChromePath();
if (!chromePath) {
throw new Error('Chrome/Chromium not found');
}
return new Promise((resolve) => {
const chrome = spawn(chromePath, args, {
stdio: ['ignore', 'pipe', 'pipe'],
detached: false
});
let processExited = false;
let stderr = '';
chrome.stderr.on('data', (data) => {
stderr += data.toString();
});
const killTimeout = setTimeout(() => {
if (!processExited) {
this.log('Chrome timeout, sending SIGTERM...');
chrome.kill('SIGTERM');
setTimeout(() => {
if (!processExited) {
chrome.kill('SIGKILL');
}
}, 1000);
}
}, config.timeout || 15000);
chrome.on('exit', async (code, signal) => {
processExited = true;
clearTimeout(killTimeout);
try {
await new Promise(r => setTimeout(r, 500));
const screenshotBuffer = await fsPromises.readFile(tempFile);
const base64 = screenshotBuffer.toString('base64');
await fsPromises.unlink(tempFile).catch(() => {});
resolve(`data:image/png;base64,${base64}`);
} catch (error) {
resolve(null);
}
});
chrome.on('error', (error) => {
clearTimeout(killTimeout);
resolve(null);
});
});
} catch (error) {
return null;
}
}
/**
* Optimized screenshot for speed
*/
async takeScreenshotOptimized(url, config, retryCount = 0) {
const tempFile = path.join('/tmp', `screenshot_${Date.now()}_${Math.random().toString(36).substring(7)}.png`);
try {
const virtualTimeBudget = retryCount === 0 ? 5000 : 8000;
const processTimeout = retryCount === 0 ? 8000 : 12000;
const args = [
'--headless=new',
'--disable-gpu',
'--no-sandbox',
'--disable-setuid-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
'--disable-features=TranslateUI',
'--disable-extensions',
'--disable-default-apps',
'--disable-sync',
'--metrics-recording-only',
'--mute-audio',
'--no-first-run',
'--disable-background-timer-throttling',
'--disable-backgrounding-occluded-windows',
'--disable-renderer-backgrounding',
'--user-agent=' + config.userAgent,
'--screenshot=' + tempFile,
'--window-size=1280,800',
'--hide-scrollbars',
'--run-all-compositor-stages-before-draw',
`--virtual-time-budget=${virtualTimeBudget}`,
url
];
const chromePath = await this.findChromePath();
if (!chromePath) {
throw new Error('Chrome/Chromium not found');
}
return new Promise((resolve) => {
const chrome = spawn(chromePath, args, {
stdio: ['ignore', 'pipe', 'pipe'],
detached: false
});
let processExited = false;
const killTimeout = setTimeout(() => {
if (!processExited) {
chrome.kill('SIGTERM');
setTimeout(() => {
if (!processExited) {
chrome.kill('SIGKILL');
}
}, 1000);
}
}, processTimeout);
chrome.on('exit', async (code, signal) => {
processExited = true;
clearTimeout(killTimeout);
try {
await new Promise(r => setTimeout(r, 500));
const screenshotBuffer = await fsPromises.readFile(tempFile);
const base64 = screenshotBuffer.toString('base64');
await fsPromises.unlink(tempFile).catch(() => {});
resolve(`data:image/png;base64,${base64}`);
} catch (error) {
if (retryCount === 0) {
const retryResult = await this.takeScreenshotOptimized(url, config, 1);
resolve(retryResult);
} else {
resolve(null);
}
}
});
chrome.on('error', (error) => {
clearTimeout(killTimeout);
resolve(null);
});
});
} catch (error) {
if (retryCount === 0) {
return this.takeScreenshotOptimized(url, config, 1);
}
return null;
}
}
/**
* Find Chrome/Chromium binary path
*/
async findChromePath() {
const chromePaths = process.platform === 'darwin' ? [
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
] : [
'/usr/bin/chromium-browser',
'/usr/bin/chromium',
'/usr/bin/google-chrome-stable',
'/usr/bin/google-chrome',
];
for (const path of chromePaths) {
try {
await fsPromises.access(path);
return path;
} catch (e) {
continue;
}
}
return null;
}
/**
* Health check for all scraping methods
*/
async healthCheck() {
const testUrl = 'https://example.com';
const results = {};
// Test direct fetch
try {
const directResult = await this.tryDirectFetch(testUrl, this.options);
results.directFetch = directResult.success;
} catch {
results.directFetch = false;
}
// Test Lightpanda
try {
const lightpandaResult = await this.tryLightpanda(testUrl, this.options);
results.lightpanda = lightpandaResult.success;
} catch {
results.lightpanda = false;
}
// Test Puppeteer
try {
const puppeteerResult = await this.tryPuppeteer(testUrl, this.options);
results.puppeteer = puppeteerResult.success;
await this.cleanup(); // Clean up after test
} catch {
results.puppeteer = false;
}
return {
status: Object.values(results).some(r => r) ? 'healthy' : 'unhealthy',
methods: results,
timestamp: new Date().toISOString()
};
}
/**
* Clean up resources - closes all browser instances
*/
async cleanup() {
await browserPool.closeAll();
}
/**
* Bulk scrape multiple URLs with optimized concurrency
* @param {string[]} urls - Array of URLs to scrape
* @param {Object} options - Scraping options
* @returns {Promise<Object>} Bulk scraping results
*/
async bulkScrape(urls, options = {}) {
const {
concurrency = 5,
progressCallback = null,
continueOnError = true,
...scrapeOptions
} = options;
const results = {
success: [],
failed: [],
total: urls.length,
startTime: Date.now(),
endTime: null,
stats: {
successful: 0,
failed: 0,
totalTime: 0,
averageTime: 0,
methods: {
direct: 0,
lightpanda: 0,
puppeteer: 0,
pdf: 0
}
}
};
// Process URLs in batches
const batches = [];
for (let i = 0; i < urls.length; i += concurrency) {
batches.push(urls.slice(i, i + concurrency));
}
let processedCount = 0;
for (const batch of batches) {
const batchPromises = batch.map(async (url) => {
const startTime = Date.now();
try {
const result = await this.scrape(url, scrapeOptions);
const endTime = Date.now();
const duration = endTime - startTime;
const successResult = {
url,
...result,
duration,
timestamp: new Date(endTime).toISOString()
};
results.success.push(successResult);
results.stats.successful++;
// Track method usage
if (result.method) {
results.stats.methods[result.method]++;
}
return successResult;
} catch (error) {
const endTime = Date.now();
const duration = endTime - startTime;
const failedResult = {
url,
success: false,
error: error.message,
duration,
timestamp: new Date(endTime).toISOString()
};
results.failed.push(failedResult);
results.stats.failed++;
if (!continueOnError) {
throw error;
}
return failedResult;
} finally {
processedCount++;
if (progressCallback) {
progressCallback({
processed: processedCount,
total: urls.length,
percentage: (processedCount / urls.length) * 100,
current: url
});
}
}
});
await Promise.all(batchPromises);
}
results.endTime = Date.now();
results.stats.totalTime = results.endTime - results.startTime;
results.stats.averageTime = results.stats.totalTime / urls.length;
return results;
}
/**
* Bulk scrape with streaming results
* @param {string[]} urls - Array of URLs to scrape
* @param {Object} options - Scraping options with onResult callback
* @returns {Promise<Object>} Summary statistics
*/
async bulkScrapeStream(urls, options = {}) {
const {
concurrency = 5,
onResult = null,
onError = null,
progressCallback = null,
...scrapeOptions
} = options;
if (!onResult) {
throw new Error('onResult callback is required for streaming bulk scrape');
}
const stats = {
total: urls.length,
processed: 0,
successful: 0,
failed: 0,
startTime: Date.now(),
endTime: null,
methods: {
direct: 0,
lightpanda: 0,
puppeteer: 0,
pdf: 0
}
};
const queue = [...urls];
const inProgress = new Set();
const processNext = async () => {
if (queue.length === 0 || inProgress.size >= concurrency) {
return;
}
const url = queue.shift();
inProgress.add(url);
const startTime = Date.now();
try {
const result = await this.scrape(url, scrapeOptions);
const duration = Date.now() - startTime;
stats.successful++;
if (result.method) {
stats.methods[result.method]++;
}
await onResult({
url,
...result,
duration,
timestamp: new Date().to