/*
 * web-vuln-scanner
 * Version: (not specified)
 * Advanced, lightweight web vulnerability scanner with smart detection and an easy-to-use interface.
 * 982 lines (839 loc) • 31.2 kB
 * JavaScript
 */
const puppeteer = require('puppeteer');
const { URL } = require('url');
const debug = require('debug')('web-vuln-scanner:puppeteer');
// URL prefixes that never lead to a crawlable document.
const SKIPPED_URL_PREFIXES = Object.freeze([
  '#',
  'mailto:',
  'tel:',
  'javascript:',
  'data:',
  'vbscript:',
]);

// Schemes the crawler records: pages (http/https) and sockets (ws/wss).
const ALLOWED_URL_PROTOCOLS = Object.freeze(['http:', 'https:', 'ws:', 'wss:']);

/**
 * Runs INSIDE the browser page (it is serialized by Puppeteer, so it must not
 * reference anything from the Node.js scope). Wraps XMLHttpRequest, fetch and
 * WebSocket so that every call is appended to `window.capturedRequests` /
 * `window.capturedWebSockets`. Safe to run more than once per document.
 */
function installCaptureHooks() {
  if (window.__wvsCaptureHooksInstalled) {
    return;
  }
  window.__wvsCaptureHooksInstalled = true;
  window.capturedRequests = window.capturedRequests || [];
  window.capturedWebSockets = window.capturedWebSockets || [];

  // Intercept XMLHttpRequest; forward the exact argument list so the
  // "async" default is not altered by an explicit `undefined`.
  const originalXhrOpen = XMLHttpRequest.prototype.open;
  XMLHttpRequest.prototype.open = function open(...args) {
    const [method, url] = args;
    window.capturedRequests.push({ type: 'xhr', method, url: String(url) });
    return originalXhrOpen.apply(this, args);
  };

  // Intercept fetch; `input` may be a string, a Request or a URL object.
  if (typeof window.fetch === 'function') {
    const originalFetch = window.fetch;
    window.fetch = function fetch(...args) {
      const [input, init] = args;
      let url;
      if (typeof input === 'string') {
        url = input;
      } else if (input && typeof input.url === 'string') {
        url = input.url;
      } else {
        url = String(input);
      }
      const method = init?.method || input?.method || 'GET';
      window.capturedRequests.push({ type: 'fetch', method, url });
      return originalFetch.apply(this, args);
    };
  }

  // Intercept WebSocket construction. A Proxy keeps the prototype and the
  // static constants (WebSocket.OPEN, ...) that page scripts rely on.
  if (typeof window.WebSocket === 'function') {
    window.WebSocket = new Proxy(window.WebSocket, {
      construct(target, args, newTarget) {
        const [url, protocols] = args;
        window.capturedWebSockets.push({ url: String(url), protocols });
        return Reflect.construct(target, args, newTarget);
      },
    });
  }
}

/**
 * Headless-browser crawler that walks a site breadth-first with Puppeteer and
 * records links, forms, network requests, AJAX/WebSocket/API endpoints and
 * JavaScript errors for later vulnerability analysis.
 */
class PuppeteerCrawler {
  /**
   * @param {object} options
   * @param {string} options.baseUrl - Absolute start URL (required).
   * @param {number} [options.depth=3] - Number of breadth-first levels to crawl.
   * @param {number} [options.maxPages=500] - Maximum number of pages to visit.
   * @param {number} [options.timeout=30000] - Navigation timeout in ms.
   * @param {string} [options.userAgent] - User-Agent header value.
   * @param {object} [options.headers] - Extra HTTP headers for every page.
   * @param {object[]} [options.cookies] - Cookies passed to `page.setCookie`.
   * @param {boolean} [options.includeSubdomains=false] - Treat subdomains of the base host as in scope.
   * @param {boolean} [options.screenshot=false] - Save screenshots under `screenshots/`.
   * @param {boolean} [options.interceptRequests=true] - Record network traffic.
   * @param {number} [options.waitForJs=3000] - Delay in ms after navigation before extraction.
   * @param {number} [options.maxRetries=2] - Extra attempts after a failed page load.
   * @throws {TypeError} When `baseUrl` is missing or not a valid absolute URL.
   */
  constructor(options = {}) {
    if (!options.baseUrl) {
      throw new TypeError('PuppeteerCrawler requires options.baseUrl');
    }
    this.baseUrl = options.baseUrl;
    this.baseUrlObj = new URL(this.baseUrl);
    this.depth = options.depth || 3;
    this.maxPages = options.maxPages || 500;
    // `??` so that an explicit 0 (no timeout / no wait / no retries) is honoured.
    this.timeout = options.timeout ?? 30000;
    this.userAgent = options.userAgent || 'WebVulnScanner/2.0 (Security Testing)';
    this.headers = options.headers || {};
    this.cookies = options.cookies || [];
    this.includeSubdomains = options.includeSubdomains || false;
    this.screenshot = options.screenshot || false;
    this.interceptRequests = options.interceptRequests !== false;
    this.waitForJs = options.waitForJs ?? 3000;
    this.maxRetries = options.maxRetries ?? 2;

    // State management. The start URL is stored in its normalised form
    // (e.g. with the trailing "/") so that links pointing back to it are
    // recognised as already known instead of being crawled a second time.
    const startUrl = this.baseUrlObj.href;
    this.visited = new Set();
    this.queue = [startUrl];
    this.foundUrls = new Set([startUrl]);
    this.failedUrls = new Set();
    this.pageData = new Map();
    this.formData = new Map();
    this.ajaxEndpoints = new Set();
    this.websocketEndpoints = new Set();
    this.apiEndpoints = new Set();
    this.jsErrors = new Map();
    this.networkRequests = new Map();

    // Browser management
    this.browser = null;
    this.browserLaunch = null; // in-flight launch promise, shared by concurrent callers
    this.activeTabs = new Set();
  }

  /**
   * Launches Chromium and wires the disconnect handler.
   * @returns {Promise<object>} The Puppeteer browser instance.
   */
  async launchBrowser() {
    debug('Launching browser with enhanced configuration');
    const browser = await puppeteer.launch({
      headless: 'new',
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--no-first-run',
        '--no-zygote',
        '--disable-gpu',
        '--disable-web-security',
        '--disable-features=VizDisplayCompositor',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
        '--disable-field-trial-config',
        '--disable-ipc-flooding-protection',
      ],
      timeout: 60000,
      ignoreDefaultArgs: ['--disable-extensions'],
    });
    this.browser = browser;
    // Handle browser disconnect; only clear the reference if it still points
    // at this instance (a replacement may have been launched meanwhile).
    browser.on('disconnected', () => {
      debug('Browser disconnected');
      if (this.browser === browser) {
        this.browser = null;
      }
    });
    return browser;
  }

  /**
   * Returns the shared browser, launching it on first use. Concurrent callers
   * share one launch instead of each starting their own browser.
   * @returns {Promise<object>} The Puppeteer browser instance.
   */
  async initBrowser() {
    if (this.browser) {
      return this.browser;
    }
    if (!this.browserLaunch) {
      this.browserLaunch = this.launchBrowser().finally(() => {
        this.browserLaunch = null;
      });
    }
    return this.browserLaunch;
  }

  /**
   * Opens a new tab configured with viewport, user agent, headers, cookies,
   * in-page capture hooks and (optionally) network monitoring.
   * The tab is closed again if any configuration step fails.
   * @returns {Promise<object>} The configured Puppeteer page.
   */
  async createPage() {
    const browser = await this.initBrowser();
    const page = await browser.newPage();
    this.activeTabs.add(page);
    try {
      await page.setViewport({ width: 1366, height: 768 });
      await page.setUserAgent(this.userAgent);
      if (Object.keys(this.headers).length > 0) {
        await page.setExtraHTTPHeaders(this.headers);
      }
      if (this.cookies.length > 0) {
        // The tab is still blank here, so cookies that name neither a url nor
        // a domain are bound to the crawl target explicitly.
        const cookies = this.cookies.map((cookie) =>
          cookie.url || cookie.domain ? cookie : { ...cookie, url: this.baseUrl }
        );
        await page.setCookie(...cookies);
      }
      // Must be registered before navigation so the hooks exist when the
      // page's own scripts start issuing XHR/fetch/WebSocket calls.
      await page.evaluateOnNewDocument(installCaptureHooks);
      if (this.interceptRequests) {
        await page.setRequestInterception(true);
        this.attachNetworkMonitoring(page);
      }
      this.attachErrorMonitoring(page);
      return page;
    } catch (error) {
      await this.closePage(page);
      throw error;
    }
  }

  /**
   * Records in-scope requests/responses into `networkRequests` and flags API
   * endpoints. Every intercepted request is continued unmodified.
   * @param {object} page - Puppeteer page with request interception enabled.
   */
  attachNetworkMonitoring(page) {
    page.on('request', (request) => {
      try {
        const url = request.url();
        if (this.isRelevantEndpoint(url)) {
          this.networkRequests.set(url, {
            method: request.method(),
            headers: request.headers(),
            postData: request.postData(),
            timestamp: Date.now(),
          });
          if (this.isApiEndpoint(url)) {
            this.apiEndpoints.add(url);
          }
        }
      } catch (error) {
        debug(`Error recording request: ${error.message}`);
      }
      // continueRequest never rejects, so the promise can be left un-awaited.
      void this.continueRequest(request);
    });

    page.on('response', (response) => {
      const url = response.url();
      const requestData = this.networkRequests.get(url);
      if (requestData) {
        requestData.status = response.status();
        requestData.responseHeaders = response.headers();
      }
    });
  }

  /**
   * Continues an intercepted request, logging (instead of leaking as an
   * unhandled rejection) failures such as "request is already handled" or a
   * page that closed mid-flight.
   * @param {object} request - Puppeteer HTTPRequest.
   */
  async continueRequest(request) {
    try {
      if (request.isInterceptResolutionHandled?.()) {
        return;
      }
      await request.continue();
    } catch (error) {
      debug(`Error continuing request: ${error.message}`);
    }
  }

  /**
   * Collects console errors and uncaught page errors into `jsErrors`, keyed by
   * the page URL at the time of the error.
   * @param {object} page - Puppeteer page.
   */
  attachErrorMonitoring(page) {
    const record = (message) => {
      const url = page.url();
      if (!this.jsErrors.has(url)) {
        this.jsErrors.set(url, []);
      }
      this.jsErrors.get(url).push(message);
    };
    page.on('console', (msg) => {
      if (msg.type() === 'error') {
        record(msg.text());
      }
    });
    page.on('pageerror', (error) => {
      record(error.message);
    });
  }

  /**
   * Closes a tab and forgets it; close errors are logged, not thrown.
   * @param {object} page - Puppeteer page.
   */
  async closePage(page) {
    try {
      this.activeTabs.delete(page);
      await page.close();
    } catch (error) {
      debug(`Error closing page: ${error.message}`);
    }
  }

  /**
   * Closes all tracked tabs and the browser. Errors are logged; the browser
   * reference is cleared either way so a later call can launch a fresh one.
   */
  async closeBrowser() {
    if (!this.browser) {
      return;
    }
    const browser = this.browser;
    try {
      // Snapshot first: closePage removes entries from the set.
      for (const page of [...this.activeTabs]) {
        await this.closePage(page);
      }
      await browser.close();
    } catch (error) {
      debug(`Error closing browser: ${error.message}`);
    } finally {
      this.browser = null;
    }
  }

  /**
   * Runs the crawl: breadth-first page visits in batches of three, followed by
   * the endpoint-probing and interactive discovery phases. The browser is
   * always closed before returning.
   * @returns {Promise<object>} `{ urls, failed, pageData, forms, ajaxEndpoints,
   *   apiEndpoints, websocketEndpoints, networkRequests, jsErrors }`.
   */
  async crawl() {
    debug(`Starting comprehensive Puppeteer crawl: ${this.baseUrl}`);
    debug(`Configuration: depth=${this.depth}, maxPages=${this.maxPages}, timeout=${this.timeout}`);
    try {
      const batchSize = 3;
      let currentDepth = 0;
      while (this.queue.length > 0 && this.visited.size < this.maxPages && currentDepth < this.depth) {
        const currentLevelUrls = this.queue;
        this.queue = [];
        debug(`Processing depth ${currentDepth + 1}, ${currentLevelUrls.length} URLs`);
        // Process URLs in batches to manage resources
        for (let i = 0; i < currentLevelUrls.length; i += batchSize) {
          const batch = currentLevelUrls.slice(i, i + batchSize);
          // allSettled never rejects; crawlPage reports its own failures.
          await Promise.allSettled(batch.map((url) => this.crawlPage(url)));
          // Small delay between batches to avoid overwhelming the target
          await this.sleep(500);
        }
        currentDepth++;
      }

      // Additional discovery phases. URLs they find are reported in `urls`;
      // they are queued but not visited, because the crawl loop has finished.
      await this.discoverHiddenEndpoints();
      await this.performInteractiveDiscovery();

      debug(`Puppeteer crawl complete. Found ${this.foundUrls.size} URLs, ${this.failedUrls.size} failed`);
      return {
        urls: [...this.foundUrls],
        failed: [...this.failedUrls],
        pageData: Object.fromEntries(this.pageData),
        forms: Object.fromEntries(this.formData),
        ajaxEndpoints: [...this.ajaxEndpoints],
        apiEndpoints: [...this.apiEndpoints],
        websocketEndpoints: [...this.websocketEndpoints],
        networkRequests: Object.fromEntries(this.networkRequests),
        jsErrors: Object.fromEntries(this.jsErrors),
      };
    } finally {
      await this.closeBrowser();
    }
  }

  /**
   * Visits one URL, extracts its data and optionally takes a screenshot.
   * Failed loads are retried up to `maxRetries` times (2 s apart) in a fresh
   * tab; URLs that still fail, or answer with HTTP >= 400, go to `failedUrls`.
   * @param {string} url - Absolute URL to visit.
   * @param {number} [retryCount=0] - Attempts already made; the visited/limit
   *   guard only applies when this is 0 so that retries are not skipped.
   */
  async crawlPage(url, retryCount = 0) {
    if (retryCount === 0) {
      // The cap counts pages actually visited, matching the loop in crawl().
      if (this.visited.has(url) || this.visited.size >= this.maxPages) {
        return;
      }
      this.visited.add(url);
    }

    let attempt = retryCount;
    for (;;) {
      debug(`Crawling: ${url} (attempt ${attempt + 1})`);
      let page;
      try {
        page = await this.createPage();
        // Navigate to the page with comprehensive waiting
        const response = await page.goto(url, {
          waitUntil: ['networkidle0', 'domcontentloaded'],
          timeout: this.timeout,
        });
        if (!response) {
          throw new Error('No response received');
        }
        const status = response.status();
        if (status >= 400) {
          debug(`HTTP ${status} at ${url}`);
          this.failedUrls.add(url);
          return;
        }
        // Wait for JavaScript execution and dynamic content
        await this.sleep(this.waitForJs);
        await this.extractPageData(page, url);
        if (this.screenshot && this.foundUrls.size < 20) {
          try {
            await page.screenshot({
              path: `screenshots/${this.sanitizeFilename(url)}.png`,
              fullPage: true,
            });
          } catch (screenshotError) {
            debug(`Screenshot failed for ${url}: ${screenshotError.message}`);
          }
        }
        return;
      } catch (error) {
        if (attempt >= this.maxRetries) {
          debug(`Failed to crawl ${url}: ${error.message}`);
          this.failedUrls.add(url);
          return;
        }
        debug(`Retrying ${url} (attempt ${attempt + 2})`);
      } finally {
        if (page) {
          await this.closePage(page);
        }
      }
      attempt += 1;
      await this.sleep(2000);
    }
  }

  /**
   * Adds a URL to `foundUrls` and the crawl queue unless it is already known.
   * @param {string} url - Normalised absolute URL.
   * @returns {boolean} True when the URL was newly queued.
   */
  enqueueUrl(url) {
    if (this.visited.has(url) || this.foundUrls.has(url)) {
      return false;
    }
    this.foundUrls.add(url);
    this.queue.push(url);
    return true;
  }

  /**
   * Extracts links, forms, buttons, iframes, scripts, meta tags, storage and
   * cookies from the loaded page, stores them under `url`, queues discovered
   * in-scope URLs and then harvests AJAX and WebSocket endpoints.
   * Errors are logged and do not propagate.
   * @param {object} page - Loaded Puppeteer page.
   * @param {string} url - URL the page was requested with (storage key).
   */
  async extractPageData(page, url) {
    try {
      // Everything inside this callback runs in the browser context.
      const pageInfo = await page.evaluate(() => {
        // Storage access can throw (e.g. blocked by policy); keep what was read.
        const readStorage = (getStorage) => {
          const entries = {};
          try {
            const storage = getStorage();
            for (let i = 0; i < storage.length; i++) {
              const key = storage.key(i);
              entries[key] = storage.getItem(key);
            }
          } catch (e) {
            // best effort: inaccessible storage yields a partial/empty object
          }
          return entries;
        };

        const describeControl = (input) => {
          const inputData = {
            name: input.name || '',
            type: input.type || 'text',
            value: input.value || '',
            placeholder: input.placeholder || '',
            required: input.required || false,
            disabled: input.disabled || false,
            readonly: input.readOnly || false,
            id: input.id || '',
            className: input.className || '',
          };
          if (input.tagName === 'SELECT') {
            inputData.options = [...input.querySelectorAll('option')].map((option) => ({
              value: option.value,
              text: option.textContent,
              selected: option.selected,
            }));
          }
          return inputData;
        };

        const data = {
          title: document.title,
          url: window.location.href,
          links: [],
          forms: [],
          inputs: [],
          buttons: [],
          iframes: [],
          scripts: [],
          websockets: [],
          eventListeners: [],
          localStorage: {},
          sessionStorage: {},
          cookies: document.cookie,
          meta: [],
        };

        // Extract all links
        document.querySelectorAll('a[href], area[href]').forEach((link) => {
          data.links.push({
            href: link.href,
            text: link.textContent?.trim() || '',
            title: link.title || '',
            target: link.target || '',
            rel: link.rel || '',
          });
        });

        // Extract forms with detailed information
        document.querySelectorAll('form').forEach((form, index) => {
          data.forms.push({
            id: form.id || `form_${index}`,
            action: form.action || window.location.href,
            method: form.method?.toUpperCase() || 'GET',
            enctype: form.enctype || 'application/x-www-form-urlencoded',
            target: form.target || '',
            inputs: [...form.querySelectorAll('input, textarea, select, button')].map(describeControl),
          });
        });

        // Extract all clickable elements
        document
          .querySelectorAll('button, input[type="button"], input[type="submit"], [onclick]')
          .forEach((button) => {
            data.buttons.push({
              text: button.textContent?.trim() || button.value || '',
              type: button.type || '',
              onclick: button.getAttribute('onclick') || '',
              id: button.id || '',
              className: button.className || '',
            });
          });

        // Extract iframes
        document.querySelectorAll('iframe, frame').forEach((iframe) => {
          data.iframes.push({
            src: iframe.src || '',
            name: iframe.name || '',
            id: iframe.id || '',
          });
        });

        // Extract script sources
        document.querySelectorAll('script[src]').forEach((script) => {
          data.scripts.push(script.src);
        });

        // Extract meta information
        document.querySelectorAll('meta').forEach((meta) => {
          data.meta.push({
            name: meta.name || meta.getAttribute('property') || '',
            content: meta.content || '',
            httpEquiv: meta.httpEquiv || '',
          });
        });

        // Extract storage data (if accessible)
        data.localStorage = readStorage(() => localStorage);
        data.sessionStorage = readStorage(() => sessionStorage);
        return data;
      });

      // Store page data
      this.pageData.set(url, pageInfo);
      this.formData.set(url, pageInfo.forms);

      // Queue discovered links and form actions that are in scope
      const candidates = [
        ...pageInfo.links.map((link) => link.href),
        ...pageInfo.forms.map((form) => form.action),
      ];
      for (const candidate of candidates) {
        const processedUrl = this.processFoundUrl(candidate, url);
        if (processedUrl) {
          this.enqueueUrl(processedUrl);
        }
      }

      await this.extractAjaxEndpoints(page, url);
      await this.extractWebSocketEndpoints(page, url);
    } catch (error) {
      debug(`Error extracting page data from ${url}: ${error.message}`);
    }
  }

  /**
   * Interacts with the page to provoke XHR/fetch traffic, then records the
   * in-scope requests captured by the in-page hooks in `ajaxEndpoints` and
   * queues them. Errors are logged and do not propagate.
   * @param {object} page - Loaded Puppeteer page.
   * @param {string} url - Base URL used to resolve relative request URLs.
   */
  async extractAjaxEndpoints(page, url) {
    try {
      // Pages opened through createPage() already carry the hooks; this covers
      // pages obtained elsewhere. The installer is a no-op when already present.
      await page.evaluate(installCaptureHooks);
      // Trigger common AJAX patterns by interacting with the page
      await this.triggerAjaxCalls(page);
      const capturedRequests = await page.evaluate(() => window.capturedRequests || []);
      for (const request of capturedRequests) {
        const processedUrl = this.processFoundUrl(request.url, url);
        if (processedUrl) {
          this.ajaxEndpoints.add(processedUrl);
          this.enqueueUrl(processedUrl);
        }
      }
    } catch (error) {
      debug(`Error extracting AJAX endpoints from ${url}: ${error.message}`);
    }
  }

  /**
   * Clicks up to five likely AJAX triggers and fills/submits up to three
   * forms with placeholder data. Individual failures are logged and skipped.
   * @param {object} page - Loaded Puppeteer page.
   */
  async triggerAjaxCalls(page) {
    try {
      // page.$$ returns every match (page.$ would return a single handle).
      const clickableElements = await page.$$('button, [onclick], .ajax, [data-ajax], [data-url]');
      for (const element of clickableElements.slice(0, 5)) {
        try {
          // Give each click at most one second so a hanging click cannot stall the crawl.
          await Promise.race([element.click(), this.sleep(1000)]);
          await this.sleep(500);
        } catch (error) {
          debug(`Click failed: ${error.message}`);
        }
      }

      // Trigger form submissions
      const forms = await page.$$('form');
      for (const form of forms.slice(0, 3)) {
        try {
          await page.evaluate((formElement) => {
            // Fill form with test data
            formElement
              .querySelectorAll('input[type="text"], input[type="email"], textarea')
              .forEach((input) => {
                input.value = input.type === 'email' ? 'test@example.com' : 'test';
              });
          }, form);
          const submitButton = await form.$('input[type="submit"], button[type="submit"], button');
          if (submitButton) {
            await Promise.race([submitButton.click(), this.sleep(1000)]);
            await this.sleep(1000);
          }
        } catch (error) {
          debug(`Form submission failed: ${error.message}`);
        }
      }
    } catch (error) {
      debug(`Error triggering AJAX calls: ${error.message}`);
    }
  }

  /**
   * Waits two seconds for socket connections, then records the in-scope
   * WebSocket URLs captured by the in-page hooks in `websocketEndpoints`.
   * Errors are logged and do not propagate.
   * @param {object} page - Loaded Puppeteer page.
   * @param {string} url - Base URL used to resolve relative socket URLs.
   */
  async extractWebSocketEndpoints(page, url) {
    try {
      await page.evaluate(installCaptureHooks);
      // Wait for any WebSocket connections to be established
      await this.sleep(2000);
      const webSockets = await page.evaluate(() => window.capturedWebSockets || []);
      for (const ws of webSockets) {
        const processedUrl = this.processFoundUrl(ws.url, url);
        if (processedUrl) {
          this.websocketEndpoints.add(processedUrl);
        }
      }
    } catch (error) {
      debug(`Error extracting WebSocket endpoints from ${url}: ${error.message}`);
    }
  }

  /**
   * Adds a fixed list of commonly used API/admin paths (resolved against the
   * base URL) to the found set and the queue, until `maxPages` is reached.
   * @param {object} [page] - Unused; kept for signature compatibility.
   */
  async discoverHiddenEndpoints(page) {
    debug('Discovering hidden endpoints through JavaScript analysis');
    const commonEndpoints = [
      '/api/v1/', '/api/v2/', '/rest/', '/graphql', '/swagger',
      '/admin/api/', '/backend/', '/internal/', '/private/',
      '/debug/', '/test/', '/dev/', '/.well-known/',
    ];
    for (const endpoint of commonEndpoints) {
      if (this.foundUrls.size >= this.maxPages) {
        break;
      }
      this.enqueueUrl(new URL(endpoint, this.baseUrl).toString());
    }
  }

  /**
   * Re-opens up to five found pages whose path suggests dynamic content
   * (admin, dashboard, panel, manage, login or "/") and runs interaction,
   * SPA-route discovery and JavaScript variable extraction on each.
   */
  async performInteractiveDiscovery() {
    debug('Performing interactive discovery on key pages');
    const keywords = ['admin', 'dashboard', 'panel', 'manage', 'login'];
    const interactivePages = [...this.foundUrls]
      .filter((url) => {
        const path = new URL(url).pathname.toLowerCase();
        return path === '/' || keywords.some((keyword) => path.includes(keyword));
      })
      .slice(0, 5);

    for (const url of interactivePages) {
      if (this.foundUrls.size >= this.maxPages) {
        break;
      }
      let page;
      try {
        page = await this.createPage();
        await page.goto(url, { waitUntil: 'networkidle0', timeout: this.timeout });
        await this.performPageInteraction(page);
        await this.discoverSpaRoutes(page);
        await this.extractJsVariables(page);
      } catch (error) {
        debug(`Error in interactive discovery for ${url}: ${error.message}`);
      } finally {
        if (page) {
          await this.closePage(page);
        }
      }
    }
  }

  /**
   * Interaction step of interactive discovery: clicks/submits page controls
   * and records the AJAX endpoints this provokes.
   * @param {object} page - Loaded Puppeteer page.
   */
  async performPageInteraction(page) {
    await this.extractAjaxEndpoints(page, page.url());
  }

  /**
   * Looks for client-side routes (Angular, React Router, Vue Router and
   * generic path/route literals in inline scripts) and queues the in-scope
   * ones. Errors are logged and do not propagate.
   * @param {object} page - Loaded Puppeteer page.
   */
  async discoverSpaRoutes(page) {
    try {
      const routes = await page.evaluate(() => {
        const discoveredRoutes = new Set();
        const scriptTexts = [...document.querySelectorAll('script')].map(
          (script) => script.textContent || script.innerHTML || ''
        );
        // Capture group 1 when the pattern has one, else the match minus its quotes.
        const extractAll = (text, pattern) =>
          [...text.matchAll(pattern)].map((m) => m[1] ?? m[0].slice(1, -1));

        // Angular routes
        if (window.ng && window.ng.getComponent) {
          try {
            const router = window.ng.getComponent(document.body)?.router;
            if (router && router.config) {
              router.config.forEach((route) => {
                if (route.path) discoveredRoutes.add(route.path);
              });
            }
          } catch (e) {
            // framework internals not reachable; skip
          }
        }

        // React Router routes: look for route definitions in script tags
        if (window.React && window.ReactRouter) {
          scriptTexts.forEach((content) => {
            extractAll(content, /path\s*:\s*["']([^"']+)["']/g).forEach((route) => {
              discoveredRoutes.add(route);
            });
          });
        }

        // Vue Router routes
        if (window.Vue && window.VueRouter) {
          try {
            if (window.$router && window.$router.options.routes) {
              window.$router.options.routes.forEach((route) => {
                if (route.path) discoveredRoutes.add(route.path);
              });
            }
          } catch (e) {
            // router not exposed as expected; skip
          }
        }

        // Generic routing patterns in inline JavaScript
        const patterns = [
          /["']\/[a-zA-Z0-9\-_\/]+["']/g,
          /route\s*:\s*["']([^"']+)["']/g,
          /path\s*:\s*["']([^"']+)["']/g,
        ];
        scriptTexts.forEach((content) => {
          patterns.forEach((pattern) => {
            extractAll(content, pattern).forEach((route) => {
              if (route.startsWith('/') && route.length > 1 && route.length < 100) {
                discoveredRoutes.add(route);
              }
            });
          });
        });

        return Array.from(discoveredRoutes);
      });

      // Add discovered in-scope routes to the crawl queue
      for (const route of routes) {
        if (this.foundUrls.size >= this.maxPages) {
          break;
        }
        const fullUrl = this.processFoundUrl(route, page.url());
        if (fullUrl) {
          this.enqueueUrl(fullUrl);
        }
      }
      debug(`Discovered ${routes.length} SPA routes`);
    } catch (error) {
      debug(`Error discovering SPA routes: ${error.message}`);
    }
  }

  /**
   * Scans globals and inline scripts for API endpoint literals and for
   * key/secret/token/password assignments. In-scope endpoints are added to
   * `foundUrls` and `apiEndpoints`; matched secret snippets are stored on the
   * page's `pageData` entry as `extractedSecrets`. Errors are logged.
   * @param {object} page - Loaded Puppeteer page.
   */
  async extractJsVariables(page) {
    try {
      const jsData = await page.evaluate(() => {
        const data = {
          globalVars: {},
          endpoints: [],
          apiKeys: [],
          secrets: [],
        };

        // Extract global string variables that might contain endpoints
        const urlHints = ['/api/', '/rest/', 'http://', 'https://'];
        Object.keys(window).forEach((key) => {
          try {
            const value = window[key];
            if (typeof value === 'string' && urlHints.some((hint) => value.includes(hint))) {
              data.globalVars[key] = value;
            }
          } catch (e) {
            // some window properties throw on access; skip them
          }
        });

        // Each pattern captures the endpoint itself in group 1, so absolute
        // URLs (including ones with a port) are kept intact.
        const endpointPatterns = [
          /["'](https?:\/\/[^"']+\/api\/[^"']+)["']/g,
          /["'](\/api\/[^"']+)["']/g,
          /["'](\/rest\/[^"']+)["']/g,
          /baseURL\s*:\s*["']([^"']+)["']/g,
          /apiUrl\s*:\s*["']([^"']+)["']/g,
        ];
        // Potential API keys or secrets
        const secretPatterns = [
          /api[_-]?key\s*[:=]\s*["']([^"']{10,})["']/gi,
          /secret\s*[:=]\s*["']([^"']{10,})["']/gi,
          /token\s*[:=]\s*["']([^"']{10,})["']/gi,
          /password\s*[:=]\s*["']([^"']{5,})["']/gi,
        ];

        document.querySelectorAll('script').forEach((script) => {
          const content = script.textContent || script.innerHTML || '';
          endpointPatterns.forEach((pattern) => {
            for (const match of content.matchAll(pattern)) {
              const endpoint = match[1];
              if (endpoint.includes('/') && endpoint.length > 1) {
                data.endpoints.push(endpoint);
              }
            }
          });
          secretPatterns.forEach((pattern) => {
            for (const match of content.matchAll(pattern)) {
              data.secrets.push(match[0]);
            }
          });
        });

        return data;
      });

      // Record discovered endpoints that are in scope
      for (const endpoint of jsData.endpoints) {
        const fullUrl = this.processFoundUrl(endpoint, page.url());
        if (fullUrl && !this.visited.has(fullUrl) && this.foundUrls.size < this.maxPages) {
          this.foundUrls.add(fullUrl);
          this.apiEndpoints.add(fullUrl);
        }
      }

      // Store extracted secrets for security analysis
      if (jsData.secrets.length > 0) {
        const url = page.url();
        if (!this.pageData.has(url)) {
          this.pageData.set(url, {});
        }
        this.pageData.get(url).extractedSecrets = jsData.secrets;
      }
    } catch (error) {
      debug(`Error extracting JS variables: ${error.message}`);
    }
  }

  /**
   * Detects front-end frameworks, libraries, CMS, analytics and security
   * markers from well-known globals and meta tags, and stores the result on
   * the page's `pageData` entry as `technologies`.
   * @param {object} page - Loaded Puppeteer page.
   * @returns {Promise<object>} Detected technologies by category, or `{}` on error.
   */
  async detectTechnologies(page) {
    try {
      const technologies = await page.evaluate(() => {
        const detected = {
          frameworks: [],
          libraries: [],
          cms: [],
          analytics: [],
          security: [],
        };

        // Frontend frameworks
        if (window.React) detected.frameworks.push('React');
        if (window.Vue) detected.frameworks.push('Vue.js');
        if (window.angular) detected.frameworks.push('AngularJS');
        if (window.ng) detected.frameworks.push('Angular');
        if (window.Backbone) detected.frameworks.push('Backbone.js');
        if (window.Ember) detected.frameworks.push('Ember.js');
        if (window.Svelte) detected.frameworks.push('Svelte');

        // Libraries
        if (window.jQuery || window.$) detected.libraries.push('jQuery');
        if (window.Lodash || window._) detected.libraries.push('Lodash');
        if (window.moment) detected.libraries.push('Moment.js');
        if (window.axios) detected.libraries.push('Axios');
        if (window.bootstrap) detected.libraries.push('Bootstrap');

        // CMS detection via the generator meta tag
        const metaGenerator = document.querySelector('meta[name="generator"]');
        if (metaGenerator) {
          const content = metaGenerator.content.toLowerCase();
          if (content.includes('wordpress')) detected.cms.push('WordPress');
          if (content.includes('drupal')) detected.cms.push('Drupal');
          if (content.includes('joomla')) detected.cms.push('Joomla');
          if (content.includes('magento')) detected.cms.push('Magento');
          if (content.includes('shopify')) detected.cms.push('Shopify');
        }

        // Analytics
        if (window.ga || window.gtag) detected.analytics.push('Google Analytics');
        if (window.fbq) detected.analytics.push('Facebook Pixel');
        if (window.mixpanel) detected.analytics.push('Mixpanel');
        if (window.amplitude) detected.analytics.push('Amplitude');

        // Security
        if (document.querySelector('meta[http-equiv="Content-Security-Policy"]')) {
          detected.security.push('Content Security Policy');
        }
        if (document.querySelector('meta[name="csrf-token"]')) {
          detected.security.push('CSRF Token');
        }
        return detected;
      });

      // Store technology information
      const url = page.url();
      if (!this.pageData.has(url)) {
        this.pageData.set(url, {});
      }
      this.pageData.get(url).technologies = technologies;
      return technologies;
    } catch (error) {
      debug(`Error detecting technologies: ${error.message}`);
      return {};
    }
  }

  /**
   * Tells whether a hostname belongs to the crawl scope: the base host
   * itself, or - with `includeSubdomains` - a true subdomain of it. The dot
   * boundary keeps look-alike hosts such as "evilexample.com" out of scope
   * for "example.com".
   * @param {string} hostname
   * @returns {boolean}
   */
  isInScopeHost(hostname) {
    const baseHost = this.baseUrlObj.hostname;
    if (hostname === baseHost) {
      return true;
    }
    return Boolean(this.includeSubdomains) && hostname.endsWith(`.${baseHost}`);
  }

  /**
   * Resolves and normalises a discovered URL.
   * @param {string} url - Raw URL or path found on a page.
   * @param {string} baseUrl - URL of the page it was found on.
   * @returns {string|null} Absolute URL without fragment, or null when the
   *   value is empty, unparsable, uses a non-crawlable scheme, is out of
   *   scope, or is longer than 2000 characters.
   */
  processFoundUrl(url, baseUrl) {
    try {
      // Normalize: trim and lower-case for scheme checks
      const normalized = typeof url === 'string' ? url.trim().toLowerCase() : '';
      if (!url || SKIPPED_URL_PREFIXES.some((prefix) => normalized.startsWith(prefix))) {
        return null;
      }
      const fullUrl = new URL(url, baseUrl);
      if (!ALLOWED_URL_PROTOCOLS.includes(fullUrl.protocol)) {
        return null;
      }
      if (!this.isInScopeHost(fullUrl.hostname)) {
        return null;
      }
      fullUrl.hash = '';
      const normalizedUrl = fullUrl.toString();
      return normalizedUrl.length > 2000 ? null : normalizedUrl;
    } catch (error) {
      return null;
    }
  }

  /**
   * @param {string} url - Absolute request URL.
   * @returns {boolean} True when the URL parses and its host is in scope.
   */
  isRelevantEndpoint(url) {
    try {
      return this.isInScopeHost(new URL(url).hostname);
    } catch (e) {
      return false;
    }
  }

  /**
   * @param {string} url
   * @returns {boolean} True when the URL contains a typical API marker.
   */
  isApiEndpoint(url) {
    const apiPatterns = [
      '/api/', '/rest/', '/graphql', '/v1/', '/v2/', '/v3/',
      '.json', '.xml', '/ajax/', '/rpc/', '/soap/',
    ];
    return apiPatterns.some((pattern) => url.includes(pattern));
  }

  /**
   * @param {string} url
   * @returns {string} Lower-cased name with non-alphanumerics replaced by "_",
   *   truncated to 50 characters.
   */
  sanitizeFilename(url) {
    return url.replace(/[^a-z0-9]/gi, '_').toLowerCase().substring(0, 50);
  }

  /**
   * @param {number} ms - Delay in milliseconds.
   * @returns {Promise<void>} Resolves after the delay.
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}
/**
 * Standalone crawl function kept for backward compatibility.
 * Builds a PuppeteerCrawler for `url` (defaults: 20 pages, 15 s timeout),
 * runs a full crawl and resolves with the list of discovered URLs only.
 * @param {string} url - Start URL, passed to the crawler as `baseUrl`.
 * @param {object} [options] - Optional maxPages, timeout, userAgent, headers, cookies.
 * @returns {Promise<string[]>} The `urls` array of the crawl result.
 */
async function crawlPage(url, options = {}) {
  const { maxPages, timeout, userAgent, headers, cookies } = options;
  const crawlerOptions = {
    baseUrl: url,
    maxPages: maxPages || 20,
    timeout: timeout || 15000,
    userAgent,
    headers,
    cookies,
  };
  const { urls } = await new PuppeteerCrawler(crawlerOptions).crawl();
  return urls;
}
// Public API: the crawler class and the standalone crawlPage helper.
module.exports = { PuppeteerCrawler, crawlPage };