vexify

Portable vector database with in-process ONNX embeddings. Zero-config semantic search via SQLite. No external servers required.
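
The file shown on this page is the package's web crawler module (a WebCrawler class); vexify's own search API does not appear here. As an illustrative sketch only of what zero-config usage might look like: the VexifyDB name, constructor argument, and method signatures below are assumptions for illustration, not vexify's documented API.

// HYPOTHETICAL sketch -- the export name, options, and method signatures
// below are assumptions for illustration, not vexify's documented API.
const { VexifyDB } = require('vexify'); // assumed export

async function demo() {
  const db = new VexifyDB('knowledge.db'); // assumed: path to the SQLite file
  // Embeddings would be computed in-process via ONNX; no server to stand up.
  await db.add('doc-1', 'ONNX runs the embedding model in-process.');
  await db.add('doc-2', 'SQLite stores the vectors; no external server.');
  const hits = await db.search('where do embeddings run?', 2); // assumed signature
  console.log(hits);
}

demo().catch(console.error);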

'use strict';

const fs = require('fs');
const path = require('path');
const { URL } = require('url');

// Lazy-load playwright-extra to avoid blocking on module import
let chromium = null;
function getChromium() {
  if (!chromium) {
    const { chromium: pw } = require('playwright-extra');
    const stealth = require('puppeteer-extra-plugin-stealth')();
    pw.use(stealth);
    chromium = pw;
  }
  return chromium;
}

// Same-host BFS crawler built on stealth-patched headless Chromium.
// Saves crawled pages and downloaded files under an output directory and
// can checkpoint its progress to a state file for crash recovery.
class WebCrawler {
  constructor(options = {}) {
    this.headless = options.headless !== false;
    this.userAgent = options.userAgent;
    this.timeout = options.timeout || 30000;
    this.maxPages = options.maxPages || 100;
    this.maxDepth = options.maxDepth || 3;
    this.concurrency = options.concurrency || 5;
    this.visited = new Set();
    this.queue = [];
    this.stateFile = options.stateFile || null;
    this.supportedExtensions = options.supportedExtensions || [
      '.html', '.htm', '.pdf', '.docx', '.doc', '.txt', '.text',
      '.csv', '.json', '.jsonl', '.xlsx', '.xls'
    ];
    this.maxRetries = options.maxRetries || 2;
    this.retryDelay = options.retryDelay || 1000;
  }

  // Treat "www.example.com" and "example.com" as the same host.
  normalizeHostname(hostname) {
    return hostname.replace(/^www\./, '');
  }

  // Restore the visited set and URL queue from a previous run, if a state file exists.
  loadState() {
    if (!this.stateFile || !fs.existsSync(this.stateFile)) {
      return null;
    }
    try {
      const state = JSON.parse(fs.readFileSync(this.stateFile, 'utf-8'));
      this.visited = new Set(state.visited);
      this.queue = state.queue;
      return state;
    } catch (e) {
      console.error(`  ⚠ Could not load state from ${this.stateFile}: ${e.message}`);
      return null;
    }
  }

  // Persist crawl progress so an interrupted run can resume.
  saveState() {
    if (!this.stateFile) return;
    const state = {
      visited: Array.from(this.visited),
      queue: this.queue,
      timestamp: new Date().toISOString(),
      progress: { visited: this.visited.size, queued: this.queue.length }
    };
    fs.writeFileSync(this.stateFile, JSON.stringify(state, null, 2), 'utf-8');
  }

  // Public entry point: wraps the crawl loop in crash recovery with
  // exponential backoff between restarts.
  async crawlSite(startUrl, outputDir, vecStore = null, onPageCallback = null) {
    const maxCrashRetries = 100;
    let crashCount = 0;
    let backoffDelay = 5000;
    const maxBackoff = 60000;
    while (crashCount < maxCrashRetries) {
      try {
        return await this._crawlSiteInternal(startUrl, outputDir, vecStore, onPageCallback);
      } catch (error) {
        crashCount++;
        const timestamp = new Date().toISOString();
        console.error(`\n  ⚠ Crawler crashed at ${timestamp}`);
        console.error(`  Error: ${error.message}`);
        console.error(`  Crash count: ${crashCount}/${maxCrashRetries}`);
        if (this.visited.size >= this.maxPages || this.queue.length === 0) {
          console.error(`  Crawl complete (visited: ${this.visited.size}, queued: ${this.queue.length})`);
          break;
        }
        console.error(`  Automatically restarting in ${backoffDelay / 1000}s...\n`);
        await new Promise(resolve => setTimeout(resolve, backoffDelay));
        backoffDelay = Math.min(backoffDelay * 2, maxBackoff);
      }
    }
    if (crashCount >= maxCrashRetries) {
      console.error(`  ⚠ Maximum crash retries (${maxCrashRetries}) reached. Stopping.`);
    }
    return { pages: [], files: [], errors: [] };
  }

  // Core crawl loop: batched, same-host BFS bounded by maxDepth and maxPages.
  // (vecStore is not used directly here; per-item work goes through onPageCallback.)
  async _crawlSiteInternal(startUrl, outputDir, vecStore = null, onPageCallback = null) {
    if (!fs.existsSync(outputDir)) {
      fs.mkdirSync(outputDir, { recursive: true });
    }
    const baseUrl = new URL(startUrl);
    const baseHostname = this.normalizeHostname(baseUrl.hostname);
    const resumeState = this.loadState();
    if (resumeState) {
      console.error(`  Resuming from previous crawl state...`);
      console.error(`  Progress: ${resumeState.progress.visited} visited, ${resumeState.progress.queued} queued\n`);
    } else {
      this.queue.push({ url: startUrl, depth: 0 });
    }

    const launchBrowser = async () => {
      const chromium = getChromium();
      const browser = await chromium.launch({
        headless: this.headless,
        args: [
          '--disable-blink-features=AutomationControlled',
          '--disable-dev-shm-usage',
          '--no-sandbox'
        ]
      });
      const context = await browser.newContext({
        userAgent: this.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        viewport: { width: 1920, height: 1080 },
        locale: 'en-US',
        timezoneId: 'America/New_York'
      });
      return { browser, context };
    };

    let { browser, context } = await launchBrowser();
    const results = { pages: [], files: [], errors: [] };

    while (this.queue.length > 0 && this.visited.size < this.maxPages) {
      // Pull up to `concurrency` unvisited URLs into a batch.
      const batch = [];
      while (batch.length < this.concurrency &&
             this.queue.length > 0 &&
             this.visited.size + batch.length < this.maxPages) {
        const { url, depth } = this.queue.shift();
        if (this.visited.has(url) || depth > this.maxDepth) {
          continue;
        }
        this.visited.add(url);
        batch.push({ url, depth });
      }
      if (batch.length === 0) {
        break;
      }

      const batchResults = await Promise.all(batch.map(async ({ url, depth }) => {
        try {
          const urlObj = new URL(url);
          if (this.normalizeHostname(urlObj.hostname) !== baseHostname) {
            return null;
          }
          // Non-HTML assets are downloaded directly instead of rendered.
          const ext = path.extname(urlObj.pathname).toLowerCase();
          if (this.supportedExtensions.includes(ext) && ext !== '.html' && ext !== '.htm') {
            const fileName = this.getFileName(url, outputDir);
            await this.downloadFile(url, fileName);
            console.error(`  Downloaded: ${url}`);
            return { type: 'file', url, path: fileName };
          }

          for (let retry = 0; retry <= this.maxRetries; retry++) {
            let page = null;
            try {
              page = await context.newPage();
              await page.goto(url, { waitUntil: 'domcontentloaded', timeout: this.timeout });
              // Wait for any Cloudflare interstitial to clear before reading content.
              try {
                await page.waitForFunction(() => {
                  const bodyText = document.body.innerText;
                  return !bodyText.includes('Just a moment') &&
                         !bodyText.includes('Verifying you are human') &&
                         document.body.innerHTML.length > 10000;
                }, { timeout: 60000 });
              } catch (e) {
                console.error(`  ⚠ Cloudflare challenge timeout for ${url}`);
              }
              const html = await page.content();
              if (html.includes('Just a moment') || html.includes('Verifying you are human')) {
                console.error(`  ✗ Skipping ${url} - still showing Cloudflare challenge`);
                await page.close();
                return null;
              }
              const fileName = this.getFileName(url, outputDir, '.html');
              fs.writeFileSync(fileName, html, 'utf-8');
              console.error(`  Crawled: ${url}`);
              let links = [];
              if (depth < this.maxDepth) {
                links = await page.$$eval('a[href]', anchors => anchors.map(a => a.href));
              }
              await page.close();
              return { type: 'page', url, path: fileName, links, depth };
            } catch (error) {
              if (page) {
                try { await page.close(); } catch (e) {}
              }
              // If the browser itself died, relaunch it and retry this URL.
              if (error.message.includes('Target page, context or browser has been closed')) {
                if (retry < this.maxRetries) {
                  console.error(`  ⚠ Browser closed, restarting... (retry ${retry + 1}/${this.maxRetries})`);
                  try { await browser.close(); } catch (e) {}
                  const result = await launchBrowser();
                  browser = result.browser;
                  context = result.context;
                  await new Promise(resolve => setTimeout(resolve, this.retryDelay));
                  continue;
                }
              }
              throw error;
            }
          }
          return null;
        } catch (error) {
          results.errors.push({ url, error: error.message });
          return null;
        }
      }));

      if (onPageCallback) {
        const validItems = batchResults.filter(r => r && (r.type === 'page' || r.type === 'file'));
        if (validItems.length > 0) {
          // Fire and forget - don't block crawler waiting for embeddings
          validItems.forEach(item => {
            onPageCallback(item).catch(err => {
              console.error(`  ⚠ Callback error for ${item.url}: ${err.message}`);
            });
          });
        }
      }

      // Record results and enqueue newly discovered same-host links.
      for (const result of batchResults) {
        if (!result) continue;
        if (result.type === 'page') {
          results.pages.push({ url: result.url, path: result.path });
          for (const link of result.links) {
            try {
              const linkUrl = new URL(link, result.url).href;
              if (!this.visited.has(linkUrl)) {
                this.queue.push({ url: linkUrl, depth: result.depth + 1 });
              }
            } catch (e) {}
          }
        } else if (result.type === 'file') {
          results.files.push({ url: result.url, path: result.path });
        } else if (result.type === 'error') {
          results.errors.push({ url: result.url, error: result.error });
        }
      }
      this.saveState();
    }

    try {
      await browser.close();
    } catch (e) {
      if (this.visited.size < this.maxPages && this.queue.length > 0) {
        throw new Error(`Browser closed unexpectedly with ${this.queue.length} pages remaining: ${e.message}`);
      }
    }
    // A completed crawl no longer needs its resume checkpoint.
    if (this.stateFile && fs.existsSync(this.stateFile)) {
      fs.unlinkSync(this.stateFile);
    }
    return results;
  }

  // Fetch a non-HTML asset to disk, rejecting HTML error pages served in its place.
  async downloadFile(url, outputPath) {
    const response = await fetch(url);
    const buffer = await response.arrayBuffer();
    const bytes = Buffer.from(buffer);
    // Check for error pages (HTML content in PDF/HTML files)
    if (outputPath.endsWith('.pdf') || outputPath.endsWith('.html') || outputPath.endsWith('.htm')) {
      const preview = bytes.toString('utf8', 0, Math.min(200, bytes.length));
      if (preview.includes('<title>404') ||
          preview.includes('Not Found') ||
          preview.includes('Error 404') ||
          preview.includes('Page not found') ||
          preview.includes('File not found') ||
          preview.includes('Access denied')) {
        throw new Error(`Error page detected: 404/Not Found`);
      }
    }
    fs.writeFileSync(outputPath, bytes);
  }

  // Map a URL to a filesystem-safe filename under outputDir.
  getFileName(url, outputDir, forceExt = null) {
    const urlObj = new URL(url);
    let pathname = urlObj.pathname;
    if (pathname === '/' || pathname === '') {
      pathname = '/index';
    }
    if (forceExt) {
      pathname = pathname.replace(/\.[^.]*$/, '') + forceExt;
    }
    const sanitized = pathname.replace(/^\//, '').replace(/[^a-zA-Z0-9._-]/g, '_');
    return path.join(outputDir, sanitized || 'index.html');
  }
}

module.exports = { WebCrawler };