vexify
Portable vector database with in-process ONNX embeddings. Zero-config semantic search via SQLite. No external servers required.
'use strict';
const fs = require('fs');
const path = require('path');
const { URL } = require('url');
// Lazy-load playwright-extra to avoid blocking on module import
let chromium = null;
function getChromium() {
if (!chromium) {
const { chromium: pw } = require('playwright-extra');
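// The stealth plugin patches common headless-detection signals
// (navigator.webdriver and friends) so bot-checked pages are more
// likely to render real content.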
const stealth = require('puppeteer-extra-plugin-stealth')();
pw.use(stealth);
chromium = pw;
}
return chromium;
}
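// Crawls a single site with stealth-patched Chromium, saving rendered HTML
// and supported file types to disk. Scope is bounded by maxPages, maxDepth,
// and a same-hostname check; progress can be checkpointed to stateFile so an
// interrupted crawl can resume.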
class WebCrawler {
constructor(options = {}) {
this.headless = options.headless !== false;
this.userAgent = options.userAgent;
this.timeout = options.timeout || 30000;
this.maxPages = options.maxPages || 100;
this.maxDepth = options.maxDepth || 3;
this.concurrency = options.concurrency || 5;
this.visited = new Set();
this.queue = [];
this.stateFile = options.stateFile || null;
this.supportedExtensions = options.supportedExtensions || [
'.html', '.htm', '.pdf', '.docx', '.doc', '.txt', '.text',
'.csv', '.json', '.jsonl', '.xlsx', '.xls'
];
this.maxRetries = options.maxRetries || 2;
this.retryDelay = options.retryDelay || 1000;
}
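// Treat "www.example.com" and "example.com" as the same host when scoping the crawl.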
normalizeHostname(hostname) {
return hostname.replace(/^www\./, '');
}
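// Restore the visited set and pending queue from a previous run's state file,
// if one was configured and exists; returns the parsed state or null.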
loadState() {
if (!this.stateFile || !fs.existsSync(this.stateFile)) {
return null;
}
try {
const state = JSON.parse(fs.readFileSync(this.stateFile, 'utf-8'));
// Guard against a truncated or hand-edited state file.
this.visited = new Set(state.visited || []);
this.queue = state.queue || [];
return state;
} catch (e) {
console.error(` ⚠ Could not load state from ${this.stateFile}: ${e.message}`);
return null;
}
}
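// Persist crawl progress so an interrupted run can resume. The state file is
// JSON of the form (shape follows the fields written below):
// { "visited": ["https://..."], "queue": [{ "url": "...", "depth": 1 }],
//   "timestamp": "...", "progress": { "visited": 1, "queued": 2 } }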
saveState() {
if (!this.stateFile) return;
const state = {
visited: Array.from(this.visited),
queue: this.queue,
timestamp: new Date().toISOString(),
progress: {
visited: this.visited.size,
queued: this.queue.length
}
};
fs.writeFileSync(this.stateFile, JSON.stringify(state, null, 2), 'utf-8');
}
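// Public entry point: runs the crawl and automatically restarts it with
// exponential backoff (5s, doubling up to 60s) if the internal loop crashes.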
async crawlSite(startUrl, outputDir, vecStore = null, onPageCallback = null) {
const maxCrashRetries = 100;
let crashCount = 0;
let backoffDelay = 5000;
const maxBackoff = 60000;
while (crashCount < maxCrashRetries) {
try {
return await this._crawlSiteInternal(startUrl, outputDir, vecStore, onPageCallback);
} catch (error) {
crashCount++;
const timestamp = new Date().toISOString();
console.error(`\n ⚠ Crawler crashed at ${timestamp}`);
console.error(` Error: ${error.message}`);
console.error(` Crash count: ${crashCount}/${maxCrashRetries}`);
if (this.visited.size >= this.maxPages || this.queue.length === 0) {
console.error(` Crawl complete (visited: ${this.visited.size}, queued: ${this.queue.length})`);
break;
}
console.error(` Automatically restarting in ${backoffDelay / 1000}s...\n`);
await new Promise(resolve => setTimeout(resolve, backoffDelay));
backoffDelay = Math.min(backoffDelay * 2, maxBackoff);
}
}
if (crashCount >= maxCrashRetries) {
console.error(` ⚠ Maximum crash retries (${maxCrashRetries}) reached. Stopping.`);
}
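// Only the on-disk output and the state file survive a crash-and-restart;
// the in-memory result arrays from failed attempts are not merged, so an
// empty result is returned here.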
return {
pages: [],
files: [],
errors: []
};
}
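// One crawl attempt: breadth-first over the queue in batches of up to
// `concurrency` pages, restarting the browser when Playwright reports the
// target closed, up to maxRetries times per page.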
async _crawlSiteInternal(startUrl, outputDir, vecStore = null, onPageCallback = null) {
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
}
const baseUrl = new URL(startUrl);
const baseHostname = this.normalizeHostname(baseUrl.hostname);
const resumeState = this.loadState();
if (resumeState) {
console.error(` Resuming from previous crawl state...`);
console.error(` Progress: ${resumeState.progress.visited} visited, ${resumeState.progress.queued} queued\n`);
} else {
this.queue.push({ url: startUrl, depth: 0 });
}
const launchBrowser = async () => {
const chromium = getChromium();
const browser = await chromium.launch({
headless: this.headless,
args: [
'--disable-blink-features=AutomationControlled',
'--disable-dev-shm-usage',
'--no-sandbox'
]
});
const context = await browser.newContext({
userAgent: this.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
viewport: { width: 1920, height: 1080 },
locale: 'en-US',
timezoneId: 'America/New_York'
});
return { browser, context };
};
let { browser, context } = await launchBrowser();
const results = {
pages: [],
files: [],
errors: []
};
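// Main loop: drain the queue in parallel batches, checkpointing state
// after each batch via saveState().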
while (this.queue.length > 0 && this.visited.size < this.maxPages) {
const batch = [];
while (batch.length < this.concurrency && this.queue.length > 0 && this.visited.size + batch.length < this.maxPages) {
const { url, depth } = this.queue.shift();
if (this.visited.has(url) || depth > this.maxDepth) {
continue;
}
this.visited.add(url);
batch.push({ url, depth });
}
if (batch.length === 0) {
break;
}
const batchResults = await Promise.all(batch.map(async ({ url, depth }) => {
try {
const urlObj = new URL(url);
if (this.normalizeHostname(urlObj.hostname) !== baseHostname) {
return null;
}
const ext = path.extname(urlObj.pathname).toLowerCase();
if (this.supportedExtensions.includes(ext) && ext !== '.html' && ext !== '.htm') {
const fileName = this.getFileName(url, outputDir);
await this.downloadFile(url, fileName);
console.error(` Downloaded: ${url}`);
return { type: 'file', url, path: fileName };
}
for (let retry = 0; retry <= this.maxRetries; retry++) {
let page = null;
try {
page = await context.newPage();
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: this.timeout });
try {
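// Wait for a possible Cloudflare interstitial to clear: challenge pages
// are small and contain known phrases, so >10 kB of body HTML plus the
// absence of those phrases is the "real content loaded" heuristic.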
await page.waitForFunction(() => {
const bodyText = document.body.innerText;
return !bodyText.includes('Just a moment') &&
!bodyText.includes('Verifying you are human') &&
document.body.innerHTML.length > 10000;
}, { timeout: 60000 });
} catch (e) {
console.error(` ⚠ Cloudflare challenge timeout for ${url}`);
}
const html = await page.content();
if (html.includes('Just a moment') || html.includes('Verifying you are human')) {
console.error(` ✗ Skipping ${url} - still showing Cloudflare challenge`);
await page.close();
return null;
}
const fileName = this.getFileName(url, outputDir, '.html');
fs.writeFileSync(fileName, html, 'utf-8');
console.error(` Crawled: ${url}`);
let links = [];
if (depth < this.maxDepth) {
links = await page.$$eval('a[href]', anchors => anchors.map(a => a.href));
}
await page.close();
return { type: 'page', url, path: fileName, links, depth };
} catch (error) {
if (page) {
try { await page.close(); } catch (e) {}
}
if (error.message.includes('Target page, context or browser has been closed')) {
if (retry < this.maxRetries) {
console.error(` ⚠ Browser closed, restarting... (retry ${retry + 1}/${this.maxRetries})`);
try {
await browser.close();
} catch (e) {}
const result = await launchBrowser();
browser = result.browser;
context = result.context;
await new Promise(resolve => setTimeout(resolve, this.retryDelay));
continue;
}
}
throw error;
}
}
return null;
} catch (error) {
results.errors.push({ url, error: error.message });
return null;
}
}));
if (onPageCallback) {
const validItems = batchResults.filter(r => r && (r.type === 'page' || r.type === 'file'));
if (validItems.length > 0) {
// Fire and forget - don't block crawler waiting for embeddings
validItems.forEach(item => {
onPageCallback(item).catch(err => {
console.error(` ⚠ Callback error for ${item.url}: ${err.message}`);
});
});
}
}
for (const result of batchResults) {
if (!result) continue;
if (result.type === 'page') {
results.pages.push({ url: result.url, path: result.path });
for (const link of result.links) {
try {
const linkUrl = new URL(link, result.url);
// Skip non-http(s) schemes (mailto:, javascript:, etc.) and drop URL
// fragments so /page and /page#section are not queued twice.
if (linkUrl.protocol !== 'http:' && linkUrl.protocol !== 'https:') continue;
linkUrl.hash = '';
if (!this.visited.has(linkUrl.href)) {
this.queue.push({ url: linkUrl.href, depth: result.depth + 1 });
}
} catch (e) {
// Ignore malformed URLs.
}
}
} else if (result.type === 'file') {
results.files.push({ url: result.url, path: result.path });
}
// Errors are already collected in the catch above; no result carries type 'error'.
}
this.saveState();
}
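// If the browser died mid-crawl with work remaining, rethrow so the
// crash-recovery wrapper in crawlSite() restarts from the saved state.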
try {
await browser.close();
} catch (e) {
if (this.visited.size < this.maxPages && this.queue.length > 0) {
throw new Error(`Browser closed unexpectedly with ${this.queue.length} pages remaining: ${e.message}`);
}
}
if (this.stateFile && fs.existsSync(this.stateFile)) {
fs.unlinkSync(this.stateFile);
}
return results;
}
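// Download a non-HTML asset with the global fetch API (Node 18+) and reject
// obvious error pages before writing to disk.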
async downloadFile(url, outputPath) {
const response = await fetch(url);
if (!response.ok) {
throw new Error(`HTTP ${response.status} fetching ${url}`);
}
const buffer = await response.arrayBuffer();
const bytes = Buffer.from(buffer);
// Check for error pages (HTML content in PDF/HTML files)
if (outputPath.endsWith('.pdf') || outputPath.endsWith('.html') || outputPath.endsWith('.htm')) {
const preview = bytes.toString('utf8', 0, Math.min(200, bytes.length));
if (preview.includes('<title>404') || preview.includes('Not Found') ||
preview.includes('Error 404') || preview.includes('Page not found') ||
preview.includes('File not found') || preview.includes('Access denied')) {
throw new Error(`Error page detected: 404/Not Found`);
}
}
fs.writeFileSync(outputPath, bytes);
}
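// Map a URL to a flat, filesystem-safe filename inside outputDir. Path
// separators become underscores, so distinct paths can collide
// (e.g. /a/b and /a_b).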
getFileName(url, outputDir, forceExt = null) {
const urlObj = new URL(url);
let pathname = urlObj.pathname;
if (pathname === '/' || pathname === '') {
pathname = '/index';
}
if (forceExt) {
// Strip the extension of the final segment only; a dot inside a directory
// name (e.g. /v1.2/page) must not be treated as an extension.
pathname = pathname.replace(/\.[^./]*$/, '') + forceExt;
}
const sanitized = pathname.replace(/^\//, '').replace(/[^a-zA-Z0-9._-]/g, '_');
return path.join(outputDir, sanitized || 'index.html');
}
}
module.exports = { WebCrawler };
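
// Usage sketch (illustrative only; the require path, URL, and directories
// below are assumptions, not part of this module):
//
// const { WebCrawler } = require('./crawler');
//
// const crawler = new WebCrawler({
//   maxPages: 50,
//   maxDepth: 2,
//   concurrency: 3,
//   stateFile: './crawl-state.json' // enables resume after a crash
// });
//
// crawler.crawlSite('https://example.com', './output').then(results => {
//   console.log(`pages: ${results.pages.length}, ` +
//     `files: ${results.files.length}, errors: ${results.errors.length}`);
// });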