node-ai-ragbot

Node.js backend package for building AI chatbots and voicebots with Retrieval-Augmented Generation (RAG). It ingests website pages or local files (PDF, DOCX, TXT, MD), creates embeddings with LangChain + OpenAI, and stores them in a fast in-memory vector data store.
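The ingestion-and-retrieval flow the description refers to can be sketched with LangChain JS's OpenAI embeddings and in-memory vector store. This is an illustrative sketch of the general approach, not node-ai-ragbot's actual API; the import paths are LangChain's (and may differ across LangChain versions), and the sample texts and function names are made up for illustration.

const { OpenAIEmbeddings } = require("@langchain/openai");
const { MemoryVectorStore } = require("langchain/vectorstores/memory");

// Hypothetical helper: embed extracted text chunks and keep the vectors in memory.
async function buildStore(texts) {
  const embeddings = new OpenAIEmbeddings(); // reads OPENAI_API_KEY from the environment
  return MemoryVectorStore.fromTexts(
    texts,
    texts.map((_, i) => ({ id: i })), // minimal per-chunk metadata
    embeddings
  );
}

async function main() {
  const store = await buildStore([
    "Our support hours are 9am to 5pm, Monday to Friday.",
    "Refunds are processed within 14 days of purchase.",
  ]);
  // Retrieval step of RAG: fetch the chunks most relevant to a user question.
  const results = await store.similaritySearch("When can I contact support?", 2);
  console.log(results.map((doc) => doc.pageContent));
}

main();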

// Breadth-first site crawler: collects same-origin page URLs for later ingestion.
async function crawlSiteWithPuppeteer(startUrl, browser, maxPages = 50) {
  const visited = new Set();
  const toVisit = [startUrl];
  const base = new URL(startUrl).origin;
  const found = [];

  // Dequeue a URL, load it, collect its same-origin links, and stop once
  // maxPages pages have been visited.
  while (toVisit.length && visited.size < maxPages) {
    const url = toVisit.shift();
    if (visited.has(url)) continue;
    try {
      const page = await browser.newPage();
      await page.goto(url, { waitUntil: "domcontentloaded", timeout: 20000 });

      // Extract candidate links in the page context, dropping mailto:/tel:/javascript:
      // links, fragment URLs, and direct links to binary or document files.
      const links = await page.evaluate(() => {
        const anchors = Array.from(document.querySelectorAll("a[href]"));
        const urls = anchors
          .map((a) => a.href.trim())
          .filter(Boolean)
          .filter(
            (h) =>
              h.startsWith("http") &&
              !h.startsWith("mailto:") &&
              !h.startsWith("tel:") &&
              !h.toLowerCase().includes("javascript:") &&
              !h.includes("#") &&
              !h.match(/\.(pdf|jpg|jpeg|png|gif|svg|zip|docx?|xls|pptx?)$/i)
          );
        return [...new Set(urls)];
      });
      await page.close();

      visited.add(url);
      found.push(url);

      // Queue links that belong to the same origin and haven't been seen yet.
      for (const link of links) {
        try {
          const abs = new URL(link, base).href;
          if (
            abs.startsWith(base) &&
            !visited.has(abs) &&
            !toVisit.includes(abs)
          ) {
            toVisit.push(abs);
          }
        } catch {
          // ignore malformed URLs
        }
      }
    } catch {
      // ignore per-page failures
    }
  }
  return found;
}

module.exports = { crawlSiteWithPuppeteer };
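A minimal usage sketch, assuming Puppeteer is installed alongside the package. The require path for the crawler module is hypothetical, since this page does not show where the file lives inside node-ai-ragbot; puppeteer.launch() and browser.close() are standard Puppeteer calls.

const puppeteer = require("puppeteer");
// Hypothetical path: adjust to wherever this module lives inside the package.
const { crawlSiteWithPuppeteer } = require("./crawler");

(async () => {
  const browser = await puppeteer.launch();
  try {
    // Collect up to 20 same-origin page URLs, starting from the homepage.
    const urls = await crawlSiteWithPuppeteer("https://example.com", browser, 20);
    console.log(`Found ${urls.length} pages:`, urls);
  } finally {
    await browser.close();
  }
})();

The caller owns the browser instance, so one launched browser can be reused across multiple crawls and must be closed when done.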