UNPKG

@lenne.tech/cli

Version:

lenne.Tech CLI: lt

662 lines (661 loc) • 30.3 kB

JavaScript

"use strict";
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlSite = crawlSite;
/**
 * Website crawler utilities.
 *
 * Fetches web pages (optionally guided by sitemap.xml), extracts the
 * main content using the same defuddle + Turndown pipeline as the
 * chrome-md browser extension (see ../../../chrome-md/content/content.js),
 * converts it to Markdown, and writes one .md file per page plus an
 * overview README when multiple pages are discovered. Designed for
 * building Claude Code knowledge bases.
 */
const axios_1 = __importDefault(require("axios"));
const crypto_1 = require("crypto");
const defuddle_1 = __importDefault(require("defuddle"));
const fs_1 = require("fs");
const jsdom_1 = require("jsdom");
const path_1 = require("path");
const turndown_1 = __importDefault(require("turndown"));
const turndown_plugin_gfm_1 = require("turndown-plugin-gfm");
const browser_fetcher_1 = require("./browser-fetcher");
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; lenneTech-CLI-Crawler/1.0; +https://lenne.tech)';
/**
 * Crawl a website starting at `options.url` and write the collected
 * pages as Markdown files beneath `options.outDir`.
 *
 * @param {object} options Crawl options. Recognised keys and defaults:
 *   `url` (seed URL), `outDir`, `depth` (number or `'all'`),
 *   `maxPages` (200), `concurrency` (4), `includeSitemap` (true),
 *   `includeImages` (true), `renderJs` (false), `autoInstallBrowser` (false),
 *   `selector`, `timeout` (20000 ms), `userAgent`, `prune` (false),
 *   `onLog` (no-op).
 * @returns {Promise<object>} `{ errors, indexFile, outDir, pages, pruned, skipped }`.
 *   `indexFile` is only set when more than one page was rendered; when no
 *   page was rendered the result has no `indexFile` and `pruned` is empty.
 */
function crawlSite(options) {
    return __awaiter(this, void 0, void 0, function* () {
        const {
            autoInstallBrowser = false,
            concurrency = 4,
            depth: rawDepth,
            includeImages = true,
            includeSitemap = true,
            maxPages = 200,
            onLog = () => undefined,
            outDir,
            prune = false,
            renderJs = false,
            selector,
            timeout = 20000,
            url: seedUrl,
            userAgent = DEFAULT_USER_AGENT,
        } = options;
        // Normalize depth. `'all'` and negative numbers mean "follow every
        // same-origin link we find" — bounded by `maxPages`.
        const depth = rawDepth === 'all' || (typeof rawDepth === 'number' && rawDepth < 0)
            ? Number.POSITIVE_INFINITY
            : Number(rawDepth);
        const http = axios_1.default.create({
            headers: { 'User-Agent': userAgent },
            maxRedirects: 5,
            responseType: 'text',
            timeout,
            validateStatus: (status) => status >= 200 && status < 400,
        });
        // Headless browser only spun up when needed (SPA-mode).
        let browserFetcher = null;
        if (renderJs) {
            browserFetcher = yield (0, browser_fetcher_1.createBrowserFetcher)({
                autoInstall: autoInstallBrowser,
                extraWaitMs: 500,
                maxWaitMs: timeout,
                onLog,
                userAgent,
            });
        }
        try {
            const seed = new URL(seedUrl);
            const origin = seed.origin;
            if (!fs_1.existsSync(outDir)) {
                fs_1.mkdirSync(outDir, { recursive: true });
            }
            // Queue preserves the depth at which a URL was discovered so children
            // are only followed when `discovered.depth < options.depth`.
            const queue = [{ depth: 0, url: normalizeUrl(seedUrl) }];
            const seen = new Set([normalizeUrl(seedUrl)]);
            if (includeSitemap) {
                onLog(`Checking sitemap at ${origin}/sitemap.xml`);
                const sitemapUrls = yield fetchSitemapUrls(http, origin, onLog);
                for (const sitemapUrl of sitemapUrls) {
                    const normalized = normalizeUrl(sitemapUrl);
                    if (!seen.has(normalized) && sameOrigin(normalized, origin)) {
                        seen.add(normalized);
                        queue.push({ depth: 0, url: normalized });
                    }
                }
                if (sitemapUrls.length > 0) {
                    onLog(`Sitemap discovered ${sitemapUrls.length} URLs`);
                }
            }
            const pages = [];
            const errors = [];
            const skipped = [];
            // Shared deduplicated image map (content hash -> relative path under outDir).
            const imageHashToPath = new Map();
            // We can't know upfront whether the crawl is single- or multi-page,
            // so we render pages into a buffer first and only materialize files
            // once the queue drains.
            const rendered = [];
            // Page budget. `pages` stays empty until files are written after the
            // crawl, so the budget has to be measured against `rendered` (plus
            // failures). `inFlight` reserves a slot for every page currently
            // being fetched so concurrent workers cannot overshoot `maxPages`.
            let inFlight = 0;
            const budgetExhausted = () => rendered.length + errors.length + inFlight >= maxPages;
            const processPage = (item) => __awaiter(this, void 0, void 0, function* () {
                if (budgetExhausted()) {
                    skipped.push(item.url);
                    return;
                }
                onLog(`Fetching (depth ${item.depth}): ${item.url}`);
                // Reserved synchronously (no `yield` since the budget check), so
                // check-and-reserve cannot interleave with another worker.
                inFlight++;
                try {
                    let html;
                    let finalUrl = normalizeUrl(item.url);
                    if (browserFetcher) {
                        // In render mode we trust the URL we navigated to. We can't
                        // cheaply detect redirects here, so assume same origin (the
                        // crawler already filtered non-HTML URLs out of the queue).
                        html = yield browserFetcher.fetch(item.url);
                    }
                    else {
                        const response = yield http.get(item.url);
                        // Null-safe read of `response.request.res.responseUrl`; falls
                        // back to the requested URL when it is not available.
                        const request = response.request;
                        const rawResponse = request === null || request === undefined ? undefined : request.res;
                        const responseUrl = rawResponse === null || rawResponse === undefined
                            ? undefined
                            : rawResponse.responseUrl;
                        finalUrl = normalizeUrl(responseUrl || item.url);
                        if (!sameOrigin(finalUrl, origin)) {
                            skipped.push(item.url);
                            return;
                        }
                        const contentType = String(response.headers['content-type'] || '');
                        if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
                            skipped.push(item.url);
                            return;
                        }
                        html = String(response.data || '');
                    }
                    const extracted = yield extractContent(html, finalUrl, { selector });
                    // Follow links when depth budget is left.
                    if (item.depth < depth) {
                        for (const link of extracted.links) {
                            if (!sameOrigin(link, origin)) {
                                continue;
                            }
                            const normalized = normalizeUrl(link);
                            if (seen.has(normalized)) {
                                continue;
                            }
                            seen.add(normalized);
                            queue.push({ depth: item.depth + 1, url: normalized });
                        }
                    }
                    // Download images and build a URL -> local path map for Turndown.
                    // NOTE(review): `imageEntries` only receives images whose bytes were
                    // not seen on an earlier page, so `imageCount` below counts newly
                    // stored images rather than all images on the page — confirm intent.
                    const imageEntries = [];
                    if (includeImages && extracted.images.length > 0) {
                        for (const imgUrl of extracted.images) {
                            try {
                                const absolute = new URL(imgUrl, finalUrl).href;
                                const result = yield fetchImage(http, absolute);
                                if (!result) {
                                    continue;
                                }
                                const hash = crypto_1.createHash('sha1').update(result.buffer).digest('hex');
                                let relativeImagePath = imageHashToPath.get(hash);
                                if (!relativeImagePath) {
                                    // Filename uses a content-hash suffix so re-runs with
                                    // identical bytes overwrite the same file instead of
                                    // leaving orphans with rotating counter suffixes.
                                    const filename = buildImageFilename(absolute, hash, result.contentType);
                                    relativeImagePath = `images/${filename}`;
                                    imageHashToPath.set(hash, relativeImagePath);
                                    imageEntries.push({ data: result.buffer, filename });
                                }
                                extracted.imageMap.set(imgUrl, relativeImagePath);
                                extracted.imageMap.set(absolute, relativeImagePath);
                            }
                            catch (_c) {
                                // Skip image on error; continue with others.
                            }
                        }
                    }
                    const markdown = convertToMarkdown(extracted.contentHtml, finalUrl, extracted.imageMap);
                    const filename = buildPageFilename(finalUrl, rendered.length === 0);
                    rendered.push({
                        filename,
                        images: imageEntries,
                        info: {
                            author: extracted.meta.author,
                            depth: item.depth,
                            description: extracted.meta.description,
                            downloadDate: new Date().toISOString(),
                            firstDownloaded: new Date().toISOString(),
                            imageCount: imageEntries.length,
                            language: extracted.meta.language,
                            ogImage: extracted.meta.ogImage,
                            title: extracted.meta.title,
                            url: finalUrl,
                            wordCount: extracted.meta.wordCount || countWords(extracted.contentText),
                        },
                        markdown,
                    });
                }
                catch (error) {
                    errors.push({
                        reason: error instanceof Error ? error.message : String(error),
                        url: item.url,
                    });
                }
                finally {
                    // Release the reservation; the page is now counted through
                    // `rendered`/`errors` or was skipped and costs no budget.
                    inFlight--;
                }
            });
            // Simple parallel worker pool. `queue` grows as pages are discovered,
            // so workers pick new items until nothing is left.
            let cursor = 0;
            const worker = () => __awaiter(this, void 0, void 0, function* () {
                while (cursor < queue.length && !budgetExhausted()) {
                    const item = queue[cursor++];
                    yield processPage(item);
                }
            });
            // A non-numeric or fractional `concurrency` must still start at least
            // one worker, otherwise the queue would never be consumed.
            const poolSize = Math.max(1, Math.floor(Number(concurrency)) || 1);
            const runWorkers = () => Promise.all(Array.from({ length: poolSize }, () => worker()));
            yield runWorkers();
            // Drain any late discoveries added after all initial workers exited.
            // Stop once the budget is used up: workers would return immediately
            // without advancing `cursor`, and the loop would never terminate.
            while (cursor < queue.length && !budgetExhausted()) {
                yield runWorkers();
            }
            // Whatever is still queued was dropped because of `maxPages`.
            for (; cursor < queue.length; cursor++) {
                skipped.push(queue[cursor].url);
            }
            const multiPage = rendered.length > 1;
            const pagesDir = multiPage ? path_1.join(outDir, 'pages') : outDir;
            const imagesDir = path_1.join(outDir, 'images');
            if (rendered.length === 0) {
                onLog('No pages rendered');
                return { errors, outDir, pages, pruned: [], skipped };
            }
            fs_1.mkdirSync(pagesDir, { recursive: true });
            if (includeImages && imageHashToPath.size > 0) {
                fs_1.mkdirSync(imagesDir, { recursive: true });
            }
            // Write deduplicated images.
            const writtenImageFilenames = new Set();
            for (const entry of rendered.flatMap((r) => r.images)) {
                if (writtenImageFilenames.has(entry.filename)) {
                    continue;
                }
                writtenImageFilenames.add(entry.filename);
                fs_1.writeFileSync(path_1.join(imagesDir, entry.filename), entry.data);
            }
            // Persist pages. When updating, preserve the original
            // `first_downloaded` timestamp so history stays intact.
            for (const entry of rendered) {
                const outputPath = path_1.join(pagesDir, entry.filename);
                const relativePath = path_1.relative(outDir, outputPath);
                // Images live under `<outDir>/images/`. Each page rewrites the
                // Turndown-emitted `images/<file>` placeholder to the correct
                // relative path so nested URL slugs (`pages/ueber-uns/…`, or a
                // single-page crawl that lands in `<outDir>/ueber-uns/…`) still
                // render in Markdown previews.
                const imagePrefix = `${path_1.relative(path_1.dirname(outputPath), imagesDir).split(/[\\/]/).join('/')}/`;
                const fixedMarkdown = entry.markdown.replace(/\]\(images\//g, `](${imagePrefix}`);
                if (fs_1.existsSync(outputPath)) {
                    const existing = fs_1.readFileSync(outputPath, 'utf8');
                    const existingMeta = parseFrontmatter(existing);
                    if (existingMeta && existingMeta.first_downloaded) {
                        entry.info.firstDownloaded = String(existingMeta.first_downloaded);
                    }
                }
                const frontmatter = renderFrontmatter(entry.info);
                fs_1.mkdirSync(path_1.dirname(outputPath), { recursive: true });
                fs_1.writeFileSync(outputPath, `${frontmatter}\n${fixedMarkdown.trim()}\n`);
                pages.push(Object.assign(Object.assign({}, entry.info), { outputPath, relativePath }));
            }
            let indexFile;
            if (multiPage) {
                indexFile = path_1.join(outDir, 'README.md');
                fs_1.writeFileSync(indexFile, renderOverview(seed.href, pages));
            }
            // Prune orphans (files left over from previous crawls). Scoped to
            // `pages/` and `images/` so stray user files in outDir root never
            // get touched. Only active in multi-page mode — a single-page
            // crawl writes into `outDir` itself and has no page subfolder to
            // sweep.
            const pruned = [];
            if (prune && multiPage) {
                const keep = new Set(pages.map((p) => p.outputPath));
                for (const entry of rendered.flatMap((r) => r.images)) {
                    keep.add(path_1.join(imagesDir, entry.filename));
                }
                pruned.push(...pruneOrphans(pagesDir, keep));
                if (fs_1.existsSync(imagesDir)) {
                    pruned.push(...pruneOrphans(imagesDir, keep));
                }
                if (pruned.length > 0) {
                    onLog(`Pruned ${pruned.length} orphaned file(s)`);
                }
            }
            return { errors, indexFile, outDir, pages, pruned, skipped };
        }
        finally {
            // Guarantee the headless browser is shut down on every exit path,
            // including thrown errors, so no orphan chromium processes linger.
            if (browserFetcher) {
                yield browserFetcher.close().catch(() => undefined);
            }
        }
    });
}
/**
 * Build the on-disk filename for a downloaded image.
 *
 * The basename comes from the last URL path segment (sanitized, at most
 * 40 characters, `image` as fallback). The extension comes from the URL
 * when it is a known image extension, otherwise from the content type,
 * otherwise `png`. The first 8 characters of the content hash are appended.
 *
 * @param {string} url Absolute image URL.
 * @param {string} contentHash Hex digest of the image bytes.
 * @param {string} contentType Value of the response `content-type` header.
 * @returns {string} `<basename>-<hash8>.<extension>`.
 */
function buildImageFilename(url, contentHash, contentType) {
    let basename = 'image';
    let extension = '';
    try {
        const u = new URL(url);
        const last = u.pathname.split('/').filter(Boolean).pop() || '';
        const parsedExt = path_1.extname(last).replace('.', '').toLowerCase();
        if (parsedExt && /^(jpg|jpeg|png|gif|webp|svg|avif)$/.test(parsedExt)) {
            extension = parsedExt;
        }
        basename =
            last
                .replace(path_1.extname(last), '')
                .replace(/[^a-zA-Z0-9-_]/g, '_')
                .substring(0, 40) || 'image';
    }
    catch (_a) {
        // Unparseable URL: keep the defaults and derive the extension below.
    }
    if (!extension) {
        const fromType = contentType.split(';')[0].split('/')[1];
        if (fromType && /^(jpeg|jpg|png|gif|webp|svg\+xml|avif)$/.test(fromType)) {
            extension = fromType === 'svg+xml' ? 'svg' : fromType;
        }
        else {
            extension = 'png';
        }
    }
    return `${basename}-${contentHash.slice(0, 8)}.${extension}`;
}
/**
 * Derive the Markdown filename (relative, `/`-separated) for a page URL.
 *
 * Each path segment is lower-cased, stripped of `.html`/`.htm`/`.php`/
 * `.asp`/`.aspx`, and reduced to `[a-z0-9-_]`; empty segments become `page`.
 * A URL without path segments maps to `index.md` for the first rendered
 * page and `home.md` otherwise. The query string is not considered.
 *
 * @param {string} url Absolute page URL.
 * @param {boolean} isFirst Whether this is the first rendered page.
 * @returns {string} Relative file path ending in `.md`.
 */
function buildPageFilename(url, isFirst) {
    const u = new URL(url);
    const segments = u.pathname.split('/').filter(Boolean);
    if (segments.length === 0) {
        return isFirst ? 'index.md' : 'home.md';
    }
    const slugged = segments
        .map((s) => s
        .toLowerCase()
        .replace(/\.(html?|php|aspx?)$/, '')
        .replace(/[^a-z0-9-_]/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-|-$/g, '') || 'page')
        .join('/');
    return `${slugged}.md`;
}
/**
 * Null-safe attribute read for nodes handed to Turndown rules: returns
 * `undefined` when the node has no `getAttribute` method.
 */
function readAttribute(node, name) {
    return node.getAttribute === null || node.getAttribute === undefined
        ? undefined
        : node.getAttribute(name);
}
/**
 * Convert extracted content HTML to Markdown with Turndown.
 *
 * Links are rewritten to absolute URLs against `baseUrl`; images are
 * pointed at their local path from `imageMap` when one exists, otherwise
 * at their absolute URL. `script`, `style`, `noscript` and `iframe`
 * elements are removed and runs of 3+ newlines are collapsed.
 *
 * @param {string} html Content HTML.
 * @param {string} baseUrl URL used to resolve relative references.
 * @param {Map<string, string>} imageMap Image URL -> local relative path.
 * @returns {string} Trimmed Markdown.
 */
function convertToMarkdown(html, baseUrl, imageMap) {
    const turndown = new turndown_1.default({
        bulletListMarker: '-',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        headingStyle: 'atx',
        linkStyle: 'inlined',
        strongDelimiter: '**',
    });
    // Enable GFM so tables, strikethrough and task lists convert cleanly.
    if (turndown_plugin_gfm_1.gfm) {
        turndown.use(turndown_plugin_gfm_1.gfm);
    }
    turndown.addRule('absoluteLinks', {
        filter: 'a',
        replacement: (content, node) => {
            const href = readAttribute(node, 'href') || '';
            if (!href || href === '#' || href.startsWith('javascript:')) {
                return content;
            }
            let absolute = href;
            try {
                absolute = new URL(href, baseUrl).href;
            }
            catch (_e) {
                // keep original
            }
            const title = readAttribute(node, 'title');
            return title ? `[${content}](${absolute} "${title}")` : `[${content}](${absolute})`;
        },
    });
    turndown.addRule('localImages', {
        filter: 'img',
        replacement: (_content, node) => {
            const src = readAttribute(node, 'src') || '';
            if (!src) {
                return '';
            }
            let absolute = src;
            try {
                absolute = new URL(src, baseUrl).href;
            }
            catch (_e) {
                // keep original
            }
            const local = imageMap.get(src) || imageMap.get(absolute);
            const alt = readAttribute(node, 'alt') || '';
            const target = local || absolute;
            return `![${alt}](${target})`;
        },
    });
    turndown.remove(['script', 'style', 'noscript', 'iframe']);
    const markdown = turndown.turndown(html);
    return markdown.replace(/\n{3,}/g, '\n\n').trim();
}
/** Count whitespace-separated words in `text`. */
function countWords(text) {
    return text.replace(/\s+/g, ' ').trim().split(' ').filter(Boolean).length;
}
/** Escape backslashes and double quotes and flatten newlines for a double-quoted YAML scalar. */
function escapeYaml(value) {
    return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"').replace(/\n/g, ' ');
}
/** Return `doc.body.innerHTML`, or `undefined` when the document has no body. */
function bodyHtml(doc) {
    return doc.body === null || doc.body === undefined ? undefined : doc.body.innerHTML;
}
/** Return the `content` attribute of the first element matching `selector`, if any. */
function metaContent(doc, selector) {
    const el = doc.querySelector(selector);
    return el === null || el === undefined ? undefined : el.getAttribute('content');
}
/**
 * Extract main content + metadata using defuddle (the same engine as
 * chrome-md). Falls back to a raw body dump if defuddle fails.
 *
 * @param {string} html Raw page HTML.
 * @param {string} pageUrl URL of the page, used to resolve relative URLs.
 * @param {{selector?: string}} options Optional content selector for defuddle.
 * @returns {Promise<object>} `{ contentHtml, contentText, imageMap, images, links, meta }`
 *   where `images`/`links` are absolute URLs and `imageMap` starts empty.
 */
function extractContent(html, pageUrl, options) {
    return __awaiter(this, void 0, void 0, function* () {
        const dom = new jsdom_1.JSDOM(html, { url: pageUrl });
        const doc = dom.window.document;
        const defuddleOptions = {
            markdown: false,
            removeHiddenElements: true,
            removeLowScoring: true,
            removeSmallImages: false,
        };
        if (options.selector) {
            defuddleOptions.contentSelector = options.selector;
        }
        let parsed;
        try {
            // Same class-based API as chrome-md's content script.
            const instance = new defuddle_1.default(doc, defuddleOptions);
            parsed = instance.parse();
        }
        catch (_h) {
            parsed = {
                content: bodyHtml(doc) || html,
                title: doc.title,
            };
        }
        const contentHtml = parsed.content || bodyHtml(doc) || '';
        // Collect images and links from the cleaned content.
        const helperDom = new jsdom_1.JSDOM(`<!DOCTYPE html><html><body>${contentHtml}</body></html>`, {
            url: pageUrl,
        });
        const contentDoc = helperDom.window.document;
        const links = new Set();
        contentDoc.querySelectorAll('a[href]').forEach((el) => {
            const href = (el.getAttribute('href') || '').trim();
            if (!href || href.startsWith('#') || href.startsWith('mailto:') || href.startsWith('javascript:')) {
                return;
            }
            try {
                links.add(new URL(href, pageUrl).href);
            }
            catch (_a) {
                // ignore malformed URLs
            }
        });
        const images = new Set();
        contentDoc.querySelectorAll('img').forEach((el) => {
            const src = (el.getAttribute('src') || el.getAttribute('data-src') || '').trim();
            if (!src || src.startsWith('data:')) {
                return;
            }
            try {
                images.add(new URL(src, pageUrl).href);
            }
            catch (_a) {
                // ignore malformed URLs
            }
        });
        // Some lazy-loading frameworks keep the real URL only in the source
        // document (stripped out by defuddle), so also consult the original DOM.
        doc.querySelectorAll('img[data-src], img[data-lazy-src]').forEach((el) => {
            const src = (el.getAttribute('data-src') || el.getAttribute('data-lazy-src') || '').trim();
            if (!src || src.startsWith('data:')) {
                return;
            }
            try {
                images.add(new URL(src, pageUrl).href);
            }
            catch (_a) {
                // ignore
            }
        });
        const meta = {
            author: parsed.author || metaContent(doc, 'meta[name="author"]') || undefined,
            description: parsed.description ||
                metaContent(doc, 'meta[name="description"]') ||
                metaContent(doc, 'meta[property="og:description"]') ||
                '',
            language: parsed.language || doc.documentElement.getAttribute('lang') || undefined,
            ogImage: parsed.image || metaContent(doc, 'meta[property="og:image"]') || undefined,
            title: parsed.title || doc.title || pageUrl,
            wordCount: parsed.wordCount,
        };
        const contentBody = contentDoc.body;
        const contentText = (contentBody === null || contentBody === undefined ? undefined : contentBody.textContent) || '';
        return {
            contentHtml,
            contentText,
            imageMap: new Map(),
            images: [...images],
            links: [...links],
            meta,
        };
    });
}
/**
 * Download an image as a Buffer.
 *
 * @param {object} http Axios instance.
 * @param {string} url Absolute image URL.
 * @returns {Promise<{buffer: Buffer, contentType: string} | null>} `null`
 *   when the request fails or the body is empty (best-effort by design).
 */
function fetchImage(http, url) {
    return __awaiter(this, void 0, void 0, function* () {
        try {
            const response = yield http.get(url, { responseType: 'arraybuffer' });
            const buffer = Buffer.from(response.data);
            if (buffer.byteLength === 0) {
                return null;
            }
            return { buffer, contentType: String(response.headers['content-type'] || '') };
        }
        catch (_a) {
            return null;
        }
    });
}
/**
 * Decode the five predefined XML entities. Sitemap `<loc>` values must be
 * entity-escaped (e.g. `&amp;` for `&`), so they need decoding before being
 * used as URLs. `&amp;` is handled last so `&amp;lt;` yields `&lt;`.
 */
function decodeXmlEntities(value) {
    return value
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&apos;/g, "'")
        .replace(/&amp;/g, '&');
}
/**
 * Collect page URLs from `<origin>/sitemap.xml`, following nested sitemap
 * indexes. Each sitemap URL is fetched at most once; fetch failures are
 * reported through `onLog` and otherwise ignored.
 *
 * @param {object} http Axios instance.
 * @param {string} origin Origin of the seed URL (no trailing slash).
 * @param {(message: string) => void} onLog Log callback.
 * @returns {Promise<string[]>} Entity-decoded `<url><loc>` values in discovery order.
 */
function fetchSitemapUrls(http, origin, onLog) {
    return __awaiter(this, void 0, void 0, function* () {
        const urls = [];
        const visited = new Set();
        function walk(sitemapUrl) {
            return __awaiter(this, void 0, void 0, function* () {
                if (visited.has(sitemapUrl)) {
                    return;
                }
                visited.add(sitemapUrl);
                try {
                    const response = yield http.get(sitemapUrl);
                    const xml = String(response.data || '');
                    // Nested sitemap index: follow each <sitemap><loc>...</loc></sitemap>.
                    const nested = [...xml.matchAll(/<sitemap>[\s\S]*?<loc>\s*([^<\s]+)\s*<\/loc>[\s\S]*?<\/sitemap>/gi)]
                        .map((m) => decodeXmlEntities(m[1]));
                    for (const child of nested) {
                        yield walk(child);
                    }
                    const pageMatches = [...xml.matchAll(/<url>[\s\S]*?<loc>\s*([^<\s]+)\s*<\/loc>[\s\S]*?<\/url>/gi)]
                        .map((m) => decodeXmlEntities(m[1]));
                    urls.push(...pageMatches);
                }
                catch (error) {
                    onLog(`Sitemap fetch failed for ${sitemapUrl}: ${error instanceof Error ? error.message : String(error)}`);
                }
            });
        }
        yield walk(`${origin}/sitemap.xml`);
        return urls;
    });
}
/**
 * Normalize a URL for dedup: strip hash, drop default `index.html`,
 * and remove trailing slashes (except root). Returns the input unchanged
 * when it cannot be parsed as a URL.
 */
function normalizeUrl(raw) {
    try {
        const u = new URL(raw);
        u.hash = '';
        u.pathname = u.pathname.replace(/\/index\.html?$/i, '/');
        if (u.pathname.length > 1 && u.pathname.endsWith('/')) {
            u.pathname = u.pathname.replace(/\/+$/, '');
        }
        return u.href;
    }
    catch (_a) {
        return raw;
    }
}
/**
 * Parse the leading `---` frontmatter block of a Markdown file into a flat
 * key -> string object. Only `key: value` lines are read; one pair of
 * surrounding double quotes is stripped (escape sequences are not decoded).
 *
 * @param {string} markdown File contents.
 * @returns {object | null} Parsed pairs, or `null` when no frontmatter block is found.
 */
function parseFrontmatter(markdown) {
    if (!markdown.startsWith('---')) {
        return null;
    }
    const end = markdown.indexOf('\n---', 3);
    if (end === -1) {
        return null;
    }
    const block = markdown.slice(3, end);
    const result = {};
    for (const line of block.split('\n')) {
        const match = line.match(/^([a-zA-Z0-9_]+):\s*(.*)$/);
        if (!match) {
            continue;
        }
        result[match[1]] = match[2].replace(/^"(.*)"$/, '$1');
    }
    return result;
}
/**
 * Walk `rootDir` recursively and delete every file whose absolute
 * path is not in `keepPaths`. Empty directories left behind after
 * the sweep are removed, too. Returns the absolute paths that were
 * actually deleted.
 */
function pruneOrphans(rootDir, keepPaths) {
    const removed = [];
    if (!fs_1.existsSync(rootDir)) {
        return removed;
    }
    const entries = fs_1.readdirSync(rootDir, { withFileTypes: true });
    for (const entry of entries) {
        const full = path_1.join(rootDir, entry.name);
        if (entry.isDirectory()) {
            removed.push(...pruneOrphans(full, keepPaths));
            // Remove directory if now empty.
            try {
                if (fs_1.readdirSync(full).length === 0) {
                    fs_1.rmdirSync(full);
                }
            }
            catch (_a) {
                // Directory not empty or already gone — ignore.
            }
        }
        else if (entry.isFile() && !keepPaths.has(full)) {
            try {
                fs_1.unlinkSync(full);
                removed.push(full);
            }
            catch (_b) {
                // File already removed or permission denied — skip.
            }
        }
    }
    return removed;
}
/**
 * Render the YAML frontmatter block for a page. Optional fields
 * (`description`, `author`, `language`, `og_image`, `image_count`) are
 * omitted when falsy; the description is truncated to 500 characters.
 *
 * @param {object} info Page info as built by `crawlSite`.
 * @returns {string} Frontmatter including both `---` fences, no trailing newline.
 */
function renderFrontmatter(info) {
    const lines = [
        '---',
        `title: "${escapeYaml(info.title)}"`,
        `source_url: "${info.url}"`,
        `source_domain: "${new URL(info.url).hostname}"`,
        `crawl_depth: ${info.depth}`,
        `download_date: "${info.downloadDate}"`,
        `first_downloaded: "${info.firstDownloaded}"`,
        info.description ? `description: "${escapeYaml(truncate(info.description, 500))}"` : null,
        info.author ? `author: "${escapeYaml(info.author)}"` : null,
        info.language ? `language: "${escapeYaml(info.language)}"` : null,
        info.ogImage ? `og_image: "${escapeYaml(info.ogImage)}"` : null,
        info.imageCount ? `image_count: ${info.imageCount}` : null,
        `word_count: ${info.wordCount}`,
        'content_type: "webpage"',
        '---',
    ].filter((l) => l !== null);
    return lines.join('\n');
}
/**
 * Render the overview README listing every crawled page, sorted by URL.
 *
 * @param {string} startUrl Seed URL of the crawl.
 * @param {object[]} pages Page records carrying `title`, `url`, `relativePath`,
 *   `downloadDate` and optionally `description`.
 * @returns {string} Markdown document.
 */
function renderOverview(startUrl, pages) {
    const ordered = [...pages].sort((a, b) => a.url.localeCompare(b.url));
    const host = new URL(startUrl).host;
    const lines = [];
    lines.push(`# ${host} — Knowledge Base`);
    lines.push('');
    lines.push(`Source: ${startUrl}`);
    lines.push('');
    lines.push(`Generated: ${new Date().toISOString()}`);
    lines.push('');
    lines.push(`Pages: ${ordered.length}`);
    lines.push('');
    lines.push('## Pages');
    lines.push('');
    for (const page of ordered) {
        lines.push(`### [${page.title}](${page.relativePath.split(/[\\/]/).join('/')})`);
        lines.push('');
        lines.push(`- URL: ${page.url}`);
        if (page.description) {
            lines.push(`- ${truncate(page.description, 240)}`);
        }
        lines.push(`- Updated: ${page.downloadDate}`);
        lines.push('');
    }
    return lines.join('\n');
}
/** Whether `url` parses and has exactly the given origin; `false` for unparseable input. */
function sameOrigin(url, origin) {
    try {
        return new URL(url).origin === origin;
    }
    catch (_a) {
        return false;
    }
}
/** Shorten `value` to at most `max` characters, ending in `…` when it was cut. */
function truncate(value, max) {
    return value.length <= max ? value : `${value.slice(0, max - 1)}…`;
}