// @lenne.tech/cli
// Version:
// lenne.Tech CLI: lt
// 662 lines (661 loc) • 30.3 kB
// JavaScript
;
// Compiler-emitted helper that runs a generator function as an async
// function: every yielded value is awaited and its outcome is fed back
// into the generator; the returned promise settles with the generator's
// return value or with the first error that escapes it.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Values that already are instances of P pass through; anything else is wrapped.
    const toPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((settle) => { settle(value); });
    };
    return new (P || (P = Promise))((resolve, reject) => {
        // Resume the generator with a fulfilled value.
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        };
        // Resume the generator by throwing the rejection reason into it.
        const onRejected = (reason) => {
            try {
                advance(generator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        };
        // Either finish with the return value or wait for the yielded value.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            toPromise(result.value).then(onFulfilled, onRejected);
        };
        advance((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
// Compiler-emitted interop helper: modules flagged with `__esModule` are
// returned untouched, anything else is wrapped as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Flag this CommonJS module with `__esModule`, the marker that
// `__importDefault` above checks before wrapping a module.
Object.defineProperty(exports, "__esModule", { value: true });
// Export `crawlSite`; the function declaration further down is hoisted,
// so it can be referenced here.
exports.crawlSite = crawlSite;
/**
* Website crawler utilities.
*
* Fetches web pages (optionally guided by sitemap.xml), extracts the
* main content using the same defuddle + Turndown pipeline as the
* chrome-md browser extension (see ../../../chrome-md/content/content.js),
* converts it to Markdown, and writes one .md file per page plus an
* overview README when multiple pages are discovered. Designed for
* building Claude Code knowledge bases.
*/
const axios_1 = __importDefault(require("axios"));
const crypto_1 = require("crypto");
const defuddle_1 = __importDefault(require("defuddle"));
const fs_1 = require("fs");
const jsdom_1 = require("jsdom");
const path_1 = require("path");
const turndown_1 = __importDefault(require("turndown"));
const turndown_plugin_gfm_1 = require("turndown-plugin-gfm");
const browser_fetcher_1 = require("./browser-fetcher");
// Default `User-Agent` value; `crawlSite` falls back to it when
// `options.userAgent` is not provided.
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; lenneTech-CLI-Crawler/1.0; +https://lenne.tech)';
/**
 * Crawl a website starting at `options.url` and write the collected
 * pages as Markdown files beneath `options.outDir`.
 *
 * Recognized options (defaults in parentheses):
 * - `url`, `outDir`: seed URL and output directory.
 * - `depth`: how many link hops to follow; `'all'` or a negative number
 *   means unlimited (still bounded by `maxPages`).
 * - `maxPages` (200): upper bound on rendered plus failed pages.
 * - `concurrency` (4), `timeout` (20000), `userAgent`.
 * - `includeSitemap` (true), `includeImages` (true), `selector`.
 * - `renderJs` (false), `autoInstallBrowser` (false): fetch pages through
 *   the headless browser fetcher instead of plain HTTP.
 * - `prune` (false): delete leftover files of earlier runs (multi-page only).
 * - `onLog`: callback receiving progress messages.
 *
 * @returns {Promise<object>} `{ errors, indexFile, outDir, pages, pruned, skipped }`.
 *   `indexFile` is only set when more than one page was rendered.
 */
function crawlSite(options) {
    return __awaiter(this, void 0, void 0, function* () {
        const {
            autoInstallBrowser = false,
            concurrency = 4,
            depth: rawDepth,
            includeImages = true,
            includeSitemap = true,
            maxPages = 200,
            onLog = () => undefined,
            outDir,
            prune = false,
            renderJs = false,
            selector,
            timeout = 20000,
            url: seedUrl,
            userAgent = DEFAULT_USER_AGENT,
        } = options;
        // Normalize depth. `'all'` and negative numbers mean "follow every
        // same-origin link we find" — bounded by `maxPages`. A missing depth
        // becomes NaN, for which `item.depth < depth` is always false, i.e.
        // no links are followed.
        const depth = rawDepth === 'all' || (typeof rawDepth === 'number' && rawDepth < 0) ? Number.POSITIVE_INFINITY : Number(rawDepth);
        const http = axios_1.default.create({
            headers: { 'User-Agent': userAgent },
            maxRedirects: 5,
            responseType: 'text',
            timeout,
            validateStatus: (status) => status >= 200 && status < 400,
        });
        // Headless browser only spun up when needed (SPA-mode).
        let browserFetcher = null;
        if (renderJs) {
            browserFetcher = yield (0, browser_fetcher_1.createBrowserFetcher)({
                autoInstall: autoInstallBrowser,
                extraWaitMs: 500,
                maxWaitMs: timeout,
                onLog,
                userAgent,
            });
        }
        try {
            const seed = new URL(seedUrl);
            const origin = seed.origin;
            if (!(0, fs_1.existsSync)(outDir)) {
                (0, fs_1.mkdirSync)(outDir, { recursive: true });
            }
            // Queue preserves the depth at which a URL was discovered so children
            // are only followed when `discovered.depth < options.depth`.
            const queue = [{ depth: 0, url: normalizeUrl(seedUrl) }];
            const seen = new Set([normalizeUrl(seedUrl)]);
            if (includeSitemap) {
                onLog(`Checking sitemap at ${origin}/sitemap.xml`);
                const sitemapUrls = yield fetchSitemapUrls(http, origin, onLog);
                for (const sitemapUrl of sitemapUrls) {
                    const normalized = normalizeUrl(sitemapUrl);
                    if (!seen.has(normalized) && sameOrigin(normalized, origin)) {
                        seen.add(normalized);
                        queue.push({ depth: 0, url: normalized });
                    }
                }
                if (sitemapUrls.length > 0) {
                    onLog(`Sitemap discovered ${sitemapUrls.length} URLs`);
                }
            }
            const pages = [];
            const errors = [];
            const skipped = [];
            // Shared deduplicated image map (content hash -> relative path under outDir).
            const imageHashToPath = new Map();
            // We can't know upfront whether the crawl is single- or multi-page,
            // so we render pages into a buffer first and only materialize files
            // once the queue drains.
            const rendered = [];
            // Page budget. `pages` is only filled after the crawl, so the limit
            // has to be measured against `rendered`; pages currently being
            // fetched are counted as well so concurrent workers cannot
            // overshoot `maxPages`.
            let inFlight = 0;
            const budgetExhausted = () => rendered.length + errors.length + inFlight >= maxPages;
            const processPage = (item) => __awaiter(this, void 0, void 0, function* () {
                if (budgetExhausted()) {
                    skipped.push(item.url);
                    return;
                }
                inFlight++;
                onLog(`Fetching (depth ${item.depth}): ${item.url}`);
                try {
                    let html;
                    let finalUrl = normalizeUrl(item.url);
                    if (browserFetcher) {
                        // In render mode we trust the URL we navigated to. We can't
                        // cheaply detect redirects here, so assume same origin (the
                        // crawler already filtered non-HTML URLs out of the queue).
                        html = yield browserFetcher.fetch(item.url);
                    }
                    else {
                        const response = yield http.get(item.url);
                        // After redirects the final URL is read from the underlying response.
                        const rawResponse = response.request ? response.request.res : undefined;
                        const responseUrl = rawResponse ? rawResponse.responseUrl : undefined;
                        finalUrl = normalizeUrl(responseUrl || item.url);
                        if (!sameOrigin(finalUrl, origin)) {
                            skipped.push(item.url);
                            return;
                        }
                        const contentType = String(response.headers['content-type'] || '');
                        if (!contentType.includes('text/html') && !contentType.includes('application/xhtml')) {
                            skipped.push(item.url);
                            return;
                        }
                        html = String(response.data || '');
                    }
                    const extracted = yield extractContent(html, finalUrl, { selector });
                    // Follow links when depth budget is left.
                    if (item.depth < depth) {
                        for (const link of extracted.links) {
                            if (!sameOrigin(link, origin))
                                continue;
                            const normalized = normalizeUrl(link);
                            if (seen.has(normalized))
                                continue;
                            seen.add(normalized);
                            queue.push({ depth: item.depth + 1, url: normalized });
                        }
                    }
                    // Download images and build a URL -> local path map for Turndown.
                    const imageEntries = [];
                    if (includeImages && extracted.images.length > 0) {
                        for (const imgUrl of extracted.images) {
                            try {
                                const absolute = new URL(imgUrl, finalUrl).href;
                                const result = yield fetchImage(http, absolute);
                                if (!result)
                                    continue;
                                const hash = (0, crypto_1.createHash)('sha1').update(result.buffer).digest('hex');
                                let relativeImagePath = imageHashToPath.get(hash);
                                if (!relativeImagePath) {
                                    // Filename uses a content-hash suffix so re-runs with
                                    // identical bytes overwrite the same file instead of
                                    // leaving orphans with rotating counter suffixes.
                                    const filename = buildImageFilename(absolute, hash, result.contentType);
                                    relativeImagePath = `images/${filename}`;
                                    imageHashToPath.set(hash, relativeImagePath);
                                    imageEntries.push({ data: result.buffer, filename });
                                }
                                extracted.imageMap.set(imgUrl, relativeImagePath);
                                extracted.imageMap.set(absolute, relativeImagePath);
                            }
                            catch (_c) {
                                // Skip image on error; continue with others.
                            }
                        }
                    }
                    const markdown = convertToMarkdown(extracted.contentHtml, finalUrl, extracted.imageMap);
                    const filename = buildPageFilename(finalUrl, rendered.length === 0);
                    rendered.push({
                        filename,
                        images: imageEntries,
                        info: {
                            author: extracted.meta.author,
                            depth: item.depth,
                            description: extracted.meta.description,
                            downloadDate: new Date().toISOString(),
                            firstDownloaded: new Date().toISOString(),
                            imageCount: imageEntries.length,
                            language: extracted.meta.language,
                            ogImage: extracted.meta.ogImage,
                            title: extracted.meta.title,
                            url: finalUrl,
                            wordCount: extracted.meta.wordCount || countWords(extracted.contentText),
                        },
                        markdown,
                    });
                }
                catch (error) {
                    errors.push({
                        reason: error instanceof Error ? error.message : String(error),
                        url: item.url,
                    });
                }
                finally {
                    // The page is now accounted for in `rendered`, `errors` or
                    // `skipped`, so it no longer occupies an in-flight slot.
                    inFlight--;
                }
            });
            // Simple parallel worker pool. `queue` grows as pages are discovered,
            // so workers pick new items until nothing is left.
            let cursor = 0;
            const worker = () => __awaiter(this, void 0, void 0, function* () {
                while (cursor < queue.length && !budgetExhausted()) {
                    const item = queue[cursor++];
                    yield processPage(item);
                }
            });
            const runWorkers = () => Promise.all(Array.from({ length: Math.max(1, concurrency) }, () => worker()));
            yield runWorkers();
            // Drain any late discoveries added after all initial workers exited.
            // The budget check is required for termination: once the limit is
            // hit, workers return without advancing `cursor`.
            while (cursor < queue.length && !budgetExhausted()) {
                yield runWorkers();
            }
            // Whatever is still queued was cut off by `maxPages`.
            if (cursor < queue.length) {
                onLog(`Reached maxPages (${maxPages}); skipping ${queue.length - cursor} queued URL(s)`);
                for (; cursor < queue.length; cursor++) {
                    skipped.push(queue[cursor].url);
                }
            }
            const multiPage = rendered.length > 1;
            const pagesDir = multiPage ? (0, path_1.join)(outDir, 'pages') : outDir;
            const imagesDir = (0, path_1.join)(outDir, 'images');
            if (rendered.length === 0) {
                onLog('No pages rendered');
                return { errors, outDir, pages, pruned: [], skipped };
            }
            (0, fs_1.mkdirSync)(pagesDir, { recursive: true });
            if (includeImages && imageHashToPath.size > 0) {
                (0, fs_1.mkdirSync)(imagesDir, { recursive: true });
            }
            // Write deduplicated images.
            const writtenImageFilenames = new Set();
            for (const entry of rendered.flatMap((r) => r.images)) {
                if (writtenImageFilenames.has(entry.filename))
                    continue;
                writtenImageFilenames.add(entry.filename);
                (0, fs_1.writeFileSync)((0, path_1.join)(imagesDir, entry.filename), entry.data);
            }
            // Persist pages. When updating, preserve the original
            // `first_downloaded` timestamp so history stays intact.
            for (const entry of rendered) {
                const outputPath = (0, path_1.join)(pagesDir, entry.filename);
                const relativePath = (0, path_1.relative)(outDir, outputPath);
                // Images live under `<outDir>/images/`. Each page rewrites the
                // Turndown-emitted `images/<file>` placeholder to the correct
                // relative path so nested URL slugs (`pages/ueber-uns/…`, or a
                // single-page crawl that lands in `<outDir>/ueber-uns/…`) still
                // render in Markdown previews.
                const imagePrefix = `${(0, path_1.relative)((0, path_1.dirname)(outputPath), imagesDir).split(/[\\/]/).join('/')}/`;
                const fixedMarkdown = entry.markdown.replace(/\]\(images\//g, `](${imagePrefix}`);
                if ((0, fs_1.existsSync)(outputPath)) {
                    const existing = (0, fs_1.readFileSync)(outputPath, 'utf8');
                    const existingMeta = parseFrontmatter(existing);
                    if (existingMeta && existingMeta.first_downloaded) {
                        entry.info.firstDownloaded = String(existingMeta.first_downloaded);
                    }
                }
                const frontmatter = renderFrontmatter(entry.info);
                (0, fs_1.mkdirSync)((0, path_1.dirname)(outputPath), { recursive: true });
                (0, fs_1.writeFileSync)(outputPath, `${frontmatter}\n${fixedMarkdown.trim()}\n`);
                pages.push(Object.assign(Object.assign({}, entry.info), { outputPath, relativePath }));
            }
            let indexFile;
            if (multiPage) {
                indexFile = (0, path_1.join)(outDir, 'README.md');
                (0, fs_1.writeFileSync)(indexFile, renderOverview(seed.href, pages));
            }
            // Prune orphans (files left over from previous crawls). Scoped to
            // `pages/` and `images/` so stray user files in outDir root never
            // get touched. Only active in multi-page mode — a single-page
            // crawl writes into `outDir` itself and has no page subfolder to
            // sweep.
            const pruned = [];
            if (prune && multiPage) {
                const keep = new Set(pages.map((p) => p.outputPath));
                for (const entry of rendered.flatMap((r) => r.images)) {
                    keep.add((0, path_1.join)(imagesDir, entry.filename));
                }
                pruned.push(...pruneOrphans(pagesDir, keep));
                if ((0, fs_1.existsSync)(imagesDir)) {
                    pruned.push(...pruneOrphans(imagesDir, keep));
                }
                if (pruned.length > 0) {
                    onLog(`Pruned ${pruned.length} orphaned file(s)`);
                }
            }
            return { errors, indexFile, outDir, pages, pruned, skipped };
        }
        finally {
            // Guarantee the headless browser is shut down on every exit path,
            // including thrown errors, so no orphan chromium processes linger.
            if (browserFetcher) {
                yield browserFetcher.close().catch(() => undefined);
            }
        }
    });
}
/**
 * Build the file name under which a downloaded image is stored:
 * `<sanitized basename>-<first 8 chars of contentHash>.<extension>`.
 *
 * The basename comes from the last path segment of `url` (max. 40
 * characters, everything outside `[a-zA-Z0-9-_]` replaced by `_`) and
 * falls back to `image`. The extension is taken from the URL when it is
 * a known image extension, otherwise from the `contentType` subtype,
 * otherwise `png`.
 *
 * @param {string} url - Absolute image URL.
 * @param {string} contentHash - Hex digest of the image bytes.
 * @param {string} contentType - Value of the response `content-type` header.
 * @returns {string} File name without directory part.
 */
function buildImageFilename(url, contentHash, contentType) {
    let basename = 'image';
    let extension = '';
    try {
        const u = new URL(url);
        const last = u.pathname.split('/').filter(Boolean).pop() || '';
        const rawExt = (0, path_1.extname)(last);
        const parsedExt = rawExt.replace('.', '').toLowerCase();
        if (/^(jpg|jpeg|png|gif|webp|svg|avif)$/.test(parsedExt)) {
            extension = parsedExt;
        }
        // Cut the extension off the END of the name. A plain
        // `replace(ext, '')` would remove the first occurrence instead,
        // mangling names such as `my.png.backup.png`.
        const stem = rawExt ? last.slice(0, last.length - rawExt.length) : last;
        basename = stem.replace(/[^a-zA-Z0-9-_]/g, '_').substring(0, 40) || 'image';
    }
    catch (_a) {
        // Unparsable URL: keep the generic basename and derive the
        // extension from the content type below.
    }
    if (!extension) {
        // Header values may differ in case or carry whitespace around the subtype.
        const subtype = (String(contentType || '').split(';')[0].split('/')[1] || '').trim().toLowerCase();
        if (/^(jpeg|jpg|png|gif|webp|svg\+xml|avif)$/.test(subtype)) {
            extension = subtype === 'svg+xml' ? 'svg' : subtype;
        }
        else {
            extension = 'png';
        }
    }
    return `${basename}-${contentHash.slice(0, 8)}.${extension}`;
}
/**
 * Derive the Markdown file name (relative, `/`-separated) for a page URL.
 *
 * Each path segment is lower-cased, stripped of a trailing
 * `.html`/`.htm`/`.php`/`.asp`/`.aspx`, reduced to `[a-z0-9-_]` with
 * dashes for everything else, and replaced by `page` when nothing is
 * left. A URL without path segments maps to `index.md` for the first
 * page and `home.md` otherwise.
 *
 * @param {string} url - Absolute page URL.
 * @param {boolean} isFirst - Whether this is the first rendered page.
 * @returns {string} Relative file name ending in `.md`.
 */
function buildPageFilename(url, isFirst) {
    const { pathname } = new URL(url);
    const parts = pathname.split('/').filter((part) => part.length > 0);
    if (parts.length === 0) {
        if (isFirst) {
            return 'index.md';
        }
        return 'home.md';
    }
    const slugs = [];
    for (const part of parts) {
        const withoutExtension = part.toLowerCase().replace(/\.(html?|php|aspx?)$/, '');
        const dashed = withoutExtension.replace(/[^a-z0-9-_]/g, '-').replace(/-+/g, '-');
        const trimmed = dashed.replace(/^-|-$/g, '');
        slugs.push(trimmed === '' ? 'page' : trimmed);
    }
    return `${slugs.join('/')}.md`;
}
/**
 * Convert cleaned content HTML to Markdown with Turndown (GFM enabled).
 *
 * Links are emitted with absolute URLs resolved against `baseUrl`.
 * Images point at their local copy when `imageMap` has an entry for the
 * original or the absolute `src`, and at the absolute URL otherwise.
 * `script`, `style`, `noscript` and `iframe` elements are dropped.
 *
 * @param {string} html - Content HTML to convert.
 * @param {string} baseUrl - URL used to resolve relative references.
 * @param {Map<string, string>} imageMap - Image URL -> local relative path.
 * @returns {string} Trimmed Markdown with runs of 3+ newlines collapsed to two.
 */
function convertToMarkdown(html, baseUrl, imageMap) {
    const turndown = new turndown_1.default({
        bulletListMarker: '-',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        headingStyle: 'atx',
        linkStyle: 'inlined',
        strongDelimiter: '**',
    });
    // Enable GFM so tables, strikethrough and task lists convert cleanly.
    if (turndown_plugin_gfm_1.gfm) {
        turndown.use(turndown_plugin_gfm_1.gfm);
    }
    // Read an attribute from a node that may not expose `getAttribute`.
    const readAttribute = (node, name) => {
        const getter = node.getAttribute;
        return getter === null || getter === undefined ? undefined : getter.call(node, name);
    };
    // Resolve against the page URL; unparsable values are kept as they are.
    const toAbsolute = (reference) => {
        try {
            return new URL(reference, baseUrl).href;
        }
        catch (_e) {
            return reference;
        }
    };
    turndown.addRule('absoluteLinks', {
        filter: 'a',
        replacement: (content, node) => {
            const href = readAttribute(node, 'href') || '';
            if (!href || href === '#' || href.startsWith('javascript:')) {
                return content;
            }
            const absolute = toAbsolute(href);
            const title = readAttribute(node, 'title');
            return title ? `[${content}](${absolute} "${title}")` : `[${content}](${absolute})`;
        },
    });
    turndown.addRule('localImages', {
        filter: 'img',
        replacement: (_content, node) => {
            const src = readAttribute(node, 'src') || '';
            if (!src)
                return '';
            const absolute = toAbsolute(src);
            const local = imageMap.get(src) || imageMap.get(absolute);
            const alt = readAttribute(node, 'alt') || '';
            const target = local || absolute;
            // Emit the Markdown image. Returning an empty string here would
            // silently drop every image from the converted page.
            return `![${alt}](${target})`;
        },
    });
    turndown.remove(['script', 'style', 'noscript', 'iframe']);
    const markdown = turndown.turndown(html);
    return markdown.replace(/\n{3,}/g, '\n\n').trim();
}
/**
 * Count the whitespace-separated words in `text`.
 *
 * @param {string} text - Plain text.
 * @returns {number} Number of maximal runs of non-whitespace characters.
 */
function countWords(text) {
    const words = text.match(/\S+/g);
    return words === null ? 0 : words.length;
}
/**
 * Escape a value for use inside a double-quoted YAML scalar: backslashes
 * and double quotes get a leading backslash, newlines become spaces.
 *
 * @param {string} value - Raw text.
 * @returns {string} Escaped single-line text.
 */
function escapeYaml(value) {
    const replacements = { '\\': '\\\\', '"': '\\"', '\n': ' ' };
    return value.replace(/[\\"\n]/g, (character) => replacements[character]);
}
/**
 * Extract the main content and metadata of a page with defuddle (the
 * same engine as chrome-md). When defuddle throws, the raw body HTML is
 * used as content instead.
 *
 * @param {string} html - Page HTML.
 * @param {string} pageUrl - URL of the page; base for relative references.
 * @param {{ selector?: string }} options - Optional content selector passed to defuddle.
 * @returns {Promise<object>} `{ contentHtml, contentText, imageMap, images, links, meta }`
 *   where `images` and `links` are absolute URLs and `imageMap` is an empty Map.
 */
function extractContent(html, pageUrl, options) {
    return __awaiter(this, void 0, void 0, function* () {
        const sourceDom = new jsdom_1.JSDOM(html, { url: pageUrl });
        const doc = sourceDom.window.document;
        // Body markup of the source document, or undefined when there is no body.
        const sourceBodyHtml = () => (doc.body ? doc.body.innerHTML : undefined);
        // `content` attribute of the first element matching `selector`, if any.
        const metaContent = (selector) => {
            const element = doc.querySelector(selector);
            return element ? element.getAttribute('content') : undefined;
        };
        // Resolve `reference` against the page URL and remember it; malformed URLs are ignored.
        const collect = (bucket, reference) => {
            try {
                bucket.add(new URL(reference, pageUrl).href);
            }
            catch (_a) {
                // ignore malformed URLs
            }
        };
        const defuddleOptions = {
            markdown: false,
            removeHiddenElements: true,
            removeLowScoring: true,
            removeSmallImages: false,
        };
        if (options.selector) {
            defuddleOptions.contentSelector = options.selector;
        }
        let parsed;
        try {
            // Same class-based API as chrome-md's content script.
            parsed = new defuddle_1.default(doc, defuddleOptions).parse();
        }
        catch (_h) {
            parsed = {
                content: sourceBodyHtml() || html,
                title: doc.title,
            };
        }
        const contentHtml = parsed.content || sourceBodyHtml() || '';
        // Collect images and links from the cleaned content.
        const contentDom = new jsdom_1.JSDOM(`<!DOCTYPE html><html><body>${contentHtml}</body></html>`, {
            url: pageUrl,
        });
        const contentDoc = contentDom.window.document;
        const links = new Set();
        for (const anchor of Array.from(contentDoc.querySelectorAll('a[href]'))) {
            const href = (anchor.getAttribute('href') || '').trim();
            const ignorable = !href || href.startsWith('#') || href.startsWith('mailto:') || href.startsWith('javascript:');
            if (!ignorable) {
                collect(links, href);
            }
        }
        const images = new Set();
        for (const img of Array.from(contentDoc.querySelectorAll('img'))) {
            const src = (img.getAttribute('src') || img.getAttribute('data-src') || '').trim();
            if (src && !src.startsWith('data:')) {
                collect(images, src);
            }
        }
        // Some lazy-loading frameworks keep the real URL only in the source
        // document (stripped out by defuddle), so also consult the original DOM.
        for (const img of Array.from(doc.querySelectorAll('img[data-src], img[data-lazy-src]'))) {
            const src = (img.getAttribute('data-src') || img.getAttribute('data-lazy-src') || '').trim();
            if (src && !src.startsWith('data:')) {
                collect(images, src);
            }
        }
        const meta = {
            author: parsed.author || metaContent('meta[name="author"]') || undefined,
            description: parsed.description ||
                metaContent('meta[name="description"]') ||
                metaContent('meta[property="og:description"]') ||
                '',
            language: parsed.language || doc.documentElement.getAttribute('lang') || undefined,
            ogImage: parsed.image || metaContent('meta[property="og:image"]') || undefined,
            title: parsed.title || doc.title || pageUrl,
            wordCount: parsed.wordCount,
        };
        const contentText = (contentDoc.body ? contentDoc.body.textContent : undefined) || '';
        return {
            contentHtml,
            contentText,
            imageMap: new Map(),
            images: Array.from(images),
            links: Array.from(links),
            meta,
        };
    });
}
/**
 * Download an image as binary data.
 *
 * @param {object} http - HTTP client with an axios-style `get(url, config)`.
 * @param {string} url - Absolute image URL.
 * @returns {Promise<{buffer: Buffer, contentType: string} | null>} The bytes and
 *   the `content-type` header, or `null` when the request fails or the body is empty.
 */
function fetchImage(http, url) {
    return __awaiter(this, void 0, void 0, function* () {
        let payload = null;
        try {
            const res = yield http.get(url, { responseType: 'arraybuffer' });
            const bytes = Buffer.from(res.data);
            if (bytes.byteLength !== 0) {
                payload = { buffer: bytes, contentType: String(res.headers['content-type'] || '') };
            }
        }
        catch (_a) {
            // Best effort: a failed download simply yields no image.
            payload = null;
        }
        return payload;
    });
}
/**
 * Collect page URLs from `<origin>/sitemap.xml`, following nested sitemap
 * indexes. Every sitemap URL is fetched at most once; a failed fetch is
 * reported through `onLog` and otherwise ignored.
 *
 * @param {object} http - HTTP client with an axios-style `get(url)`.
 * @param {string} origin - Origin of the site, e.g. `https://example.com`.
 * @param {(message: string) => void} onLog - Receives failure messages.
 * @returns {Promise<string[]>} The `<url><loc>` values found, entity-decoded.
 */
function fetchSitemapUrls(http, origin, onLog) {
    return __awaiter(this, void 0, void 0, function* () {
        const urls = [];
        const visited = new Set();
        // Sitemap files must entity-escape URLs (e.g. `&` as `&amp;`), so
        // `<loc>` values are decoded before use. Decoding happens in a
        // single pass so `&amp;lt;` becomes `&lt;`, not `<`.
        const XML_ENTITIES = { amp: '&', apos: "'", gt: '>', lt: '<', quot: '"' };
        const decodeLoc = (value) => value.replace(/&(amp|apos|gt|lt|quot);/g, (_match, name) => XML_ENTITIES[name]);
        function walk(sitemapUrl) {
            return __awaiter(this, void 0, void 0, function* () {
                if (visited.has(sitemapUrl))
                    return;
                visited.add(sitemapUrl);
                try {
                    const response = yield http.get(sitemapUrl);
                    const xml = String(response.data || '');
                    // Nested sitemap index: follow each <sitemap><loc>...</loc></sitemap>.
                    const nested = [...xml.matchAll(/<sitemap>[\s\S]*?<loc>\s*([^<\s]+)\s*<\/loc>[\s\S]*?<\/sitemap>/gi)].map((m) => decodeLoc(m[1]));
                    for (const child of nested) {
                        yield walk(child);
                    }
                    const pageMatches = [...xml.matchAll(/<url>[\s\S]*?<loc>\s*([^<\s]+)\s*<\/loc>[\s\S]*?<\/url>/gi)].map((m) => decodeLoc(m[1]));
                    urls.push(...pageMatches);
                }
                catch (error) {
                    onLog(`Sitemap fetch failed for ${sitemapUrl}: ${error instanceof Error ? error.message : String(error)}`);
                }
            });
        }
        yield walk(`${origin}/sitemap.xml`);
        return urls;
    });
}
/**
 * Canonicalize a URL for de-duplication: the fragment is removed, a
 * trailing `/index.html` or `/index.htm` collapses to `/`, and trailing
 * slashes are dropped from every path except the root.
 *
 * @param {string} raw - URL to normalize.
 * @returns {string} The normalized URL, or `raw` unchanged when it cannot be parsed.
 */
function normalizeUrl(raw) {
    let parsed;
    try {
        parsed = new URL(raw);
    }
    catch (_a) {
        return raw;
    }
    parsed.hash = '';
    parsed.pathname = parsed.pathname.replace(/\/index\.html?$/i, '/');
    // Read the path back from the URL object so the check sees what it actually stored.
    const currentPath = parsed.pathname;
    const hasTrailingSlash = currentPath.length > 1 && currentPath.endsWith('/');
    if (hasTrailingSlash) {
        parsed.pathname = currentPath.replace(/\/+$/, '');
    }
    return parsed.href;
}
/**
 * Parse the leading `---` delimited frontmatter block of a Markdown
 * file into a flat key/value object.
 *
 * Only simple `key: value` lines are recognized (keys matching
 * `[a-zA-Z0-9_]+`). One pair of surrounding double quotes is removed
 * from a value; escape sequences inside it are returned as written.
 * Both LF and CRLF line endings are accepted.
 *
 * @param {string} markdown - File content.
 * @returns {Object<string, string> | null} Parsed entries, or `null` when
 *   the content has no complete frontmatter block.
 */
function parseFrontmatter(markdown) {
    if (!markdown.startsWith('---'))
        return null;
    const end = markdown.indexOf('\n---', 3);
    if (end === -1)
        return null;
    const block = markdown.slice(3, end);
    const result = {};
    for (const rawLine of block.split('\n')) {
        // Files saved with CRLF keep a trailing `\r` on every line. `.` never
        // matches `\r`, so without this the pattern below rejects the line.
        const line = rawLine.replace(/\r$/, '');
        const match = line.match(/^([a-zA-Z0-9_]+):\s*(.*)$/);
        if (!match)
            continue;
        result[match[1]] = match[2].replace(/^"(.*)"$/, '$1');
    }
    return result;
}
/**
 * Recursively delete every regular file below `rootDir` whose joined
 * path is not contained in `keepPaths`, and remove sub-directories that
 * end up empty. `rootDir` itself is never removed. Failures to delete a
 * file or directory are ignored.
 *
 * @param {string} rootDir - Directory to sweep.
 * @param {Set<string>} keepPaths - Paths that must survive.
 * @returns {string[]} Paths of the files that were deleted.
 */
function pruneOrphans(rootDir, keepPaths) {
    if (!fs_1.existsSync(rootDir)) {
        return [];
    }
    const deleted = [];
    for (const dirent of fs_1.readdirSync(rootDir, { withFileTypes: true })) {
        const target = path_1.join(rootDir, dirent.name);
        if (dirent.isDirectory()) {
            for (const nested of pruneOrphans(target, keepPaths)) {
                deleted.push(nested);
            }
            // Remove directory if now empty.
            try {
                if (fs_1.readdirSync(target).length === 0) {
                    fs_1.rmdirSync(target);
                }
            }
            catch (_a) {
                // Directory not empty or already gone — ignore.
            }
            continue;
        }
        if (!dirent.isFile() || keepPaths.has(target)) {
            continue;
        }
        try {
            fs_1.unlinkSync(target);
            deleted.push(target);
        }
        catch (_b) {
            // File already removed or permission denied — skip.
        }
    }
    return deleted;
}
/**
 * Render the YAML frontmatter block for a crawled page.
 *
 * `description`, `author`, `language`, `og_image` and `image_count` are
 * only written when the corresponding value is truthy; the description
 * is cut to 500 characters.
 *
 * @param {object} info - Page metadata (`title`, `url`, `depth`, `downloadDate`,
 *   `firstDownloaded`, `description`, `author`, `language`, `ogImage`,
 *   `imageCount`, `wordCount`).
 * @returns {string} The block from the opening to the closing `---`, without trailing newline.
 */
function renderFrontmatter(info) {
    const out = ['---'];
    out.push(`title: "${escapeYaml(info.title)}"`);
    out.push(`source_url: "${info.url}"`);
    out.push(`source_domain: "${new URL(info.url).hostname}"`);
    out.push(`crawl_depth: ${info.depth}`);
    out.push(`download_date: "${info.downloadDate}"`);
    out.push(`first_downloaded: "${info.firstDownloaded}"`);
    if (info.description) {
        out.push(`description: "${escapeYaml(truncate(info.description, 500))}"`);
    }
    if (info.author) {
        out.push(`author: "${escapeYaml(info.author)}"`);
    }
    if (info.language) {
        out.push(`language: "${escapeYaml(info.language)}"`);
    }
    if (info.ogImage) {
        out.push(`og_image: "${escapeYaml(info.ogImage)}"`);
    }
    if (info.imageCount) {
        out.push(`image_count: ${info.imageCount}`);
    }
    out.push(`word_count: ${info.wordCount}`);
    out.push('content_type: "webpage"');
    out.push('---');
    return out.join('\n');
}
/**
 * Render the README overview for a multi-page crawl: a header with
 * host, source URL, generation time and page count, followed by one
 * section per page (sorted by URL) linking to its Markdown file.
 *
 * @param {string} startUrl - Seed URL of the crawl.
 * @param {object[]} pages - Page records with `title`, `url`, `relativePath`,
 *   `downloadDate` and optional `description`. The array is not modified.
 * @returns {string} Markdown text of the overview.
 */
function renderOverview(startUrl, pages) {
    const sorted = pages.slice().sort((left, right) => left.url.localeCompare(right.url));
    const { host } = new URL(startUrl);
    const header = [
        `# ${host} — Knowledge Base`,
        '',
        `Source: ${startUrl}`,
        '',
        `Generated: ${new Date().toISOString()}`,
        '',
        `Pages: ${sorted.length}`,
        '',
        '## Pages',
        '',
    ];
    const sections = [];
    for (const page of sorted) {
        // Always link with forward slashes, whatever the platform separator was.
        const href = page.relativePath.split(/[\\/]/).join('/');
        sections.push(`### [${page.title}](${href})`, '', `- URL: ${page.url}`);
        if (page.description) {
            sections.push(`- ${truncate(page.description, 240)}`);
        }
        sections.push(`- Updated: ${page.downloadDate}`, '');
    }
    return header.concat(sections).join('\n');
}
/**
 * Tell whether `url` belongs to the given origin.
 *
 * @param {string} url - Absolute URL to test.
 * @param {string} origin - Origin string to compare with, e.g. `https://example.com`.
 * @returns {boolean} `true` when the URL parses and its origin equals `origin`.
 */
function sameOrigin(url, origin) {
    let candidate;
    try {
        candidate = new URL(url);
    }
    catch (_a) {
        return false;
    }
    return candidate.origin === origin;
}
/**
 * Shorten `value` to at most `max` UTF-16 code units, replacing the cut
 * off tail with a single `…`.
 *
 * @param {string} value - Text to shorten.
 * @param {number} max - Maximum length of the result, ellipsis included.
 * @returns {string} `value` itself when it already fits, else the shortened text.
 */
function truncate(value, max) {
    if (value.length <= max) {
        return value;
    }
    let end = max - 1;
    // Never cut between the two halves of a surrogate pair (emoji etc.):
    // a dangling high surrogate is not valid text and turns into U+FFFD
    // when the result is written out as UTF-8.
    if (end > 0) {
        const lastUnit = value.charCodeAt(end - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
            end -= 1;
        }
    }
    return `${value.slice(0, end)}…`;
}