/*
 * @lenne.tech/cli
 * Version:
 * lenne.Tech CLI: lt
 * 310 lines (309 loc) • 13 kB
 * JavaScript
 */
;
/**
 * Down-levelled `async`/`await` driver: runs a generator function to
 * completion, treating every yielded value as something to await.
 *
 * @param thisArg     `this` binding for the generator function.
 * @param _arguments  Argument list for the generator function (may be nullish).
 * @param P           Promise constructor to use; falls back to the global `Promise`.
 * @param generator   Generator function whose `yield`s stand in for `await`s.
 * @returns A promise (of type `P`) settled with the generator's return value,
 *          or rejected with whatever it throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    const PromiseImpl = P || Promise;
    // Wrap plain values so that `.then` can be called on every yielded value.
    const adopt = (value) => (value instanceof PromiseImpl
        ? value
        : new PromiseImpl((resolve) => { resolve(value); }));
    return new PromiseImpl((resolve, reject) => {
        const iterator = generator.apply(thisArg, _arguments || []);
        const step = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            adopt(result.value).then(fulfilled, rejected);
        };
        // Builds the continuation that feeds a settled value back into the
        // generator via `next` or `throw`; anything thrown while advancing
        // rejects the outer promise.
        const resume = (method) => (value) => {
            try {
                step(iterator[method](value));
            }
            catch (e) {
                reject(e);
            }
        };
        const fulfilled = resume('next');
        const rejected = resume('throw');
        step(iterator.next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.help = void 0;
const path_1 = require("path");
const crawler_1 = require("../../lib/crawler");
/**
* Crawl a website (optionally following same-origin links up to a
* configurable depth) and store the content as Markdown files for use
* as a Claude Code knowledge base. Inspired by ../../../../chrome-md:
* shares the defuddle + Turndown extraction pipeline but runs headless
* from Node and follows links / sitemaps automatically.
*/
/**
* Help metadata for the `crawl` command. The `run` handler below passes
* this object to `tools.helpJson(...)`. Each entry in `options`
* describes one CLI flag; the `default` values listed here mirror the
* `defaultValue`s the `run` handler passes to `config.getValue`.
*/
exports.help = {
aliases: ['cr'],
description: 'Crawl a website into Markdown files (for Claude Code knowledge bases)',
name: 'crawl',
options: [
{
// The run handler also accepts the URL as the first positional
// argument, or asks for it interactively when neither is given.
description: 'Start URL (absolute http/https URL)',
flag: '--url',
required: true,
type: 'string',
},
{
// The run handler falls back to `filesystem.cwd()` when no output
// directory is supplied, and also reads `--output`.
default: '.',
description: 'Output directory (created if missing)',
flag: '--out',
type: 'string',
},
{
default: 0,
description: 'Link depth. 0 = only start page; 1 = + direct links; N = up to N hops; "all" (or -1) = follow every same-origin link until --max-pages is reached',
flag: '--depth',
type: 'number|all',
},
{
// Default-on flag: the run handler only honours an explicit `false`
// for this flag on the command line.
default: true,
description: 'Download images and inline them with local paths',
flag: '--images',
type: 'boolean',
},
{
// Default-on flag: the run handler only honours an explicit `false`
// for this flag on the command line.
default: true,
description: 'Also seed queue from <origin>/sitemap.xml',
flag: '--sitemap',
type: 'boolean',
},
{
default: 4,
description: 'Parallel HTTP requests',
flag: '--concurrency',
type: 'number',
},
{
// Read by the run handler as either `maxPages` or `max-pages`.
default: 200,
description: 'Maximum number of pages to crawl (safety cap)',
flag: '--max-pages',
type: 'number',
},
{
// No default: the run handler forwards `undefined` when unset.
description: 'CSS selector for the main content container',
flag: '--selector',
type: 'string',
},
{
default: 20000,
description: 'HTTP request timeout in ms',
flag: '--timeout',
type: 'number',
},
{
// When set, the run handler uses depth 'all' even if `--depth` is
// also given.
default: false,
description: 'Shortcut for --depth all (follows every same-origin link until --max-pages)',
flag: '--all',
type: 'boolean',
},
{
// Default-on flag: the run handler only honours an explicit `false`
// for this flag on the command line.
default: true,
description: "Render pages through a headless browser before extracting (for SPAs like Vue/Nuxt/React/Angular). Uses playwright-core with system Chrome / Edge, falling back to Playwright's bundled chromium. Disable with --no-render for plain HTTP fetches.",
flag: '--render',
type: 'boolean',
},
{
// Read by the run handler directly from the CLI options (as
// `install-browser` or `installBrowser`); no config-file fallback.
default: false,
description: 'If --render cannot find any browser, auto-install Playwright chromium (one-time ~170 MB download).',
flag: '--install-browser',
type: 'boolean',
},
{
// Default-on flag: the run handler only honours an explicit `false`
// for this flag on the command line.
default: true,
description: 'After a multi-page crawl, remove any .md or image files inside <outDir>/pages and <outDir>/images that were not written by this run. Disable with --no-prune to preserve old files.',
flag: '--prune',
type: 'boolean',
},
{
default: false,
description: 'Skip confirmation prompts',
flag: '--noConfirm',
type: 'boolean',
},
],
};
/**
 * Gluegun command definition for `crawl` (alias `cr`).
 *
 * `run(toolbox)` resolves every setting through `config.getValue` (CLI
 * value, then config-file value, then default), prints a summary, asks
 * for confirmation unless `noConfirm` resolves truthy, and hands the
 * settings to `crawlSite`.
 *
 * Resolves to:
 *  - `'crawl'` when `tools.helpJson` reports it handled the call,
 *  - `'crawl cancelled'` when the confirmation prompt is declined,
 *  - `undefined` when no usable URL was given or the crawl rejected,
 *  - ``crawled N pages`` on success when started from the Gluegun menu
 *    (otherwise the process exits before returning).
 */
const NewCommand = {
    alias: ['cr'],
    description: 'Crawl site to Markdown',
    hidden: false,
    name: 'crawl',
    run: (toolbox) => __awaiter(void 0, void 0, void 0, function* () {
        const { config, filesystem, helper, parameters, print: { error, info, spin, success, warning }, prompt: { confirm }, tools, } = toolbox;
        const { options } = parameters;
        if (tools.helpJson(exports.help)) {
            return 'crawl';
        }
        tools.nonInteractiveHint('lt tools crawl <url> --out <dir> --depth 1 --noConfirm');
        // Walk ltConfig.commands.tools.crawl, tolerating any missing level.
        const ltConfig = config.loadConfig();
        const commandsConfig = ltConfig == null ? undefined : ltConfig.commands;
        const toolsConfig = commandsConfig == null ? undefined : commandsConfig.tools;
        const commandConfig = toolsConfig == null ? undefined : toolsConfig.crawl;
        // Property reads go through this so a missing section yields `undefined`.
        const fromConfig = commandConfig == null ? {} : commandConfig;
        // Numeric settings: a value that cannot be parsed (e.g. `--concurrency abc`)
        // would otherwise reach the crawler as NaN, so fall back to the default.
        const numberOrDefault = (label, value, fallback) => {
            const parsed = Number(value);
            if (Number.isNaN(parsed)) {
                warning(`Ignoring invalid ${label} value "${value}", using default ${fallback}`);
                return fallback;
            }
            return parsed;
        };
        // URL: positional argument > --url > interactive prompt.
        const urlInput = parameters.first ||
            options.url ||
            (yield helper.getInput(undefined, { name: 'Website URL', showError: false }));
        // A value-less `--url` flag presumably arrives as boolean `true` from the
        // argument parser; treat it like a missing URL instead of crashing on `.trim()`.
        if (!urlInput || urlInput === true) {
            error('No URL provided');
            return;
        }
        const urlText = String(urlInput);
        const url = normalizeSeedUrl(urlText);
        try {
            new URL(url);
        }
        catch (_err) {
            error(`Invalid URL: ${urlText}`);
            return;
        }
        const depthRaw = config.getValue({
            // `--all` is a convenience shortcut for `--depth all`. It wins
            // over a numeric `--depth` so users can combine both.
            cliValue: options.all === true ? 'all' : options.depth,
            configValue: fromConfig.depth,
            defaultValue: 0,
        });
        const depth = parseDepth(depthRaw);
        // Default-on switches: only an explicit `false` on the command line
        // counts as a CLI value; anything else defers to config / default.
        const includeImages = config.getValue({
            cliValue: options.images === false ? false : undefined,
            configValue: fromConfig.includeImages,
            defaultValue: true,
        });
        const includeSitemap = config.getValue({
            cliValue: options.sitemap === false ? false : undefined,
            configValue: fromConfig.includeSitemap,
            defaultValue: true,
        });
        const concurrency = numberOrDefault('--concurrency', config.getValue({
            cliValue: options.concurrency,
            configValue: fromConfig.concurrency,
            defaultValue: 4,
        }), 4);
        const maxPages = numberOrDefault('--max-pages', config.getValue({
            cliValue: options.maxPages != null ? options.maxPages : options['max-pages'],
            configValue: fromConfig.maxPages,
            defaultValue: 200,
        }), 200);
        const timeout = numberOrDefault('--timeout', config.getValue({
            cliValue: options.timeout,
            configValue: fromConfig.timeout,
            defaultValue: 20000,
        }), 20000);
        const selector = config.getValue({
            cliValue: options.selector,
            configValue: fromConfig.selector,
        });
        // `--render` and `--prune` default ON — the common case is a
        // full SPA-aware knowledge-base crawl that stays in sync on
        // updates. `--no-render` / `--no-prune` opt out explicitly.
        const renderJs = config.getValue({
            cliValue: options.render === false ? false : undefined,
            configValue: fromConfig.renderJs,
            defaultValue: true,
        });
        const installBrowser = options['install-browser'] === true || options.installBrowser === true;
        const pruneOrphans = config.getValue({
            cliValue: options.prune === false ? false : undefined,
            configValue: fromConfig.prune,
            defaultValue: true,
        });
        const outDir = (0, path_1.resolve)(config.getValue({
            cliValue: options.out != null ? options.out : options.output,
            configValue: fromConfig.out,
            defaultValue: filesystem.cwd(),
        }) || filesystem.cwd());
        const noConfirm = config.getNoConfirm({
            cliValue: options.noConfirm,
            commandConfig,
            config: ltConfig,
            parentConfig: toolsConfig,
        });
        info('');
        info(`Crawling: ${url}`);
        info(`Output: ${outDir}`);
        info(`Depth: ${depth === 'all' ? 'all (bounded by --max-pages)' : depth}`);
        info(`Sitemap: ${includeSitemap ? 'yes' : 'no'}`);
        info(`Images: ${includeImages ? 'yes' : 'no'}`);
        info(`Parallel: ${concurrency}`);
        info(`Max: ${maxPages} pages`);
        info(`Render: ${renderJs ? 'yes (headless browser)' : 'no (raw HTTP)'}`);
        info(`Prune: ${pruneOrphans ? 'yes (remove orphaned pages/images)' : 'no'}`);
        if (selector)
            info(`Selector: ${selector}`);
        info('');
        if (!noConfirm && !(yield confirm('Start crawl?'))) {
            return 'crawl cancelled';
        }
        const spinner = spin('Crawling...');
        const result = yield (0, crawler_1.crawlSite)({
            autoInstallBrowser: installBrowser,
            concurrency,
            depth,
            includeImages,
            includeSitemap,
            maxPages,
            onLog: (msg) => {
                spinner.text = msg;
            },
            outDir,
            prune: pruneOrphans,
            renderJs,
            selector,
            timeout,
            url,
        }).catch((err) => {
            spinner.fail('Crawl failed');
            // Rejections are not guaranteed to be Error instances.
            error(err && err.message ? err.message : String(err));
            return null;
        });
        if (!result) {
            return;
        }
        spinner.succeed(`Crawl complete: ${result.pages.length} page(s)`);
        info('');
        if (result.indexFile) {
            success(`Overview: ${result.indexFile}`);
        }
        // Only the first few entries of each list are printed to keep the
        // summary short; the remainder is reported as a count.
        for (const page of result.pages.slice(0, 10)) {
            info(` - ${page.relativePath} (${page.url})`);
        }
        if (result.pages.length > 10) {
            info(` ... and ${result.pages.length - 10} more`);
        }
        if (result.pruned.length > 0) {
            info(`Pruned ${result.pruned.length} orphaned file(s)`);
            for (const prunedPath of result.pruned.slice(0, 5)) {
                info(` - ${prunedPath}`);
            }
            if (result.pruned.length > 5)
                info(` ... and ${result.pruned.length - 5} more`);
        }
        if (result.skipped.length > 0) {
            warning(`Skipped ${result.skipped.length} URL(s) (non-HTML or foreign origin)`);
        }
        if (result.errors.length > 0) {
            warning(`${result.errors.length} error(s):`);
            for (const failure of result.errors.slice(0, 5)) {
                warning(` - ${failure.url}: ${failure.reason}`);
            }
        }
        // Started directly from the shell (not from the Gluegun menu): exit now.
        if (!options.fromGluegunMenu) {
            process.exit();
        }
        return `crawled ${result.pages.length} pages`;
    }),
};
/**
 * Make sure a seed URL carries an explicit scheme.
 *
 * Surrounding whitespace is stripped. Input that already begins with
 * `http://` or `https://` (any letter case) is returned as-is; anything
 * else is prefixed with `https://`.
 *
 * @param {string} raw  URL or bare host as typed by the user.
 * @returns {string} The trimmed URL, guaranteed to start with an http(s) scheme.
 */
function normalizeSeedUrl(raw) {
    const candidate = raw.trim();
    const hasHttpScheme = /^https?:\/\//i.test(candidate);
    return hasHttpScheme ? candidate : `https://${candidate}`;
}
/**
 * Parse the --depth parameter into a hop count or the sentinel `'all'`.
 *
 * - `undefined` / `null`                      -> `0`
 * - the string "all" (any case, trimmed)      -> `'all'`
 * - negative numbers / numeric strings        -> `'all'`
 * - numeric `Infinity` / `-Infinity`          -> `'all'`
 * - other finite numbers / numeric strings    -> floored to an integer
 * - anything invalid (non-numeric strings, `NaN`, booleans, objects)
 *   falls back to `0` so the crawl still runs against the seed URL
 *   instead of silently becoming an unbounded crawl.
 *
 * @param {*} raw  Value as resolved from CLI / config.
 * @returns {number|'all'} Non-negative integer depth, or `'all'`.
 */
function parseDepth(raw) {
    if (raw === undefined || raw === null)
        return 0;
    if (typeof raw === 'string') {
        const normalized = raw.trim().toLowerCase();
        if (normalized === 'all' || normalized === '-1')
            return 'all';
        const n = Number(normalized);
        if (!Number.isFinite(n))
            return 0;
        return n < 0 ? 'all' : Math.floor(n);
    }
    if (typeof raw === 'number') {
        // NaN is an invalid value, not "unbounded": use the documented fallback.
        if (Number.isNaN(raw))
            return 0;
        // Only +/-Infinity remain non-finite; both mean "no depth limit".
        if (!Number.isFinite(raw))
            return 'all';
        return raw < 0 ? 'all' : Math.floor(raw);
    }
    return 0;
}
exports.default = NewCommand;