// @lenne.tech/cli
// Version:
// lenne.Tech CLI: lt
// 140 lines (139 loc) • 6.25 kB
// JavaScript
;
/**
 * Generator-based async driver: runs `generator` (a generator function) to
 * completion and returns a promise for its final value. Every yielded value is
 * treated as something to wait on before the generator is resumed.
 *
 * Reuses `this.__awaiter` when the surrounding `this` already provides one;
 * otherwise the function below is used.
 *
 * thisArg    - `this` value the generator function is applied with.
 * _arguments - arguments the generator function is applied with (defaults to []).
 * P          - promise constructor to use; falls back to the global `Promise`.
 * generator  - generator function; replaced by its iterator once started.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap anything that is not already a P instance in a resolved P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value; an exception thrown by the generator rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Re-throw a rejection inside the generator so its own try/catch/finally blocks can run.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finished -> resolve with the return value; otherwise wait on the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Start the generator: `generator` is rebound from the generator function to its iterator here.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.createBrowserFetcher = createBrowserFetcher;
/**
* Headless-browser HTML fetcher for single-page applications.
*
* Mirrors the chrome-md content script's PageReadyDetector:
* waits for the network to settle, then returns the fully hydrated
* HTML so Defuddle can extract the real content instead of the
* pre-render shell.
*
* Uses `playwright-core` with a three-tier strategy:
* 1. System Chrome / Edge via `channel: 'chrome' | 'msedge'`.
* 2. Playwright's own bundled Chromium (if already installed).
* 3. Auto-install Playwright's Chromium (`npx playwright install
* chromium`) and retry — opt-in via `autoInstall`.
*/
const child_process_1 = require("child_process");
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; lenneTech-CLI-Crawler/1.0; +https://lenne.tech)';
/**
 * Try to construct a browser fetcher. Prefers a system Chrome /
 * Edge via Playwright channels, falls back to Playwright's bundled
 * Chromium, and (optionally) auto-installs Chromium on demand.
 *
 * @param {object} [options] Also forwarded unchanged to `launch`.
 * @param {(message: string) => void} [options.onLog] Progress callback; defaults to a no-op.
 * @param {boolean} [options.autoInstall] When truthy and no engine could be
 *   started, run `npx playwright install chromium` and retry the bundled Chromium once.
 * @returns {Promise<object>} The fetcher returned by the first `launch` call that succeeds.
 * @throws {Error} When no engine could be started; the message lists the
 *   failure reason of every attempt followed by remediation hints.
 */
function createBrowserFetcher() {
    return __awaiter(this, arguments, void 0, function* (options = {}) {
        const log = options.onLog || (() => undefined);
        const reasons = [];
        // Rejection values are not guaranteed to be Error instances (or even
        // non-null), so never read `.message` unguarded: a TypeError thrown
        // while recording a reason would abort the whole fallback chain.
        const describeError = (error) => (error != null && typeof error.message === 'string' ? error.message : String(error));
        const { chromium } = require('playwright-core');
        // Engines in order of preference: system Chrome, system Edge (Windows
        // fallback, also common on macOS), then Playwright's bundled Chromium.
        const tiers = [
            { engine: 'system-chrome', launchOptions: { channel: 'chrome' }, reason: 'channel:chrome' },
            { engine: 'system-edge', launchOptions: { channel: 'msedge' }, reason: 'channel:msedge' },
            { engine: 'playwright-chromium', launchOptions: {}, reason: 'playwright-chromium' },
        ];
        for (const tier of tiers) {
            let fetcher = null;
            try {
                fetcher = yield launch(chromium, tier.launchOptions, options, tier.engine);
            }
            catch (error) {
                reasons.push(`${tier.reason}: ${describeError(error)}`);
            }
            // Logging happens outside the try so a throwing `onLog` is not
            // mistaken for a launch failure.
            if (fetcher) {
                log(`Browser engine: ${fetcher.engine}`);
                return fetcher;
            }
        }
        // Optional auto-install, then retry Playwright's chromium.
        if (options.autoInstall) {
            log('No browser available — installing Playwright chromium (one-time download, ~170 MB)…');
            try {
                yield runNpx(['playwright', 'install', 'chromium']);
                const retry = yield launch(chromium, {}, options, 'playwright-chromium');
                if (retry) {
                    log(`Browser engine: ${retry.engine}`);
                    return retry;
                }
            }
            catch (error) {
                reasons.push(`auto-install: ${describeError(error)}`);
            }
        }
        throw new Error([
            'Could not start a headless browser for SPA rendering.',
            ...reasons.map((r) => `  - ${r}`),
            '',
            'Fix one of these:',
            '  1. Install Google Chrome or Microsoft Edge (Playwright picks them up automatically).',
            '  2. Install Playwright browsers manually: `npx playwright install chromium`.',
            '  3. Re-run with --install-browser to let the CLI install them.',
        ].join('\n'));
    });
}
/**
 * Launch a headless browser and wrap it in a fetcher object.
 *
 * @param {object} chromium Browser type exposing `launch(options)`.
 * @param {object} launchOptions Extra launch options (e.g. `{ channel: 'chrome' }`);
 *   `headless: true` is always applied on top.
 * @param {object} options
 * @param {string} [options.userAgent] User agent for the browser context;
 *   falls back to `DEFAULT_USER_AGENT`.
 * @param {number} [options.maxWaitMs] Navigation timeout in ms (default 20000).
 * @param {number} [options.extraWaitMs] Additional wait after navigation, skipped when falsy.
 * @param {string} engineLabel Label exposed as `engine` on the result.
 * @returns {Promise<{close: function(): Promise<void>, engine: string, fetch: function(string): Promise<string>}>}
 *   Rejects when the browser or its context cannot be created.
 */
function launch(chromium, launchOptions, options, engineLabel) {
    return __awaiter(this, void 0, void 0, function* () {
        const browser = yield chromium.launch(Object.assign(Object.assign({}, launchOptions), { headless: true }));
        let context;
        try {
            context = yield browser.newContext({
                userAgent: options.userAgent || DEFAULT_USER_AGENT,
            });
        }
        catch (error) {
            // The browser is already running at this point; close it so a
            // failed context creation does not leave the process behind.
            try {
                yield browser.close();
            }
            catch (_closeError) {
                // Ignored: the context failure is the error worth reporting.
            }
            throw error;
        }
        return {
            close: () => __awaiter(this, void 0, void 0, function* () {
                // Always shut the browser down, even if closing the context fails.
                try {
                    yield context.close();
                }
                finally {
                    yield browser.close();
                }
            }),
            engine: engineLabel,
            fetch: (url) => __awaiter(this, void 0, void 0, function* () {
                const timeout = options.maxWaitMs != null ? options.maxWaitMs : 20000;
                const page = yield context.newPage();
                try {
                    // Wait for the network to go idle before reading the HTML.
                    yield page.goto(url, {
                        timeout,
                        waitUntil: 'networkidle',
                    });
                    if (options.extraWaitMs) {
                        yield page.waitForTimeout(options.extraWaitMs);
                    }
                    return yield page.content();
                }
                finally {
                    yield page.close();
                }
            }),
        };
    });
}
/**
 * Run an `npx` command, streaming its output to the current stdio.
 * Resolves on exit code 0, rejects otherwise.
 *
 * @param {string[]} args Arguments passed to `npx`. On Windows they go through
 *   a shell unescaped, so callers must only pass trusted, fixed values.
 * @returns {Promise<void>}
 */
function runNpx(args) {
    return new Promise((resolve, reject) => {
        // On Windows `npx` is a .cmd shim, which cannot be spawned without a
        // shell; everywhere else keep the direct, shell-less spawn.
        const needsShell = process.platform === 'win32';
        const child = (0, child_process_1.spawn)('npx', args, { shell: needsShell, stdio: 'inherit' });
        child.on('error', reject);
        child.on('exit', (code, signal) => {
            if (code === 0) {
                resolve();
                return;
            }
            // `code` is null when the child was killed by a signal.
            const outcome = code === null
                ? `was terminated by signal ${signal}`
                : `exited with code ${code}`;
            reject(new Error(`npx ${args.join(' ')} ${outcome}`));
        });
    });
}