UNPKG

@lenne.tech/cli

Version:

lenne.Tech CLI: lt

140 lines (139 loc) 6.25 kB
"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.createBrowserFetcher = createBrowserFetcher; /** * Headless-browser HTML fetcher for single-page applications. * * Mirrors the chrome-md content script's PageReadyDetector: * waits for the network to settle, then returns the fully hydrated * HTML so Defuddle can extract the real content instead of the * pre-render shell. * * Uses `playwright-core` with a three-tier strategy: * 1. System Chrome / Edge via `channel: 'chrome' | 'msedge'`. * 2. Playwright's own bundled Chromium (if already installed). * 3. Auto-install Playwright's Chromium (`npx playwright install * chromium`) and retry — opt-in via `autoInstall`. */ const child_process_1 = require("child_process"); const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; lenneTech-CLI-Crawler/1.0; +https://lenne.tech)'; /** * Try to construct a browser fetcher. Prefers a system Chrome / * Edge via Playwright channels, falls back to Playwright's bundled * Chromium, and (optionally) auto-installs Chromium on demand. */ function createBrowserFetcher() { return __awaiter(this, arguments, void 0, function* (options = {}) { const log = options.onLog || (() => undefined); const reasons = []; const { chromium } = require('playwright-core'); // 1. System Chrome. const chromeFetcher = yield launch(chromium, { channel: 'chrome' }, options, 'system-chrome').catch((error) => { reasons.push(`channel:chrome: ${error.message}`); return null; }); if (chromeFetcher) { log(`Browser engine: ${chromeFetcher.engine}`); return chromeFetcher; } // 2. System Edge (Windows fallback, also common on macOS). const edgeFetcher = yield launch(chromium, { channel: 'msedge' }, options, 'system-edge').catch((error) => { reasons.push(`channel:msedge: ${error.message}`); return null; }); if (edgeFetcher) { log(`Browser engine: ${edgeFetcher.engine}`); return edgeFetcher; } // 3. Playwright's bundled Chromium. const bundledFetcher = yield launch(chromium, {}, options, 'playwright-chromium').catch((error) => { reasons.push(`playwright-chromium: ${error.message}`); return null; }); if (bundledFetcher) { log(`Browser engine: ${bundledFetcher.engine}`); return bundledFetcher; } // 4. Optional auto-install, then retry Playwright's chromium. if (options.autoInstall) { log('No browser available — installing Playwright chromium (one-time download, ~170 MB)…'); try { yield runNpx(['playwright', 'install', 'chromium']); const retry = yield launch(chromium, {}, options, 'playwright-chromium'); if (retry) { log(`Browser engine: ${retry.engine}`); return retry; } } catch (error) { reasons.push(`auto-install: ${error instanceof Error ? error.message : String(error)}`); } } throw new Error([ 'Could not start a headless browser for SPA rendering.', ...reasons.map((r) => ` - ${r}`), '', 'Fix one of these:', ' 1. Install Google Chrome or Microsoft Edge (Playwright picks them up automatically).', ' 2. Install Playwright browsers manually: `npx playwright install chromium`.', ' 3. Re-run with --install-browser to let the CLI install them.', ].join('\n')); }); } function launch(chromium, launchOptions, options, engineLabel) { return __awaiter(this, void 0, void 0, function* () { const browser = yield chromium.launch(Object.assign(Object.assign({}, launchOptions), { headless: true })); const context = yield browser.newContext({ userAgent: options.userAgent || DEFAULT_USER_AGENT, }); return { close: () => __awaiter(this, void 0, void 0, function* () { yield context.close(); yield browser.close(); }), engine: engineLabel, fetch: (url) => __awaiter(this, void 0, void 0, function* () { var _a; const page = yield context.newPage(); try { yield page.goto(url, { timeout: (_a = options.maxWaitMs) !== null && _a !== void 0 ? _a : 20000, waitUntil: 'networkidle', }); if (options.extraWaitMs) { yield page.waitForTimeout(options.extraWaitMs); } return yield page.content(); } finally { yield page.close(); } }), }; }); } /** * Run an `npx` command, streaming its output to the current stdio. * Resolves on exit code 0, rejects otherwise. */ function runNpx(args) { return new Promise((resolve, reject) => { const child = (0, child_process_1.spawn)('npx', args, { shell: false, stdio: 'inherit' }); child.on('error', reject); child.on('exit', (code) => { if (code === 0) resolve(); else reject(new Error(`npx ${args.join(' ')} exited with code ${code}`)); }); }); }