UNPKG

crawldown

Version:

Crawl websites and convert their content into clean, readable Markdown using Mozilla's Readability and Turndown

330 lines (323 loc) 9.53 kB
// src/main.ts
import { Readability } from "@mozilla/readability";
import consola from "consola";
import defu from "defu";
import { JSDOM as JSDOM2 } from "jsdom";
import pLimit from "p-limit";
import TurndownService from "turndown";
import { withoutTrailingSlash } from "ufo";

// src/lib/browser.ts
import { chromium } from "playwright";

// src/lib/config.ts
var DEFAULT_HEADLESS = true;

/**
 * Process-wide holder for browser launch settings (`browserPath`, `headless`).
 * Obtain the shared instance through `ConfigManager.getInstance()`.
 */
var ConfigManager = class _ConfigManager {
  static instance = null;
  config = {
    browserPath: null,
    headless: DEFAULT_HEADLESS
  };
  // Private constructor to prevent direct construction calls with `new`
  // eslint-disable-next-line @typescript-eslint/no-empty-function
  constructor() {
  }
  /** Returns the shared instance, creating it on first use. */
  static getInstance() {
    if (!_ConfigManager.instance) {
      _ConfigManager.instance = new _ConfigManager();
    }
    return _ConfigManager.instance;
  }
  /** Returns the current settings object. */
  getConfig() {
    return this.config;
  }
  /** Shallow-merges `newConfig` over the current settings. */
  setConfig(newConfig) {
    this.config = { ...this.config, ...newConfig };
  }
};

// src/lib/browser.ts
/**
 * Singleton owner of the Playwright browser and its single browser context.
 * The browser is launched lazily on the first `getBrowserContext()` call.
 */
var BrowserManager = class _BrowserManager {
  static instance = null;
  browser = null;
  context = null;
  // In-flight launch shared by concurrent callers; null when no launch is pending.
  pendingContext = null;
  // Make constructor private since this is a singleton
  // eslint-disable-next-line @typescript-eslint/no-empty-function
  constructor() {
  }
  // Static method to get instance
  static getInstance() {
    if (!_BrowserManager.instance) {
      _BrowserManager.instance = new _BrowserManager();
    }
    return _BrowserManager.instance;
  }
  /**
   * Returns the shared browser context, launching the browser if needed.
   *
   * Concurrent callers share one pending launch. Without this, parallel
   * `createPage()` calls would each see `browser === null` and start their
   * own browser, leaving all but the last one unreferenced and never closed.
   */
  async getBrowserContext() {
    if (this.context) {
      return this.context;
    }
    if (!this.pendingContext) {
      this.pendingContext = this.openContext().finally(() => {
        this.pendingContext = null;
      });
    }
    return this.pendingContext;
  }
  /** Launches the browser (if not running) and opens the context. */
  async openContext() {
    if (!this.browser) {
      const config = ConfigManager.getInstance().getConfig();
      this.browser = await chromium.launch({
        executablePath: config.browserPath ?? void 0,
        headless: config.headless
      });
    }
    this.context = await this.browser.newContext();
    return this.context;
  }
  /** Opens a new page in the shared context. */
  async createPage() {
    const context = await this.getBrowserContext();
    return await context.newPage();
  }
  /**
   * Closes the context and the browser and resets the manager so a later
   * call can launch again. The browser is closed even when closing the
   * context throws; that error still propagates to the caller.
   */
  async cleanup() {
    const context = this.context;
    const browser = this.browser;
    this.context = null;
    this.browser = null;
    try {
      if (context) {
        await context.close();
      }
    } finally {
      if (browser) {
        await browser.close();
      }
    }
  }
};

/**
 * Fixed-size pool of pages handed out round-robin. A caller that finds every
 * page busy polls every 100 ms until one is released.
 */
var PagePool = class {
  constructor(pages) {
    this.pages = pages;
    this.pageInUse = new Array(pages.length).fill(false);
  }
  // Index at which the next search for a free page starts.
  pageIndex = 0;
  // pageInUse[i] is true while pages[i] is checked out.
  pageInUse;
  /** Resolves with a free page and marks it as in use. */
  async getAvailablePage() {
    while (true) {
      for (let i = 0; i < this.pages.length; i++) {
        const currentIndex = (this.pageIndex + i) % this.pages.length;
        if (!this.pageInUse[currentIndex]) {
          this.pageInUse[currentIndex] = true;
          this.pageIndex = (currentIndex + 1) % this.pages.length;
          return this.pages[currentIndex];
        }
      }
      await new Promise((resolve) => setTimeout(resolve, 100));
    }
  }
  /** Marks `page` as free again; pages not owned by this pool are ignored. */
  releasePage(page) {
    const index = this.pages.indexOf(page);
    if (index !== -1) {
      this.pageInUse[index] = false;
    }
  }
};

// src/lib/get-links.ts
import { JSDOM } from "jsdom";
import {
  hasProtocol,
  isRelative,
  isSamePath,
  joinRelativeURL,
  parseURL,
  withBase,
  withHttps
} from "ufo";

/**
 * Extracts the unique in-scope links from an HTML document.
 *
 * @param html - HTML markup to scan for `<a href>` elements.
 * @param scopeUrl - URL whose host and pathname define the crawl scope.
 * @returns De-duplicated `https` URLs made of host + pathname only (query
 *   string and fragment are dropped) that are on the scope host and whose
 *   pathname starts with, or is the same path as, the scope pathname.
 * @throws Error("Invalid base URL") when `scopeUrl` has no host.
 */
function getLinks(html, scopeUrl) {
  const dom = new JSDOM(html);
  const document = dom.window.document;
  const base = parseURL(scopeUrl);
  if (!base.host) throw new Error("Invalid base URL");
  const linkElements = document.querySelectorAll("a[href]");
  const links = Array.from(linkElements)
    .map((element) => element.getAttribute("href"))
    .filter((href) => href !== null)
    .map((href) => href.trim())
    // Drop empty hrefs, script/mail/phone pseudo-links and same-page anchors.
    .filter(
      (href) => href !== "" && !href.startsWith("javascript:") && !href.startsWith("mailto:") && !href.startsWith("tel:") && !href.startsWith("#")
    )
    // Relative hrefs are joined onto the scope URL's pathname.
    .map((url) => {
      if (isRelative(url) || !hasProtocol(url) && !url.startsWith("/")) {
        return joinRelativeURL(base.pathname, url);
      }
      return url;
    })
    .map((url) => withHttps(withBase(url, base.host)))
    .map((url) => parseURL(url))
    .filter((url) => url.host === base.host)
    .filter(
      (url) => url.pathname.startsWith(base.pathname) || isSamePath(url.pathname, base.pathname)
    )
    .map((url) => withHttps(`${url.host}${url.pathname}`));
  return [...new Set(links)];
}

// src/lib/scrape.ts
var DEFAULT_TIMEOUT = 1e4;
var DEFAULT_FORCE = false;

/**
 * Navigates to `url` and returns the page HTML. If the `load` event has not
 * fired shortly before `timeout` expires, it returns whatever HTML the page
 * holds at that moment instead.
 *
 * @param page - Playwright page to navigate.
 * @param url - Address to load.
 * @param timeout - Navigation timeout in milliseconds.
 * @returns The page's HTML content.
 */
async function scrapeWithForce(page, url, timeout) {
  const safetyMarginMs = 1e3;
  const navigationPromise = page.goto(url, { timeout, waitUntil: "load" }).then(() => page.content());
  // Playwright treats a timeout of 0 as "no timeout": there is no deadline
  // to get ahead of, so just wait for the navigation.
  if (timeout <= 0) {
    return await navigationPromise;
  }
  // Snapshot one safety margin before the deadline. When the timeout is not
  // larger than the margin, use half of it so the snapshot is not taken
  // before navigation has had any chance to start.
  const snapshotDelayMs = timeout > safetyMarginMs ? timeout - safetyMarginMs : timeout / 2;
  let snapshotTimer;
  const snapshotPromise = new Promise((resolve, reject) => {
    snapshotTimer = setTimeout(() => {
      page.content().then(resolve, reject);
    }, snapshotDelayMs);
  });
  try {
    return await Promise.race([navigationPromise, snapshotPromise]);
  } finally {
    // Without this the timer would still fire after a fast navigation and
    // read content from a page that may already be reused or closed.
    clearTimeout(snapshotTimer);
  }
}

/** Navigates to `url`, waits for the `load` event and returns the HTML. */
async function scrapeNormal(page, url, timeout) {
  await page.goto(url, { timeout, waitUntil: "load" });
  return await page.content();
}

/**
 * Loads `url` in `page` and returns its HTML.
 *
 * @param options.page - Playwright page to use.
 * @param options.url - Address to load.
 * @param options.force - When true, return partial HTML instead of failing
 *   on a slow or broken navigation where possible.
 * @param options.timeout - Navigation timeout in milliseconds.
 * @returns The page's HTML content.
 * @throws The navigation error when no content could be obtained.
 */
async function scrapeHtml({
  page,
  url,
  force = DEFAULT_FORCE,
  timeout = DEFAULT_TIMEOUT
}) {
  try {
    return force ? await scrapeWithForce(page, url, timeout) : await scrapeNormal(page, url, timeout);
  } catch (error) {
    if (force) {
      // Read the partial content once. A failure here must not replace the
      // original navigation error, which is the one worth reporting.
      let partialHtml = null;
      try {
        partialHtml = await page.content();
      } catch {
        partialHtml = null;
      }
      if (partialHtml) {
        return partialHtml;
      }
    }
    console.error(`Error scraping ${url}:`, error);
    throw error;
  }
}

// src/main.ts
/** Defaults merged under the options passed to `crawl`. */
var DEFAULT_OPTIONS = {
  depth: 0,
  concurrency: 4,
  noHeadless: !DEFAULT_HEADLESS,
  force: DEFAULT_FORCE,
  timeout: DEFAULT_TIMEOUT
};

/**
 * Scrapes one URL, converts its readable article to Markdown and queues its
 * in-scope links for the next depth level.
 *
 * @returns `{ url, markdown, title }`, or `null` when the URL was already
 *   handled, has no readable article, or failed (the error is logged).
 */
async function processSingleUrl({
  url,
  currentDepth,
  context
}) {
  if (context.processedUrls.has(url)) {
    consola.debug(`Skipping already parsed URL: ${url}`);
    return null;
  }
  // Claim the URL before the first await so a concurrent task holding the
  // same URL skips it instead of scraping it a second time.
  context.processedUrls.add(url);
  consola.info(`Crawling ${url}, current depth: ${currentDepth}`);
  let succeeded = false;
  try {
    const page = await context.pagePool.getAvailablePage();
    try {
      const html = await scrapeHtml({
        page,
        url,
        force: context.force,
        timeout: context.timeout
      });
      const dom = new JSDOM2(html);
      const reader = new Readability(dom.window.document);
      const article = reader.parse();
      if (!article?.content) {
        consola.warn(`No article content found for ${url}`);
        return null;
      }
      const markdown = context.turndownService.turndown(article.content);
      processNextDepthLinks({ html, url, currentDepth, context });
      succeeded = true;
      return {
        url,
        markdown,
        title: article.title
      };
    } finally {
      context.pagePool.releasePage(page);
    }
  } catch (error) {
    consola.error(`Error processing ${url}:`, error);
    return null;
  } finally {
    // Only successful pages stay marked; a failed URL may be retried if it
    // is discovered again at a later depth level.
    if (!succeeded) {
      context.processedUrls.delete(url);
    }
  }
}

/**
 * Queues the in-scope links found in `html` for depth `currentDepth - 1`.
 * Does nothing at depth 0.
 */
function processNextDepthLinks({
  html,
  currentDepth,
  context
}) {
  if (currentDepth > 0) {
    const nextDepth = currentDepth - 1;
    const existingUrls = context.urlsByDepth.get(nextDepth) ?? [];
    const alreadyQueued = new Set(existingUrls);
    const newLinks = [];
    for (const link of getLinks(html, context.scopeUrl)) {
      // Normalise first: processed URLs are stored without a trailing
      // slash, so comparing the raw link would miss them.
      const normalized = withoutTrailingSlash(link);
      if (context.processedUrls.has(normalized) || alreadyQueued.has(normalized)) {
        continue;
      }
      alreadyQueued.add(normalized);
      newLinks.push(normalized);
    }
    context.urlsByDepth.set(nextDepth, [...existingUrls, ...newLinks]);
  }
}

/**
 * Processes every URL queued for `currentDepth` through the shared
 * concurrency limiter and appends the non-null results to `context.results`.
 */
async function processDepthLevel({
  currentDepth,
  context
}) {
  const urlsToProcess = context.urlsByDepth.get(currentDepth) ?? [];
  consola.info(
    `Processing ${urlsToProcess.length} URLs at depth ${currentDepth}`
  );
  const currentDepthPromises = urlsToProcess.map(
    (url) => context.limit(() => processSingleUrl({ url, currentDepth, context }))
  );
  const currentDepthResults = await Promise.all(currentDepthPromises);
  context.results.push(
    ...currentDepthResults.filter((r) => r !== null)
  );
}

/**
 * Crawls `options.url` and the in-scope pages it links to, down to
 * `options.depth` levels, converting each page to Markdown.
 *
 * @param options - Crawl options; missing fields fall back to
 *   `DEFAULT_OPTIONS`. `scopeUrl` defaults to the start URL.
 * @returns Array of `{ url, markdown, title }`, one per converted page.
 * @throws Any error raised while setting up or tearing down the browser.
 */
async function crawl(options) {
  const browserManager = BrowserManager.getInstance();
  const processedOptions = defu(
    options,
    DEFAULT_OPTIONS
  );
  processedOptions.url = withoutTrailingSlash(processedOptions.url);
  // Always write the launch settings: the config is a process-wide
  // singleton, so skipping this for default options would silently reuse
  // the browserPath/headless values of a previous crawl() call.
  ConfigManager.getInstance().setConfig({
    browserPath: processedOptions.browserPath ?? null,
    headless: !processedOptions.noHeadless
  });
  let results;
  try {
    const pages = await Promise.all(
      Array(processedOptions.concurrency).fill(null).map(() => browserManager.createPage())
    );
    const pagePool = new PagePool(pages);
    const turndownService = new TurndownService({
      headingStyle: "atx",
      hr: "---",
      bulletListMarker: "-",
      codeBlockStyle: "fenced"
    });
    consola.start(`Starting job for: ${processedOptions.url}`);
    consola.debug(`Options: `);
    consola.debug(processedOptions);
    const context = {
      pagePool,
      turndownService,
      processedUrls: /* @__PURE__ */ new Set(),
      urlsByDepth: /* @__PURE__ */ new Map(),
      limit: pLimit(processedOptions.concurrency),
      results: [],
      scopeUrl: options.scopeUrl ?? processedOptions.url,
      force: processedOptions.force,
      timeout: processedOptions.timeout
    };
    context.urlsByDepth.set(processedOptions.depth, [processedOptions.url]);
    // Depth counts down: the start URL sits at `depth`, its links at
    // `depth - 1`, and so on until level 0.
    for (let currentDepth = processedOptions.depth; currentDepth >= 0; currentDepth--) {
      await processDepthLevel({ currentDepth, context });
    }
    await Promise.all(pages.map((page) => page.close()));
    results = context.results;
  } finally {
    // Runs on success and on failure so the browser never outlives the call.
    await browserManager.cleanup();
  }
  consola.success(`Completed processing all URLs`);
  return results;
}

export {
  BrowserManager,
  DEFAULT_OPTIONS,
  crawl
};
//# sourceMappingURL=chunk-MHYQT2UX.js.map