crawldown
Version:
Crawl websites and convert their content into clean, readable Markdown using Mozilla's Readability and Turndown
330 lines (323 loc) • 9.53 kB
JavaScript
// src/main.ts
import { Readability } from "@mozilla/readability";
import consola from "consola";
import defu from "defu";
import { JSDOM as JSDOM2 } from "jsdom";
import pLimit from "p-limit";
import TurndownService from "turndown";
import { withoutTrailingSlash } from "ufo";
// src/lib/browser.ts
import { chromium } from "playwright";
// src/lib/config.ts
var DEFAULT_HEADLESS = true;
var ConfigManager = class _ConfigManager {
static instance = null;
config = {
browserPath: null,
headless: DEFAULT_HEADLESS
};
// Private constructor to prevent direct construction calls with `new`
// eslint-disable-next-line @typescript-eslint/no-empty-function
constructor() {
}
static getInstance() {
if (!_ConfigManager.instance) {
_ConfigManager.instance = new _ConfigManager();
}
return _ConfigManager.instance;
}
getConfig() {
return this.config;
}
setConfig(newConfig) {
this.config = {
...this.config,
...newConfig
};
}
};
// src/lib/browser.ts
var BrowserManager = class _BrowserManager {
static instance = null;
browser = null;
context = null;
// Make constructor private since this is a singleton
// eslint-disable-next-line @typescript-eslint/no-empty-function
constructor() {
}
// Static method to get instance
static getInstance() {
if (!_BrowserManager.instance) {
_BrowserManager.instance = new _BrowserManager();
}
return _BrowserManager.instance;
}
async getBrowserContext() {
if (this.context) {
return this.context;
}
if (!this.browser) {
const config = ConfigManager.getInstance().getConfig();
this.browser = await chromium.launch({
executablePath: config.browserPath ?? void 0,
headless: config.headless
});
}
this.context = await this.browser.newContext();
return this.context;
}
async createPage() {
const context = await this.getBrowserContext();
return await context.newPage();
}
async cleanup() {
if (this.context) {
await this.context.close();
this.context = null;
}
if (this.browser) {
await this.browser.close();
this.browser = null;
}
}
};
var PagePool = class {
constructor(pages) {
this.pages = pages;
this.pageInUse = new Array(pages.length).fill(false);
}
pageIndex = 0;
pageInUse;
async getAvailablePage() {
while (true) {
for (let i = 0; i < this.pages.length; i++) {
const currentIndex = (this.pageIndex + i) % this.pages.length;
if (!this.pageInUse[currentIndex]) {
this.pageInUse[currentIndex] = true;
this.pageIndex = (currentIndex + 1) % this.pages.length;
return this.pages[currentIndex];
}
}
await new Promise((resolve) => setTimeout(resolve, 100));
}
}
releasePage(page) {
const index = this.pages.indexOf(page);
if (index !== -1) {
this.pageInUse[index] = false;
}
}
};
// src/lib/get-links.ts
import { JSDOM } from "jsdom";
import {
hasProtocol,
isRelative,
isSamePath,
joinRelativeURL,
parseURL,
withBase,
withHttps
} from "ufo";
function getLinks(html, scopeUrl) {
const dom = new JSDOM(html);
const document = dom.window.document;
const base = parseURL(scopeUrl);
if (!base.host) throw new Error("Invalid base URL");
const linkElements = document.querySelectorAll("a[href]");
const links = Array.from(linkElements).map((element) => element.getAttribute("href")).filter((href) => href !== null).map((href) => href.trim()).filter(
(href) => href !== "" && !href.startsWith("javascript:") && !href.startsWith("mailto:") && !href.startsWith("tel:") && !href.startsWith("#")
).map((url) => {
if (isRelative(url) || !hasProtocol(url) && !url.startsWith("/")) {
return joinRelativeURL(base.pathname, url);
}
return url;
}).map((url) => withHttps(withBase(url, base.host))).map((url) => parseURL(url)).filter((url) => url.host === base.host).filter(
(url) => url.pathname.startsWith(base.pathname) || isSamePath(url.pathname, base.pathname)
).map((url) => withHttps(`${url.host}${url.pathname}`));
return [...new Set(links)];
}
// src/lib/scrape.ts
/** Timeout in milliseconds that `scrapeHtml` uses when the caller gives none. */
var DEFAULT_TIMEOUT = 10000;
/** Whether `scrapeHtml` uses the forced strategy when the caller does not say. */
var DEFAULT_FORCE = false;
async function scrapeWithForce(page, url, timeout) {
const safetyMarginMs = 1e3;
let content = null;
const timeoutPromise = new Promise((resolve, reject) => {
setTimeout(async () => {
try {
content = await page.content();
resolve(content);
} catch (err) {
reject(err);
}
}, timeout - safetyMarginMs);
});
const navigationPromise = page.goto(url, {
timeout,
waitUntil: "load"
}).then(() => page.content());
return await Promise.race([navigationPromise, timeoutPromise]);
}
async function scrapeNormal(page, url, timeout) {
await page.goto(url, {
timeout,
waitUntil: "load"
});
return await page.content();
}
async function scrapeHtml({
page,
url,
force = DEFAULT_FORCE,
timeout = DEFAULT_TIMEOUT
}) {
try {
return force ? await scrapeWithForce(page, url, timeout) : await scrapeNormal(page, url, timeout);
} catch (error) {
if (force && await page.content()) {
return await page.content();
}
console.error(`Error scraping ${url}:`, error);
throw error;
}
}
// src/main.ts
/**
 * Option values `crawl()` falls back to, via `defu`, for anything the caller
 * leaves unset.
 */
var DEFAULT_OPTIONS = {
// Number of link-following levels below the start URL; at depth 0 no links are queued.
depth: 0,
// Number of browser pages opened up front, and the cap on URLs processed at once.
concurrency: 4,
// Inverse of the headless default: the browser runs headless unless this is true.
noHeadless: !DEFAULT_HEADLESS,
// Passed through to scrapeHtml as its `force` flag.
force: DEFAULT_FORCE,
// Passed through to scrapeHtml as its navigation timeout.
timeout: DEFAULT_TIMEOUT
};
async function processSingleUrl({
url,
currentDepth,
context
}) {
if (context.processedUrls.has(url)) {
consola.debug(`Skipping already parsed URL: ${url}`);
return null;
}
consola.info(`Crawling ${url}, current depth: ${currentDepth}`);
try {
const page = await context.pagePool.getAvailablePage();
try {
const html = await scrapeHtml({
page,
url,
force: context.force,
timeout: context.timeout
});
const dom = new JSDOM2(html);
const reader = new Readability(dom.window.document);
const article = reader.parse();
if (!article?.content) {
consola.warn(`No article content found for ${url}`);
return null;
}
const markdown = context.turndownService.turndown(article.content);
context.processedUrls.add(url);
processNextDepthLinks({ html, url, currentDepth, context });
return {
url,
markdown,
title: article.title
};
} finally {
context.pagePool.releasePage(page);
}
} catch (error) {
consola.error(`Error processing ${url}:`, error);
return null;
}
}
function processNextDepthLinks({
html,
currentDepth,
context
}) {
if (currentDepth > 0) {
const links = getLinks(html, context.scopeUrl);
const newLinks = links.filter((link) => !context.processedUrls.has(link)).map((link) => withoutTrailingSlash(link));
const nextDepth = currentDepth - 1;
const existingUrls = context.urlsByDepth.get(nextDepth) ?? [];
context.urlsByDepth.set(nextDepth, [...existingUrls, ...newLinks]);
}
}
async function processDepthLevel({
currentDepth,
context
}) {
const urlsToProcess = context.urlsByDepth.get(currentDepth) ?? [];
consola.info(
`Processing ${urlsToProcess.length} URLs at depth ${currentDepth}`
);
const currentDepthPromises = urlsToProcess.map(
(url) => context.limit(async () => processSingleUrl({ url, currentDepth, context }))
);
const currentDepthResults = await Promise.all(currentDepthPromises);
context.results.push(
...currentDepthResults.filter((r) => r !== null)
);
}
async function crawl(options) {
const browserManager = BrowserManager.getInstance();
const processedOptions = defu(
options,
DEFAULT_OPTIONS
);
processedOptions.url = withoutTrailingSlash(processedOptions.url);
if (processedOptions.browserPath || processedOptions.noHeadless) {
ConfigManager.getInstance().setConfig({
browserPath: processedOptions.browserPath,
headless: !processedOptions.noHeadless
});
}
try {
const pages = await Promise.all(
Array(processedOptions.concurrency).fill(null).map(() => browserManager.createPage())
);
const pagePool = new PagePool(pages);
const turndownService = new TurndownService({
headingStyle: "atx",
hr: "---",
bulletListMarker: "-",
codeBlockStyle: "fenced"
});
consola.start(`Starting job for: ${processedOptions.url}`);
consola.debug(`Options: `);
consola.debug(processedOptions);
const context = {
pagePool,
turndownService,
processedUrls: /* @__PURE__ */ new Set(),
urlsByDepth: /* @__PURE__ */ new Map(),
limit: pLimit(processedOptions.concurrency),
results: [],
scopeUrl: options.scopeUrl ?? processedOptions.url,
force: processedOptions.force,
timeout: processedOptions.timeout
};
context.urlsByDepth.set(processedOptions.depth, [processedOptions.url]);
for (let currentDepth = processedOptions.depth; currentDepth >= 0; currentDepth--) {
await processDepthLevel({
currentDepth,
context
});
}
await Promise.all(pages.map((page) => page.close()));
await browserManager.cleanup();
consola.success(`Completed processing all URLs`);
return context.results;
} catch (error) {
await browserManager.cleanup();
throw error;
}
}
export {
BrowserManager,
DEFAULT_OPTIONS,
crawl
};
//# sourceMappingURL=chunk-MHYQT2UX.js.map