/**
 * linkinator
 * Find broken links, missing images, etc. in your HTML. Scurry around your
 * site and find all those broken links.
 */
import { EventEmitter } from 'node:events';
import * as path from 'node:path';
import process from 'node:process';
import { request } from 'gaxios';
import { getLinks } from './links.js';
import { processOptions, } from './options.js';
import { Queue } from './queue.js';
import { startWebServer } from './server.js';
export { getConfig } from './config.js';
// Terminal states for a checked link (compiled from a TypeScript enum).
export var LinkState;
(function (states) {
    states.OK = 'OK';
    states.BROKEN = 'BROKEN';
    states.SKIPPED = 'SKIPPED';
})(LinkState || (LinkState = {}));
/**
 * Instance class used to perform a crawl job.
 * Emits 'link' (per checked link), 'pagestart' (before recursing into a
 * page), and 'retry' (when a request is rescheduled) events.
 */
export class LinkChecker extends EventEmitter {
    /**
     * Subscribe to crawl events. Thin wrapper over EventEmitter#on that
     * returns `this` so calls can be chained.
     */
    // biome-ignore lint/suspicious/noExplicitAny: this can in fact be generic
    on(event, listener) {
        return super.on(event, listener);
    }
    /**
     * Crawl a given url or path, and return a list of visited links along with
     * status codes.
     * @param options Options to use while checking for 404s
     * @returns An object with `links` (all crawl results) and `passed`
     *   (true when no link ended in the BROKEN state).
     */
    async check(options_) {
        const options = await processOptions(options_);
        if (!Array.isArray(options.path)) {
            options.path = [options.path];
        }
        options.linksToSkip ||= [];
        let server;
        // If no path is an http(s) URL, serve the local paths from a
        // temporary static web server for the duration of the crawl.
        const hasHttpPaths = options.path.find((x) => x.startsWith('http'));
        if (!hasHttpPaths) {
            let { port } = options;
            server = await startWebServer({
                root: options.serverRoot ?? '',
                port,
                markdown: options.markdown,
                directoryListing: options.directoryListing,
            });
            if (port === undefined) {
                // No port requested: the OS picked an ephemeral one — read it back.
                const addr = server.address();
                port = addr.port;
            }
            // Rewrite each local path into a URL on the local static server.
            for (let i = 0; i < options.path.length; i++) {
                if (options.path[i].startsWith('/')) {
                    options.path[i] = options.path[i].slice(1);
                }
                options.path[i] = `http://localhost:${port}/${options.path[i]}`;
            }
            options.staticHttpServerHost = `http://localhost:${port}/`;
        }
        if (process.env.LINKINATOR_DEBUG) {
            console.log(options);
        }
        const queue = new Queue({
            concurrency: options.concurrency || 100,
        });
        const results = [];
        // Caches shared by every crawl task in this job:
        // - initCache: hrefs already queued, to avoid re-crawling a URL
        // - delayCache: host -> earliest retry timestamp (set on HTTP 429)
        // - retryErrorsCache: href -> retry count for 5xx / network errors
        const initCache = new Set();
        const delayCache = new Map();
        const retryErrorsCache = new Map();
        for (const path of options.path) {
            const url = new URL(path);
            initCache.add(url.href);
            queue.add(async () => {
                await this.crawl({
                    url,
                    crawl: true,
                    checkOptions: options,
                    results,
                    cache: initCache,
                    delayCache,
                    retryErrorsCache,
                    queue,
                    rootPath: path,
                    retry: Boolean(options_.retry),
                    retryErrors: Boolean(options_.retryErrors),
                    retryErrorsCount: options_.retryErrorsCount ?? 5,
                    retryErrorsJitter: options_.retryErrorsJitter ?? 3000,
                });
            });
        }
        // Wait for the queue (including retries scheduled mid-crawl) to drain.
        await queue.onIdle();
        const result = {
            links: results,
            passed: results.filter((x) => x.state === LinkState.BROKEN).length === 0,
        };
        if (server) {
            server.destroy();
        }
        return result;
    }
    /**
     * Crawl a given url with the provided options.
     * @param options List of options used to do the crawl
     * @private
     * @returns A list of crawl results consisting of urls and status codes
     */
    async crawl(options) {
        // Apply any regex url replacements
        if (options.checkOptions.urlRewriteExpressions) {
            for (const exp of options.checkOptions.urlRewriteExpressions) {
                const newUrl = options.url.href.replace(exp.pattern, exp.replacement);
                if (options.url.href !== newUrl) {
                    options.url.href = newUrl;
                }
            }
        }
        // Explicitly skip non-http[s] links before making the request
        const proto = options.url.protocol;
        if (proto !== 'http:' && proto !== 'https:') {
            const r = {
                url: mapUrl(options.url.href, options.checkOptions),
                status: 0,
                state: LinkState.SKIPPED,
                parent: mapUrl(options.parent, options.checkOptions),
            };
            options.results.push(r);
            this.emit('link', r);
            return;
        }
        // Check for a user-configured function to filter out links
        if (typeof options.checkOptions.linksToSkip === 'function' &&
            (await options.checkOptions.linksToSkip(options.url.href))) {
            const result = {
                url: mapUrl(options.url.href, options.checkOptions),
                state: LinkState.SKIPPED,
                parent: options.parent,
            };
            options.results.push(result);
            this.emit('link', result);
            return;
        }
        // Check for a user-configured array of link regular expressions that should be skipped
        if (Array.isArray(options.checkOptions.linksToSkip)) {
            const skips = options.checkOptions.linksToSkip
                .map((linkToSkip) => {
                return new RegExp(linkToSkip).test(options.url.href);
            })
                .filter(Boolean);
            if (skips.length > 0) {
                const result = {
                    url: mapUrl(options.url.href, options.checkOptions),
                    state: LinkState.SKIPPED,
                    parent: mapUrl(options.parent, options.checkOptions),
                };
                options.results.push(result);
                this.emit('link', result);
                return;
            }
        }
        // Check if this host has been marked for delay due to 429
        if (options.delayCache.has(options.url.host)) {
            const timeout = options.delayCache.get(options.url.host);
            if (timeout === undefined) {
                throw new Error('timeout not found');
            }
            if (timeout > Date.now()) {
                // Still inside the retry-after window: requeue this crawl for
                // later instead of hitting the host now.
                options.queue.add(async () => {
                    await this.crawl(options);
                }, {
                    delay: timeout - Date.now(),
                });
                return;
            }
        }
        // Perform a HEAD or GET request based on the need to crawl
        let status = 0;
        let state = LinkState.BROKEN;
        let shouldRecurse = false;
        let response;
        const failures = [];
        try {
            response = await request({
                method: options.crawl ? 'GET' : 'HEAD',
                url: options.url.href,
                headers: { 'User-Agent': options.checkOptions.userAgent },
                responseType: 'stream',
                validateStatus: () => true,
                timeout: options.checkOptions.timeout,
            });
            if (this.shouldRetryAfter(response, options)) {
                return;
            }
            // If we got an HTTP 405, the server may not like HEAD. GET instead!
            if (response.status === 405) {
                response = await request({
                    method: 'GET',
                    url: options.url.href,
                    headers: { 'User-Agent': options.checkOptions.userAgent },
                    responseType: 'stream',
                    validateStatus: () => true,
                    timeout: options.checkOptions.timeout,
                });
                if (this.shouldRetryAfter(response, options)) {
                    return;
                }
            }
        }
        catch (error) {
            // Request failure: invalid domain name, etc.
            // this also occasionally catches too many redirects, but is still valid (e.g. https://www.ebay.com)
            // for this reason, we also try doing a GET below to see if the link is valid
            failures.push(error);
        }
        try {
            // Some sites don't respond well to HEAD requests, even if they don't return a 405.
            // This is a last gasp effort to see if the link is valid.
            if ((response === undefined ||
                response.status < 200 ||
                response.status >= 300) &&
                !options.crawl) {
                response = await request({
                    method: 'GET',
                    url: options.url.href,
                    responseType: 'stream',
                    validateStatus: () => true,
                    headers: { 'User-Agent': options.checkOptions.userAgent },
                    timeout: options.checkOptions.timeout,
                });
                if (this.shouldRetryAfter(response, options)) {
                    return;
                }
            }
        }
        catch (error) {
            failures.push(error);
            // Catch the next failure
        }
        if (response !== undefined) {
            status = response.status;
            shouldRecurse = isHtml(response);
        }
        // If retryErrors is enabled, retry 5xx and 0 status (which indicates
        // a network error likely occurred):
        if (this.shouldRetryOnError(status, options)) {
            return;
        }
        // Assume any 2xx status is 👌
        if (status >= 200 && status < 300) {
            state = LinkState.OK;
        }
        else if (response !== undefined) {
            // Keep the failed response around so callers can inspect why
            // the link was marked broken.
            failures.push(response);
        }
        const result = {
            url: mapUrl(options.url.href, options.checkOptions),
            status,
            state,
            parent: mapUrl(options.parent, options.checkOptions),
            failureDetails: failures,
        };
        options.results.push(result);
        this.emit('link', result);
        // If we need to go deeper, scan the next level of depth for links and crawl
        if (options.crawl && shouldRecurse) {
            this.emit('pagestart', options.url);
            const urlResults = response?.data
                ? await getLinks(response.data, options.url.href)
                : [];
            for (const result of urlResults) {
                // If there was some sort of problem parsing the link while
                // creating a new URL obj, treat it as a broken link.
                if (!result.url) {
                    const r = {
                        url: mapUrl(result.link, options.checkOptions),
                        status: 0,
                        state: LinkState.BROKEN,
                        parent: mapUrl(options.url.href, options.checkOptions),
                    };
                    options.results.push(r);
                    this.emit('link', r);
                    continue;
                }
                // Recurse only when enabled and the link stays under the root path.
                let crawl = options.checkOptions.recurse &&
                    result.url?.href.startsWith(options.rootPath);
                // Only crawl links that start with the same host
                if (crawl) {
                    try {
                        const pathUrl = new URL(options.rootPath);
                        crawl = result.url.host === pathUrl.host;
                    }
                    catch {
                        // ignore errors
                    }
                }
                // Ensure the url hasn't already been touched, largely to avoid a
                // very large queue length and runaway memory consumption
                if (!options.cache.has(result.url.href)) {
                    options.cache.add(result.url.href);
                    options.queue.add(async () => {
                        if (result.url === undefined) {
                            throw new Error('url is undefined');
                        }
                        await this.crawl({
                            url: result.url,
                            crawl: crawl ?? false,
                            cache: options.cache,
                            delayCache: options.delayCache,
                            retryErrorsCache: options.retryErrorsCache,
                            results: options.results,
                            checkOptions: options.checkOptions,
                            queue: options.queue,
                            parent: options.url.href,
                            rootPath: options.rootPath,
                            retry: options.retry,
                            retryErrors: options.retryErrors,
                            retryErrorsCount: options.retryErrorsCount,
                            retryErrorsJitter: options.retryErrorsJitter,
                        });
                    });
                }
            }
        }
    }
    /**
     * Check the incoming response for a `retry-after` header. If present,
     * and if the status was an HTTP 429, calculate the date at which this
     * request should be retried. Ensure the delayCache knows that we're
     * going to wait on requests for this entire host.
     * @param response GaxiosResponse returned from the request
     * @param options CrawlOptions used during this request
     * @returns True when a retry was scheduled (the caller must stop
     *   processing this response); false to continue normally.
     */
    shouldRetryAfter(response, options) {
        if (!options.retry) {
            return false;
        }
        const retryAfterRaw = response.headers['retry-after'];
        if (response.status !== 429 || !retryAfterRaw) {
            return false;
        }
        // The `retry-after` header can come in either <seconds> or
        // A specific date to go check.
        let retryAfter = Number(retryAfterRaw) * 1000 + Date.now();
        if (Number.isNaN(retryAfter)) {
            retryAfter = Date.parse(retryAfterRaw);
            if (Number.isNaN(retryAfter)) {
                // Unparseable header value: give up on retrying.
                return false;
            }
        }
        // Check to see if there is already a request to wait for this host
        const currentTimeout = options.delayCache.get(options.url.host);
        if (currentTimeout !== undefined) {
            // Use whichever time is higher in the cache
            if (retryAfter > currentTimeout) {
                options.delayCache.set(options.url.host, retryAfter);
            }
        }
        else {
            options.delayCache.set(options.url.host, retryAfter);
        }
        options.queue.add(async () => {
            await this.crawl(options);
        }, {
            delay: retryAfter - Date.now(),
        });
        const retryDetails = {
            url: options.url.href,
            status: response.status,
            secondsUntilRetry: Math.round((retryAfter - Date.now()) / 1000),
        };
        this.emit('retry', retryDetails);
        return true;
    }
    /**
     * If the response is a 5xx or synthetic 0 response retry N times.
     * @param status Status returned by request or 0 if request threw.
     * @param options CrawlOptions used during this request
     * @returns True when a retry was scheduled; false when the status is
     *   not retryable or the retry budget is exhausted.
     */
    shouldRetryOnError(status, options) {
        const maxRetries = options.retryErrorsCount;
        if (!options.retryErrors) {
            return false;
        }
        // Only retry 0 and >5xx status codes:
        if (status > 0 && status < 500) {
            return false;
        }
        // Check to see if there is already a request to wait for this URL:
        let currentRetries = 1;
        const cachedRetries = options.retryErrorsCache.get(options.url.href);
        if (cachedRetries !== undefined) {
            // Use whichever time is higher in the cache
            currentRetries = cachedRetries;
            if (currentRetries > maxRetries)
                return false;
            options.retryErrorsCache.set(options.url.href, currentRetries + 1);
        }
        else {
            options.retryErrorsCache.set(options.url.href, 1);
        }
        // Use exponential backoff algorithm to take pressure off upstream service:
        const retryAfter = 2 ** currentRetries * 1000 + Math.random() * options.retryErrorsJitter;
        options.queue.add(async () => {
            await this.crawl(options);
        }, {
            delay: retryAfter,
        });
        const retryDetails = {
            url: options.url.href,
            status,
            secondsUntilRetry: Math.round(retryAfter / 1000),
        };
        this.emit('retry', retryDetails);
        return true;
    }
}
/**
 * Convenience function to perform a one-off scan without holding on to
 * the LinkChecker instance.
 * @param options CheckOptions to be passed on
 */
export async function check(options) {
    const checker = new LinkChecker();
    return checker.check(options);
}
/**
 * Checks to see if a given response is HTML, based on its `content-type`
 * header.
 * @param {object} response Page response (anything with a `headers` map).
 * @returns {boolean} True when the content type indicates an (X)HTML document.
 */
function isHtml(response) {
    const contentType = response.headers['content-type'] || '';
    // Substring match, since the header may carry parameters such as
    // `; charset=utf-8`. Plain `includes` replaces the previous `/g` regexes:
    // global regexes are a stateful-`lastIndex` hazard with `.test`, and the
    // patterns had no metacharacters beyond escaping anyway.
    return (contentType.includes('text/html') ||
        contentType.includes('application/xhtml+xml'));
}
/**
 * When running a local static web server for the user, translate paths from
 * the Url generated back to something closer to a local filesystem path.
 * @example
 * http://localhost:0000/test/route/README.md => test/route/README.md
 * @param url The url that was checked
 * @param options Original CheckOptions passed into the client
 */
function mapUrl(url, options) {
    if (!url) {
        return url;
    }
    const serverHost = options?.staticHttpServerHost;
    // Nothing to translate unless we stood up a local static server and
    // this url points at it.
    if (!serverHost?.length || !url.startsWith(serverHost)) {
        return url;
    }
    // Trim the starting http://localhost:0000 prefix
    let mapped = url.slice(serverHost.length);
    // Add the full filesystem path back if we trimmed it
    if (options?.syntheticServerRoot?.length) {
        mapped = path.join(options.syntheticServerRoot, mapped);
    }
    return mapped === '' ? `.${path.sep}` : mapped;
}