linkinator
Version:
Find broken links, missing images, etc in your HTML. Scurry around your site and find all those broken links.
996 lines (995 loc) • 43.9 kB
JavaScript
import { EventEmitter } from 'node:events';
import * as path from 'node:path';
import process from 'node:process';
import { Agent, ProxyAgent, fetch as undiciFetch, } from 'undici';
import { getCssLinks, getLinks, validateFragments } from './links.js';
// Shared HTTP agent for insecure certificate requests.
// Using a single shared agent prevents port exhaustion from creating
// new agents per request (which was the original bug).
let sharedInsecureAgent;
// Shared proxy agent, cached per proxy URL to avoid repeated allocations.
let sharedProxyAgent;
// The proxy URL that sharedProxyAgent was built for. Used to detect when the
// proxy configuration changes between requests so the agent can be rebuilt.
let cachedProxyUrl;
/**
 * Clear every cached shared HTTP agent so the next request builds fresh ones.
 * Primarily useful for testing to ensure a clean agent state between tests.
 */
export function resetSharedAgents() {
    cachedProxyUrl = undefined;
    sharedProxyAgent = undefined;
    sharedInsecureAgent = undefined;
}
/**
 * Read the proxy URL from the well-known environment variables.
 * Precedence: https_proxy > HTTPS_PROXY > http_proxy > HTTP_PROXY.
 * An empty string in the first defined variable counts as "no proxy".
 * @returns The configured proxy URL, or undefined when none is set.
 */
function getProxyUrl() {
    const { env } = process;
    const candidates = [
        env.https_proxy,
        env.HTTPS_PROXY,
        env.http_proxy,
        env.HTTP_PROXY,
    ];
    // First non-nullish value wins, mirroring a ?? chain; an empty string is
    // still selected here and then normalized to undefined below.
    const selected = candidates.find((value) => value !== undefined && value !== null);
    return selected || undefined;
}
/**
 * Get (or lazily rebuild) the shared ProxyAgent for the given proxy URL.
 * The agent is recreated whenever the proxy URL differs from the one it was
 * built for, so environment changes between calls are honored.
 * @param proxyUrl Proxy server URL to dispatch requests through
 * @returns The shared ProxyAgent for proxyUrl
 */
function getSharedProxyAgent(proxyUrl) {
    const needsRebuild = sharedProxyAgent === undefined || cachedProxyUrl !== proxyUrl;
    if (needsRebuild) {
        // Construct first, record the URL second: if construction throws we
        // don't end up caching a URL for a stale agent.
        sharedProxyAgent = new ProxyAgent(proxyUrl);
        cachedProxyUrl = proxyUrl;
    }
    return sharedProxyAgent;
}
/**
 * Get or create a shared HTTP agent that accepts insecure certificates.
 * Used when allowInsecureCerts is enabled to bypass certificate validation
 * while still keeping connection pooling (one agent, not one per request).
 * @param _allowInsecureCerts Always true when called (parameter kept for clarity)
 * @returns The shared insecure agent instance
 */
function getSharedAgent(_allowInsecureCerts) {
    // Lazily create the agent on first use, then reuse it forever
    // (until resetSharedAgents() clears it).
    sharedInsecureAgent ??= new Agent({
        connect: {
            // Skip TLS certificate validation for this agent only.
            rejectUnauthorized: false,
        },
        // Keep connections alive so they can be reused across requests.
        keepAliveTimeout: 30_000,
        keepAliveMaxTimeout: 60_000,
        // Allow multiple connections per host for concurrency.
        connections: 100,
    });
    return sharedInsecureAgent;
}
import { processOptions, } from './options.js';
import { Queue } from './queue.js';
import { startWebServer, stopWebServer } from './server.js';
import { bufferStream, drainStream, toNodeReadable } from './stream-utils.js';
import { normalizeBaseUrl } from './url-utils.js';
export { getConfig } from './config.js';
/**
 * The possible states of a checked link: OK, BROKEN, or SKIPPED.
 * (Compiled TypeScript enum pattern: the IIFE populates the exported object.)
 */
export var LinkState;
(function (states) {
    states["OK"] = "OK";
    states["BROKEN"] = "BROKEN";
    states["SKIPPED"] = "SKIPPED";
})(LinkState || (LinkState = {}));
/**
 * Instance class used to perform a crawl job.
 *
 * Extends EventEmitter and emits progress events during a crawl:
 * 'link' for every result, 'pagestart' when a page scan begins, 'retry'
 * when a request is rescheduled, and — depending on configuration —
 * 'redirect', 'statusCodeWarning', and 'httpInsecure'.
 */
export class LinkChecker extends EventEmitter {
    // Track which fragments need to be checked for each URL.
    // Maps a URL href to the Set of fragment identifiers that links
    // elsewhere reference on that URL.
    fragmentsToCheck = new Map();
    /** Typed passthrough to EventEmitter#on. */
    // biome-ignore lint/suspicious/noExplicitAny: this can in fact be generic
    on(event, listener) {
        return super.on(event, listener);
    }
    /**
     * Crawl a given url or path, and return a list of visited links along with
     * status codes.
     * @param options_ Options to use while checking for 404s
     * @returns {links, passed} where passed is true iff no link was BROKEN
     */
    async check(options_) {
        const options = await processOptions(options_);
        if (!Array.isArray(options.path)) {
            options.path = [options.path];
        }
        options.linksToSkip ||= [];
        let server;
        const hasHttpPaths = options.path.find((x) => x.startsWith('http'));
        if (!hasHttpPaths) {
            // Local filesystem paths: stand up a static web server and rewrite
            // every path into a http://localhost:<port>/ URL.
            let { port } = options;
            server = await startWebServer({
                root: options.serverRoot ?? '',
                port,
                markdown: options.markdown,
                directoryListing: options.directoryListing,
                cleanUrls: options.cleanUrls,
            });
            if (port === undefined) {
                // Port was auto-assigned by the OS; read the real value back.
                const addr = server.address();
                port = addr.port;
            }
            for (let i = 0; i < options.path.length; i++) {
                if (options.path[i].startsWith('/')) {
                    options.path[i] = options.path[i].slice(1);
                }
                options.path[i] = `http://localhost:${port}/${options.path[i]}`;
            }
            options.staticHttpServerHost = `http://localhost:${port}/`;
        }
        if (process.env.LINKINATOR_DEBUG) {
            console.log(options);
        }
        const queue = new Queue({
            concurrency: options.concurrency || 100,
        });
        const results = [];
        // Hrefs that have been (or are currently being) checked.
        const initCache = new Set();
        // "url|parent" pairs already reported, to suppress duplicate results.
        const relationshipCache = new Set();
        // In-flight check promises keyed by href, so other parents can await them.
        const pendingChecks = new Map();
        // Per-host retry-after timestamps from 429 responses.
        const delayCache = new Map();
        // Per-URL retry counters for the retryErrors feature.
        const retryErrorsCache = new Map();
        for (const path of options.path) {
            const url = new URL(path);
            initCache.add(url.href);
            // Create a promise for this starting page so other pages can wait for it
            const crawlPromise = (async () => {
                await this.crawl({
                    url,
                    crawl: true,
                    checkOptions: options,
                    results,
                    cache: initCache,
                    relationshipCache,
                    pendingChecks,
                    delayCache,
                    retryErrorsCache,
                    queue,
                    rootPath: path,
                    retry: Boolean(options_.retry),
                    retryErrors: Boolean(options_.retryErrors),
                    retryErrorsCount: options_.retryErrorsCount ?? 5,
                    retryErrorsJitter: options_.retryErrorsJitter ?? 3000,
                });
            })();
            // Store the promise
            pendingChecks.set(url.href, crawlPromise);
            // Queue the crawl
            queue.add(() => crawlPromise);
        }
        await queue.onIdle();
        const result = {
            links: results,
            passed: results.filter((x) => x.state === LinkState.BROKEN).length === 0,
        };
        if (server) {
            await stopWebServer(server);
        }
        return result;
    }
    /**
     * Crawl a given url with the provided options.
     * May re-queue itself (via options.queue) for retry/delay handling, in
     * which case it returns early without pushing a result.
     * @param options List of options used to do the crawl
     * @private
     * @returns A list of crawl results consisting of urls and status codes
     */
    async crawl(options) {
        // Apply any regex url replacements
        if (options.checkOptions.urlRewriteExpressions) {
            for (const exp of options.checkOptions.urlRewriteExpressions) {
                const newUrl = options.url.href.replace(exp.pattern, exp.replacement);
                if (options.url.href !== newUrl) {
                    options.url.href = newUrl;
                }
            }
        }
        // Explicitly skip non-http[s] links before making the request
        const proto = options.url.protocol;
        if (proto !== 'http:' && proto !== 'https:') {
            const r = {
                url: mapUrl(options.url.href, options.checkOptions),
                status: 0,
                state: LinkState.SKIPPED,
                parent: mapUrl(options.parent, options.checkOptions),
            };
            options.results.push(r);
            this.emit('link', r);
            return;
        }
        // Check for a user-configured function to filter out links
        if (typeof options.checkOptions.linksToSkip === 'function' &&
            (await options.checkOptions.linksToSkip(options.url.href))) {
            const result = {
                url: mapUrl(options.url.href, options.checkOptions),
                state: LinkState.SKIPPED,
                parent: options.parent,
            };
            options.results.push(result);
            this.emit('link', result);
            return;
        }
        // Check for a user-configured array of link regular expressions that should be skipped
        if (Array.isArray(options.checkOptions.linksToSkip)) {
            const skips = options.checkOptions.linksToSkip
                .map((linkToSkip) => {
                return new RegExp(linkToSkip).test(options.url.href);
            })
                .filter(Boolean);
            if (skips.length > 0) {
                const result = {
                    url: mapUrl(options.url.href, options.checkOptions),
                    state: LinkState.SKIPPED,
                    parent: mapUrl(options.parent, options.checkOptions),
                };
                options.results.push(result);
                this.emit('link', result);
                return;
            }
        }
        // Check if this host has been marked for delay due to 429
        if (options.delayCache.has(options.url.host)) {
            const timeout = options.delayCache.get(options.url.host);
            if (timeout === undefined) {
                throw new Error('timeout not found');
            }
            if (timeout > Date.now()) {
                // Re-queue this crawl to run after the host's delay expires.
                options.queue.add(async () => {
                    await this.crawl(options);
                }, {
                    delay: timeout - Date.now(),
                });
                return;
            }
        }
        // Perform a HEAD or GET request based on the need to crawl
        let status = 0;
        let state = LinkState.BROKEN;
        let shouldRecurse = false;
        let response;
        const failures = [];
        const originalUrl = options.url.href;
        const redirectMode = options.checkOptions.redirects === 'error' ? 'manual' : 'follow';
        try {
            response = await makeRequest(options.crawl ? 'GET' : 'HEAD', options.url.href, {
                headers: options.checkOptions.headers,
                timeout: options.checkOptions.timeout,
                redirect: redirectMode,
                allowInsecureCerts: options.checkOptions.allowInsecureCerts,
            });
            if (this.shouldRetryAfter(response, options)) {
                return;
            }
            // If we got an HTTP 405, the server may not like HEAD. GET instead!
            if (response.status === 405) {
                response = await makeRequest('GET', options.url.href, {
                    headers: options.checkOptions.headers,
                    timeout: options.checkOptions.timeout,
                    redirect: redirectMode,
                    allowInsecureCerts: options.checkOptions.allowInsecureCerts,
                });
                if (this.shouldRetryAfter(response, options)) {
                    return;
                }
            }
        }
        catch (error) {
            // Request failure: invalid domain name, etc.
            // this also occasionally catches too many redirects, but is still valid (e.g. https://www.ebay.com)
            // for this reason, we also try doing a GET below to see if the link is valid
            failures.push(error);
        }
        try {
            // Some sites don't respond well to HEAD requests, even if they don't return a 405.
            // This is a last gasp effort to see if the link is valid.
            if ((response === undefined ||
                response.status < 200 ||
                response.status >= 300) &&
                !options.crawl) {
                response = await makeRequest('GET', options.url.href, {
                    headers: options.checkOptions.headers,
                    timeout: options.checkOptions.timeout,
                    redirect: redirectMode,
                    allowInsecureCerts: options.checkOptions.allowInsecureCerts,
                });
                if (this.shouldRetryAfter(response, options)) {
                    return;
                }
            }
        }
        catch (error) {
            failures.push(error);
            // Catch the next failure
        }
        if (response !== undefined) {
            status = response.status;
            shouldRecurse =
                isHtml(response) ||
                    (isCss(response) && options.checkOptions.checkCss === true);
        }
        // If we want to recurse into a CSS file and we used HEAD, we need to do a GET
        // to get the body for parsing URLs (only if checkCss is enabled)
        if (shouldRecurse &&
            response !== undefined &&
            isCss(response) &&
            !response.body &&
            options.crawl &&
            options.checkOptions.checkCss) {
            try {
                response = await makeRequest('GET', options.url.href, {
                    headers: options.checkOptions.headers,
                    timeout: options.checkOptions.timeout,
                    redirect: redirectMode,
                    allowInsecureCerts: options.checkOptions.allowInsecureCerts,
                });
                if (response !== undefined) {
                    status = response.status;
                }
            }
            catch (error) {
                failures.push(error);
            }
        }
        // If we need to check fragments and we used HEAD, we need to do a GET
        // to get the HTML body for parsing fragment IDs
        if (options.checkOptions.checkFragments &&
            response !== undefined &&
            isHtml(response) &&
            !response.body) {
            const fragmentsToCheck = this.fragmentsToCheck.get(options.url.href);
            if (fragmentsToCheck && fragmentsToCheck.size > 0) {
                try {
                    response = await makeRequest('GET', options.url.href, {
                        headers: options.checkOptions.headers,
                        timeout: options.checkOptions.timeout,
                        redirect: redirectMode,
                        allowInsecureCerts: options.checkOptions.allowInsecureCerts,
                    });
                    if (response !== undefined) {
                        status = response.status;
                    }
                }
                catch (error) {
                    failures.push(error);
                }
            }
        }
        // If retryErrors is enabled, retry 5xx and 0 status (which indicates
        // a network error likely occurred) or 429 without retry-after data:
        if (this.shouldRetryOnError(status, options)) {
            return;
        }
        // Detect if this was a redirect
        const redirect = detectRedirect(status, originalUrl, response);
        // Check for custom status code actions first (highest priority)
        const customAction = getStatusCodeAction(status, options.checkOptions.statusCodes);
        if (customAction === 'ok') {
            // Treat as success
            state = LinkState.OK;
        }
        else if (customAction === 'warn') {
            // Treat as success but emit warning
            state = LinkState.OK;
            this.emit('statusCodeWarning', {
                url: originalUrl,
                status,
            });
        }
        else if (customAction === 'skip') {
            // Skip this link entirely
            state = LinkState.SKIPPED;
        }
        else if (customAction === 'error') {
            // Force failure
            state = LinkState.BROKEN;
            if (response !== undefined) {
                failures.push(response);
            }
        }
        // Special handling for bot protection responses
        // Status 999: Used by LinkedIn and other sites to block automated requests
        // Status 403 with cf-mitigated: Cloudflare bot protection challenge
        // Since we cannot distinguish between valid and invalid URLs when blocked,
        // treat these as skipped rather than broken.
        else if (status === 999) {
            state = LinkState.SKIPPED;
        }
        else if (status === 403 &&
            response !== undefined &&
            response.headers['cf-mitigated']) {
            state = LinkState.SKIPPED;
        }
        // Handle 'error' mode - treat any redirect as broken
        else if (options.checkOptions.redirects === 'error' &&
            redirect.isRedirect) {
            state = LinkState.BROKEN;
            const targetInfo = redirect.targetUrl ? ` to ${redirect.targetUrl}` : '';
            failures.push({
                status,
                headers: response?.headers || {},
            });
            failures.push(new Error(`Redirect detected (${originalUrl}${targetInfo}) but redirects are disabled`));
        }
        // Handle 'warn' mode - allow but warn on redirects
        else if (options.checkOptions.redirects === 'warn') {
            // Check if a redirect happened (either 3xx status or URL changed)
            if (redirect.isRedirect || redirect.wasFollowed) {
                // Emit warning about redirect
                this.emit('redirect', {
                    url: originalUrl,
                    targetUrl: redirect.targetUrl,
                    // Report actual redirect status if we have it, otherwise 200
                    status: redirect.isRedirect ? status : 200,
                    isNonStandard: redirect.isNonStandard,
                });
            }
            // Still check final status for success/failure
            if (status >= 200 && status < 300) {
                state = LinkState.OK;
            }
            else if (redirect.isRedirect &&
                redirect.wasFollowed &&
                response?.body) {
                // Non-standard redirect with content - treat as OK even in warn mode
                state = LinkState.OK;
            }
            else if (response !== undefined) {
                failures.push(response);
            }
        }
        // Handle 'allow' mode (default) - accept 2xx or non-standard redirects with content
        else if (status >= 200 && status < 300) {
            state = LinkState.OK;
        }
        else if (redirect.isRedirect && redirect.wasFollowed && response?.body) {
            // Non-standard redirect with content - treat as OK in allow mode
            state = LinkState.OK;
        }
        else if (response !== undefined) {
            failures.push(response);
        }
        // Handle HTTPS enforcement
        // Skip enforcement for our own local static server since it can't use HTTPS
        const isHttpUrl = originalUrl.startsWith('http://');
        const isLocalStaticServer = options.checkOptions.staticHttpServerHost &&
            originalUrl.startsWith(options.checkOptions.staticHttpServerHost);
        if (isHttpUrl &&
            !isLocalStaticServer &&
            options.checkOptions.requireHttps === 'error') {
            // Treat HTTP as broken in error mode
            state = LinkState.BROKEN;
            failures.push(new Error(`HTTP link detected (${originalUrl}) but HTTPS is required`));
        }
        else if (isHttpUrl &&
            !isLocalStaticServer &&
            options.checkOptions.requireHttps === 'warn') {
            // Emit warning about HTTP link in warn mode
            this.emit('httpInsecure', {
                url: originalUrl,
            });
        }
        const result = {
            url: mapUrl(options.url.href, options.checkOptions),
            status,
            state,
            parent: mapUrl(options.parent, options.checkOptions),
            failureDetails: failures,
        };
        options.results.push(result);
        this.emit('link', result);
        // Check for fragment identifiers if needed (before we start crawling deeper)
        // Only validate fragments if the base URL returned a successful (2xx) response
        if (options.checkOptions.checkFragments &&
            response?.body &&
            isHtml(response) &&
            state === LinkState.OK) {
            const fragmentsToValidate = this.fragmentsToCheck.get(options.url.href);
            if (fragmentsToValidate && fragmentsToValidate.size > 0) {
                // Convert and buffer the response body
                const nodeStream = toNodeReadable(response.body);
                const htmlContent = await bufferStream(nodeStream);
                // Check if this is likely a soft 404 by looking for noindex/nofollow meta tags
                // Many soft 404 pages (pages that return 200 but show "Page Not Found") include these tags
                const htmlString = htmlContent.toString('utf-8');
                const isSoft404 = htmlString.includes('content="noindex') &&
                    htmlString.includes('nofollow');
                // Only validate fragments if this is NOT a soft 404
                if (!isSoft404) {
                    // Validate fragments
                    const validationResults = await validateFragments(htmlContent, fragmentsToValidate);
                    // Emit results for invalid fragments
                    for (const result of validationResults) {
                        if (!result.isValid) {
                            const fragmentResult = {
                                url: mapUrl(`${options.url.href}#${result.fragment}`, options.checkOptions),
                                status: response.status,
                                state: LinkState.BROKEN,
                                parent: mapUrl(options.parent, options.checkOptions),
                                failureDetails: [
                                    new Error(`Fragment identifier '#${result.fragment}' not found on page`),
                                ],
                            };
                            options.results.push(fragmentResult);
                            this.emit('link', fragmentResult);
                        }
                    }
                }
                // Create a new stream from the buffered content for link extraction
                const { Readable } = await import('node:stream');
                const linkStream = Readable.from([htmlContent]);
                response.body = linkStream;
            }
        }
        // If we need to go deeper, scan the next level of depth for links and crawl
        if (options.crawl && shouldRecurse) {
            this.emit('pagestart', options.url);
            let urlResults = [];
            let htmlContentForFragments;
            if (response?.body) {
                // Convert to Node.js Readable stream (handles both Web and Node.js streams)
                const nodeStream = toNodeReadable(response.body);
                // Use the final URL after redirects (if available) as the base for resolving
                // relative links. This ensures relative links are resolved correctly even when
                // the original URL doesn't have a trailing slash but redirects to one.
                let baseUrl = response.url || options.url.href;
                // Fix for issue #374: Normalize the base URL to ensure relative links resolve
                // correctly. See normalizeBaseUrl() in url-utils.ts for details.
                if (isHtml(response)) {
                    baseUrl = normalizeBaseUrl(baseUrl, options.checkOptions.cleanUrls);
                }
                // Parse HTML or CSS depending on content type
                if (isHtml(response)) {
                    // If we're checking fragments, buffer the HTML content so we can validate
                    // same-page fragments after extracting links
                    if (options.checkOptions.checkFragments) {
                        htmlContentForFragments = await bufferStream(nodeStream);
                        // Create a new stream from the buffer for link extraction
                        const { Readable } = await import('node:stream');
                        const linkStream = Readable.from([htmlContentForFragments]);
                        urlResults = await getLinks(linkStream, baseUrl, options.checkOptions.checkCss);
                    }
                    else {
                        urlResults = await getLinks(nodeStream, baseUrl, options.checkOptions.checkCss);
                    }
                }
                else if (isCss(response) && options.checkOptions.checkCss) {
                    urlResults = await getCssLinks(nodeStream, baseUrl);
                }
            }
            for (const result of urlResults) {
                // If there was some sort of problem parsing the link while
                // creating a new URL obj, treat it as a broken link.
                if (!result.url) {
                    const r = {
                        url: mapUrl(result.link, options.checkOptions),
                        status: 0,
                        state: LinkState.BROKEN,
                        parent: mapUrl(options.url.href, options.checkOptions),
                    };
                    options.results.push(r);
                    this.emit('link', r);
                    continue;
                }
                // Track fragments that need validation if checkFragments is enabled
                if (options.checkOptions.checkFragments &&
                    result.fragment &&
                    result.fragment.length > 0) {
                    const urlKey = result.url.href;
                    if (!this.fragmentsToCheck.has(urlKey)) {
                        this.fragmentsToCheck.set(urlKey, new Set());
                    }
                    this.fragmentsToCheck.get(urlKey)?.add(result.fragment);
                }
                let crawl = options.checkOptions.recurse &&
                    result.url?.href.startsWith(options.rootPath);
                // Only crawl links that start with the same host
                if (crawl) {
                    try {
                        const pathUrl = new URL(options.rootPath);
                        crawl = result.url.host === pathUrl.host;
                    }
                    catch {
                        // ignore errors
                    }
                }
                // Create a unique key for this URL-parent relationship
                // Use the current page (options.url.href) as the parent in the relationship
                const relationshipKey = `${result.url.href}|${options.url.href}`;
                // Check if we've already reported this specific relationship
                if (options.relationshipCache.has(relationshipKey)) {
                    continue;
                }
                // Mark this relationship as seen
                options.relationshipCache.add(relationshipKey);
                // Check if URL has been HTTP-checked before
                const inCache = options.cache.has(result.url.href);
                if (!inCache) {
                    // URL hasn't been checked, add to cache and create a promise for the check
                    options.cache.add(result.url.href);
                    // Create a promise that will resolve when the check completes
                    const checkPromise = (async () => {
                        if (result.url === undefined) {
                            throw new Error('url is undefined');
                        }
                        await this.crawl({
                            url: result.url,
                            crawl: crawl ?? false,
                            cache: options.cache,
                            relationshipCache: options.relationshipCache,
                            pendingChecks: options.pendingChecks,
                            delayCache: options.delayCache,
                            retryErrorsCache: options.retryErrorsCache,
                            results: options.results,
                            checkOptions: options.checkOptions,
                            queue: options.queue,
                            parent: options.url.href,
                            rootPath: options.rootPath,
                            retry: options.retry,
                            retryErrors: options.retryErrors,
                            retryErrorsCount: options.retryErrorsCount,
                            retryErrorsJitter: options.retryErrorsJitter,
                        });
                    })();
                    // Store the promise so other parents can wait for it
                    options.pendingChecks.set(result.url.href, checkPromise);
                    // Queue the check
                    options.queue.add(() => checkPromise);
                }
                else {
                    // URL is being checked or has been checked
                    // Only report duplicate results for BROKEN links so users can see
                    // all parents that reference broken URLs. For OK/SKIPPED links,
                    // we don't need to report them multiple times as this causes
                    // massive result inflation for heavily interlinked sites.
                    const urlHref = result.url.href;
                    const parentHref = options.url.href;
                    const pendingCheck = options.pendingChecks.get(urlHref);
                    // Queue the reuse operation to check if the link is broken
                    options.queue.add(async () => {
                        // If there's a pending check, wait for it
                        if (pendingCheck) {
                            await pendingCheck;
                        }
                        // Now the result should be in the results array
                        const cachedResult = options.results.find((r) => r.url === mapUrl(urlHref, options.checkOptions));
                        // Only emit duplicate results for BROKEN links
                        if (cachedResult && cachedResult.state === LinkState.BROKEN) {
                            const reusedResult = {
                                url: cachedResult.url,
                                status: cachedResult.status,
                                state: cachedResult.state,
                                parent: mapUrl(parentHref, options.checkOptions),
                                failureDetails: cachedResult.failureDetails,
                            };
                            options.results.push(reusedResult);
                            this.emit('link', reusedResult);
                        }
                    });
                }
            }
            // Validate same-page fragments that were found during link extraction
            // These fragments reference the current page and need immediate validation
            // since the page won't be checked again (it's already in the cache)
            if (options.checkOptions.checkFragments &&
                htmlContentForFragments &&
                response &&
                isHtml(response) &&
                state === LinkState.OK) {
                const samePageFragments = this.fragmentsToCheck.get(options.url.href);
                if (samePageFragments && samePageFragments.size > 0) {
                    const validationResults = await validateFragments(htmlContentForFragments, samePageFragments);
                    // Emit results for invalid same-page fragments
                    for (const result of validationResults) {
                        if (!result.isValid) {
                            const fragmentResult = {
                                url: mapUrl(`${options.url.href}#${result.fragment}`, options.checkOptions),
                                status: response.status,
                                state: LinkState.BROKEN,
                                parent: mapUrl(options.parent, options.checkOptions),
                                failureDetails: [
                                    new Error(`Fragment identifier '#${result.fragment}' not found on page`),
                                ],
                            };
                            options.results.push(fragmentResult);
                            this.emit('link', fragmentResult);
                        }
                    }
                    // Clear the validated fragments to avoid duplicate validation
                    this.fragmentsToCheck.delete(options.url.href);
                }
            }
        }
        // Drain any unconsumed response body to release the connection back to the pool.
        // This is critical for preventing port exhaustion - if the body isn't consumed,
        // the underlying TCP connection may not be reused.
        await drainStream(response?.body);
    }
    /**
     * Parse the retry-after header value into a timestamp.
     * Supports standard formats (seconds, HTTP date) and non-standard formats (30s, 1m30s).
     * @param retryAfterRaw Raw retry-after header value
     * @returns Timestamp in milliseconds when to retry, or NaN if invalid
     */
    parseRetryAfter(retryAfterRaw) {
        // Try parsing as seconds
        let retryAfter = Number(retryAfterRaw) * 1000 + Date.now();
        if (!Number.isNaN(retryAfter))
            return retryAfter;
        // Try parsing as HTTP date
        retryAfter = Date.parse(retryAfterRaw);
        if (!Number.isNaN(retryAfter))
            return retryAfter;
        // Handle non-standard formats like "30s" or "1m30s"
        const matches = retryAfterRaw.match(/^(?:(\d+)m)?(\d+)s$/);
        if (!matches)
            return Number.NaN;
        return ((Number(matches[1] || 0) * 60 + Number(matches[2])) * 1000 + Date.now());
    }
    /**
     * Check the incoming response for a `retry-after` header. If present,
     * and if the status was an HTTP 429, calculate the date at which this
     * request should be retried. Ensure the delayCache knows that we're
     * going to wait on requests for this entire host.
     * @param response HttpResponse returned from the request
     * @param options CrawlOptions used during this request
     * @returns True if the request was re-queued for a later retry
     */
    shouldRetryAfter(response, options) {
        if (!options.retry) {
            return false;
        }
        const retryAfterRaw = response.headers['retry-after'];
        if (response.status !== 429 || !retryAfterRaw) {
            return false;
        }
        const retryAfter = this.parseRetryAfter(retryAfterRaw);
        if (Number.isNaN(retryAfter)) {
            return false;
        }
        // Check to see if there is already a request to wait for this host
        const currentTimeout = options.delayCache.get(options.url.host);
        if (currentTimeout !== undefined) {
            // Use whichever time is higher in the cache
            if (retryAfter > currentTimeout) {
                options.delayCache.set(options.url.host, retryAfter);
            }
        }
        else {
            options.delayCache.set(options.url.host, retryAfter);
        }
        // Re-queue this crawl to run once the retry-after window has passed.
        options.queue.add(async () => {
            await this.crawl(options);
        }, {
            delay: retryAfter - Date.now(),
        });
        const retryDetails = {
            url: options.url.href,
            status: response.status,
            secondsUntilRetry: Math.round((retryAfter - Date.now()) / 1000),
        };
        this.emit('retry', retryDetails);
        return true;
    }
    /**
     * If the response is a 5xx, synthetic 0 or 429 without retry-after header retry N times.
     * There are cases where we can get 429 but without retry-after data, for those cases we
     * are going to handle it as error so we can retry N times.
     * @param status Status returned by request or 0 if request threw.
     * @param options CrawlOptions used during this request
     * @returns True if the request was re-queued for a later retry
     */
    shouldRetryOnError(status, options) {
        const maxRetries = options.retryErrorsCount;
        if (!options.retryErrors) {
            return false;
        }
        // Only retry 0 and >5xx or 429 without retry-after header status codes:
        if (status > 0 && status < 500 && status !== 429) {
            return false;
        }
        // Check to see if there is already a request to wait for this URL:
        let currentRetries = 1;
        const cachedRetries = options.retryErrorsCache.get(options.url.href);
        if (cachedRetries !== undefined) {
            // Use whichever time is higher in the cache
            currentRetries = cachedRetries;
            if (currentRetries > maxRetries)
                return false;
            options.retryErrorsCache.set(options.url.href, currentRetries + 1);
        }
        else {
            options.retryErrorsCache.set(options.url.href, 1);
        }
        // Use exponential backoff algorithm to take pressure off upstream service:
        const retryAfter = 2 ** currentRetries * 1000 + Math.random() * options.retryErrorsJitter;
        options.queue.add(async () => {
            await this.crawl(options);
        }, {
            delay: retryAfter,
        });
        const retryDetails = {
            url: options.url.href,
            status,
            secondsUntilRetry: Math.round(retryAfter / 1000),
        };
        this.emit('retry', retryDetails);
        return true;
    }
}
/**
 * Convenience method to perform a scan with a throwaway LinkChecker.
 * @param options CheckOptions to be passed on
 * @returns The crawl result (all checked links plus the overall passed flag)
 */
export async function check(options) {
    return await new LinkChecker().check(options);
}
/**
 * Checks to see if a given source is HTML.
 * Matches both `text/html` and `application/xhtml+xml` content types,
 * ignoring any parameters such as `; charset=utf-8`.
 * @param {object} response Page response.
 * @returns {boolean}
 */
function isHtml(response) {
    const contentType = response.headers['content-type'] || '';
    // The previous regexes carried a pointless /g flag (stateful lastIndex on
    // .test) and redundant Boolean() wrappers; the patterns were literal
    // strings, so a plain substring check is equivalent and clearer.
    return (contentType.includes('text/html') ||
        contentType.includes('application/xhtml+xml'));
}
/**
 * Checks to see if a given source is CSS (`text/css` content type),
 * ignoring any parameters such as `; charset=utf-8`.
 * @param {object} response Page response.
 * @returns {boolean}
 */
function isCss(response) {
    const contentType = response.headers['content-type'] || '';
    // Literal substring check replaces the former /g regex + Boolean() wrap.
    return contentType.includes('text/css');
}
/**
 * When running a local static web server for the user, translate paths from
 * the Url generated back to something closer to a local filesystem path.
 * @example
 * http://localhost:0000/test/route/README.md => test/route/README.md
 * @param url The url that was checked
 * @param options Original CheckOptions passed into the client
 * @returns The translated URL, or the input unchanged when no translation applies
 */
function mapUrl(url, options) {
    // Falsy URLs (undefined/empty) pass straight through.
    if (!url) {
        return url;
    }
    const serverHost = options?.staticHttpServerHost;
    // Only URLs that came from our own synthetic static server get rewritten.
    if (!serverHost?.length || !url.startsWith(serverHost)) {
        return url;
    }
    // Trim the http://localhost:<port>/ prefix.
    let mapped = url.slice(serverHost.length);
    // Add the full filesystem path back if one was configured.
    if (options?.syntheticServerRoot?.length) {
        mapped = path.join(options.syntheticServerRoot, mapped);
    }
    // The server root itself maps to "./".
    return mapped === '' ? `.${path.sep}` : mapped;
}
/**
 * Helper function to make HTTP requests using native fetch.
 * @param method HTTP method (GET or HEAD)
 * @param url URL to request
 * @param options Additional options (headers, timeout, redirect, allowInsecureCerts)
 * @returns Response with status, headers (plain object), body stream, and final URL
 */
async function makeRequest(method, url, options = {}) {
    // Build browser-like headers to avoid bot detection; caller headers win.
    const defaultHeaders = {
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Cache-Control': 'no-cache',
        Pragma: 'no-cache',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Upgrade-Insecure-Requests': '1',
    };
    const requestOptions = {
        method,
        headers: { ...defaultHeaders, ...options.headers },
        redirect: options.redirect ?? 'follow',
    };
    if (options.timeout) {
        // Abort the request once the configured timeout elapses.
        requestOptions.signal = AbortSignal.timeout(options.timeout);
    }
    // Pick the transport:
    // - Insecure-cert requests must use undiciFetch with a custom dispatcher
    //   because the global dispatcher cannot disable certificate validation.
    //   A single shared agent is reused to prevent port exhaustion.
    // - Proxy requests must use undiciFetch with a ProxyAgent, since Node's
    //   native fetch does not read http_proxy / https_proxy automatically.
    // - Everything else goes through native fetch and the global dispatcher,
    //   which keeps the default 'node' User-Agent and lets tests mock via
    //   setGlobalDispatcher.
    const proxyUrl = getProxyUrl();
    let response;
    if (options.allowInsecureCerts) {
        response = await undiciFetch(url, {
            ...requestOptions,
            dispatcher: getSharedAgent(true),
        });
    }
    else if (proxyUrl) {
        response = await undiciFetch(url, {
            ...requestOptions,
            dispatcher: getSharedProxyAgent(proxyUrl),
        });
    }
    else {
        response = await fetch(url, requestOptions);
    }
    // Flatten the Headers object into a plain key/value record.
    const headers = {};
    for (const [key, value] of response.headers) {
        headers[key] = value;
    }
    return {
        status: response.status,
        headers,
        body: (response.body ?? undefined),
        url: response.url,
    };
}
/**
 * Checks if a status code matches a pattern (e.g., "403", "4xx", "5xx").
 *
 * @param status - HTTP status code to check
 * @param pattern - Exact code like "403" or class wildcard like "4xx"
 * @returns True if the status matches the pattern
 */
function matchesStatusCodePattern(status, pattern) {
    // Exact numeric match, e.g. "403".
    if (String(status) === pattern) {
        return true;
    }
    // Class wildcard must be exactly three characters ending in "xx".
    const isClassPattern = pattern.length === 3 && pattern.endsWith('xx');
    if (!isClassPattern) {
        return false;
    }
    // Compare the hundreds digit of the status with the pattern's first char.
    return pattern[0] === Math.floor(status / 100).toString();
}
/**
 * Gets the configured action for a given status code.
 * Checks exact matches first, then patterns (4xx, 5xx).
 *
 * @param status - HTTP status code
 * @param statusCodes - Configuration mapping status codes/patterns to actions
 * @returns The action to take, or undefined if no match
 */
function getStatusCodeAction(status, statusCodes) {
    // No configuration means no custom action.
    if (!statusCodes) {
        return undefined;
    }
    // Exact match (e.g. "403") takes priority over class patterns.
    const exactMatch = statusCodes[String(status)];
    if (exactMatch) {
        return exactMatch;
    }
    // Fall back to the first configured pattern (e.g. "4xx", "5xx") that matches.
    const matched = Object.entries(statusCodes).find(([pattern]) => matchesStatusCodePattern(status, pattern));
    return matched?.[1];
}
/**
 * Helper function to detect if a redirect occurred.
 * @param status HTTP status code of the response we ended up with
 * @param originalUrl Original URL requested
 * @param response HTTP response object (may be undefined if the request threw)
 * @returns Redirect detection details
 */
function detectRedirect(status, originalUrl, response) {
    const isRedirect = status >= 300 && status < 400;
    // The fetch layer rewrites response.url to the final URL when it follows
    // a redirect, so a URL mismatch indicates a followed redirect.
    const finalUrlDiffers = response?.url && response.url !== originalUrl;
    const hasLocation = Boolean(response?.headers.location);
    const hasBody = response?.body !== undefined;
    return {
        isRedirect,
        // Followed: either the final URL changed, or a 3xx still carried a body.
        wasFollowed: Boolean(finalUrlDiffers || (isRedirect && hasBody)),
        // Non-standard redirect: a 3xx response without a Location header.
        isNonStandard: isRedirect && !hasLocation,
        targetUrl: response?.url || response?.headers.location,
    };
}
}