UNPKG

link-checker-cli

Version:

CLI tool to check for broken links in a website or project

198 lines (197 loc) 8.2 kB
import { isLink } from "../utils.js"; import { parserHtml } from "./parserHTML.js"; import { promisePool } from '../promisePool.js'; import { parserXml } from "./ParserXML.js"; import { render } from "hanji"; import chalk from "chalk"; export class LinksProcessor { constructor(size) { this.pageIds = new Map(); this.uniqueLinks = new Set(); this.history = new Set(); this.poolSize = 10; this.getAllSitemapLinks = async (link, view) => { if (!isLink(link)) return []; if (this.history.has(link)) return []; if (view) view.setCounter(this.uniqueLinks.size); const url = new URL(link); const siteMapResponses = []; if (!url.pathname.endsWith(".xml")) { const sitemapXmlUrl = `${url.origin}/sitemap.xml`; const sitemapXmlResponse = await fetch(sitemapXmlUrl); if (sitemapXmlResponse.ok) siteMapResponses.push(sitemapXmlResponse); if (siteMapResponses.length === 0) { const sitemapUrl = `${url.origin}/sitemap`; const siteMapResponse = await fetch(sitemapUrl); if (siteMapResponse.ok) siteMapResponses.push(siteMapResponse); } if (siteMapResponses.length === 0) { const robotsTxt = `${url.origin}/robots.txt`; const sitemaps = await this.getLinksFromRobotsTxt(robotsTxt); for (const sitemap of sitemaps) { const response = await fetch(sitemap); if (response.ok) siteMapResponses.push(response); } } if (siteMapResponses.length === 0) return []; } else if (url.pathname.endsWith("robots.txt")) { const sitemaps = await this.getLinksFromRobotsTxt(link); for (const sitemap of sitemaps) { const response = await fetch(sitemap); if (response.ok) siteMapResponses.push(response); } } else { const response = await fetch(link); if (response.ok) siteMapResponses.push(response); } let sitemapLinks = []; for (const response of siteMapResponses) { const data = await response.text(); sitemapLinks = sitemapLinks.concat(parserXml.getLinks(data)); } const nextLinks = []; for (const { value } of sitemapLinks) { if (!this.uniqueLinks.has(value)) { this.uniqueLinks.add(value); if (value.endsWith(".xml")) nextLinks.push(value); } } await promisePool({ items: nextLinks, concurrency: this.poolSize, handler: async (link) => { const links = await this.getAllSitemapLinks(link, view); sitemapLinks = sitemapLinks.concat(links); } }); return sitemapLinks; }; this.getAllLinks = async (link, recursive, view) => { if (this.history.has(link)) return []; let result = []; let responseText = null; try { const response = await fetch(link); if (response.ok) responseText = await response.text(); } catch (err) { render(chalk.red(`ERROR fetching link ${link}`)); console.log(err); } const url = new URL(link); if (responseText) { this.history.add(link); const body = responseText; const { links, ids } = parserHtml.parsePage(body, url); this.pageIds.set(link, ids); const internalLinks = []; for (const link of links) { const { value } = link; if (link.type === "internal" && !this.history.has(value)) { internalLinks.push(link); } if (!this.uniqueLinks.has(value)) { this.uniqueLinks.add(link.value); result.push(link); } ; } if (view) view.setCounter(this.uniqueLinks.size); if (recursive) { await promisePool({ items: internalLinks, concurrency: this.poolSize, handler: async (internalLink) => { const links = await this.getAllLinks(internalLink.value, recursive, view); result = result.concat(links); } }); } } return result; }; this.checkAllLinks = async (links, included, view) => { const brokenLinks = []; let validCount = 0; let excluded = 0; await promisePool({ items: links, concurrency: this.poolSize, handler: async (link) => { if (included.includes(link.type)) { const res = await this.checkLink(link, undefined, view); if (!res.isValid) brokenLinks.push(res); else validCount++; } else excluded++; } }); return { validCount, brokenLinks, excluded }; }; this.checkLink = async (link, userAgent, view) => { const { value, type } = link; const defaultUserAgent = "Broken links checker/1.0 (Node.js; is link active)"; try { const headers = { "User-Agent": userAgent ?? defaultUserAgent, "Cache-Control": "no-cache" }; if (type === "anchor") { const [site, anchor] = value.split("#"); const ids = this.pageIds.get(site) ?? []; const hasAnchor = ids.includes(anchor); const checkRes = hasAnchor ? { link: value, statusCode: 200, isValid: true, parent: link.parent } : { link: value, statusCode: 404, isValid: false, message: "Anchor section not found!", parent: link.parent }; return checkRes; } const response = await fetch(value, { headers, }); if (view) view.increment(); return { link: value, statusCode: response.status, isValid: response.status === 200 ? true : false, parent: link.parent }; } catch { if (view) view.increment(); return { link: value, statusCode: 500, isValid: false, parent: link.parent }; } }; this.getLinksFromRobotsTxt = async (url) => { const robotsTxtResponse = await fetch(url); const sitemaps = []; if (robotsTxtResponse.ok) { const robotsTxt = await robotsTxtResponse.text(); const sitemapRegex = /^sitemap:\s*(.+)$/i; robotsTxt.split("\n").forEach((line) => { const match = line.match(sitemapRegex); if (match && match[1]) { sitemaps.push(match[1].trim()); } }); } return sitemaps; }; this.poolSize = size; } }