link-checker-cli
Version:
CLI tool to check for broken links in a website or project
198 lines (197 loc) • 8.2 kB
JavaScript
import { isLink } from "../utils.js";
import { parserHtml } from "./parserHTML.js";
import { promisePool } from '../promisePool.js';
import { parserXml } from "./ParserXML.js";
import { render } from "hanji";
import chalk from "chalk";
/**
 * Crawls a site for links (through sitemaps and/or HTML pages) and checks
 * whether the collected links respond successfully.
 *
 * State shared by all methods of one instance:
 *  - `pageIds`     Map of page URL -> ids reported by `parserHtml.parsePage`
 *                  for that page (used to validate `#anchor` links).
 *  - `uniqueLinks` Set of every link value that has already been collected.
 *  - `history`     Set of page URLs that `getAllLinks` has already claimed.
 *  - `poolSize`    concurrency passed to `promisePool`.
 *
 * All methods are arrow functions assigned in the constructor, so they stay
 * bound to the instance even when passed around as bare callbacks.
 *
 * `view` arguments are optional progress reporters; when given they must
 * expose `setCounter(n)` and/or `increment()` as used below.
 */
export class LinksProcessor {
    /**
     * @param {number} [size=10] Maximum number of concurrent requests per pool.
     */
    constructor(size) {
        this.pageIds = new Map();
        this.uniqueLinks = new Set();
        this.history = new Set();
        // Fall back to 10 when no size is supplied instead of storing `undefined`.
        this.poolSize = size ?? 10;

        /**
         * Collects every link listed in the sitemap(s) reachable from `link`,
         * following nested `.xml` sitemaps recursively.
         *
         * Resolution order depends on the path of `link`:
         *  - ends with `robots.txt`: sitemaps declared in that file;
         *  - ends with `.xml`: the link itself is treated as a sitemap;
         *  - anything else: `<origin>/sitemap.xml`, then `<origin>/sitemap`,
         *    then the sitemaps declared in `<origin>/robots.txt`.
         *
         * @param {string} link Site, sitemap, or robots.txt URL.
         * @param {object} [view] Optional progress view (`setCounter`).
         * @returns {Promise<Array>} Entries produced by `parserXml.getLinks`
         *   (each is read here through its `value` property); empty when the
         *   link is rejected by `isLink`, already in `history`, or no sitemap
         *   could be fetched.
         * @throws Rejections from `fetch` are not caught here and propagate.
         */
        this.getAllSitemapLinks = async (link, view) => {
            if (!isLink(link)) {
                return [];
            }
            if (this.history.has(link)) {
                return [];
            }
            if (view) {
                view.setCounter(this.uniqueLinks.size);
            }

            const url = new URL(link);
            const siteMapResponses = [];

            // Fetches one candidate and keeps the response only when it is OK.
            const collect = async (target) => {
                const response = await fetch(target);
                if (response.ok) {
                    siteMapResponses.push(response);
                }
            };
            // Fetches every sitemap declared in the given robots.txt, in order.
            const collectFromRobots = async (robotsUrl) => {
                const declared = await this.getLinksFromRobotsTxt(robotsUrl);
                for (const sitemap of declared) {
                    await collect(sitemap);
                }
            };

            // robots.txt must be tested before ".xml": a robots.txt path never
            // ends with ".xml", so testing "not .xml" first would swallow it.
            if (url.pathname.endsWith("robots.txt")) {
                await collectFromRobots(link);
            }
            else if (url.pathname.endsWith(".xml")) {
                await collect(link);
            }
            else {
                await collect(`${url.origin}/sitemap.xml`);
                if (siteMapResponses.length === 0) {
                    await collect(`${url.origin}/sitemap`);
                }
                if (siteMapResponses.length === 0) {
                    await collectFromRobots(`${url.origin}/robots.txt`);
                }
                if (siteMapResponses.length === 0) {
                    return [];
                }
            }

            let sitemapLinks = [];
            for (const response of siteMapResponses) {
                const data = await response.text();
                sitemapLinks = sitemapLinks.concat(parserXml.getLinks(data));
            }

            // Only links seen for the first time are recorded; of those, the
            // ones ending in ".xml" are nested sitemaps to descend into.
            const nextLinks = [];
            for (const { value } of sitemapLinks) {
                if (!this.uniqueLinks.has(value)) {
                    this.uniqueLinks.add(value);
                    if (value.endsWith(".xml")) {
                        nextLinks.push(value);
                    }
                }
            }

            await promisePool({
                items: nextLinks,
                concurrency: this.poolSize,
                handler: async (nestedSitemap) => {
                    const nested = await this.getAllSitemapLinks(nestedSitemap, view);
                    sitemapLinks = sitemapLinks.concat(nested);
                },
            });
            return sitemapLinks;
        };

        /**
         * Fetches the page at `link`, extracts its links with
         * `parserHtml.parsePage`, and optionally crawls internal links
         * recursively.
         *
         * @param {string} link Page URL to fetch.
         * @param {boolean} recursive When truthy, links whose `type` is
         *   `"internal"` are crawled as well.
         * @param {object} [view] Optional progress view (`setCounter`).
         * @returns {Promise<Array>} Link objects (as produced by
         *   `parserHtml.parsePage`) whose `value` had not been collected
         *   before; empty when the page was already claimed, the fetch failed,
         *   or the response was not OK / had an empty body.
         */
        this.getAllLinks = async (link, recursive, view) => {
            if (this.history.has(link)) {
                return [];
            }
            // Claim the page BEFORE the first await. Pool workers run
            // concurrently, so marking it only after the fetch let several
            // workers pass the check above and download the same page.
            this.history.add(link);

            let responseText = null;
            try {
                const response = await fetch(link);
                if (response.ok) {
                    responseText = await response.text();
                }
            }
            catch (err) {
                render(chalk.red(`ERROR fetching link ${link}`));
                console.log(err);
            }
            if (!responseText) {
                return [];
            }

            // Built only after a successful fetch, so an invalid `link` (whose
            // fetch error was already reported above) cannot throw here.
            const url = new URL(link);
            const { links, ids } = parserHtml.parsePage(responseText, url);
            this.pageIds.set(link, ids);

            let result = [];
            const internalLinks = [];
            for (const found of links) {
                const { value } = found;
                if (found.type === "internal" && !this.history.has(value)) {
                    internalLinks.push(found);
                }
                if (!this.uniqueLinks.has(value)) {
                    this.uniqueLinks.add(value);
                    result.push(found);
                }
            }
            if (view) {
                view.setCounter(this.uniqueLinks.size);
            }

            if (recursive) {
                await promisePool({
                    items: internalLinks,
                    concurrency: this.poolSize,
                    handler: async (internalLink) => {
                        const nested = await this.getAllLinks(internalLink.value, recursive, view);
                        result = result.concat(nested);
                    },
                });
            }
            return result;
        };

        /**
         * Checks every link whose `type` is listed in `included`.
         *
         * @param {Array} links Link objects (`value`, `type`, `parent`).
         * @param {Array<string>} included Link types that should be checked.
         * @param {object} [view] Optional progress view (`increment`).
         * @returns {Promise<{validCount: number, brokenLinks: Array, excluded: number}>}
         *   Count of valid links, the check results of invalid ones, and the
         *   number of links skipped because their type was not included.
         */
        this.checkAllLinks = async (links, included, view) => {
            const brokenLinks = [];
            let validCount = 0;
            let excluded = 0;
            await promisePool({
                items: links,
                concurrency: this.poolSize,
                handler: async (link) => {
                    if (!included.includes(link.type)) {
                        excluded++;
                        return;
                    }
                    const res = await this.checkLink(link, undefined, view);
                    if (res.isValid) {
                        validCount++;
                    }
                    else {
                        brokenLinks.push(res);
                    }
                },
            });
            return { validCount, brokenLinks, excluded };
        };

        /**
         * Checks a single link.
         *
         * Links of type `"anchor"` are validated offline against the ids
         * recorded in `pageIds` for the part of the URL before `#`; every
         * other link is requested with `fetch` and is valid only when the
         * final response status is exactly 200.
         *
         * @param {{value: string, type: string, parent: *}} link Link to check.
         * @param {string} [userAgent] Overrides the default User-Agent header.
         * @param {object} [view] Optional progress view (`increment`).
         * @returns {Promise<{link: string, statusCode: number, isValid: boolean, parent: *, message?: string}>}
         *   Never rejects for request failures: those are reported as status 500.
         */
        this.checkLink = async (link, userAgent, view) => {
            const { value, type } = link;
            const defaultUserAgent = "Broken links checker/1.0 (Node.js; is link active)";
            try {
                if (type === "anchor") {
                    const [site, anchor] = value.split("#");
                    const ids = this.pageIds.get(site) ?? [];
                    // NOTE(review): this path does not call view.increment(),
                    // unlike the network path below - confirm that is intended.
                    return ids.includes(anchor)
                        ? { link: value, statusCode: 200, isValid: true, parent: link.parent }
                        : { link: value, statusCode: 404, isValid: false, message: "Anchor section not found!", parent: link.parent };
                }
                const headers = { "User-Agent": userAgent ?? defaultUserAgent, "Cache-Control": "no-cache" };
                const response = await fetch(value, {
                    headers,
                });
                if (view) {
                    view.increment();
                }
                return {
                    link: value,
                    statusCode: response.status,
                    isValid: response.status === 200,
                    parent: link.parent,
                };
            }
            catch {
                // Any failure while checking is reported as a 500 result.
                if (view) {
                    view.increment();
                }
                return {
                    link: value,
                    statusCode: 500,
                    isValid: false,
                    parent: link.parent,
                };
            }
        };

        /**
         * Reads a robots.txt file and returns the URLs of its `Sitemap:`
         * directives (matched case-insensitively).
         *
         * @param {string} url robots.txt URL.
         * @returns {Promise<string[]>} Declared sitemap URLs in file order;
         *   empty when the response is not OK.
         * @throws Rejections from `fetch` are not caught here and propagate.
         */
        this.getLinksFromRobotsTxt = async (url) => {
            const robotsTxtResponse = await fetch(url);
            if (!robotsTxtResponse.ok) {
                return [];
            }
            const robotsTxt = await robotsTxtResponse.text();
            const sitemapRegex = /^sitemap:\s*(.+)$/i;
            const sitemaps = [];
            // Split on any line-ending style and trim each line: with a plain
            // "\n" split, the "\r" left by CRLF files stops `(.+)$` matching.
            for (const line of robotsTxt.split(/\r\n|\r|\n/)) {
                const match = line.trim().match(sitemapRegex);
                if (match && match[1]) {
                    sitemaps.push(match[1].trim());
                }
            }
            return sitemaps;
        };
    }
}