markdown-link-check
Version:
checks the all of the hyperlinks in a markdown text to determine if they are alive or dead
216 lines (179 loc) • 8.35 kB
JavaScript
const async = require('async');
const linkCheck = require('link-check');
const LinkCheckResult = require('link-check').LinkCheckResult;
const markdownLinkExtractor = require('markdown-link-extractor');
const ProgressBar = require('progress');
const envVarPatternMatcher = /(?<pattern>{{env\.(?<name>[a-zA-Z0-9\-_]+)}})/;
/*
* Performs some special replacements for the following patterns:
* - {{BASEURL}} - to be replaced with opts.projectBaseUrl
* - {{env.<env_var_name>}} - to be replaced with the environment variable specified with <env_var_name>
*/
function performSpecialReplacements(str, opts) {
// replace the `{{BASEURL}}` with the opts.projectBaseUrl. Helpful to build absolute urls "relative" to project roots
str = str.replace('{{BASEURL}}', opts.projectBaseUrl);
// replace {{env.<env_var_name>}} with the corresponding environment variable or an empty string if none is set.
var envVarMatch;
do {
envVarMatch = envVarPatternMatcher.exec(str);
if(!envVarMatch) {
break;
}
var envVarPattern = envVarMatch.groups.pattern;
var envVarName = envVarMatch.groups.name;
var envVarPatternReplacement = '';
if(envVarName in process.env) {
envVarPatternReplacement = process.env[envVarName];
}
str = str.replace(envVarPattern, envVarPatternReplacement);
// eslint-disable-next-line no-constant-condition
} while (true);
return str;
}
function removeCodeBlocks(markdown) {
return markdown.replace(/^```[\S\s]+?^```$/gm, '');
}
function extractHtmlSections(markdown) {
markdown =
// remove code blocks
removeCodeBlocks(markdown)
// remove HTML comments
.replace(/<!--[\S\s]+?-->/gm, '')
// remove single line code (if not escaped with "\")
.replace(/(?<!\\)`[\S\s]+?(?<!\\)`/gm, '');
const regexAllId = /<(?<tag>[^\s]+).*?id=["'](?<id>[^"']*?)["'].*?>/gmi;
const regexAName = /<a.*?name=["'](?<name>[^"']*?)["'].*?>/gmi;
const sections = []
.concat(Array.from(markdown.matchAll(regexAllId), (match) => match.groups.id))
.concat(Array.from(markdown.matchAll(regexAName), (match) => match.groups.name));
return sections
}
function extractSections(markdown) {
// First remove code blocks.
markdown = removeCodeBlocks(markdown);
const sectionTitles = markdown.match(/^#+ .*$/gm) || [];
const sections = sectionTitles.map(section =>
// The links are compared with the headings (simple text comparison).
// However, the links are url-encoded beforehand, so the headings
// have to also be encoded so that they can also be matched.
encodeURIComponent(
section
// replace links, the links can start with "./", "/", "http://", "https://" or "#"
// and keep the value of the text ($1)
.replace(/\[(.+)\]\(((?:\.?\/|https?:\/\/|#)[\w\d./?=#-]+)\)/, "$1")
// make everything (Unicode-aware) lower case
.toLowerCase()
// remove white spaces and "#" at the beginning
.replace(/^#+\s*/, '')
// remove everything that is NOT a (Unicode) Letter, (Unicode) Number decimal,
// (Unicode) Number letter, white space, underscore or hyphen
// https://ruby-doc.org/3.3.2/Regexp.html#class-Regexp-label-Unicode+Character+Categories
.replace(/[^\p{L}\p{Nd}\p{Nl}\s_\-`]/gu, "")
// remove sequences of *
.replace(/\*(?=.*)/gu, "")
// remove leftover backticks
.replace(/`/gu, "")
// Now replace remaining blanks with '-'
.replace(/\s/gu, "-")
)
);
var uniq = {};
for (var section of sections) {
if (section in uniq) {
uniq[section]++;
section = section + '-' + uniq[section];
}
uniq[section] = 0;
}
const uniqueSections = Object.keys(uniq) ?? [];
return uniqueSections;
}
module.exports = function markdownLinkCheck(markdown, opts, callback) {
if (arguments.length === 2 && typeof opts === 'function') {
// optional 'opts' not supplied.
callback = opts;
opts = {};
}
if(!opts.ignoreDisable) {
markdown = [
/(<!--[ \t]+markdown-link-check-disable[ \t]+-->[\S\s]*?<!--[ \t]+markdown-link-check-enable[ \t]+-->)/mg,
/(<!--[ \t]+markdown-link-check-disable[ \t]+-->[\S\s]*(?!<!--[ \t]+markdown-link-check-enable[ \t]+-->))/mg,
/(<!--[ \t]+markdown-link-check-disable-next-line[ \t]+-->\r?\n[^\r\n]*)/mg,
/([^\r\n]*<!--[ \t]+markdown-link-check-disable-line[ \t]+-->[^\r\n]*)/mg
].reduce(function(_markdown, disablePattern) {
return _markdown.replace(new RegExp(disablePattern), '');
}, markdown);
}
const links = markdownLinkExtractor(markdown);
const sections = extractSections(markdown).concat(extractHtmlSections(markdown));
const linksCollection = [...new Set(links)]
const bar = (opts.showProgressBar) ?
new ProgressBar('Checking... [:bar] :percent', {
complete: '=',
incomplete: ' ',
width: 25,
total: linksCollection.length
}) : undefined;
async.mapLimit(linksCollection, 2, function (link, callback) {
if (opts.ignorePatterns) {
const shouldIgnore = opts.ignorePatterns.some(function(ignorePattern) {
return ignorePattern.pattern instanceof RegExp ? ignorePattern.pattern.test(link) : (new RegExp(ignorePattern.pattern)).test(link) ? true : false;
});
if (shouldIgnore) {
const result = new LinkCheckResult(opts, link, 0, undefined);
result.status = 'ignored'; // custom status for ignored links
callback(null, result);
return;
}
}
if (opts.replacementPatterns) {
for (let replacementPattern of opts.replacementPatterns) {
let pattern = replacementPattern.pattern instanceof RegExp ? replacementPattern.pattern : new RegExp(replacementPattern.pattern, replacementPattern.global ? 'g' : '');
link = link.replace(pattern, performSpecialReplacements(replacementPattern.replacement, opts));
}
}
// Make sure it is not undefined and that the appropriate headers are always recalculated for a given link.
opts.headers = {};
if (opts.httpHeaders) {
for (const httpHeader of opts.httpHeaders) {
if (httpHeader.headers) {
for (const header of Object.keys(httpHeader.headers)) {
httpHeader.headers[header] = performSpecialReplacements(httpHeader.headers[header], opts);
}
}
for (const url of httpHeader.urls) {
if (link.startsWith(url)) {
Object.assign(opts.headers, httpHeader.headers);
// The headers of this httpHeader has been applied, the other URLs of this httpHeader don't need to be evaluated any further.
break;
}
}
}
}
let sectionLink = null;
if (link.startsWith('#')) {
sectionLink = link;
}
else if ('baseUrl' in opts && link.startsWith(opts.baseUrl)) {
if (link.substring(opts.baseUrl.length).match(/^\/*#/)) {
sectionLink = link.replace(/^[^#]+/, '');
}
}
if (sectionLink) {
const result = new LinkCheckResult(opts, sectionLink, sections.includes(sectionLink.substring(1)) ? 200 : 404, undefined);
callback(null, result);
return;
}
linkCheck(link, opts, function (err, result) {
if (opts.showProgressBar) {
bar.tick();
}
if (err) {
result = new LinkCheckResult(opts, link, 500, err);
result.status = 'error'; // custom status for errored links
}
callback(null, result);
});
}, callback);
};
;