remark-validate-links
Version:
remark plugin to validate links to headings and files
810 lines (699 loc) • 21.6 kB
JavaScript
/**
* @import {Hosts} from 'hosted-git-info'
* @import {} from 'mdast-util-to-hast'
* @import {Nodes, Resource, Root} from 'mdast'
* @import {FileSet} from 'unified-engine'
*/
/**
* @typedef {Map<string, Map<string, boolean>>} Landmarks
* Landmarks.
*/
/**
* @typedef Options
* Configuration.
* @property {string | false | null | undefined} [repository]
* URL to hosted Git (default: detected from Git remote);
* if you’re not in a Git repository, you must pass `false`;
* if the repository resolves to something npm understands as a Git host such
* as GitHub, GitLab, or Bitbucket, full URLs to that host (say,
* `https://github.com/remarkjs/remark-validate-links/readme.md#install`) are
* checked.
* @property {string | null | undefined} [root]
* Path to Git root folder (default: local Git folder);
* if both `root` and `repository` are nullish, the Git root is detected;
* if `root` is not given but `repository` is, `file.cwd` is used.
* @property {ReadonlyArray<RegExp | string> | null | undefined} [skipPathPatterns]
* List of patterns for *paths* that should be skipped;
* each absolute local path + hash will be tested against each pattern and
* will be ignored if `new RegExp(pattern).test(value) === true`;
* example value are then `/Users/tilde/path/to/repo/readme.md#some-heading`.
* @property {Readonly<UrlConfig> | null | undefined} [urlConfig]
* Config on how hosted Git works (default: detected from repo);
* `github.com`, `gitlab.com`, or `bitbucket.org` work automatically;
* otherwise, pass `urlConfig` manually.
*/
/**
* @callback Propose
* @param {string} value
* @param {ReadonlyArray<string>} dictionary
* @param {Readonly<ProposeOptions> | null | undefined} [options]
* @returns {string | undefined}
*/
/**
* @typedef ProposeOptions
* Configuration for `propose`.
* @property {number| null | undefined} [threshold]
* Threshold.
*/
/**
* @typedef Reference
* Reference to something.
* @property {string} filePath
* Path to file.
* @property {string | undefined} hash
* Hash.
*/
/**
* @typedef ReferenceInfo
* Info on a reference.
* @property {VFile} file
* File.
* @property {Readonly<Reference>} reference
* Reference.
* @property {ReadonlyArray<Readonly<Resources>>} nodes
* Nodes that reference it.
*/
/**
* @typedef {Extract<Nodes, Resource>} Resources
* Resources.
*/
/**
* @typedef State
* Info passed around.
* @property {string} base
* Folder of file.
* @property {string} path
* Path to file.
* @property {string | null | undefined} root
* Path to Git folder.
* @property {ReadonlyArray<RegExp>} skipPathPatterns
* List of patterns for paths that should be skipped.
* @property {Readonly<UrlConfig>} urlConfig
* Configuration.
*/
/**
* @typedef UrlConfig
* Hosted Git info.
*
* ###### Notes
*
* For this repository (`remarkjs/remark-validate-links` on GitHub)
* `urlConfig` looks as follows:
*
* ```js
* {
* // Domain of URLs:
* hostname: 'github.com',
* // Path prefix before files:
* prefix: '/remarkjs/remark-validate-links/blob/',
* // Prefix of headings:
* headingPrefix: '#',
* // Hash to top of markdown documents:
* topAnchor: '#readme',
* // Whether lines in files can be linked:
* lines: true
* }
* ```
*
* If this project were hosted on Bitbucket, it would be:
*
* ```js
* {
* hostname: 'bitbucket.org',
* prefix: '/remarkjs/remark-validate-links/src/',
* headingPrefix: '#markdown-header-',
* lines: false
* }
* ```
* @property {string | null | undefined} [headingPrefix]
* Prefix of headings (example: `'#'`, `'#markdown-header-'`).
* @property {string | null | undefined} [hostname]
* Domain of URLs (example: `'github.com'`, `'bitbucket.org'`).
* @property {boolean | null | undefined} [resolveAbsolutePathsInRepo]
* Whether absolute paths (`/x/y/z.md`) resolve relative to a repo.
* @property {boolean | null | undefined} [lines]
* Whether lines in files can be linked.
* @property {string | null | undefined} [prefix]
* Path prefix before files (example:
* `'/remarkjs/remark-validate-links/blob/'`,
* `'/remarkjs/remark-validate-links/src/'`).
* @property {string | null | undefined} [topAnchor]
* Hash to top of readme (example: `#readme`).
*/
import fs from 'node:fs/promises'
import path from 'node:path'
import GithubSlugger from 'github-slugger'
import hostedGitInfo from 'hosted-git-info'
import {toString} from 'mdast-util-to-string'
// @ts-expect-error: untyped.
import propose_ from 'propose'
import {visit} from 'unist-util-visit'
import {VFile} from 'vfile'
import {constants} from './constants.js'
import {checkFiles} from '#check-files'
import {findRepo} from '#find-repo'
const propose = /** @type {Propose} */ (propose_)
cliCompleter.pluginId = constants.sourceId
/** @type {Readonly<Partial<Record<Hosts, string>>>} */
const viewPaths = {bitbucket: 'src', github: 'blob', gitlab: 'blob'}
/** @type {Readonly<Partial<Record<Hosts, string>>>} */
const headingPrefixes = {
bitbucket: '#markdown-header-',
github: '#',
gitlab: '#'
}
/** @type {Readonly<Partial<Record<Hosts, string>>>} */
const topAnchors = {github: '#readme', gitlab: '#readme'}
/** @type {Readonly<Partial<Record<Hosts, boolean>>>} */
const lineLinks = {github: true, gitlab: true}
const slugger = new GithubSlugger()
const slash = '/'
const numberSign = '#'
const questionMark = '?'
const https = 'https:'
const http = 'http:'
const slashes = '//'
const lineExpression = /^#l\d/i
// List from: https://github.com/github/markup#markups
const readmeExtensions = new Set(['.markdown', '.mdown', '.md', '.mkdn'])
const readmeBasename = /^readme$/i
/**
* Check that markdown links and images point to existing local files and
* headings in a Git repo.
*
* > ⚠️ **Important**: The API in Node.js checks links to headings and files
* > but does not check whether headings in other files exist.
* > The API in browsers only checks links to headings in the same file.
* > The CLI can check everything.
*
* @param {Readonly<Options> | null | undefined} [options]
* Configuration (optional).
* @param {FileSet | null | undefined} [fileSet]
* File set (optional).
* @returns
* Transform.
*/
export default function remarkValidateLinks(options, fileSet) {
const settings = options || {}
const skipPathPatterns = settings.skipPathPatterns
? settings.skipPathPatterns.map(function (d) {
return typeof d === 'string' ? new RegExp(d) : d
})
: []
// Attach a `completer`.
if (fileSet) {
fileSet.use(cliCompleter)
}
/**
* Transform.
*
* @param {Root} tree
* Tree.
* @param {VFile} file
* File.
* @returns {Promise<void>}
* Nothing.
*
* Note: `void` needed because `unified` doesn’t seem to accept `undefined`.
*/
return async function (tree, file) {
/* c8 ignore next -- this yields `undefined` in browsers. */
const [repo, root] = (await findRepo(file, settings)) || []
let urlConfig = settings.urlConfig
if (!urlConfig) {
/** @type {UrlConfig} */
const config = {
headingPrefix: '#',
hostname: undefined,
lines: false,
prefix: '',
topAnchor: undefined
}
if (repo) {
const info = hostedGitInfo.fromUrl(repo)
if (info && info.type !== 'gist') {
if (info.type in viewPaths) {
config.prefix = '/' + info.path() + '/' + viewPaths[info.type] + '/'
}
if (info.type in headingPrefixes) {
config.headingPrefix = headingPrefixes[info.type]
}
if (info.type in lineLinks) {
config.lines = lineLinks[info.type]
}
if (info.type === 'github') {
config.resolveAbsolutePathsInRepo = true
}
if (info.type in topAnchors) {
config.topAnchor = topAnchors[info.type]
}
config.hostname = info.domain
}
}
urlConfig = config
}
const absolute = file.path ? path.resolve(file.cwd, file.path) : ''
const space = file.data
/** @type {Map<string, Map<string, Array<Resources>>>} */
const references = new Map()
/** @type {Landmarks} */
const landmarks = new Map()
/** @type {State} */
const state = {
base: absolute ? path.dirname(absolute) : file.cwd,
path: absolute,
root,
skipPathPatterns,
urlConfig
}
/** @type {Set<string>} */
const statted = new Set()
/** @type {Set<string>} */
const added = new Set()
/** @type {Array<Promise<void>>} */
const promises = []
space[constants.referenceId] = references
space[constants.landmarkId] = landmarks
addLandmarks(absolute, '')
slugger.reset()
visit(tree, function (node) {
const data = node.data || {}
const hProperties = data.hProperties || {}
// @ts-expect-error: accept a `data.id`, which is not standard mdast, but
// is here for historical reasons.
const dataId = /** @type {unknown} */ (data.id)
let id = String(hProperties.name || hProperties.id || dataId || '')
if (!id && node.type === 'heading') {
id = slugger.slug(
toString(node, {includeHtml: false, includeImageAlt: false})
)
}
if (id) {
addLandmarks(absolute, id)
}
if ('url' in node && node.url) {
const info = urlToPath(node.url, state, node.type)
if (info) {
const fp = info.filePath
const hash = info.hash
const together = fp + (hash ? '#' + hash : '')
if (
state.skipPathPatterns.some(function (skipPattern) {
return skipPattern.test(together)
})
) {
return
}
addReference(fp, '', node)
if (hash) {
if (fileSet || fp === absolute) {
addReference(fp, hash, node)
}
if (fileSet && fp && !statted.has(fp)) {
promises.push(addFile(fp))
}
}
}
}
})
await Promise.all(promises)
if (!fileSet) {
await checkAll([file])
}
/**
* @param {string} filePath
* Absolute path to file.
* @param {string} hash
* Hash.
* @returns {undefined}
* Nothing.
*/
function addLandmarks(filePath, hash) {
addLandmark(filePath, hash)
// Note: this may add marks too many anchors as defined.
// For example, if there is both a `readme.md` and a `readme.markdown` in a
// folder, both their landmarks will be defined for their parent folder.
// To solve this, we could check whichever sorts first, and ignore the
// others.
// This is an unlikely scenario though, and adds a lot of complexity, so
// we’re ignoring it.
if (readme(filePath)) {
addLandmark(path.dirname(filePath), hash)
}
}
/**
* @param {string} filePath
* Absolute path to file.
* @param {string} hash
* Hash.
* @returns {undefined}
* Nothing.
*/
function addLandmark(filePath, hash) {
let marks = landmarks.get(filePath)
if (!marks) {
marks = new Map()
landmarks.set(filePath, marks)
}
marks.set(hash, true)
}
/**
* @param {string} filePath
* Absolute path to file.
* @param {string} hash
* Hash.
* @param {Resources} node
* Node.
* @returns {undefined}
* Nothing.
*/
function addReference(filePath, hash, node) {
let referenceMap = references.get(filePath)
if (!referenceMap) {
referenceMap = new Map()
references.set(filePath, referenceMap)
}
let hashes = referenceMap.get(hash)
if (!hashes) {
hashes = []
referenceMap.set(hash, hashes)
}
hashes.push(node)
}
/**
* @param {string} filePath
* Absolute path to file.
* @returns {Promise<undefined>}
* Nothing.
*/
async function addFile(filePath) {
statted.add(filePath)
try {
const stats = await fs.stat(filePath)
if (stats.isDirectory()) {
/** @type {Array<string>} */
let entries = []
try {
entries = await fs.readdir(filePath)
/* c8 ignore next -- seems to never happen after a stat. */
} catch {}
const files = entries.sort()
/** @type {string | undefined} */
let file
for (const entry of files) {
if (readme(entry)) {
file = entry
break
}
}
// To do: test for no readme in folder.
// Else, there’s no readme that we can parse, so add the folder.
if (file) {
filePath = path.join(filePath, file)
statted.add(filePath)
}
}
} catch {}
if (fileSet && !added.has(filePath)) {
added.add(filePath)
fileSet.add(
new VFile({cwd: file.cwd, path: path.relative(file.cwd, filePath)})
)
}
}
}
}
/**
* Completer for the CLI (multiple files, supports parsing more files).
*
* @param {FileSet} set
* @returns {Promise<undefined>}
*/
async function cliCompleter(set) {
await checkAll(set.valueOf())
}
/**
* Completer for the CLI (multiple files, supports parsing more files).
*
* @param {ReadonlyArray<VFile>} files
* Files.
* @returns {Promise<undefined>}
* Nothing.
*/
async function checkAll(files) {
// Merge landmarks.
/** @type {Landmarks} */
const landmarks = new Map()
for (const file of files) {
const fileLandmarks = /** @type {Landmarks | undefined} */ (
file.data[constants.landmarkId]
)
if (fileLandmarks) {
for (const [filePath, marks] of fileLandmarks) {
landmarks.set(filePath, new Map(marks))
}
}
}
// Merge references.
/** @type {Map<string, Map<string, Array<ReferenceInfo>>>} */
const references = new Map()
for (const file of files) {
const fileReferences =
/** @type {Map<string, Map<string, Array<Resources>>> | undefined} */ (
file.data[constants.referenceId]
)
if (!fileReferences) {
continue
}
for (const [reference, internal] of fileReferences) {
let all = references.get(reference)
if (!all) {
all = new Map()
references.set(reference, all)
}
for (const [hash, nodes] of internal) {
let list = all.get(hash)
if (!list) {
list = []
all.set(hash, list)
}
list.push({
file,
nodes,
reference: {filePath: reference, hash}
})
}
}
}
// Access files to see whether they exist.
await checkFiles(landmarks, [...references.keys()])
/** @type {Array<ReferenceInfo>} */
const missing = []
for (const [key, referenceMap] of references) {
const lands = landmarks.get(key)
for (const [hash, infos] of referenceMap) {
/* c8 ignore next -- `else` can only happen in browser. */
const exists = lands ? lands.get(hash) : false
if (!exists) {
missing.push(...infos)
}
}
}
for (const reference of missing) {
warn(landmarks, reference)
}
}
/**
* @param {Landmarks} landmarks
* Landmarks.
* @param {ReferenceInfo} reference
* Reference.
* @returns {undefined}
* Nothing.
*/
function warn(landmarks, reference) {
const absolute = reference.file.path
? path.resolve(reference.file.cwd, reference.file.path)
: ''
const base = absolute ? path.dirname(absolute) : undefined
const filePath = base
? path.relative(base, reference.reference.filePath)
: reference.reference.filePath
const hash = reference.reference.hash
/** @type {Array<string>} */
const dictionary = []
/** @type {string} */
let reason
/** @type {string} */
let ruleId
if (hash) {
reason = 'Cannot find heading for `#' + hash + '`'
ruleId = constants.headingRuleId
if (base && path.join(base, filePath) !== absolute) {
reason += ' in `' + filePath + '`'
ruleId = constants.headingInFileRuleId
}
} else {
reason = 'Cannot find file `' + filePath + '`'
ruleId = constants.fileRuleId
}
const origin = [constants.sourceId, ruleId].join(':')
for (const [landmark, marks] of landmarks) {
// Only suggest if file exists.
if (!marks || !marks.get('')) {
continue
}
const relativeLandmark = base ? path.relative(base, landmark) : landmark
if (!hash) {
dictionary.push(relativeLandmark)
continue
}
if (relativeLandmark !== filePath) {
continue
}
for (const [subhash] of marks) {
if (subhash !== '') {
dictionary.push(subhash)
}
}
}
const suggestion = propose(hash || filePath, dictionary, {
threshold: 0.7
})
if (suggestion) {
reason += '; did you mean `' + suggestion + '`'
}
for (const node of reference.nodes) {
const message = reference.file.message(reason, {
place: node.position,
source: origin,
ruleId
})
message.url = 'https://github.com/remarkjs/remark-validate-links#readme'
}
}
/**
* @param {string} value
* URL.
* @param {State} state
* State.
* @param {Resources['type']} type
* Type of node (`'link'` or `'image'`).
* @returns {Reference | undefined}
* Reference.
*/
// eslint-disable-next-line complexity
function urlToPath(value, state, type) {
// Absolute paths: `/folder/example.md`.
if (value.charAt(0) === slash) {
if (!state.urlConfig.hostname) {
return
}
const pathname =
state.urlConfig.resolveAbsolutePathsInRepo && state.urlConfig.prefix
? state.urlConfig.prefix + 'main' + value
: value
// Create a URL.
value = https + slashes + state.urlConfig.hostname + pathname
}
/** @type {URL | undefined} */
let url
try {
url = new URL(value)
} catch {}
// URLs: `https://github.com/wooorm/test/blob/main/folder/example.md`.
if (url && state.root) {
// Exit if we don’t have hosted Git info or this is not a URL to the repo.
if (
!state.urlConfig.prefix ||
!state.urlConfig.hostname ||
(url.protocol !== https && url.protocol !== http) ||
url.hostname !== state.urlConfig.hostname ||
url.pathname.slice(0, state.urlConfig.prefix.length) !==
state.urlConfig.prefix
) {
return
}
value = url.pathname.slice(state.urlConfig.prefix.length)
// Things get interesting here: branches: `foo/bar/baz` could be `baz` on
// the `foo/bar` branch, or, `baz` in the `bar` folder on the `foo`
// branch.
// Currently, we’re ignoring this and just not supporting branches.
value = value.split(slash).slice(1).join(slash)
return normalize(
path.resolve(state.root, value + (type === 'image' ? '' : url.hash)),
state
)
}
// Remove the search: `?foo=bar`.
// But don’t remove stuff if it’s in the hash: `readme.md#heading?`.
let numberSignIndex = value.indexOf(numberSign)
const questionMarkIndex = value.indexOf(questionMark)
if (
questionMarkIndex !== -1 &&
(numberSignIndex === -1 || numberSignIndex > questionMarkIndex)
) {
value =
value.slice(0, questionMarkIndex) +
(numberSignIndex === -1 ? '' : value.slice(numberSignIndex))
numberSignIndex = value.indexOf(numberSign)
}
// Ignore "headings" in image links: `image.png#metadata`
if (numberSignIndex !== -1 && type === 'image') {
value = value.slice(0, numberSignIndex)
}
// Local: `#heading`.
if (value.charAt(0) === numberSign) {
value = state.path ? state.path + value : value
}
// Anything else, such as `readme.md`.
else {
value = state.path ? path.resolve(state.base, value) : ''
}
return normalize(value, state)
}
/**
* @param {string} url
* URL.
* @param {State} state
* State.
* @returns {Reference}
* Reference.
*/
function normalize(url, state) {
const numberSignIndex = url.indexOf(numberSign)
const lines = state.urlConfig.lines
const prefix = state.urlConfig.headingPrefix
const topAnchor = state.urlConfig.topAnchor
/** @type {string} */
let filePath
/** @type {string | undefined} */
let hash
if (numberSignIndex === -1) {
filePath = url
} else {
filePath = url.slice(0, numberSignIndex)
hash = url.slice(numberSignIndex).toLowerCase()
// Ignore the hash if it references the top anchor of the environment
if (topAnchor && hash === topAnchor) {
hash = undefined
}
// Ignore the hash if it references lines in a file or doesn’t start
// with a heading prefix.
else if (
prefix &&
((lines && lineExpression.test(hash)) ||
hash.slice(0, prefix.length) !== prefix)
) {
hash = undefined
}
// Use the hash if it starts with a heading prefix.
else if (prefix) {
hash = hash.slice(prefix.length)
}
}
return {filePath: decodeURIComponent(filePath), hash}
}
/**
* @param {string} filePath
* Absolute path to file.
* @returns {boolean}
* Whether `filePath` is a readme.
*/
function readme(filePath) {
const extension = path.extname(filePath)
return (
readmeExtensions.has(extension) &&
readmeBasename.test(path.basename(filePath, extension))
)
}