linkinator
Version:
Find broken links, missing images, etc in your HTML. Scurry around your site and find all those broken links.
369 lines (368 loc) • 13.1 kB
JavaScript
import { Readable } from 'node:stream';
import { WritableStream } from 'htmlparser2/WritableStream';
import { parseSrcset } from 'srcset';
import schemaOrgUrlFields from './schema-org-url-fields.json' with {
type: 'json'
};
// URL-bearing attribute name -> tag names on which that attribute is scanned for links.
const linksAttribute = {
  background: ['body'],
  cite: ['blockquote', 'del', 'ins', 'q'],
  data: ['object'],
  href: ['a', 'area', 'embed', 'link'],
  icon: ['command'],
  longdesc: ['frame', 'iframe'],
  manifest: ['html'],
  content: ['meta'],
  poster: ['video'],
  pluginspage: ['embed'],
  pluginurl: ['embed'],
  src: [
    'audio',
    'embed',
    'frame',
    'iframe',
    'img',
    'input',
    'script',
    'source',
    'track',
    'video',
  ],
  srcset: ['img', 'source'],
};
// Create lookup table for tag name to attribute that contains URL.
// The table has no prototype on purpose: it is indexed with tag names taken
// from parsed documents, and on a plain `{}` a tag called `constructor` would
// resolve to the inherited Object.prototype member (truthy, but not iterable)
// instead of "no entry".
const tagAttribute = Object.create(null);
for (const [attribute, tags] of Object.entries(linksAttribute)) {
  for (const tag of tags) {
    tagAttribute[tag] ||= [];
    tagAttribute[tag].push(attribute);
  }
}
/**
 * Parses meta refresh content to extract the URL.
 * Accepted forms: "0;url=https://example.com", "0; url=https://example.com",
 * and the quoted variants "0;url='https://example.com'" / "0;url=\"...\"".
 * @param content The content attribute value from a meta refresh tag
 * @returns The extracted URL (without surrounding quotes), or null if the
 *   content has no "delay;url=..." part or the URL is empty
 */
function parseMetaRefresh(content) {
  // "delay;url=URL": the delay is an integer, whitespace is allowed around
  // the ';' and '=' separators, and the "url" keyword is case-insensitive.
  const match = content.match(/^\s*\d+\s*;\s*url\s*=\s*(.+)/i);
  if (!match?.[1]) {
    return null;
  }
  let url = match[1].trim();
  // A URL wrapped in quotes is read up to the matching closing quote (or to
  // the end of the value when the quote is never closed); keeping the quote
  // characters would make the link resolve to a bogus relative path.
  const quote = url[0];
  if (quote === '"' || quote === "'") {
    const closingIndex = url.indexOf(quote, 1);
    url = (closingIndex === -1 ? url.slice(1) : url.slice(1, closingIndex)).trim();
  }
  return url || null;
}
/**
 * Streams an HTML document through the htmlparser2 WritableStream and
 * collects every link reference found in it.
 *
 * Sources of links:
 * - URL-bearing attributes listed in the `tagAttribute` lookup table
 * - `<meta http-equiv="refresh">` redirect targets
 * - `<meta content>` values that parse as absolute URLs
 * - URLs returned by `extractLinksFromJson` for `<script type="application/ld+json">` blocks
 * - when `checkCss` is true: `url()` / `@import` references in `<style>`
 *   elements and inline `style` attributes
 *
 * @param source Readable stream of HTML content (it is piped into the parser)
 * @param baseUrl URL used to resolve relative links unless the document
 *   supplies its own `<base href>`
 * @param checkCss Whether CSS inside the document is scanned for URLs
 * @returns Array of `parseLink` results, in document order
 */
export async function getLinks(source, baseUrl, checkCss = false) {
  let realBaseUrl = baseUrl;
  let baseSet = false;
  const links = [];
  let isInStyleTag = false;
  let styleTagContent = '';
  let isJsonLd = false;
  let jsonLdContent = '';
  // <link> rel values whose href is likely to fail when checked
  const relValuesToIgnore = ['dns-prefetch', 'preconnect'];
  const parser = new WritableStream({
    onopentag(tag, attributes) {
      // Allow alternate base URL to be specified in tag. A <base> without an
      // href (e.g. <base target="_blank">) carries no URL: it must neither be
      // resolved (that would resolve the string "undefined") nor prevent a
      // later <base href> from being honoured.
      if (tag === 'base' && !baseSet && attributes.href !== undefined) {
        realBaseUrl = getBaseUrl(attributes.href, baseUrl);
        baseSet = true;
      }
      // Track when we enter a <style> tag (only if checkCss is enabled)
      if (tag === 'style' && checkCss) {
        isInStyleTag = true;
        styleTagContent = '';
      }
      if (tag === 'script' && attributes.type === 'application/ld+json') {
        isJsonLd = true;
        jsonLdContent = '';
      }
      // Ignore link tags where rel is likely to fail. `rel` is a
      // space-separated, case-insensitive token list, so each token is
      // compared instead of the raw attribute string.
      if (tag === 'link' && attributes.rel) {
        const relTokens = attributes.rel.toLowerCase().split(/\s+/);
        if (relTokens.some((token) => relValuesToIgnore.includes(token))) {
          return;
        }
      }
      // Only for <meta content=""> tags, only validate the url if
      // the content actually looks like a url
      if (tag === 'meta' && attributes.content) {
        // Handle meta refresh redirects: <meta http-equiv="refresh" content="0;url=https://example.com">
        if (attributes['http-equiv']?.toLowerCase() === 'refresh') {
          const refreshUrl = parseMetaRefresh(attributes.content);
          if (refreshUrl) {
            links.push(parseLink(refreshUrl, realBaseUrl));
          }
          return;
        }
        try {
          new URL(attributes.content);
        } catch {
          // Not an absolute URL: nothing to check on this <meta> tag
          return;
        }
      }
      // Check for inline style attribute with url() references (only if checkCss is enabled)
      if (attributes.style && checkCss) {
        for (const url of extractUrlsFromCss(attributes.style)) {
          links.push(parseLink(url, realBaseUrl));
        }
      }
      if (tagAttribute[tag]) {
        for (const attribute of tagAttribute[tag]) {
          const linkString = attributes[attribute];
          if (linkString) {
            for (const link of parseAttribute(attribute, linkString)) {
              links.push(parseLink(link, realBaseUrl));
            }
          }
        }
      }
    },
    ontext(text) {
      // Collect text content when inside a <style> tag
      if (isInStyleTag) {
        styleTagContent += text;
      }
      // Collect text content when inside a JSON-LD <script> tag
      if (isJsonLd) {
        jsonLdContent += text;
      }
    },
    onclosetag(tag) {
      // When we close a <style> tag, extract URLs from the collected CSS
      if (tag === 'style' && isInStyleTag) {
        isInStyleTag = false;
        for (const url of extractUrlsFromCss(styleTagContent)) {
          links.push(parseLink(url, realBaseUrl));
        }
        styleTagContent = '';
      }
      if (tag === 'script' && isJsonLd) {
        isJsonLd = false;
        try {
          const json = JSON.parse(jsonLdContent);
          for (const url of extractLinksFromJson(json, undefined)) {
            links.push(parseLink(url, realBaseUrl));
          }
        } catch {
          // Silently ignore JSON parsing errors
        }
        jsonLdContent = '';
      }
    },
  });
  await new Promise((resolve, reject) => {
    // pipe() does not forward errors from the source to the destination, so
    // without this listener a failing source would leave the promise pending
    // forever.
    source.on('error', reject);
    source.pipe(parser).on('finish', resolve).on('error', reject);
  });
  return links;
}
/**
 * Computes the effective base URL from a `<base href>` value.
 * An href that `isAbsoluteUrl` accepts is returned unchanged; any other href
 * is resolved against the previous base URL and returned without its fragment.
 * @param htmlBaseUrl The href value of the `<base>` tag
 * @param oldBaseUrl The base URL that was in effect before the tag
 * @returns The base URL to use for resolving links
 */
function getBaseUrl(htmlBaseUrl, oldBaseUrl) {
  if (!isAbsoluteUrl(htmlBaseUrl)) {
    const resolved = new URL(htmlBaseUrl, oldBaseUrl);
    resolved.hash = '';
    return resolved.href;
  }
  return htmlBaseUrl;
}
/**
 * Tells whether a string begins with a URI scheme.
 * Scheme syntax: https://tools.ietf.org/html/rfc3986#section-3.1
 * Absolute URI: https://tools.ietf.org/html/rfc3986#section-4.3
 * A drive letter followed by a backslash (`C:\`) is treated as a Windows
 * path, not as a scheme.
 * @param url The string to test
 * @returns true when the string starts with `scheme:` and is not a Windows path
 */
function isAbsoluteUrl(url) {
  const windowsDrivePath = /^[a-zA-Z]:\\/;
  const leadingScheme = /^[a-zA-Z][a-zA-Z\d+\-.]*:/;
  return !windowsDrivePath.test(url) && leadingScheme.test(url);
}
/**
 * Splits an attribute value into the individual link strings it contains.
 * `srcset` holds a comma-separated candidate list and yields one URL per
 * candidate; every other attribute yields its value as a single link.
 * @param name The attribute name
 * @param value The raw attribute value
 * @returns Array of link strings found in the value
 */
function parseAttribute(name, value) {
  if (name === 'srcset') {
    // Collapse every run of whitespace into a single space to work around
    // https://github.com/sindresorhus/srcset/issues/14
    // The `g` flag matters: without it only the first run is collapsed.
    const normalizedValue = value.replace(/\s+/g, ' ');
    return parseSrcset(normalizedValue).map((candidate) => candidate.url);
  }
  return [value];
}
/**
 * Resolves a raw link string against a base URL.
 * @param link The link text as found in the document
 * @param baseUrl The URL that relative links are resolved against
 * @returns `{ link, url, fragment }` on success, where `url` is a URL object
 *   with its hash removed and `fragment` is the decoded hash (undefined when
 *   the link has none); `{ link, error }` when the link cannot be parsed
 */
function parseLink(link, baseUrl) {
  try {
    const url = new URL(link, baseUrl);
    let fragment;
    if (url.hash) {
      const rawFragment = url.hash.slice(1);
      try {
        fragment = decodeURIComponent(rawFragment);
      } catch {
        // A stray '%' (e.g. "#100%") makes decodeURIComponent throw. The URL
        // itself is still valid, so keep the undecoded fragment instead of
        // reporting the whole link as unparsable.
        fragment = rawFragment;
      }
    }
    url.hash = '';
    return { link, url, fragment };
  } catch (error) {
    return { link, error };
  }
}
/**
 * Reads a whole CSS stream and returns the links it references.
 * The stream is buffered completely, decoded as UTF-8, scanned with
 * `extractUrlsFromCss`, and every URL found is resolved with `parseLink`.
 * @param source Readable stream of CSS content
 * @param baseUrl Base URL for resolving relative URLs
 * @returns Array of parsed URLs found in the CSS
 */
export async function getCssLinks(source, baseUrl) {
  // Gather the stream into buffers; non-Buffer chunks are converted first
  const buffers = [];
  for await (const piece of source) {
    buffers.push(Buffer.isBuffer(piece) ? piece : Buffer.from(piece));
  }
  const stylesheet = Buffer.concat(buffers).toString('utf-8');
  return extractUrlsFromCss(stylesheet).map((cssUrl) => parseLink(cssUrl, baseUrl));
}
/**
 * Extracts all URLs from CSS content string.
 * Handles @import statements and url() functions. Comments are stripped
 * first, `data:` URLs inside url() are skipped, and an `@import url(...)`
 * is reported once (as an import), not a second time as a plain url().
 * @param css CSS content as string
 * @returns Array of URL strings found in the CSS: all @import targets first,
 *   followed by the remaining url() references in source order
 */
function extractUrlsFromCss(css) {
  const urls = [];
  // Remove CSS comments /* ... */
  const cleanCss = css.replace(/\/\*[\s\S]*?\*\//g, '');
  // Match @import statements
  // Formats: @import url("..."); @import url('...'); @import url(...);
  //          @import "..."; @import '...';
  const importRegex = /@import\s+(?:url\(\s*['"]?([^'")]+)['"]?\s*\)|['"]([^'"]+)['"])/gi;
  // [start, end) offsets of every @import match, so the url() pass below can
  // tell which url() occurrences were already consumed by an @import.
  const importSpans = [];
  for (const match of cleanCss.matchAll(importRegex)) {
    importSpans.push([match.index, match.index + match[0].length]);
    const url = match[1] || match[2];
    if (url) {
      urls.push(url.trim());
    }
  }
  // Match url() functions in CSS properties
  // Formats: url("...") url('...') url(...)
  const urlRegex = /url\(\s*['"]?([^'")]+)['"]?\s*\)/gi;
  for (const match of cleanCss.matchAll(urlRegex)) {
    const isPartOfImport = importSpans.some(
      ([start, end]) => match.index >= start && match.index < end,
    );
    if (isPartOfImport) {
      continue;
    }
    const url = match[1];
    if (url && !url.startsWith('data:')) {
      // Skip data URLs
      urls.push(url.trim());
    }
  }
  return urls;
}
/**
 * Extracts URLs from JSON-LD content.
 * Only strings stored under a field name listed in `schemaOrgUrlFields` are
 * considered, and only when they parse as absolute URLs. Nested objects and
 * arrays are walked recursively; array items are matched against the name of
 * the field that holds the array.
 * In the future, we may consider validating non-schema.org fields, or moving
 * towards fuzzier logic for identifying potential URLs.
 * @param json A parsed JSON value (object, array, or primitive)
 * @param parentKey Name of the field that holds `json`, if any
 * @returns Array of URL strings found in the value
 */
function extractLinksFromJson(json, parentKey) {
  // Yields [candidate] when `field` is a known URL field and `candidate`
  // parses as an absolute URL; yields [] otherwise.
  const urlIfListed = (candidate, field) => {
    if (!schemaOrgUrlFields.includes(field)) {
      return [];
    }
    try {
      new URL(candidate);
    } catch (_e) {
      // Not a valid URL
      return [];
    }
    return [candidate];
  };
  // Leaf value: only a string that sits under a named field can be a link
  if (!json || typeof json !== 'object') {
    return typeof json === 'string' && parentKey
      ? urlIfListed(json, parentKey)
      : [];
  }
  const found = [];
  for (const key in json) {
    const value = json[key];
    if (typeof value === 'string') {
      found.push(...urlIfListed(value, key));
    } else if (Array.isArray(value)) {
      // Items inherit the array's field name
      found.push(...value.flatMap((item) => extractLinksFromJson(item, key)));
    } else if (typeof value === 'object') {
      found.push(...extractLinksFromJson(value, key));
    }
  }
  return found;
}
/**
 * Extracts all valid fragment identifiers from HTML.
 * Valid fragment targets include:
 * - Elements with id attribute: <div id="section">
 * - Named anchors: <a name="section">
 * - Targets of fragment-only anchors: <a href="#section">
 * @param source Readable stream of HTML content (it is piped into the parser)
 * @returns Set of valid fragment identifiers
 */
export async function extractFragmentIds(source) {
  const fragments = new Set();
  const parser = new WritableStream({
    onopentag(tag, attributes) {
      // Check for id attribute (most common)
      if (attributes.id) {
        fragments.add(attributes.id);
      }
      // The remaining checks only apply to anchors
      if (tag !== 'a') {
        return;
      }
      // Check for name attribute on anchors (legacy but still valid)
      if (attributes.name) {
        fragments.add(attributes.name);
      }
      // Check for href attributes that are fragment-only links (start with #)
      // This handles GitHub-style anchors where the actual element has id="user-content-foo"
      // but the href is "#foo"
      const { href } = attributes;
      if (href && href.startsWith('#') && href.length > 1) {
        // Extract the fragment (removing the leading #)
        fragments.add(href.substring(1));
      }
    },
  });
  await new Promise((resolve, reject) => {
    // pipe() does not forward errors from the source to the destination, so
    // without this listener a failing source would leave the promise pending
    // forever.
    source.on('error', reject);
    source.pipe(parser).on('finish', resolve).on('error', reject);
  });
  return fragments;
}
/**
 * Checks a collection of fragment identifiers against an HTML document.
 * The document is wrapped in a single-chunk stream and scanned with
 * `extractFragmentIds`; each requested fragment is then looked up in the
 * resulting set.
 * @param htmlContent The HTML content as a Buffer
 * @param fragmentsToValidate Set of fragment identifiers to validate
 * @returns Array of `{ fragment, isValid }` results, one per fragment, in
 *   iteration order
 */
export async function validateFragments(htmlContent, fragmentsToValidate) {
  const knownFragments = await extractFragmentIds(Readable.from([htmlContent]));
  return [...fragmentsToValidate].map((fragment) => ({
    fragment,
    isValid: knownFragments.has(fragment),
  }));
}