hyperlink
Version:
A node library and command line tool to test the integrity of your internal and external hyperlinks
977 lines (868 loc) • 29.2 kB
JavaScript
const AssetGraph = require('assetgraph');
const asyncLib = require('async');
const request = require('request');
const extendWithSitemap = require('assetgraph-plugin-sitemap');
const version = require('../package.json').version;
const relationDebugDescription = require('./relationDebugDescription');
const getModifiedHref = require('./getModifiedHref');
const prettyBytes = require('pretty-bytes');
const net = require('net');
const tls = require('tls');
// Skip filters that are always consulted before the user-supplied skipFilter
const defaultSkipFilters = [require('./known-culprits/linkedin')];
// User-Agent header value sent with AssetGraph (teepee) requests and external checks
const hyperlinkUserAgent = `Hyperlink v${version} (https://www.npmjs.com/package/hyperlink)`;
// Origins for which a fragment is also accepted when the document contains
// an id of the form "user-content-<fragment>"
const userContentFragmentOrigins = [
'https://github.com',
'https://www.npmjs.com',
];
/**
 * Tell whether an asset can be used as the given AssetGraph asset class.
 *
 * @param {Object} asset The asset to test
 * @param {Function|String} [Class] An AssetGraph asset class, or the name of
 *   one (looked up on AssetGraph). Defaults to AssetGraph.Asset
 * @return {Boolean} true when the asset is considered compatible
 */
function checkCompatibility(asset, Class) {
  let TargetClass = Class;
  if (TargetClass === undefined) {
    TargetClass = AssetGraph.Asset;
  } else if (typeof TargetClass === 'string') {
    TargetClass = AssetGraph[TargetClass];
  }

  // Direct match: the asset already is an instance of the wanted class
  if (asset instanceof TargetClass) {
    return true;
  }
  // An asset without a concrete type is compatible with anything
  if (!asset._type) {
    return true;
  }
  // The wanted class is a specialisation of the asset's current type
  if (TargetClass.prototype instanceof AssetGraph[asset._type]) {
    return true;
  }
  if (!asset.isImage) {
    return false;
  }
  // Svg is modelled as a subclass of Xml, not Image, and can be used as a font as well
  return TargetClass === AssetGraph.Image || TargetClass === AssetGraph.Font;
}
/**
 * Default skipFilter/todoFilter: never marks a report as skipped or todo.
 *
 * @return {Boolean} always false
 */
function returnFalse() {
return false;
}
/**
* A tap-render instance (https://www.npmjs.com/package/@munter/tap-render)
*
* @typedef {Object} TapRender
* @property {function} pipe
* @property {function} begin
* @property {function} push
* @property {function} close
* @property {function} handleResult
*
* @return {stream} pause-stream instance (https://www.npmjs.com/package/pause-stream)
*/
/**
* Hyperlink
*
* @param {Object} options Hyperlink options
* @param {String} options.root AssetGraph instance root
* @param {String} [options.canonicalRoot] AssetGraph instance canonicalRoot
* @param {String[]} [options.inputUrls = ['index.html']] Files to start the population with
* @param {Function} [options.skipFilter] Filter function to mark failed tests as [skipped](https://testanything.org/tap-version-13-specification.html#skipping-tests). Return a `String` to add a message or `true` to just mark as skipped
* @param {Function} [options.todoFilter] Filter function to mark failed tests as [todo](https://testanything.org/tap-version-13-specification.html#todo-tests)'s. Return a `String` to add a message or `true` to just mark as todo
* @param {Boolean} [options.recursive = false] Recurse onto other pages within the root parameter's origin
* @param {Boolean} [options.internalOnly = false] Only check links to assets within your own web root
* @param {Boolean} [options.pretty = false] Resolve extensionless links to their corresponding .html-file on disk
* @param {Boolean} [options.followSourceMaps = false] Check source maps
* @param {Boolean} [options.verbose = false] Verbose output from AssetGraph
* @param {Boolean} [options.memdebug = false] Memory debugging
* @param {Number} [options.concurrency = 25] Concurrency limit
* @param {TapRender} t tap-render instance
* @return {Promise<AssetGraph>} Resolves with the AssetGraph instance once all checks have been reported
*/
async function hyperlink(
{
root,
canonicalRoot,
inputUrls,
skipFilter = returnFalse,
todoFilter = returnFalse,
recursive = false,
internalOnly = false,
pretty = false,
followSourceMaps = false,
verbose = false,
memdebug = false,
concurrency = 25,
} = {},
t
) {
const ag = new AssetGraph({
root,
canonicalRoot,
});
extendWithSitemap(ag);
ag.teepee.headers['User-Agent'] = hyperlinkUserAgent;
ag.teepee.timeout = 30000;
/**
 * Decide whether a report should be recorded as skipped instead of being
 * checked. The built-in default filters are consulted first; the
 * user-supplied skipFilter is only asked when none of them matched.
 *
 * When the report is skipped it is pushed to the TAP stream right away with
 * `ok: true` and the skip reason.
 *
 * @param {Object} report The report about to be checked
 * @return {Boolean} true when the report was pushed as skipped
 */
function shouldSkip(report) {
  let skip;
  for (const filter of defaultSkipFilters) {
    const message = filter(report);
    if (message) {
      skip = message;
    }
  }
  if (!skip) {
    try {
      skip = skipFilter(report);
    } catch (err) {
      // A crashing user filter is fatal. Exit with a non-zero status so that
      // callers (CI in particular) do not mistake the aborted run for success.
      console.error(err && err.stack ? err.stack : err);
      process.exit(1);
    }
  }
  if (skip === true || typeof skip === 'string') {
    t.push(null, {
      ...report,
      skip,
      ok: true,
    });
    return true;
  }
  return false;
}
/**
 * Push a report to the TAP stream. Failing reports are first shown to the
 * todoFilter; when it returns `true` or a string, that value is stored on
 * the report as `todo` before it is pushed.
 *
 * @param {Object} report The report to emit
 */
function reportTest(report) {
  if (!report.ok) {
    const todoVerdict = todoFilter(report);
    const isTodo = todoVerdict === true || typeof todoVerdict === 'string';
    if (isTodo) {
      report.todo = todoVerdict;
    }
  }
  t.push(null, report);
}
/**
 * Build a task that checks whether a connection to the asset's host can be
 * opened. TLS is used for https: assets, a plain socket otherwise; the port
 * falls back to 443 or 80 when the asset does not specify one.
 *
 * @param {Object} asset Asset providing hostname, protocol and port
 * @param {Object} connectionReport Report template, extended with the outcome
 * @return {Function} Task taking a completion callback
 */
function tryConnect(asset, connectionReport) {
  const { hostname, protocol } = asset;
  const isTls = protocol === 'https:';
  let port;
  if (asset.port) {
    port = parseInt(asset.port, 10);
  } else {
    port = isTls ? 443 : 80;
  }

  return function connectTask(callback) {
    if (shouldSkip(connectionReport)) {
      return setTimeout(callback);
    }

    const transport = isTls ? tls : net;
    const socket = transport.connect(port, hostname, function onConnected() {
      reportTest({
        ...connectionReport,
        ok: true,
      });
      // The connection was only opened as a probe
      socket.destroy();
      callback();
    });

    socket.on('error', function onError(error) {
      const fallback = error.message || 'Unknown error';
      const actual =
        error.code === 'ENOTFOUND' ? `DNS missing: ${hostname}` : fallback;
      reportTest({
        ...connectionReport,
        ok: false,
        actual,
      });
      callback();
    });
  };
}
/**
 * Build a task that checks an external asset over HTTP and reports the
 * outcome (`external-check`, plus `content-type-missing` and
 * `external-redirect` reports where applicable).
 *
 * The first attempt is a HEAD request. It is retried as GET when the server
 * answers with a 4xx/5xx status or the request fails with
 * HPE_INVALID_CONSTANT, and a GET answered with 502 is retried once more
 * after a one second pause.
 *
 * @param {Object} asset Asset to check; its `_incoming` array is used to
 *   describe where the link was found
 * @param {Number} [attempt = 1] Attempt counter used for the retry logic
 * @return {Function} Task taking a completion callback
 */
function httpStatus(asset, attempt = 1) {
  const url = asset.url;
  const relations = asset._incoming;
  // Every report about this asset points at the same set of locations
  const at = [...new Set(relations.map((r) => r.debugDescription))].join(
    '\n '
  );
  const loadReport = {
    operator: 'external-check',
    name: `external-check ${url}`,
    at,
    expected: `200 ${url}`,
  };
  return (callback) => {
    if (shouldSkip(loadReport)) {
      return setTimeout(callback);
    }
    request(
      {
        method: attempt === 1 ? 'head' : 'get',
        url: asset.url,
        strictSSL: true,
        gzip: true,
        headers: {
          'User-Agent': hyperlinkUserAgent,
          Accept:
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Encoding': 'gzip, deflate, sdch, br',
        },
      },
      (error, res) => {
        if (error) {
          const code = error.code;
          let actual = code || 'Unknown error';
          switch (code) {
            case 'ENOTFOUND':
              actual = `DNS missing: ${asset.hostname}`;
              break;
            case 'HPE_INVALID_CONSTANT':
              // Retry once with GET before giving up
              if (attempt === 1) {
                return httpStatus(asset, attempt + 1)(callback);
              }
              break;
          }
          reportTest({
            ...loadReport,
            ok: false,
            actual,
          });
          return callback();
        }
        const status = res.statusCode;
        if (status >= 200 && status < 300) {
          const contentType = res.headers['content-type'];
          if (contentType && asset.type) {
            const matchContentType = contentType.match(
              /^\s*([\w\-+.]+\/[\w-+.]+)(?:\s|;|$)/i
            );
            if (matchContentType && asset.expectedTypes) {
              // Let the asset re-derive its type from the served Content-Type
              asset.contentType = matchContentType[1].toLowerCase();
              asset._inferredType = undefined;
              asset._tryUpgrade();
            }
          } else if (!contentType) {
            const contentTypeMissingReport = {
              ok: false,
              name: `content-type-missing ${asset.urlOrDescription}`,
              operator: 'content-type-missing',
              expected:
                asset.contentType ||
                `A Content-Type compatible with ${asset.type}`,
              actual: contentType,
              at,
            };
            if (!shouldSkip(contentTypeMissingReport)) {
              reportTest(contentTypeMissingReport);
            }
          }
        }
        // Some servers respond weirdly to HEAD requests. Make a second attempt with GET
        if (attempt === 1 && status >= 400 && status < 600) {
          return httpStatus(asset, attempt + 1)(callback);
        }
        // Some servers (jspm.io) respond with 502 if requesting HEAD, then GET to close in succession. Give the server a second to cool down
        if (attempt === 2 && status === 502) {
          setTimeout(() => httpStatus(asset, attempt + 1)(callback), 1000);
          return;
        }
        const redirects = res.request._redirect.redirects;
        if (redirects.length > 0) {
          // Pair every URL in the chain with the status that was received
          // when requesting it: each hop takes the status recorded on the
          // following redirect entry, and the last hop takes the final
          // response status. Fresh objects are built so that the request
          // library's own redirect records are left untouched.
          const hops = [{ redirectUri: url }, ...redirects];
          const log = hops.map((hop, idx) => ({
            redirectUri: hop.redirectUri,
            statusCode:
              idx + 1 < hops.length ? hops[idx + 1].statusCode : status,
          }));
          const redirectReport = {
            operator: 'external-redirect',
            name: `external-redirect ${url}`,
            at,
            expected: `302 ${url} --> 200 ${log[log.length - 1].redirectUri}`,
          };
          const actual = log
            .map(
              (redirect) => `${redirect.statusCode} ${redirect.redirectUri}`
            )
            .join(' --> ');
          if (!shouldSkip(redirectReport)) {
            // A single temporary redirect is allowed
            if ([302, 307].includes(log[0].statusCode)) {
              if (log.length < 3) {
                reportTest({
                  ...redirectReport,
                  expected: actual,
                  actual,
                  ok: true,
                });
              } else {
                reportTest({
                  ...redirectReport,
                  expected: `${log[0].statusCode} ${url} --> 200 ${
                    log[log.length - 1].redirectUri
                  }`,
                  actual,
                  ok: false,
                });
              }
            } else {
              reportTest({
                ...redirectReport,
                actual,
                ok: false,
              });
            }
          }
        }
        if (status === 200) {
          reportTest({
            ...loadReport,
            ok: true,
            actual: loadReport.expected,
          });
          return callback();
        }
        reportTest({
          ...loadReport,
          actual: `${status} ${url}`,
          ok: false,
        });
        return callback();
      }
    );
  };
}
if (verbose) {
ag.on('addRelation', (relation) => {
console.error('addRelation', relation.toString());
});
ag.on('addAsset', (asset) => {
console.error('addAsset', asset.toString());
});
}
/**
 * Turn an AssetGraph `warn`/`error` event into a failing report.
 *
 * @param {Error|String} error The emitted error. May carry an `asset` or a
 *   `relation` property identifying what failed to load
 */
function handleError(error) {
  // The payload is not guaranteed to be an Error with a string message
  const message = String(error.message || error);
  const asset = error.asset || (error.relation && error.relation.to);
  const report = {
    ok: false,
    name: `Failed loading ${
      error.relation
        ? 'relation'
        : (asset && asset.urlOrDescription) || 'asset'
    }`,
    operator: 'error',
    // Drop the trailing "Including assets:" listing from the message
    actual:
      ((asset && asset.urlOrDescription + ': ') || '') +
      message.split('\nIncluding assets:').shift(),
  };
  if (error.asset) {
    if (error.asset._incoming && error.asset._incoming.length > 0) {
      report.at = error.asset._incoming[0].debugDescription;
    }
  } else if (error.relation) {
    report.at = relationDebugDescription(error.relation);
  }
  // The previous `report.actual.stack += error.stack` assigned a property on
  // a string primitive: silently ignored in sloppy mode, a TypeError in
  // strict mode. It never reached the output, so it is dropped here, which
  // keeps the emitted report unchanged.
  reportTest(report);
}
ag.on('warn', handleError);
ag.on('error', handleError);
if (memdebug) {
  // Print the process memory statistics to stderr every five seconds.
  // The timer is unref'ed so that it cannot keep the process alive on its
  // own once all checks have finished.
  setInterval(() => {
    const memoryUsage = process.memoryUsage();
    for (const key of Object.keys(memoryUsage)) {
      console.error(key, prettyBytes(memoryUsage[key]));
    }
  }, 5000).unref();
}
// Start the TAP output and announce the first phase
t.begin();
t.push({
name: 'Crawling internal assets',
});
// Would be nice for this information to be more easily accessible:
// Names of the AssetGraph members whose prototype still uses the base
// Asset implementation of findOutgoingRelationsInParseTree
const assetTypesWithoutRelations = Object.keys(AssetGraph).filter(
(key) =>
AssetGraph[key] &&
AssetGraph[key].prototype &&
AssetGraph[key].prototype.findOutgoingRelationsInParseTree ===
AssetGraph.Asset.prototype.findOutgoingRelationsInParseTree
);
// Work queue for the crawl, seeded with the assets added for inputUrls
const assetQueue = ag.addAssets(inputUrls);
// Copy of the initial queue; used to recognise the entry points later on
const entrypoints = [...assetQueue];
// Assets, and their urlOrDescription values, that processAsset has taken on
const processedAssets = new Set();
const processedUrls = new Set();
/**
 * Load one asset, report the result and classify its outgoing relations.
 *
 * Each urlOrDescription is processed at most once. For every outgoing
 * relation the target is either pushed onto `assetQueue` (to be loaded and
 * processed as well), flagged with `check` (to be verified by an HTTP request
 * later), flagged for a preconnect/dns-prefetch check, or left alone.
 * Information needed by later phases (incoming relation descriptions,
 * fragments, ids and names) is stored on the assets, after which the asset
 * is unloaded.
 *
 * @param {Object} asset The asset to process
 * @return {Promise<undefined>}
 */
// eslint-disable-next-line no-inner-declarations
async function processAsset(asset) {
  if (processedUrls.has(asset.urlOrDescription)) {
    return;
  }
  processedAssets.add(asset);
  processedUrls.add(asset.urlOrDescription);

  const loadReport = {
    operator: 'load',
    name: `load ${asset.urlOrDescription}`,
    expected: `200 ${asset.urlOrDescription}`,
  };
  if (asset._incoming && asset._incoming[0].debugDescription) {
    loadReport.at = asset._incoming[0].debugDescription;
  } else {
    loadReport.at = `${asset.urlOrDescription} (input URL)`;
  }
  if (shouldSkip(loadReport)) {
    return;
  }

  try {
    await asset.load();
    reportTest({
      ...loadReport,
      ok: true,
    });
  } catch (err) {
    const failedLoadReport = {
      ...loadReport,
      ok: false,
      actual: err.message,
    };
    // If configured, check for extensionless html file links.
    // Some web servers are configured to resolve these automatically
    if (
      pretty &&
      !entrypoints.includes(asset) &&
      asset.protocol === 'file:' &&
      asset.extension !== '.html'
    ) {
      const originalUrl = asset.url;
      const prettyUrl = asset.url.replace(/(\?|#|$)/, '.html$1');
      let prettyAsset = ag.findAssets({ url: prettyUrl })[0];
      if (!prettyAsset) {
        prettyAsset = ag.addAsset({
          url: prettyUrl,
        });
      }
      try {
        await prettyAsset.load();
        asset.isRedirect = true;
        asset.fileRedirectTargetUrl = prettyUrl;
      } catch (prettyErr) {
        // The .html variant does not load either: report the original failure
        reportTest(failedLoadReport);
        return;
      }
      reportTest({
        ...loadReport,
        ok: true,
      });
      asset.url = originalUrl;
    } else {
      reportTest(failedLoadReport);
      return;
    }
  }

  if (asset.isRedirect) {
    asset._redirectTarget = asset.outgoingRelations.find((relation) =>
      /Redirect$/.test(relation.type)
    ).to;
  }

  if (asset.type === 'Html') {
    // If this asset got into the graph through an unexpected relation, we need to guard against
    // treating it as a new entry point
    // Always process entry points
    if (!entrypoints.includes(asset)) {
      // Cross-origin should always stop the recursion
      // Same-origin should only recurse if configured to
      asset.stopProcessing =
        !asset.isRedirect && (asset.crossedOrigins || !recursive);
    }
    // Remember the set of names and ids in the document before unloading so incoming fragments can be checked:
    // See https://github.com/Munter/hyperlink/issues/160
    asset.ids = new Set();
    asset.names = new Set();
    if (asset.isLoaded && asset.parseTree) {
      for (const element of Array.from(
        asset.parseTree.querySelectorAll('[id]')
      )) {
        asset.ids.add(element.getAttribute('id'));
      }
      for (const element of Array.from(
        asset.parseTree.querySelectorAll('[name]')
      )) {
        asset.names.add(element.getAttribute('name'));
      }
    }
  }

  // In non-recursive mode local assets might be marked as end-of-line.
  // This is specifically relevant to local file-URLs
  if (asset.stopProcessing) {
    asset.unload();
    return;
  }

  for (const relation of asset.externalRelations) {
    // Only do work for supported protocols
    if (!['http:', 'https:', 'file:'].includes(relation.to.protocol)) {
      continue;
    }
    if (relation.targetType) {
      relation.to.expectedTypes = relation.to.expectedTypes || new Set();
      relation.to.expectedTypes.add(relation.targetType);
    }

    const fragment = relation.fragment;
    if (fragment) {
      if (fragment === '#') {
        const fragmentReport = {
          name: `fragment-check ${relation.from.urlOrDescription} --> ${relation.href}`,
          operator: 'fragment-check',
          expected:
            'Fragment identifiers in links to different documents should not be empty',
          at: relationDebugDescription(relation),
        };
        if (!shouldSkip(fragmentReport)) {
          if (relation.to !== asset) {
            reportTest({
              ...fragmentReport,
              ok: false,
            });
          }
        }
      } else {
        // Remember the fragment so it can be checked against the target's
        // ids and names once the crawl has finished
        (relation.to.incomingFragments =
          relation.to.incomingFragments || []).push({
          fragment,
          relationDebugDescription: relationDebugDescription(relation),
          href: relation.href,
          fromUrlOrDescription: relation.from.urlOrDescription,
          fromUrl: relation.from.url,
        });
      }
    }

    // Store a description of this incoming relation for future error messages
    // so that we can unload the source asset (destroying the relation).
    // Every incoming relation is recorded, not only the first one: the
    // reports de-duplicate and join these descriptions into their `at` field.
    (relation.to._incoming = relation.to._incoming || []).push({
      type: relation.type,
      debugDescription: relationDebugDescription(relation),
    });

    // Check for mixed-content warning:
    if (
      relation.from.nonInlineAncestor.protocol === 'https:' &&
      relation.to.protocol === 'http:' &&
      !['HtmlAnchor', 'SvgAnchor'].includes(relation.type)
    ) {
      const href = relation.href || relation.to.url;
      const mixedContentReport = {
        name: `mixed-content ${relation.from.urlOrDescription} --> ${href}`,
        operator: 'mixed-content',
        at: relationDebugDescription(relation),
        expected: `${relation.from.urlOrDescription} --> ${href.replace(
          /\bhttps?:/g,
          'https:'
        )}`,
        actual: `${relation.from.urlOrDescription} --> ${href}`,
      };
      if (!shouldSkip(mixedContentReport)) {
        if (mixedContentReport.actual !== mixedContentReport.expected) {
          reportTest({
            ...mixedContentReport,
            ok: false,
          });
        } else {
          reportTest({
            ...mixedContentReport,
            ok: true,
          });
        }
      }
    }

    // Decide what to do with the target: follow it (load and process it),
    // only check it, or ignore it
    let follow;
    if (['HtmlPreconnectLink', 'HtmlDnsPrefetchLink'].includes(relation.type)) {
      follow = false;
      // Sets checkHtmlPreconnectLink / checkHtmlDnsPrefetchLink on the target
      relation.to['check' + relation.type] = true;
    } else if (
      ['HtmlAnchor', 'SvgAnchor', 'HtmlIFrame'].includes(relation.type)
    ) {
      if (
        !relation.crossorigin &&
        !relation.from.crossedOrigins &&
        recursive
      ) {
        follow = true;
      } else if (relation.from !== relation.to) {
        // If we are handling local file-urls, follow but mark as end-of-line in processing
        if (
          !recursive &&
          relation.from.protocol === 'file:' &&
          relation.to.protocol === 'file:'
        ) {
          follow = true;
          relation.to.stopProcessing = true;
        } else if (!internalOnly) {
          if (relation.fragment && relation.fragment !== '#') {
            // The target has to be loaded for its fragment to be verified
            follow = true;
            relation.to.stopProcessing = true;
          } else {
            relation.to.check = true;
          }
        }
      }
    } else if (
      // Matches both the ...SourceUrl and the ...SourceMappingUrl relations
      /^(?:JavaScript|Css)Source(?:Mapping)?Url$/.test(relation.type)
    ) {
      if (followSourceMaps) {
        follow = true;
      } else {
        relation.to.check = true;
      }
    } else if (['SourceMapFile', 'SourceMapSource'].includes(relation.type)) {
      if (followSourceMaps) {
        relation.to.check = true;
      }
    } else if (relation.type === 'JavaScriptFetch') {
      follow = false;
    } else if (internalOnly) {
      follow = !relation.crossorigin;
    } else {
      follow = true;
    }

    if (follow) {
      // Save information about cross origin navigations for later
      relation.to.crossedOrigins =
        relation.crossorigin || relation.from.crossedOrigins;
      if (assetTypesWithoutRelations.includes(relation.to.type)) {
        // If we are handling local file-urls, follow but mark as end-of-line in processing
        if (
          relation.from.nonInlineAncestor.protocol === 'file:' &&
          relation.to.protocol === 'file:'
        ) {
          relation.to.stopProcessing = !recursive;
          assetQueue.push(relation.to);
        } else {
          relation.to.check = true;
        }
      } else {
        assetQueue.push(relation.to);
      }
    }
  }

  // Conserve memory by immediately unloading the asset:
  if (verbose) {
    reportTest({
      ok: true,
      name: `unloading ${asset.urlOrDescription}`,
    });
  }
  asset.unload();
}
// Drain the asset queue with at most `concurrency` assets in flight.
// Processing an asset may push further assets onto the queue, so the queue
// is re-examined every time an asset finishes. A rejection from processAsset
// is forwarded to the awaited promise; without that the in-flight counter
// would never return to zero and the promise would stay pending forever.
await new Promise((resolve, reject) => {
  let numInFlight = 0;
  (function proceed() {
    while (assetQueue.length > 0 && numInFlight < concurrency) {
      numInFlight += 1;
      processAsset(assetQueue.shift()).then(() => {
        numInFlight -= 1;
        proceed();
      }, reject);
    }
    if (numInFlight === 0) {
      resolve();
    }
  })();
});
// Forward incomingFragments through redirects:
// Fragments recorded on a redirecting asset are moved to the asset at the
// end of its redirect chain, which is where the fragment check below looks
// for them.
for (const redirectAsset of ag.findAssets({ isRedirect: true })) {
if (redirectAsset.incomingFragments) {
// The Set is extended while it is being iterated, so the loop walks the
// chain of _redirectTarget links and cannot visit the same asset twice
const assetQueue = new Set([redirectAsset]);
for (const asset of assetQueue) {
if (asset._redirectTarget) {
assetQueue.add(asset._redirectTarget);
} else {
// `asset` has no _redirectTarget: it is the end of the chain.
// A fragment-redirect failure is reported when the redirecting asset has a
// numeric statusCode, or when its url does not end in "/" (optionally
// followed by a query string).
if (
typeof redirectAsset.statusCode === 'number' || // HttpRedirect
!/\/(?:\?|$)/.test(redirectAsset.url)
) {
// Note: the destructured relationDebugDescription shadows the imported
// helper of the same name inside this loop
for (const {
fragment,
href,
fromUrl,
relationDebugDescription,
fromUrlOrDescription,
} of redirectAsset.incomingFragments) {
// NOTE(review): `to` is the first hop of the redirect, not the end of the
// chain (`asset`) — confirm this is intended for multi-hop redirects, and
// that redirectAsset._redirectTarget is always set when this is reached
const to = redirectAsset._redirectTarget;
const expected =
getModifiedHref(href, fromUrl, to.url, ag.root).replace(
'/index.html',
'/'
) + fragment;
const fragmentRedirectReport = {
operator: 'fragment-redirect',
name: `fragment-redirect ${fromUrlOrDescription} --> ${redirectAsset.urlOrDescription}${fragment} --> ${to.urlOrDescription}`,
expected,
actual: href,
at: relationDebugDescription,
};
if (!shouldSkip(fragmentRedirectReport)) {
reportTest({
...fragmentRedirectReport,
ok: false,
});
}
}
}
// Hand the fragments over to the final target
(asset.incomingFragments = asset.incomingFragments || []).push(
...redirectAsset.incomingFragments
);
}
}
// The fragments now live on the final target only
delete redirectAsset.incomingFragments;
}
}
// Check fragments
// Every fragment that points into a processed Html asset must match an id
// in that document, a "user-content-" prefixed id on the known origins, or
// a name attribute.
const fragmentTargets = ag.findAssets({
  type: 'Html',
  incomingFragments: { $exists: true },
  ids: { $exists: true },
});
for (const asset of fragmentTargets) {
  for (const incoming of asset.incomingFragments) {
    // Strip the leading "#"
    const fragmentId = incoming.fragment.substr(1);
    const fragmentReport = {
      operator: 'fragment-check',
      name: `fragment-check ${incoming.fromUrlOrDescription} --> ${incoming.href}`,
      expected: `id="${fragmentId}"`,
      at: incoming.relationDebugDescription,
    };
    if (shouldSkip(fragmentReport)) {
      continue;
    }

    let ok = true;
    let actual = null;
    if (asset.ids.has(fragmentId)) {
      actual = fragmentReport.expected;
    } else if (
      // Some hosts do weird things with mangling fragments and reversing it with runtime js
      userContentFragmentOrigins.includes(asset.origin) &&
      asset.ids.has(`user-content-${fragmentId}`)
    ) {
      actual = `id="user-content-${fragmentId}"`;
    } else if (asset.names.has(fragmentId)) {
      actual = `name="${fragmentId}"`;
    } else {
      ok = false;
    }
    reportTest({
      ...fragmentReport,
      ok,
      actual,
    });
  }
}
// Check urls
// Skipped entirely when only internal links are being checked
if (!internalOnly) {
// Assets flagged with check: true that processAsset has not already taken on
const assetsToCheck = ag
.findAssets({ check: true })
.filter((asset) => !processedAssets.has(asset));
t.push({
name: `Crawling ${assetsToCheck.length} outgoing urls`,
});
// Run one httpStatus task per asset, at most 20 at a time
await new Promise((resolve, reject) =>
asyncLib.parallelLimit(
assetsToCheck.map((asset) => httpStatus(asset)),
20,
(err) => {
if (err) {
reject(err);
} else {
resolve();
}
}
)
);
}
// Check Content-Type vs. incoming relation targetTypes:
// An asset fails when at least one of the types expected by its incoming
// relations is not compatible with what the asset turned out to be.
for (const asset of ag.findAssets({ expectedTypes: { $exists: true } })) {
  const incompatibleTypes = [];
  for (const expectedType of asset.expectedTypes) {
    if (!checkCompatibility(asset, expectedType)) {
      incompatibleTypes.push(expectedType);
    }
  }
  if (incompatibleTypes.length === 0) {
    continue;
  }

  const usedAs = [...incompatibleTypes, asset.type].sort().join(' and ');
  const incomingDescriptions = new Set(
    asset._incoming.map((r) => r.debugDescription)
  );
  const contentTypeMismatchReport = {
    ok: false,
    operator: 'content-type-mismatch',
    name: `content-type-mismatch ${asset.urlOrDescription}`,
    expected:
      asset.contentType || `A Content-Type compatible with ${asset.type}`,
    actual: `Asset is used as both ${usedAs}`,
    at: [...incomingDescriptions].join('\n '),
  };
  if (!shouldSkip(contentTypeMismatchReport)) {
    reportTest(contentTypeMismatchReport);
  }
}
// Check preconnects:
// Targets of HtmlPreconnectLink relations, flagged by processAsset
const preconnectAssetsToCheck = ag.findAssets({
checkHtmlPreconnectLink: true,
});
t.push({
name: `Connecting to ${preconnectAssetsToCheck.length} hosts (checking <link rel="preconnect" href="...">`,
});
// Run one tryConnect task per asset, at most 20 at a time
await new Promise((resolve, reject) =>
asyncLib.parallelLimit(
preconnectAssetsToCheck.map((asset) =>
tryConnect(asset, {
operator: 'preconnect-check',
name: `preconnect-check ${asset.url}`,
at: [...new Set(asset._incoming.map((r) => r.debugDescription))].join(
'\n '
),
expected: `connection accepted ${asset.url}`,
})
),
20,
(err) => {
if (err) {
reject(err);
} else {
resolve();
}
}
)
);
// Check dns-prefetches:
// Targets of HtmlDnsPrefetchLink relations, flagged by processAsset
const dnsPrefetchAssetsToCheck = ag.findAssets({
checkHtmlDnsPrefetchLink: true,
});
t.push({
name: `Looking up ${dnsPrefetchAssetsToCheck.length} host names (checking <link rel="dns-prefetch" href="...">`,
});
// The lookup is exercised through the same tryConnect task as the
// preconnect check, i.e. by attempting a connection to the host
await new Promise((resolve, reject) =>
asyncLib.parallelLimit(
dnsPrefetchAssetsToCheck.map((asset) =>
tryConnect(asset, {
operator: 'dns-prefetch-check',
name: `dns-prefetch-check ${asset.hostname}`,
at: [...new Set(asset._incoming.map((r) => r.debugDescription))].join(
'\n '
),
expected: `DNS exists ${asset.hostname}`,
})
),
20,
(err) => {
if (err) {
reject(err);
} else {
resolve();
}
}
)
);
return ag;
}
module.exports = hyperlink;