broken-link-checker
Version:
Find broken links, missing images, etc in your HTML.
282 lines (214 loc) • 7.28 kB
JavaScript
var isString = require("is-string");
var urllib = require("url");
var urlobj = require("urlobj");
var hasOwnProperty = Object.prototype.hasOwnProperty;
function linkObj(url)
{
if (url===undefined || isString(url)===false)
{
url = null;
}
var link =
{
url:
{
original: url, // The URL as it was inputted
resolved: null, // The URL, resolved as a browser would do so
redirected: null // The URL, after its last redirection, if any
},
base:
{
original: null, // The base URL as it was inputted
resolved: null // The base URL, resolved as a browser would do so
},
html:
{
index: null, // The order in which the link appeared in its document -- using max-level tag filter
offsetIndex: null, // Sequential (gap-free) indicies for skipped and unskipped links
location:null, // Source code location of the attribute that the link was found within
selector: null, // CSS selector for element in document
tagName: null, // Tag name that the link was found on
attrName: null, // Attribute name that the link was found within
attrs: null, // All attributes on the element
text: null, // TextNode/innerText within the element
tag: null, // The entire tag string
// Temporary keys
base: null
},
http:
{
cached: null, // If the response was pulled from cache
response: null // The request response
},
broken: null, // If the link was determined to be broken or not
internal: null, // If the link is to the same server as its base/document
samePage: null, // If the link is to the same page as its base/document
excluded: null, // If the link was excluded due to any filtering
brokenReason: null, // The reason why the link was considered broken, if it indeed is
excludedReason: null, // The reason why the link was excluded from being checked, if it indeed was
// Temporary keys
broken_link_checker: true,
resolved: false
};
// Not enumerable -- hidden from `JSON.stringify()`
Object.defineProperty(link.base, "parsed", { value:null, writable:true }); // Same as `link.base.resolved`, but is an Object
Object.defineProperty(link.url, "parsed", { value:null, writable:true }); // Same as `link.url.resolved`, but is an Object
return link;
}
/*
Remove unnecessary keys for public use.
*/
linkObj.clean = function(link)
{
delete link.broken_link_checker;
delete link.html.base; // TODO :: don't clean this?
delete link.resolved;
return link;
};
/*
Define relationships with base URL.
*/
linkObj.relation = function(link, url_parsed)
{
if (url_parsed === undefined) url_parsed = link.url.parsed;
else if (typeof url_parsed === "string") url_parsed = urlobj.parse(url_parsed);
var relation;
// If no protocols, it's impossible to determine if they link to the same server
if (url_parsed.protocol===null || link.base.parsed.protocol===null)
{
// Overwrite any previous values
link.internal = null;
link.samePage = null;
}
else
{
// Resolved base not used because html base could be remote
relation = urlobj.relation(url_parsed, link.base.parsed);
link.internal = relation >= urlobj.component.AUTH;
link.samePage = link.internal===true && relation>=urlobj.component.PATH;
}
return link;
};
/*
Absolute'ize a link based on its base URL and HTML's <base>.
*/
// TODO :: make similar to `url.resolve(from,to)` ?
linkObj.resolve = function(link, base, options)
{
// If already resolved
if (link.resolved === true) return;
// Parity with core `url.resolve()`
var parseOptions = { slashesDenoteHost:true };
// TODO :: we're constantly re-parsing base and html base -- find way to cache them
var base_parsed = base==null ? "" : base;
base_parsed = urlobj.normalize( urlobj.parse(base_parsed, parseOptions) );
var htmlBase_parsed = link.html.base==null ? "" : link.html.base;
htmlBase_parsed = urlobj.normalize( urlobj.parse(htmlBase_parsed, parseOptions) );
// TODO :: options.normalize=false
// TODO :: options.clone=true ?
var resolvedBase_parsed = urlobj.resolve(base_parsed, htmlBase_parsed);
if (resolvedBase_parsed.hash !== null)
{
// Hashes are useless in a base
resolvedBase_parsed.hash = null;
resolvedBase_parsed.href = urllib.format(resolvedBase_parsed); // TODO :: use urlobj.format() when available
}
// TODO :: is this necessary if `link.base.parsed` is cleaned?
if (base_parsed.hash !== null)
{
// Hashes are useless in a base
base_parsed.hash = null;
base_parsed.href = urllib.format(base_parsed); // TODO :: use urlobj.format() when available
}
// `link.url.original` should only ever not have a value within internal tests
var linkOrg_parsed = link.url.original==null ? "" : link.url.original;
linkOrg_parsed = urlobj.parse(linkOrg_parsed, parseOptions);
urlobj.normalize(linkOrg_parsed);
// `linkOrg_parsed` is cloned to avoid it being mutated
// TODO :: options.clone=true
var resolvedUrl_parsed = urlobj.resolve( resolvedBase_parsed, cloneObject(linkOrg_parsed) );
if (base !== undefined)
{
link.base.original = base;
}
// TODO :: use url types (>UNKNOWN && !=EMPTY ... not simple enough)
if (resolvedBase_parsed.href !== "")
{
link.base.resolved = parity(resolvedBase_parsed.href);
}
link.base.parsed = base_parsed;
// If resolved link has accepted scheme
if (options.acceptedSchemes[ resolvedUrl_parsed.extra.protocolTruncated ] === true)
{
link.url.resolved = parity(resolvedUrl_parsed.href);
link.url.parsed = resolvedUrl_parsed;
// TODO :: move relation stuff out of this function -- separation of concerns?
linkObj.relation(link);
}
// Else could not be properly resolved
else
{
link.url.parsed = linkOrg_parsed;
// If at least resolved to absolute
if (resolvedUrl_parsed.extra.type === urlobj.type.ABSOLUTE)
{
// If base is accepted scheme
if (options.acceptedSchemes[ base_parsed.extra.protocolTruncated ] === true)
{
link.internal = false;
link.samePage = false;
}
}
}
// Avoid future resolving
link.resolved = true;
return link;
};
//::: PRIVATE FUNCTIONS
/*
Clones an object and its prototype while maintaining enumerable
keys and support for `instanceof`.
*/
// TODO :: this may not be necessary if linkObj.base.parsed and linkObj.url.parsed are cleaned out
// TODO :: move this into urlobj
function cloneObject(source)
{
var clone,key,value;
if (Array.isArray(source) === true)
{
clone = [];
}
else
{
// Only clone the prototype -- more efficient as it will not convert keys to prototype keys
clone = Object.create( Object.getPrototypeOf(source) );
}
// Clone keys/indexes
// TODO :: use Object.keys() for more speed
for (key in source)
{
if (hasOwnProperty.call(source, key) === true)
{
value = source[key];
if (value!==null && typeof value==="object")
{
clone[key] = cloneObject(value);
}
else
{
clone[key] = value;
}
}
}
return clone;
}
/*
Maintain parity with core `url.resolve()`.
*/
// TODO :: remove this?
function parity(url)
{
return (url !== "http://") ? url : "http:///";
}
module.exports = linkObj;
;