// website-validator
// Comprehensive website validation
// (compiled JavaScript module)
import path from "node:path";
import { deepEqual } from "fast-equals";
import fs from "node:fs/promises";
import mime from "mime";
import { strict as assert } from "node:assert";
import { getUrlsFromSitemap } from "./get-links.js";
import { recursiveFetchFiles } from "./fetch-files.js";
import { withPool } from "./worker-runner.js";
import xml2js from "xml2js";
import { getInterestingPageElements, vnuValidates } from "./utils.js";
import { debuglog } from "node:util";
// Debug logger; enable output with NODE_DEBUG=website-validator.
export const log = debuglog("website-validator");
// Resolves the redirect target of a response, if any.
// HTTP 3xx statuses are resolved through the Location header; responses with
// an exact "text/html" content type are scanned for a single
// <meta http-equiv="refresh"> tag carrying a "url=" part.
// Returns undefined when the response is not a redirect; throws when more
// than one redirect meta tag is present.
export const getRedirect = async (res) => {
    // Case-insensitive header lookup on the plain headers object.
    const header = (wanted) => Object.entries(res.headers).find(([key]) => key.toLowerCase() === wanted)?.[1];
    if ([301, 302, 307, 308].includes(res.status)) {
        return header("location");
    }
    if (header("content-type") !== "text/html") {
        return undefined;
    }
    const contentRegex = /^\d+(; url=(?<url>.*))?$/;
    const pageElements = await getInterestingPageElements(res.data);
    const redirectMetas = pageElements.tagCollections.meta.filter((meta) => {
        return meta.attrs["http-equiv"] === "refresh"
            && meta.attrs["content"]?.match(contentRegex)?.groups?.["url"] !== undefined;
    });
    if (redirectMetas.length > 1) {
        throw new Error("More than one redirect metas are not supported...");
    }
    if (redirectMetas.length === 0) {
        return undefined;
    }
    // Resolve the meta-refresh target against the page's own URL.
    const targetUrl = redirectMetas[0].attrs["content"].match(contentRegex).groups["url"];
    return new URL(targetUrl, res.url).href;
};
// Canonicalizes a URL relative to baseUrl: directory paths (trailing "/")
// get the index file name appended, and query/fragment parts are dropped.
// URLs on a different protocol (e.g. mailto:) are returned unchanged.
export const toCanonical = (baseUrl, indexName) => (url) => {
    const resolved = new URL(url, baseUrl);
    if (resolved.protocol !== new URL(baseUrl).protocol) {
        return url;
    }
    const pathname = resolved.pathname.endsWith("/")
        ? `${resolved.pathname}${indexName}`
        : resolved.pathname;
    return resolved.origin + pathname;
};
// Curried predicate: true when url (resolved against baseUrl) has exactly
// baseUrl as its origin. baseUrl is expected to be an origin string
// (no trailing slash or path).
export const isInternalLink = (baseUrl) => (url) => new URL(url, baseUrl).origin === baseUrl;
// Curried: strips the origin (and fragment) from a URL, keeping only the
// path and query string.
const toRelativeUrl = (baseUrl) => (url) => {
    const { pathname, search } = new URL(url, baseUrl);
    return pathname + search;
};
// Default response metadata for a served file: HTTP 200 with a Content-Type
// derived from the file extension, falling back to a generic binary type
// when the extension is unknown.
const defaultResponseMeta = (filePath) => ({
    status: 200,
    headers: {
        "Content-Type": mime.getType(path.extname(filePath)) ?? "application/octet-stream",
    },
});
// Fetches a single URL from the local target directory, mimicking an HTTP
// response object ({ url, status, headers, data }). Missing files yield a
// 404-shaped response with data: null; other fs errors are rethrown.
const fetchSingleFile = (baseUrl, targetConfig) => async (url) => {
    const indexName = targetConfig.indexName ?? "index.html";
    const responseMeta = targetConfig.responseMeta ?? defaultResponseMeta;
    // BUG FIX: isInternalLink is curried; the previous `!isInternalLink(url)`
    // returned a (truthy) function, so the guard never fired. Apply baseUrl
    // first, then test the url (same usage as in validate()).
    if (!isInternalLink(baseUrl)(url)) {
        throw new Error(`Link not internal: ${url}`);
    }
    // Canonicalize (directory -> index file), then map to a path under dir.
    const fileUrl = toRelativeUrl(baseUrl)(toCanonical(baseUrl, indexName)(url));
    const filePath = path.join(targetConfig.dir, fileUrl);
    try {
        const stat = await fs.stat(filePath);
        return {
            ...responseMeta(fileUrl),
            url,
            data: { path: filePath, mtime: stat.mtimeMs },
        };
    }
    catch (e) {
        if (e.code === "ENOENT") {
            // Missing file is an expected outcome, reported as a 404.
            return {
                url,
                headers: {},
                status: 404,
                data: null,
            };
        }
        throw e;
    }
};
// Recursively fetches the file graph reachable from fetchBases, plus URLs
// contributed by extra XML/TXT sitemaps and extraUrls (treated as assets).
// Files are read from targetConfig.dir; missing files yield 404-shaped
// responses (data: null).
export const fetchFileGraph = (pool) => (baseUrl, targetConfig) => async (fetchBases, extras) => {
    const indexName = targetConfig.indexName ?? "index.html";
    const responseMeta = targetConfig.responseMeta ?? defaultResponseMeta;
    const fetchFile = async (url) => {
        // BUG FIX: isInternalLink is curried; the previous `!isInternalLink(url)`
        // returned a (truthy) function, so the guard never fired. Apply baseUrl
        // first, then test the url (same usage as in validate()).
        if (!isInternalLink(baseUrl)(url)) {
            throw new Error(`Link not internal: ${url}`);
        }
        const fileUrl = toRelativeUrl(baseUrl)(toCanonical(baseUrl, indexName)(url));
        const filePath = path.join(targetConfig.dir, fileUrl);
        try {
            const stat = await fs.stat(filePath);
            return {
                ...responseMeta(fileUrl),
                url,
                data: { path: filePath, mtime: stat.mtimeMs },
            };
        }
        catch (e) {
            if (e.code === "ENOENT") {
                return {
                    url,
                    headers: {},
                    status: 404,
                    data: null,
                };
            }
            throw e;
        }
    };
    // Collect the seed URLs from both sitemap kinds in parallel (the two
    // groups are independent), then append plain extra URLs as assets.
    const [xmlSitemapUrls, txtSitemapUrls] = await Promise.all([
        Promise.all((extras.extraXmlSitemaps ?? []).map((sitemap) => getUrlsFromSitemap(sitemap, "xml"))),
        Promise.all((extras.extraTxtSitemaps ?? []).map((sitemap) => getUrlsFromSitemap(sitemap, "txt"))),
    ]);
    return await recursiveFetchFiles(pool, fetchFile, baseUrl, indexName)([
        ...fetchBases,
        ...xmlSitemapUrls.flat(),
        ...txtSitemapUrls.flat(),
        ...(extras.extraUrls ?? []).map((url) => ({ url, role: { type: "asset" } })),
    ]);
};
// Turns the `extras` input into link entries: sitemap URLs become document
// asserts tagged with the index of the contributing sitemap, and extraUrls
// become assert-free asset links.
const getExtraLinks = async (extras) => {
    // Shared mapper for both sitemap formats ("xml" / "txt").
    const sitemapLinks = async (sitemaps, format) => {
        const perSitemap = await Promise.all(sitemaps.map(async (sitemap, sitemapIndex) => {
            const urls = await getUrlsFromSitemap(sitemap, format);
            return urls.map(({ location, ...rest }) => ({
                ...rest,
                asserts: [{ type: "document" }],
                location: {
                    ...location,
                    sitemaplocation: {
                        extrasitemapIndex: sitemapIndex,
                    },
                },
            }));
        }));
        return perSitemap.flat();
    };
    const urlLinks = (extras.extraUrls ?? []).map((url, index) => ({
        url,
        role: { type: "asset" },
        asserts: [],
        location: { type: "extraurl", index },
    }));
    return [
        ...(await sitemapLinks(extras.extraXmlSitemaps ?? [], "xml")),
        ...(await sitemapLinks(extras.extraTxtSitemaps ?? [], "txt")),
        ...urlLinks,
    ];
};
// Validates a built website rooted at targetConfig.dir against baseUrl.
// Curried: options -> (baseUrl, targetConfig) -> (fetchBases, extras, additionalValidators).
// Fetches the reachable file graph, then runs three error collectors in
// parallel inside a worker pool and returns the flattened error list:
//   1. NOT_FOUND for fetch bases that resolve to no fetched file,
//   2. per-link assertion errors (pool.checkLink on each internal link),
//   3. per-file errors (vnu HTML/CSS/SVG validation, pool.validateFile, and
//      out-of-range additionalValidator match counts).
export const validate = (options) => (baseUrl, targetConfig) => async (fetchBases, extras, additionalValidators) => {
assert((extras.extraUrls ?? []).every((url) => isInternalLink(baseUrl)(url)), "extraUrls must be internal links");
assert(path.isAbsolute(targetConfig.dir), "targetConfig.dir must be an absolute path");
const indexName = targetConfig.indexName ?? "index.html";
return withPool(options?.concurrency)(async (pool) => {
const fetchedFiles = await fetchFileGraph(pool)(baseUrl, targetConfig)(fetchBases, extras);
// Index fetch results by URL; a URL fetched more than once keeps a single
// entry whose roles and links are deduplicated with deepEqual.
const files = fetchedFiles.reduce((memo, { url, role, res, links }) => {
const existing = memo[url];
if (memo[url]) {
memo[url] = {
...existing,
roles: [...existing.roles, role].filter((e, i, l) => l.findIndex((e2) => deepEqual(e, e2)) === i),
links: [...(existing.links ?? []), ...(links ?? [])].filter((e, i, l) => l.findIndex((e2) => deepEqual(e, e2)) === i),
};
return memo;
}
else {
memo[url] = { res, roles: [role], links };
return memo;
}
}, {});
// All links to check: those extracted from fetched files plus those
// contributed directly by extras (sitemaps / extraUrls).
const extraLinks = await getExtraLinks(extras);
const allLinks = [
...(Object.values(files).flatMap(({ links }) => links === null ? [] : links.map((link) => ({ url: link.url, asserts: link.asserts, location: link.location })))),
...extraLinks,
];
log("fetchedFiles: %O, allLinks: %O, files: %O", fetchedFiles, allLinks, files);
return (await Promise.all([
(async () => {
// not found errors
return [
...fetchBases.map(({ url }, index) => {
return { url, index, type: "fetchBase" };
}),
].flatMap(({ url, index, type }) => {
const canonical = toCanonical(baseUrl, indexName)(url);
const file = fetchedFiles.find((file) => file.url === canonical);
// res.data === null marks a 404 produced by fetchFileGraph.
if (file === undefined || file.res.data === null) {
return [{
type: "NOT_FOUND",
location: {
url,
location: {
type,
index,
}
}
}];
}
else {
return [];
}
});
})(),
(async () => {
// link errors
// Only internal links are checked; every internal link must resolve to
// an entry in `files` (the graph fetch is expected to have followed it).
return (await Promise.all(allLinks.filter((link) => isInternalLink(baseUrl)(link.url)).map(async (link) => {
const target = files[toCanonical(baseUrl, indexName)(link.url)]?.res;
assert(target, "whops; " + JSON.stringify({ canonical: toCanonical(baseUrl, indexName)(link.url) }, undefined, 4));
return pool.checkLink({ baseUrl, indexName, target, link });
}))).flat(1);
})(),
(async () => {
// file errors
// Select fetched HTML/CSS/SVG files (by exact Content-Type) for vnu
// validation.
const vnuCheckFiles = Object.entries(files).flatMap(([, { res }]) => {
const contentType = Object.entries(res.headers).find(([name]) => name.toLowerCase() === "content-type")?.[1];
if (res.data !== null && contentType === "text/html") {
return [{
type: "html",
data: res.data,
}];
}
else if (res.data !== null && contentType === "text/css") {
return [{
type: "css",
data: res.data,
}];
}
else if (res.data !== null && contentType === "image/svg+xml") {
return [{
type: "svg",
data: res.data,
}];
}
else {
return [];
}
});
const vnuValidationResults = await vnuValidates(vnuCheckFiles);
// Validate each fetched file in the pool, passing its resolved internal
// link targets and the additionalValidators whose urlPattern matches the
// file's relative URL.
const fileValidationResults = await Promise.all(Object.entries(files).map(async ([url, { res, roles, links }]) => {
if (res.data !== null) {
assert(links);
const linkedFiles = Object.fromEntries(links.filter(({ url }) => isInternalLink(baseUrl)(url)).map(({ url }) => {
const target = files[toCanonical(baseUrl, indexName)(url)]?.res;
assert(target);
return [toCanonical(baseUrl, indexName)(url), target];
}));
const vnuResults = vnuValidationResults[res.data.path];
const matchedAdditionalValidators = additionalValidators.filter(({ urlPattern }) => urlPattern.test(toRelativeUrl(baseUrl)(url)));
return { matchedAdditionalValidators, errors: await pool.validateFile({ baseUrl, indexName, url, res: res, roles, linkedFiles, vnuResults, additionalValidators: matchedAdditionalValidators.map(({ config }) => config) }) };
}
else {
// 404s produce no file errors here; they surface as link/NOT_FOUND errors.
return { matchedAdditionalValidators: [], errors: [] };
}
}));
return [
// Count how many files each additionalValidator matched and report
// counts outside [minMatches, maxMatches] (defaults: 0 .. MAX_SAFE_INTEGER).
...additionalValidators.map((additionalValidator) => {
return [additionalValidator, fileValidationResults.reduce((memo, { matchedAdditionalValidators }) => {
return memo + (matchedAdditionalValidators.includes(additionalValidator) ? 1 : 0);
}, 0)];
}).flatMap(([additionalValidator, num]) => {
if (num >= (additionalValidator.minMatches ?? 0) && num <= (additionalValidator.maxMatches ?? Number.MAX_SAFE_INTEGER)) {
return [];
}
else {
return [{
type: "ADDITIONAL_VALIDATOR_MATCH_NUMBER_OUTSIDE_EXPECTED_RANGE",
minMatches: additionalValidator.minMatches,
maxMatches: additionalValidator.maxMatches,
actualMatches: num,
urlPattern: additionalValidator.urlPattern.toString(),
}];
}
}),
...fileValidationResults.map(({ errors }) => errors).flat(2),
];
})(),
])).flat(1);
});
};
// Compares two versions of a website (original vs. new) and reports
// backward-compatibility regressions. Curried like validate. Returns:
//   - removedPermanentUrls: links asserted "permanent" in the original
//     version whose URL no longer appears among the new permanent links,
//   - nonForwardCompatibleJsonLinks: links extracted from JSON fetch bases
//     of the NEW files under the new config but not under the original config,
//   - feedGuidsChanged: RSS guids / Atom ids that changed while the item
//     link stayed the same between the two versions.
export const compareVersions = (options) => (baseUrl, targetConfig) => (fetchBases, extras) => (originalBaseUrl, originalTargetConfig) => async (originalFetchBases, originalExtras) => {
assert(path.isAbsolute(targetConfig.dir), "targetConfig.dir must be an absolute path");
assert(path.isAbsolute(originalTargetConfig.dir), "originalTargetConfig.dir must be an absolute path");
return withPool(options?.concurrency)(async (pool) => {
const [originalFileGraph, newFileGraph] = await Promise.all([
fetchFileGraph(pool)(originalBaseUrl, originalTargetConfig)(originalFetchBases, originalExtras),
fetchFileGraph(pool)(baseUrl, targetConfig)(fetchBases, extras),
]);
// All links in a file graph plus the links contributed by extras.
const getAllLinks = (files) => async (extras) => {
const extraLinks = await getExtraLinks(extras);
const getLinksFromFileGraph = (files) => {
return files
.flatMap(({ links }) => links ?? [])
.map(({ url, role, asserts, location }) => ({ url, role, asserts, location }));
};
const allLinks = [
...getLinksFromFileGraph(files),
...extraLinks,
];
return allLinks;
};
// Links carrying a "permanent" assert are expected to survive across versions.
const getAllPermanentLinks = (files) => async (extras) => {
return (await getAllLinks(files)(extras))
.filter(({ asserts }) => asserts.some(({ type }) => type === "permanent"));
};
const [removedPermanentUrls, nonForwardCompatibleJsonLinks, feedGuidsChanged] = await Promise.all([
(async () => {
// Permanent links present in the original version but missing (by URL)
// from the new version.
const originalPermanentLinks = await getAllPermanentLinks(originalFileGraph)(originalExtras);
const newPermanentLinks = await getAllPermanentLinks(newFileGraph)(extras);
return originalPermanentLinks.filter((link) => {
return !newPermanentLinks.some(({ url }) => link.url === url);
});
})(),
(async () => {
// Extract links one level deep from JSON-role fetch bases of the NEW
// files, once with the new fetch bases and once with the original ones;
// links produced only by the new config are not forward-compatible.
const getLinksShallow = (baseUrl, targetConfig) => async (fetchBases) => {
return Promise.all(fetchBases.map(async (fetchBase) => {
const res = await fetchSingleFile(baseUrl, targetConfig)(fetchBase.url);
if (res.data === null) {
return [];
}
return pool.getLinks({ url: fetchBase.url, role: fetchBase.role, res: res });
}));
};
const linksInJsonsWithNewFilesNewConfig = (await getLinksShallow(baseUrl, targetConfig)(fetchBases.filter(({ role }) => role.type === "json"))).flat();
const linksInJsonsWithNewFilesOriginalConfig = (await getLinksShallow(baseUrl, targetConfig)(originalFetchBases.filter(({ role }) => role.type === "json"))).flat();
return linksInJsonsWithNewFilesNewConfig.filter((link) => !linksInJsonsWithNewFilesOriginalConfig.some((l2) => deepEqual(link, l2)));
})(),
(async () => {
// Detect feed items whose identifier changed while the item link stayed
// the same — RSS <guid> and Atom <id> are expected to be stable.
const [changedRssItems, changedAtomItems] = await Promise.all([
(async () => {
const oldRssFiles = originalFileGraph.filter(({ role }) => role.type === "rss");
const newRssFiles = newFileGraph.filter(({ role }) => role.type === "rss");
// Pair each new RSS file with the same-URL original file; keep only
// pairs where both versions were actually fetched (data !== null).
const existingRssFiles = newRssFiles.map((newFile) => [newFile, oldRssFiles.find((oldFile) => newFile.url === oldFile.url)]).filter(([newFile, oldFile]) => oldFile !== undefined && oldFile.res.data !== null && newFile.res.data !== null);
const changedRssGuids = await Promise.all(existingRssFiles.map(async ([newFile, oldFile]) => {
// Parse {link, guid} pairs from every <item> of an RSS file on disk.
const getRssItems = async (file) => {
const contents = await fs.readFile(file);
const parsed = await xml2js.parseStringPromise(contents.toString("utf8"), { explicitCharkey: true });
return parsed.rss.channel.flatMap((channel) => (channel.item.map((c) => ({ link: c.link, guid: c.guid }))).flatMap(({ link, guid }) => ({ link, guid }))).flatMap(({ link, guid }) => {
return {
link: link[0]._,
guid: guid[0]._,
};
});
};
const originalRssItems = await getRssItems(oldFile.res.data.path);
const newRssItems = await getRssItems(newFile.res.data.path);
// Same link, different guid => regression entry.
return originalRssItems.flatMap(({ link, guid }) => {
const matchingItem = newRssItems.find((item) => item.link === link);
if (matchingItem && matchingItem.guid !== guid) {
return [{
url: link,
feedUrl: newFile.url,
originalGuid: guid,
newGuid: matchingItem.guid,
}];
}
else {
return [];
}
});
}));
return changedRssGuids.flat();
})(),
(async () => {
// Same comparison for Atom feeds, using <entry> link href and <id>.
const oldAtomFiles = originalFileGraph.filter(({ role }) => role.type === "atom");
const newAtomFiles = newFileGraph.filter(({ role }) => role.type === "atom");
const existingAtomFiles = newAtomFiles.map((newFile) => [newFile, oldAtomFiles.find((oldFile) => newFile.url === oldFile.url)]).filter(([newFile, oldFile]) => oldFile !== undefined && oldFile.res.data !== null && newFile.res.data !== null);
const changedAtomGuids = await Promise.all(existingAtomFiles.map(async ([newFile, oldFile]) => {
// Parse {link, id} pairs from every <entry> of an Atom file on disk.
const getAtomItems = async (file) => {
const contents = await fs.readFile(file);
const parsed = await xml2js.parseStringPromise(contents.toString("utf8"), { explicitCharkey: true });
return parsed.feed.entry.flatMap((entry) => ({ href: entry.link[0].$.href, id: entry.id[0]._ })).map(({ href, id }) => {
return {
link: href,
id,
};
});
};
const originalAtomItems = await getAtomItems(oldFile.res.data.path);
const newAtomItems = await getAtomItems(newFile.res.data.path);
return originalAtomItems.flatMap(({ link, id }) => {
const matchingItem = newAtomItems.find((item) => item.link === link);
if (matchingItem && matchingItem.id !== id) {
return [{
url: link,
feedUrl: newFile.url,
originalGuid: id,
newGuid: matchingItem.id,
}];
}
else {
return [];
}
});
}));
return changedAtomGuids.flat();
})(),
]);
return [...changedRssItems, ...changedAtomItems];
})(),
]);
return { removedPermanentUrls, nonForwardCompatibleJsonLinks, feedGuidsChanged };
});
};
//# sourceMappingURL=index.js.map