// website-validator (index.js)
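/**
 * Overview: this module validates a statically generated site on disk. It
 * fetches files from a target directory as if they were served at a base URL,
 * follows internal links recursively, and reports problems such as missing
 * fetch bases, broken internal links, HTML/CSS/SVG validation errors (via
 * vnu), and additional validators whose match counts fall outside the expected
 * range. compareVersions compares two builds of the same site to detect
 * removed permanent URLs, non-forward-compatible JSON links, and changed
 * RSS/Atom GUIDs.
 */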
import path from "node:path";
import { deepEqual } from "fast-equals";
import fs from "node:fs/promises";
import mime from "mime";
import { strict as assert } from "node:assert";
import { getUrlsFromSitemap } from "./get-links.js";
import { recursiveFetchFiles } from "./fetch-files.js";
import { withPool } from "./worker-runner.js";
import xml2js from "xml2js";
import { getInterestingPageElements, vnuValidates } from "./utils.js";
import { debuglog } from "node:util";

export const log = debuglog("website-validator");

// Resolves the redirect target of a response, if any: either a Location header
// on a 3xx status, or a <meta http-equiv="refresh"> tag in an HTML body.
export const getRedirect = async (res) => {
    if ([301, 302, 307, 308].includes(res.status)) {
        return Object.entries(res.headers).find(([name]) => name.toLowerCase() === "location")?.[1];
    }
    else {
        const contentType = Object.entries(res.headers).find(([name]) => name.toLowerCase() === "content-type")?.[1];
        if (contentType === "text/html") {
            const allMetaTags = (await getInterestingPageElements(res.data)).tagCollections.meta;
            const contentRegex = /^\d+(; url=(?<url>.*))?$/;
            const redirectMetas = allMetaTags.filter((meta) => meta.attrs["http-equiv"] === "refresh" && meta.attrs["content"]?.match(contentRegex)?.groups?.["url"] !== undefined);
            if (redirectMetas.length > 1) {
                throw new Error("Multiple redirect meta tags are not supported...");
            }
            else if (redirectMetas.length === 1) {
                return new URL(redirectMetas[0].attrs["content"].match(contentRegex).groups["url"], res.url).href;
            }
            else {
                return undefined;
            }
        }
        else {
            return undefined;
        }
    }
};

// Normalizes a URL to its canonical form: resolves it against the base URL and
// appends the index document name (e.g. index.html) to directory paths. URLs
// on a different protocol are returned unchanged.
export const toCanonical = (baseUrl, indexName) => (url) => {
    const urlObj = new URL(url, baseUrl);
    if (urlObj.protocol !== new URL(baseUrl).protocol) {
        return url;
    }
    else {
        const resolvedPathName = urlObj.pathname.endsWith("/") ? urlObj.pathname + indexName : urlObj.pathname;
        return urlObj.origin + resolvedPathName;
    }
};

export const isInternalLink = (baseUrl) => (url) => {
    return new URL(url, baseUrl).origin === baseUrl;
};

const toRelativeUrl = (baseUrl) => (url) => {
    const urlObj = new URL(url, baseUrl);
    return urlObj.pathname + urlObj.search;
};

// Default response metadata: guesses the Content-Type from the file extension
// and reports a 200 status.
const defaultResponseMeta = (filePath) => {
    return {
        headers: {
            "Content-Type": mime.getType(path.extname(filePath)) ?? "application/octet-stream",
        },
        status: 200,
    };
};

// Reads a single URL from the target directory without following links;
// returns a 404-style response when the file does not exist.
const fetchSingleFile = (baseUrl, targetConfig) => (url) => {
    const indexName = targetConfig.indexName ?? "index.html";
    const responseMeta = targetConfig.responseMeta ?? defaultResponseMeta;
    const fetchFile = (() => {
        return async (url) => {
            if (!isInternalLink(baseUrl)(url)) {
                throw new Error(`Link not internal: ${url}`);
            }
            const fileUrl = toRelativeUrl(baseUrl)(toCanonical(baseUrl, indexName)(url));
            const filePath = path.join(targetConfig.dir, fileUrl);
            try {
                const stat = await fs.stat(filePath);
                return {
                    ...responseMeta(fileUrl),
                    url,
                    data: { path: filePath, mtime: stat.mtimeMs },
                };
            }
            catch (e) {
                if (e.code === "ENOENT") {
                    return {
                        url,
                        headers: {},
                        status: 404,
                        data: null,
                    };
                }
                else {
                    throw e;
                }
            }
        };
    })();
    return fetchFile(url);
};

// Builds the full file graph: fetches the fetch bases plus URLs from any extra
// sitemaps and extra URLs, then recursively follows internal links.
export const fetchFileGraph = (pool) => (baseUrl, targetConfig) => async (fetchBases, extras) => {
    const indexName = targetConfig.indexName ?? "index.html";
    const responseMeta = targetConfig.responseMeta ??
        defaultResponseMeta;
    const fetchFile = (() => {
        return async (url) => {
            if (!isInternalLink(baseUrl)(url)) {
                throw new Error(`Link not internal: ${url}`);
            }
            const fileUrl = toRelativeUrl(baseUrl)(toCanonical(baseUrl, indexName)(url));
            const filePath = path.join(targetConfig.dir, fileUrl);
            try {
                const stat = await fs.stat(filePath);
                return {
                    ...responseMeta(fileUrl),
                    url,
                    data: { path: filePath, mtime: stat.mtimeMs },
                };
            }
            catch (e) {
                if (e.code === "ENOENT") {
                    return {
                        url,
                        headers: {},
                        status: 404,
                        data: null,
                    };
                }
                else {
                    throw e;
                }
            }
        };
    })();
    return await recursiveFetchFiles(pool, fetchFile, baseUrl, indexName)([
        ...fetchBases,
        ...(await Promise.all((extras.extraXmlSitemaps ?? []).map(async (xmlSitemap) => getUrlsFromSitemap(xmlSitemap, "xml")))).flat(),
        ...(await Promise.all((extras.extraTxtSitemaps ?? []).map(async (txtSitemap) => getUrlsFromSitemap(txtSitemap, "txt")))).flat(),
        ...(extras.extraUrls ?? []).map((url) => ({ url, role: { type: "asset" } })),
    ]);
};

// Collects links that come from outside the crawled pages: URLs listed in the
// extra XML/TXT sitemaps and the extra URLs passed in by the caller.
const getExtraLinks = async (extras) => {
    return [
        ...(await Promise.all((extras.extraXmlSitemaps ?? []).map(async (xmlSitemap, sitemapIndex) => (await getUrlsFromSitemap(xmlSitemap, "xml")).map(({ location, ...rest }) => {
            return {
                ...rest,
                asserts: [{ type: "document" }],
                location: {
                    ...location,
                    sitemaplocation: {
                        extrasitemapIndex: sitemapIndex,
                    },
                },
            };
        })))).flat(),
        ...(await Promise.all((extras.extraTxtSitemaps ?? []).map(async (txtSitemap, sitemapIndex) => (await getUrlsFromSitemap(txtSitemap, "txt")).map(({ location, ...rest }) => {
            return {
                ...rest,
                asserts: [{ type: "document" }],
                location: {
                    ...location,
                    sitemaplocation: {
                        extrasitemapIndex: sitemapIndex,
                    },
                },
            };
        })))).flat(),
        ...(extras.extraUrls ?? []).map((url, index) => ({ url, role: { type: "asset" }, asserts: [], location: { type: "extraurl", index } })),
    ];
};

// Validates the website rooted at targetConfig.dir: crawls the file graph,
// then reports missing fetch bases, broken internal links, vnu validation
// errors (HTML/CSS/SVG), and additional-validator results.
export const validate = (options) => (baseUrl, targetConfig) => async (fetchBases, extras, additionalValidators) => {
    assert((extras.extraUrls ?? []).every((url) => isInternalLink(baseUrl)(url)), "extraUrls must be internal links");
    assert(path.isAbsolute(targetConfig.dir), "targetConfig.dir must be an absolute path");
    const indexName = targetConfig.indexName ?? "index.html";
    return withPool(options?.concurrency)(async (pool) => {
        const fetchedFiles = await fetchFileGraph(pool)(baseUrl, targetConfig)(fetchBases, extras);
        // Group fetched results by URL, merging roles and links while dropping duplicates.
        const files = fetchedFiles.reduce((memo, { url, role, res, links }) => {
            const existing = memo[url];
            if (memo[url]) {
                memo[url] = {
                    ...existing,
                    roles: [...existing.roles, role].filter((e, i, l) => l.findIndex((e2) => deepEqual(e, e2)) === i),
                    links: [...(existing.links ?? []), ...(links ?? [])].filter((e, i, l) => l.findIndex((e2) => deepEqual(e, e2)) === i),
                };
                return memo;
            }
            else {
                memo[url] = { res, roles: [role], links };
                return memo;
            }
        }, {});
        const extraLinks = await getExtraLinks(extras);
        const allLinks = [
            ...(Object.values(files).flatMap(({ links }) => links === null ?
                [] : links.map((link) => ({ url: link.url, asserts: link.asserts, location: link.location })))),
            ...extraLinks,
        ];
        log("fetchedFiles: %O, allLinks: %O, files: %O", fetchedFiles, allLinks, files);
        return (await Promise.all([
            (async () => {
                // not found errors: every fetch base must resolve to an existing file
                return [
                    ...fetchBases.map(({ url }, index) => {
                        return { url, index, type: "fetchBase" };
                    }),
                ].flatMap(({ url, index, type }) => {
                    const canonical = toCanonical(baseUrl, indexName)(url);
                    const file = fetchedFiles.find((file) => file.url === canonical);
                    if (file === undefined || file.res.data === null) {
                        return [{ type: "NOT_FOUND", location: { url, location: { type, index } } }];
                    }
                    else {
                        return [];
                    }
                });
            })(),
            (async () => {
                // link errors: check every internal link against its target file
                return (await Promise.all(allLinks.filter((link) => isInternalLink(baseUrl)(link.url)).map(async (link) => {
                    const target = files[toCanonical(baseUrl, indexName)(link.url)]?.res;
                    assert(target, "whoops; " + JSON.stringify({ canonical: toCanonical(baseUrl, indexName)(link.url) }, undefined, 4));
                    return pool.checkLink({ baseUrl, indexName, target, link });
                }))).flat(1);
            })(),
            (async () => {
                // file errors: vnu validation (HTML/CSS/SVG) plus per-file and additional validators
                const vnuCheckFiles = Object.entries(files).flatMap(([, { res }]) => {
                    const contentType = Object.entries(res.headers).find(([name]) => name.toLowerCase() === "content-type")?.[1];
                    if (res.data !== null && contentType === "text/html") {
                        return [{ type: "html", data: res.data }];
                    }
                    else if (res.data !== null && contentType === "text/css") {
                        return [{ type: "css", data: res.data }];
                    }
                    else if (res.data !== null && contentType === "image/svg+xml") {
                        return [{ type: "svg", data: res.data }];
                    }
                    else {
                        return [];
                    }
                });
                const vnuValidationResults = await vnuValidates(vnuCheckFiles);
                const fileValidationResults = await Promise.all(Object.entries(files).map(async ([url, { res, roles, links }]) => {
                    if (res.data !== null) {
                        assert(links);
                        const linkedFiles = Object.fromEntries(links.filter(({ url }) => isInternalLink(baseUrl)(url)).map(({ url }) => {
                            const target = files[toCanonical(baseUrl, indexName)(url)]?.res;
                            assert(target);
                            return [toCanonical(baseUrl, indexName)(url), target];
                        }));
                        const vnuResults = vnuValidationResults[res.data.path];
                        const matchedAdditionalValidators = additionalValidators.filter(({ urlPattern }) => urlPattern.test(toRelativeUrl(baseUrl)(url)));
                        return { matchedAdditionalValidators, errors: await pool.validateFile({ baseUrl, indexName, url, res: res, roles, linkedFiles, vnuResults, additionalValidators: matchedAdditionalValidators.map(({ config }) => config) }) };
                    }
                    else {
                        return { matchedAdditionalValidators: [], errors: [] };
                    }
                }));
                return [
                    // Report additional validators whose match count falls outside [minMatches, maxMatches].
                    ...additionalValidators.map((additionalValidator) => {
                        return [additionalValidator, fileValidationResults.reduce((memo, { matchedAdditionalValidators }) => {
                            return memo + (matchedAdditionalValidators.includes(additionalValidator) ? 1 : 0);
                        }, 0)];
                    }).flatMap(([additionalValidator, num]) => {
                        if (num >= (additionalValidator.minMatches ?? 0) && num <= (additionalValidator.maxMatches ??
                            Number.MAX_SAFE_INTEGER)) {
                            return [];
                        }
                        else {
                            return [{
                                type: "ADDITIONAL_VALIDATOR_MATCH_NUMBER_OUTSIDE_EXPECTED_RANGE",
                                minMatches: additionalValidator.minMatches,
                                maxMatches: additionalValidator.maxMatches,
                                actualMatches: num,
                                urlPattern: additionalValidator.urlPattern.toString(),
                            }];
                        }
                    }),
                    ...fileValidationResults.map(({ errors }) => errors).flat(2),
                ];
            })(),
        ])).flat(1);
    });
};

// Compares two versions of the site (new vs. original) and reports:
// removed permanent URLs, links from JSON fetch bases that are not forward
// compatible, and RSS/Atom feed items whose GUID changed for an unchanged link.
export const compareVersions = (options) => (baseUrl, targetConfig) => (fetchBases, extras) => (originalBaseUrl, originalTargetConfig) => async (originalFetchBases, originalExtras) => {
    assert(path.isAbsolute(targetConfig.dir), "targetConfig.dir must be an absolute path");
    assert(path.isAbsolute(originalTargetConfig.dir), "originalTargetConfig.dir must be an absolute path");
    return withPool(options?.concurrency)(async (pool) => {
        const [originalFileGraph, newFileGraph] = await Promise.all([
            fetchFileGraph(pool)(originalBaseUrl, originalTargetConfig)(originalFetchBases, originalExtras),
            fetchFileGraph(pool)(baseUrl, targetConfig)(fetchBases, extras),
        ]);
        const getAllLinks = (files) => async (extras) => {
            const extraLinks = await getExtraLinks(extras);
            const getLinksFromFileGraph = (files) => {
                return files
                    .flatMap(({ links }) => links ?? [])
                    .map(({ url, role, asserts, location }) => ({ url, role, asserts, location }));
            };
            const allLinks = [
                ...getLinksFromFileGraph(files),
                ...extraLinks,
            ];
            return allLinks;
        };
        const getAllPermanentLinks = (files) => async (extras) => {
            return (await getAllLinks(files)(extras))
                .filter(({ asserts }) => asserts.some(({ type }) => type === "permanent"));
        };
        const [removedPermanentUrls, nonForwardCompatibleJsonLinks, feedGuidsChanged] = await Promise.all([
            (async () => {
                // Permanent links present in the original version but missing from the new one.
                const originalPermanentLinks = await getAllPermanentLinks(originalFileGraph)(originalExtras);
                const newPermanentLinks = await getAllPermanentLinks(newFileGraph)(extras);
                return originalPermanentLinks.filter((link) => {
                    return !newPermanentLinks.some(({ url }) => link.url === url);
                });
            })(),
            (async () => {
                // Links found in the new JSON fetch bases (against the new files) that are
                // absent when the original fetch bases are used.
                const getLinksShallow = (baseUrl, targetConfig) => async (fetchBases) => {
                    return Promise.all(fetchBases.map(async (fetchBase) => {
                        const res = await fetchSingleFile(baseUrl, targetConfig)(fetchBase.url);
                        if (res.data === null) {
                            return [];
                        }
                        return pool.getLinks({ url: fetchBase.url, role: fetchBase.role, res: res });
                    }));
                };
                const linksInJsonsWithNewFilesNewConfig = (await getLinksShallow(baseUrl, targetConfig)(fetchBases.filter(({ role }) => role.type === "json"))).flat();
                const linksInJsonsWithNewFilesOriginalConfig = (await getLinksShallow(baseUrl, targetConfig)(originalFetchBases.filter(({ role }) => role.type === "json"))).flat();
                return linksInJsonsWithNewFilesNewConfig.filter((link) => !linksInJsonsWithNewFilesOriginalConfig.some((l2) => deepEqual(link, l2)));
            })(),
            (async () => {
                // Feed items (RSS and Atom) whose GUID/id changed while the item link stayed the same.
                const [changedRssItems, changedAtomItems] = await Promise.all([
                    (async () => {
                        const oldRssFiles = originalFileGraph.filter(({ role }) => role.type === "rss");
                        const newRssFiles = newFileGraph.filter(({ role }) => role.type === "rss");
                        const existingRssFiles = newRssFiles.map((newFile) => [newFile, oldRssFiles.find((oldFile) => newFile.url === oldFile.url)]).filter(([newFile, oldFile]) => oldFile !== undefined && oldFile.res.data !== null && newFile.res.data !== null);
                        const changedRssGuids = await Promise.all(existingRssFiles.map(async ([newFile, oldFile]) => {
                            const getRssItems = async (file) => {
                                const contents = await fs.readFile(file);
                                const parsed = await xml2js.parseStringPromise(contents.toString("utf8"), {
                                    explicitCharkey: true,
                                });
                                return parsed.rss.channel.flatMap((channel) => (channel.item.map((c) => ({ link: c.link, guid: c.guid }))).flatMap(({ link, guid }) => ({ link, guid }))).flatMap(({ link, guid }) => {
                                    return {
                                        link: link[0]._,
                                        guid: guid[0]._,
                                    };
                                });
                            };
                            const originalRssItems = await getRssItems(oldFile.res.data.path);
                            const newRssItems = await getRssItems(newFile.res.data.path);
                            return originalRssItems.flatMap(({ link, guid }) => {
                                const matchingItem = newRssItems.find((item) => item.link === link);
                                if (matchingItem && matchingItem.guid !== guid) {
                                    return [{
                                        url: link,
                                        feedUrl: newFile.url,
                                        originalGuid: guid,
                                        newGuid: matchingItem.guid,
                                    }];
                                }
                                else {
                                    return [];
                                }
                            });
                        }));
                        return changedRssGuids.flat();
                    })(),
                    (async () => {
                        const oldAtomFiles = originalFileGraph.filter(({ role }) => role.type === "atom");
                        const newAtomFiles = newFileGraph.filter(({ role }) => role.type === "atom");
                        const existingAtomFiles = newAtomFiles.map((newFile) => [newFile, oldAtomFiles.find((oldFile) => newFile.url === oldFile.url)]).filter(([newFile, oldFile]) => oldFile !== undefined && oldFile.res.data !== null && newFile.res.data !== null);
                        const changedAtomGuids = await Promise.all(existingAtomFiles.map(async ([newFile, oldFile]) => {
                            const getAtomItems = async (file) => {
                                const contents = await fs.readFile(file);
                                const parsed = await xml2js.parseStringPromise(contents.toString("utf8"), { explicitCharkey: true });
                                return parsed.feed.entry.flatMap((entry) => ({ href: entry.link[0].$.href, id: entry.id[0]._ })).map(({ href, id }) => {
                                    return {
                                        link: href,
                                        id,
                                    };
                                });
                            };
                            const originalAtomItems = await getAtomItems(oldFile.res.data.path);
                            const newAtomItems = await getAtomItems(newFile.res.data.path);
                            return originalAtomItems.flatMap(({ link, id }) => {
                                const matchingItem = newAtomItems.find((item) => item.link === link);
                                if (matchingItem && matchingItem.id !== id) {
                                    return [{
                                        url: link,
                                        feedUrl: newFile.url,
                                        originalGuid: id,
                                        newGuid: matchingItem.id,
                                    }];
                                }
                                else {
                                    return [];
                                }
                            });
                        }));
                        return changedAtomGuids.flat();
                    })(),
                ]);
                return [...changedRssItems, ...changedAtomItems];
            })(),
        ]);
        return { removedPermanentUrls, nonForwardCompatibleJsonLinks, feedGuidsChanged };
    });
};
//# sourceMappingURL=index.js.map
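// Usage sketch: one way the curried `validate` export could be wired together.
// The base URL, directory, fetch-base shape, and the "document" role type are
// assumptions inferred from how the parameters are used above, not taken from
// the package documentation.
//
// import { validate } from "website-validator";
//
// const errors = await validate({ concurrency: 2 })(
//     "https://example.com",
//     { dir: "/absolute/path/to/build", indexName: "index.html" },
// )(
//     [{ url: "https://example.com/", role: { type: "document" } }],
//     { extraUrls: [], extraXmlSitemaps: [], extraTxtSitemaps: [] },
//     [],
// );
// // `errors` is a flat array of error objects, e.g. { type: "NOT_FOUND", ... }.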