website-validator
Version:
Comprehensive website validation
454 lines • 23.5 kB
JavaScript
import { getRedirect, isInternalLink, toCanonical } from "./index.js";
import { validateEpub, validatePdf, getImageDimensions, getInterestingPageElements } from "./utils.js";
import fs from "node:fs/promises";
import robotsParser from "robots-parser";
import { getUrlsFromSitemap } from "./get-links.js";
import path from "node:path";
import xml2js from "xml2js";
import { strict as assert } from "node:assert";
import { parseSrcset } from "srcset";
import Ajv from "ajv";
import addFormats from "ajv-formats";
const ajv = new Ajv();
addFormats(ajv);
export const validateFile = async (baseUrl, indexName, url, res, roles, linkedFiles, vnuResults, additionalValidators) => {
const contentType = Object.entries(res.headers).find(([name]) => name.toLowerCase() === "content-type")?.[1];
const redirect = await getRedirect(res);
return (await Promise.all([
(async () => {
if (redirect !== undefined && contentType === "text/html") {
const allLinks = (await getInterestingPageElements(res.data)).tagCollections.link;
const allCanonicals = allLinks.filter((link) => link.attrs["rel"] === "canonical");
if (allCanonicals.length > 0) {
const canonicalHref = (() => {
const href = allCanonicals[0].attrs["href"];
if (href) {
if (isInternalLink(baseUrl)(href)) {
return toCanonical(baseUrl, indexName)(href);
}
else {
return href;
}
}
else {
return "";
}
})();
const canonicalRedirect = isInternalLink(baseUrl)(redirect) ? toCanonical(baseUrl, indexName)(redirect) : redirect;
if (canonicalHref !== canonicalRedirect) {
return [{
type: "REDIRECT_DIFFERENT_CANONICAL",
redirectTarget: redirect,
canonicalTarget: allCanonicals[0].attrs["href"] ?? "",
}];
}
else {
return [];
}
}
else {
return [];
}
}
else {
return [];
}
})(),
(async () => {
if (contentType === "text/html") {
return (await Promise.all([
(async () => {
assert(vnuResults);
return (vnuResults).map((object) => {
return {
type: "VNU",
object,
location: {
url,
}
};
});
})(),
(async () => {
const allLinks = (await getInterestingPageElements(res.data)).tagCollections.link;
const allCanonicals = allLinks.filter((link) => link.attrs["rel"] === "canonical");
if (allCanonicals.length > 1) {
return [{
type: "MULTIPLE_CANONICAL_LINKS",
canonicalLinks: allCanonicals.map(({ outerHTML, selector }) => ({
outerHTML,
selector,
})),
}];
}
else if (redirect === undefined && allCanonicals.length === 1) {
const canonicalHref = allCanonicals[0].attrs["href"] ? toCanonical(baseUrl, indexName)(allCanonicals[0].attrs["href"]) : "";
if (canonicalHref !== url) {
return [{
type: "NON_REDIRECT_DIFFERENT_CANONICAL",
canonicalLink: allCanonicals[0].attrs["href"] ?? "",
location: {
url: url,
},
}];
}
else {
return [];
}
}
else {
return [];
}
})(),
(async () => {
const allJSONLDs = (await getInterestingPageElements(res.data)).tagCollections.script.filter(({ attrs }) => attrs["type"] === "application/ld+json");
return allJSONLDs.flatMap((jsonLd) => {
try {
JSON.parse(jsonLd.innerHTML);
return [];
}
catch {
return [{ type: "JSON_LD_UNPARSEABLE", location: { url, location: { outerHTML: jsonLd.outerHTML, selector: jsonLd.selector } } }];
}
});
})(),
(async () => {
const allImgs = (await getInterestingPageElements(res.data)).tagCollections.img;
return (await Promise.all(allImgs.map(async (img) => {
const src = await (async () => {
const srcAttr = img.attrs["src"];
if (srcAttr) {
if (isInternalLink(baseUrl)(srcAttr)) {
const res = linkedFiles[toCanonical(baseUrl, indexName)(srcAttr)];
assert(res);
assert(res.data);
const dimensions = await getImageDimensions(res.data);
assert(dimensions.width !== undefined);
assert(dimensions.height !== undefined);
return {
url: srcAttr,
width: dimensions.width,
height: dimensions.height,
external: false,
};
}
else {
return {
url: srcAttr,
external: true,
};
}
}
else {
return undefined;
}
})();
const srcset = await (async () => {
if (img.attrs["srcset"]) {
const srcset = parseSrcset(img.attrs["srcset"]);
return Promise.all(srcset.map(async ({ url, density, width }) => {
if (isInternalLink(baseUrl)(url)) {
const res = linkedFiles[toCanonical(baseUrl, indexName)(url)];
assert(res, JSON.stringify({ url, linkedFiles }, undefined, 4));
assert(res.data);
const dimensions = await getImageDimensions(res.data);
assert(dimensions.width !== undefined);
assert(dimensions.height !== undefined);
return {
url,
width: dimensions.width,
height: dimensions.height,
external: false,
descriptor: density !== undefined ? { density } : { width: width },
};
}
else {
return {
url,
external: true,
descriptor: density !== undefined ? { density } : { width: width },
};
}
}));
}
else {
return undefined;
}
})();
const srcSetWidthsIncorrect = srcset?.some(({ width, descriptor }) => descriptor.width !== undefined && width !== descriptor.width);
const srcSetDensitiesIncorrect = (() => {
const mergedSrcSets = [
...(srcset ?? []).filter(({ descriptor }) => descriptor.density !== undefined),
...(src !== undefined ? [{ ...src, descriptor: { density: 1 } }] : []),
].filter(({ external }) => !external);
const maxDensity = Math.max(...mergedSrcSets.map(({ descriptor }) => {
assert(descriptor.density !== undefined);
return descriptor.density;
}));
if (maxDensity === Number.NEGATIVE_INFINITY) {
return false;
}
else {
const maxWidth = mergedSrcSets.find(({ descriptor }) => descriptor.density === maxDensity)?.width;
assert(maxWidth);
return mergedSrcSets.some(({ width, descriptor }) => {
assert(descriptor.density !== undefined);
assert(width !== undefined);
return Math.abs(width - (maxWidth * descriptor.density / maxDensity)) > 1;
});
}
})();
const aspectRatiosIncorrect = (() => {
const mergedSrcSets = [
...(srcset ?? []),
...(src !== undefined ? [src] : []),
].filter(({ external }) => !external).map(({ width, height }) => ({ width, height }));
return mergedSrcSets.length !== 0 && mergedSrcSets.some(({ width, height }, _i, l) => {
assert(width !== undefined);
assert(height !== undefined);
assert(l[0].width !== undefined);
assert(l[0].height !== undefined);
const res = Math.abs(l[0].height - height * l[0].width / width) > 2;
return res;
});
})();
const sizesIncorrect = (() => {
const sizes = img.attrs["sizes"];
const sizePattern = /^(?<num>\d+)px$/;
if (srcset?.length > 0 && sizes !== undefined && sizes.includes(" ") === false && sizes.match(sizePattern)) {
const num = parseInt(sizes.match(sizePattern).groups["num"]);
return !(srcset ?? []).some(({ width }) => width === num);
}
else {
return false;
}
})();
if (srcSetWidthsIncorrect || srcSetDensitiesIncorrect || aspectRatiosIncorrect || sizesIncorrect) {
return [{
type: "IMG_SRC_INVALID",
location: {
url: url,
location: { outerHTML: img.outerHTML, selector: img.selector },
},
src,
srcset,
sizes: img.attrs["sizes"],
}];
}
return [];
}))).flat(1);
})(),
])).flat(1);
}
else if (contentType === "application/epub+zip") {
const results = await validateEpub(res.data);
return results.map((msg) => ({
type: "EPUBCHECK",
location: { url },
object: msg,
}));
}
else if (contentType === "application/pdf") {
const results = await validatePdf(res.data);
return results.map((msg) => ({
type: "PDF_CAN_NOT_BE_PARSED",
location: { url },
message: msg,
}));
}
else if (contentType === "application/json" || (contentType !== undefined && contentType.startsWith("application/") && contentType.endsWith("+json"))) {
const contents = await fs.readFile(res.data.path);
try {
JSON.parse(contents.toString("utf8"));
return [];
}
catch (e) {
return [{
type: "JSON_FILE_UNPARSEABLE",
location: { url },
}];
}
}
else if (contentType === "text/css") {
const cssErrors = await (async () => {
assert(vnuResults);
return (vnuResults).map((object) => {
return {
type: "VNU",
object,
location: {
url,
}
};
});
})();
return [...cssErrors];
}
else if (contentType === "image/svg+xml") {
const svgErrors = await (async () => {
assert(vnuResults);
return (vnuResults).map((object) => {
return {
type: "VNU",
object,
location: {
url,
}
};
});
})();
return [...svgErrors];
}
else if (roles.some(({ type }) => type === "robotstxt")) {
const contents = await fs.readFile(res.data.path);
const robots = robotsParser(url, contents.toString("utf8"));
const hostErrors = (() => {
const host = robots.getPreferredHost();
const baseUrlHost = new URL(baseUrl).host;
if (host !== null && host !== baseUrlHost) {
return [{
type: "ROBOTS_TXT_HOST_INVALID",
expectedHost: baseUrlHost,
actualHost: host,
}];
}
else {
return [];
}
})();
const sitemapErrors = (() => {
return robots.getSitemaps().flatMap((sitemap) => {
if (isInternalLink(baseUrl)(sitemap) === false) {
return [{
type: "ROBOTS_TXT_SITEMAP_INVALID",
sitemapUrl: sitemap,
}];
}
else {
return [];
}
});
})();
return [...hostErrors, ...sitemapErrors];
}
else if (roles.some(({ type }) => type === "sitemap")) {
const contents = await fs.readFile(res.data.path);
const extension = path.extname(new URL(url).pathname);
const urls = await getUrlsFromSitemap(contents.toString("utf8"), extension === ".txt" ? "txt" : "xml");
const sitemapUrl = url;
return urls.flatMap(({ url }) => {
if (isInternalLink(baseUrl)(url) === false) {
return [{
type: "SITEMAP_LINK_INVALID",
sitemapUrl,
url,
}];
}
else {
return [];
}
});
}
else if (contentType === "application/xml" || (contentType !== undefined && contentType.endsWith("+xml"))) {
const contents = await fs.readFile(res.data.path);
try {
await xml2js.parseStringPromise(contents);
return [];
}
catch (e) {
return [{
type: "XML_FILE_UNPARSEABLE",
location: { url },
}];
}
}
else {
return [];
}
})(),
(async () => {
const contents = await fs.readFile(res.data.path);
return (await Promise.all(additionalValidators.map(async (additionalValidator) => {
assert(["json", "json-ld"].includes(additionalValidator.type));
if (additionalValidator.type === "json") {
const validate = ajv.compile(additionalValidator.schema);
const validationResult = await validate(JSON.parse(contents.toString("utf8")));
if (!validationResult) {
return validate.errors.map((obj) => ({
type: "JSON_DOES_NOT_MATCH_SCHEMA",
result: obj,
schema: additionalValidator.schema,
url,
}));
}
else {
return [];
}
}
else if (additionalValidator.type === "json-ld") {
const allJSONLDs = (await getInterestingPageElements(res.data)).tagCollections.script.filter(({ attrs }) => attrs["type"] === "application/ld+json");
const allParsedJsonLd = allJSONLDs.flatMap((jsonLd) => {
try {
return [JSON.parse(jsonLd.innerHTML)];
}
catch (e) {
return [];
}
});
const validate = ajv.compile(additionalValidator.filter);
const matchedJsonLds = allParsedJsonLd.filter((jsonLd) => {
return validate(jsonLd) === true;
});
return (await Promise.all([
(async () => {
const schema = additionalValidator.schema;
if (schema !== undefined) {
return (await Promise.all(matchedJsonLds.map(async (matchedJsonLd) => {
const validate = ajv.compile(schema);
const validationResult = await validate(matchedJsonLd);
if (!validationResult) {
return validate.errors.map((obj) => ({
type: "JSON_LD_DOES_NOT_MATCH_SCHEMA",
filter: additionalValidator.filter,
result: obj,
schema,
url,
}));
}
else {
return [];
}
}))).flat(1);
}
else {
return [];
}
})(),
(async () => {
if (matchedJsonLds.length >= (additionalValidator.minOccurrence ?? 0) && matchedJsonLds.length <= (additionalValidator.maxOccurrence ?? Number.MAX_SAFE_INTEGER)) {
return [];
}
else {
return [{
type: "JSON_LD_DOES_NOT_MATCH_OCCURRENCE_REQUIREMENT",
filter: additionalValidator.filter,
minOccurrence: additionalValidator.minOccurrence,
maxOccurrence: additionalValidator.maxOccurrence,
actualOccurrence: matchedJsonLds.length,
url,
}];
}
})(),
])).flat(1);
}
else {
return [];
}
}))).flat(1);
})(),
])).flat(1);
// TODO: validate rss item can have 1 link and 1 guid
// TODO: if rss.item.guid isPermalink=true or missing then validate target URL
// TODO: validate atom item can have 1 id
};
//# sourceMappingURL=validate-file.js.map