// sitemap — Sitemap-generating lib/cli
// Version: (not captured)
// File listing: 385 lines (384 loc) • 16.3 kB, JavaScript
/*!
* Sitemap
* Copyright(c) 2011 Eugene Kalinin
* MIT Licensed
*/
import { InvalidPathError, InvalidHostnameError, InvalidLimitError, InvalidPublicBasePathError, InvalidXSLUrlError, ChangeFreqInvalidError, InvalidAttrValue, InvalidNewsAccessValue, InvalidNewsFormat, InvalidVideoDescription, InvalidVideoDuration, InvalidVideoFormat, InvalidVideoRating, NoURLError, NoConfigError, PriorityInvalidError, InvalidVideoTitle, InvalidVideoViewCount, InvalidVideoTagCount, InvalidVideoCategory, InvalidVideoFamilyFriendly, InvalidVideoRestriction, InvalidVideoRestrictionRelationship, InvalidVideoPriceType, InvalidVideoResolution, InvalidVideoPriceCurrency, } from './errors.js';
import { ErrorLevel, EnumChangefreq, } from './types.js';
import { LIMITS } from './constants.js';
import { isAbsolute } from 'node:path';
/**
* Validator regular expressions for various sitemap fields
*
* Keys are attribute names. The internal `validate` helper looks up each key
* of a news/video object in this table and tests the value against the
* pattern found here. Every pattern is anchored with `^` and `$`, so the
* whole value has to match, not just a substring.
*/
// Shared pattern for attributes whose only accepted values are the lowercase
// words "allow" and "deny".
const allowDeny = /^(?:allow|deny)$/;
export const validators = {
// Exactly three uppercase ASCII letters (presumably an ISO 4217 currency
// code — confirm against the spec; the pattern only checks the shape).
'price:currency': /^[A-Z]{3}$/,
// "rent" or "purchase", written entirely in lowercase or entirely in
// uppercase; mixed case such as "Rent" is rejected.
'price:type': /^(?:rent|purchase|RENT|PURCHASE)$/,
// "HD" or "SD", entirely uppercase or entirely lowercase.
'price:resolution': /^(?:HD|hd|sd|SD)$/,
'platform:relationship': allowDeny,
'restriction:relationship': allowDeny,
// Empty string, or one or more two-uppercase-letter codes separated by one
// or more spaces.
restriction: /^([A-Z]{2}( +[A-Z]{2})*)?$/,
// Empty string, or one or more of web/mobile/tv separated by exactly one
// space.
platform: /^((web|mobile|tv)( (web|mobile|tv))*)?$/,
// Language codes: zh-cn, zh-tw, or ISO 639 2-3 letter codes
language: /^(zh-cn|zh-tw|[a-z]{2,3})$/,
// One or more of the listed genre names, separated by a comma that may be
// followed by spaces. An empty string does not match.
genres: /^(PressRelease|Satire|Blog|OpEd|Opinion|UserGenerated)(, *(PressRelease|Satire|Blog|OpEd|Opinion|UserGenerated))*$/,
// Empty string, or one to five `word:word` pairs separated by a comma that
// may be followed by spaces.
stock_tickers: /^(\w+:\w+(, *\w+:\w+){0,4})?$/,
};
/**
 * Reports whether a value is an accepted video price type.
 *
 * The accepted spellings are whatever `validators['price:type']` matches.
 *
 * @param pt - Candidate price type
 * @returns `true` when `pt` matches the price-type pattern, `false` otherwise
 */
export function isPriceType(pt) {
  const { 'price:type': priceTypePattern } = validators;
  return priceTypePattern.test(pt);
}
/**
 * Reports whether a value is an accepted video price resolution.
 *
 * The accepted spellings are whatever `validators['price:resolution']` matches.
 *
 * @param res - Candidate resolution
 * @returns `true` when `res` matches the resolution pattern, `false` otherwise
 */
export function isResolution(res) {
  const resolutionPattern = validators['price:resolution'];
  return resolutionPattern.test(res);
}
/**
 * Every value of `EnumChangefreq`, captured once when the module loads.
 */
const CHANGEFREQ = Object.values(EnumChangefreq);
// Set view of the same values, used for the membership test below.
const CHANGEFREQ_LOOKUP = new Set(CHANGEFREQ);
/**
 * Type guard to check if a value is a valid changefreq value.
 *
 * @param freq - Candidate changefreq
 * @returns `true` when `freq` is one of the `EnumChangefreq` values
 */
export function isValidChangeFreq(freq) {
  return CHANGEFREQ_LOOKUP.has(freq);
}
/**
 * Type guard to check if a string is a valid yes/no value.
 *
 * Accepts exactly `YES`, `NO`, `Yes`, `yes`, `No` or `no`.
 *
 * The alternatives are wrapped in a non-capturing group so that `^` and `$`
 * apply to every one of them. Without the group, `^` binds only to the first
 * alternative and `$` only to the last, which lets values such as `YESSIR`
 * or `xNOx` through.
 *
 * @param yn - Candidate yes/no value
 * @returns `true` when `yn` is one of the accepted spellings
 */
export function isValidYesNo(yn) {
  return /^(?:YES|NO|[Yy]es|[Nn]o)$/.test(yn);
}
/**
 * Type guard to check if a value is a valid allow/deny value.
 *
 * @param ad - Candidate relationship value
 * @returns `true` when `ad` matches the shared `allowDeny` pattern
 */
export function isAllowDeny(ad) {
  // `allowDeny` carries no g/y flag, so exec() is stateless here.
  return allowDeny.exec(ad) !== null;
}
/**
 * Validates that a URL is well-formed and meets security requirements.
 *
 * Checks, in order: the value is a non-empty string, it is no longer than
 * `LIMITS.MAX_URL_LENGTH`, it matches `LIMITS.URL_PROTOCOL_REGEX` (reported to
 * the caller as an http:// / https:// requirement), and the WHATWG `URL`
 * constructor can parse it. The first failed check throws.
 *
 * @param url - The URL to validate
 * @param paramName - The parameter name for error messages
 * @throws {InvalidHostnameError} If the URL is invalid
 */
export function validateURL(url, paramName) {
  // The empty string is the only falsy string, so this is the same test as
  // "falsy or not a string".
  if (typeof url !== 'string' || url === '') {
    throw new InvalidHostnameError(url, `${paramName} must be a non-empty string`);
  }
  const tooLong = url.length > LIMITS.MAX_URL_LENGTH;
  if (tooLong) {
    throw new InvalidHostnameError(url, `${paramName} exceeds maximum length of ${LIMITS.MAX_URL_LENGTH} characters`);
  }
  const hasAllowedProtocol = LIMITS.URL_PROTOCOL_REGEX.test(url);
  if (!hasAllowedProtocol) {
    throw new InvalidHostnameError(url, `${paramName} must use http:// or https:// protocol`);
  }
  // Parsing is done purely for its side effect of throwing on malformed input.
  try {
    new URL(url);
  }
  catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new InvalidHostnameError(url, `${paramName} is not a valid URL: ${reason}`);
  }
}
/**
 * Validates that a path is a relative path free of traversal sequences.
 *
 * Checks, in order: the value is a non-empty string, it is not absolute
 * according to `node:path`'s `isAbsolute`, it contains no `..` (anywhere in
 * the string, so names like `a..b` are rejected too), and it contains no
 * null byte. The first failed check throws.
 *
 * @param path - The path to validate
 * @param paramName - The parameter name for error messages
 * @throws {InvalidPathError} If the path is empty, absolute, contains `..`,
 *   or contains a null byte
 */
export function validatePath(path, paramName) {
  if (typeof path !== 'string' || path === '') {
    throw new InvalidPathError(path, `${paramName} must be a non-empty string`);
  }
  // Reject absolute paths to prevent arbitrary write location when caller
  // input reaches destinationDir (BB-04)
  if (isAbsolute(path)) {
    throw new InvalidPathError(path, `${paramName} must be a relative path (absolute paths are not allowed)`);
  }
  // Two views of the same question. The raw substring test already covers any
  // path whose component is exactly "..", whichever separator is used; the
  // component test (splitting on both "\" and "/") is kept alongside it.
  const segments = path.split(/[\\/]/).filter(Boolean);
  const hasTraversal = path.includes('..') || segments.includes('..');
  if (hasTraversal) {
    throw new InvalidPathError(path, `${paramName} contains path traversal sequence (..)`);
  }
  // Check for null bytes (security issue in some contexts)
  const hasNullByte = path.includes('\0');
  if (hasNullByte) {
    throw new InvalidPathError(path, `${paramName} contains null byte character`);
  }
}
/**
 * Validates that a public base path is safe for URL construction.
 *
 * Checks, in order: the value is a non-empty string, it contains no `..`
 * (anywhere in the string), it contains no null byte, and it contains no
 * carriage return, line feed or tab. The first failed check throws.
 *
 * @param publicBasePath - The public base path to validate
 * @throws {InvalidPublicBasePathError} If the path is invalid
 */
export function validatePublicBasePath(publicBasePath) {
  if (typeof publicBasePath !== 'string' || publicBasePath === '') {
    throw new InvalidPublicBasePathError(publicBasePath, 'must be a non-empty string');
  }
  // The raw substring test already covers any component that is exactly "..";
  // the component test (splitting on both "\" and "/") is kept alongside it.
  const segments = publicBasePath.split(/[\\/]/).filter(Boolean);
  const hasTraversal = publicBasePath.includes('..') || segments.includes('..');
  if (hasTraversal) {
    throw new InvalidPublicBasePathError(publicBasePath, 'contains path traversal sequence (..)');
  }
  // Check for null bytes
  const hasNullByte = publicBasePath.includes('\0');
  if (hasNullByte) {
    throw new InvalidPublicBasePathError(publicBasePath, 'contains null byte character');
  }
  // CR, LF and tab could break the URL that is later built from this path
  const hasControlWhitespace = /[\r\n\t]/.test(publicBasePath);
  if (hasControlWhitespace) {
    throw new InvalidPublicBasePathError(publicBasePath, 'contains invalid whitespace characters');
  }
}
/**
 * Validates that a per-sitemap item limit is usable.
 *
 * A limit is accepted only when it is an integer-valued number lying between
 * `LIMITS.MIN_SITEMAP_ITEM_LIMIT` and `LIMITS.MAX_SITEMAP_ITEM_LIMIT`,
 * inclusive. Every rejection raises the same error type with the offending
 * value.
 *
 * @param limit - The limit to validate
 * @throws {InvalidLimitError} If the limit is not a number, not finite, not
 *   an integer, or out of range
 */
export function validateLimit(limit) {
  // Number.isInteger is false for non-numbers, NaN and ±Infinity, so it covers
  // the type and finiteness requirements as well. It is evaluated first so the
  // range comparisons only ever see integer-valued numbers.
  const rejected = !Number.isInteger(limit) ||
    limit < LIMITS.MIN_SITEMAP_ITEM_LIMIT ||
    limit > LIMITS.MAX_SITEMAP_ITEM_LIMIT;
  if (rejected) {
    throw new InvalidLimitError(limit);
  }
}
/**
 * Validates that an XSL stylesheet URL is safe and well-formed.
 *
 * Checks, in order (the first failure throws):
 *  1. non-empty string
 *  2. length no greater than `LIMITS.MAX_URL_LENGTH`
 *  3. matches `LIMITS.URL_PROTOCOL_REGEX`
 *  4. parseable by the WHATWG `URL` constructor
 *  5. no `<script` anywhere (case-insensitive)
 *  6. no `javascript:`, `data:`, `vbscript:`, `file:` or `about:` anywhere
 *     in the URL (case-insensitive substring match, not just the scheme)
 *  7. no percent-encoded form of `<script`, `javascript:` or `data:`
 *  8. no raw `"`, `<` or `>`
 *
 * @param xslUrl - The XSL URL to validate
 * @throws {InvalidXSLUrlError} If the URL is invalid
 */
export function validateXSLUrl(xslUrl) {
  if (typeof xslUrl !== 'string' || xslUrl === '') {
    throw new InvalidXSLUrlError(xslUrl, 'must be a non-empty string');
  }
  const tooLong = xslUrl.length > LIMITS.MAX_URL_LENGTH;
  if (tooLong) {
    throw new InvalidXSLUrlError(xslUrl, `exceeds maximum length of ${LIMITS.MAX_URL_LENGTH} characters`);
  }
  const hasAllowedProtocol = LIMITS.URL_PROTOCOL_REGEX.test(xslUrl);
  if (!hasAllowedProtocol) {
    throw new InvalidXSLUrlError(xslUrl, 'must use http:// or https:// protocol');
  }
  // Parsing is done purely for its side effect of throwing on malformed input.
  try {
    new URL(xslUrl);
  }
  catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new InvalidXSLUrlError(xslUrl, `is not a valid URL: ${reason}`);
  }
  // All content checks below are case-insensitive.
  const haystack = xslUrl.toLowerCase();
  if (haystack.includes('<script')) {
    throw new InvalidXSLUrlError(xslUrl, 'contains potentially malicious content (<script tag)');
  }
  // The scheme was already checked above; this looks for these tokens embedded
  // anywhere else in the URL. List order decides which one is reported.
  const matchedProtocol = [
    'javascript:',
    'data:',
    'vbscript:',
    'file:',
    'about:',
  ].find((protocol) => haystack.includes(protocol));
  if (matchedProtocol !== undefined) {
    throw new InvalidXSLUrlError(xslUrl, `contains dangerous protocol: ${matchedProtocol}`);
  }
  // URL-encoded variants of the patterns above (%3C = '<', %3A = ':')
  const encodedPatterns = [
    '%3cscript', // <script
    '%3c%73%63%72%69%70%74', // <script (fully encoded)
    'javascript%3a', // javascript:
    'data%3a', // data:
  ];
  const hasEncodedPattern = encodedPatterns.some((pattern) => haystack.includes(pattern));
  if (hasEncodedPattern) {
    throw new InvalidXSLUrlError(xslUrl, 'contains URL-encoded malicious content');
  }
  // Raw XML special characters must be percent-encoded in a valid URL and
  // could break out of an XML attribute if left as-is.
  if (/["<>]/.test(xslUrl)) {
    throw new InvalidXSLUrlError(xslUrl, 'contains unencoded XML special characters (" < >); percent-encode them in the URL');
  }
}
/**
 * Internal helper to validate fields against their validators.
 *
 * Each own enumerable key of `subject` that has an entry in `validators` is
 * tested against that pattern; keys without an entry are ignored. A failing
 * value throws when `level` is `ErrorLevel.THROW` and is otherwise reported
 * through `console.warn`.
 *
 * A `null`/`undefined` subject is treated as "nothing to validate" rather
 * than crashing in `Object.entries`: callers reach this helper with optional
 * sub-objects (e.g. a news entry without `publication`) after the missing
 * piece has already been reported.
 *
 * @param subject - Object whose fields are checked (may be null/undefined)
 * @param name - Label for the object, used in the warning text
 * @param url - URL of the sitemap item, used in the warning text
 * @param level - Error level deciding between throwing and warning
 * @throws {InvalidAttrValue} If a field is invalid and `level` is THROW
 */
function validate(subject, name, url, level) {
  if (subject === null || subject === undefined) {
    return;
  }
  for (const [key, val] of Object.entries(subject)) {
    // Own-property lookup only: a field named e.g. "constructor" must not be
    // resolved to something inherited from Object.prototype, which is not a
    // RegExp and has no usable .test().
    if (!Object.prototype.hasOwnProperty.call(validators, key)) {
      continue;
    }
    const pattern = validators[key];
    if (pattern.test(val)) {
      continue;
    }
    if (level === ErrorLevel.THROW) {
      throw new InvalidAttrValue(key, val, pattern);
    }
    console.warn(`${url}: ${name} key ${key} has invalid value: ${val}`);
  }
}
/**
 * Default error handler: dispatches on the error level.
 *
 * `ErrorLevel.THROW` rethrows the error, `ErrorLevel.WARN` prints its name
 * and message through `console.warn`, and any other level does nothing.
 *
 * @param error - The error to handle
 * @param level - The error level to act on
 * @throws The given `error`, when `level` is `ErrorLevel.THROW`
 */
function handleError(error, level) {
  switch (level) {
    case ErrorLevel.THROW:
      throw error;
    case ErrorLevel.WARN:
      console.warn(error.name, error.message);
      break;
    default:
      break;
  }
}
/**
 * Reports problems with the `news` entry of a sitemap item.
 *
 * @param news - The item's news object
 * @param url - URL of the sitemap item, passed to the error constructors
 * @param level - Error level forwarded to `errorHandler` and `validate`
 * @param errorHandler - Callback receiving `(error, level)` for each problem
 */
function validateNewsEntry(news, url, level, errorHandler) {
  const { access, publication } = news;
  if (access && access !== 'Registration' && access !== 'Subscription') {
    errorHandler(new InvalidNewsAccessValue(url, access), level);
  }
  if (!publication ||
    !publication.name ||
    !publication.language ||
    !news.publication_date ||
    !news.title) {
    errorHandler(new InvalidNewsFormat(url), level);
  }
  validate(news, 'news', url, level);
  // A missing publication has already been reported above; there are no
  // fields left to check, and passing undefined on would only crash.
  if (publication) {
    validate(publication, 'publication', url, level);
  }
}
/**
 * Reports problems with one entry of a sitemap item's `video` list.
 *
 * Checks run in a fixed order, so with a throwing handler the first problem
 * in that order is the one raised. With a non-throwing handler every problem
 * is reported, and checks on fields that are absent are skipped instead of
 * raising a TypeError.
 *
 * @param vid - The video entry
 * @param url - URL of the sitemap item, passed to the error constructors
 * @param level - Error level forwarded to `errorHandler` and `validate`
 * @param errorHandler - Callback receiving `(error, level)` for each problem
 */
function validateVideoEntry(vid, url, level, errorHandler) {
  // Nothing else can be inspected on a non-object entry (typeof null is
  // 'object', hence the explicit null test).
  if (vid === null || typeof vid !== 'object') {
    errorHandler(new InvalidVideoFormat(url), level);
    return;
  }
  // 28800 is the upper bound for duration (8 hours if the unit is seconds, as
  // in Google's video sitemap documentation).
  if (vid.duration !== undefined &&
    (vid.duration < 0 || vid.duration > 28800)) {
    errorHandler(new InvalidVideoDuration(url, vid.duration), level);
  }
  if (vid.rating !== undefined && (vid.rating < 0 || vid.rating > 5)) {
    errorHandler(new InvalidVideoRating(url, vid.title, vid.rating), level);
  }
  if (!vid.thumbnail_loc || !vid.title || !vid.description) {
    // has to include required categories https://support.google.com/webmasters/answer/80471?hl=en&ref_topic=4581190
    errorHandler(new InvalidVideoFormat(url), level);
  }
  // title, description and tag may be absent at this point when the handler
  // above did not throw; `?.length` yields undefined then, and
  // `undefined > n` is false, so the length checks are simply skipped.
  if (vid.title?.length > 100) {
    errorHandler(new InvalidVideoTitle(url, vid.title.length), level);
  }
  if (vid.description?.length > 2048) {
    errorHandler(new InvalidVideoDescription(url, vid.description.length), level);
  }
  if (vid.view_count !== undefined && vid.view_count < 0) {
    errorHandler(new InvalidVideoViewCount(url, vid.view_count), level);
  }
  if (vid.tag?.length > 32) {
    errorHandler(new InvalidVideoTagCount(url, vid.tag.length), level);
  }
  if (vid.category?.length > 256) {
    errorHandler(new InvalidVideoCategory(url, vid.category.length), level);
  }
  if (vid.family_friendly !== undefined &&
    !isValidYesNo(vid.family_friendly)) {
    errorHandler(new InvalidVideoFamilyFriendly(url, vid.family_friendly), level);
  }
  if (vid.restriction) {
    if (!validators.restriction.test(vid.restriction)) {
      errorHandler(new InvalidVideoRestriction(url, vid.restriction), level);
    }
    // A restriction list is meaningless without saying whether it allows or
    // denies the listed entries.
    if (!vid['restriction:relationship'] ||
      !isAllowDeny(vid['restriction:relationship'])) {
      errorHandler(new InvalidVideoRestrictionRelationship(url, vid['restriction:relationship']), level);
    }
  }
  // TODO price element should be unbounded
  // An empty price requires a price type; a given price type must be valid.
  if ((vid.price === '' && vid['price:type'] === undefined) ||
    (vid['price:type'] !== undefined && !isPriceType(vid['price:type']))) {
    errorHandler(new InvalidVideoPriceType(url, vid['price:type'], vid.price), level);
  }
  if (vid['price:resolution'] !== undefined &&
    !isResolution(vid['price:resolution'])) {
    errorHandler(new InvalidVideoResolution(url, vid['price:resolution']), level);
  }
  if (vid['price:currency'] !== undefined &&
    !validators['price:currency'].test(vid['price:currency'])) {
    errorHandler(new InvalidVideoPriceCurrency(url, vid['price:currency']), level);
  }
  validate(vid, 'video', url, level);
}
/**
 * Verifies all data passed in will comply with sitemap spec.
 *
 * Each problem found is handed to `errorHandler` together with `level`; the
 * default handler throws at `ErrorLevel.THROW`, warns at `ErrorLevel.WARN`
 * and otherwise stays silent. Pattern-based field checks go through the
 * internal `validate` helper, which throws or warns on its own according to
 * `level` and does not use `errorHandler`.
 *
 * With a non-throwing handler an incomplete item (e.g. a video without a
 * title, or news without a publication) is reported once and the remaining
 * checks continue; they do not fail with a TypeError on the missing field.
 *
 * @param conf Options to validate
 * @param level logging level; `ErrorLevel.SILENT` skips all checks
 * @param errorHandler error handling func, called as `(error, level)`
 * @returns The same `conf` object that was passed in
 * @throws {NoConfigError} If `conf` is falsy (regardless of `level`)
 */
export function validateSMIOptions(conf, level = ErrorLevel.WARN, errorHandler = handleError) {
  if (!conf) {
    throw new NoConfigError();
  }
  if (level === ErrorLevel.SILENT) {
    return conf;
  }
  const { url, changefreq, priority, news, video } = conf;
  if (!url) {
    errorHandler(new NoURLError(), level);
  }
  if (changefreq && !isValidChangeFreq(changefreq)) {
    errorHandler(new ChangeFreqInvalidError(url, changefreq), level);
  }
  // Presence is tested explicitly rather than by truthiness so that NaN,
  // which is falsy, is range-checked (and rejected) instead of being skipped.
  if (priority !== undefined &&
    priority !== null &&
    !(priority >= 0.0 && priority <= 1.0)) {
    errorHandler(new PriorityInvalidError(url, priority), level);
  }
  if (news) {
    validateNewsEntry(news, url, level, errorHandler);
  }
  if (video) {
    video.forEach((vid) => validateVideoEntry(vid, url, level, errorHandler));
  }
  return conf;
}