@raven-js/fledge
Version:
From nestling to flight-ready - Build & bundle tool for modern JavaScript apps
86 lines (73 loc) • 2.49 kB
JavaScript
/**
* @author Anonyfox <max@anonyfox.com>
* @license MIT
* @see {@link https://ravenjs.dev}
* @see {@link https://github.com/Anonyfox/ravenjs}
* @see {@link https://anonyfox.com}
*/
/**
* @file URL normalization for crawling consistency.
*
* Handles common URL normalization traps: hash fragments, relative URLs,
* missing protocols, case sensitivity, and query parameter stripping.
*/
/**
* Normalize URL for consistent crawling and storage
* @param {string | URL} url - URL string or URL object to normalize
* @param {string | URL | null} [baseUrl] - Base URL for resolving relative URLs
* @returns {URL} Normalized URL object
* @throws {Error} If URL is invalid or has no domain
*/
export function normalizeUrl(url, baseUrl = null) {
// Convert URL object to string for processing
let urlString = url instanceof URL ? url.href : url;
// Convert baseUrl to string if provided
const baseUrlString = baseUrl instanceof URL ? baseUrl.href : baseUrl;
let resolved;
try {
if (baseUrlString) {
// Resolve relative URLs against base
resolved = new URL(urlString, baseUrlString);
} else {
// Add http:// protocol if missing
if (!urlString.includes("://")) {
// Handle cases like "example.com" or "www.example.com"
if (
urlString.includes(".") &&
!urlString.startsWith("/") &&
!urlString.startsWith(".")
) {
urlString = `http://${urlString}`;
}
}
resolved = new URL(urlString);
}
} catch (error) {
const err = /** @type {any} */ (error);
throw new Error(`Invalid URL: ${urlString} - ${err.message}`);
}
// Validate has domain part
if (!resolved.hostname) {
throw new Error(`URL must have domain: ${urlString}`);
}
// Apply normalizations for consistent storage
// 1. Strip hash fragments (client-side only)
resolved.hash = "";
// 2. Normalize domain casing (domains are case-insensitive)
resolved.hostname = resolved.hostname.toLowerCase();
// 3. Strip default ports for cleaner URLs
if (
(resolved.protocol === "http:" && resolved.port === "80") ||
(resolved.protocol === "https:" && resolved.port === "443")
) {
resolved.port = "";
}
// 4. Strip query parameters for static site generation
resolved.search = "";
// 5. Normalize pathname (remove double slashes, but preserve trailing slash semantics)
if (resolved.pathname) {
// Remove double slashes but keep single trailing slash
resolved.pathname = resolved.pathname.replace(/\/+/g, "/");
}
return resolved;
}