qgenutils
Version:
A security-first Node.js utility library providing authentication, HTTP operations, URL processing, validation, datetime formatting, and template rendering. Designed as a lightweight alternative to heavy npm packages with comprehensive error handling and
338 lines (313 loc) • 15.5 kB
JavaScript
/*
* URL Utility Module
*
* This module provides URL manipulation utilities for web applications that need
* to normalize, parse, and manipulate URLs safely and consistently. Common use
* cases include API proxying, link validation, and URL standardization.
*
* DESIGN PHILOSOPHY:
* - Security first: Default to HTTPS for all protocol-less URLs
* - Graceful failure: Return null for invalid inputs rather than throwing
* - Consistent output: Normalize URLs to standard formats for comparison
* - Edge case handling: Deal with malformed URLs, missing protocols, etc.
*
* COMMON SCENARIOS:
* - User enters "example.com" and needs "https://example.com"
* - Comparing URLs that might have different cases or trailing slashes
* - Extracting base URLs from full URLs for API routing
* - Sanitizing user-provided URLs before storing or processing
*/
const { qerrors } = require('qerrors'); // error reporting utility
const logger = require('./logger'); // structured logger
/*
* These URL utilities are designed to be chained together. For example a proxy
* server might first call ensureProtocol to add missing schemes, then
* normalizeUrlOrigin to compare against an allow list, and finally
* parseUrlParts to route the request. Link shorteners or configuration tools may
* also chain stripProtocol after parsing for a clean domain display.
*/
/**
 * Report whether a string already begins with an http:// or https:// scheme.
 *
 * Shared helper for the URL utilities in this module: ensureProtocol calls it
 * to decide whether "https://" still has to be prepended, which avoids
 * double-prefixing a URL that is already complete.
 *
 * NOTE: only http and https are recognised. Any other scheme (ftp://, mailto:,
 * protocol-relative "//host") is reported as "no protocol". The match is
 * anchored at the very first character, so leading whitespace also yields false.
 * Non-string arguments are coerced to a string by RegExp.prototype.test.
 *
 * @param {string} url - URL to check for protocol
 * @returns {boolean} True if URL starts with http:// or https:// (any letter case), false otherwise
 */
function hasProtocol(url) {
  // ^ anchors at the start, "s" is optional, the i flag ignores letter case
  const httpSchemePrefix = /^https?:\/\//i;
  return httpSchemePrefix.test(url);
}
/**
 * Render an arbitrary value as text for trace logging without ever throwing.
 *
 * Template-literal interpolation throws a TypeError for Symbols and for
 * objects that cannot be converted to a primitive (e.g. Object.create(null)).
 * String() accepts Symbols; anything that still fails is described by type.
 *
 * @param {*} value - Value about to be written to a log line
 * @returns {string} Printable representation of the value
 */
function describeUrlForLog(value) {
  try {
    return String(value); // same text as `${value}` for every value that interpolation accepts
  } catch (stringifyError) {
    return `[unprintable ${typeof value}]`; // typeof never throws
  }
}

/**
 * Ensure a URL has a protocol (defaults to HTTPS)
 *
 * RATIONALE: Users often enter URLs without protocols (example.com instead of
 * https://example.com), but browsers and HTTP clients require explicit protocols.
 * This function standardizes URL input while defaulting to HTTPS so that
 * downstream processing and proxying use encrypted connections.
 * CHAIN POSITION: typically the first step so that later functions like
 * normalizeUrlOrigin and parseUrlParts receive a complete URL.
 *
 * BEHAVIOR:
 * - Non-string or empty input: reported through qerrors, returns null
 * - Whitespace-only input: returns null
 * - Input is trimmed before inspection; the trimmed value is what gets returned
 * - An existing http:// or https:// prefix (any letter case) is preserved as-is
 * - Everything else is prefixed with "https://". Only http(s) counts as an
 *   existing protocol (see hasProtocol), so other schemes are prefixed too
 *
 * EDGE CASES HANDLED:
 * - Empty string input
 * - Non-string input (numbers, objects, Symbols, null-prototype objects, etc.);
 *   the trace log uses a safe stringification so such values return null
 *   instead of raising a TypeError before validation runs
 * - URLs with existing protocols (HTTP, HTTPS, mixed case)
 *
 * ERROR HANDLING: if an unexpected error occurs while processing, it is
 * reported through qerrors and the original string is returned unchanged so
 * the caller is not broken; a non-string original is never returned (null
 * is returned instead), keeping the return type string|null.
 *
 * @param {string} url - The URL to check and potentially modify
 * @returns {string|null} The URL with protocol added or null if input is invalid
 */
function ensureProtocol(url) {
  const printableUrl = describeUrlForLog(url); // safe even for Symbols / null-prototype objects
  console.log(`ensureProtocol is running with ${printableUrl}`); logger.debug(`ensureProtocol is running with ${printableUrl}`); // trace incoming URL for debugging
  try {
    // Validate that input is a usable string before processing
    if (typeof url !== 'string' || !url) {
      qerrors(new Error('invalid url input'), 'ensureProtocol', url); // record misuse or programming error
      console.log(`ensureProtocol is returning null`); logger.debug(`ensureProtocol is returning null`); // early exit for unusable input
      return null; // propagate null to caller for invalid input
    }

    const trimmedUrl = url.trim(); // discard surrounding whitespace before inspecting the scheme
    if (trimmedUrl.length === 0) {
      console.log(`ensureProtocol is returning null`); logger.debug(`ensureProtocol is returning null`); // treat blank strings as invalid
      return null; // indicate missing URL value
    }

    // Keep an existing http(s) scheme; otherwise default to https for security
    const finalUrl = hasProtocol(trimmedUrl) ? trimmedUrl : `https://${trimmedUrl}`;
    console.log(`ensureProtocol is returning ${finalUrl}`); logger.debug(`ensureProtocol is returning ${finalUrl}`); // trace normalized URL
    return finalUrl;
  } catch (error) {
    // Unexpected failure (e.g. a logging/reporting call threw): report it and
    // fall back to the caller's string, but never hand back a non-string value
    qerrors(error, 'ensureProtocol', url);
    return typeof url === 'string' ? url : null;
  }
}
/**
 * Normalize a URL to its origin in lowercase
 *
 * RATIONALE: URL comparison often needs to focus on the origin (protocol + domain + port)
 * while ignoring path, query parameters, and case differences. This function creates
 * a canonical form suitable for comparison, allow-list checks and cache keys.
 *
 * IMPLEMENTATION STRATEGY:
 * - Add a missing protocol through ensureProtocol first
 * - Parse with the native URL constructor
 * - Rebuild the result from protocol + lowercased hostname, then append the
 *   port only when the parsed URL reports one
 * CHAIN POSITION: generally used after ensureProtocol. Provides a canonical
 * origin for subsequent comparison or proxy routing steps.
 *
 * PORT HANDLING: the URL parser reports an empty port when the port is absent
 * OR equals the scheme's default (443 for https, 80 for http). Consequently
 * "https://example.com:443" and "https://example.com" both normalize to
 * "https://example.com", while a non-default port such as :8080 is kept.
 *
 * USE CASES:
 * - Comparing if two URLs point to the same server
 * - Creating cache keys based on API endpoints
 * - Validating allowed origins for CORS
 *
 * ERROR HANDLING: fails closed. Any failure (unusable input, unparsable URL,
 * or an argument that cannot even be converted to text for the trace log,
 * such as a Symbol) is reported through qerrors where applicable and yields null.
 *
 * @param {string} url - The URL to normalize to its origin
 * @returns {string|null} The normalized origin in lowercase or null if invalid
 */
function normalizeUrlOrigin(url) {
  try {
    // The trace log lives inside the try: interpolating a Symbol or a
    // null-prototype object throws, and that must result in null, not a crash
    console.log(`normalizeUrlOrigin is running with ${url}`); logger.debug(`normalizeUrlOrigin is running with ${url}`); // track incoming URL before normalization

    const processedUrl = ensureProtocol(url); // add protocol if missing so the URL constructor can parse it
    if (processedUrl === null) {
      console.log(`normalizeUrlOrigin is returning null`); logger.debug(`normalizeUrlOrigin is returning null`); // protocol check failed
      return null; // cannot continue without a usable URL
    }

    const urlObj = new URL(processedUrl); // throws on malformed input; handled by the catch below

    // hostname (not host) excludes the port, which is appended separately below
    let normalizedOrigin = `${urlObj.protocol}//${urlObj.hostname.toLowerCase()}`;
    if (urlObj.port) {
      normalizedOrigin += `:${urlObj.port}`; // port is '' when absent or equal to the scheme default
    }

    console.log(`normalizeUrlOrigin is returning ${normalizedOrigin}`); logger.debug(`normalizeUrlOrigin is returning ${normalizedOrigin}`); // output normalized origin for verification
    return normalizedOrigin;
  } catch (error) {
    // Malformed URLs that can't be parsed, or arguments that can't be logged
    qerrors(error, 'normalizeUrlOrigin', url); // log error for investigation
    return null; // indicate failure so caller can react
  }
}
/**
 * Strip protocol and trailing slash from URL
 *
 * RATIONALE: Sometimes you need the "bare" version of a URL for display purposes,
 * configuration files, or systems that add their own protocols. This function
 * creates a clean, minimal representation of the URL.
 *
 * IMPLEMENTATION APPROACH:
 * - Remove a leading http:// or https:// (case-insensitive); other schemes are
 *   left untouched
 * - Remove one trailing slash, if present
 * - Preserve everything else (path, query, fragment); the input is not trimmed
 * CHAIN POSITION: often used after normalizeUrlOrigin when displaying or storing
 * domains without schemes.
 *
 * REGEX EXPLANATIONS:
 * - /^https?:\/\//i : matches http:// or https:// at the start of the string
 * - /\/$/ : matches a single slash at the very end of the string
 *
 * WHY NOT USE URL CONSTRUCTOR:
 * The URL constructor requires a valid protocol, but we're trying to remove it.
 * Regex replacement is more appropriate for this text manipulation task.
 *
 * ERROR HANDLING: any failure is reported through qerrors and the original
 * argument is returned unchanged. This covers non-string input (which has no
 * string replace method) and arguments that cannot be converted to text for
 * the trace log (e.g. a Symbol), which previously escaped as a TypeError.
 *
 * @param {string} url - The URL to strip protocol and trailing slash from
 * @returns {string} The URL without protocol prefix or trailing slash; if
 *   processing fails the argument is returned as-is (which may be a non-string)
 */
function stripProtocol(url) {
  try {
    // Logging sits inside the try so that an argument which cannot be
    // interpolated follows the same fallback path as any other failure
    console.log(`stripProtocol is running with ${url}`); logger.debug(`stripProtocol is running with ${url}`); // trace original URL before modifications

    const withoutScheme = url.replace(/^https?:\/\//i, ''); // drop leading http:// or https:// only
    const processed = withoutScheme.replace(/\/$/, ''); // drop a single trailing slash

    console.log(`stripProtocol is returning ${processed}`); logger.debug(`stripProtocol is returning ${processed}`); // show stripped result
    return processed;
  } catch (error) {
    qerrors(error, 'stripProtocol', url); // log unexpected issue
    return url; // fallback to input on failure
  }
}
/**
 * Parse URL into base URL and endpoint parts
 *
 * RATIONALE: API clients often need to separate the base URL (for server/routing)
 * from the endpoint path (for specific API calls). This separation enables
 * flexible API routing and proxy configurations, and returning structured
 * segments avoids ad-hoc string slicing in callers.
 *
 * IMPLEMENTATION STRATEGY:
 * - Add a missing protocol through ensureProtocol (ensures valid parsing)
 * - Parse with the native URL constructor
 * - Split into origin (base) and pathname + search (endpoint)
 * CHAIN POSITION: often the final step after ensureProtocol and
 * normalizeUrlOrigin when routing proxy requests or API calls.
 *
 * RETURN STRUCTURE:
 * {
 *   baseUrl: "https://api.example.com", // URL.origin
 *   endpoint: "/v1/users?limit=10"      // URL.pathname + URL.search
 * }
 * The fragment (#...) is not part of either field. A URL without a path
 * yields endpoint "/" because that is what URL.pathname reports.
 *
 * ERROR HANDLING:
 * Fails closed: returns null when the input is unusable, cannot be parsed, or
 * cannot even be converted to text for the trace log (e.g. a Symbol, which
 * previously escaped as a TypeError). Parse failures are reported through qerrors.
 *
 * @param {string} url - The URL to parse into components
 * @returns {object|null} Object with baseUrl and endpoint properties, or null if parsing fails
 */
function parseUrlParts(url) {
  try {
    // Keep the trace log inside the try so un-interpolatable arguments
    // produce null like every other invalid input
    console.log(`parseUrlParts is running with ${url}`); logger.debug(`parseUrlParts is running with ${url}`); // trace incoming full URL

    const processedUrl = ensureProtocol(url); // add protocol if absent to satisfy URL constructor
    if (processedUrl === null) {
      console.log(`parseUrlParts is returning null`); logger.debug(`parseUrlParts is returning null`); // normalization failed
      return null; // input was invalid so fail closed for safety
    }

    const { origin, pathname, search } = new URL(processedUrl); // throws on malformed input
    const result = {
      baseUrl: origin, // protocol + domain + port only
      endpoint: pathname + search // path plus query string
    };

    const serialized = JSON.stringify(result); // serialize once for both log sinks
    console.log(`parseUrlParts is returning ${serialized}`); logger.debug(`parseUrlParts is returning ${serialized}`); // output parsed pieces
    return result;
  } catch (error) {
    // URLs that can't be parsed by the URL constructor, or arguments that can't be logged
    qerrors(error, 'parseUrlParts', url); // log failure for debugging
    return null; // fail closed on error to avoid unsafe routing
  }
}
/*
* Module Export Strategy:
*
* We export the four public URL utility functions (the hasProtocol helper is
* not exported) because they serve complementary purposes in URL processing
* workflows:
*
* 1. ensureProtocol - Input normalization (first step)
* 2. normalizeUrlOrigin - Standardization for comparison
* 3. stripProtocol - Display/configuration formatting
* 4. parseUrlParts - Structural analysis for routing
*
* These functions can be used independently or chained together for complex
* URL processing pipelines.
*
* FUTURE ENHANCEMENTS:
* - Add subdomain extraction utilities
* - Add URL validation beyond protocol checking
* - Add query parameter manipulation functions
* - Add relative URL resolution utilities
*/
// Exported surface of this module; hasProtocol is not part of it.
module.exports = {
ensureProtocol, // prepends https:// when no http(s) scheme is present; null for unusable input
normalizeUrlOrigin, // protocol + lowercased hostname (+ port when the parsed URL reports one); null on failure
stripProtocol, // removes a leading http(s):// and a single trailing slash
parseUrlParts // splits a URL into { baseUrl, endpoint }; null on failure
};