UNPKG

@nanggo/social-preview

Version:

Generate beautiful social media preview images from any URL

464 lines (463 loc) 20.7 kB
"use strict"; /** * Metadata Extractor Module * Extracts Open Graph and Twitter Card metadata from URLs */ var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.__test_inflightRequests = void 0; exports.getInflightRequestStats = getInflightRequestStats; exports.clearInflightRequests = clearInflightRequests; exports.extractMetadata = extractMetadata; exports.fetchImage = fetchImage; exports.validateMetadata = validateMetadata; exports.applyFallbacks = applyFallbacks; const open_graph_scraper_1 = __importDefault(require("open-graph-scraper")); const axios_1 = __importDefault(require("axios")); const types_1 = require("../types"); const validators_1 = require("../utils/validators"); const enhanced_secure_agent_1 = require("../utils/enhanced-secure-agent"); const image_security_1 = require("../utils/image-security"); const cache_1 = require("../utils/cache"); const logger_1 = require("../utils/logger"); // In-flight request management to prevent cache stampede // Limit the size to prevent memory exhaustion DoS attacks const MAX_INFLIGHT_REQUESTS = 1000; // Timeout for in-flight requests to prevent stuck promises from blocking the map indefinitely const INFLIGHT_REQUEST_TIMEOUT = process.env.NODE_ENV === 'test' ? 1000 : 30000; // 1s for tests, 30s for production const inflightRequests = new Map(); /** * Get statistics about in-flight requests for monitoring/debugging * @returns Object containing in-flight request statistics */ function getInflightRequestStats() { return { count: inflightRequests.size, keys: Array.from(inflightRequests.keys()), maxLimit: MAX_INFLIGHT_REQUESTS, utilizationPercent: Math.round((inflightRequests.size / MAX_INFLIGHT_REQUESTS) * 100) }; } /** * Clear all in-flight requests (useful for testing or cleanup) * WARNING: This will cause pending requests to potentially duplicate work */ function clearInflightRequests() { inflightRequests.clear(); } /** * Test helper to access internal inflightRequests map * WARNING: Only for testing purposes */ exports.__test_inflightRequests = process.env.NODE_ENV === 'test' ? inflightRequests : undefined; /** * Extract metadata from a given URL * @param url - The URL to extract metadata from * @param securityOptions - Security configuration options * @returns Extracted metadata object */ async function extractMetadata(url, securityOptions) { try { // Create cache key based on URL and security options // Sort object entries to ensure deterministic cache key generation const options = securityOptions || {}; const sortedOptions = Object.fromEntries(Object.entries(options).sort()); const cacheKey = `${url}:${JSON.stringify(sortedOptions)}`; // Check cache first const cachedMetadata = cache_1.metadataCache.get(cacheKey); if (cachedMetadata) { return cachedMetadata; } // Check if there's already an in-flight request for this cache key let metadataPromise = inflightRequests.get(cacheKey); if (!metadataPromise) { // Check if we've reached the maximum number of in-flight requests (DoS protection) if (inflightRequests.size >= MAX_INFLIGHT_REQUESTS) { throw new types_1.PreviewGeneratorError(types_1.ErrorType.FETCH_ERROR, `In-flight requests limit reached (${MAX_INFLIGHT_REQUESTS}). Server is busy, please try again later.`); } // Create AbortController for request cancellation const abortController = new AbortController(); // If no request is in-flight, create one and store it in the map. // This ensures that even if multiple requests arrive concurrently, // only one will create the promise. const originalPromise = extractMetadataInternal(url, cacheKey, securityOptions, abortController.signal); let timedOut = false; // Add timeout protection to prevent stuck promises from blocking the map indefinitely let timeoutId; const timeoutPromise = new Promise((_, reject) => { timeoutId = setTimeout(() => { timedOut = true; // Mark that timeout has occurred // Cancel the ongoing request to prevent resource waste abortController.abort(); reject(new types_1.PreviewGeneratorError(types_1.ErrorType.FETCH_ERROR, `In-flight request timeout after ${INFLIGHT_REQUEST_TIMEOUT}ms for URL: ${url}`)); }, INFLIGHT_REQUEST_TIMEOUT); }); // Prevent unhandled rejection if timeout occurs before original promise settles originalPromise.catch((error) => { // Only log the warning if the timeout has already happened. // Otherwise, the main promise race will handle the rejection. if (timedOut) { logger_1.logger.warn('Original metadata promise rejected after timeout', { operation: 'metadata-extraction', url, error: error instanceof Error ? error : String(error), }); } }); // Race the original promise against the timeout. // We attach .finally() to the race itself to ensure the timeout is cleared // as soon as the race is decided, preventing a memory leak from lingering timers. metadataPromise = Promise.race([originalPromise, timeoutPromise]).finally(() => { clearTimeout(timeoutId); }); inflightRequests.set(cacheKey, metadataPromise); // The creator of the promise is responsible for cleaning it up from the map // once it settles (resolves or rejects). metadataPromise.finally(() => { try { // To avoid race conditions, only delete if the promise in the map is still this one. if (inflightRequests.get(cacheKey) === metadataPromise) { inflightRequests.delete(cacheKey); } } catch (cleanupError) { // Silently handle cleanup errors to prevent unhandled promise rejections logger_1.logger.warn('Error during in-flight request cleanup', { operation: 'metadata-extraction', url, error: cleanupError instanceof Error ? cleanupError : String(cleanupError), }); } }).catch(() => { // Prevent unhandled promise rejection warnings // The actual error will be handled by the caller }); } // Wait for the (either existing or new) request to complete and return its result. return metadataPromise; } catch (error) { if (error instanceof types_1.PreviewGeneratorError) { throw error; } throw new types_1.PreviewGeneratorError(types_1.ErrorType.METADATA_ERROR, `Failed to extract metadata from ${url}: ${error instanceof Error ? error.message : String(error)}`, error); } } /** * Internal metadata extraction function * Separated to handle in-flight request management properly */ async function extractMetadataInternal(url, cacheKey, securityOptions, abortSignal) { // Validate URL with SSRF protection and security options const validatedUrl = await validateUrl(url, securityOptions); // Extract Open Graph data const ogData = await fetchOpenGraphData(validatedUrl, securityOptions, abortSignal); // Parse and normalize metadata const metadata = parseMetadata(ogData, validatedUrl); // Cache the result cache_1.metadataCache.set(cacheKey, metadata); return metadata; } /** * Validate and normalize URL with SSRF protection */ async function validateUrl(url, securityOptions) { try { const urlObj = new URL(url); // Ensure protocol is http or https if (!['http:', 'https:'].includes(urlObj.protocol)) { throw new Error('Invalid protocol. Only HTTP and HTTPS are supported.'); } // Check HTTPS-only requirement if (securityOptions?.httpsOnly && urlObj.protocol !== 'https:') { throw new Error('HTTP URLs are not allowed when HTTPS-only mode is enabled.'); } // Enhanced security validation with TOCTOU protection const securityValidation = await (0, enhanced_secure_agent_1.validateRequestSecurity)(url); if (!securityValidation.allowed) { throw new types_1.PreviewGeneratorError(types_1.ErrorType.VALIDATION_ERROR, `URL blocked by security validation: ${securityValidation.reason}`, { url, blockedIPs: securityValidation.blockedIPs, allowedIPs: securityValidation.allowedIPs }); } return urlObj.toString(); } catch (error) { if (error instanceof types_1.PreviewGeneratorError) { throw error; } throw new types_1.PreviewGeneratorError(types_1.ErrorType.VALIDATION_ERROR, `Invalid URL: ${url}`, error); } } /** * Fetch Open Graph data using open-graph-scraper */ async function fetchOpenGraphData(url, securityOptions, abortSignal) { try { // Create secure axios config with redirect validation and secure agent const axiosConfig = { headers: { 'User-Agent': 'Mozilla/5.0 (compatible; SocialPreviewBot/1.0)', Accept: 'text/html,application/xhtml+xml', }, timeout: securityOptions?.timeout || 8000, // Configurable timeout maxRedirects: securityOptions?.maxRedirects ?? 3, // Configurable redirects maxContentLength: 1 * 1024 * 1024, // Reduced from 2MB to 1MB for HTML content maxBodyLength: 1 * 1024 * 1024, // Ensure body is also limited httpAgent: (0, enhanced_secure_agent_1.getEnhancedSecureAgentForUrl)(url), httpsAgent: (0, enhanced_secure_agent_1.getEnhancedSecureAgentForUrl)(url), signal: abortSignal, // Add abort signal for request cancellation beforeRedirect: (options, _responseDetails) => { // Validate each redirect URL for SSRF protection using typed interface for clarity const redirectOptions = options; const redirectUrl = `${redirectOptions.protocol}//${redirectOptions.hostname}${redirectOptions.port ? `:${redirectOptions.port}` : ''}${redirectOptions.path || ''}${redirectOptions.search || ''}`; try { (0, validators_1.validateUrlInput)(redirectUrl); } catch (error) { throw new types_1.PreviewGeneratorError(types_1.ErrorType.VALIDATION_ERROR, `Redirect to unsafe URL blocked: ${redirectUrl}`, error); } }, }; // First, try to fetch HTML content const response = await axios_1.default.get(url, axiosConfig); // Extract OG data from HTML const { error, result } = await (0, open_graph_scraper_1.default)({ html: response.data, url }); if (error) { throw new Error('Failed to parse Open Graph data'); } return result; } catch { // Fallback: Try direct OG scraping try { const { error: ogError, result } = await (0, open_graph_scraper_1.default)({ url }); if (ogError) { throw new Error('Failed to fetch Open Graph data'); } return result; } catch (fallbackError) { throw new types_1.PreviewGeneratorError(types_1.ErrorType.FETCH_ERROR, `Failed to fetch data from ${url}`, fallbackError); } } } /** * Parse and normalize metadata from Open Graph data */ function parseMetadata(ogData, url) { const urlObj = new URL(url); // Extract title (prioritize OG title, then Twitter, then HTML title) const title = ogData.ogTitle || ogData.twitterTitle || ogData.dcTitle || ogData.title || urlObj.hostname; // Extract description const description = ogData.ogDescription || ogData.twitterDescription || ogData.dcDescription || ogData.description || ''; // Extract image URL (prioritize OG image, then Twitter image) let image; if (ogData.ogImage) { if (Array.isArray(ogData.ogImage)) { const firstImage = ogData.ogImage[0]; if (typeof firstImage === 'object' && firstImage !== null && 'url' in firstImage) { image = firstImage.url; } else if (typeof firstImage === 'string') { image = firstImage; } } else if (typeof ogData.ogImage === 'object' && ogData.ogImage !== null && 'url' in ogData.ogImage) { image = ogData.ogImage.url; } else if (typeof ogData.ogImage === 'string') { image = ogData.ogImage; } } else if (ogData.twitterImage) { if (Array.isArray(ogData.twitterImage)) { const firstImage = ogData.twitterImage[0]; if (typeof firstImage === 'object' && firstImage !== null && 'url' in firstImage) { image = firstImage.url; } else if (typeof firstImage === 'string') { image = firstImage; } } else if (typeof ogData.twitterImage === 'object' && ogData.twitterImage !== null && 'url' in ogData.twitterImage) { image = ogData.twitterImage.url; } else if (typeof ogData.twitterImage === 'string') { image = ogData.twitterImage; } } // Ensure image URL is absolute if (image && !image.startsWith('http')) { try { const imageUrl = new URL(image, url); image = imageUrl.toString(); } catch { image = undefined; } } // Extract site name const siteName = ogData.ogSiteName || ogData.twitterSite || ogData.applicationName || urlObj.hostname.replace('www.', ''); // Extract favicon let favicon; if (ogData.favicon) { favicon = ogData.favicon; if (favicon && !favicon.startsWith('http')) { try { const faviconUrl = new URL(favicon, url); favicon = faviconUrl.toString(); } catch { // Try default favicon path favicon = `${urlObj.protocol}//${urlObj.hostname}/favicon.ico`; } } } else { // Default favicon path favicon = `${urlObj.protocol}//${urlObj.hostname}/favicon.ico`; } // Extract author const author = ogData.author || ogData.dcCreator || ogData.twitterCreator || ogData.articleAuthor; // Extract published date const publishedDate = ogData.ogArticlePublishedTime || ogData.articlePublishedTime || ogData.dcDate || ogData.publishedTime; // Extract locale const locale = ogData.ogLocale || ogData.inLanguage || 'en_US'; return { title: cleanText(title), description: description ? cleanText(description) : undefined, image, siteName: siteName ? cleanText(siteName) : undefined, favicon, author: author ? cleanText(author) : undefined, publishedDate, url, domain: urlObj.hostname, locale, }; } /** * Clean and normalize text */ function cleanText(text) { return text .replace(/[\n\r]+/g, ' ') .replace(/\s+/g, ' ') .trim(); } /** * Fetch image from URL and return as buffer with size and type validation * @param imageUrl - URL of the image to fetch * @param securityOptions - Security configuration options * @param abortSignal - Optional abort signal for request cancellation * @returns Image buffer */ async function fetchImage(imageUrl, securityOptions, abortSignal) { try { // Validate URL with SSRF protection before fetching const validatedUrl = await validateUrl(imageUrl, securityOptions); // Maximum allowed image size (15MB) const MAX_IMAGE_SIZE = 15 * 1024 * 1024; // Allowed MIME types for images (SVG conditionally allowed based on security settings) const ALLOWED_MIME_TYPES = new Set([ 'image/jpeg', 'image/jpg', 'image/png', 'image/gif', 'image/webp', 'image/bmp', 'image/tiff', ]); // Add SVG to allowed types only if explicitly permitted if (securityOptions?.allowSvg) { ALLOWED_MIME_TYPES.add('image/svg+xml'); } const response = await axios_1.default.get(validatedUrl, { responseType: 'arraybuffer', headers: { 'User-Agent': 'Mozilla/5.0 (compatible; SocialPreviewBot/1.0)', }, timeout: securityOptions?.timeout || 12000, // Configurable timeout (default 12s for images) maxRedirects: securityOptions?.maxRedirects ?? 3, // Configurable redirects maxContentLength: MAX_IMAGE_SIZE, maxBodyLength: MAX_IMAGE_SIZE, httpAgent: (0, enhanced_secure_agent_1.getEnhancedSecureAgentForUrl)(validatedUrl), httpsAgent: (0, enhanced_secure_agent_1.getEnhancedSecureAgentForUrl)(validatedUrl), signal: abortSignal, // Add abort signal for request cancellation beforeRedirect: (options, _responseDetails) => { // Validate each redirect URL for SSRF protection using typed interface for clarity const redirectOptions = options; const redirectUrl = `${redirectOptions.protocol}//${redirectOptions.hostname}${redirectOptions.port ? `:${redirectOptions.port}` : ''}${redirectOptions.path || ''}${redirectOptions.search || ''}`; try { (0, validators_1.validateUrlInput)(redirectUrl); } catch (error) { throw new types_1.PreviewGeneratorError(types_1.ErrorType.VALIDATION_ERROR, `Image redirect to unsafe URL blocked: ${redirectUrl}`, error); } }, }); // Check content-type header if available const contentType = response.headers?.['content-type']?.toLowerCase(); if (contentType && !ALLOWED_MIME_TYPES.has(contentType)) { throw new Error(`Unsupported image type: ${contentType}. Only JPEG, PNG, GIF, WebP, BMP, and TIFF are allowed.`); } // Convert to Buffer efficiently without unnecessary copying const imageBuffer = Buffer.isBuffer(response.data) ? response.data : Buffer.from(response.data); // Check actual content length const contentLength = imageBuffer.length; if (contentLength > MAX_IMAGE_SIZE) { throw new Error(`Image too large: ${contentLength} bytes. Maximum allowed: ${MAX_IMAGE_SIZE} bytes.`); } // Validate image for security (pixel bombs, malformed files, etc.) await (0, image_security_1.validateImageBuffer)(imageBuffer, securityOptions?.allowSvg); return imageBuffer; } catch (error) { throw new types_1.PreviewGeneratorError(types_1.ErrorType.IMAGE_ERROR, `Failed to fetch image from ${imageUrl}: ${error instanceof Error ? error.message : String(error)}`, error); } } /** * Validate metadata to ensure required fields are present */ function validateMetadata(metadata) { return !!(metadata.title && metadata.title.length > 0); } /** * Apply fallback values to incomplete metadata */ function applyFallbacks(metadata, url) { const urlObj = new URL(url); return { title: metadata.title || urlObj.hostname, description: metadata.description, image: metadata.image, siteName: metadata.siteName || urlObj.hostname.replace('www.', ''), favicon: metadata.favicon || `${urlObj.protocol}//${urlObj.hostname}/favicon.ico`, author: metadata.author, publishedDate: metadata.publishedDate, url: metadata.url || url, domain: metadata.domain || urlObj.hostname, locale: metadata.locale || 'en_US', }; }