UNPKG

@knowcode/screenshotfetch

Version:

Web application spider with screenshot capture and customer journey documentation. Automate user flow documentation with authentication support.

199 lines (165 loc) 5.46 kB
const urlParse = require('url-parse');

/**
 * Query parameters treated as tracking/attribution noise. They are stripped
 * by `normalizeURL` and are never considered "important" for display.
 */
const TRACKING_PARAMS = Object.freeze([
  'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
  'fbclid', 'gclid', 'ref', 'source', '_ga', '_gl',
]);

/**
 * Substrings that mark a query parameter name as relevant to application
 * state. Matching is by substring on the lower-cased parameter name.
 */
const IMPORTANT_PARAM_HINTS = Object.freeze([
  'id', 'user', 'page', 'tab', 'view', 'mode', 'type', 'category',
  'section', 'step', 'stage', 'status', 'filter', 'sort', 'q', 'search',
]);

/** File extensions that are not useful for journey mapping. */
const SKIP_EXTENSIONS = Object.freeze([
  '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.exe',
]);

/** URL substrings (matched case-insensitively) that cause a URL to be skipped. */
const SKIP_PATTERNS = Object.freeze([
  '/api/', '/download/', '/export/', '/print/', '/pdf/',
  'logout', 'signout', 'sign-out', 'delete', 'remove',
]);

/**
 * Records the URLs visited during a crawl, normalizes them so repeat visits
 * can be recognized, and produces display-friendly variants.
 */
class URLCapture {
  /**
   * @param {object} [options]
   * @param {boolean} [options.includeQueryParams=true] - When false, the whole
   *   query string is dropped during normalization and from display URLs.
   * @param {boolean} [options.normalizeUrls=true] - When false, `normalizeURL`
   *   returns its input unchanged.
   */
  constructor(options = {}) {
    this.options = {
      includeQueryParams: true,
      normalizeUrls: true,
      ...options,
    };
    // Normalized URL -> { timestamp (first visit), count, lastVisit }.
    this.visitedUrls = new Map();
  }

  /**
   * Captures the current URL of `page` and updates visit tracking.
   *
   * @param {{ url: function(): string }} page - Object exposing a `url()`
   *   method that returns the current URL.
   * @param {object} [metadata] - Extra fields merged into the result. The
   *   computed fields `components`, `display`, `revisit`, `firstVisit` and
   *   `visitCount` take precedence over same-named metadata keys.
   * @returns {object} URL record with `raw`, `normalized`, `timestamp`,
   *   `components`, `display`, `revisit`, `firstVisit` and `visitCount`.
   */
  captureURL(page, metadata = {}) {
    const rawUrl = page.url();
    const timestamp = new Date().toISOString();
    const normalized = this.normalizeURL(rawUrl);

    const parsed = urlParse(rawUrl);
    const display = this.getDisplayURL(rawUrl);

    // A single lookup tells us whether this is a revisit and, if so, when the
    // first visit happened and how many visits were recorded so far.
    const normalizedKey = this.getNormalizedKey(rawUrl);
    const previous = this.visitedUrls.get(normalizedKey);
    const isRevisit = previous !== undefined;

    const urlData = {
      raw: rawUrl,
      normalized,
      timestamp,
      ...metadata,
      components: {
        protocol: parsed.protocol,
        hostname: parsed.hostname,
        port: parsed.port,
        pathname: parsed.pathname,
        query: parsed.query,
        hash: parsed.hash,
      },
      display,
      revisit: isRevisit,
      firstVisit: isRevisit ? previous.timestamp : timestamp,
      visitCount: isRevisit ? previous.count + 1 : 1,
    };

    this.visitedUrls.set(normalizedKey, {
      timestamp: urlData.firstVisit,
      count: urlData.visitCount,
      lastVisit: timestamp,
    });

    return urlData;
  }

  /**
   * Normalizes a URL so that equivalent visits map to the same key.
   *
   * - With `normalizeUrls` disabled, or when the URL has no query string, the
   *   input is returned unchanged.
   * - With `includeQueryParams` disabled, the query string is dropped.
   * - Otherwise only the known tracking parameters are removed.
   *
   * NOTE(review): URLs without a query string are returned verbatim (fragment
   * included), whereas URLs with a query string are rebuilt from protocol,
   * host and pathname, which drops the fragment. Confirm whether that
   * asymmetry is intended before relying on fragments in visit keys.
   *
   * @param {string} url
   * @returns {string} The normalized URL, or `url` itself if it cannot be
   *   normalized (a warning is logged in that case).
   */
  normalizeURL(url) {
    if (!this.options.normalizeUrls) {
      return url;
    }

    try {
      const parsed = urlParse(url);
      if (!parsed.query) {
        return url;
      }

      const base = `${parsed.protocol}//${parsed.host}${parsed.pathname}`;
      if (!this.options.includeQueryParams) {
        return base;
      }

      const queryParams = new URLSearchParams(parsed.query);
      TRACKING_PARAMS.forEach(param => queryParams.delete(param));
      const cleanQuery = queryParams.toString();
      return cleanQuery ? `${base}?${cleanQuery}` : base;
    } catch (error) {
      console.warn('⚠️ Could not normalize URL:', url, error.message);
      return url;
    }
  }

  /**
   * Builds a short, human-readable form of a URL: hostname plus pathname,
   * followed by only those query parameters `isImportantParam` accepts.
   *
   * @param {string} url
   * @returns {string} The display URL, or `url` unchanged if building it fails.
   */
  getDisplayURL(url) {
    try {
      const parsed = urlParse(url);
      const base = `${parsed.hostname}${parsed.pathname}`;

      if (!parsed.query || !this.options.includeQueryParams) {
        return base;
      }

      const kept = [];
      for (const [key, value] of new URLSearchParams(parsed.query)) {
        if (this.isImportantParam(key)) {
          kept.push(`${key}=${value}`);
        }
      }

      return kept.length > 0 ? `${base}?${kept.join('&')}` : base;
    } catch {
      // Display strings are best-effort: fall back to the raw URL.
      return url;
    }
  }

  /**
   * Decides whether a query parameter is worth showing in a display URL.
   *
   * Known tracking parameters are rejected first. Without that check the
   * substring match below would accept `fbclid` and `gclid`, because both
   * contain "id".
   *
   * @param {string} paramName
   * @returns {boolean}
   */
  isImportantParam(paramName) {
    const paramLower = paramName.toLowerCase();
    if (TRACKING_PARAMS.includes(paramLower)) {
      return false;
    }
    return IMPORTANT_PARAM_HINTS.some(hint => paramLower.includes(hint));
  }

  /**
   * @param {string} url
   * @returns {string} The key under which `url` is stored in the visit map.
   */
  getNormalizedKey(url) {
    return this.normalizeURL(url);
  }

  /**
   * @param {string} url
   * @returns {boolean} True if the normalized form of `url` has been captured.
   */
  hasVisited(url) {
    return this.visitedUrls.has(this.getNormalizedKey(url));
  }

  /**
   * @param {string} url
   * @returns {number} Number of recorded visits, or 0 if never captured.
   */
  getVisitCount(url) {
    return this.visitedUrls.get(this.getNormalizedKey(url))?.count ?? 0;
  }

  /**
   * @returns {Array<object>} One `{ url, timestamp, count, lastVisit }` entry
   *   per tracked URL, ordered by first-visit timestamp (oldest first).
   */
  getAllVisitedUrls() {
    const entries = Array.from(this.visitedUrls, ([url, data]) => ({ url, ...data }));
    return entries.sort((a, b) => new Date(a.timestamp) - new Date(b.timestamp));
  }

  /**
   * @returns {object} Summary with `totalUrls`, the sorted `urls` list, a
   *   `generatedAt` ISO timestamp and the active `options`.
   */
  generateUrlIndex() {
    return {
      totalUrls: this.visitedUrls.size,
      urls: this.getAllVisitedUrls(),
      generatedAt: new Date().toISOString(),
      options: this.options,
    };
  }

  /**
   * @param {string} url
   * @returns {boolean} True if `url` has been captured more than once.
   */
  isDuplicateUrl(url) {
    // A count above 1 already implies the URL has been visited.
    return this.getVisitCount(url) > 1;
  }

  /**
   * Decides whether a URL should be excluded from journey mapping, based on
   * its file extension or on substrings that suggest downloads, API calls or
   * destructive/sign-out actions.
   *
   * @param {string} url
   * @returns {boolean}
   */
  shouldSkipUrl(url) {
    const urlLower = url.toLowerCase();

    // Check the extension against the URL with query string and fragment
    // removed, so "report.pdf?download=1" is recognized. The whole-URL check
    // is kept so that anything skipped before is still skipped.
    const pathLower = urlLower.split(/[?#]/, 1)[0];
    const hasSkippedExtension = SKIP_EXTENSIONS.some(
      ext => pathLower.endsWith(ext) || urlLower.endsWith(ext)
    );
    if (hasSkippedExtension) {
      return true;
    }

    return SKIP_PATTERNS.some(pattern => urlLower.includes(pattern));
  }

  /**
   * @param {string} url
   * @returns {string} `url` with everything from the first `#` removed.
   */
  cleanUrl(url) {
    return url.split('#')[0];
  }
}

module.exports = URLCapture;