@knowcode/screenshotfetch
Version:
Web application spider with screenshot capture and customer journey documentation. Automate user flow documentation with authentication support.
199 lines (165 loc) • 5.46 kB
JavaScript
const urlParse = require('url-parse');
// Query parameters that only carry marketing/analytics attribution.
// They are stripped during normalization and never shown in display URLs.
const TRACKING_PARAMS = Object.freeze([
  'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
  'fbclid', 'gclid', 'ref', 'source', '_ga', '_gl',
]);

// Substrings that mark a query parameter as relevant to application state.
// Matching is by substring on the lower-cased name, so `userId` and
// `page_size` both qualify.
const IMPORTANT_PARAM_HINTS = Object.freeze([
  'id', 'user', 'page', 'tab', 'view', 'mode', 'type', 'category',
  'section', 'step', 'stage', 'status', 'filter', 'sort', 'q', 'search',
]);

// File extensions that are not useful for journey mapping.
const SKIP_EXTENSIONS = Object.freeze([
  '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.zip', '.exe',
]);

// URL substrings that cause a URL to be skipped.
const SKIP_PATTERNS = Object.freeze([
  '/api/', '/download/', '/export/', '/print/', '/pdf/',
  'logout', 'signout', 'sign-out', 'delete', 'remove',
]);

/**
 * Records the URLs seen during a crawl, normalizes them so that equivalent
 * addresses share one key, and keeps per-URL visit statistics.
 */
class URLCapture {
  /**
   * @param {object} [options]
   * @param {boolean} [options.includeQueryParams=true] When false, the whole
   *   query string is dropped during normalization and from display URLs.
   * @param {boolean} [options.normalizeUrls=true] When false, normalizeURL
   *   returns its input untouched.
   */
  constructor(options = {}) {
    this.options = {
      includeQueryParams: true,
      normalizeUrls: true,
      ...options,
    };
    // normalized URL -> { timestamp (first visit), count, lastVisit }
    this.visitedUrls = new Map();
  }

  /**
   * Builds a record for the page's current URL and updates visit tracking.
   *
   * @param {{ url: function(): string }} page Object whose `url()` returns
   *   the current address (presumably a browser-automation page — confirm
   *   against the caller).
   * @param {object} [metadata] Extra fields merged into the record. They can
   *   override `raw`, `normalized` and `timestamp`, but not the fields
   *   assigned afterwards (`components`, `display`, `revisit`, `firstVisit`,
   *   `visitCount`).
   * @returns {object} The URL record.
   */
  captureURL(page, metadata = {}) {
    const rawUrl = page.url();
    const timestamp = new Date().toISOString();

    const urlData = {
      raw: rawUrl,
      normalized: this.normalizeURL(rawUrl),
      timestamp,
      ...metadata,
    };

    // Break the raw URL into its components.
    const { protocol, hostname, port, pathname, query, hash } = urlParse(rawUrl);
    urlData.components = { protocol, hostname, port, pathname, query, hash };

    // Short, human-friendly form of the URL.
    urlData.display = this.getDisplayURL(rawUrl);

    // Look the URL up once; entries are always objects, so `undefined`
    // reliably means "never seen".
    const normalizedKey = this.getNormalizedKey(rawUrl);
    const previous = this.visitedUrls.get(normalizedKey);

    urlData.revisit = previous !== undefined;
    urlData.firstVisit = previous ? previous.timestamp : timestamp;
    urlData.visitCount = previous ? previous.count + 1 : 1;

    this.visitedUrls.set(normalizedKey, {
      timestamp: urlData.firstVisit,
      count: urlData.visitCount,
      lastVisit: timestamp,
    });

    return urlData;
  }

  /**
   * Returns a canonical form of `url` for de-duplication.
   *
   * URLs without a query string are returned unchanged. Otherwise the URL is
   * rebuilt from protocol, host and path, with either no query string
   * (`includeQueryParams` false) or the query string minus tracking
   * parameters. Parsing failures are logged and the input is returned as-is.
   *
   * NOTE(review): the rebuilt form does not include the fragment, so a `#hash`
   * is dropped when a query string is present but kept when there is none.
   * NOTE(review): the rebuilt form assumes an absolute `protocol//host` URL —
   * confirm callers never pass relative hrefs.
   *
   * @param {string} url
   * @returns {string}
   */
  normalizeURL(url) {
    if (!this.options.normalizeUrls) {
      return url;
    }

    try {
      const parsed = urlParse(url);
      if (!parsed.query) {
        return url;
      }

      const base = `${parsed.protocol}//${parsed.host}${parsed.pathname}`;
      if (!this.options.includeQueryParams) {
        return base;
      }

      const queryParams = new URLSearchParams(parsed.query);
      TRACKING_PARAMS.forEach((param) => queryParams.delete(param));
      const cleanQuery = queryParams.toString();
      return cleanQuery ? `${base}?${cleanQuery}` : base;
    } catch (error) {
      console.warn('⚠️ Could not normalize URL:', url, error.message);
      return url;
    }
  }

  /**
   * Returns a compact `hostname/path` label for `url`, followed by the query
   * parameters that isImportantParam accepts (when `includeQueryParams` is
   * enabled). On a parsing failure the input is returned unchanged.
   *
   * @param {string} url
   * @returns {string}
   */
  getDisplayURL(url) {
    try {
      const parsed = urlParse(url);
      let display = `${parsed.hostname}${parsed.pathname}`;

      if (parsed.query && this.options.includeQueryParams) {
        const importantParams = [];
        for (const [key, value] of new URLSearchParams(parsed.query)) {
          if (this.isImportantParam(key)) {
            importantParams.push(`${key}=${value}`);
          }
        }
        if (importantParams.length > 0) {
          display += `?${importantParams.join('&')}`;
        }
      }

      return display;
    } catch (error) {
      // Report the failure the same way normalizeURL does instead of hiding it.
      console.warn('⚠️ Could not build display URL:', url, error.message);
      return url;
    }
  }

  /**
   * Reports whether a query parameter name looks relevant to application
   * state. Known tracking parameters are rejected first: plain substring
   * matching would otherwise accept `fbclid` and `gclid` because they
   * contain `id`.
   *
   * @param {string} paramName
   * @returns {boolean}
   */
  isImportantParam(paramName) {
    const paramLower = paramName.toLowerCase();
    if (TRACKING_PARAMS.includes(paramLower)) {
      return false;
    }
    return IMPORTANT_PARAM_HINTS.some((hint) => paramLower.includes(hint));
  }

  /**
   * Returns the key under which `url` is stored in the visit map.
   *
   * @param {string} url
   * @returns {string}
   */
  getNormalizedKey(url) {
    return this.normalizeURL(url);
  }

  /**
   * @param {string} url
   * @returns {boolean} True when the normalized URL has been captured before.
   */
  hasVisited(url) {
    return this.visitedUrls.has(this.getNormalizedKey(url));
  }

  /**
   * @param {string} url
   * @returns {number} Number of captures of the normalized URL (0 if none).
   */
  getVisitCount(url) {
    return this.visitedUrls.get(this.getNormalizedKey(url))?.count ?? 0;
  }

  /**
   * Lists every tracked URL with its statistics, ordered by first visit.
   *
   * @returns {Array<{url: string, timestamp: string, count: number, lastVisit: string}>}
   */
  getAllVisitedUrls() {
    return Array.from(this.visitedUrls, ([url, data]) => ({ url, ...data }))
      .sort((a, b) => new Date(a.timestamp) - new Date(b.timestamp));
  }

  /**
   * Produces a snapshot of all tracked URLs together with the options in use.
   *
   * @returns {{totalUrls: number, urls: Array<object>, generatedAt: string, options: object}}
   */
  generateUrlIndex() {
    return {
      totalUrls: this.visitedUrls.size,
      urls: this.getAllVisitedUrls(),
      generatedAt: new Date().toISOString(),
      // Copy so that editing the snapshot cannot reconfigure this instance.
      options: { ...this.options },
    };
  }

  /**
   * @param {string} url
   * @returns {boolean} True when the URL has been captured more than once.
   */
  isDuplicateUrl(url) {
    return this.getVisitCount(url) > 1;
  }

  /**
   * Decides whether a URL should be left out of journey mapping, based on
   * its file extension and on a list of URL substrings.
   *
   * @param {string} url
   * @returns {boolean}
   */
  shouldSkipUrl(url) {
    const urlLower = url.toLowerCase();
    // Also test the extension with query string and fragment removed, so
    // that `report.pdf?v=2` and `manual.docx#intro` are recognized. The
    // full-URL check is kept so earlier skip decisions do not change.
    const pathLower = urlLower.split(/[?#]/, 1)[0];

    const hasSkippedExtension = SKIP_EXTENSIONS.some(
      (ext) => pathLower.endsWith(ext) || urlLower.endsWith(ext),
    );
    if (hasSkippedExtension) {
      return true;
    }

    return SKIP_PATTERNS.some((pattern) => urlLower.includes(pattern));
  }

  /**
   * Removes the fragment (everything from the first `#`) from a URL.
   *
   * @param {string} url
   * @returns {string}
   */
  cleanUrl(url) {
    return url.split('#')[0];
  }
}
// CommonJS export: requiring this module yields the URLCapture class itself.
module.exports = URLCapture;