UNPKG

@akukral/site-comparator

Version:

A sophisticated website comparison tool with intelligent content analysis and offset-aware difference detection

1,268 lines (1,134 loc) 80.3 kB
#!/usr/bin/env node const fs = require('fs').promises; const path = require('path'); const puppeteer = require('puppeteer'); const cheerio = require('cheerio'); const crypto = require('crypto'); const { URL } = require('url'); const readline = require('readline'); const readlineSync = require('readline-sync'); /** * Site Comparator - A tool for comparing websites and detecting differences in content, structure, and functionality. * * This class provides comprehensive website comparison capabilities including: * - Automated page discovery and crawling * - Content normalization and comparison * - Authentication handling for protected sites * - Detailed difference analysis and reporting * - HTML and JSON report generation * * @class Comparator * @example * const comparator = new Comparator({ maxPages: 10, delay: 2000 }); * await comparator.compare('https://staging.example.com', 'https://example.com'); */ class Comparator { /** * Creates a new Comparator instance with configurable options. * * @param {Object} options - Configuration options for the comparator * @param {number} [options.maxPages=20] - Maximum number of pages to crawl per site * @param {number} [options.maxDiscovery=500] - Maximum number of unique links to discover * @param {number} [options.delay=1000] - Delay between requests in milliseconds * @param {number} [options.timeout=30000] - Page load timeout in milliseconds * @param {string[]} [options.ignoreElements=['script', 'noscript', 'style']] - HTML elements to ignore during comparison * @param {string[]} [options.ignoreAttributes=['data-csrf', 'csrf-token', '_token', 'nonce']] - HTML attributes to ignore during comparison * @param {string[]} [options.ignoreClasses=['timestamp', 'csrf', 'nonce', 'random']] - CSS classes to ignore during comparison * @param {string} [options.userAgent='Comparator Bot 1.3.0'] - User agent string for requests * @param {string} [options.outputDir='./comparator-results'] - Directory to save comparison results */ constructor(options = {}) { this.options = { maxPages: 20, maxDiscovery: 500, delay: 1000, timeout: 30000, ignoreElements: ['script', 'noscript', 'style'], ignoreAttributes: ['data-csrf', 'csrf-token', '_token', 'nonce'], ignoreClasses: ['timestamp', 'csrf', 'nonce', 'random'], userAgent: 'Comparator Bot 1.3.0', outputDir: './comparator-results', ...options }; this.visited = new Set(); this.results = { compared: 0, differences: [], errors: [], summary: {} }; } /** * Helper method for waiting - compatible with all Puppeteer versions * * This method provides a consistent way to wait for a specified time period * across different versions of Puppeteer, handling the API changes between * older versions (waitForTimeout) and newer versions (setTimeout). * * @async * @param {Page} page - Puppeteer page instance * @param {number} milliseconds - Time to wait in milliseconds * @returns {Promise<void>} Promise that resolves after the specified delay * @example * await comparator.waitFor(page, 2000); // Wait 2 seconds */ async waitFor(page, milliseconds) { if (typeof page.waitForTimeout === 'function') { // Older Puppeteer versions return await page.waitForTimeout(milliseconds); } else { // Newer Puppeteer versions - use setTimeout return new Promise(resolve => setTimeout(resolve, milliseconds)); } } /** * Initializes the comparator by creating output directory and launching browser. * * This method sets up the environment for website comparison including: * - Creating the output directory if it doesn't exist * - Launching a Puppeteer browser instance with optimized settings * - Configuring browser security and compatibility options * * @async * @throws {Error} If browser launch fails or directory creation fails * @example * await comparator.init(); */ async init() { // Create output directory await fs.mkdir(this.options.outputDir, { recursive: true }); // Launch browser with better compatibility options this.browser = await puppeteer.launch({ headless: 'new', args: [ '--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-web-security', '--allow-running-insecure-content', '--ignore-certificate-errors', '--ignore-ssl-errors' ] }); } /** * Cleans up resources by closing the browser instance. * * This method should be called when the comparator is no longer needed * to free up system resources and close browser processes. * * @async * @example * await comparator.cleanup(); */ async cleanup() { if (this.browser) { await this.browser.close(); } } /** * Retrieves authentication credentials for a domain. * * This method attempts to get credentials from multiple sources in order: * 1. Command line options passed to the compare method * 2. Environment variables (COMPARATOR_USER_<DOMAIN>, COMPARATOR_PASSWORD_<DOMAIN>) * 3. Global environment variables (COMPARATOR_USERNAME, COMPARATOR_PASSWORD) * 4. Interactive prompts for username and password * * @async * @param {string} domain - The domain requiring authentication * @param {Object} options - Authentication options * @param {string} [options.username] - Username for authentication * @param {string} [options.password] - Password for authentication * @returns {Promise<Object|null>} Authentication object with username and password, or null if no credentials * @example * const auth = await comparator.getAuthCredentials('https://protected.example.com'); * if (auth) { * console.log(`Using credentials for ${auth.username}`); * } */ async getAuthCredentials(domain, options = {}) { // Check if credentials were provided via command line or environment if (options.username && options.password) { return { username: options.username, password: options.password }; } // Check environment variables const envUsername = process.env[`COMPARATOR_USER_${this.getDomainKey(domain)}`] || process.env.COMPARATOR_USERNAME; const envPassword = process.env[`COMPARATOR_PASS_${this.getDomainKey(domain)}`] || process.env.COMPARATOR_PASSWORD; if (envUsername && envPassword) { console.log(`Using credentials from environment for ${domain}`); return { username: envUsername, password: envPassword }; } return new Promise((resolve) => { console.log(`\nHTTP Authentication may be required for ${domain}`); // Use readline-sync for username input const username = readlineSync.question('Username (press enter to skip): '); if (!username.trim()) { resolve(null); return; } // Use readline-sync's built-in password hiding - completely invisible keystrokes const password = readlineSync.question('Password: ', { hideEchoBack: true, // This completely hides all keystrokes mask: '' // No mask character shown }); resolve({ username: username.trim(), password: password }); }); } /** * Helper to create environment variable key from domain * * This method converts a domain URL into a valid environment variable key * by extracting the hostname and replacing non-alphanumeric characters with underscores. * This is used for domain-specific authentication environment variables. * * @param {string} domain - Domain URL to convert * @returns {string} Environment variable key for the domain * @example * const key = comparator.getDomainKey('https://staging.example.com'); * // Returns: 'STAGING_EXAMPLE_COM' */ getDomainKey(domain) { return new URL(domain).hostname.replace(/[^a-zA-Z0-9]/g, '_').toUpperCase(); } /** * Creates a new browser page with optional authentication. * * This method creates a new page instance with consistent configuration: * - Sets user agent and viewport dimensions * - Applies HTTP Basic Authentication if credentials are provided * - Sets up response handlers for authentication challenges * * @async * @param {Object} [auth=null] - Authentication credentials object * @param {string} auth.username - Username for HTTP Basic Auth * @param {string} auth.password - Password for HTTP Basic Auth * @returns {Promise<Page>} Configured Puppeteer page instance * @example * const page = await comparator.createPage({ username: 'user', password: 'pass' }); */ async createPage(auth = null) { const page = await this.browser.newPage(); await page.setUserAgent(this.options.userAgent); await page.setViewport({ width: 1920, height: 1080 }); // Set basic auth if provided if (auth) { await page.authenticate(auth); } // Handle authentication challenges page.on('response', async (response) => { if (response.status() === 401 && auth) { console.log(`Authentication challenge detected for ${response.url()}`); } }); return page; } /** * Normalizes HTML content by removing spurious differences and standardizing formatting. * * This method performs several normalization steps to ensure fair comparison: * - Removes ignored HTML elements (scripts, styles, etc.) * - Strips ignored attributes (CSRF tokens, nonces, etc.) * - Filters out ignored CSS classes * - Normalizes URLs to account for domain differences * - Removes HTML comments and normalizes whitespace * * @param {string} html - Raw HTML content to normalize * @param {string} baseUrl - Original domain URL for URL normalization * @param {string} targetUrl - Target domain URL for URL normalization * @returns {string} Normalized HTML content ready for comparison * @example * const normalized = comparator.normalizeContent(html, 'https://site1.com', 'https://site2.com'); */ normalizeContent(html, baseUrl, targetUrl) { const $ = cheerio.load(html); // Remove ignored elements this.options.ignoreElements.forEach(selector => { $(selector).remove(); }); // Remove ignored attributes $('*').each((i, elem) => { const $elem = $(elem); this.options.ignoreAttributes.forEach(attr => { $elem.removeAttr(attr); }); // Remove ignored classes const classes = $elem.attr('class'); if (classes) { const filteredClasses = classes .split(' ') .filter(cls => !this.options.ignoreClasses.some(ignore => cls.includes(ignore))) .join(' '); if (filteredClasses) { $elem.attr('class', filteredClasses); } else { $elem.removeAttr('class'); } } }); // Normalize URLs - replace baseUrl with targetUrl in links and sources $('a[href], img[src], link[href], script[src]').each((i, elem) => { const $elem = $(elem); const attrName = $elem.attr('href') ? 'href' : 'src'; const url = $elem.attr(attrName); if (url && url.includes(baseUrl)) { $elem.attr(attrName, url.replace(baseUrl, targetUrl)); } }); // Remove comments $('*').contents().filter(function() { return this.type === 'comment'; }).remove(); // Normalize whitespace return $.html() .replace(/\s+/g, ' ') .replace(/>\s+</g, '><') .trim(); } /** * Extracts structured content from HTML for detailed comparison. * * This method parses HTML and extracts key content elements: * - Page title and headings * - Paragraph text content * - Link text and URLs * - Image alt text and sources * - Form structure and input fields * * @param {string} html - HTML content to extract from * @returns {Object} Structured content object with extracted elements * @returns {string} returns.title - Page title text * @returns {string[]} returns.headings - Array of heading texts * @returns {string[]} returns.paragraphs - Array of paragraph texts * @returns {Object[]} returns.links - Array of link objects with text and href * @returns {Object[]} returns.images - Array of image objects with alt and src * @returns {Object[]} returns.forms - Array of form objects with action, method, and inputs * @example * const content = comparator.extractContent(html); * console.log(`Page has ${content.headings.length} headings`); */ extractContent(html) { const $ = cheerio.load(html); return { title: $('head > title').text().trim(), headings: $('h1, h2, h3, h4, h5, h6').map((i, el) => $(el).text().trim()).get(), paragraphs: $('p').map((i, el) => $(el).text().trim()).get().filter(p => p.length > 0), links: $('a[href]').map((i, el) => ({ text: $(el).text().trim(), href: $(el).attr('href') })).get(), images: $('img[src]').map((i, el) => ({ alt: $(el).attr('alt') || '', src: $(el).attr('src') })).get(), forms: $('form').map((i, el) => ({ action: $(el).attr('action') || '', method: $(el).attr('method') || 'GET', inputs: $(el).find('input, textarea, select').map((j, input) => ({ name: $(input).attr('name') || '', type: $(input).attr('type') || 'text' })).get() })).get() }; } /** * Compares two pages and identifies differences in content and structure. * * This method performs comprehensive comparison of normalized content: * - Compares page titles, headings, and paragraphs * - Analyzes link structures and image content * - Examines form structures and input fields * - Uses intelligent matching to detect content reordering * - Provides detailed difference analysis with snippets * * @param {string} url - URL path being compared * @param {string} content1 - HTML content from first site * @param {string} content2 - HTML content from second site * @param {string} domain1 - Base domain of first site * @param {string} domain2 - Base domain of second site * @returns {Object} Comparison result object * @returns {string} returns.url - URL path that was compared * @returns {boolean} returns.hasDifferences - Whether any differences were found * @returns {Array} returns.differences - Array of difference objects * @returns {Object} returns.extracted1 - Extracted content from first site * @returns {Object} returns.extracted2 - Extracted content from second site * @example * const comparison = comparator.comparePage('/about', html1, html2, 'https://site1.com', 'https://site2.com'); * if (comparison.hasDifferences) { * console.log(`Found ${comparison.differences.length} differences`); * } */ comparePage(url, content1, content2, domain1, domain2) { const normalized1 = this.normalizeContent(content1, domain1, domain2); const normalized2 = this.normalizeContent(content2, domain2, domain1); const extracted1 = this.extractContent(normalized1); const extracted2 = this.extractContent(normalized2); const differences = []; // Compare titles if (extracted1.title !== extracted2.title) { differences.push({ type: 'title', site1: extracted1.title, site2: extracted2.title, snippet: this.getSnippet(extracted1.title, extracted2.title) }); } // Compare headings with detailed analysis const headingDiff = this.compareHeadings(extracted1.headings, extracted2.headings); if (headingDiff.hasDifferences) { differences.push({ type: 'headings', site1: extracted1.headings.length, site2: extracted2.headings.length, details: headingDiff.details, snippets: headingDiff.snippets }); } // Compare paragraphs with content analysis const paragraphDiff = this.compareParagraphs(extracted1.paragraphs, extracted2.paragraphs); if (paragraphDiff.hasDifferences) { differences.push({ type: 'paragraphs', site1: extracted1.paragraphs.length, site2: extracted2.paragraphs.length, details: paragraphDiff.details, snippets: paragraphDiff.snippets }); } // Compare links with detailed analysis const linkDiff = this.compareLinks(extracted1.links, extracted2.links); if (linkDiff.hasDifferences) { differences.push({ type: 'links', site1: extracted1.links.length, site2: extracted2.links.length, details: linkDiff.details, snippets: linkDiff.snippets }); } // Compare images const imageDiff = this.compareImages(extracted1.images, extracted2.images); if (imageDiff.hasDifferences) { differences.push({ type: 'images', site1: extracted1.images.length, site2: extracted2.images.length, details: imageDiff.details, snippets: imageDiff.snippets }); } // Compare forms const formDiff = this.compareForms(extracted1.forms, extracted2.forms); if (formDiff.hasDifferences) { differences.push({ type: 'forms', site1: extracted1.forms.length, site2: extracted2.forms.length, details: formDiff.details, snippets: formDiff.snippets }); } return { url, hasDifferences: differences.length > 0, differences, extracted1, extracted2 }; } /** * Helper method to create snippets for differences * * This method creates truncated text snippets for displaying differences * in reports, ensuring that long text content is displayed in a readable format. * * @param {string} text1 - Text from first site * @param {string} text2 - Text from second site * @param {number} [maxLength=100] - Maximum length for each snippet * @returns {Object} Object containing truncated snippets for both sites * @returns {string} returns.site1 - Truncated text from first site * @returns {string} returns.site2 - Truncated text from second site * @example * const snippet = comparator.getSnippet('Very long text content...', 'Different long content...', 50); * console.log(snippet.site1); // "Very long text content..." */ getSnippet(text1, text2, maxLength = 100) { const truncate = (text) => { if (text.length <= maxLength) return text; return text.substring(0, maxLength) + '...'; }; return { site1: truncate(text1), site2: truncate(text2) }; } /** * Intelligent content difference detection that accounts for offsets * * This method performs sophisticated content comparison that can detect: * - Content additions and deletions * - Content reordering * - Position-independent differences * * It uses the longest common subsequence algorithm to identify reordering * and provides detailed analysis of content changes. * * @param {Array} array1 - First array of content items * @param {Array} array2 - Second array of content items * @param {string} [contentType='content'] - Type of content being compared * @returns {Object} Detailed difference analysis * @returns {Array} returns.differences - Array of difference objects * @returns {Array} returns.additions - Array of addition objects * @returns {Array} returns.deletions - Array of deletion objects * @returns {number} returns.matches - Number of matching items * @returns {boolean} returns.reordered - Whether content appears to be reordered * @example * const analysis = comparator.findContentDifferences(headings1, headings2, 'heading'); * if (analysis.reordered) { * console.log('Content appears to be reordered'); * } */ findContentDifferences(array1, array2, contentType = 'content') { const differences = []; const additions = []; const deletions = []; // Normalize arrays for comparison const normalized1 = array1.map(item => this.normalizeForComparison(item)); const normalized2 = array2.map(item => this.normalizeForComparison(item)); // Find the longest common subsequence to identify reordering const lcs = this.findLongestCommonSubsequence(normalized1, normalized2); // Create maps for efficient lookup const map1 = new Map(); const map2 = new Map(); // Build maps with normalized content as key and original items as values array1.forEach((item, index) => { const key = normalized1[index]; if (!map1.has(key)) { map1.set(key, []); } map1.get(key).push(item); }); array2.forEach((item, index) => { const key = normalized2[index]; if (!map2.has(key)) { map2.set(key, []); } map2.get(key).push(item); }); // Find items that exist in both arrays (matches) const matches = new Set(); for (const [key] of map1) { if (map2.has(key)) { matches.add(key); } } // Find additions (items only in array2) for (const [key, items2] of map2) { if (!matches.has(key)) { additions.push({ type: 'addition', contentType, items: items2, count: items2.length, snippet: this.getSnippet('', items2[0], 150) }); } } // Find deletions (items only in array1) for (const [key, items1] of map1) { if (!matches.has(key)) { deletions.push({ type: 'deletion', contentType, items: items1, count: items1.length, snippet: this.getSnippet(items1[0], '', 150) }); } } // Check for reordering (items that exist in both but in different positions) if (lcs.length < Math.min(normalized1.length, normalized2.length)) { const reorderedCount = Math.min(normalized1.length, normalized2.length) - lcs.length; if (reorderedCount > 0) { differences.push({ type: 'reordering', contentType, count: reorderedCount, description: `${reorderedCount} ${contentType}s appear to be reordered` }); } } return { differences: differences, additions: additions, deletions: deletions, matches: matches.size, reordered: lcs.length < Math.min(normalized1.length, normalized2.length) }; } /** * Find longest common subsequence to detect reordering * * This method implements the longest common subsequence (LCS) algorithm * to identify how much content has been reordered between two arrays. * It's used to distinguish between actual content changes and simple reordering. * * @param {Array} arr1 - First array * @param {Array} arr2 - Second array * @returns {Array} Longest common subsequence between the arrays * @example * const lcs = comparator.findLongestCommonSubsequence(array1, array2); * const reorderedCount = Math.min(array1.length, array2.length) - lcs.length; */ findLongestCommonSubsequence(arr1, arr2) { const m = arr1.length; const n = arr2.length; const dp = Array(m + 1).fill().map(() => Array(n + 1).fill(0)); // Build LCS matrix for (let i = 1; i <= m; i++) { for (let j = 1; j <= n; j++) { if (arr1[i - 1] === arr2[j - 1]) { dp[i][j] = dp[i - 1][j - 1] + 1; } else { dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]); } } } // Reconstruct the LCS const lcs = []; let i = m, j = n; while (i > 0 && j > 0) { if (arr1[i - 1] === arr2[j - 1]) { lcs.unshift(arr1[i - 1]); i--; j--; } else if (dp[i - 1][j] > dp[i][j - 1]) { i--; } else { j--; } } return lcs; } /** * Normalize content for comparison (remove extra whitespace, etc.) * * This method standardizes text content for fair comparison by: * - Converting to lowercase * - Trimming whitespace * - Normalizing multiple spaces to single spaces * * @param {*} text - Text content to normalize (converted to string if not already) * @returns {string} Normalized text content * @example * const normalized = comparator.normalizeForComparison(' Hello World '); * // Returns: "hello world" */ normalizeForComparison(text) { if (typeof text !== 'string') { text = String(text); } return text .trim() .replace(/\s+/g, ' ') .toLowerCase(); } /** * Compare headings with detailed analysis * * This method compares heading structures between two sites and provides * detailed analysis of differences including additions, deletions, and reordering. * * @param {string[]} headings1 - Headings from first site * @param {string[]} headings2 - Headings from second site * @returns {Object} Heading comparison result * @returns {boolean} returns.hasDifferences - Whether any differences were found * @returns {string[]} returns.details - Array of difference descriptions * @returns {Array} returns.snippets - Array of difference snippets (limited to 5) * @example * const result = comparator.compareHeadings(site1Headings, site2Headings); * if (result.hasDifferences) { * console.log('Heading differences:', result.details.join(', ')); * } */ compareHeadings(headings1, headings2) { const details = []; const snippets = []; if (headings1.length !== headings2.length) { details.push(`Different number of headings: ${headings1.length} vs ${headings2.length}`); } // Use intelligent matching to find actual differences, not just position shifts const { differences, additions, deletions, reordered } = this.findContentDifferences(headings1, headings2, 'heading'); if (differences.length > 0) { details.push(`${differences.length} headings have different content`); snippets.push(...differences.slice(0, 3)); } if (additions.length > 0) { details.push(`${additions.length} headings added`); snippets.push(...additions.slice(0, 2)); } if (deletions.length > 0) { details.push(`${deletions.length} headings removed`); snippets.push(...deletions.slice(0, 2)); } if (reordered) { details.push(`Headings appear to be reordered`); } return { hasDifferences: details.length > 0, details, snippets: snippets.slice(0, 5) // Limit to first 5 differences }; } /** * Compare paragraphs with content analysis * * This method compares paragraph content between two sites and provides * detailed analysis of differences including additions, deletions, and reordering. * * @param {string[]} paragraphs1 - Paragraphs from first site * @param {string[]} paragraphs2 - Paragraphs from second site * @returns {Object} Paragraph comparison result * @returns {boolean} returns.hasDifferences - Whether any differences were found * @returns {string[]} returns.details - Array of difference descriptions * @returns {Array} returns.snippets - Array of difference snippets (limited to 3) * @example * const result = comparator.compareParagraphs(site1Paragraphs, site2Paragraphs); * if (result.hasDifferences) { * console.log('Paragraph differences:', result.details.join(', ')); * } */ compareParagraphs(paragraphs1, paragraphs2) { const details = []; const snippets = []; if (paragraphs1.length !== paragraphs2.length) { details.push(`Different number of paragraphs: ${paragraphs1.length} vs ${paragraphs2.length}`); } // Use intelligent matching to find actual differences, not just position shifts const { differences, additions, deletions, reordered } = this.findContentDifferences(paragraphs1, paragraphs2, 'paragraph'); if (differences.length > 0) { details.push(`${differences.length} paragraphs have different content`); snippets.push(...differences.slice(0, 3)); } if (additions.length > 0) { details.push(`${additions.length} paragraphs added`); snippets.push(...additions.slice(0, 2)); } if (deletions.length > 0) { details.push(`${deletions.length} paragraphs removed`); snippets.push(...deletions.slice(0, 2)); } if (reordered) { details.push(`Paragraphs appear to be reordered`); } return { hasDifferences: details.length > 0, details, snippets: snippets.slice(0, 3) // Limit to first 3 differences }; } /** * Compare links with detailed analysis * * This method compares link structures between two sites and provides * detailed analysis of differences including additions, deletions, and reordering. * * @param {Object[]} links1 - Links from first site * @param {Object[]} links2 - Links from second site * @returns {Object} Link comparison result * @returns {boolean} returns.hasDifferences - Whether any differences were found * @returns {string[]} returns.details - Array of difference descriptions * @returns {Array} returns.snippets - Array of difference snippets * @example * const result = comparator.compareLinks(site1Links, site2Links); * if (result.hasDifferences) { * console.log('Link differences:', result.details.join(', ')); * } */ compareLinks(links1, links2) { const details = []; const snippets = []; if (links1.length !== links2.length) { details.push(`Different number of links: ${links1.length} vs ${links2.length}`); } // Extract link texts for comparison const texts1 = links1.map(l => l.text).filter(t => t.length > 0); const texts2 = links2.map(l => l.text).filter(t => t.length > 0); // Use intelligent matching to find actual differences const { differences, additions, deletions, reordered } = this.findContentDifferences(texts1, texts2, 'link'); if (differences.length > 0) { details.push(`${differences.length} links have different text`); snippets.push(...differences.slice(0, 3)); } if (additions.length > 0) { details.push(`${additions.length} links added`); snippets.push(...additions.slice(0, 2)); } if (deletions.length > 0) { details.push(`${deletions.length} links removed`); snippets.push(...deletions.slice(0, 2)); } if (reordered) { details.push(`Links appear to be reordered`); } return { hasDifferences: details.length > 0, details, snippets }; } /** * Compare images with detailed analysis * * This method compares image content between two sites and provides * detailed analysis of differences including counts, alt text, and sources. * * @param {Object[]} images1 - Images from first site * @param {Object[]} images2 - Images from second site * @returns {Object} Image comparison result * @returns {boolean} returns.hasDifferences - Whether any differences were found * @returns {string[]} returns.details - Array of difference descriptions * @returns {Array} returns.snippets - Array of difference snippets * @example * const result = comparator.compareImages(site1Images, site2Images); * if (result.hasDifferences) { * console.log('Image differences:', result.details.join(', ')); * } */ compareImages(images1, images2) { const details = []; const snippets = []; if (images1.length !== images2.length) { details.push(`Different number of images: ${images1.length} vs ${images2.length}`); } // Find images with missing alt text const missingAlt1 = images1.filter(img => !img.alt || img.alt.trim() === '').length; const missingAlt2 = images2.filter(img => !img.alt || img.alt.trim() === '').length; if (missingAlt1 !== missingAlt2) { details.push(`Different number of images without alt text: ${missingAlt1} vs ${missingAlt2}`); } // Find different image sources const srcs1 = images1.map(img => img.src).filter(src => src); const srcs2 = images2.map(img => img.src).filter(src => src); const uniqueSrcs1 = srcs1.filter(src => !srcs2.includes(src)); const uniqueSrcs2 = srcs2.filter(src => !srcs1.includes(src)); if (uniqueSrcs1.length > 0 || uniqueSrcs2.length > 0) { details.push(`Different image sources found`); if (uniqueSrcs1.length > 0) { snippets.push({ type: 'images_only_in_site1', count: uniqueSrcs1.length, examples: uniqueSrcs1.slice(0, 2) }); } if (uniqueSrcs2.length > 0) { snippets.push({ type: 'images_only_in_site2', count: uniqueSrcs2.length, examples: uniqueSrcs2.slice(0, 2) }); } } return { hasDifferences: details.length > 0, details, snippets }; } /** * Compare forms with detailed analysis * * This method compares form structures between two sites and provides * detailed analysis of differences including actions, methods, and input fields. * * @param {Object[]} forms1 - Forms from first site * @param {Object[]} forms2 - Forms from second site * @returns {Object} Form comparison result * @returns {boolean} returns.hasDifferences - Whether any differences were found * @returns {string[]} returns.details - Array of difference descriptions * @returns {Array} returns.snippets - Array of difference snippets * @example * const result = comparator.compareForms(site1Forms, site2Forms); * if (result.hasDifferences) { * console.log('Form differences:', result.details.join(', ')); * } */ compareForms(forms1, forms2) { const details = []; const snippets = []; if (forms1.length !== forms2.length) { details.push(`Different number of forms: ${forms1.length} vs ${forms2.length}`); } // Compare form actions and methods const actions1 = forms1.map(f => f.action).filter(a => a); const actions2 = forms2.map(f => f.action).filter(a => a); if (actions1.length !== actions2.length) { details.push(`Different number of form actions: ${actions1.length} vs ${actions2.length}`); } // Compare input counts const totalInputs1 = forms1.reduce((sum, form) => sum + form.inputs.length, 0); const totalInputs2 = forms2.reduce((sum, form) => sum + form.inputs.length, 0); if (totalInputs1 !== totalInputs2) { details.push(`Different total input fields: ${totalInputs1} vs ${totalInputs2}`); } return { hasDifferences: details.length > 0, details, snippets }; } /** * Generate summary of difference types found * * This method analyzes all differences found during comparison and provides * a count of each type of difference (titles, headings, paragraphs, etc.). * * @returns {Object} Object with counts for each difference type * @example * const summary = comparator.getDifferenceTypeSummary(); * console.log(`Found ${summary.title || 0} title differences`); * console.log(`Found ${summary.headings || 0} heading differences`); */ getDifferenceTypeSummary() { const typeCounts = {}; this.results.differences.forEach(diff => { diff.differences.forEach(d => { typeCounts[d.type] = (typeCounts[d.type] || 0) + 1; }); }); return typeCounts; } /** * Get the most significant differences for quick overview * * This method identifies the most important differences found during comparison * based on criteria like title changes, content with snippets, and large count differences. * It's useful for providing a quick summary of the most impactful changes. * * @returns {Array} Array of significant difference objects (limited to top 5) * @example * const significant = comparator.getSignificantDifferences(); * console.log(`Found ${significant.length} significant differences`); */ getSignificantDifferences() { const significant = []; this.results.differences.forEach(diff => { const significantDiffs = diff.differences.filter(d => { // Consider title differences as most significant if (d.type === 'title') return true; // Consider content differences with snippets as significant if (d.snippets && d.snippets.length > 0) return true; // Consider large count differences as significant if (Math.abs(d.site1 - d.site2) > 2) return true; return false; }); if (significantDiffs.length > 0) { significant.push({ url: diff.url, differences: significantDiffs }); } }); return significant.slice(0, 5); // Return top 5 most significant } /** * Get a summary of offset analysis results * * This method provides a comprehensive summary of content changes including: * - Pages with additions, deletions, and reordering * - Total counts of content changes * - Breakdown by content type * * @returns {Object} Summary of content change analysis * @returns {number} returns.totalPages - Total pages analyzed * @returns {number} returns.pagesWithAdditions - Pages with content additions * @returns {number} returns.pagesWithDeletions - Pages with content deletions * @returns {number} returns.pagesWithReordering - Pages with content reordering * @returns {number} returns.totalAdditions - Total content items added * @returns {number} returns.totalDeletions - Total content items removed * @returns {Object} returns.contentTypes - Breakdown by content type * @example * const analysis = comparator.getOffsetAnalysisSummary(); * console.log(`${analysis.totalAdditions} items were added across all pages`); */ getOffsetAnalysisSummary() { const summary = { totalPages: this.results.differences.length, pagesWithAdditions: 0, pagesWithDeletions: 0, pagesWithReordering: 0, totalAdditions: 0, totalDeletions: 0, contentTypes: {} }; this.results.differences.forEach(diff => { let hasAdditions = false; let hasDeletions = false; let hasReordering = false; diff.differences.forEach(d => { if (d.snippets) { d.snippets.forEach(snippet => { if (snippet.type === 'addition') { hasAdditions = true; summary.totalAdditions += snippet.count; summary.contentTypes[snippet.contentType] = (summary.contentTypes[snippet.contentType] || 0) + snippet.count; } else if (snippet.type === 'deletion') { hasDeletions = true; summary.totalDeletions += snippet.count; summary.contentTypes[snippet.contentType] = (summary.contentTypes[snippet.contentType] || 0) + snippet.count; } else if (snippet.type === 'reordering') { hasReordering = true; } }); } }); if (hasAdditions) summary.pagesWithAdditions++; if (hasDeletions) summary.pagesWithDeletions++; if (hasReordering) summary.pagesWithReordering++; }); return summary; } /** * Crawls a single page to extract content and discover links. * * This method handles the process of visiting a page: * - Navigates to the specified URL with proper waiting * - Extracts the page content and status * - Discovers internal links for further crawling * - Handles errors gracefully and provides detailed error information * * @async * @param {Page} page - Puppeteer page instance to use for crawling * @param {string} url - URL to crawl * @param {Object} [auth=null] - Authentication credentials if needed * @returns {Promise<Object>} Crawl result object * @returns {string|null} returns.content - HTML content of the page, or null if failed * @returns {string[]} returns.links - Array of discovered internal links * @returns {number} returns.status - HTTP status code of the response * @returns {string} [returns.error] - Error message if crawling failed * @example * const result = await comparator.crawlPage(page, 'https://example.com/page'); * if (result.content) { * console.log(`Found ${result.links.length} links on page`); * } */ async crawlPage(page, url, auth = null) { try { console.log(`Crawling: ${url}`); const response = await page.goto(url, { waitUntil: 'networkidle2', timeout: this.options.timeout }); if (!response.ok()) { throw new Error(`HTTP ${response.status()}: ${response.statusText()}`); } // Wait for dynamic content - compatible with all Puppeteer versions await this.waitFor(page, this.options.delay); const content = await page.content(); // Get links with error handling let links = []; try { links = await page.$$eval('a[href]', anchors => anchors.map(a => a.href).filter(href => href && href.startsWith('http')) ); console.log(` Found ${links.length} links on ${url}`); } catch (linkError) { console.warn(`Could not extract links from ${url}:`, linkError.message); } return { content, links, status: response.status() }; } catch (error) { console.error(`Error crawling ${url}:`, error.message); return { content: null, links: [], error: error.message }; } } /** * Discovers pages on a domain by crawling and following internal links. * * This method implements intelligent page discovery: * - Starts from the root domain and follows internal links * - Respects maximum page and discovery limits * - Avoids duplicate visits and external links * - Provides progress updates during discovery * - Handles authentication challenges automatically * * @async * @param {string} domain - Domain to discover pages from * @param {Object} [auth=null] - Authentication credentials if needed * @returns {Promise<Map>} Map of discovered pages with their content and links * @example * const pages = await comparator.discoverPages('https://example.com'); * console.log(`Discovered ${pages.size} pages`); */ async discoverPages(domain, auth = null) { const page = await this.createPage(auth); const discovered = new Set([domain]); const toVisit = [domain]; const pages = new Map(); console.log(`Discovering pages from ${domain}...`); while (toVisit.length > 0 && pages.size < this.options.maxPages) { const url = toVisit.shift(); if (this.visited.has(url)) continue; this.visited.add(url); const result = await this.crawlPage(page, url, auth); pages.set(url, result); if (result.content && result.links && result.links.length > 0) { // Add internal links to discovery queue result.links.forEach(link => { try { const linkUrl = new URL(link); const baseUrl = new URL(domain); // Only add internal links that we haven't discovered yet if (linkUrl.hostname === baseUrl.hostname && !discovered.has(link) && !link.includes('#') && // Skip anchor links !link.includes('?') && // Skip query parameters for now pages.size < this.options.maxPages && discovered.size < this.options.maxDiscovery) { discovered.add(link); toVisit.push(link); console.log(` Adding to queue: ${link}`); } } catch (e) { // Invalid URL, skip silently } }); } // Show progress if (pages.size % 5 === 0) { console.log(` Crawled ${pages.size} pages from ${domain}`); } } await page.close(); console.log(`✅ Finished compairing ${pages.size} pages from ${domain}`); return pages; } /** * Main comparison function that orchestrates the entire comparison process. * * This is the primary method for comparing two websites: * - Initializes the comparator and browser * - Handles authentication for both sites * - Discovers pages on both sites * - Compares common pages for differences * - Generates comprehensive reports * - Saves results in multiple formats * * @async * @param {string} domain1 - First domain to compare * @param {string} domain2 - Second domain to compare * @param {Object} [authOptions={}] - Authentication options for both sites * @param {Object} [authOptions.site1] - Authentication for first site * @param {Object} [authOptions.site2] - Authentication for second site * @throws {Error} If comparison fails or authentication is invalid * @example * try { * await comparator.compare('https://staging.example.com', 'https://example.com', { * site1: { username: 'user1', password: 'pass1' }, * site2: { username: 'user2', password: 'pass2' } * }); * console.log('Comparison completed successfully'); * } catch (error) { * console.error('Comparison failed:', error.message); * } */ async compare(domain1, domain2,